You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2013/10/18 12:49:36 UTC
svn commit: r1533382 - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src:
java/org/apache/lucene/analysis/ko/dic/
java/org/apache/lucene/analysis/ko/tagging/
java/org/apache/lucene/analysis/ko/utils/
resources/org/apache/lucene/analysis/ko...
Author: uschindler
Date: Fri Oct 18 10:49:35 2013
New Revision: 1533382
URL: http://svn.apache.org/r1533382
Log:
LUCENE-4956: Don't load full files into big List<>, instead process them line by line (the current code uses iterator anyway).
Added:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/LineProcessor.java (with props)
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/SyllableUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/syllable.dic
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java?rev=1533382&r1=1533381&r2=1533382&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java Fri Oct 18 10:49:35 2013
@@ -23,8 +23,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
-import java.util.ArrayList;
-import java.util.List;
import org.apache.lucene.util.IOUtils;
@@ -62,36 +60,34 @@ public class DictionaryResources {
private DictionaryResources() {}
- public static List<String> readLines(String file) throws IOException {
+ /**
+ * Get the contents of a <code>Reader</code> invoking {@link LineProcessor}
+ * for each line, removing comment lines starting with '!'.
+ * @param file the name of the dictionary resource to read
+ * @param processor lines are reported to this interface
+ * @throws IOException if an I/O error occurs
+ */
+ public static void readLines(String file, LineProcessor processor) throws IOException {
InputStream in = null;
try {
in = DictionaryResources.class.getResourceAsStream(file);
if (in == null)
throw new FileNotFoundException(file);
- return readLines(new InputStreamReader(in, IOUtils.CHARSET_UTF_8));
+ readLines(new InputStreamReader(in, IOUtils.CHARSET_UTF_8), processor);
} finally {
IOUtils.closeWhileHandlingException(in);
}
}
- /**
- * Get the contents of a <code>Reader</code> as a list of Strings,
- * one entry per line, removing comment lines starting with '!'.
- * @param input the <code>Reader</code> to read from, not null
- * @return the list of Strings, never null
- * @throws IOException if an I/O error occurs
- */
- private static List<String> readLines(Reader input) throws IOException {
+ private static void readLines(Reader input, LineProcessor processor) throws IOException {
BufferedReader reader = new BufferedReader(input);
- List<String> list = new ArrayList<String>();
String line;
while ((line = reader.readLine()) != null) {
if (line.startsWith("!") || line.startsWith("\uFEFF!")) { // Skip comment lines starting with '!'
continue;
}
- list.add(line);
+ processor.processLine(line);
}
- return list;
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1533382&r1=1533381&r2=1533382&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Fri Oct 18 10:49:35 2013
@@ -51,56 +51,65 @@ public class DictionaryUtil {
static {
try {
- List<String> strList = DictionaryResources.readLines(DictionaryResources.FILE_DICTIONARY);
- strList.addAll(DictionaryResources.readLines(DictionaryResources.FILE_EXTENSION));
- for(String str:strList) {
- String[] infos = str.split("[,]+");
- if(infos.length!=2) continue;
- infos[1] = infos[1].trim();
- if(infos[1].length()==6) infos[1] = infos[1].substring(0,5)+"000"+infos[1].substring(5);
-
- WordEntry entry = new WordEntry(infos[0].trim(),infos[1].trim().toCharArray());
- dictionary.add(entry.getWord(), entry);
- }
-
- List<String> compounds = DictionaryResources.readLines(DictionaryResources.FILE_COMPOUNDS);
- for(String compound: compounds)
- {
- String[] infos = compound.split("[:]+");
- if(infos.length!=3&&infos.length!=2) continue;
-
- final List<CompoundEntry> c = compoundArrayToList(infos[1], infos[1].split("[,]+"));
- final WordEntry entry;
- if(infos.length==2)
- entry = new WordEntry(infos[0].trim(),"20000000X".toCharArray(), c);
- else
- entry = new WordEntry(infos[0].trim(),("200"+infos[2]+"0X").toCharArray(), c);
- dictionary.add(entry.getWord(), entry);
- }
-
- List<String> abbrevs = DictionaryResources.readLines(DictionaryResources.FILE_ABBREV);
- for(String abbrev: abbrevs)
- {
- String[] infos = abbrev.split("[:]+");
- if(infos.length!=2) continue;
- abbreviations.put(infos[0].trim(), infos[1].trim());
- }
-
- List<String> lines = DictionaryResources.readLines(DictionaryResources.FILE_UNCOMPOUNDS);
- for(String compound: lines) {
- String[] infos = compound.split("[:]+");
- if(infos.length!=2) continue;
- WordEntry entry = new WordEntry(infos[0].trim(),"90000X".toCharArray(), compoundArrayToList(infos[1], infos[1].split("[,]+")));
- uncompounds.put(entry.getWord(), entry);
- }
-
- lines = DictionaryResources.readLines(DictionaryResources.FILE_CJ);
- for(String cj: lines) {
- String[] infos = cj.split("[:]+");
- if(infos.length!=2) continue;
- cjwords.put(infos[0], infos[1]);
- }
-
+ final LineProcessor proc = new LineProcessor() {
+ @Override
+ public void processLine(String line) {
+ String[] infos = line.split("[,]+");
+ if(infos.length!=2) return;
+ infos[1] = infos[1].trim();
+ if(infos[1].length()==6) infos[1] = infos[1].substring(0,5)+"000"+infos[1].substring(5);
+
+ WordEntry entry = new WordEntry(infos[0].trim(),infos[1].trim().toCharArray());
+ dictionary.add(entry.getWord(), entry);
+ }
+ };
+ DictionaryResources.readLines(DictionaryResources.FILE_DICTIONARY, proc);
+ DictionaryResources.readLines(DictionaryResources.FILE_EXTENSION, proc);
+
+ DictionaryResources.readLines(DictionaryResources.FILE_COMPOUNDS, new LineProcessor() {
+ @Override
+ public void processLine(String compound) {
+ String[] infos = compound.split("[:]+");
+ if(infos.length!=3&&infos.length!=2) return;
+
+ final List<CompoundEntry> c = compoundArrayToList(infos[1], infos[1].split("[,]+"));
+ final WordEntry entry;
+ if(infos.length==2)
+ entry = new WordEntry(infos[0].trim(),"20000000X".toCharArray(), c);
+ else
+ entry = new WordEntry(infos[0].trim(),("200"+infos[2]+"0X").toCharArray(), c);
+ dictionary.add(entry.getWord(), entry);
+ }
+ });
+
+ DictionaryResources.readLines(DictionaryResources.FILE_ABBREV, new LineProcessor() {
+ @Override
+ public void processLine(String abbrev) {
+ String[] infos = abbrev.split("[:]+");
+ if(infos.length!=2) return;
+ abbreviations.put(infos[0].trim(), infos[1].trim());
+ }
+ });
+
+ DictionaryResources.readLines(DictionaryResources.FILE_UNCOMPOUNDS, new LineProcessor() {
+ @Override
+ public void processLine(String compound) {
+ String[] infos = compound.split("[:]+");
+ if(infos.length!=2) return;
+ WordEntry entry = new WordEntry(infos[0].trim(),"90000X".toCharArray(), compoundArrayToList(infos[1], infos[1].split("[,]+")));
+ uncompounds.put(entry.getWord(), entry);
+ }
+ });
+
+ DictionaryResources.readLines(DictionaryResources.FILE_CJ, new LineProcessor() {
+ @Override
+ public void processLine(String cj) {
+ String[] infos = cj.split("[:]+");
+ if(infos.length!=2) return;
+ cjwords.put(infos[0], infos[1]);
+ }
+ });
+
readFileToSet(josas,DictionaryResources.FILE_JOSA);
readFileToSet(eomis,DictionaryResources.FILE_EOMI);
@@ -261,11 +270,13 @@ public class DictionaryUtil {
}
- private static void readFileToSet(Set<String> set, String dic) throws IOException {
- List<String> line = DictionaryResources.readLines(dic);
- for(int i=1;i<line.size();i++) {
- set.add(line.get(i).trim());
- }
+ private static void readFileToSet(final Set<String> set, String dic) throws IOException {
+ DictionaryResources.readLines(dic, new LineProcessor() {
+ @Override
+ public void processLine(String line) {
+ set.add(line.trim());
+ }
+ });
}
private static List<CompoundEntry> compoundArrayToList(String source, String[] arr) {
Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/LineProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/LineProcessor.java?rev=1533382&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/LineProcessor.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/LineProcessor.java Fri Oct 18 10:49:35 2013
@@ -0,0 +1,27 @@
+package org.apache.lucene.analysis.ko.dic;
+
+import java.io.IOException;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Simple interface that is used to read lines from a resource.
+ * @lucene.internal
+ */
+public interface LineProcessor {
+ void processLine(String line) throws IOException;
+}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java?rev=1533382&r1=1533381&r2=1533382&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java Fri Oct 18 10:49:35 2013
@@ -27,6 +27,7 @@ import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.lucene.analysis.ko.dic.DictionaryResources;
+import org.apache.lucene.analysis.ko.dic.LineProcessor;
import org.apache.lucene.analysis.ko.morph.AnalysisOutput;
import org.apache.lucene.analysis.ko.morph.PatternConstants;
import org.apache.lucene.analysis.ko.utils.ConstraintUtil;
@@ -41,23 +42,25 @@ public class Tagger {
static {
try {
final SortedMap<String, String[]> map = new TreeMap<String, String[]>();;
- final List<String> strs = DictionaryResources.readLines(DictionaryResources.FILE_TAG_DIC);
- for(String str : strs) {
- str=str.trim();
- if(str.isEmpty()) continue;
- String[] syls = str.split("[:]+");
- if(syls.length!=4)
- throw new IOException("Invalid file format: "+Arrays.toString(syls));
-
- final String key;
- if("F".equals(syls[0])) key = syls[2].substring(0,syls[2].lastIndexOf("/")+1) + syls[1].substring(0,syls[1].lastIndexOf("/"));
- else key = syls[1].substring(0,syls[1].lastIndexOf("/")+1) + syls[2].substring(0,syls[2].lastIndexOf("/"));
+ DictionaryResources.readLines(DictionaryResources.FILE_TAG_DIC, new LineProcessor() {
+ @Override
+ public void processLine(String str) throws IOException {
+ str=str.trim();
+ if(str.isEmpty()) return;
+ String[] syls = str.split("[:]+");
+ if(syls.length!=4)
+ throw new IOException("Invalid file format: "+Arrays.toString(syls));
+
+ final String key;
+ if("F".equals(syls[0])) key = syls[2].substring(0,syls[2].lastIndexOf("/")+1) + syls[1].substring(0,syls[1].lastIndexOf("/"));
+ else key = syls[1].substring(0,syls[1].lastIndexOf("/")+1) + syls[2].substring(0,syls[2].lastIndexOf("/"));
- final String joined = syls[1] + "/" + syls[2] + "/" + syls[3];
- String[] patns = joined.split("[/]+");
-
- map.put(syls[0]+key, patns);
- }
+ final String joined = syls[1] + "/" + syls[2] + "/" + syls[3];
+ String[] patns = joined.split("[/]+");
+
+ map.put(syls[0]+key, patns);
+ }
+ });
occurrences = Collections.unmodifiableSortedMap(map);
} catch (IOException ioe) {
throw new Error("Failed to read the tagger dictionary.", ioe);
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java?rev=1533382&r1=1533381&r2=1533382&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java Fri Oct 18 10:49:35 2013
@@ -20,10 +20,10 @@ package org.apache.lucene.analysis.ko.ut
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
-import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.ko.dic.DictionaryResources;
+import org.apache.lucene.analysis.ko.dic.LineProcessor;
public class HanjaUtils {
private HanjaUtils() {}
@@ -31,18 +31,19 @@ public class HanjaUtils {
private static final Map<Character, char[]> mapHanja;
static {
try {
- List<String> strList = DictionaryResources.readLines(DictionaryResources.FILE_MAP_HANJA_DIC);
- Map<Character, char[]> map = new HashMap<Character, char[]>();
-
- for(String s : strList) {
- if(s.isEmpty() || s.indexOf(",")==-1) continue;
+ final Map<Character, char[]> map = new HashMap<Character, char[]>();
+ DictionaryResources.readLines(DictionaryResources.FILE_MAP_HANJA_DIC, new LineProcessor() {
+ @Override
+ public void processLine(String s) throws IOException {
+ if(s.isEmpty() || s.indexOf(",")==-1) return;
- String[] hanInfos = s.split("[,]+");
- if(hanInfos.length!=2 || hanInfos[0].length()!=1) throw new IOException("Invalid file format.");
-
- map.put(hanInfos[0].charAt(0), hanInfos[1].toCharArray());
- }
-
+ String[] hanInfos = s.split("[,]+");
+ if(hanInfos.length!=2 || hanInfos[0].length()!=1)
+ throw new IOException("Invalid file format.");
+
+ map.put(hanInfos[0].charAt(0), hanInfos[1].toCharArray());
+ }
+ });
mapHanja = Collections.unmodifiableMap(map);
} catch (IOException ioe) {
throw new RuntimeException("Cannot load: " + DictionaryResources.FILE_MAP_HANJA_DIC, ioe);
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/SyllableUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/SyllableUtil.java?rev=1533382&r1=1533381&r2=1533382&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/SyllableUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/SyllableUtil.java Fri Oct 18 10:49:35 2013
@@ -19,9 +19,11 @@ package org.apache.lucene.analysis.ko.ut
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
import org.apache.lucene.analysis.ko.dic.DictionaryResources;
+import org.apache.lucene.analysis.ko.dic.LineProcessor;
public class SyllableUtil {
private SyllableUtil() {}
@@ -72,7 +74,21 @@ public class SyllableUtil {
public static int IDX_EOGAN = 39; // 어미 또는 어미의 변형으로 존재할 수 있는 음 (즉 IDX_EOMI 이거나 IDX_YNPNA 이후에 1이 있는 음절)
- private static List<char[]> syllables; // 음절 특성 정보
+ private static final List<char[]> syllables; // 음절 특성 정보
+ static {
+ try{
+ final List<char[]> list = new ArrayList<char[]>();
+ DictionaryResources.readLines(DictionaryResources.FILE_SYLLABLE_FEATURE, new LineProcessor() {
+ @Override
+ public void processLine(String line) throws IOException {
+ list.add(line.toCharArray());
+ }
+ });
+ syllables = Collections.unmodifiableList(list);
+ } catch(IOException ioe) {
+ throw new Error("Cannot load ressource", ioe);
+ }
+ }
/**
* 인덱스 값에 해당하는 음절의 특성을 반환한다.
@@ -98,20 +114,6 @@ public class SyllableUtil {
}
- static {
- try{
- syllables = new ArrayList<char[]>();
-
- List<String> line = DictionaryResources.readLines(DictionaryResources.FILE_SYLLABLE_FEATURE);
- for(int i=0;i<line.size();i++) {
- if(i!=0)
- syllables.add(line.get(i).toCharArray());
- }
- }catch(IOException ioe) {
- throw new Error("Cannot load ressource", ioe);
- }
- }
-
public static boolean isAlpanumeric(char ch) {
return (ch>='0'&&ch<='z');
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/syllable.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/syllable.dic?rev=1533382&r1=1533381&r2=1533382&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/syllable.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/syllable.dic Fri Oct 18 10:49:35 2013
@@ -13,7 +13,6 @@
! See the License for the specific language governing permissions and
! limitations under the License.
!
-//#########################################
1111111111110111100000000000000000000001 //가
0000010111100111001000000000000000000000 //각
0000000000000000000000000000000000000000 //갂