You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2013/10/18 12:49:36 UTC

svn commit: r1533382 - in /lucene/dev/branches/lucene4956/lucene/analysis/arirang/src: java/org/apache/lucene/analysis/ko/dic/ java/org/apache/lucene/analysis/ko/tagging/ java/org/apache/lucene/analysis/ko/utils/ resources/org/apache/lucene/analysis/ko...

Author: uschindler
Date: Fri Oct 18 10:49:35 2013
New Revision: 1533382

URL: http://svn.apache.org/r1533382
Log:
LUCENE-4956: Don't load full files into a big List<>; instead, process them line by line (the current code uses an iterator anyway).

Added:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/LineProcessor.java   (with props)
Modified:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/SyllableUtil.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/syllable.dic

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java?rev=1533382&r1=1533381&r2=1533382&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java Fri Oct 18 10:49:35 2013
@@ -23,8 +23,6 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.ArrayList;
-import java.util.List;
 
 import org.apache.lucene.util.IOUtils;
 
@@ -62,36 +60,34 @@ public class DictionaryResources {
 
   private DictionaryResources() {}
 
-  public static List<String> readLines(String file) throws IOException {
+  /**
+   * Reads the named dictionary resource, invoking the {@link LineProcessor}
+   * for each line and skipping comment lines starting with '!'.
+   * @param file the name of the dictionary resource to read
+   * @param processor lines are reported to this interface
+   * @throws IOException if an I/O error occurs
+   */
+  public static void readLines(String file, LineProcessor processor) throws IOException {
     InputStream in = null;
     try {
       in = DictionaryResources.class.getResourceAsStream(file);
       if (in == null)
         throw new FileNotFoundException(file);
-      return readLines(new InputStreamReader(in, IOUtils.CHARSET_UTF_8));
+      readLines(new InputStreamReader(in, IOUtils.CHARSET_UTF_8), processor);
     } finally {
       IOUtils.closeWhileHandlingException(in);
     }
   }
 
-  /**
-   * Get the contents of a <code>Reader</code> as a list of Strings,
-   * one entry per line, removing comment lines starting with '!'.
-   * @param input  the <code>Reader</code> to read from, not null
-   * @return the list of Strings, never null
-   * @throws IOException if an I/O error occurs
-   */
-  private static List<String> readLines(Reader input) throws IOException {
+  private static void readLines(Reader input, LineProcessor processor) throws IOException {
     BufferedReader reader = new BufferedReader(input);
-    List<String> list = new ArrayList<String>();
     String line;
     while ((line = reader.readLine()) != null) {
       if (line.startsWith("!") || line.startsWith("\uFEFF!")) { // Skip comment lines starting with '!'
         continue;
       }
-      list.add(line);
+      processor.processLine(line);
     }
-    return list;
   }
 
 }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1533382&r1=1533381&r2=1533382&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Fri Oct 18 10:49:35 2013
@@ -51,56 +51,65 @@ public class DictionaryUtil {
   
   static {  
     try {
-      List<String> strList = DictionaryResources.readLines(DictionaryResources.FILE_DICTIONARY);
-      strList.addAll(DictionaryResources.readLines(DictionaryResources.FILE_EXTENSION));
-      for(String str:strList) {
-        String[] infos = str.split("[,]+");
-        if(infos.length!=2) continue;
-        infos[1] = infos[1].trim();
-        if(infos[1].length()==6) infos[1] = infos[1].substring(0,5)+"000"+infos[1].substring(5);
-        
-        WordEntry entry = new WordEntry(infos[0].trim(),infos[1].trim().toCharArray());
-        dictionary.add(entry.getWord(), entry);
-      }
-      
-      List<String> compounds = DictionaryResources.readLines(DictionaryResources.FILE_COMPOUNDS); 
-      for(String compound: compounds) 
-      {    
-        String[] infos = compound.split("[:]+");
-        if(infos.length!=3&&infos.length!=2) continue;
-        
-        final List<CompoundEntry> c = compoundArrayToList(infos[1], infos[1].split("[,]+"));
-        final WordEntry entry;
-        if(infos.length==2) 
-          entry = new WordEntry(infos[0].trim(),"20000000X".toCharArray(), c);
-        else 
-          entry = new WordEntry(infos[0].trim(),("200"+infos[2]+"0X").toCharArray(), c);
-        dictionary.add(entry.getWord(), entry);
-      }
-      
-      List<String> abbrevs = DictionaryResources.readLines(DictionaryResources.FILE_ABBREV); 
-      for(String abbrev: abbrevs) 
-      {    
-        String[] infos = abbrev.split("[:]+");
-        if(infos.length!=2) continue;      
-        abbreviations.put(infos[0].trim(), infos[1].trim());
-      }
-      
-      List<String> lines = DictionaryResources.readLines(DictionaryResources.FILE_UNCOMPOUNDS);  
-      for(String compound: lines) {    
-        String[] infos = compound.split("[:]+");
-        if(infos.length!=2) continue;
-        WordEntry entry = new WordEntry(infos[0].trim(),"90000X".toCharArray(), compoundArrayToList(infos[1], infos[1].split("[,]+")));
-        uncompounds.put(entry.getWord(), entry);
-      }      
-  
-      lines = DictionaryResources.readLines(DictionaryResources.FILE_CJ);  
-      for(String cj: lines) {    
-        String[] infos = cj.split("[:]+");
-        if(infos.length!=2) continue;
-        cjwords.put(infos[0], infos[1]);
-      }
-      
+      final LineProcessor proc = new LineProcessor() {
+        @Override
+        public void processLine(String line) {
+          String[] infos = line.split("[,]+");
+          if(infos.length!=2) return;
+          infos[1] = infos[1].trim();
+          if(infos[1].length()==6) infos[1] = infos[1].substring(0,5)+"000"+infos[1].substring(5);
+          
+          WordEntry entry = new WordEntry(infos[0].trim(),infos[1].trim().toCharArray());
+          dictionary.add(entry.getWord(), entry);          
+        }
+      };
+      DictionaryResources.readLines(DictionaryResources.FILE_DICTIONARY, proc);
+      DictionaryResources.readLines(DictionaryResources.FILE_EXTENSION, proc);
+      
+      DictionaryResources.readLines(DictionaryResources.FILE_COMPOUNDS, new LineProcessor() {
+        @Override
+        public void processLine(String compound) {
+          String[] infos = compound.split("[:]+");
+          if(infos.length!=3&&infos.length!=2) return;
+          
+          final List<CompoundEntry> c = compoundArrayToList(infos[1], infos[1].split("[,]+"));
+          final WordEntry entry;
+          if(infos.length==2) 
+            entry = new WordEntry(infos[0].trim(),"20000000X".toCharArray(), c);
+          else 
+            entry = new WordEntry(infos[0].trim(),("200"+infos[2]+"0X").toCharArray(), c);
+          dictionary.add(entry.getWord(), entry);          
+        }       
+      }); 
+      
+      DictionaryResources.readLines(DictionaryResources.FILE_ABBREV, new LineProcessor() {
+        @Override
+        public void processLine(String abbrev) {
+          String[] infos = abbrev.split("[:]+");
+          if(infos.length!=2) return;      
+          abbreviations.put(infos[0].trim(), infos[1].trim());          
+        }
+      });
+      
+      DictionaryResources.readLines(DictionaryResources.FILE_UNCOMPOUNDS, new LineProcessor() {
+        @Override
+        public void processLine(String compound) {
+          String[] infos = compound.split("[:]+");
+          if(infos.length!=2) return;
+          WordEntry entry = new WordEntry(infos[0].trim(),"90000X".toCharArray(), compoundArrayToList(infos[1], infos[1].split("[,]+")));
+          uncompounds.put(entry.getWord(), entry);
+        }
+      });
+  
+      DictionaryResources.readLines(DictionaryResources.FILE_CJ, new LineProcessor() {
+        @Override
+        public void processLine(String cj) {
+          String[] infos = cj.split("[:]+");
+          if(infos.length!=2) return;
+          cjwords.put(infos[0], infos[1]);
+        }
+      });
+
       readFileToSet(josas,DictionaryResources.FILE_JOSA);
       
       readFileToSet(eomis,DictionaryResources.FILE_EOMI);
@@ -261,11 +270,13 @@ public class DictionaryUtil {
     
   }
   
-  private static void readFileToSet(Set<String> set, String dic) throws IOException {    
-    List<String> line = DictionaryResources.readLines(dic);
-    for(int i=1;i<line.size();i++) {
-      set.add(line.get(i).trim());
-    }
+  private static void readFileToSet(final Set<String> set, String dic) throws IOException {    
+    DictionaryResources.readLines(dic, new LineProcessor() {
+      @Override
+      public void processLine(String line) {
+        set.add(line.trim());
+      }
+    });
   }
   
   private static List<CompoundEntry> compoundArrayToList(String source, String[] arr) {

Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/LineProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/LineProcessor.java?rev=1533382&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/LineProcessor.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/LineProcessor.java Fri Oct 18 10:49:35 2013
@@ -0,0 +1,27 @@
+package org.apache.lucene.analysis.ko.dic;
+
+import java.io.IOException;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Simple interface that is used to read lines from a resource.
+ * @lucene.internal
+ */
+public interface LineProcessor {
+  void processLine(String line) throws IOException;
+}

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java?rev=1533382&r1=1533381&r2=1533382&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java Fri Oct 18 10:49:35 2013
@@ -27,6 +27,7 @@ import java.util.SortedMap;
 import java.util.TreeMap;
 
 import org.apache.lucene.analysis.ko.dic.DictionaryResources;
+import org.apache.lucene.analysis.ko.dic.LineProcessor;
 import org.apache.lucene.analysis.ko.morph.AnalysisOutput;
 import org.apache.lucene.analysis.ko.morph.PatternConstants;
 import org.apache.lucene.analysis.ko.utils.ConstraintUtil;
@@ -41,23 +42,25 @@ public class Tagger {
   static {
     try {
       final SortedMap<String, String[]> map = new TreeMap<String, String[]>();;
-      final List<String> strs = DictionaryResources.readLines(DictionaryResources.FILE_TAG_DIC);
-      for(String str : strs) {
-        str=str.trim();
-        if(str.isEmpty()) continue;
-        String[] syls = str.split("[:]+");
-        if(syls.length!=4)
-          throw new IOException("Invalid file format: "+Arrays.toString(syls));
-        
-        final String key;        
-        if("F".equals(syls[0])) key = syls[2].substring(0,syls[2].lastIndexOf("/")+1) + syls[1].substring(0,syls[1].lastIndexOf("/"));
-        else key = syls[1].substring(0,syls[1].lastIndexOf("/")+1) + syls[2].substring(0,syls[2].lastIndexOf("/"));
+      DictionaryResources.readLines(DictionaryResources.FILE_TAG_DIC, new LineProcessor() {
+        @Override
+        public void processLine(String str) throws IOException {
+          str=str.trim();
+          if(str.isEmpty()) return;
+          String[] syls = str.split("[:]+");
+          if(syls.length!=4)
+            throw new IOException("Invalid file format: "+Arrays.toString(syls));
+          
+          final String key;        
+          if("F".equals(syls[0])) key = syls[2].substring(0,syls[2].lastIndexOf("/")+1) + syls[1].substring(0,syls[1].lastIndexOf("/"));
+          else key = syls[1].substring(0,syls[1].lastIndexOf("/")+1) + syls[2].substring(0,syls[2].lastIndexOf("/"));
 
-        final String joined = syls[1] + "/" + syls[2] + "/" + syls[3];
-        String[] patns = joined.split("[/]+");
-        
-        map.put(syls[0]+key, patns);
-      }
+          final String joined = syls[1] + "/" + syls[2] + "/" + syls[3];
+          String[] patns = joined.split("[/]+");
+          
+          map.put(syls[0]+key, patns);
+        }
+      });
       occurrences = Collections.unmodifiableSortedMap(map);
     } catch (IOException ioe) {
       throw new Error("Failed to read the tagger dictionary.", ioe);

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java?rev=1533382&r1=1533381&r2=1533382&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java Fri Oct 18 10:49:35 2013
@@ -20,10 +20,10 @@ package org.apache.lucene.analysis.ko.ut
 import java.io.IOException;
 import java.util.Collections;
 import java.util.HashMap;
-import java.util.List;
 import java.util.Map;
 
 import org.apache.lucene.analysis.ko.dic.DictionaryResources;
+import org.apache.lucene.analysis.ko.dic.LineProcessor;
 
 public class HanjaUtils {
   private HanjaUtils() {}
@@ -31,18 +31,19 @@ public class HanjaUtils {
   private static final Map<Character, char[]> mapHanja;
   static {
     try {
-      List<String> strList = DictionaryResources.readLines(DictionaryResources.FILE_MAP_HANJA_DIC);
-      Map<Character, char[]> map = new HashMap<Character, char[]>();    
-    
-      for(String s : strList) {
-        if(s.isEmpty() || s.indexOf(",")==-1) continue;
+      final Map<Character, char[]> map = new HashMap<Character, char[]>();    
+      DictionaryResources.readLines(DictionaryResources.FILE_MAP_HANJA_DIC, new LineProcessor() {
+        @Override
+        public void processLine(String s) throws IOException {
+          if(s.isEmpty() || s.indexOf(",")==-1) return;
 
-        String[] hanInfos = s.split("[,]+");
-        if(hanInfos.length!=2 || hanInfos[0].length()!=1) throw new IOException("Invalid file format.");
-        
-        map.put(hanInfos[0].charAt(0), hanInfos[1].toCharArray());
-      }
-      
+          String[] hanInfos = s.split("[,]+");
+          if(hanInfos.length!=2 || hanInfos[0].length()!=1)
+            throw new IOException("Invalid file format.");
+          
+          map.put(hanInfos[0].charAt(0), hanInfos[1].toCharArray());
+        }
+      });      
       mapHanja = Collections.unmodifiableMap(map);
     } catch (IOException ioe) {
       throw new RuntimeException("Cannot load: " + DictionaryResources.FILE_MAP_HANJA_DIC, ioe);

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/SyllableUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/SyllableUtil.java?rev=1533382&r1=1533381&r2=1533382&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/SyllableUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/SyllableUtil.java Fri Oct 18 10:49:35 2013
@@ -19,9 +19,11 @@ package org.apache.lucene.analysis.ko.ut
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 
 import org.apache.lucene.analysis.ko.dic.DictionaryResources;
+import org.apache.lucene.analysis.ko.dic.LineProcessor;
 
 public class SyllableUtil {
   private SyllableUtil() {}
@@ -72,7 +74,21 @@ public class SyllableUtil {
   
   public static int IDX_EOGAN = 39; // 어미 또는 어미의 변형으로 존재할 수 있는 음 (즉 IDX_EOMI 이거나 IDX_YNPNA 이후에 1이 있는 음절)
   
-  private static List<char[]> syllables;  // 음절특성 정보
+  private static final List<char[]> syllables;  // 음절특성 정보
+  static {
+    try{
+      final List<char[]> list = new ArrayList<char[]>();
+      DictionaryResources.readLines(DictionaryResources.FILE_SYLLABLE_FEATURE, new LineProcessor() {
+        @Override
+        public void processLine(String line) throws IOException {
+          list.add(line.toCharArray());
+        }
+      });
+      syllables = Collections.unmodifiableList(list);
+    } catch(IOException ioe) {
+      throw new Error("Cannot load ressource", ioe);
+    } 
+  }
   
   /**
    * 인덱스 값에 해당하는 음절의 특성을 반환한다.
@@ -98,20 +114,6 @@ public class SyllableUtil {
     
   }
   
-  static {
-    try{
-      syllables = new ArrayList<char[]>();
-
-      List<String> line = DictionaryResources.readLines(DictionaryResources.FILE_SYLLABLE_FEATURE);  
-      for(int i=0;i<line.size();i++) {        
-        if(i!=0)
-          syllables.add(line.get(i).toCharArray());
-      }
-    }catch(IOException ioe) {
-      throw new Error("Cannot load ressource", ioe);
-    } 
-  }  
-  
   public static boolean isAlpanumeric(char ch) {
     return (ch>='0'&&ch<='z');
   }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/syllable.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/syllable.dic?rev=1533382&r1=1533381&r2=1533382&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/syllable.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/syllable.dic Fri Oct 18 10:49:35 2013
@@ -13,7 +13,6 @@
 ! See the License for the specific language governing permissions and
 ! limitations under the License.
 !
-//#########################################
 1111111111110111100000000000000000000001 //가
 0000010111100111001000000000000000000000 //각
 0000000000000000000000000000000000000000 //갂