You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2013/10/18 11:44:27 UTC

svn commit: r1533371 - in /lucene/dev/branches/lucene4956/lucene/analysis/arirang/src: java/org/apache/lucene/analysis/ko/dic/ java/org/apache/lucene/analysis/ko/tagging/ resources/org/apache/lucene/analysis/ko/dic/

Author: uschindler
Date: Fri Oct 18 09:44:27 2013
New Revision: 1533371

URL: http://svn.apache.org/r1533371
Log:
LUCENE-4956: Quick fix to remove the Trie for the Tagger. The file is very slow and a TreeMap is perfectly fine. Can still be improved, but the primary concern is to remove Trie.java

Modified:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/tagger.dic

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java?rev=1533371&r1=1533370&r2=1533371&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java Fri Oct 18 09:44:27 2013
@@ -76,26 +76,20 @@ public class DictionaryResources {
 
   /**
    * Get the contents of a <code>Reader</code> as a list of Strings,
-   * one entry per line.
-   * <p>
-   * This method buffers the input internally, so there is no need to use a
-   * <code>BufferedReader</code>.
-   *
+   * one entry per line, removing comment lines starting with '!'.
    * @param input  the <code>Reader</code> to read from, not null
    * @return the list of Strings, never null
-   * @throws NullPointerException if the input is null
    * @throws IOException if an I/O error occurs
-   * @since Commons IO 1.1
    */
   private static List<String> readLines(Reader input) throws IOException {
     BufferedReader reader = new BufferedReader(input);
     List<String> list = new ArrayList<String>();
-    String line = reader.readLine();
-    while (line != null) {
-      if ( ! (line.startsWith("!") || line.startsWith("\uFEFF!"))) { // Skip comment lines starting with '!'
-        list.add(line);
+    String line;
+    while ((line = reader.readLine()) != null) {
+      if (line.startsWith("!") || line.startsWith("\uFEFF!")) { // Skip comment lines starting with '!'
+        continue;
       }
-      line = reader.readLine();
+      list.add(line);
     }
     return list;
   }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java?rev=1533371&r1=1533370&r2=1533371&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java Fri Oct 18 09:44:27 2013
@@ -19,14 +19,16 @@ package org.apache.lucene.analysis.ko.ta
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
+import java.util.NavigableMap;
+import java.util.TreeMap;
 
 import org.apache.lucene.analysis.ko.dic.DictionaryResources;
 import org.apache.lucene.analysis.ko.morph.AnalysisOutput;
 import org.apache.lucene.analysis.ko.morph.PatternConstants;
 import org.apache.lucene.analysis.ko.utils.ConstraintUtil;
-import org.apache.lucene.analysis.ko.utils.Trie;
 
 /**
  * 여러개의 형태소분석 결과 중에 최적의 것을 선택한다.
@@ -34,28 +36,27 @@ import org.apache.lucene.analysis.ko.uti
  */
 public class Tagger {
     
-  private static final Trie<String, String[]> occurrences = new Trie<String, String[]>(true);;
+  private static final NavigableMap<String, String[]> occurrences = new TreeMap<String, String[]>();;
   static { 
     try {
       List<String> strs = DictionaryResources.readLines(DictionaryResources.FILE_TAG_DIC);
-      
       for(String str : strs) {
-        if(str==null) continue;
-        str = str.trim();
+        str=str.trim();
+        if(str.isEmpty()) continue;
         String[] syls = str.split("[:]+");
-        if(syls.length!=4) continue;
+        if(syls.length!=4)
+          throw new IOException("Invalid file format: "+Arrays.toString(syls));
         
-        String key = null;        
+        final String key;        
         if("F".equals(syls[0])) key = syls[2].substring(0,syls[2].lastIndexOf("/")+1) + syls[1].substring(0,syls[1].lastIndexOf("/"));
         else key = syls[1].substring(0,syls[1].lastIndexOf("/")+1) + syls[2].substring(0,syls[2].lastIndexOf("/"));
 
         final String joined = syls[1] + "/" + syls[2] + "/" + syls[3];
         String[] patns = joined.split("[/]+");
         
-        occurrences.add(syls[0]+key, patns);
-        
-      }      
-      
+        occurrences.put(syls[0]+key, patns);
+      }
+      System.out.println(occurrences);
     } catch (IOException ioe) {
       throw new Error("Fail to read the tagger dictionary.", ioe);
     }
@@ -202,8 +203,6 @@ public class Tagger {
   
   private boolean checkGrammer(String[] values, String psource, String rsource, AnalysisOutput pmorph, AnalysisOutput rmorph, boolean depFront) {
     
-    boolean ok = true;    
-    
     String pend = pmorph.getJosa();
     if(pend==null) pend = pmorph.getEomi();
 
@@ -284,9 +283,8 @@ public class Tagger {
     return false;    
   }
 
-  @SuppressWarnings("unchecked")
   public static Iterator<String[]> getGR(String prefix) {
-    return occurrences.getPrefixedBy(prefix);
+    return occurrences.subMap(prefix, prefix + "\uFFFF").values().iterator();
   }
   
 }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/tagger.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/tagger.dic?rev=1533371&r1=1533370&r2=1533371&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/tagger.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/tagger.dic Fri Oct 18 09:44:27 2013
@@ -13,7 +13,6 @@
 ! See the License for the specific language governing permissions and
 ! limitations under the License.
 !
-//#####################
 F:NILL/에/0:대하^S/NILL/11:0
 F:NILL/에/0:관하^S/NILL/11:0
 F:NILL/에/0:따르^S/NILL/11:0