You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/28 03:31:11 UTC

svn commit: r1536235 - in /lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko: dic/DictionaryUtil.java dic/HangulDictionary.java morph/CompoundNounAnalyzer.java

Author: rmuir
Date: Mon Oct 28 02:31:11 2013
New Revision: 1536235

URL: http://svn.apache.org/r1536235
Log:
LUCENE-4956: improve the runtime of maxWord

Modified:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1536235&r1=1536234&r2=1536235&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Mon Oct 28 02:31:11 2013
@@ -143,6 +143,11 @@ public class DictionaryUtil {
     return getWord(key, WordEntry.COMPOUND, 0);
   }
   
+  /** Returns the length of the longest leading part of {@code key} that {@code dictionary.longestMatch} matches with the {@code WordEntry.NOUN} flag. */
+  public static int longestMatchAllNoun(CharSequence key) {
+    return dictionary.longestMatch(key, WordEntry.NOUN);
+  }
+  
   /** true if there exists noun including compound noun */
   public static boolean hasAllNoun(String key) {  
     return hasWord(key, WordEntry.NOUN, 0);

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java?rev=1536235&r1=1536234&r2=1536235&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java Mon Oct 28 02:31:11 2013
@@ -164,4 +164,32 @@ class HangulDictionary {
     }
     return true;
   }
+  
+  /** Returns the length of the longest prefix of {@code key} that ends on a final FST arc whose word-class flags intersect {@code flags}, or 0 if there is none. */
+  int longestMatch(CharSequence key, int flags) {
+    final FST.Arc<Byte> arc = fst.getFirstArc(new FST.Arc<Byte>());
+
+    final BytesReader fstReader = fst.getBytesReader();
+
+    // Accumulate output as we go
+    byte output = 0;
+    int max = 0;
+    for (int i = 0; i < key.length(); i++) {
+      try {
+        if (findTargetArc(key.charAt(i), arc, arc, i == 0, fstReader) == null) {
+          return max; // no arc found for this char: report the best match seen so far
+        }
+      } catch (IOException bogus) {
+        throw new RuntimeException(bogus); // keep the cause instead of discarding it
+      }
+      output += arc.output;
+      if (arc.isFinal()) {
+        byte clazz = (byte) (output + arc.nextFinalOutput);
+        if ((getFlags(clazz) & flags) != 0) {
+          max = Math.max(max, i+1); // the first i+1 chars of key match a requested class
+        }
+      }
+    }
+    return max;
+  }
 }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java?rev=1536235&r1=1536234&r2=1536235&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java Mon Oct 28 02:31:11 2013
@@ -257,29 +257,28 @@ public class CompoundNounAnalyzer {
    * @param hasSuffix   whether the input text is including a suffix character at the end
    * @return  the max length
    */
-  private int maxWord(String text, boolean hasSuffix, String prvText) {
-       
-    // nocommit: this should really be an FST walk...
-    for(int i=text.length();i>1;i--) {
-      String seg = text.substring(0,i);
-      if (!DictionaryUtil.hasAllNoun(seg)) {
-        continue;
+  private int maxWord(String text, boolean hasSuffix, String prvText) {    
+    int max = DictionaryUtil.longestMatchAllNoun(text); // length of the longest noun match at the start of text
+    
+    if (max < 2) {
+      return 0; // a match shorter than 2 chars is reported as no match
+    }
+    
+    // TODO: try to clean this up. Special case: the match stops exactly one char before the end of text and hasSuffix is set.
+    if (max == text.length()-1 && hasSuffix) {
+      boolean existPrv = false; // becomes true if the last 2 or last 3 chars of prvText are a noun
+      if (prvText.length() >= 2) {
+        existPrv = (DictionaryUtil.hasNoun(prvText.substring(prvText.length()-2))); // try the last 2 chars of prvText
       }
-      
-      if (i == text.length()-1 && hasSuffix) {
-        // if previous text exist in the dictionary.
-        boolean existPrv = false;
-        if(prvText.length()>=2) 
-          existPrv = (DictionaryUtil.hasNoun(prvText.substring(prvText.length()-2)));
-        if(!existPrv&&prvText.length()>=3)
-          existPrv = (DictionaryUtil.hasNoun(prvText.substring(prvText.length()-3)));
-        return existPrv ? i : i+1;
-      } else {
-        return i;
+      if (!existPrv && prvText.length() >= 3) {
+        existPrv = (DictionaryUtil.hasNoun(prvText.substring(prvText.length()-3))); // otherwise try the last 3 chars
       }
-    }   
+      if (!existPrv) {
+        max++; // adjust for suffix: no noun at the end of prvText, so the result covers the whole text
+      }
+    }
     
-    return 0;
+    return max;
   }
   
   private CompoundEntry[] analysisBySplited(int[] units, String input, boolean isFirst) {