You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/28 03:31:11 UTC
svn commit: r1536235 - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko:
dic/DictionaryUtil.java dic/HangulDictionary.java
morph/CompoundNounAnalyzer.java
Author: rmuir
Date: Mon Oct 28 02:31:11 2013
New Revision: 1536235
URL: http://svn.apache.org/r1536235
Log:
LUCENE-4956: improve the runtime of maxWord
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1536235&r1=1536234&r2=1536235&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Mon Oct 28 02:31:11 2013
@@ -143,6 +143,11 @@ public class DictionaryUtil {
return getWord(key, WordEntry.COMPOUND, 0);
}
+ /** Returns the length of the longest prefix of {@code key} that the dictionary flags as a noun, or 0 if no prefix matches */
+ public static int longestMatchAllNoun(CharSequence key) {
+ return dictionary.longestMatch(key, WordEntry.NOUN);
+ }
+
/** true if there exists noun including compound noun */
public static boolean hasAllNoun(String key) {
return hasWord(key, WordEntry.NOUN, 0);
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java?rev=1536235&r1=1536234&r2=1536235&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java Mon Oct 28 02:31:11 2013
@@ -164,4 +164,32 @@ class HangulDictionary {
}
return true;
}
+
+ /** returns the length of the longest prefix of key whose word class matches any bit of flags, or 0 if there is none */
+ int longestMatch(CharSequence key, int flags) {
+ final FST.Arc<Byte> arc = fst.getFirstArc(new FST.Arc<Byte>());
+
+ final BytesReader fstReader = fst.getBytesReader();
+
+ // Accumulate output as we go
+ byte output = 0;
+ int max = 0;
+ for (int i = 0; i < key.length(); i++) {
+ try {
+ if (findTargetArc(key.charAt(i), arc, arc, i == 0, fstReader) == null) {
+ return max; // lookup failed for this char: stop with the best match found so far
+ }
+ } catch (IOException bogus) {
+ throw new RuntimeException(bogus); // keep the cause so the failure stays diagnosable
+ }
+ output += arc.output;
+ if (arc.isFinal()) {
+ byte clazz = (byte) (output + arc.nextFinalOutput); // total output for the prefix ending at i
+ if ((getFlags(clazz) & flags) != 0) {
+ max = Math.max(max, i+1); // i+1 is the length of the prefix ending at i
+ }
+ }
+ }
+ return max;
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java?rev=1536235&r1=1536234&r2=1536235&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java Mon Oct 28 02:31:11 2013
@@ -257,29 +257,28 @@ public class CompoundNounAnalyzer {
* @param hasSuffix whether the input text is including a suffix character at the end
* @return the max length
*/
- private int maxWord(String text, boolean hasSuffix, String prvText) {
-
- // nocommit: this should really be an FST walk...
- for(int i=text.length();i>1;i--) {
- String seg = text.substring(0,i);
- if (!DictionaryUtil.hasAllNoun(seg)) {
- continue;
+ private int maxWord(String text, boolean hasSuffix, String prvText) {
+ int max = DictionaryUtil.longestMatchAllNoun(text); // length of the longest noun prefix of text
+
+ if (max < 2) {
+ return 0; // matches shorter than two characters don't count
+ }
+
+ // TODO: try to clean this up (a match ending one char before the end of a suffixed text may absorb the suffix)
+ if (max == text.length()-1 && hasSuffix) {
+ boolean existPrv = false;
+ if (prvText.length() >= 2) {
+ existPrv = (DictionaryUtil.hasNoun(prvText.substring(prvText.length()-2)));
}
-
- if (i == text.length()-1 && hasSuffix) {
- // if previous text exist in the dictionary.
- boolean existPrv = false;
- if(prvText.length()>=2)
- existPrv = (DictionaryUtil.hasNoun(prvText.substring(prvText.length()-2)));
- if(!existPrv&&prvText.length()>=3)
- existPrv = (DictionaryUtil.hasNoun(prvText.substring(prvText.length()-3)));
- return existPrv ? i : i+1;
- } else {
- return i;
+ if (!existPrv && prvText.length() >= 3) {
+ existPrv = (DictionaryUtil.hasNoun(prvText.substring(prvText.length()-3)));
}
- }
+ if (!existPrv) {
+ max++; // the last 2 or 3 chars of prvText are not a noun: extend the match by one to cover the suffix char
+ }
+ }
- return 0;
+ return max;
}
private CompoundEntry[] analysisBySplited(int[] units, String input, boolean isFirst) {