You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/02/01 12:06:32 UTC

svn commit: r1239067 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/contrib/ lucene/contrib/analyzers/kuromoji/ lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ lucene/contrib/analyzers/kuromoji/src/test/o...

Author: rmuir
Date: Wed Feb  1 11:06:32 2012
New Revision: 1239067

URL: http://svn.apache.org/viewvc?rev=1239067&view=rev
Log:
LUCENE-3730: improve Kuromoji search mode heuristics

Added:
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java
      - copied, changed from r1239061, lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt
      - copied unchanged from r1239061, lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt
Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java

Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1239067&r1=1239066&r2=1239067&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Wed Feb  1 11:06:32 2012
@@ -39,6 +39,9 @@ New Features
  * LUCENE-3305: Added Kuromoji morphological analyzer for Japanese.
    (Christian Moen, Masaru Hasegawa, Simon Willnauer, Uwe Schindler, Mike McCandless, Robert Muir)
 
+ * LUCENE-3730: Refine Kuromoji search mode (Mode.SEARCH) decompounding
+   heuristics.  (Christian Moen via Robert Muir)
+
  * LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
    BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
    joins in both parent to child and child to parent directions.

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java?rev=1239067&r1=1239066&r2=1239067&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java Wed Feb  1 11:06:32 2012
@@ -54,11 +54,13 @@ public class Viterbi {
   
   private static final int DEFAULT_COST = 10000000;
   
-  private static final int SEARCH_MODE_LENGTH_KANJI = 3;
-  
-  private static final int SEARCH_MODE_LENGTH = 7;
-  
-  private static final int SEARCH_MODE_PENALTY = 10000;
+  private static final int SEARCH_MODE_KANJI_LENGTH = 2;
+
+  private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH
+
+  private static final int SEARCH_MODE_KANJI_PENALTY = 3000;
+
+  private static final int SEARCH_MODE_OTHER_PENALTY = 1700;
   
   private static final char[] BOS = "BOS".toCharArray();
   
@@ -137,7 +139,7 @@ public class Viterbi {
             char[] surfaceForm = node.getSurfaceForm();
             int offset = node.getOffset();
             int length = node.getLength();
-            if (length > SEARCH_MODE_LENGTH_KANJI) {
+            if (length > SEARCH_MODE_KANJI_LENGTH) {
               boolean allKanji = true;
               // check if node consists of only kanji
               for (int pos = 0; pos < length; pos++) {
@@ -148,9 +150,9 @@ public class Viterbi {
               }
               
               if (allKanji) {	// Process only Kanji keywords
-                pathCost += (length - SEARCH_MODE_LENGTH_KANJI) * SEARCH_MODE_PENALTY;
-              } else if (length > SEARCH_MODE_LENGTH) {
-                pathCost += (length - SEARCH_MODE_LENGTH) * SEARCH_MODE_PENALTY;								
+                pathCost += (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
+              } else if (length > SEARCH_MODE_OTHER_LENGTH) {
+                pathCost += (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;								
               }
             }
           }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java?rev=1239067&r1=1239066&r2=1239067&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java Wed Feb  1 11:06:32 2012
@@ -1,19 +1,5 @@
 package org.apache.lucene.analysis.kuromoji;
 
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.ReusableAnalyzerBase;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util._TestUtil;
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -31,6 +17,20 @@ import org.apache.lucene.util._TestUtil;
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util._TestUtil;
+
 public class TestExtendedMode extends BaseTokenStreamTestCase {
   private final Segmenter segmenter = new Segmenter(Mode.EXTENDED);
   private final Analyzer analyzer = new ReusableAnalyzerBase() {

Copied: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java (from r1239061, lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java?p2=lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java&p1=lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java&r1=1239061&r2=1239067&rev=1239067&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java Wed Feb  1 11:06:32 2012
@@ -26,6 +26,7 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
 import org.apache.lucene.util.IOUtils;
@@ -33,7 +34,7 @@ import org.apache.lucene.util.IOUtils;
 public class TestSearchMode extends BaseTokenStreamTestCase {
   private final static String SEGMENTATION_FILENAME = "search-segmentation-tests.txt";
   private final Segmenter segmenter = new Segmenter(Mode.SEARCH);
-  private final Analyzer analyzer = new Analyzer() {
+  private final Analyzer analyzer = new ReusableAnalyzerBase() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
       Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);