You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/02/01 12:06:32 UTC
svn commit: r1239067 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/contrib/ lucene/contrib/analyzers/kuromoji/
lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/
lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/
Author: rmuir
Date: Wed Feb 1 11:06:32 2012
New Revision: 1239067
URL: http://svn.apache.org/viewvc?rev=1239067&view=rev
Log:
LUCENE-3730: improve Kuromoji search mode heuristics
Added:
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java
- copied, changed from r1239061, lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt
- copied unchanged from r1239061, lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java
Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1239067&r1=1239066&r2=1239067&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Wed Feb 1 11:06:32 2012
@@ -39,6 +39,9 @@ New Features
* LUCENE-3305: Added Kuromoji morphological analyzer for Japanese.
(Christian Moen, Masaru Hasegawa, Simon Willnauer, Uwe Schindler, Mike McCandless, Robert Muir)
+ * LUCENE-3730: Refine Kuromoji search mode (Mode.SEARCH) decompounding
+ heuristics. (Christian Moen via Robert Muir)
+
* LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
joins in both parent to child and child to parent directions.
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java?rev=1239067&r1=1239066&r2=1239067&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java Wed Feb 1 11:06:32 2012
@@ -54,11 +54,13 @@ public class Viterbi {
private static final int DEFAULT_COST = 10000000;
- private static final int SEARCH_MODE_LENGTH_KANJI = 3;
-
- private static final int SEARCH_MODE_LENGTH = 7;
-
- private static final int SEARCH_MODE_PENALTY = 10000;
+ private static final int SEARCH_MODE_KANJI_LENGTH = 2;
+
+ private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH
+
+ private static final int SEARCH_MODE_KANJI_PENALTY = 3000;
+
+ private static final int SEARCH_MODE_OTHER_PENALTY = 1700;
private static final char[] BOS = "BOS".toCharArray();
@@ -137,7 +139,7 @@ public class Viterbi {
char[] surfaceForm = node.getSurfaceForm();
int offset = node.getOffset();
int length = node.getLength();
- if (length > SEARCH_MODE_LENGTH_KANJI) {
+ if (length > SEARCH_MODE_KANJI_LENGTH) {
boolean allKanji = true;
// check if node consists of only kanji
for (int pos = 0; pos < length; pos++) {
@@ -148,9 +150,9 @@ public class Viterbi {
}
if (allKanji) { // Process only Kanji keywords
- pathCost += (length - SEARCH_MODE_LENGTH_KANJI) * SEARCH_MODE_PENALTY;
- } else if (length > SEARCH_MODE_LENGTH) {
- pathCost += (length - SEARCH_MODE_LENGTH) * SEARCH_MODE_PENALTY;
+ pathCost += (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
+ } else if (length > SEARCH_MODE_OTHER_LENGTH) {
+ pathCost += (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;
}
}
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java?rev=1239067&r1=1239066&r2=1239067&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java Wed Feb 1 11:06:32 2012
@@ -1,19 +1,5 @@
package org.apache.lucene.analysis.kuromoji;
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.ReusableAnalyzerBase;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util._TestUtil;
-
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -31,6 +17,20 @@ import org.apache.lucene.util._TestUtil;
* limitations under the License.
*/
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util._TestUtil;
+
public class TestExtendedMode extends BaseTokenStreamTestCase {
private final Segmenter segmenter = new Segmenter(Mode.EXTENDED);
private final Analyzer analyzer = new ReusableAnalyzerBase() {
Copied: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java (from r1239061, lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java?p2=lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java&p1=lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java&r1=1239061&r2=1239067&rev=1239067&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java Wed Feb 1 11:06:32 2012
@@ -26,6 +26,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
import org.apache.lucene.util.IOUtils;
@@ -33,7 +34,7 @@ import org.apache.lucene.util.IOUtils;
public class TestSearchMode extends BaseTokenStreamTestCase {
private final static String SEGMENTATION_FILENAME = "search-segmentation-tests.txt";
private final Segmenter segmenter = new Segmenter(Mode.SEARCH);
- private final Analyzer analyzer = new Analyzer() {
+ private final Analyzer analyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);