You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/02/01 12:03:18 UTC
svn commit: r1239061 - in /lucene/dev/trunk: lucene/contrib/
modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/
modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/
Author: rmuir
Date: Wed Feb 1 11:03:17 2012
New Revision: 1239061
URL: http://svn.apache.org/viewvc?rev=1239061&view=rev
Log:
LUCENE-3730: improve Kuromoji search mode heuristics
Added:
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt (with props)
Modified:
lucene/dev/trunk/lucene/contrib/CHANGES.txt
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java
Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=1239061&r1=1239060&r2=1239061&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Wed Feb 1 11:03:17 2012
@@ -142,6 +142,9 @@ New Features
* LUCENE-3305: Added Kuromoji morphological analyzer for Japanese.
(Christian Moen, Masaru Hasegawa, Simon Willnauer, Uwe Schindler, Mike McCandless, Robert Muir)
+ * LUCENE-3730: Refine Kuromoji search mode (Mode.SEARCH) decompounding
+ heuristics. (Christian Moen via Robert Muir)
+
* LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
joins in both parent to child and child to parent directions.
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java?rev=1239061&r1=1239060&r2=1239061&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java Wed Feb 1 11:03:17 2012
@@ -54,11 +54,13 @@ public class Viterbi {
private static final int DEFAULT_COST = 10000000;
- private static final int SEARCH_MODE_LENGTH_KANJI = 3;
-
- private static final int SEARCH_MODE_LENGTH = 7;
-
- private static final int SEARCH_MODE_PENALTY = 10000;
+ private static final int SEARCH_MODE_KANJI_LENGTH = 2;
+
+ private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH
+
+ private static final int SEARCH_MODE_KANJI_PENALTY = 3000;
+
+ private static final int SEARCH_MODE_OTHER_PENALTY = 1700;
private static final char[] BOS = "BOS".toCharArray();
@@ -137,7 +139,7 @@ public class Viterbi {
char[] surfaceForm = node.getSurfaceForm();
int offset = node.getOffset();
int length = node.getLength();
- if (length > SEARCH_MODE_LENGTH_KANJI) {
+ if (length > SEARCH_MODE_KANJI_LENGTH) {
boolean allKanji = true;
// check if node consists of only kanji
for (int pos = 0; pos < length; pos++) {
@@ -148,9 +150,9 @@ public class Viterbi {
}
if (allKanji) { // Process only Kanji keywords
- pathCost += (length - SEARCH_MODE_LENGTH_KANJI) * SEARCH_MODE_PENALTY;
- } else if (length > SEARCH_MODE_LENGTH) {
- pathCost += (length - SEARCH_MODE_LENGTH) * SEARCH_MODE_PENALTY;
+ pathCost += (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
+ } else if (length > SEARCH_MODE_OTHER_LENGTH) {
+ pathCost += (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;
}
}
}
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java?rev=1239061&r1=1239060&r2=1239061&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java Wed Feb 1 11:03:17 2012
@@ -1,18 +1,5 @@
package org.apache.lucene.analysis.kuromoji;
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util._TestUtil;
-
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -30,6 +17,19 @@ import org.apache.lucene.util._TestUtil;
* limitations under the License.
*/
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util._TestUtil;
+
public class TestExtendedMode extends BaseTokenStreamTestCase {
private final Segmenter segmenter = new Segmenter(Mode.EXTENDED);
private final Analyzer analyzer = new Analyzer() {
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java?rev=1239061&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java Wed Feb 1 11:03:17 2012
@@ -0,0 +1,72 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.util.IOUtils;
+
+public class TestSearchMode extends BaseTokenStreamTestCase {
+ private final static String SEGMENTATION_FILENAME = "search-segmentation-tests.txt";
+ private final Segmenter segmenter = new Segmenter(Mode.SEARCH);
+ private final Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+
+ /** Test search mode segmentation */
+ public void testSearchSegmentation() throws IOException {
+ InputStream is = TestSearchMode.class.getResourceAsStream(SEGMENTATION_FILENAME);
+ if (is == null) {
+ throw new FileNotFoundException("Cannot find " + SEGMENTATION_FILENAME + " in test classpath");
+ }
+ try {
+ LineNumberReader reader = new LineNumberReader(new InputStreamReader(is, IOUtils.CHARSET_UTF_8));
+ String line = null;
+ while ((line = reader.readLine()) != null) {
+ // Remove comments
+ line = line.replaceAll("#.*$", "");
+ // Skip empty lines or comment lines
+ if (line.trim().isEmpty()) {
+ continue;
+ }
+ if (VERBOSE) {
+ System.out.println("Line no. " + reader.getLineNumber() + ": " + line);
+ }
+ String[] fields = line.split("\t", 2);
+ String sourceText = fields[0];
+ String[] expectedTokens = fields[1].split("\\s+");
+ assertAnalyzesTo(analyzer, sourceText, expectedTokens);
+ }
+ } finally {
+ is.close();
+ }
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt?rev=1239061&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt Wed Feb 1 11:03:17 2012
@@ -0,0 +1,140 @@
+###
+### Tests for Kuromoji's search mode heuristic
+###
+### In search-mode, Kuromoji uses a heuristic to do extra splitting of words
+### to get a decompounding effect useful for search. This file includes tests
+### for this heuristic and demonstrates its usefulness, but also weaknesses.
+###
+### This file's format is as follows:
+### <text><tab><token1> <token2> ... <token>
+###
+### This file should use UTF-8 encoding and there is one test per line. The
+### text to be segmented and its expected surface form token sequence are
+### separated by a tab ('\t'). Tokens are separated by a half-width space.
+### Whitespace-only lines and lines starting with a '#' are ignored. Comments
+### are not allowed on entry lines.
+###
+### NOTE: These tests depend on IPADIC
+###
+### Revision history:
+### - 2012-01-29: Initial version
+###
+
+##
+## Organizations
+##
+
+# Kansai International Airport
+関西国際空港	関西 国際 空港
+# Narita Airport
+成田空港	成田 空港
+# Haneda Airport
+羽田空港	羽田 空港
+# Nara Institute of Science and Technology
+奈良先端科学技術大学院大学	奈良 先端 科学 技術 大学院 大学
+# Tokyo University
+東京大学	東京 大学
+# Kyoto University
+京都大学	京都 大学
+# Kyoto University Baseball Club
+京都大学硬式野球部	京都 大学 硬式 野球 部
+
+##
+## Katakana titles
+##
+
+# Senior Software Engineer
+シニアソフトウェアエンジニア	シニア ソフトウェア エンジニア
+# Software Engineer
+ソフトウェアエンジニア	ソフトウェア エンジニア
+# Senior Project Manager
+シニアプロジェクトマネジャー	シニア プロジェクト マネジャー
+# Project Manager
+プロジェクトマネジャー	プロジェクト マネジャー
+# Senior Sales Engineer
+シニアセールスエンジニア	シニア セールス エンジニア
+# System Architect
+システムアーキテクト	システム アーキテクト
+# Senior System Architect
+シニアシステムアーキテクト	シニア システム アーキテクト
+# System Administrator
+システムアドミニストレータ	システム アドミニストレータ
+システムアドミニストレーター	システム アドミニストレーター
+# Senior System Administrator
+シニアシステムアドミニストレーター	シニア システム アドミニストレーター
+
+##
+## Company names (several are fictitious)
+##
+
+# SoftBank Mobile
+ソフトバンクモバイル	ソフトバンク モバイル
+# Alpine Materials
+アルパインマテリアルズ	アルパイン マテリアルズ
+# Sapporo Holdings
+サッポロホールディングス	サッポロ ホールディングス
+# Yamada Corporation
+ヤマダコーポレーション	ヤマダ コーポレーション
+# Canon Semiconductor equipment NOTE: Semiconductor becomes semi + conductor
+キヤノンセミコンダクターエクィップメント	キヤノン セミ コンダクター エクィップメント
+# Oriental Chain
+オリエンタルチエン	オリエンタル チエン
+# Ally Projects Japan NOTE: Becomes one token as プロジェクツ is not in IPADIC
+アーリープロジェクツジャパン	アーリープロジェクツジャパン
+# Peter Pan Corporation
+ピーターパンコーポレーション	ピーター パン コーポレーション
+# AIM Create
+エイムクリエイツ	エイムクリエイツ
+# Mars Engineering
+マースエンジニアリング	マース エンジニアリング
+# Fuji Protein Technology
+フジプロテインテクノロジー	フジ プロテイン テクノロジー
+
+##
+## Person names
+##
+
+# Michael Jackson
+マイケルジャクソン	マイケル ジャクソン
+# Steve Jobs
+スティーブジョブズ	スティーブ ジョブズ
+# Harry Potter NOTE: Becomes one token (short word)
+ハリーポッター	ハリーポッター
+# Bill Gates NOTE: Becomes one token (short word)
+ビルゲイツ	ビルゲイツ
+# Sean Connery NOTE: Becomes one token (okay)
+ショーンコネリー	ショーンコネリー
+
+##
+## Other nouns
+##
+
+# Holdings
+ホールディングス	ホールディングス
+# Engineering
+エンジニアリング	エンジニアリング
+# Software Engineering
+ソフトウェアエンジニアリング	ソフトウェア エンジニアリング
+# Shopping center
+ショッピングセンター	ショッピング センター
+# Game center (arcade) NOTE: One token because of short word
+ゲームセンター	ゲームセンター
+# Christmas shopping
+クリスマスショッピング	クリスマス ショッピング
+# Download file
+ダウンロードファイル	ダウンロード ファイル
+# Technology
+テクノロジー	テクノロジー
+# Lillehammer Olympics
+リレハンメルオリンピック	リレハンメル オリンピック
+
+##
+## Problematic terms
+##
+
+# JT Engineering NOTE: Becomes J Tien ginia ring (substrings are in IPADIC)
+ジェイティエンジニアリング	ジェイ ティエン ジニア リング
+# Anchovy pasta NOTE: Becomes Anch yvipasta
+アンチョビパスタ	アンチ ョビパスタ
+# Surprise gift NOTE: Becomes one token (surprise not in IPADIC)
+サプライズギフト	サプライズギフト