You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cm...@apache.org on 2012/03/24 08:56:22 UTC
svn commit: r1304727 - in /lucene/dev/branches/branch_3x: lucene/contrib/
lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/
lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/
lucene/contrib/fac...
Author: cm
Date: Sat Mar 24 07:56:21 2012
New Revision: 1304727
URL: http://svn.apache.org/viewvc?rev=1304727&view=rev
Log:
Backport of LUCENE-3901 (add katakana stem filter)
Added:
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiKatakanaStemFilter.java
- copied unchanged from r1304719, lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiKatakanaStemFilter.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java
- copied, changed from r1304719, lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java
lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/KuromojiKatakanaStemFilterFactory.java
- copied unchanged from r1304719, lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/KuromojiKatakanaStemFilterFactory.java
Modified:
lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
lucene/dev/branches/branch_3x/lucene/contrib/facet/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/spellchecker/ (props changed)
lucene/dev/branches/branch_3x/solr/ (props changed)
lucene/dev/branches/branch_3x/solr/core/ (props changed)
lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml
Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1304727&r1=1304726&r2=1304727&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Sat Mar 24 07:56:21 2012
@@ -56,6 +56,10 @@ New Features
* LUCENE-3767: Kuromoji tokenizer/analyzer produces both compound words
and the segmentation of that compound in Mode.SEARCH. (Robert Muir, Mike McCandless via Christian Moen)
+ * LUCENE-3901: Added katakana stem filter to normalize common spelling variants
+ with/without trailing long vowel marks. The filter is used in both KuromojiAnalyzer
+ and the "text_ja" field type in schema.xml. (Christian Moen)
+
* LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
joins in both parent to child and child to parent directions.
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java?rev=1304727&r1=1304726&r2=1304727&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java Sat Mar 24 07:56:21 2012
@@ -92,6 +92,7 @@ public class KuromojiAnalyzer extends St
stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
stream = new CJKWidthFilter(stream);
stream = new StopFilter(matchVersion, stream, stopwords);
+ stream = new KuromojiKatakanaStemFilter(stream);
stream = new LowerCaseFilter(matchVersion, stream);
return new TokenStreamComponents(tokenizer, stream);
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java?rev=1304727&r1=1304726&r2=1304727&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java Sat Mar 24 07:56:21 2012
@@ -24,6 +24,9 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
+/**
+ * Test Kuromoji Japanese morphological analyzer
+ */
public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
* stopwords file is missing in classpath */
@@ -54,27 +57,26 @@ public class TestKuromojiAnalyzer extend
KuromojiAnalyzer.getDefaultStopSet(),
KuromojiAnalyzer.getDefaultStopTags());
- /*
- //TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
- TokenStream ts = a.tokenStream("foo", new StringReader("�<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"));
- ts.reset();
- CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
- while(ts.incrementToken()) {
- System.out.println(" " + termAtt.toString());
- }
- System.out.println("DONE PARSE\n\n");
- */
-
// Senior software engineer:
assertAnalyzesToPositions(a, "シニアソフトウェアエンジニア",
new String[] { "シニア",
- "シニアソフトウェアエンジニア",
+ "シニアソフトウェアエンジニア", // zero pos inc
"ソフトウェア",
"エンジニア" },
new int[] { 1, 0, 1, 1},
new int[] { 1, 3, 1, 1}
);
+ // Senior project manager: also tests katakana spelling variation stemming
+ assertAnalyzesToPositions(a, "シニアプロジェクトマネージャー",
+ new String[] { "シニア",
+ "シニアプロジェクトマネージャ", // trailing ー removed by stemming, zero pos inc
+ "プロジェクト",
+ "マネージャ"}, // trailing ー removed by stemming
+ new int[]{1, 0, 1, 1},
+ new int[]{1, 3, 1, 1}
+ );
+
// Kansai International Airport:
assertAnalyzesToPositions(a, "関西国際空港",
new String[] { "関西",
Copied: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java (from r1304719, lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java?p2=lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java&p1=lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java&r1=1304719&r2=1304727&rev=1304727&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java Sat Mar 24 07:56:21 2012
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.kurom
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.Tokenizer;
import java.io.IOException;
@@ -29,7 +30,7 @@ import java.io.Reader;
* Tests for {@link org.apache.lucene.analysis.kuromoji.KuromojiKatakanaStemFilter}
*/
public class TestKuromojiKatakanaStemFilter extends BaseTokenStreamTestCase {
- private Analyzer analyzer = new Analyzer() {
+ private Analyzer analyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
// Use a MockTokenizer here since this filter doesn't really depend on Kuromoji
Modified: lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml?rev=1304727&r1=1304726&r2=1304727&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml (original)
+++ lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml Sat Mar 24 07:56:21 2012
@@ -520,13 +520,13 @@
<!-- CJK bigram (see text_ja for a Japanese configuration using morphological analysis) -->
<fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100">
- <analyzer>
+ <analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<!-- normalize width before bigram, as e.g. half-width dakuten combine -->
<filter class="solr.CJKWidthFilterFactory"/>
<!-- for any non-CJK -->
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.CJKBigramFilterFactory"/>
+ <filter class="solr.CJKBigramFilterFactory"/>
</analyzer>
</fieldType>
@@ -736,7 +736,9 @@
<filter class="solr.CJKWidthFilterFactory"/>
<!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
- <!-- Lower-case romaji characters -->
+ <!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) -->
+ <filter class="solr.KuromojiKatakanaStemFilterFactory" minimumLength="4"/>
+ <!-- Lower-cases romaji characters -->
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>