You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/02/09 23:45:22 UTC
svn commit: r1242580 - in /lucene/dev/branches/branch_3x: ./ solr/
solr/CHANGES.txt solr/core/
solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
solr/example/solr/conf/schema.xml
Author: rmuir
Date: Thu Feb 9 22:45:21 2012
New Revision: 1242580
URL: http://svn.apache.org/viewvc?rev=1242580&view=rev
Log:
SOLR-3056: add example japanese field type, lazy-load kuromoji resources
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/solr/ (props changed)
lucene/dev/branches/branch_3x/solr/CHANGES.txt
lucene/dev/branches/branch_3x/solr/core/ (props changed)
lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml
Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=1242580&r1=1242579&r2=1242580&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Thu Feb 9 22:45:21 2012
@@ -82,6 +82,7 @@ New Features
Uwe Schindler)
* LUCENE-3305, SOLR-3056: Added Kuromoji morphological analyzer for Japanese.
+ See the 'text_ja' fieldtype in the example to get started.
(Christian Moen, Masaru Hasegawa via Robert Muir)
* SOLR-1860: StopFilterFactory, CommonGramsFilterFactory, and
Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java?rev=1242580&r1=1242579&r2=1242580&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java Thu Feb 9 22:45:21 2012
@@ -59,11 +59,12 @@ public class KuromojiTokenizerFactory ex
private static final String USER_DICT_ENCODING = "user-dictionary-encoding";
- private Segmenter segmenter;
+ private UserDictionary userDictionary;
+ private Mode mode;
//@Override
public void inform(ResourceLoader loader) {
- Mode mode = getMode(args);
+ mode = getMode(args);
String userDictionaryPath = args.get(USER_DICT_PATH);
try {
if (userDictionaryPath != null) {
@@ -76,9 +77,9 @@ public class KuromojiTokenizerFactory ex
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
Reader reader = new InputStreamReader(stream, decoder);
- this.segmenter = new Segmenter(new UserDictionary(reader), mode);
+ userDictionary = new UserDictionary(reader);
} else {
- this.segmenter = new Segmenter(mode);
+ userDictionary = null;
}
} catch (Exception e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
@@ -87,7 +88,7 @@ public class KuromojiTokenizerFactory ex
//@Override
public Tokenizer create(Reader input) {
- return new KuromojiTokenizer(segmenter, input);
+ return new KuromojiTokenizer(new Segmenter(userDictionary, mode), input);
}
private Mode getMode(Map<String, String> args) {
Modified: lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml?rev=1242580&r1=1242579&r2=1242580&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml (original)
+++ lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml Thu Feb 9 22:45:21 2012
@@ -507,7 +507,7 @@
</analyzer>
</fieldType>
- <!-- CJK bigram (see text_ja for an alternative Japanese configuration) -->
+ <!-- CJK bigram (see text_ja for a Japanese configuration using morphological analysis) -->
<fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
@@ -692,6 +692,44 @@
</analyzer>
</fieldType>
+ <!-- Japanese using morphological analysis (see text_cjk for a configuration using bigramming)
+
+ NOTE: If you want to optimize search for precision, use default operator AND in your query
+ parser config with <solrQueryParser defaultOperator="AND"/> further down in this file. Use
+ OR if you would like to optimize for recall (default).
+ -->
+ <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
+ <analyzer>
+ <!-- Kuromoji Japanese morphological analyzer/tokenizer.
+
+ Uses a search-mode (heuristic) to get a noun-decompounding effect that is useful for search.
+
+ Example:
+ 関西国際空港 (Kansai International Airport) becomes 関西 (Kansai) 国際 (International) 空港 (airport)
+ so we get a match when searching for 空港 (airport) as we would expect from a good search engine.
+ (With regular segmentation 関西国際空港 becomes one word and we don't get a hit.)
+
+ Valid values for mode are:
+ normal: regular segmentation
+ search: segmentation useful for search with extra splitting (default)
+ extended: same as search mode, but unigrams unknown words (experimental)
+
+ NOTE: Search-mode improves segmentation for search at the expense of part-of-speech and reading accuracy
+ -->
+ <tokenizer class="solr.KuromojiTokenizerFactory" mode="search"/>
+ <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
+ <filter class="solr.KuromojiBaseFormFilterFactory"/>
+ <!-- Removes tokens with certain part-of-speech tags -->
+ <filter class="solr.KuromojiPartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>
+ <!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) -->
+ <filter class="solr.CJKWidthFilterFactory"/>
+ <!-- Removes common tokens that are typically not useful for search but have a negative effect on ranking -->
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
+ <!-- Lower-case romaji characters -->
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
<!-- Latvian -->
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
<analyzer>