You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/02/09 23:35:25 UTC

svn commit: r1242573 - in /lucene/dev/trunk/solr: CHANGES.txt core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java example/solr/conf/schema.xml

Author: rmuir
Date: Thu Feb  9 22:35:24 2012
New Revision: 1242573

URL: http://svn.apache.org/viewvc?rev=1242573&view=rev
Log:
SOLR-3056: add example japanese field type, lazy-load kuromoji resources

Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
    lucene/dev/trunk/solr/example/solr/conf/schema.xml

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1242573&r1=1242572&r2=1242573&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Thu Feb  9 22:35:24 2012
@@ -485,6 +485,7 @@ New Features
   Uwe Schindler)
 
 * LUCENE-3305, SOLR-3056: Added Kuromoji morphological analyzer for Japanese.
+  See the 'text_ja' fieldtype in the example to get started.
   (Christian Moen, Masaru Hasegawa via Robert Muir)
 
 * SOLR-1860: StopFilterFactory, CommonGramsFilterFactory, and 

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java?rev=1242573&r1=1242572&r2=1242573&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java Thu Feb  9 22:35:24 2012
@@ -59,11 +59,12 @@ public class KuromojiTokenizerFactory ex
   
   private static final String USER_DICT_ENCODING = "user-dictionary-encoding";
 
-  private Segmenter segmenter;
+  private UserDictionary userDictionary;
+  private Mode mode;
   
   @Override
   public void inform(ResourceLoader loader) {
-    Mode mode = getMode(args);
+    mode = getMode(args);
     String userDictionaryPath = args.get(USER_DICT_PATH);
     try {
       if (userDictionaryPath != null) {
@@ -76,9 +77,9 @@ public class KuromojiTokenizerFactory ex
             .onMalformedInput(CodingErrorAction.REPORT)
             .onUnmappableCharacter(CodingErrorAction.REPORT);
         Reader reader = new InputStreamReader(stream, decoder);
-        this.segmenter = new Segmenter(new UserDictionary(reader), mode);
+        userDictionary = new UserDictionary(reader);
       } else {
-        this.segmenter = new Segmenter(mode);
+        userDictionary = null;
       }
     } catch (Exception e) {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
@@ -87,7 +88,7 @@ public class KuromojiTokenizerFactory ex
   
   @Override
   public Tokenizer create(Reader input) {
-    return new KuromojiTokenizer(segmenter, input);
+    return new KuromojiTokenizer(new Segmenter(userDictionary, mode), input);
   }
   
   private Mode getMode(Map<String, String> args) {

Modified: lucene/dev/trunk/solr/example/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?rev=1242573&r1=1242572&r2=1242573&view=diff
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/schema.xml (original)
+++ lucene/dev/trunk/solr/example/solr/conf/schema.xml Thu Feb  9 22:35:24 2012
@@ -491,7 +491,7 @@
       </analyzer>
     </fieldType>
     
-    <!-- CJK bigram (see text_ja for an alternative Japanese configuration) -->
+    <!-- CJK bigram (see text_ja for a Japanese configuration using morphological analysis) -->
     <fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100">
       <analyzer> 
         <tokenizer class="solr.StandardTokenizerFactory"/>
@@ -676,6 +676,44 @@
       </analyzer>
     </fieldType>
     
+    <!-- Japanese using morphological analysis (see text_cjk for a configuration using bigramming)
+
+         NOTE: If you want to optimize search for precision, use default operator AND in your query
+         parser config with <solrQueryParser defaultOperator="AND"/> further down in this file.  Use 
+         OR if you would like to optimize for recall (default).
+    -->
+    <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
+      <analyzer>
+      <!-- Kuromoji Japanese morphological analyzer/tokenizer.
+
+           Uses a search-mode (heuristic) to get a noun-decompounding effect that is useful for search.
+           
+           Example:
+             関西国際空港 (Kansai International Airport) becomes 関西 (Kansai) 国際 (International) 空港 (airport)
+             so we get a match when searching for 空港 (airport) as we would expect from a good search engine.
+             (With regular segmentation 関西国際空港 becomes one word and we don't get a hit.)
+
+           Valid values for mode are:
+              normal: regular segmentation
+              search: segmentation useful for search with extra splitting (default)
+            extended: same as search mode, but splits unknown words into unigrams (experimental)
+
+           NOTE: Search-mode improves segmentation for search at the expense of part-of-speech and reading accuracy
+        -->
+        <tokenizer class="solr.KuromojiTokenizerFactory" mode="search"/>
+        <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->	
+        <filter class="solr.KuromojiBaseFormFilterFactory"/>
+        <!-- Removes tokens with certain part-of-speech tags -->
+        <filter class="solr.KuromojiPartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>
+        <!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) -->
+        <filter class="solr.CJKWidthFilterFactory"/>
+        <!-- Removes common tokens that are typically not useful for search and have a negative effect on ranking -->
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
+        <!-- Lower-case romaji characters -->
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+    
     <!-- Latvian -->
     <fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
       <analyzer>