You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cm...@apache.org on 2012/03/28 19:20:48 UTC

svn commit: r1306476 - in /lucene/dev/trunk: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/ modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/ solr/core/src/java/org/apache/solr/analysis/ solr/core/src/test/o...

Author: cm
Date: Wed Mar 28 17:20:48 2012
New Revision: 1306476

URL: http://svn.apache.org/viewvc?rev=1306476&view=rev
Log:
Fixed various issues related to config and user dictionaries for Kuromoji (SOLR-3276)

Added:
    lucene/dev/trunk/solr/example/solr/conf/lang/userdict_ja.txt   (with props)
Modified:
    lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
    lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt
    lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stopwords.txt
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java
    lucene/dev/trunk/solr/example/solr/conf/lang/stoptags_ja.txt
    lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ja.txt
    lucene/dev/trunk/solr/example/solr/conf/schema.xml

Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java Wed Mar 28 17:20:48 2012
@@ -99,8 +99,9 @@ public final class UserDictionary implem
       String pos = values[3];
       
       if (segmentation.length != readings.length) {
-        // FIXME: Should probably deal with this differently.  Exception?
-        System.out.println("This entry is not properly formatted : " + line);
+        throw new RuntimeException("Illegal user dictionary entry " + values[0] +
+                                   " - the number of segmentations (" + segmentation.length + ")" +
+                                   " does not match the number of readings (" + readings.length + ")");
       }
       
       int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....

Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt Wed Mar 28 17:20:48 2012
@@ -1,5 +1,5 @@
 #
-# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
+# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
 #
 # Any token with a part-of-speech tag that exactly matches those defined in this
 # file are removed from the token stream.
@@ -417,4 +417,4 @@
 #  unknown: unknown part of speech.
 #未知語
 #
-##### End of file
\ No newline at end of file
+##### End of file

Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stopwords.txt?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stopwords.txt (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stopwords.txt Wed Mar 28 17:20:48 2012
@@ -6,7 +6,7 @@
 # for frequency lists, etc. that can be useful for making your own set (if desired)
 #
 # Note that there is an overlap between these stopwords and the terms stopped when used
-# in combination with the KuromojiPartOfSpeechStopFilter.  When editing this file, note
+# in combination with the JapanesePartOfSpeechStopFilter.  When editing this file, note
 # that comments are not allowed on the same line as stopwords.
 #
 # Also note that stopping is done in a case-insensitive manner.  Change your StopFilter

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java Wed Mar 28 17:20:48 2012
@@ -42,8 +42,8 @@ import org.apache.solr.util.plugin.Resou
  *   <analyzer>
  *     <tokenizer class="solr.JapaneseTokenizerFactory"
  *       mode=NORMAL
- *       user-dictionary=user.txt
- *       user-dictionary-encoding=UTF-8
+ *       userDictionary=user.txt
+ *       userDictionaryEncoding=UTF-8
  *     />
  *     <filter class="solr.JapaneseBaseFormFilterFactory"/>
  *   </analyzer>
@@ -53,9 +53,9 @@ import org.apache.solr.util.plugin.Resou
 public class JapaneseTokenizerFactory extends BaseTokenizerFactory implements ResourceLoaderAware {
   private static final String MODE = "mode";
   
-  private static final String USER_DICT_PATH = "user-dictionary";
+  private static final String USER_DICT_PATH = "userDictionary";
   
-  private static final String USER_DICT_ENCODING = "user-dictionary-encoding";
+  private static final String USER_DICT_ENCODING = "userDictionaryEncoding";
 
   private UserDictionary userDictionary;
   private Mode mode;

Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java Wed Mar 28 17:20:48 2012
@@ -78,7 +78,7 @@ public class TestJapaneseTokenizerFactor
         "朝青龍,朝青龍,アサショウリュウ,カスタム人名\n";
     JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
     Map<String,String> args = new HashMap<String,String>();
-    args.put("user-dictionary", "userdict.txt");
+    args.put("userDictionary", "userdict.txt");
     factory.init(args);
     factory.inform(new StringMockSolrResourceLoader(userDict));
     TokenStream ts = factory.create(new StringReader("関西国際空港に行った"));

Modified: lucene/dev/trunk/solr/example/solr/conf/lang/stoptags_ja.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/stoptags_ja.txt?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/stoptags_ja.txt (original)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/stoptags_ja.txt Wed Mar 28 17:20:48 2012
@@ -1,5 +1,5 @@
 #
-# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
+# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
 #
 # Any token with a part-of-speech tag that exactly matches those defined in this
 # file are removed from the token stream.
@@ -417,4 +417,4 @@
 #  unknown: unknown part of speech.
 #未知語
 #
-##### End of file
\ No newline at end of file
+##### End of file

Modified: lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ja.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ja.txt?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ja.txt (original)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ja.txt Wed Mar 28 17:20:48 2012
@@ -6,7 +6,7 @@
 # for frequency lists, etc. that can be useful for making your own set (if desired)
 #
 # Note that there is an overlap between these stopwords and the terms stopped when used
-# in combination with the KuromojiPartOfSpeechStopFilter.  When editing this file, note
+# in combination with the JapanesePartOfSpeechStopFilter.  When editing this file, note
 # that comments are not allowed on the same line as stopwords.
 #
 # Also note that stopping is done in a case-insensitive manner.  Change your StopFilter

Added: lucene/dev/trunk/solr/example/solr/conf/lang/userdict_ja.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/userdict_ja.txt?rev=1306476&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/userdict_ja.txt (added)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/userdict_ja.txt Wed Mar 28 17:20:48 2012
@@ -0,0 +1,29 @@
+#
+# This is a sample user dictionary for Kuromoji (JapaneseTokenizer)
+#
+# Add entries to this file in order to override the statistical model in terms
+# of segmentation, readings and part-of-speech tags.  Notice that entries do
+# not have weights since they are always used when found.  This is by-design
+# in order to maximize ease-of-use.
+#
+# Entries are defined using the following CSV format:
+#  <text>,<token 1> ... <token n>,<reading 1> ... <reading n>,<part-of-speech tag>
+#
+# Notice that a single half-width space separates tokens and readings, and
+# that the number of tokens and readings must match exactly.
+#
+# Also notice that the behavior of multiple entries with the same <text> is undefined.
+#
+# Whitespace only lines are ignored.  Comments are not allowed on entry lines.
+#
+
+# Custom segmentation for kanji compounds
+日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
+関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞
+
+# Custom segmentation for compound katakana
+トートバッグ,トート バッグ,トート バッグ,かずカナ名詞
+ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞
+
+# Custom reading for former sumo wrestler
+朝青龍,朝青龍,アサショウリュウ,カスタム人名

Modified: lucene/dev/trunk/solr/example/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/schema.xml (original)
+++ lucene/dev/trunk/solr/example/solr/conf/schema.xml Wed Mar 28 17:20:48 2012
@@ -709,24 +709,35 @@
     -->
     <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
       <analyzer>
-      <!-- Kuromoji Japanese morphological analyzer/tokenizer.
+      <!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer)
 
-           Uses a search-mode (heuristic) to get a noun-decompounding effect that is useful for search.
-           
-           Example:
-             関西国際空港 (Kansai International Airpart) becomes 関西 (Kansai) 国際 (International) 空港 (airport)
-             so we get a match when searching for 空港 (airport) as we would expect from a good search engine.
-             (With regular segmentation 関西国際空港 becomes one word and we don't get a hit.)
+           Kuromoji has a search mode (default) that does segmentation useful for search.  A heuristic
+           is used to segment compounds into their parts and the compound itself is kept as a synonym.
 
-           Valid values for mode are:
+           Valid values for attribute mode are:
               normal: regular segmentation
-              search: segmentation useful for search with extra splitting (default)
+              search: segmentation useful for search with compounds kept as synonyms (default)
             extended: same as search mode, but unigrams unknown words (experimental)
 
-           NOTE: Search-mode improves segmentation for search at the expense of part-of-speech and reading accuracy
+           For some applications it might be good to use search mode for indexing and normal mode for
+           queries to reduce recall and prevent parts of compounds from being matched and highlighted.
+           Use <analyzer type="index"> and <analyzer type="query"> for this, with mode="normal" in the query analyzer.
+
+           Kuromoji also has a convenient user dictionary feature that allows overriding the statistical
+           model with your own entries for segmentation, part-of-speech tags and readings without a need
+           to specify weights.  Notice that user dictionaries have not been subject to extensive testing.
+
+           User dictionary attributes are:
+                     userDictionary: user dictionary filename
+             userDictionaryEncoding: user dictionary encoding (default is UTF-8)
+
+           See lang/userdict_ja.txt for a sample user dictionary file.
+
+           See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
         -->
         <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
-        <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->	
+        <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>-->
+        <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
         <filter class="solr.JapaneseBaseFormFilterFactory"/>
         <!-- Removes tokens with certain part-of-speech tags -->
         <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>