You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cm...@apache.org on 2012/03/28 19:20:48 UTC
svn commit: r1306476 - in /lucene/dev/trunk:
modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/
modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/
solr/core/src/java/org/apache/solr/analysis/ solr/core/src/test/o...
Author: cm
Date: Wed Mar 28 17:20:48 2012
New Revision: 1306476
URL: http://svn.apache.org/viewvc?rev=1306476&view=rev
Log:
Fixed various issues related to config and user dictionaries for Kuromoji (SOLR-3276)
Added:
lucene/dev/trunk/solr/example/solr/conf/lang/userdict_ja.txt (with props)
Modified:
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stopwords.txt
lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java
lucene/dev/trunk/solr/example/solr/conf/lang/stoptags_ja.txt
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ja.txt
lucene/dev/trunk/solr/example/solr/conf/schema.xml
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java Wed Mar 28 17:20:48 2012
@@ -99,8 +99,9 @@ public final class UserDictionary implem
String pos = values[3];
if (segmentation.length != readings.length) {
- // FIXME: Should probably deal with this differently. Exception?
- System.out.println("This entry is not properly formatted : " + line);
+ throw new RuntimeException("Illegal user dictionary entry " + values[0] +
+ " - the number of segmentations (" + segmentation.length + ")" +
+ " does not the match number of readings (" + readings.length + ")");
}
int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt Wed Mar 28 17:20:48 2012
@@ -1,5 +1,5 @@
#
-# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
+# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
#
# Any token with a part-of-speech tag that exactly matches one of those defined in this
# file is removed from the token stream.
@@ -417,4 +417,4 @@
# unknown: unknown part of speech.
#未知語
#
-##### End of file
\ No newline at end of file
+##### End of file
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stopwords.txt?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stopwords.txt (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stopwords.txt Wed Mar 28 17:20:48 2012
@@ -6,7 +6,7 @@
# for frequency lists, etc. that can be useful for making your own set (if desired)
#
# Note that there is an overlap between these stopwords and the terms stopped when used
-# in combination with the KuromojiPartOfSpeechStopFilter. When editing this file, note
+# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
# that comments are not allowed on the same line as stopwords.
#
# Also note that stopping is done in a case-insensitive manner. Change your StopFilter
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java Wed Mar 28 17:20:48 2012
@@ -42,8 +42,8 @@ import org.apache.solr.util.plugin.Resou
* <analyzer>
* <tokenizer class="solr.JapaneseTokenizerFactory"
* mode=NORMAL
- * user-dictionary=user.txt
- * user-dictionary-encoding=UTF-8
+ * userDictionary=user.txt
+ * userDictionaryEncoding=UTF-8
* />
* <filter class="solr.JapaneseBaseFormFilterFactory"/>
* </analyzer>
@@ -53,9 +53,9 @@ import org.apache.solr.util.plugin.Resou
public class JapaneseTokenizerFactory extends BaseTokenizerFactory implements ResourceLoaderAware {
private static final String MODE = "mode";
- private static final String USER_DICT_PATH = "user-dictionary";
+ private static final String USER_DICT_PATH = "userDictionary";
- private static final String USER_DICT_ENCODING = "user-dictionary-encoding";
+ private static final String USER_DICT_ENCODING = "userDictionaryEncoding";
private UserDictionary userDictionary;
private Mode mode;
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java Wed Mar 28 17:20:48 2012
@@ -78,7 +78,7 @@ public class TestJapaneseTokenizerFactor
"朝青龍,朝青龍,アサショウリュウ,カスタム人名\n";
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
Map<String,String> args = new HashMap<String,String>();
- args.put("user-dictionary", "userdict.txt");
+ args.put("userDictionary", "userdict.txt");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(userDict));
TokenStream ts = factory.create(new StringReader("関西国際空港に行った"));
Modified: lucene/dev/trunk/solr/example/solr/conf/lang/stoptags_ja.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/stoptags_ja.txt?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/stoptags_ja.txt (original)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/stoptags_ja.txt Wed Mar 28 17:20:48 2012
@@ -1,5 +1,5 @@
#
-# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
+# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
#
# Any token with a part-of-speech tag that exactly matches one of those defined in this
# file is removed from the token stream.
@@ -417,4 +417,4 @@
# unknown: unknown part of speech.
#未知語
#
-##### End of file
\ No newline at end of file
+##### End of file
Modified: lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ja.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ja.txt?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ja.txt (original)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ja.txt Wed Mar 28 17:20:48 2012
@@ -6,7 +6,7 @@
# for frequency lists, etc. that can be useful for making your own set (if desired)
#
# Note that there is an overlap between these stopwords and the terms stopped when used
-# in combination with the KuromojiPartOfSpeechStopFilter. When editing this file, note
+# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
# that comments are not allowed on the same line as stopwords.
#
# Also note that stopping is done in a case-insensitive manner. Change your StopFilter
Added: lucene/dev/trunk/solr/example/solr/conf/lang/userdict_ja.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/userdict_ja.txt?rev=1306476&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/userdict_ja.txt (added)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/userdict_ja.txt Wed Mar 28 17:20:48 2012
@@ -0,0 +1,29 @@
+#
+# This is a sample user dictionary for Kuromoji (JapaneseTokenizer)
+#
+# Add entries to this file in order to override the statistical model in terms
+# of segmentation, readings and part-of-speech tags. Notice that entries do
+# not have weights since they are always used when found. This is by-design
+# in order to maximize ease-of-use.
+#
+# Entries are defined using the following CSV format:
+# <text>,<token 1> ... <token n>,<reading 1> ... <reading n>,<part-of-speech tag>
+#
+# Notice that a single half-width space separates tokens and readings, and
+# that the number of tokens and readings must match exactly.
+#
+# Also notice that the behavior of multiple entries with the same <text> is undefined.
+#
+# Whitespace only lines are ignored. Comments are not allowed on entry lines.
+#
+
+# Custom segmentation for kanji compounds
+日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
+関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞
+
+# Custom segmentation for compound katakana
+トートバッグ,トート バッグ,トート バッグ,かずカナ名詞
+ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞
+
+# Custom reading for former sumo wrestler
+朝青龍,朝青龍,アサショウリュウ,カスタム人名
Modified: lucene/dev/trunk/solr/example/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?rev=1306476&r1=1306475&r2=1306476&view=diff
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/schema.xml (original)
+++ lucene/dev/trunk/solr/example/solr/conf/schema.xml Wed Mar 28 17:20:48 2012
@@ -709,24 +709,35 @@
-->
<fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
<analyzer>
- <!-- Kuromoji Japanese morphological analyzer/tokenizer.
+ <!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer)
- Uses a search-mode (heuristic) to get a noun-decompounding effect that is useful for search.
-
- Example:
- 関西国際空港 (Kansai International Airpart) becomes 関西 (Kansai) 国際 (International) 空港 (airport)
- so we get a match when searching for 空港 (airport) as we would expect from a good search engine.
- (With regular segmentation 関西国際空港 becomes one word and we don't get a hit.)
+ Kuromoji has a search mode (default) that does segmentation useful for search. A heuristic
+ is used to segment compounds into their parts, and the compound itself is kept as a synonym.
- Valid values for mode are:
+ Valid values for attribute mode are:
normal: regular segmentation
- search: segmentation useful for search with extra splitting (default)
+ search: segmentation useful for search, with compounds kept as synonyms (default)
extended: same as search mode, but unigrams unknown words (experimental)
- NOTE: Search-mode improves segmentation for search at the expense of part-of-speech and reading accuracy
+ For some applications it might be good to use search mode for indexing and normal mode for
+ queries to reduce recall and prevent parts of compounds from being matched and highlighted.
+ Use <analyzer type="index"> and <analyzer type="query"> for this and mode normal in query.
+
+ Kuromoji also has a convenient user dictionary feature that allows overriding the statistical
+ model with your own entries for segmentation, part-of-speech tags and readings without a need
+ to specify weights. Notice that user dictionaries have not been subject to extensive testing.
+
+ User dictionary attributes are:
+ userDictionary: user dictionary filename
+ userDictionaryEncoding: user dictionary encoding (default is UTF-8)
+
+ See lang/userdict_ja.txt for a sample user dictionary file.
+
+ See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
-->
<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
- <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
+ <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>-->
+ <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
<filter class="solr.JapaneseBaseFormFilterFactory"/>
<!-- Removes tokens with certain part-of-speech tags -->
<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>