You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cm...@apache.org on 2012/07/12 12:14:18 UTC
svn commit: r1360613 - in /lucene/dev/branches/branch_4x/solr: ./
core/src/java/org/apache/solr/analysis/
core/src/test/org/apache/solr/analysis/ example/solr/collection1/conf/
Author: cm
Date: Thu Jul 12 10:14:18 2012
New Revision: 1360613
URL: http://svn.apache.org/viewvc?rev=1360613&view=rev
Log:
Merge of SOLR-3524 (Make discarding punctuation configurable in JapaneseTokenizerFactory)
Modified:
lucene/dev/branches/branch_4x/solr/CHANGES.txt (props changed)
lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java
lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java
lucene/dev/branches/branch_4x/solr/example/solr/collection1/conf/schema.xml
Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java?rev=1360613&r1=1360612&r2=1360613&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java Thu Jul 12 10:14:18 2012
@@ -42,9 +42,10 @@ import org.apache.lucene.analysis.util.R
* <fieldType name="text_ja" class="solr.TextField">
* <analyzer>
* <tokenizer class="solr.JapaneseTokenizerFactory"
- * mode=NORMAL
- * userDictionary=user.txt
- * userDictionaryEncoding=UTF-8
+ * mode="NORMAL"
+ * userDictionary="user.txt"
+ * userDictionaryEncoding="UTF-8"
+ * discardPunctuation="true"
* />
* <filter class="solr.JapaneseBaseFormFilterFactory"/>
* </analyzer>
@@ -58,9 +59,14 @@ public class JapaneseTokenizerFactory ex
private static final String USER_DICT_ENCODING = "userDictionaryEncoding";
+ private static final String DISCARD_PUNCTUATION = "discardPunctuation"; // Expert option
+
private UserDictionary userDictionary;
+
private Mode mode;
-
+
+ private boolean discardPunctuation;
+
@Override
public void inform(ResourceLoader loader) {
mode = getMode(args);
@@ -83,11 +89,12 @@ public class JapaneseTokenizerFactory ex
} catch (Exception e) {
throw new InitializationException("Exception thrown while loading dictionary", e);
}
+ discardPunctuation = getBoolean(DISCARD_PUNCTUATION, true);
}
@Override
public Tokenizer create(Reader input) {
- return new JapaneseTokenizer(input, userDictionary, true, mode);
+ return new JapaneseTokenizer(input, userDictionary, discardPunctuation, mode);
}
private Mode getMode(Map<String, String> args) {
Modified: lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java?rev=1360613&r1=1360612&r2=1360613&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java Thu Jul 12 10:14:18 2012
@@ -74,7 +74,10 @@ public class TestJapaneseTokenizerFactor
new String[] { "シニアソフトウェアエンジニア" }
);
}
-
+
+ /**
+ * Test user dictionary
+ */
public void testUserDict() throws IOException {
String userDict =
"# Custom segmentation for long entries\n" +
@@ -92,4 +95,25 @@ public class TestJapaneseTokenizerFactor
new String[] { "関西", "国際", "空港", "に", "行っ", "た" }
);
}
+
+ /**
+ * Test preserving punctuation
+ */
+ public void testPreservePunctuation() throws IOException {
+ JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("discardPunctuation", "false");
+ factory.init(args);
+ factory.inform(new SolrResourceLoader(null, null));
+ TokenStream ts = factory.create(
+ new StringReader("今ノルウェーにいますが、来週の頭日本に戻ります。楽しみにしています！お寿司が食べたいな。。。")
+ );
+ System.out.println(ts.toString());
+ assertTokenStreamContents(ts,
+ new String[] { "今", "ノルウェー", "に", "い", "ます", "が", "、",
+ "来週", "の", "頭", "日本", "に", "戻り", "ます", "。",
+ "楽しみ", "に", "し", "て", "い", "ます", "！",
+ "お", "寿司", "が", "食べ", "たい", "な", "。", "。", "。"}
+ );
+ }
}
Modified: lucene/dev/branches/branch_4x/solr/example/solr/collection1/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/example/solr/collection1/conf/schema.xml?rev=1360613&r1=1360612&r2=1360613&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/example/solr/collection1/conf/schema.xml (original)
+++ lucene/dev/branches/branch_4x/solr/example/solr/collection1/conf/schema.xml Thu Jul 12 10:14:18 2012
@@ -923,6 +923,8 @@
See lang/userdict_ja.txt for a sample user dictionary file.
+ Punctuation characters are discarded by default. Use discardPunctuation="false" to keep them.
+
See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
-->
<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>