You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cm...@apache.org on 2012/07/12 12:14:18 UTC

svn commit: r1360613 - in /lucene/dev/branches/branch_4x/solr: ./ core/src/java/org/apache/solr/analysis/ core/src/test/org/apache/solr/analysis/ example/solr/collection1/conf/

Author: cm
Date: Thu Jul 12 10:14:18 2012
New Revision: 1360613

URL: http://svn.apache.org/viewvc?rev=1360613&view=rev
Log:
Merge of SOLR-3524 (Make discarding punctuation configurable in JapaneseTokenizerFactory)

Modified:
    lucene/dev/branches/branch_4x/solr/CHANGES.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java
    lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java
    lucene/dev/branches/branch_4x/solr/example/solr/collection1/conf/schema.xml

Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java?rev=1360613&r1=1360612&r2=1360613&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java Thu Jul 12 10:14:18 2012
@@ -42,9 +42,10 @@ import org.apache.lucene.analysis.util.R
  * <fieldType name="text_ja" class="solr.TextField">
  *   <analyzer>
  *     <tokenizer class="solr.JapaneseTokenizerFactory"
- *       mode=NORMAL
- *       userDictionary=user.txt
- *       userDictionaryEncoding=UTF-8
+ *       mode="NORMAL"
+ *       userDictionary="user.txt"
+ *       userDictionaryEncoding="UTF-8"
+ *       discardPunctuation="true"
  *     />
  *     <filter class="solr.JapaneseBaseFormFilterFactory"/>
  *   </analyzer>
@@ -58,9 +59,14 @@ public class JapaneseTokenizerFactory ex
   
   private static final String USER_DICT_ENCODING = "userDictionaryEncoding";
 
+  private static final String DISCARD_PUNCTUATION = "discardPunctuation"; // Expert option
+
   private UserDictionary userDictionary;
+
   private Mode mode;
-  
+
+  private boolean discardPunctuation;
+
   @Override
   public void inform(ResourceLoader loader) {
     mode = getMode(args);
@@ -83,11 +89,12 @@ public class JapaneseTokenizerFactory ex
     } catch (Exception e) {
       throw new InitializationException("Exception thrown while loading dictionary", e);
     }
+    discardPunctuation = getBoolean(DISCARD_PUNCTUATION, true);
   }
   
   @Override
   public Tokenizer create(Reader input) {
-    return new JapaneseTokenizer(input, userDictionary, true, mode);
+    return new JapaneseTokenizer(input, userDictionary, discardPunctuation, mode);
   }
   
   private Mode getMode(Map<String, String> args) {

Modified: lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java?rev=1360613&r1=1360612&r2=1360613&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java Thu Jul 12 10:14:18 2012
@@ -74,7 +74,10 @@ public class TestJapaneseTokenizerFactor
         new String[] { "シニアソフトウェアエンジニア" }
     );
   }
-  
+
+  /**
+   * Test user dictionary
+   */
   public void testUserDict() throws IOException {
     String userDict = 
         "# Custom segmentation for long entries\n" +
@@ -92,4 +95,25 @@ public class TestJapaneseTokenizerFactor
         new String[] { "関西", "国際", "空港", "に",  "行っ",  "た" }
     );
   }
+
+  /**
+   * Test preserving punctuation
+   */
+  public void testPreservePunctuation() throws IOException {
+    JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("discardPunctuation", "false");
+    factory.init(args);
+    factory.inform(new SolrResourceLoader(null, null));
+    TokenStream ts = factory.create(
+        new StringReader("今ノルウェーにいますが、来週の頭日本に戻ります。楽しみにしています!お寿司が食べたいな。。。")
+    );
+    System.out.println(ts.toString());
+    assertTokenStreamContents(ts,
+        new String[] { "今", "ノルウェー", "に", "い", "ます", "が", "、",
+            "来週", "の", "頭", "日本", "に", "戻り", "ます", "。",
+            "楽しみ", "に", "し", "て", "い", "ます", "!",
+            "お", "寿司", "が", "食べ", "たい", "な", "。", "。", "。"}
+    );
+  }
 }

Modified: lucene/dev/branches/branch_4x/solr/example/solr/collection1/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/example/solr/collection1/conf/schema.xml?rev=1360613&r1=1360612&r2=1360613&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/example/solr/collection1/conf/schema.xml (original)
+++ lucene/dev/branches/branch_4x/solr/example/solr/collection1/conf/schema.xml Thu Jul 12 10:14:18 2012
@@ -923,6 +923,8 @@
 
            See lang/userdict_ja.txt for a sample user dictionary file.
 
+           Punctuation characters are discarded by default.  Use discardPunctuation="false" to keep them.
+
            See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
         -->
         <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>