You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2010/11/02 15:51:38 UTC

svn commit: r1030073 - in /lucene/dev/trunk/solr: ./ contrib/analysis-extras/src/java/org/apache/solr/analysis/ contrib/analysis-extras/src/test/org/apache/solr/analysis/

Author: rmuir
Date: Tue Nov  2 14:51:38 2010
New Revision: 1030073

URL: http://svn.apache.org/viewvc?rev=1030073&view=rev
Log:
SOLR-1336: Add support for Lucene's SmartChineseAnalyzer

Added:
    lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java   (with props)
    lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java   (with props)
    lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestSmartChineseFactories.java   (with props)
Modified:
    lucene/dev/trunk/solr/CHANGES.txt

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1030073&r1=1030072&r2=1030073&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Tue Nov  2 14:51:38 2010
@@ -299,6 +299,9 @@ New Features
 
 * SOLR-2210: Add icu-based tokenizer and filters to contrib/analysis-extras (rmuir)
 
+* SOLR-1336: Add SmartChinese (word segmentation for Simplified Chinese) 
+  tokenizer and filters to contrib/analysis-extras (rmuir)
+
 Optimizations
 ----------------------
 

Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java?rev=1030073&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java Tue Nov  2 14:51:38 2010
@@ -0,0 +1,33 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
+
+/**
+ * Factory for the SmartChineseAnalyzer {@link SentenceTokenizer}.
+ * @lucene.experimental
+ */
+public class SmartChineseSentenceTokenizerFactory extends BaseTokenizerFactory {
+  public Tokenizer create(Reader input) { // each call wraps the given reader in a new SentenceTokenizer
+    return new SentenceTokenizer(input);
+  }
+}

Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java?rev=1030073&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java Tue Nov  2 14:51:38 2010
@@ -0,0 +1,37 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
+
+/**
+ * Factory for the SmartChineseAnalyzer {@link WordTokenFilter}.
+ * <p>
+ * Note: the filter currently emits tokens for punctuation. To remove them, either add
+ * a WordDelimiterFilter after this filter (with concatenate off), or apply the
+ * SmartChinese stoplist with a StopFilterFactory via:
+ * <code>words="org/apache/lucene/analysis/cn/smart/stopwords.txt"</code>
+ * @lucene.experimental
+ */
+public class SmartChineseWordTokenFilterFactory extends BaseTokenFilterFactory {
+  public TokenFilter create(TokenStream input) { // each call wraps the given stream in a new WordTokenFilter
+      return new WordTokenFilter(input);
+  }
+}

Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestSmartChineseFactories.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestSmartChineseFactories.java?rev=1030073&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestSmartChineseFactories.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestSmartChineseFactories.java Tue Nov  2 14:51:38 2010
@@ -0,0 +1,57 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+
+/**
+ * Tests for {@link SmartChineseSentenceTokenizerFactory} and
+ * {@link SmartChineseWordTokenFilterFactory}
+ */
+public class TestSmartChineseFactories extends BaseTokenTestCase {
+  /** Test the word filter factory on its own, fed by a WhitespaceTokenizer */
+  public void testSimple() throws Exception {
+    String sentence = "我购买了道具和服装。";
+    WhitespaceTokenizer ws = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(sentence));
+    SmartChineseWordTokenFilterFactory factory = new SmartChineseWordTokenFilterFactory();
+    TokenStream ts = factory.create(ws);
+    // TODO: fix smart chinese to not emit punctuation tokens (the final "。" is expected as the "," token below)
+    // at the moment: you have to clean up with WordDelimiterFilter, or use the stoplist, etc
+    assertTokenStreamContents(ts, 
+       new String[] { "我", "购买", "了", "道具", "和", "服装", "," });
+  }
+  
+  /** Test the sentence tokenizer factory chained with the word filter factory, on a two-sentence input */
+  public void testTokenizer() throws Exception {
+    String sentence = "我购买了道具和服装。我购买了道具和服装。";
+    SmartChineseSentenceTokenizerFactory tokenizerFactory = new SmartChineseSentenceTokenizerFactory();
+    Tokenizer tokenizer = tokenizerFactory.create(new StringReader(sentence));
+    SmartChineseWordTokenFilterFactory factory = new SmartChineseWordTokenFilterFactory();
+    TokenStream ts = factory.create(tokenizer);
+    // TODO: fix smart chinese to not emit punctuation tokens (each "。" is expected as a "," token below)
+    // at the moment: you have to clean up with WordDelimiterFilter, or use the stoplist, etc
+    assertTokenStreamContents(ts, 
+       new String[] { "我", "购买", "了", "道具", "和", "服装", ",", 
+        "我", "购买", "了", "道具", "和", "服装", ","
+        });
+  }
+}

Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestSmartChineseFactories.java
------------------------------------------------------------------------------
    svn:eol-style = native