You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2010/11/02 15:51:38 UTC
svn commit: r1030073 - in /lucene/dev/trunk/solr: ./
contrib/analysis-extras/src/java/org/apache/solr/analysis/
contrib/analysis-extras/src/test/org/apache/solr/analysis/
Author: rmuir
Date: Tue Nov 2 14:51:38 2010
New Revision: 1030073
URL: http://svn.apache.org/viewvc?rev=1030073&view=rev
Log:
SOLR-1336: Add support for Lucene's SmartChineseAnalyzer
Added:
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestSmartChineseFactories.java (with props)
Modified:
lucene/dev/trunk/solr/CHANGES.txt
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1030073&r1=1030072&r2=1030073&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Tue Nov 2 14:51:38 2010
@@ -299,6 +299,9 @@ New Features
* SOLR-2210: Add icu-based tokenizer and filters to contrib/analysis-extras (rmuir)
+* SOLR-1336: Add SmartChinese (word segmentation for Simplified Chinese)
+ tokenizer and filters to contrib/analysis-extras (rmuir)
+
Optimizations
----------------------
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java?rev=1030073&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java Tue Nov 2 14:51:38 2010
@@ -0,0 +1,33 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
+
+/**
+ * Factory for the SmartChineseAnalyzer {@link SentenceTokenizer}: {@link #create(Reader)} returns a new SentenceTokenizer over the given reader.
+ * @lucene.experimental
+ */
+public class SmartChineseSentenceTokenizerFactory extends BaseTokenizerFactory {
+ public Tokenizer create(Reader input) {
+ return new SentenceTokenizer(input);
+ }
+}
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java?rev=1030073&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java Tue Nov 2 14:51:38 2010
@@ -0,0 +1,37 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
+
+/**
+ * Factory for the SmartChineseAnalyzer {@link WordTokenFilter}: {@link #create(TokenStream)} wraps the given stream in a new WordTokenFilter.
+ * <p>
+ * Note: this class will currently emit tokens for punctuation, so you should either add
+ * a WordDelimiterFilter after it to remove these (with concatenate off), or use the
+ * SmartChinese stoplist with a StopFilterFactory via:
+ * <code>words="org/apache/lucene/analysis/cn/smart/stopwords.txt"</code>
+ * @lucene.experimental
+ */
+public class SmartChineseWordTokenFilterFactory extends BaseTokenFilterFactory {
+ public TokenFilter create(TokenStream input) {
+ return new WordTokenFilter(input);
+ }
+}
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestSmartChineseFactories.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestSmartChineseFactories.java?rev=1030073&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestSmartChineseFactories.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestSmartChineseFactories.java Tue Nov 2 14:51:38 2010
@@ -0,0 +1,57 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+
+/**
+ * Tests for {@link SmartChineseSentenceTokenizerFactory} and
+ * {@link SmartCHineseWordTokenFilterFactory}
+ */
+public class TestSmartChineseFactories extends BaseTokenTestCase {
+ /** Test showing the behavior with whitespace */
+ public void testSimple() throws Exception {
+ String sentence = "æè´ä¹°äºéå
·åæè£
ã";
+ WhitespaceTokenizer ws = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(sentence));
+ SmartChineseWordTokenFilterFactory factory = new SmartChineseWordTokenFilterFactory();
+ TokenStream ts = factory.create(ws);
+ // TODO: fix smart chinese to not emit punctuation tokens
+ // at the moment: you have to clean up with WDF, or use the stoplist, etc
+ assertTokenStreamContents(ts,
+ new String[] { "æ", "è´ä¹°", "äº", "éå
·", "å", "æè£
", "," });
+ }
+
+ /** Test showing the behavior with whitespace */
+ public void testTokenizer() throws Exception {
+ String sentence = "æè´ä¹°äºéå
·åæè£
ãæè´ä¹°äºéå
·åæè£
ã";
+ SmartChineseSentenceTokenizerFactory tokenizerFactory = new SmartChineseSentenceTokenizerFactory();
+ Tokenizer tokenizer = tokenizerFactory.create(new StringReader(sentence));
+ SmartChineseWordTokenFilterFactory factory = new SmartChineseWordTokenFilterFactory();
+ TokenStream ts = factory.create(tokenizer);
+ // TODO: fix smart chinese to not emit punctuation tokens
+ // at the moment: you have to clean up with WDF, or use the stoplist, etc
+ assertTokenStreamContents(ts,
+ new String[] { "æ", "è´ä¹°", "äº", "éå
·", "å", "æè£
", ",",
+ "æ", "è´ä¹°", "äº", "éå
·", "å", "æè£
", ","
+ });
+ }
+}
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestSmartChineseFactories.java
------------------------------------------------------------------------------
svn:eol-style = native