You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/03/21 02:04:46 UTC
svn commit: r1579855 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/analysis/
lucene/analysis/common/src/java/org/apache/lucene/analysis/th/
lucene/analysis/common/src/java/org/apache/lucene/analysis/util/
lucene/analysis/common/src/resources/M...
Author: rmuir
Date: Fri Mar 21 01:04:45 2014
New Revision: 1579855
URL: http://svn.apache.org/r1579855
Log:
LUCENE-4984: Fix ThaiWordFilter, smartcn WordTokenFilter
Added:
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java
- copied, changed from r1579846, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizerFactory.java
- copied, changed from r1579846, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizerFactory.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java
- copied, changed from r1579846, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiTokenizerFactory.java
- copied, changed from r1579846, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiTokenizerFactory.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java
- copied, changed from r1579846, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java
lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.java
- copied, changed from r1579846, lucene/dev/trunk/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.java
lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizerFactory.java
- copied, changed from r1579846, lucene/dev/trunk/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizerFactory.java
lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestHMMChineseTokenizerFactory.java
- copied, changed from r1579846, lucene/dev/trunk/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestHMMChineseTokenizerFactory.java
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilterFactory.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiWordFilterFactory.java
lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseSentenceTokenizerFactory.java
lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseWordTokenFilterFactory.java
lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java
lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseFactories.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1579855&r1=1579854&r2=1579855&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Fri Mar 21 01:04:45 2014
@@ -66,6 +66,10 @@ New Features
set of matching hits, in cases where there are millions of hits.
(Rob Audenaerde, Gilad Barkai, Shai Erera)
+* LUCENE-4984: Add SegmentingTokenizerBase, abstract class for tokenizers
+ that want to do two-pass tokenization such as by sentence and then by word.
+ (Robert Muir)
+
API Changes
* LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
@@ -97,6 +101,11 @@ API Changes
is complete in the sense of the top N or not. Consumers of this API should assert
on the completeness if the bounded queue size is know ahead of time. (Simon Willnauer)
+* LUCENE-4984: Deprecate ThaiWordFilter and smartcn SentenceTokenizer and WordTokenFilter.
+ These filters would not work correctly with CharFilters and could not be safely placed
+ at an arbitrary position in the analysis chain. Use ThaiTokenizer and HMMChineseTokenizer
+ instead. (Robert Muir)
+
Optimizations
* LUCENE-5468: HunspellStemFilter uses 10 to 100x less RAM. It also loads
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java?rev=1579855&r1=1579854&r2=1579855&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java Fri Mar 21 01:04:45 2014
@@ -110,12 +110,19 @@ public final class ThaiAnalyzer extends
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
- final Tokenizer source = new StandardTokenizer(matchVersion, reader);
- TokenStream result = new StandardFilter(matchVersion, source);
- if (matchVersion.onOrAfter(Version.LUCENE_31))
- result = new LowerCaseFilter(matchVersion, result);
- result = new ThaiWordFilter(matchVersion, result);
- return new TokenStreamComponents(source, new StopFilter(matchVersion,
+ if (matchVersion.onOrAfter(Version.LUCENE_48)) {
+ final Tokenizer source = new ThaiTokenizer(reader);
+ TokenStream result = new LowerCaseFilter(matchVersion, source);
+ result = new StopFilter(matchVersion, result, stopwords);
+ return new TokenStreamComponents(source, result);
+ } else {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(matchVersion, source);
+ if (matchVersion.onOrAfter(Version.LUCENE_31))
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new ThaiWordFilter(matchVersion, result);
+ return new TokenStreamComponents(source, new StopFilter(matchVersion,
result, stopwords));
+ }
}
}
Copied: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java (from r1579846, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java?p2=lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java&p1=lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java&r1=1579846&r2=1579855&rev=1579855&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java Fri Mar 21 01:04:45 2014
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.th;
* limitations under the License.
*/
+import java.io.Reader;
import java.text.BreakIterator;
import java.util.Locale;
@@ -58,13 +59,13 @@ public class ThaiTokenizer extends Segme
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
/** Creates a new ThaiTokenizer */
- public ThaiTokenizer() {
- this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
+ public ThaiTokenizer(Reader reader) {
+ this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader);
}
/** Creates a new ThaiTokenizer, supplying the AttributeFactory */
- public ThaiTokenizer(AttributeFactory factory) {
- super((BreakIterator)sentenceProto.clone());
+ public ThaiTokenizer(AttributeFactory factory, Reader reader) {
+ super(factory, reader, (BreakIterator)sentenceProto.clone());
if (!DBBI_AVAILABLE) {
throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
}
Copied: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizerFactory.java (from r1579846, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizerFactory.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizerFactory.java?p2=lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizerFactory.java&p1=lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizerFactory.java&r1=1579846&r2=1579855&rev=1579855&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizerFactory.java Fri Mar 21 01:04:45 2014
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.th;
* limitations under the License.
*/
+import java.io.Reader;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
@@ -43,8 +44,8 @@ public class ThaiTokenizerFactory extend
}
@Override
- public Tokenizer create(AttributeSource.AttributeFactory factory) {
- return new ThaiTokenizer(factory);
+ public Tokenizer create(AttributeSource.AttributeFactory factory, Reader reader) {
+ return new ThaiTokenizer(factory, reader);
}
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java?rev=1579855&r1=1579854&r2=1579855&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java Fri Mar 21 01:04:45 2014
@@ -42,19 +42,16 @@ import org.apache.lucene.util.Version;
* It is known to work with Sun/Oracle and Harmony JREs.
* If your application needs to be fully portable, consider using ICUTokenizer instead,
* which uses an ICU Thai BreakIterator that will always be available.
+ * @deprecated Use {@link ThaiTokenizer} instead.
*/
+@Deprecated
public final class ThaiWordFilter extends TokenFilter {
/**
* True if the JRE supports a working dictionary-based breakiterator for Thai.
* If this is false, this filter will not work at all!
*/
- public static final boolean DBBI_AVAILABLE;
+ public static final boolean DBBI_AVAILABLE = ThaiTokenizer.DBBI_AVAILABLE;
private static final BreakIterator proto = BreakIterator.getWordInstance(new Locale("th"));
- static {
- // check that we have a working dictionary-based break iterator for thai
- proto.setText("ภาษาไทย");
- DBBI_AVAILABLE = proto.isBoundary(4);
- }
private final BreakIterator breaker = (BreakIterator) proto.clone();
private final CharArrayIterator charIterator = CharArrayIterator.newWordInstance();
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilterFactory.java?rev=1579855&r1=1579854&r2=1579855&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilterFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilterFactory.java Fri Mar 21 01:04:45 2014
@@ -33,7 +33,9 @@ import org.apache.lucene.analysis.util.T
* <filter class="solr.ThaiWordFilterFactory"/>
* </analyzer>
* </fieldType></pre>
+ * @deprecated Use {@link ThaiTokenizerFactory} instead
*/
+@Deprecated
public class ThaiWordFilterFactory extends TokenFilterFactory {
/** Creates a new ThaiWordFilterFactory */
Copied: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java (from r1579846, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java?p2=lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java&p1=lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java&r1=1579846&r2=1579855&rev=1579855&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java Fri Mar 21 01:04:45 2014
@@ -61,15 +61,15 @@ public abstract class SegmentingTokenize
* TokenStreams, instead a newly created or cloned one should always
* be provided to this constructor.
*/
- public SegmentingTokenizerBase(BreakIterator iterator) {
- this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, iterator);
+ public SegmentingTokenizerBase(Reader reader, BreakIterator iterator) {
+ this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader, iterator);
}
/**
* Construct a new SegmenterBase, also supplying the AttributeFactory
*/
- public SegmentingTokenizerBase(AttributeFactory factory, BreakIterator iterator) {
- super(factory);
+ public SegmentingTokenizerBase(AttributeFactory factory, Reader reader, BreakIterator iterator) {
+ super(factory, reader);
this.iterator = iterator;
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory?rev=1579855&r1=1579854&r2=1579855&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory Fri Mar 21 01:04:45 2014
@@ -28,4 +28,5 @@ org.apache.lucene.analysis.ru.RussianLet
org.apache.lucene.analysis.standard.ClassicTokenizerFactory
org.apache.lucene.analysis.standard.StandardTokenizerFactory
org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory
+org.apache.lucene.analysis.th.ThaiTokenizerFactory
org.apache.lucene.analysis.wikipedia.WikipediaTokenizerFactory
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java?rev=1579855&r1=1579854&r2=1579855&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java Fri Mar 21 01:04:45 2014
@@ -79,7 +79,6 @@ import org.apache.lucene.analysis.payloa
import org.apache.lucene.analysis.snowball.TestSnowball;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SynonymMap;
-import org.apache.lucene.analysis.th.ThaiWordFilter;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
@@ -167,8 +166,6 @@ public class TestRandomChains extends Ba
// TODO: it seems to mess up offsets!?
WikipediaTokenizer.class,
// TODO: doesn't handle graph inputs
- ThaiWordFilter.class,
- // TODO: doesn't handle graph inputs
CJKBigramFilter.class,
// TODO: doesn't handle graph inputs (or even look at positionIncrement)
HyphenatedWordsFilter.class,
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java?rev=1579855&r1=1579854&r2=1579855&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java Fri Mar 21 01:04:45 2014
@@ -41,7 +41,7 @@ public class TestThaiAnalyzer extends Ba
@Override
public void setUp() throws Exception {
super.setUp();
- assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
+ assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiTokenizer.DBBI_AVAILABLE);
}
/*
* testcase for offsets
@@ -68,16 +68,6 @@ public class TestThaiAnalyzer extends Ba
new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
}
- public void testTokenType() throws Exception {
- assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
- new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
- new String[] { "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
- "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
- "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
- "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
- "<NUM>" });
- }
-
/**
* Thai numeric tokens were typed as <ALPHANUM> instead of <NUM>.
* @deprecated (3.1) testing backwards behavior
@@ -189,15 +179,4 @@ public class TestThaiAnalyzer extends Ba
ts.addAttribute(FlagsAttribute.class);
assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
}
-
- public void testEmptyTerm() throws IOException {
- Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new KeywordTokenizer(reader);
- return new TokenStreamComponents(tokenizer, new ThaiWordFilter(TEST_VERSION_CURRENT, tokenizer));
- }
- };
- checkOneTerm(a, "", "");
- }
}
Copied: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiTokenizerFactory.java (from r1579846, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiTokenizerFactory.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiTokenizerFactory.java?p2=lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiTokenizerFactory.java&p1=lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiTokenizerFactory.java&r1=1579846&r2=1579855&rev=1579855&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiTokenizerFactory.java Fri Mar 21 01:04:45 2014
@@ -31,8 +31,7 @@ public class TestThaiTokenizerFactory ex
*/
public void testWordBreak() throws Exception {
assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiTokenizer.DBBI_AVAILABLE);
- Tokenizer tokenizer = tokenizerFactory("Thai").create();
- tokenizer.setReader(new StringReader("การที่ได้ต้องแสดงว่างานดี"));
+ Tokenizer tokenizer = tokenizerFactory("Thai").create(new StringReader("การที่ได้ต้องแสดงว่างานดี"));
assertTokenStreamContents(tokenizer, new String[] {"การ", "ที่", "ได้",
"ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiWordFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiWordFilterFactory.java?rev=1579855&r1=1579854&r2=1579855&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiWordFilterFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiWordFilterFactory.java Fri Mar 21 01:04:45 2014
@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.util.B
/**
* Simple tests to ensure the Thai word filter factory is working.
*/
+@Deprecated
public class TestThaiWordFilterFactory extends BaseTokenStreamFactoryTestCase {
/**
* Ensure the filter actually decomposes text.
Copied: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java (from r1579846, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java?p2=lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java&p1=lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java&r1=1579846&r2=1579855&rev=1579855&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java Fri Mar 21 01:04:45 2014
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.util;
*/
import java.io.IOException;
+import java.io.Reader;
import java.text.BreakIterator;
import java.util.Arrays;
import java.util.Locale;
@@ -32,15 +33,15 @@ import org.apache.lucene.analysis.tokena
public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
private Analyzer sentence = new Analyzer() {
@Override
- protected TokenStreamComponents createComponents(String fieldName) {
- return new TokenStreamComponents(new WholeSentenceTokenizer());
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ return new TokenStreamComponents(new WholeSentenceTokenizer(reader));
}
};
private Analyzer sentenceAndWord = new Analyzer() {
@Override
- protected TokenStreamComponents createComponents(String fieldName) {
- return new TokenStreamComponents(new SentenceAndWordTokenizer());
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ return new TokenStreamComponents(new SentenceAndWordTokenizer(reader));
}
};
@@ -139,8 +140,8 @@ public class TestSegmentingTokenizerBase
private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- public WholeSentenceTokenizer() {
- super(BreakIterator.getSentenceInstance(Locale.ROOT));
+ public WholeSentenceTokenizer(Reader reader) {
+ super(reader, BreakIterator.getSentenceInstance(Locale.ROOT));
}
@Override
@@ -177,8 +178,8 @@ public class TestSegmentingTokenizerBase
private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
- public SentenceAndWordTokenizer() {
- super(BreakIterator.getSentenceInstance(Locale.ROOT));
+ public SentenceAndWordTokenizer(Reader reader) {
+ super(reader, BreakIterator.getSentenceInstance(Locale.ROOT));
}
@Override
Copied: lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.java (from r1579846, lucene/dev/trunk/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.java?p2=lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.java&p1=lucene/dev/trunk/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.java&r1=1579846&r2=1579855&rev=1579855&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.java Fri Mar 21 01:04:45 2014
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.cn.sm
*/
import java.io.IOException;
+import java.io.Reader;
import java.text.BreakIterator;
import java.util.Iterator;
import java.util.Locale;
@@ -46,13 +47,13 @@ public class HMMChineseTokenizer extends
private Iterator<SegToken> tokens;
/** Creates a new HMMChineseTokenizer */
- public HMMChineseTokenizer() {
- this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
+ public HMMChineseTokenizer(Reader reader) {
+ this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader);
}
/** Creates a new HMMChineseTokenizer, supplying the AttributeFactory */
- public HMMChineseTokenizer(AttributeFactory factory) {
- super((BreakIterator)sentenceProto.clone());
+ public HMMChineseTokenizer(AttributeFactory factory, Reader reader) {
+ super(factory, reader, (BreakIterator)sentenceProto.clone());
}
@Override
Copied: lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizerFactory.java (from r1579846, lucene/dev/trunk/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizerFactory.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizerFactory.java?p2=lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizerFactory.java&p1=lucene/dev/trunk/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizerFactory.java&r1=1579846&r2=1579855&rev=1579855&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizerFactory.java Fri Mar 21 01:04:45 2014
@@ -17,6 +17,7 @@
package org.apache.lucene.analysis.cn.smart;
+import java.io.Reader;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
@@ -43,7 +44,7 @@ public final class HMMChineseTokenizerFa
}
@Override
- public Tokenizer create(AttributeFactory factory) {
- return new HMMChineseTokenizer(factory);
+ public Tokenizer create(AttributeFactory factory, Reader reader) {
+ return new HMMChineseTokenizer(factory, reader);
}
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java?rev=1579855&r1=1579854&r2=1579855&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java Fri Mar 21 01:04:45 2014
@@ -31,7 +31,9 @@ import org.apache.lucene.analysis.tokena
* The output tokens can then be broken into words with {@link WordTokenFilter}
* </p>
* @lucene.experimental
+ * @deprecated Use {@link HMMChineseTokenizer} instead
*/
+@Deprecated
public final class SentenceTokenizer extends Tokenizer {
/**
Modified: lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java?rev=1579855&r1=1579854&r2=1579855&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java Fri Mar 21 01:04:45 2014
@@ -137,8 +137,15 @@ public final class SmartChineseAnalyzer
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new SentenceTokenizer(reader);
- TokenStream result = new WordTokenFilter(tokenizer);
+ final Tokenizer tokenizer;
+ TokenStream result;
+ if (matchVersion.onOrAfter(Version.LUCENE_48)) {
+ tokenizer = new HMMChineseTokenizer(reader);
+ result = tokenizer;
+ } else {
+ tokenizer = new SentenceTokenizer(reader);
+ result = new WordTokenFilter(tokenizer);
+ }
// result = new LowerCaseFilter(result);
// LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
// The porter stemming is too strict, this is not a bug, this is a feature:)
Modified: lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseSentenceTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseSentenceTokenizerFactory.java?rev=1579855&r1=1579854&r2=1579855&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseSentenceTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseSentenceTokenizerFactory.java Fri Mar 21 01:04:45 2014
@@ -26,7 +26,9 @@ import org.apache.lucene.util.AttributeS
/**
* Factory for the SmartChineseAnalyzer {@link SentenceTokenizer}
* @lucene.experimental
+ * @deprecated Use {@link HMMChineseTokenizerFactory} instead
*/
+@Deprecated
public class SmartChineseSentenceTokenizerFactory extends TokenizerFactory {
/** Creates a new SmartChineseSentenceTokenizerFactory */
Modified: lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseWordTokenFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseWordTokenFilterFactory.java?rev=1579855&r1=1579854&r2=1579855&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseWordTokenFilterFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseWordTokenFilterFactory.java Fri Mar 21 01:04:45 2014
@@ -32,7 +32,9 @@ import org.apache.lucene.analysis.util.T
* SmartChinese stoplist with a StopFilterFactory via:
* <code>words="org/apache/lucene/analysis/cn/smart/stopwords.txt"</code>
* @lucene.experimental
+ * @deprecated Use {@link HMMChineseTokenizerFactory} instead
*/
+@Deprecated
public class SmartChineseWordTokenFilterFactory extends TokenFilterFactory {
/** Creates a new SmartChineseWordTokenFilterFactory */
Modified: lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java?rev=1579855&r1=1579854&r2=1579855&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java Fri Mar 21 01:04:45 2014
@@ -31,7 +31,9 @@ import org.apache.lucene.analysis.tokena
/**
* A {@link TokenFilter} that breaks sentences into words.
* @lucene.experimental
+ * @deprecated Use {@link HMMChineseTokenizer} instead.
*/
+@Deprecated
public final class WordTokenFilter extends TokenFilter {
private WordSegmenter wordSegmenter;
Modified: lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory?rev=1579855&r1=1579854&r2=1579855&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory Fri Mar 21 01:04:45 2014
@@ -13,4 +13,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+org.apache.lucene.analysis.cn.smart.HMMChineseTokenizerFactory
org.apache.lucene.analysis.cn.smart.SmartChineseSentenceTokenizerFactory
Copied: lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestHMMChineseTokenizerFactory.java (from r1579846, lucene/dev/trunk/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestHMMChineseTokenizerFactory.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestHMMChineseTokenizerFactory.java?p2=lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestHMMChineseTokenizerFactory.java&p1=lucene/dev/trunk/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestHMMChineseTokenizerFactory.java&r1=1579846&r2=1579855&rev=1579855&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestHMMChineseTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestHMMChineseTokenizerFactory.java Fri Mar 21 01:04:45 2014
@@ -34,7 +34,7 @@ public class TestHMMChineseTokenizerFact
public void testSimple() throws Exception {
Reader reader = new StringReader("我购买了道具和服装。");
TokenizerFactory factory = new HMMChineseTokenizerFactory(new HashMap<String,String>());
- Tokenizer tokenizer = factory.create();
+ Tokenizer tokenizer = factory.create(reader);
tokenizer.setReader(reader);
// TODO: fix smart chinese to not emit punctuation tokens
// at the moment: you have to clean up with WDF, or use the stoplist, etc
Modified: lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseFactories.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseFactories.java?rev=1579855&r1=1579854&r2=1579855&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseFactories.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseFactories.java Fri Mar 21 01:04:45 2014
@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.TokenS
* Tests for {@link SmartChineseSentenceTokenizerFactory} and
* {@link SmartChineseWordTokenFilterFactory}
*/
+@Deprecated
public class TestSmartChineseFactories extends BaseTokenStreamTestCase {
/** Test showing the behavior with whitespace */
public void testSimple() throws Exception {