Posted to java-commits@lucene.apache.org by mi...@apache.org on 2009/06/19 17:52:36 UTC
svn commit: r786560 - in /lucene/java/trunk/contrib/analyzers/src:
java/org/apache/lucene/analysis/cn/ java/org/apache/lucene/analysis/nl/
java/org/apache/lucene/analysis/th/ test/org/apache/lucene/analysis/cn/
test/org/apache/lucene/analysis/th/
Author: mikemccand
Date: Fri Jun 19 15:52:36 2009
New Revision: 786560
URL: http://svn.apache.org/viewvc?rev=786560&view=rev
Log:
LUCENE-1692: add tests for Thai & SmartChinese analyzers; fix wrong endOffset bug in ThaiWordFilter; use stop words by default with SmartChineseAnalyzer
Removed:
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/stems.txt
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/words.txt
Modified:
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java
lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java?rev=786560&r1=786559&r2=786560&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java Fri Jun 19 15:52:36 2009
@@ -58,7 +58,7 @@
private WordSegmenter wordSegment;
public SmartChineseAnalyzer() {
- this(false);
+ this(true);
}
/**
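The one-line change above is the "use stop words by default" part of the log
message: the no-arg constructor now delegates to this(true). A minimal usage
sketch of the resulting behavior (the class name DefaultStopwordsSketch is
hypothetical; the constructors are the ones touched by this commit):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.cn.SmartChineseAnalyzer;

    public class DefaultStopwordsSketch {
      public static void main(String[] args) {
        // After this commit, both of these load the bundled stopword list:
        Analyzer implicitStops = new SmartChineseAnalyzer();
        Analyzer explicitStops = new SmartChineseAnalyzer(true);
        // Only an explicit false skips stopwords; per the tests below,
        // punctuation then comes through as tokens:
        Analyzer noStops = new SmartChineseAnalyzer(false);
      }
    }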
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java?rev=786560&r1=786559&r2=786560&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java Fri Jun 19 15:52:36 2009
@@ -47,7 +47,7 @@
if (end != BreakIterator.DONE) {
reusableToken.reinit(thaiToken, thaiToken.termBuffer(), start, end - start);
reusableToken.setStartOffset(thaiToken.startOffset()+start);
- reusableToken.setEndOffset(thaiToken.endOffset()+end);
+ reusableToken.setEndOffset(thaiToken.startOffset()+end);
return reusableToken;
}
thaiToken = null;
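The fix above addresses the "wrong endOffset bug" from the log message: the
sub-word boundaries that BreakIterator reports are relative to the start of
the parent Thai token's term buffer, so both offsets must be rebased on
startOffset(); adding end to endOffset() double-counted the parent's length.
A worked sketch with made-up numbers:

    // Hypothetical values for illustration only:
    int parentStart = 10, parentEnd = 17; // thaiToken.startOffset(), thaiToken.endOffset()
    int start = 2, end = 5;               // sub-word boundaries within the term buffer

    int fixedEnd = parentStart + end;     // 15, inside the parent span (correct)
    int buggyEnd = parentEnd + end;       // 22, past the parent token (the old bug)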
Modified: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java?rev=786560&r1=786559&r2=786560&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java Fri Jun 19 15:52:36 2009
@@ -31,7 +31,27 @@
import org.apache.lucene.analysis.TokenStream;
public class TestSmartChineseAnalyzer extends TestCase {
-
+
+ public void testChineseStopWordsDefault() throws Exception {
+ Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
+ String sentence = "我购买了道具和服装。";
+ String result[] = { "我", "购买", "了", "道具", "和", "服装" };
+ assertAnalyzesTo(ca, sentence, result);
+ }
+
+ /*
+ * Punctuation is handled in a strange way if you disable stopwords:
+ * in this example the IDEOGRAPHIC FULL STOP is converted into a comma.
+ * If you don't supply (true) to the constructor, or if you use a different
+ * stopword list, then punctuation is indexed.
+ */
+ public void testChineseStopWordsOff() throws Exception {
+ Analyzer ca = new SmartChineseAnalyzer(false); /* doesn't load stopwords */
+ String sentence = "我购买了道具和服装。";
+ String result[] = { "我", "购买", "了", "道具", "和", "服装", "," };
+ assertAnalyzesTo(ca, sentence, result);
+ }
+
public void testChineseAnalyzer() throws IOException {
Token nt = new Token();
Analyzer ca = new SmartChineseAnalyzer(true);
@@ -47,6 +67,54 @@
}
ts.close();
}
+
+ /*
+ * English words are lowercased and Porter-stemmed.
+ */
+ public void testMixedLatinChinese() throws Exception {
+ assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 Tests 了道具和服装",
+ new String[] { "我", "购买", "test", "了", "道具", "和", "服装"});
+ }
+
+ public void testOffsets() throws Exception {
+ assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买了道具和服装",
+ new String[] { "我", "购买", "了", "道具", "和", "服装" },
+ new int[] { 0, 1, 3, 4, 6, 7 },
+ new int[] { 1, 3, 4, 6, 7, 9 });
+ }
+
+ public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[])
+ throws Exception {
+
+ TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+ final Token reusableToken = new Token();
+ for (int i = 0; i < output.length; i++) {
+ Token nextToken = ts.next(reusableToken);
+ assertNotNull(nextToken);
+ assertEquals(nextToken.term(), output[i]);
+ if (startOffsets != null)
+ assertEquals(nextToken.startOffset(), startOffsets[i]);
+ if (endOffsets != null)
+ assertEquals(nextToken.endOffset(), endOffsets[i]);
+ if (types != null)
+ assertEquals(nextToken.type(), types[i]);
+ }
+ assertNull(ts.next(reusableToken));
+ ts.close();
+ }
+
+ public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
+ assertAnalyzesTo(a, input, output, null, null, null);
+ }
+
+ public void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws Exception {
+ assertAnalyzesTo(a, input, output, null, null, types);
+ }
+
+ public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws Exception {
+ assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null);
+ }
+
/**
* @param args
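On the punctuation comment in testChineseStopWordsOff above: the test
sentence ends with U+3002 (IDEOGRAPHIC FULL STOP), which the JDK classifies
as plain punctuation, which is why indexing it (or the comma derived from
it) is surprising. A JDK-only check, independent of Lucene (the class name
is hypothetical):

    public class FullStopSketch {
      public static void main(String[] args) {
        char stop = '\u3002'; // the IDEOGRAPHIC FULL STOP ending the test sentence
        System.out.println(Character.getType(stop) == Character.OTHER_PUNCTUATION); // true
        System.out.println(Character.isLetterOrDigit(stop)); // false
      }
    }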
Modified: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java?rev=786560&r1=786559&r2=786560&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java Fri Jun 19 15:52:36 2009
@@ -30,8 +30,43 @@
*/
public class TestThaiAnalyzer extends TestCase {
-
- public void assertAnalyzesTo(Analyzer a, String input, String[] output)
+
+ /*
+ * testcase for offsets
+ */
+ public void testOffsets() throws Exception {
+ assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์",
+ new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์" },
+ new int[] { 0, 2, 7, 9, 12 },
+ new int[] { 2, 7, 9, 12, 17 });
+ }
+
+
+ /*
+ * Thai numeric tokens are typed as <ALPHANUM> instead of <NUM>.
+ * This is really a problem with the interaction with StandardTokenizer, which ThaiAnalyzer uses.
+ *
+ * The issue is this: in StandardTokenizer the entire [:Thai:] block is specified as ALPHANUM (including punctuation, digits, etc.).
+ * The fix is easy: refine this spec to exclude Thai punctuation and digits.
+ *
+ * A better fix, which would also help quite a few other languages, would be to remove the Thai hack.
+ * Instead, allow the definition of ALPHANUM to include relevant categories like nonspacing marks!
+ */
+ public void testBuggyTokenType() throws Exception {
+ assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์ ๑๒๓",
+ new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์", "๑๒๓" },
+ new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
+ }
+
+ /* correct testcase
+ public void testTokenType() throws Exception {
+ assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์ ๑๒๓",
+ new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์", "๑๒๓" },
+ new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>" });
+ }
+ */
+
+ public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[])
throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
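The buggy-token-type comment above comes down to Unicode properties: Thai
digits sit in the Thai block that StandardTokenizer's grammar lumps into
ALPHANUM, yet their general category is decimal digit. A JDK-only
illustration (the class name is hypothetical):

    public class ThaiDigitSketch {
      public static void main(String[] args) {
        char thaiOne = '\u0E51'; // THAI DIGIT ONE, first digit of the test's appended number
        System.out.println(Character.UnicodeBlock.of(thaiOne)); // THAI
        System.out.println(Character.getType(thaiOne) == Character.DECIMAL_DIGIT_NUMBER); // true
        // A block-based rule sees the Thai block and emits <ALPHANUM>;
        // a category-aware rule could emit <NUM>, as the commented-out test expects.
      }
    }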
@@ -40,10 +75,28 @@
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(nextToken.term(), output[i]);
+ if (startOffsets != null)
+ assertEquals(nextToken.startOffset(), startOffsets[i]);
+ if (endOffsets != null)
+ assertEquals(nextToken.endOffset(), endOffsets[i]);
+ if (types != null)
+ assertEquals(nextToken.type(), types[i]);
}
assertNull(ts.next(reusableToken));
ts.close();
}
+
+ public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
+ assertAnalyzesTo(a, input, output, null, null, null);
+ }
+
+ public void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws Exception {
+ assertAnalyzesTo(a, input, output, null, null, types);
+ }
+
+ public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws Exception {
+ assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null);
+ }
public void testAnalyzer() throws Exception {
ThaiAnalyzer analyzer = new ThaiAnalyzer();