Posted to commits@lucene.apache.org by cm...@apache.org on 2012/03/10 15:54:49 UTC
svn commit: r1299213 [2/2] - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/contrib/
lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/
lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/
luce...
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java Sat Mar 10 14:54:47 2012
@@ -17,7 +17,13 @@ package org.apache.lucene.analysis.kurom
* limitations under the License.
*/
+import java.io.BufferedReader;
+import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
+import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
@@ -26,21 +32,76 @@ import org.apache.lucene.analysis.BaseTo
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
+import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
+import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
+
+ public static UserDictionary readDict() {
+ InputStream is = TestKuromojiTokenizer.class.getResourceAsStream("userdict.txt");
+ if (is == null) {
+ throw new RuntimeException("Cannot find userdict.txt in test classpath!");
+ }
+ try {
+ try {
+ Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
+ return new UserDictionary(reader);
+ } finally {
+ is.close();
+ }
+ } catch (IOException ioe) {
+ throw new RuntimeException(ioe);
+ }
+ }
+
private Analyzer analyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new KuromojiTokenizer(reader);
+ Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
-
+
+ private Analyzer analyzerNormal = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.NORMAL);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+
+ private Analyzer analyzerNoPunct = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.SEARCH);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+
+ private Analyzer extendedModeAnalyzerNoPunct = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.EXTENDED);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+
+ public void testNormalMode() throws Exception {
+ assertAnalyzesTo(analyzerNormal,
+ "ã·ãã¢ã½ããã¦ã§ã¢ã¨ã³ã¸ãã¢",
+ new String[] {"ã·ãã¢ã½ããã¦ã§ã¢ã¨ã³ã¸ãã¢"});
+ }
+
public void testDecomposition1() throws Exception {
- assertAnalyzesTo(analyzer, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
+ assertAnalyzesTo(analyzerNoPunct, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
"アメリカ低所得者医療援助制度が、今日では、その予算の約３分の１を老人に費やしている。",
new String[] { "本来", "は", "貧困", "層", "の", "女性", "や", "子供", "に", "医療", "保護", "を",
"提供", "する", "ため", "に", "創設", "さ", "れ", "た", "制度", "で", "ある", "アメリカ",
@@ -56,7 +117,7 @@ public class TestKuromojiTokenizer exten
}
public void testDecomposition2() throws Exception {
- assertAnalyzesTo(analyzer, "麻薬の密売は根こそぎ絶やさなければならない",
+ assertAnalyzesTo(analyzerNoPunct, "麻薬の密売は根こそぎ絶やさなければならない",
new String[] { "麻薬", "の", "密売", "は", "根こそぎ", "絶やさ", "なけれ", "ば", "なら", "ない" },
new int[] { 0, 2, 3, 5, 6, 10, 13, 16, 17, 19 },
new int[] { 2, 3, 5, 6, 10, 13, 16, 17, 19, 21 }
@@ -64,7 +125,7 @@ public class TestKuromojiTokenizer exten
}
public void testDecomposition3() throws Exception {
- assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。",
+ assertAnalyzesTo(analyzerNoPunct, "魔女狩大将マシュー・ホプキンス。",
new String[] { "魔女", "狩", "大将", "マシュー", "ホプキンス" },
new int[] { 0, 2, 3, 5, 10 },
new int[] { 2, 3, 5, 9, 15 }
@@ -92,9 +153,32 @@ public class TestKuromojiTokenizer exten
ts.close();
}
+ /*
+ // NOTE: intentionally fails! Just trying to debug this
+ // one input...
+ public void testDecomposition6() throws Exception {
+ assertAnalyzesTo(analyzer, "奈良先端科学技術大学院大学",
+ new String[] { "これ", "は", "本", "で", "は", "ない" },
+ new int[] { 0, 2, 3, 4, 5, 6 },
+ new int[] { 2, 3, 4, 5, 6, 8 }
+ );
+ }
+ */
+
/** Tests that sentence offset is incorporated into the resulting offsets */
public void testTwoSentences() throws Exception {
- assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。",
+ /*
+ //TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
+ TokenStream ts = analyzer.tokenStream("foo", new StringReader("�<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"));
+ ts.reset();
+ CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+ while(ts.incrementToken()) {
+ System.out.println(" " + termAtt.toString());
+ }
+ System.out.println("DONE PARSE\n\n");
+ */
+
+ assertAnalyzesTo(analyzerNoPunct, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。",
new String[] { "魔女", "狩", "大将", "マシュー", "ホプキンス", "魔女", "狩", "大将", "マシュー", "ホプキンス" },
new int[] { 0, 2, 3, 5, 10, 17, 19, 20, 22, 27 },
new int[] { 2, 3, 5, 9, 15, 19, 20, 22, 26, 32 }
@@ -104,6 +188,7 @@ public class TestKuromojiTokenizer exten
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+ checkRandomData(random, analyzerNoPunct, 10000*RANDOM_MULTIPLIER);
}
public void testLargeDocReliability() throws Exception {
@@ -126,6 +211,9 @@ public class TestKuromojiTokenizer exten
public void testSurrogates2() throws IOException {
int numIterations = atLeast(10000);
for (int i = 0; i < numIterations; i++) {
+ if (VERBOSE) {
+ System.out.println("\nTEST: iter=" + i);
+ }
String s = _TestUtil.randomUnicodeString(random, 100);
TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
@@ -135,22 +223,410 @@ public class TestKuromojiTokenizer exten
}
}
}
+
+ public void testOnlyPunctuation() throws IOException {
+ TokenStream ts = analyzerNoPunct.tokenStream("foo", new StringReader("。、。。"));
+ ts.reset();
+ assertFalse(ts.incrementToken());
+ ts.end();
+ }
+
+ public void testOnlyPunctuationExtended() throws IOException {
+ TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", new StringReader("......"));
+ ts.reset();
+ assertFalse(ts.incrementToken());
+ ts.end();
+ }
// note: test is kinda silly since kuromoji emits punctuation tokens.
// but, when/if we filter these out it will be useful.
public void testEnd() throws Exception {
- assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("これは本ではない")),
+ assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない")),
new String[] { "これ", "は", "本", "で", "は", "ない" },
new int[] { 0, 2, 3, 4, 5, 6 },
new int[] { 2, 3, 4, 5, 6, 8 },
new Integer(8)
);
-
- assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("これは本ではない    ")),
+
+ assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない    ")),
new String[] { "これ", "は", "本", "で", "は", "ない" },
new int[] { 0, 2, 3, 4, 5, 6, 8 },
new int[] { 2, 3, 4, 5, 6, 8, 9 },
new Integer(12)
);
}
+
+ public void testUserDict() throws Exception {
+ // Not a great test because w/o userdict.txt the
+ // segmentation is the same:
+ assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("関西国際空港に行った")),
+ new String[] { "関西", "国際", "空港", "に", "行っ", "た" },
+ new int[] { 0, 2, 4, 6, 7, 9 },
+ new int[] { 2, 4, 6, 7, 9, 10 },
+ new Integer(10)
+ );
+ }
+
+ public void testUserDict2() throws Exception {
+ // Better test: w/o userdict the segmentation is different:
+ assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("朝青龍")),
+ new String[] { "朝青龍" },
+ new int[] { 0 },
+ new int[] { 3 },
+ new Integer(3)
+ );
+ }
+
+ public void testUserDict3() throws Exception {
+ // Test entry that breaks into multiple tokens:
+ assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcd")),
+ new String[] { "a", "b", "cd" },
+ new int[] { 0, 1, 2 },
+ new int[] { 1, 2, 4 },
+ new Integer(4)
+ );
+ }
+
+ // HMM: fails (segments as a/b/cd/efghij)... because the
+ // two paths have exactly equal paths (1 KNOWN + 1
+ // UNKNOWN) and we don't seem to favor longer KNOWN /
+ // shorter UNKNOWN matches:
+
+ /*
+ public void testUserDict4() throws Exception {
+ // Test entry that has another entry as prefix
+ assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcdefghij")),
+ new String[] { "ab", "cd", "efg", "hij" },
+ new int[] { 0, 2, 4, 7 },
+ new int[] { 2, 4, 7, 10 },
+ new Integer(10)
+ );
+ }
+ */
+
+ public void testSegmentation() throws Exception {
+ // Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
+ // String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
+ // String[] surfaceForms = {
+ // "ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
+ // "スペース", "ステーション", "に", "行き", "ます", "。",
+ // "うたがわしい", "。"
+ // };
+ String input = "スペースステーションに行きます。うたがわしい。";
+ String[] surfaceForms = {
+ "スペース", "ステーション", "に", "行き", "ます", "。",
+ "うたがわしい", "。"
+ };
+ assertAnalyzesTo(analyzer,
+ input,
+ surfaceForms);
+ }
+
+ public void testLatticeToDot() throws Exception {
+ final GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.getInstance());
+ final Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ KuromojiTokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
+ tokenizer.setGraphvizFormatter(gv2);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+
+ String input = "ã¹ãã¼ã¹ã¹ãã¼ã·ã§ã³ã«è¡ãã¾ããããããããã";
+ String[] surfaceForms = {
+ "ã¹ãã¼ã¹", "ã¹ãã¼ã·ã§ã³", "ã«", "è¡ã", "ã¾ã", "ã",
+ "ãããããã", "ã"
+ };
+ assertAnalyzesTo(analyzer,
+ input,
+ surfaceForms);
+
+ assertTrue(gv2.finish().indexOf("22.0") != -1);
+ }
+
+ private void assertReadings(String input, String... readings) throws IOException {
+ TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
+ ts.reset();
+ for(String reading : readings) {
+ assertTrue(ts.incrementToken());
+ assertEquals(reading, readingAtt.getReading());
+ }
+ assertFalse(ts.incrementToken());
+ ts.end();
+ }
+
+ private void assertPronunciations(String input, String... pronunciations) throws IOException {
+ TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
+ ts.reset();
+ for(String pronunciation : pronunciations) {
+ assertTrue(ts.incrementToken());
+ assertEquals(pronunciation, readingAtt.getPronunciation());
+ }
+ assertFalse(ts.incrementToken());
+ ts.end();
+ }
+
+ private void assertBaseForms(String input, String... baseForms) throws IOException {
+ TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ BaseFormAttribute baseFormAtt = ts.addAttribute(BaseFormAttribute.class);
+ ts.reset();
+ for(String baseForm : baseForms) {
+ assertTrue(ts.incrementToken());
+ assertEquals(baseForm, baseFormAtt.getBaseForm());
+ }
+ assertFalse(ts.incrementToken());
+ ts.end();
+ }
+
+ private void assertInflectionTypes(String input, String... inflectionTypes) throws IOException {
+ TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
+ ts.reset();
+ for(String inflectionType : inflectionTypes) {
+ assertTrue(ts.incrementToken());
+ assertEquals(inflectionType, inflectionAtt.getInflectionType());
+ }
+ assertFalse(ts.incrementToken());
+ ts.end();
+ }
+
+ private void assertInflectionForms(String input, String... inflectionForms) throws IOException {
+ TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
+ ts.reset();
+ for(String inflectionForm : inflectionForms) {
+ assertTrue(ts.incrementToken());
+ assertEquals(inflectionForm, inflectionAtt.getInflectionForm());
+ }
+ assertFalse(ts.incrementToken());
+ ts.end();
+ }
+
+ private void assertPartsOfSpeech(String input, String... partsOfSpeech) throws IOException {
+ TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ PartOfSpeechAttribute partOfSpeechAtt = ts.addAttribute(PartOfSpeechAttribute.class);
+ ts.reset();
+ for(String partOfSpeech : partsOfSpeech) {
+ assertTrue(ts.incrementToken());
+ assertEquals(partOfSpeech, partOfSpeechAtt.getPartOfSpeech());
+ }
+ assertFalse(ts.incrementToken());
+ ts.end();
+ }
+
+ public void testReadings() throws Exception {
+ assertReadings("寿å¸ãé£ã¹ããã§ãã",
+ "ã¹ã·",
+ "ã¬",
+ "ã¿ã",
+ "ã¿ã¤",
+ "ãã¹",
+ "ã");
+ }
+
+ public void testReadings2() throws Exception {
+ assertReadings("å¤ãã®å¦çã試é¨ã«è½ã¡ãã",
+ "ãªãªã¯",
+ "ã",
+ "ã¬ã¯ã»ã¤",
+ "ã¬",
+ "ã·ã±ã³",
+ "ã",
+ "ãªã",
+ "ã¿",
+ "ã");
+ }
+
+ public void testPronunciations() throws Exception {
+ assertPronunciations("寿å¸ãé£ã¹ããã§ãã",
+ "ã¹ã·",
+ "ã¬",
+ "ã¿ã",
+ "ã¿ã¤",
+ "ãã¹",
+ "ã");
+ }
+
+ public void testPronunciations2() throws Exception {
+ // pronunciation differs from reading here
+ assertPronunciations("å¤ãã®å¦çã試é¨ã«è½ã¡ãã",
+ "ãªã¼ã¯",
+ "ã",
+ "ã¬ã¯ã»ã¤",
+ "ã¬",
+ "ã·ã±ã³",
+ "ã",
+ "ãªã",
+ "ã¿",
+ "ã");
+ }
+
+ public void testBasicForms() throws Exception {
+ assertBaseForms("ããã¯ã¾ã å®é¨æ®µéã«ããã¾ãã",
+ null,
+ null,
+ null,
+ null,
+ null,
+ null,
+ "ãã",
+ null,
+ null);
+ }
+
+ public void testInflectionTypes() throws Exception {
+ assertInflectionTypes("ããã¯ã¾ã å®é¨æ®µéã«ããã¾ãã",
+ null,
+ null,
+ null,
+ null,
+ null,
+ null,
+ "äºæ®µã»ã©è¡",
+ "ç¹æ®ã»ãã¹",
+ null);
+ }
+
+ public void testInflectionForms() throws Exception {
+ assertInflectionForms("ããã¯ã¾ã å®é¨æ®µéã«ããã¾ãã",
+ null,
+ null,
+ null,
+ null,
+ null,
+ null,
+ "é£ç¨å½¢",
+ "åºæ¬å½¢",
+ null);
+ }
+
+ public void testPartOfSpeech() throws Exception {
+ assertPartsOfSpeech("ããã¯ã¾ã å®é¨æ®µéã«ããã¾ãã",
+ "åè©-代åè©-ä¸è¬",
+ "å©è©-ä¿å©è©",
+ "å¯è©-å©è©é¡æ¥ç¶",
+ "åè©-ãµå¤æ¥ç¶",
+ "åè©-ä¸è¬",
+ "å©è©-æ ¼å©è©-ä¸è¬",
+ "åè©-èªç«",
+ "å©åè©",
+ "è¨å·-å¥ç¹");
+ }
+
+ // TODO: the next 2 tests are no longer using the first/last word ids, maybe lookup the words and fix?
+ // do we have a possibility to actually lookup the first and last word from dictionary?
+ public void testYabottai() throws Exception {
+ assertAnalyzesTo(analyzer, "やぼったい",
+ new String[] {"やぼったい"});
+ }
+
+ public void testTsukitosha() throws Exception {
+ assertAnalyzesTo(analyzer, "突き通しゃ",
+ new String[] {"突き通しゃ"});
+ }
+
+ public void testBocchan() throws Exception {
+ doTestBocchan(1);
+ }
+
+ @Nightly
+ public void testBocchanBig() throws Exception {
+ doTestBocchan(100);
+ }
+
+ /*
+ public void testWikipedia() throws Exception {
+ final FileInputStream fis = new FileInputStream("/q/lucene/jawiki-20120220-pages-articles.xml");
+ final Reader r = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
+
+ final long startTimeNS = System.nanoTime();
+ boolean done = false;
+ long compoundCount = 0;
+ long nonCompoundCount = 0;
+ long netOffset = 0;
+ while (!done) {
+ final TokenStream ts = analyzer.tokenStream("ignored", r);
+ ts.reset();
+ final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
+ final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
+ int count = 0;
+ while (true) {
+ if (!ts.incrementToken()) {
+ done = true;
+ break;
+ }
+ count++;
+ if (posIncAtt.getPositionIncrement() == 0) {
+ compoundCount++;
+ } else {
+ nonCompoundCount++;
+ if (nonCompoundCount % 1000000 == 0) {
+ System.out.println(String.format("%.2f msec [pos=%d, %d, %d]",
+ (System.nanoTime()-startTimeNS)/1000000.0,
+ netOffset + offsetAtt.startOffset(),
+ nonCompoundCount,
+ compoundCount));
+ }
+ }
+ if (count == 100000000) {
+ System.out.println(" again...");
+ break;
+ }
+ }
+ ts.end();
+ netOffset += offsetAtt.endOffset();
+ }
+ System.out.println("compoundCount=" + compoundCount + " nonCompoundCount=" + nonCompoundCount);
+ r.close();
+ }
+ */
+
+
+ private void doTestBocchan(int numIterations) throws Exception {
+ LineNumberReader reader = new LineNumberReader(new InputStreamReader(
+ this.getClass().getResourceAsStream("bocchan.utf-8")));
+ String line = reader.readLine();
+ reader.close();
+
+ if (VERBOSE) {
+ System.out.println("Test for Bocchan without pre-splitting sentences");
+ }
+
+ /*
+ if (numIterations > 1) {
+ // warmup
+ for (int i = 0; i < numIterations; i++) {
+ final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
+ ts.reset();
+ while(ts.incrementToken());
+ }
+ }
+ */
+
+ long totalStart = System.currentTimeMillis();
+ for (int i = 0; i < numIterations; i++) {
+ final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
+ ts.reset();
+ while(ts.incrementToken());
+ }
+ String[] sentences = line.split("、|。");
+ if (VERBOSE) {
+ System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
+ System.out.println("Test for Bocchan with pre-splitting sentences (" + sentences.length + " sentences)");
+ }
+ totalStart = System.currentTimeMillis();
+ for (int i = 0; i < numIterations; i++) {
+ for (String sentence: sentences) {
+ final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(sentence));
+ ts.reset();
+ while(ts.incrementToken());
+ }
+ }
+ if (VERBOSE) {
+ System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
+ }
+ }
}
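For readers following the change: every analyzer in this test now goes through the tokenizer's new 4-argument constructor. A minimal sketch of that setup, collapsing the four anonymous analyzers above into one hypothetical helper class (the class itself is not part of this commit):

import java.io.Reader;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;

// One analyzer per (user dictionary, discardPunctuation, mode) combination.
class KuromojiModeAnalyzer extends ReusableAnalyzerBase {
  private final UserDictionary userDict;     // may be null
  private final boolean discardPunctuation;
  private final Mode mode;                   // NORMAL, SEARCH or EXTENDED

  KuromojiModeAnalyzer(UserDictionary userDict, boolean discardPunctuation, Mode mode) {
    this.userDict = userDict;
    this.discardPunctuation = discardPunctuation;
    this.mode = mode;
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // new constructor: (reader, user dictionary or null, discardPunctuation, mode)
    Tokenizer tokenizer = new KuromojiTokenizer(reader, userDict, discardPunctuation, mode);
    return new TokenStreamComponents(tokenizer, tokenizer);
  }
}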
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java Sat Mar 10 14:54:47 2012
@@ -28,20 +28,19 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.util.IOUtils;
public class TestSearchMode extends BaseTokenStreamTestCase {
private final static String SEGMENTATION_FILENAME = "search-segmentation-tests.txt";
- private final Segmenter segmenter = new Segmenter(Mode.SEARCH);
private final Analyzer analyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
+ Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
-
+
/** Test search mode segmentation */
public void testSearchSegmentation() throws IOException {
InputStream is = TestSearchMode.class.getResourceAsStream(SEGMENTATION_FILENAME);
@@ -64,7 +63,18 @@ public class TestSearchMode extends Base
String[] fields = line.split("\t", 2);
String sourceText = fields[0];
String[] expectedTokens = fields[1].split("\\s+");
- assertAnalyzesTo(analyzer, sourceText, expectedTokens);
+ int[] expectedPosIncrs = new int[expectedTokens.length];
+ int[] expectedPosLengths = new int[expectedTokens.length];
+ for(int tokIDX=0;tokIDX<expectedTokens.length;tokIDX++) {
+ if (expectedTokens[tokIDX].endsWith("/0")) {
+ expectedTokens[tokIDX] = expectedTokens[tokIDX].replace("/0", "");
+ expectedPosLengths[tokIDX] = expectedTokens.length-1;
+ } else {
+ expectedPosIncrs[tokIDX] = 1;
+ expectedPosLengths[tokIDX] = 1;
+ }
+ }
+ assertAnalyzesTo(analyzer, sourceText, expectedTokens, expectedPosIncrs);
}
} finally {
is.close();
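The loop above decodes a new convention in the fixture file: a token written as "<tok>/0" is the original compound, kept in the stream with position increment 0 so that it stacks on the first of its decompounded parts. A standalone sketch of that parsing, assuming the same tab-then-spaces line format (the method name is illustrative):

// Parse one fixture line: "<source>\t<tok> <tok>/0 <tok> ..."
static void parseFixtureLine(String line) {
  String[] fields = line.split("\t", 2);
  String sourceText = fields[0];
  String[] tokens = fields[1].split("\\s+");
  int[] posIncrs = new int[tokens.length];
  for (int i = 0; i < tokens.length; i++) {
    if (tokens[i].endsWith("/0")) {
      // compound token: strip the marker, stack on the previous position
      tokens[i] = tokens[i].substring(0, tokens[i].length() - 2);
      posIncrs[i] = 0;
    } else {
      posIncrs[i] = 1;
    }
  }
  // hand (sourceText, tokens, posIncrs) to assertAnalyzesTo, as above
}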
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java Sat Mar 10 14:54:47 2012
@@ -23,29 +23,17 @@ import java.io.InputStreamReader;
import java.io.Reader;
import java.io.IOException;
-import org.apache.lucene.analysis.kuromoji.SegmenterTest;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.TestKuromojiTokenizer;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
public class UserDictionaryTest extends LuceneTestCase {
- private UserDictionary readDict() throws IOException {
- InputStream is = SegmenterTest.class.getResourceAsStream("userdict.txt");
- if (is == null)
- throw new FileNotFoundException("Cannot find userdict.txt in test classpath!");
- try {
- Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
- return new UserDictionary(reader);
- } finally {
- is.close();
- }
- }
-
@Test
public void testLookup() throws IOException {
- UserDictionary dictionary = readDict();
+ UserDictionary dictionary = TestKuromojiTokenizer.readDict();
String s = "é¢è¥¿å½é空港ã«è¡ã£ã";
int[][] dictionaryEntryResult = dictionary.lookup(s.toCharArray(), 0, s.length());
// Length should be three 関西, 国際, 空港
@@ -69,7 +57,7 @@ public class UserDictionaryTest extends
@Test
public void testReadings() throws IOException {
- UserDictionary dictionary = readDict();
+ UserDictionary dictionary = TestKuromojiTokenizer.readDict();
int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
assertEquals(3, result.length);
int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
@@ -83,7 +71,7 @@ public class UserDictionaryTest extends
@Test
public void testPartOfSpeech() throws IOException {
- UserDictionary dictionary = readDict();
+ UserDictionary dictionary = TestKuromojiTokenizer.readDict();
int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
assertEquals(3, result.length);
int wordIdKeizai = result[1][0]; // wordId of 経済 in 日本経済新聞
@@ -92,7 +80,7 @@ public class UserDictionaryTest extends
@Test
public void testRead() throws IOException {
- UserDictionary dictionary = readDict();
+ UserDictionary dictionary = TestKuromojiTokenizer.readDict();
assertNotNull(dictionary);
}
}
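These tests rely only on the number of matches and on the word id at index 0 of each int[] returned by lookup(). A hedged sketch of iterating matches under that assumption (the meaning of the remaining array elements is not shown in this patch):

import org.apache.lucene.analysis.kuromoji.TestKuromojiTokenizer;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;

public class UserDictLookupDemo {
  public static void main(String[] args) throws Exception {
    UserDictionary dict = TestKuromojiTokenizer.readDict();
    String s = "関西国際空港に行った";
    int[][] matches = dict.lookup(s.toCharArray(), 0, s.length());
    for (int[] match : matches) {
      // index 0 is the word id; the tests use it to look up
      // readings and part-of-speech for the matched entry
      System.out.println("wordId=" + match[0]);
    }
  }
}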
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt Sat Mar 10 14:54:47 2012
@@ -25,43 +25,45 @@
##
# Kansai Internationl Airport
-関西国際空港 関西 国際 空港
+関西国際空港 関西 関西国際空港/0 国際 空港
# Narita Airport
-成田空港 成田 空港
+成田空港 成田 成田空港/0 空港
# Haneda Airport
-羽田空港 羽田 空港
+羽田空港 羽田 羽田空港/0 空港
# Nara Institute of Science and Technology
-奈良先端科学技術大学院大学 奈良 先端 科学 技術 大学院 大学
+奈良先端科学技術大学院大学 奈良 奈良先端科学技術大学院大学/0 先端 科学 技術 大学院 大学
# Tokyo University
-東京大学 東京 大学
+東京大学 東京 東京大学/0 大学
# Kyoto University
-京都大学 京都 大学
+京都大学 京都 京都大学/0 大学
+
+# NOTE: differs from non-compound mode:
# Kyoto University Baseball Club
-京都大学硬式野球部 京都 大学 硬式 野球 部
+京都大学硬式野球部 京都大 学 硬式 野球 部
##
## Katakana titles
##
# Senior Software Engineer
-シニアソフトウェアエンジニア シニア ソフトウェア エンジニア
+シニアソフトウェアエンジニア シニア シニアソフトウェアエンジニア/0 ソフトウェア エンジニア
# Software Engineer
ソフトウェアエンジニア ソフトウェア エンジニア
# Senior Project Manager
-シニアプロジェクトマネジャー シニア プロジェクト マネジャー
+シニアプロジェクトマネジャー シニア シニアプロジェクトマネジャー/0 プロジェクト マネジャー
# Project Manager
プロジェクトマネジャー プロジェクト マネジャー
# Senior Sales Engineer
-シニアセールスエンジニア シニア セールス エンジニア
+シニアセールスエンジニア シニア シニアセールスエンジニア/0 セールス エンジニア
# System Architect
-システムアーキテクト システム アーキテクト
+システムアーキテクト システム システムアーキテクト/0 アーキテクト
# Senior System Architect
-シニアシステムアーキテクト シニア システム アーキテクト
+シニアシステムアーキテクト シニア シニアシステムアーキテクト/0 システム アーキテクト
# System Administrator
システムアドミニストレータ システム アドミニストレータ
-システムアドミニストレーター システム アドミニストレーター
+システムアドミニストレーター システム システムアドミニストレーター/0 アドミニストレーター
# Senior System Administrator
-シニアシステムアドミニストレーター シニア システム アドミニストレーター
+シニアシステムアドミニストレーター シニア シニアシステムアドミニストレーター/0 システム アドミニストレーター
##
## Company names (several are fictitious)
@@ -70,25 +72,25 @@
# SoftBank Mobile
ソフトバンクモバイル ソフトバンク モバイル
# Alpine Materials
-アルパインマテリアルズ アルパイン マテリアルズ
+アルパインマテリアルズ アルパイン アルパインマテリアルズ/0 マテリアルズ
# Sapporo Holdings
サッポロホールディングス サッポロ ホールディングス
# Yamada Corporation
-ヤマダコーポレーション ヤマダ コーポレーション
+ヤマダコーポレーション ヤマダ ヤマダコーポレーション/0 コーポレーション
# Canon Semiconductor equipement NOTE: Semiconductor becomes semi + conductor
-キヤノンセミコンダクターエクィップメント キヤノン セミ コンダクター エクィップメント
+キヤノンセミコンダクターエクィップメント キヤノン キヤノンセミコンダクターエクィップメント/0 セミ コンダクター エクィップメント
# Orental Chain
-オリエンタルチエン オリエンタル チエン
+オリエンタルチエン オリエンタル オリエンタルチエン/0 チエン
# Ally Projects Japan NOTE: Becomes one token as プロジェクト is not in IPADIC
アーリープロジェクトジャパン アーリープロジェクトジャパン
# Peter Pan Corporation
-ピーターパンコーポレーション ピーター パン コーポレーション
+ピーターパンコーポレーション ピーター ピーターパンコーポレーション/0 パン コーポレーション
# AIM Create
エイムクリエイツ エイム クリエイツ
# Mars Engineering
-マースエンジニアリング マース エンジニアリング
+マースエンジニアリング マース マースエンジニアリング/0 エンジニアリング
# Fuji Protein Technology
-フジプロテインテクノロジー フジ プロテイン テクノロジー
+フジプロテインテクノロジー フジ フジプロテインテクノロジー/0 プロテイン テクノロジー
##
## Person names
@@ -100,7 +102,7 @@
スティーブジョブズ スティーブ ジョブズ
# Harry Potter NOTE: Becomes one token (short word)
ハリーポッター ハリーポッター
-# Bill Gates NOTE: Becomes one token (short work)
+# Bill Gates NOTE: Becomes one token (short word)
ビルゲイツ ビルゲイツ
# Sean Connery NOTE: Becomes one token (okay)
ショーンコネリー ショーンコネリー
@@ -133,8 +135,8 @@
##
# JT Engineering NOTE: Becomes J Tien ginia ring (substrings are in IPADIC)
-ジェイティエンジニアリング ジェイ ティエン ジニア リング
+ジェイティエンジニアリング ジェイ ジェイティエンジニアリング/0 ティエン ジニア リング
# Anchovy pasta NOTE: Become Anch yvipasta
-アンチョビパスタ アンチ ョビパスタ
+アンチョビパスタ アンチ アンチョビパスタ/0 ョビパスタ
# Surprise gift NOTE: Becomes one token (surprise not in IPADIC)
サプライズギフト サプライズギフト
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt Sat Mar 10 14:54:47 2012
@@ -4,3 +4,7 @@
# Custom reading for sumo wrestler
朝青龍,朝青龍,アサショウリュウ,カスタム人名
+
+# Silly entry:
+abcd,a b cd,foo1 foo2 foo3,bar
+abcdefg,ab cd efg,foo1 foo2 foo4,bar
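Each row is surface,segmentation,readings,part-of-speech, with the segmentation and readings columns space-separated; the new "silly" entries deliberately map one surface form onto several tokens. A sketch of building a UserDictionary from equivalent inline rows instead of the classpath resource (mirrors readDict() in TestKuromojiTokenizer; rows copied from this fixture):

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;

public class InlineUserDict {
  public static void main(String[] args) throws Exception {
    // row format: surface,segmentation,readings,part-of-speech
    Reader r = new StringReader(
        "関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞\n" +
        "abcd,a b cd,foo1 foo2 foo3,bar\n");
    UserDictionary userDict = new UserDictionary(r);
    System.out.println("loaded: " + userDict);
  }
}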
Modified: lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java Sat Mar 10 14:54:47 2012
@@ -54,9 +54,10 @@ public class PositionIncrementAttributeI
* @param positionIncrement the distance from the prior term
*/
public void setPositionIncrement(int positionIncrement) {
- if (positionIncrement < 0)
+ if (positionIncrement < 0) {
throw new IllegalArgumentException
- ("Increment must be zero or greater: " + positionIncrement);
+ ("Increment must be zero or greater: got " + positionIncrement);
+ }
this.positionIncrement = positionIncrement;
}
@@ -79,7 +80,8 @@ public class PositionIncrementAttributeI
}
if (other instanceof PositionIncrementAttributeImpl) {
- return positionIncrement == ((PositionIncrementAttributeImpl) other).positionIncrement;
+ PositionIncrementAttributeImpl _other = (PositionIncrementAttributeImpl) other;
+ return positionIncrement == _other.positionIncrement;
}
return false;
@@ -95,5 +97,4 @@ public class PositionIncrementAttributeI
PositionIncrementAttribute t = (PositionIncrementAttribute) target;
t.setPositionIncrement(positionIncrement);
}
-
}
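No behavior changes here: an increment of zero stays legal (it is exactly what the new /0 compound tokens rely on) and negative values still throw, now reporting the offending value. A quick sketch:

import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttributeImpl;

public class PosIncDemo {
  public static void main(String[] args) {
    PositionIncrementAttributeImpl att = new PositionIncrementAttributeImpl();
    att.setPositionIncrement(0);   // legal: token stacked on the previous position
    try {
      att.setPositionIncrement(-1);
    } catch (IllegalArgumentException e) {
      System.out.println(e.getMessage()); // "Increment must be zero or greater: got -1"
    }
  }
}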
Modified: lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/util/fst/FST.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/util/fst/FST.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/util/fst/FST.java Sat Mar 10 14:54:47 2012
@@ -840,6 +840,7 @@ public final class FST<T> {
}
public Arc<T> readFirstRealTargetArc(int node, Arc<T> arc, final BytesReader in) throws IOException {
+ assert in.bytes == bytes;
final int address = getNodeAddress(node);
in.pos = address;
//System.out.println(" readFirstRealTargtArc address="
@@ -936,6 +937,7 @@ public final class FST<T> {
/** Never returns null, but you should never call this if
* arc.isLast() is true. */
public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
+ assert in.bytes == bytes;
// TODO: can't assert this because we call from readFirstArc
// assert !flag(arc.flags, BIT_LAST_ARC);
@@ -1019,6 +1021,7 @@ public final class FST<T> {
* This returns null if the arc was not found, else the incoming arc. */
public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException {
assert cachedRootArcs != null;
+ assert in.bytes == bytes;
if (labelToMatch == END_LABEL) {
if (follow.isFinal()) {
@@ -1225,17 +1228,20 @@ public final class FST<T> {
/** Expert */
public static abstract class BytesReader extends DataInput {
- int pos;
+ protected int pos;
+ protected final byte[] bytes;
+ protected BytesReader(byte[] bytes, int pos) {
+ this.bytes = bytes;
+ this.pos = pos;
+ }
abstract void skip(int byteCount);
abstract void skip(int base, int byteCount);
}
final static class ReverseBytesReader extends BytesReader {
- final byte[] bytes;
public ReverseBytesReader(byte[] bytes, int pos) {
- this.bytes = bytes;
- this.pos = pos;
+ super(bytes, pos);
}
@Override
@@ -1262,11 +1268,9 @@ public final class FST<T> {
// TODO: can we use just ByteArrayDataInput...? need to
// add a .skipBytes to DataInput.. hmm and .setPosition
final static class ForwardBytesReader extends BytesReader {
- final byte[] bytes;
public ForwardBytesReader(byte[] bytes, int pos) {
- this.bytes = bytes;
- this.pos = pos;
+ super(bytes, pos);
}
@Override
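The point of hoisting pos and bytes into the abstract BytesReader is that FST can now assert that a caller-supplied reader was created over this FST's own byte array, catching readers accidentally shared across FSTs. A simplified sketch of the pattern (not the real FST API):

// An owner that only accepts readers created over its own byte[].
class ByteStore {
  private final byte[] bytes = new byte[128];

  abstract static class BytesReader {
    protected int pos;
    protected final byte[] bytes;
    protected BytesReader(byte[] bytes, int pos) {
      this.bytes = bytes;
      this.pos = pos;
    }
  }

  static final class ForwardReader extends BytesReader {
    ForwardReader(byte[] bytes, int pos) {
      super(bytes, pos);
    }
  }

  BytesReader getReader() {
    return new ForwardReader(bytes, 0);   // tied to this store's bytes
  }

  void read(BytesReader in) {
    // same idea as the new "assert in.bytes == bytes" above
    assert in.bytes == bytes : "reader belongs to a different store";
    // ... decode starting at in.pos ...
  }
}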
Modified: lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Sat Mar 10 14:54:47 2012
@@ -17,13 +17,18 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
-import java.io.IOException;
+import java.io.StringWriter;
+import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
-
+
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
@@ -83,7 +88,7 @@ public abstract class BaseTokenStreamTes
}
}
- public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
+ public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
assertNotNull(output);
CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
@@ -107,6 +112,12 @@ public abstract class BaseTokenStreamTes
assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
}
+
+ PositionLengthAttribute posLengthAtt = null;
+ if (posLengths != null) {
+ assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
+ posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
+ }
ts.reset();
for (int i = 0; i < output.length; i++) {
@@ -116,6 +127,7 @@ public abstract class BaseTokenStreamTes
if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243);
if (typeAtt != null) typeAtt.setType("bogusType");
if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
+ if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
assertTrue("token "+i+" does not exist", ts.incrementToken());
@@ -130,6 +142,8 @@ public abstract class BaseTokenStreamTes
assertEquals("type "+i, types[i], typeAtt.type());
if (posIncrements != null)
assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement());
+ if (posLengths != null)
+ assertEquals("posLength "+i, posLengths[i], posLengthAtt.getPositionLength());
// we can enforce some basic things about a few attributes even if the caller doesn't check:
if (offsetAtt != null) {
@@ -138,14 +152,18 @@ public abstract class BaseTokenStreamTes
assertTrue("endOffset must be >= startOffset", offsetAtt.endOffset() >= offsetAtt.startOffset());
if (finalOffset != null) {
assertTrue("startOffset must be <= finalOffset", offsetAtt.startOffset() <= finalOffset.intValue());
- assertTrue("endOffset must be <= finalOffset", offsetAtt.endOffset() <= finalOffset.intValue());
+ assertTrue("endOffset must be <= finalOffset: got endOffset=" + offsetAtt.endOffset() + " vs finalOffset=" + finalOffset.intValue(),
+ offsetAtt.endOffset() <= finalOffset.intValue());
}
}
if (posIncrAtt != null) {
assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
}
+ if (posLengthAtt != null) {
+ assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
+ }
}
- assertFalse("end of stream", ts.incrementToken());
+ assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
ts.end();
if (finalOffset != null)
assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
@@ -155,65 +173,81 @@ public abstract class BaseTokenStreamTes
ts.close();
}
+ public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
+ }
+
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
- assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null);
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
- assertTokenStreamContents(ts, output, null, null, null, null, null);
+ assertTokenStreamContents(ts, output, null, null, null, null, null, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException {
- assertTokenStreamContents(ts, output, null, null, types, null, null);
+ assertTokenStreamContents(ts, output, null, null, types, null, null, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws IOException {
- assertTokenStreamContents(ts, output, null, null, null, posIncrements, null);
+ assertTokenStreamContents(ts, output, null, null, null, posIncrements, null, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
- assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null);
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], Integer finalOffset) throws IOException {
- assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, finalOffset);
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, finalOffset);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
- assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null);
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, Integer finalOffset) throws IOException {
- assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, finalOffset);
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, finalOffset);
+ }
+
+ public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, int[] posLengths, Integer finalOffset) throws IOException {
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, posLengths, finalOffset);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
- assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
+ assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
+ }
+
+ public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
+ assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
- assertAnalyzesTo(a, input, output, null, null, null, null);
+ assertAnalyzesTo(a, input, output, null, null, null, null, null);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws IOException {
- assertAnalyzesTo(a, input, output, null, null, types, null);
+ assertAnalyzesTo(a, input, output, null, null, types, null, null);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
- assertAnalyzesTo(a, input, output, null, null, null, posIncrements);
+ assertAnalyzesTo(a, input, output, null, null, null, posIncrements, null);
+ }
+
+ public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, int[] posIncrements, int[] posLengths) throws IOException {
+ assertAnalyzesTo(a, input, output, null, null, null, posIncrements, posLengths);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
- assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null);
+ assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null, null);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
- assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements);
+ assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements, null);
}
public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
- assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
+ assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
}
public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws IOException {
@@ -326,7 +360,7 @@ public abstract class BaseTokenStreamTes
}
if (VERBOSE) {
- System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
+ System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
}
int remainder = random.nextInt(10);
@@ -336,10 +370,12 @@ public abstract class BaseTokenStreamTes
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
+ PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
List<String> tokens = new ArrayList<String>();
List<String> types = new ArrayList<String>();
List<Integer> positions = new ArrayList<Integer>();
+ List<Integer> positionLengths = new ArrayList<Integer>();
List<Integer> startOffsets = new ArrayList<Integer>();
List<Integer> endOffsets = new ArrayList<Integer>();
ts.reset();
@@ -347,6 +383,7 @@ public abstract class BaseTokenStreamTes
tokens.add(termAtt.toString());
if (typeAtt != null) types.add(typeAtt.type());
if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
+ if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
if (offsetAtt != null) {
startOffsets.add(offsetAtt.startOffset());
endOffsets.add(offsetAtt.endOffset());
@@ -357,11 +394,21 @@ public abstract class BaseTokenStreamTes
// verify reusing is "reproducable" and also get the normal tokenstream sanity checks
if (!tokens.isEmpty()) {
if (VERBOSE) {
- System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
+ System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
}
reader = new StringReader(text);
ts = a.reusableTokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
- if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
+ if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+ // offset + pos + posLength + type
+ assertTokenStreamContents(ts,
+ tokens.toArray(new String[tokens.size()]),
+ toIntArray(startOffsets),
+ toIntArray(endOffsets),
+ types.toArray(new String[types.size()]),
+ toIntArray(positions),
+ toIntArray(positionLengths),
+ text.length());
+ } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
@@ -369,7 +416,18 @@ public abstract class BaseTokenStreamTes
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions),
+ null,
text.length());
+ } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+ // offset + pos + posLength
+ assertTokenStreamContents(ts,
+ tokens.toArray(new String[tokens.size()]),
+ toIntArray(startOffsets),
+ toIntArray(endOffsets),
+ null,
+ toIntArray(positions),
+ toIntArray(positionLengths),
+ text.length());
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
assertTokenStreamContents(ts,
@@ -378,6 +436,7 @@ public abstract class BaseTokenStreamTes
toIntArray(endOffsets),
null,
toIntArray(positions),
+ null,
text.length());
} else if (offsetAtt != null) {
// offset
@@ -387,6 +446,7 @@ public abstract class BaseTokenStreamTes
toIntArray(endOffsets),
null,
null,
+ null,
text.length());
} else {
// terms only
@@ -396,6 +456,22 @@ public abstract class BaseTokenStreamTes
}
}
}
+
+ protected String toDot(Analyzer a, String inputText) throws IOException {
+ final StringWriter sw = new StringWriter();
+ final TokenStream ts = a.tokenStream("field", new StringReader(inputText));
+ ts.reset();
+ new TokenStreamToDot(inputText, ts, new PrintWriter(sw)).toDot();
+ return sw.toString();
+ }
+
+ protected void toDotFile(Analyzer a, String inputText, String localFileName) throws IOException {
+ Writer w = new OutputStreamWriter(new FileOutputStream(localFileName), "UTF-8");
+ final TokenStream ts = a.tokenStream("field", new StringReader(inputText));
+ ts.reset();
+ new TokenStreamToDot(inputText, ts, new PrintWriter(w)).toDot();
+ w.close();
+ }
static int[] toIntArray(List<Integer> list) {
int ret[] = new int[list.size()];
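The new toDot/toDotFile helpers render a token stream as Graphviz input, which is handy for eyeballing the token graphs that Kuromoji's search mode now produces. Usage from a subclass might look like this (output path illustrative; assumes an analyzer field as in the Kuromoji tests):

// inside a BaseTokenStreamTestCase subclass:
public void testDumpTokenGraph() throws Exception {
  String dot = toDot(analyzer, "シニアソフトウェアエンジニア");
  System.out.println(dot);   // render with e.g.: dot -Tpng kuromoji.dot
  toDotFile(analyzer, "シニアソフトウェアエンジニア", "/tmp/kuromoji.dot");
}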
Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java Sat Mar 10 14:54:47 2012
@@ -28,8 +28,7 @@ import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.util.IOUtils;
import org.apache.solr.analysis.BaseTokenizerFactory;
@@ -88,7 +87,7 @@ public class KuromojiTokenizerFactory ex
//@Override
public Tokenizer create(Reader input) {
- return new KuromojiTokenizer(new Segmenter(userDictionary, mode), input);
+ return new KuromojiTokenizer(input, userDictionary, true, mode);
}
private Mode getMode(Map<String, String> args) {
@@ -96,7 +95,7 @@ public class KuromojiTokenizerFactory ex
if (mode != null) {
return Mode.valueOf(mode.toUpperCase(Locale.ENGLISH));
} else {
- return Segmenter.DEFAULT_MODE;
+ return KuromojiTokenizer.DEFAULT_MODE;
}
}
}
Modified: lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java Sat Mar 10 14:54:47 2012
@@ -50,7 +50,7 @@ public class TestKuromojiTokenizerFactor
factory.inform(new SolrResourceLoader(null, null));
TokenStream ts = factory.create(new StringReader("シニアソフトウェアエンジニア"));
assertTokenStreamContents(ts,
- new String[] { "ã·ãã¢", "ã½ããã¦ã§ã¢", "ã¨ã³ã¸ãã¢" }
+ new String[] { "ã·ãã¢", "ã·ãã¢ã½ããã¦ã§ã¢ã¨ã³ã¸ãã¢", "ã½ããã¦ã§ã¢", "ã¨ã³ã¸ãã¢" }
);
}
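Outside the test, wiring the factory up follows the same steps shown above: init with args, inform with a resource loader, then create. A hedged sketch; the "mode" arg name follows getMode(args) in the factory, and the rest is standard Solr factory plumbing:

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.analysis.KuromojiTokenizerFactory;
import org.apache.solr.core.SolrResourceLoader;

public class KuromojiFactoryDemo {
  public static void main(String[] args) throws Exception {
    Map<String, String> params = new HashMap<String, String>();
    params.put("mode", "search");   // parsed via Mode.valueOf(mode.toUpperCase(...))
    KuromojiTokenizerFactory factory = new KuromojiTokenizerFactory();
    factory.init(params);
    factory.inform(new SolrResourceLoader(null, null));
    Tokenizer tok = factory.create(new StringReader("シニアソフトウェアエンジニア"));
    System.out.println(tok);
  }
}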