You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/08/05 00:42:03 UTC
svn commit: r1369504 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/analysis/
lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/
lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/
Author: rmuir
Date: Sat Aug 4 22:42:03 2012
New Revision: 1369504
URL: http://svn.apache.org/viewvc?rev=1369504&view=rev
Log:
LUCENE-4286: add unibigram option to CJKBigramFilter
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1369504&r1=1369503&r2=1369504&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Sat Aug 4 22:42:03 2012
@@ -46,6 +46,11 @@ New features
int docID), to attempt deletion by docID as long as the provided
reader is an NRT reader, and the segment has not yet been merged
away (Mike McCandless).
+
+* LUCENE-4286: Added option to CJKBigramFilter to always also output
+ unigrams. This can be used for a unigram+bigram approach, or at
+ index-time only for better support of short queries.
+ (Tom Burton-West, Robert Muir)
API Changes
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java?rev=1369504&r1=1369503&r2=1369504&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java Sat Aug 4 22:42:03 2012
@@ -24,6 +24,8 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;
@@ -35,6 +37,12 @@ import org.apache.lucene.util.ArrayUtil;
* {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
* of the CJK scripts are turned into bigrams.
* <p>
+ * By default, when a CJK character has no adjacent characters to form
+ * a bigram, it is output in unigram form. If you want to always output
+ * both unigrams and bigrams, set the <code>outputUnigrams</code>
+ * flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
+ * This can be used for a combined unigram+bigram approach.
+ * <p>
* In all cases, all non-CJK input is passed thru unmodified.
*/
public final class CJKBigramFilter extends TokenFilter {
@@ -67,10 +75,16 @@ public final class CJKBigramFilter exten
private final Object doHiragana;
private final Object doKatakana;
private final Object doHangul;
+
+ // true if we should output unigram tokens always
+ private final boolean outputUnigrams;
+ private boolean ngramState; // false = output unigram, true = output bigram
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
// buffers containing codepoint and offsets in parallel
int buffer[] = new int[8];
@@ -88,23 +102,36 @@ public final class CJKBigramFilter exten
/**
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
- * CJKBigramFilter(HAN | HIRAGANA | KATAKANA | HANGUL)}
+ * CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
*/
public CJKBigramFilter(TokenStream in) {
this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
}
/**
- * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed.
+ * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
+ * CJKBigramFilter(in, flags, false)}
+ */
+ public CJKBigramFilter(TokenStream in, int flags) {
+ this(in, flags, false);
+ }
+
+ /**
+ * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
+ * and whether or not unigrams should also be output.
* @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
* {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
+ * @param outputUnigrams true if unigrams for the selected writing systems should also be output.
+ * when this is false, this is only done when there are no adjacent characters to form
+ * a bigram.
*/
- public CJKBigramFilter(TokenStream in, int flags) {
+ public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) {
super(in);
doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
+ this.outputUnigrams = outputUnigrams;
}
/*
@@ -120,7 +147,24 @@ public final class CJKBigramFilter exten
// case 1: we have multiple remaining codepoints buffered,
// so we can emit a bigram here.
- flushBigram();
+ if (outputUnigrams) {
+
+ // when also outputting unigrams, we output the unigram first,
+ // then rewind back to revisit the bigram.
+ // so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
+ // the logic in hasBufferedUnigram ensures we output the C,
+ // even though it did actually have adjacent CJK characters.
+
+ if (ngramState) {
+ flushBigram();
+ } else {
+ flushUnigram();
+ index--;
+ }
+ ngramState = !ngramState;
+ } else {
+ flushBigram();
+ }
return true;
} else if (doNext()) {
@@ -260,6 +304,11 @@ public final class CJKBigramFilter exten
termAtt.setLength(len2);
offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
typeAtt.setType(DOUBLE_TYPE);
+ // when outputting unigrams, all bigrams are synonyms that span two unigrams
+ if (outputUnigrams) {
+ posIncAtt.setPositionIncrement(0);
+ posLengthAtt.setPositionLength(2);
+ }
index++;
}
@@ -292,7 +341,13 @@ public final class CJKBigramFilter exten
* inputs.
*/
private boolean hasBufferedUnigram() {
- return bufferLen == 1 && index == 0;
+ if (outputUnigrams) {
+ // when outputting unigrams always
+ return bufferLen - index == 1;
+ } else {
+ // otherwise its only when we have a lone CJK character
+ return bufferLen == 1 && index == 0;
+ }
}
@Override
@@ -303,5 +358,6 @@ public final class CJKBigramFilter exten
lastEndOffset = 0;
loneState = null;
exhausted = false;
+ ngramState = false;
}
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java?rev=1369504&r1=1369503&r2=1369504&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java Sat Aug 4 22:42:03 2012
@@ -33,12 +33,13 @@ import org.apache.lucene.analysis.util.T
* <filter class="solr.LowerCaseFilterFactory"/>
* <filter class="solr.CJKBigramFilterFactory"
* han="true" hiragana="true"
- * katakana="true" hangul="true" />
+ * katakana="true" hangul="true" outputUnigrams="false" />
* </analyzer>
* </fieldType></pre>
*/
public class CJKBigramFilterFactory extends TokenFilterFactory {
int flags;
+ boolean outputUnigrams;
@Override
public void init(Map<String,String> args) {
@@ -56,10 +57,11 @@ public class CJKBigramFilterFactory exte
if (getBoolean("hangul", true)) {
flags |= CJKBigramFilter.HANGUL;
}
+ outputUnigrams = getBoolean("outputUnigrams", false);
}
@Override
public TokenStream create(TokenStream input) {
- return new CJKBigramFilter(input, flags);
+ return new CJKBigramFilter(input, flags, outputUnigrams);
}
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java?rev=1369504&r1=1369503&r2=1369504&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java Sat Aug 4 22:42:03 2012
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.cjk;
*/
import java.io.Reader;
+import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -33,6 +34,15 @@ public class TestCJKBigramFilter extends
}
};
+ Analyzer unibiAnalyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(t,
+ new CJKBigramFilter(t, 0xff, true));
+ }
+ };
+
public void testHuge() throws Exception {
assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
@@ -62,6 +72,96 @@ public class TestCJKBigramFilter extends
}
};
assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
- new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
+ new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" },
+ new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
+ new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
+ new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>",
+ "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
+ new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
+ }
+
+ public void testAllScripts() throws Exception {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(t,
+ new CJKBigramFilter(t, 0xff, false));
+ }
+ };
+ assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
+ new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
+ }
+
+ public void testUnigramsAndBigramsAllScripts() throws Exception {
+ assertAnalyzesTo(unibiAnalyzer, "多くの学生が試験に落ちた。",
+ new String[] {
+ "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生",
+ "生が", "が", "が試", "試", "試験", "験", "験に", "に",
+ "に落", "落", "落ち", "ち", "ちた", "た"
+ },
+ new int[] { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 },
+ new int[] { 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
+ 8, 8, 9, 9, 10, 10, 11, 11, 12, 12 },
+ new String[] { "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>",
+ "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>",
+ "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>" },
+ new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+ new int[] { 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
+ 2, 1, 2, 1, 2, 1, 2, 1, 2, 1 }
+ );
+ }
+
+ public void testUnigramsAndBigramsHanOnly() throws Exception {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
+ }
+ };
+ assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
+ new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" },
+ new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
+ new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
+ new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>",
+ "<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>",
+ "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
+ new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
+ new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
+ }
+
+ public void testUnigramsAndBigramsHuge() throws Exception {
+ assertAnalyzesTo(unibiAnalyzer, "å¤ãã®å¦çã試é¨ã«è½ã¡ã" + "å¤ãã®å¦çã試é¨ã«è½ã¡ã" + "å¤ãã®å¦çã試é¨ã«è½ã¡ã"
+ + "å¤ãã®å¦çã試é¨ã«è½ã¡ã" + "å¤ãã®å¦çã試é¨ã«è½ã¡ã" + "å¤ãã®å¦çã試é¨ã«è½ã¡ã" + "å¤ãã®å¦çã試é¨ã«è½ã¡ã"
+ + "å¤ãã®å¦çã試é¨ã«è½ã¡ã" + "å¤ãã®å¦çã試é¨ã«è½ã¡ã" + "å¤ãã®å¦çã試é¨ã«è½ã¡ã" + "å¤ãã®å¦çã試é¨ã«è½ã¡ã",
+ new String[] {
+ "å¤", "å¤ã", "ã", "ãã®", "ã®", "ã®å¦", "å¦", "å¦ç", "ç", "çã", "ã", "ã試", "試", "試é¨", "é¨", "é¨ã«", "ã«", "ã«è½", "è½", "è½ã¡", "ã¡", "ã¡ã", "ã", "ãå¤",
+ "å¤", "å¤ã", "ã", "ãã®", "ã®", "ã®å¦", "å¦", "å¦ç", "ç", "çã", "ã", "ã試", "試", "試é¨", "é¨", "é¨ã«", "ã«", "ã«è½", "è½", "è½ã¡", "ã¡", "ã¡ã", "ã", "ãå¤",
+ "å¤", "å¤ã", "ã", "ãã®", "ã®", "ã®å¦", "å¦", "å¦ç", "ç", "çã", "ã", "ã試", "試", "試é¨", "é¨", "é¨ã«", "ã«", "ã«è½", "è½", "è½ã¡", "ã¡", "ã¡ã", "ã", "ãå¤",
+ "å¤", "å¤ã", "ã", "ãã®", "ã®", "ã®å¦", "å¦", "å¦ç", "ç", "çã", "ã", "ã試", "試", "試é¨", "é¨", "é¨ã«", "ã«", "ã«è½", "è½", "è½ã¡", "ã¡", "ã¡ã", "ã", "ãå¤",
+ "å¤", "å¤ã", "ã", "ãã®", "ã®", "ã®å¦", "å¦", "å¦ç", "ç", "çã", "ã", "ã試", "試", "試é¨", "é¨", "é¨ã«", "ã«", "ã«è½", "è½", "è½ã¡", "ã¡", "ã¡ã", "ã", "ãå¤",
+ "å¤", "å¤ã", "ã", "ãã®", "ã®", "ã®å¦", "å¦", "å¦ç", "ç", "çã", "ã", "ã試", "試", "試é¨", "é¨", "é¨ã«", "ã«", "ã«è½", "è½", "è½ã¡", "ã¡", "ã¡ã", "ã", "ãå¤",
+ "å¤", "å¤ã", "ã", "ãã®", "ã®", "ã®å¦", "å¦", "å¦ç", "ç", "çã", "ã", "ã試", "試", "試é¨", "é¨", "é¨ã«", "ã«", "ã«è½", "è½", "è½ã¡", "ã¡", "ã¡ã", "ã", "ãå¤",
+ "å¤", "å¤ã", "ã", "ãã®", "ã®", "ã®å¦", "å¦", "å¦ç", "ç", "çã", "ã", "ã試", "試", "試é¨", "é¨", "é¨ã«", "ã«", "ã«è½", "è½", "è½ã¡", "ã¡", "ã¡ã", "ã", "ãå¤",
+ "å¤", "å¤ã", "ã", "ãã®", "ã®", "ã®å¦", "å¦", "å¦ç", "ç", "çã", "ã", "ã試", "試", "試é¨", "é¨", "é¨ã«", "ã«", "ã«è½", "è½", "è½ã¡", "ã¡", "ã¡ã", "ã", "ãå¤",
+ "å¤", "å¤ã", "ã", "ãã®", "ã®", "ã®å¦", "å¦", "å¦ç", "ç", "çã", "ã", "ã試", "試", "試é¨", "é¨", "é¨ã«", "ã«", "ã«è½", "è½", "è½ã¡", "ã¡", "ã¡ã", "ã", "ãå¤",
+ "å¤", "å¤ã", "ã", "ãã®", "ã®", "ã®å¦", "å¦", "å¦ç", "ç", "çã", "ã", "ã試", "試", "試é¨", "é¨", "é¨ã«", "ã«", "ã«è½", "è½", "è½ã¡", "ã¡", "ã¡ã", "ã"
+ }
+ );
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomUnibiStrings() throws Exception {
+ checkRandomData(random(), unibiAnalyzer, 1000*RANDOM_MULTIPLIER);
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomUnibiHugeStrings() throws Exception {
+ Random random = random();
+ checkRandomData(random, unibiAnalyzer, 100*RANDOM_MULTIPLIER, 8192);
}
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java?rev=1369504&r1=1369503&r2=1369504&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java Sat Aug 4 22:42:03 2012
@@ -52,4 +52,16 @@ public class TestCJKBigramFilterFactory
assertTokenStreamContents(stream,
new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
}
+
+ public void testHanOnlyUnigrams() throws Exception {
+ Reader reader = new StringReader("多くの学生が試験に落ちた。");
+ CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("hiragana", "false");
+ args.put("outputUnigrams", "true");
+ factory.init(args);
+ TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
+ assertTokenStreamContents(stream,
+ new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" });
+ }
}