You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2010/10/09 18:55:23 UTC
svn commit: r1006187 - in /lucene/dev/trunk/modules/analysis: ./
common/src/java/org/apache/lucene/analysis/shingle/
common/src/test/org/apache/lucene/analysis/shingle/
Author: sarowe
Date: Sat Oct 9 16:55:23 2010
New Revision: 1006187
URL: http://svn.apache.org/viewvc?rev=1006187&view=rev
Log:
LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles can be generated.
Modified:
lucene/dev/trunk/modules/analysis/CHANGES.txt
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
Modified: lucene/dev/trunk/modules/analysis/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/CHANGES.txt?rev=1006187&r1=1006186&r2=1006187&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/CHANGES.txt (original)
+++ lucene/dev/trunk/modules/analysis/CHANGES.txt Sat Oct 9 16:55:23 2010
@@ -15,6 +15,9 @@ API Changes
RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
behavior. (Steven Rowe, Robert Muir, Uwe Schindler)
+ * LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
+ can be generated. (Chris Harris via Steven Rowe)
+
New Features
* LUCENE-2413: Consolidated Solr analysis components into common.
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java?rev=1006187&r1=1006186&r2=1006187&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java Sat Oct 9 16:55:23 2010
@@ -38,6 +38,7 @@ public final class ShingleAnalyzerWrappe
private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE;
private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR;
private boolean outputUnigrams = true;
+ private boolean outputUnigramsIfNoShingles = false;
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
super();
@@ -147,6 +148,24 @@ public final class ShingleAnalyzerWrappe
public void setOutputUnigrams(boolean outputUnigrams) {
this.outputUnigrams = outputUnigrams;
}
+
+ public boolean isOutputUnigramsIfNoShingles() {
+ return outputUnigramsIfNoShingles;
+ }
+
+ /**
+ * <p>Shall we override the behavior of outputUnigrams==false for those
+ * times when no shingles are available (because there are fewer than
+ * minShingleSize tokens in the input stream)? (default: false.)
+ * <p>Note that if outputUnigrams==true, then unigrams are always output,
+ * regardless of whether any shingles are available.
+ *
+ * @param outputUnigramsIfNoShingles Whether or not to output unigrams
+ * when no shingles are available.
+ */
+ public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
+ this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
+ }
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
@@ -161,6 +180,7 @@ public final class ShingleAnalyzerWrappe
filter.setMaxShingleSize(maxShingleSize);
filter.setTokenSeparator(tokenSeparator);
filter.setOutputUnigrams(outputUnigrams);
+ filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
return filter;
}
@@ -192,6 +212,7 @@ public final class ShingleAnalyzerWrappe
streams.shingle.setMinShingleSize(minShingleSize);
streams.shingle.setTokenSeparator(tokenSeparator);
streams.shingle.setOutputUnigrams(outputUnigrams);
+ streams.shingle.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
return streams.shingle;
}
}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=1006187&r1=1006186&r2=1006187&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Sat Oct 9 16:55:23 2010
@@ -103,6 +103,11 @@ public final class ShingleFilter extends
private boolean outputUnigrams = true;
/**
+ * By default, we don't override behavior of outputUnigrams.
+ */
+ private boolean outputUnigramsIfNoShingles = false;
+
+ /**
* maximum shingle size (number of tokens)
*/
private int maxShingleSize;
@@ -136,6 +141,11 @@ public final class ShingleFilter extends
* position.
*/
private boolean isOutputHere = false;
+
+ /**
+ * true if no shingles have been output yet (for outputUnigramsIfNoShingles).
+ */
+ boolean noShingleOutput = true;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -212,6 +222,20 @@ public final class ShingleFilter extends
}
/**
+ * <p>Shall we override the behavior of outputUnigrams==false for those
+ * times when no shingles are available (because there are fewer than
+ * minShingleSize tokens in the input stream)? (default: false.)
+ * <p>Note that if outputUnigrams==true, then unigrams are always output,
+ * regardless of whether any shingles are available.
+ *
+ * @param outputUnigramsIfNoShingles Whether or not to output unigrams
+ * when no shingles are available.
+ */
+ public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
+ this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
+ }
+
+ /**
* Set the max shingle size (default: 2)
*
* @param maxShingleSize max size of output shingles
@@ -292,6 +316,7 @@ public final class ShingleFilter extends
termAtt.setEmpty().append(gramBuilder);
if (gramSize.getValue() > 1) {
typeAtt.setType(tokenType);
+ noShingleOutput = false;
}
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
isOutputHere = true;
@@ -395,6 +420,10 @@ public final class ShingleFilter extends
}
}
}
+ if (outputUnigramsIfNoShingles && noShingleOutput
+ && gramSize.minValue > 1 && inputWindow.size() < minShingleSize) {
+ gramSize.minValue = 1;
+ }
gramSize.reset();
isOutputHere = false;
}
@@ -406,6 +435,11 @@ public final class ShingleFilter extends
inputWindow.clear();
numFillerTokensToInsert = 0;
isOutputHere = false;
+ noShingleOutput = true;
+ if (outputUnigramsIfNoShingles && ! outputUnigrams) {
+ // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
+ gramSize.minValue = minShingleSize;
+ }
}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java?rev=1006187&r1=1006186&r2=1006187&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java Sat Oct 9 16:55:23 2010
@@ -359,4 +359,16 @@ public class ShingleAnalyzerWrapperTest
new int[] { 13, 18, 27 },
new int[] { 1, 1, 1 });
}
+
+ public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
+ ShingleAnalyzerWrapper analyzer
+ = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
+ analyzer.setOutputUnigrams(false);
+ analyzer.setOutputUnigramsIfNoShingles(true);
+ assertAnalyzesToReuse(analyzer, "please",
+ new String[] { "please" },
+ new int[] { 0 },
+ new int[] { 6 },
+ new int[] { 1 });
+ }
}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=1006187&r1=1006186&r2=1006187&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Sat Oct 9 16:55:23 2010
@@ -73,6 +73,14 @@ public class ShingleFilterTest extends B
createToken("shingles", 33, 39),
};
+ public static final int[] UNIGRAM_ONLY_POSITION_INCREMENTS = new int[] {
+ 1, 1, 1, 1, 1, 1
+ };
+
+ public static final String[] UNIGRAM_ONLY_TYPES = new String[] {
+ "word", "word", "word", "word", "word", "word"
+ };
+
public static Token[] testTokenWithHoles;
public static final Token[] BI_GRAM_TOKENS = new Token[] {
@@ -1018,15 +1026,44 @@ public class ShingleFilterTest extends B
new int[]{1,0,1,0,1,0,1}
);
}
-
+
+ public void testOutputUnigramsIfNoShinglesSingleTokenCase() throws IOException {
+ // Single token input with outputUnigrams==false is the primary case where
+ // enabling this option should alter program behavior.
+ this.shingleFilterTest(2, 2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
+ SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
+ false, true);
+ }
+
+ public void testOutputUnigramsIfNoShinglesWithSimpleBigram() throws IOException {
+ // Here we expect the same result as with testBiGramFilter().
+ this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS,
+ BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
+ true, true);
+ }
+
+ public void testOutputUnigramsIfNoShinglesWithSimpleUnigramlessBigram() throws IOException {
+ // Here we expect the same result as with testBiGramFilterWithoutUnigrams().
+ this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
+ BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
+ false, true);
+ }
+
+ public void testOutputUnigramsIfNoShinglesWithMultipleInputTokens() throws IOException {
+ // Test when the minimum shingle size is greater than the number of input tokens
+ this.shingleFilterTest(7, 7, TEST_TOKEN, TEST_TOKEN,
+ UNIGRAM_ONLY_POSITION_INCREMENTS, UNIGRAM_ONLY_TYPES,
+ false, true);
+ }
+
protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
int[] positionIncrements, String[] types,
boolean outputUnigrams)
throws IOException {
ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
- shingleFilterTestCommon
- (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+ filter.setOutputUnigrams(outputUnigrams);
+ shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
@@ -1035,8 +1072,20 @@ public class ShingleFilterTest extends B
throws IOException {
ShingleFilter filter
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
- shingleFilterTestCommon
- (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+ filter.setOutputUnigrams(outputUnigrams);
+ shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
+ }
+
+ protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
+ Token[] tokensToCompare, int[] positionIncrements,
+ String[] types, boolean outputUnigrams,
+ boolean outputUnigramsIfNoShingles)
+ throws IOException {
+ ShingleFilter filter
+ = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+ filter.setOutputUnigrams(outputUnigrams);
+ filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
+ shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle,
@@ -1046,18 +1095,15 @@ public class ShingleFilterTest extends B
ShingleFilter filter
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
filter.setTokenSeparator(tokenSeparator);
- shingleFilterTestCommon
- (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+ filter.setOutputUnigrams(outputUnigrams);
+ shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTestCommon(ShingleFilter filter,
Token[] tokensToCompare,
int[] positionIncrements,
- String[] types, boolean outputUnigrams)
+ String[] types)
throws IOException {
-
- filter.setOutputUnigrams(outputUnigrams);
-
String text[] = new String[tokensToCompare.length];
int startOffsets[] = new int[tokensToCompare.length];
int endOffsets[] = new int[tokensToCompare.length];