You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2010/10/09 19:16:40 UTC
svn commit: r1006195 - in /lucene/dev/branches/branch_3x/lucene/contrib: ./
analyzers/common/src/java/org/apache/lucene/analysis/shingle/
analyzers/common/src/test/org/apache/lucene/analysis/shingle/
Author: sarowe
Date: Sat Oct 9 17:16:40 2010
New Revision: 1006195
URL: http://svn.apache.org/viewvc?rev=1006195&view=rev
Log:
LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles can be generated.
Modified:
lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1006195&r1=1006194&r2=1006195&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Sat Oct 9 17:16:40 2010
@@ -133,6 +133,9 @@ API Changes
* LUCENE-2626: FastVectorHighlighter: enable FragListBuilder and FragmentsBuilder
to be set per-field override. (Koji Sekiguchi)
+ * LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
+ can be generated. (Chris Harris via Steven Rowe)
+
New features
* LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser.
Propchange: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/
------------------------------------------------------------------------------
--- svn:mergeinfo (added)
+++ svn:mergeinfo Sat Oct 9 17:16:40 2010
@@ -0,0 +1,8 @@
+/lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle:931298,931337,931502,932129-932131,932163,932304,932369,932374,932398,932417,932541,932576,932587,932698,932731-932749,932752,932773,932795,932828,932856-932857,932862,932864,932878,932963,932998-932999,933541-933575,933598,933613,933679,933879,934339,934954,935014-935048,935065,935186-935513,935521-935522,935553-935962,936522,936544,936605,936657-936726,937039,937360,938582-938646,938989,939111,939611,939649,940433,940447,940451-940452,940666,940699,940730,940878-940892,940994,941270,941363,942166,942235,942288,942676,942719,943142,943493,943931,945057,945090,945130,945245,945343,945420,946139,946330,946338,946599,948011,948082,948429,949288,949311,949445,949976,949997,950008,950042,950458,950613,951126,951355,951397,951521,953628,955547,955613,955615,955796-955797,955809-955996,956097,956125,956173,956316,956715,957465,957520,957634,957707,960367,960371,960374,960719,962555,963372,963654,963720,963781,963873,963906,963909,963920,964019,964054,964430,964459,964720,964753,964832,964856,965103,965110,965222,965230,965299,965327,965330,965585,966354,966878,967080,979453,979809,980369,980428,980436,980501,980909,980911,980917,981265,981550,981598,981650,981661,981857,981936,982073,982084,982201,982725,982824,983100,983212,983216,983313,983495,983500,983530,983622,983632,983778,984187,984202,984232,984510,984968,985453,985455,985672,985875,986158,986173,986612,987122,988087,988206,988216,988259,988346,988478,988527,988543,988592,988613,988688,988710,988736,988739,989004,989010,989013,989030,989035,989315,989321,989334,989785,990160-990161,990180,990189,990281,990301,990451,990459,990766,990781,990854,991053,991191,991310,991497,992424,992469,992567,992571,992623,993106,993194,993199,993287,993408,994935,994976,994979,995247,995250,995376,995772,996268,996357,996416,996511,996611,996623,996647-996653,996720,996942,996978,997180,997230,998055,998505,998684,999016,999223,999545,999842,1000424,1000581,1000675,1001006,1001420,1001661,1001796,1002032,1003614,1003631,1003645,1003841-1003852,1003873,1003877,1003906,1003938,1003954,1003978,1003990,1004038,1004082,1004179,1004200,1004215,1004241,1004335,1005310,1005356,1005363,1006146
+/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle:942235,945090,946139,950008,964019,964054,984968,986612,990459,996611,997180,998684,1002032,1003906,1003978,1003990,1006187
+/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle:924791,924850,930201
+/lucene/java/branches/lucene_2_4/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle:748824
+/lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle:817269-818600,825998,829134,829816,829881,831036,896850,909334,948516
+/lucene/java/branches/lucene_2_9_back_compat_tests/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle:818601-821336
+/lucene/java/branches/lucene_3_0/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle:880793,896906
+/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle:924483-925561
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java?rev=1006195&r1=1006194&r2=1006195&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java Sat Oct 9 17:16:40 2010
@@ -38,6 +38,7 @@ public final class ShingleAnalyzerWrappe
private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE;
private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR;
private boolean outputUnigrams = true;
+ private boolean outputUnigramsIfNoShingles = false;
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
super();
@@ -147,6 +148,24 @@ public final class ShingleAnalyzerWrappe
public void setOutputUnigrams(boolean outputUnigrams) {
this.outputUnigrams = outputUnigrams;
}
+
+ public boolean isOutputUnigramsIfNoShingles() {
+ return outputUnigramsIfNoShingles;
+ }
+
+ /**
+ * <p>Shall we override the behavior of outputUnigrams==false for those
+ * times when no shingles are available (because there are fewer than
+ * minShingleSize tokens in the input stream)? (default: false.)
+ * <p>Note that if outputUnigrams==true, then unigrams are always output,
+ * regardless of whether any shingles are available.
+ *
+ * @param outputUnigramsIfNoShingles Whether or not to output a single
+ * unigram when no shingles are available.
+ */
+ public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
+ this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
+ }
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
@@ -161,6 +180,7 @@ public final class ShingleAnalyzerWrappe
filter.setMaxShingleSize(maxShingleSize);
filter.setTokenSeparator(tokenSeparator);
filter.setOutputUnigrams(outputUnigrams);
+ filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
return filter;
}
@@ -192,6 +212,7 @@ public final class ShingleAnalyzerWrappe
streams.shingle.setMinShingleSize(minShingleSize);
streams.shingle.setTokenSeparator(tokenSeparator);
streams.shingle.setOutputUnigrams(outputUnigrams);
+ streams.shingle.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
return streams.shingle;
}
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=1006195&r1=1006194&r2=1006195&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Sat Oct 9 17:16:40 2010
@@ -103,6 +103,11 @@ public final class ShingleFilter extends
private boolean outputUnigrams = true;
/**
+ * By default, we don't override behavior of outputUnigrams.
+ */
+ private boolean outputUnigramsIfNoShingles = false;
+
+ /**
* maximum shingle size (number of tokens)
*/
private int maxShingleSize;
@@ -136,6 +141,11 @@ public final class ShingleFilter extends
* position.
*/
private boolean isOutputHere = false;
+
+ /**
+ * true if no shingles have been output yet (for outputUnigramsIfNoShingles).
+ */
+ boolean noShingleOutput = true;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -212,6 +222,20 @@ public final class ShingleFilter extends
}
/**
+ * <p>Shall we override the behavior of outputUnigrams==false for those
+ * times when no shingles are available (because there are fewer than
+ * minShingleSize tokens in the input stream)? (default: false.)
+ * <p>Note that if outputUnigrams==true, then unigrams are always output,
+ * regardless of whether any shingles are available.
+ *
+ * @param outputUnigramsIfNoShingles Whether or not to output a single
+ * unigram when no shingles are available.
+ */
+ public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
+ this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
+ }
+
+ /**
* Set the max shingle size (default: 2)
*
* @param maxShingleSize max size of output shingles
@@ -292,6 +316,7 @@ public final class ShingleFilter extends
termAtt.setEmpty().append(gramBuilder);
if (gramSize.getValue() > 1) {
typeAtt.setType(tokenType);
+ noShingleOutput = false;
}
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
isOutputHere = true;
@@ -395,6 +420,10 @@ public final class ShingleFilter extends
}
}
}
+ if (outputUnigramsIfNoShingles && noShingleOutput
+ && gramSize.minValue > 1 && inputWindow.size() < minShingleSize) {
+ gramSize.minValue = 1;
+ }
gramSize.reset();
isOutputHere = false;
}
@@ -406,6 +435,11 @@ public final class ShingleFilter extends
inputWindow.clear();
numFillerTokensToInsert = 0;
isOutputHere = false;
+ noShingleOutput = true;
+ if (outputUnigramsIfNoShingles && ! outputUnigrams) {
+ // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
+ gramSize.minValue = minShingleSize;
+ }
}
Propchange: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/
------------------------------------------------------------------------------
--- svn:mergeinfo (added)
+++ svn:mergeinfo Sat Oct 9 17:16:40 2010
@@ -0,0 +1,8 @@
+/lucene/dev/trunk/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle:931298,931337,931502,932129-932131,932163,932304,932369,932374,932398,932417,932541,932576,932587,932698,932731-932749,932752,932773,932795,932828,932856-932857,932862,932864,932878,932963,932998-932999,933541-933575,933598,933613,933679,933879,934339,934954,935014-935048,935065,935186-935513,935521-935522,935553-935962,936522,936544,936605,936657-936726,937039,937360,938582-938646,938989,939111,939611,939649,940433,940447,940451-940452,940666,940699,940730,940878-940892,940994,941270,941363,942166,942235,942288,942676,942719,943142,943493,943931,945057,945090,945130,945245,945343,945420,946139,946330,946338,946599,948011,948082,948429,949288,949311,949445,949976,949997,950008,950042,950458,950613,951126,951355,951397,951521,953628,955547,955613,955615,955796-955797,955809-955996,956097,956125,956173,956316,956715,957465,957520,957634,957707,960367,960371,960374,960719,962555,963372,963654,963720,963781,963873,963906,963909,963920,964019,964054,964430,964459,964720,964753,964832,964856,965103,965110,965222,965230,965299,965327,965330,965585,966354,966878,967080,979453,979809,980369,980428,980436,980501,980909,980911,980917,981265,981550,981598,981650,981661,981857,981936,982073,982084,982201,982725,982824,983100,983212,983216,983313,983495,983500,983530,983622,983632,983778,984187,984202,984232,984510,984968,985453,985455,985672,985875,986158,986173,986612,987122,988087,988206,988216,988259,988346,988478,988527,988543,988592,988613,988688,988710,988736,988739,989004,989010,989013,989030,989035,989315,989321,989334,989785,990160-990161,990180,990189,990281,990301,990451,990459,990766,990781,990854,991053,991191,991310,991497,992424,992469,992567,992571,992623,993106,993194,993199,993287,993408,994935,994976,994979,995247,995250,995376,995772,996268,996357,996416,996511,996611,996623,996647-996653,996720,996942,996978,997180,997230,998055,998505,998684,999016,999223,999545,999842,1000424,1000581,1000675,1001006,1001420,1001661,1001796,1002032,1003614,1003631,1003645,1003841-1003852,1003873,1003877,1003906,1003938,1003954,1003978,1003990,1004038,1004082,1004179,1004200,1004215,1004241,1004335,1005310,1005356,1005363,1006146
+/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle:942235,945090,946139,950008,950042,964019,964054,984968,986612,990459,996611,997180,998684,1002032,1003906,1003978,1003990,1006187
+/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle:924791,924850,930201
+/lucene/java/branches/lucene_2_4/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle:748824
+/lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle:817269-818600,825998,829134,829816,829881,831036,896850,909334,948516
+/lucene/java/branches/lucene_2_9_back_compat_tests/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle:818601-821336
+/lucene/java/branches/lucene_3_0/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle:880793,896906
+/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle:924483-925561
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java?rev=1006195&r1=1006194&r2=1006195&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java Sat Oct 9 17:16:40 2010
@@ -359,4 +359,16 @@ public class ShingleAnalyzerWrapperTest
new int[] { 13, 18, 27 },
new int[] { 1, 1, 1 });
}
+
+ public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
+ ShingleAnalyzerWrapper analyzer
+ = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
+ analyzer.setOutputUnigrams(false);
+ analyzer.setOutputUnigramsIfNoShingles(true);
+ assertAnalyzesToReuse(analyzer, "please",
+ new String[] { "please" },
+ new int[] { 0 },
+ new int[] { 6 },
+ new int[] { 1 });
+ }
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=1006195&r1=1006194&r2=1006195&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Sat Oct 9 17:16:40 2010
@@ -73,6 +73,14 @@ public class ShingleFilterTest extends B
createToken("shingles", 33, 39),
};
+ public static final int[] UNIGRAM_ONLY_POSITION_INCREMENTS = new int[] {
+ 1, 1, 1, 1, 1, 1
+ };
+
+ public static final String[] UNIGRAM_ONLY_TYPES = new String[] {
+ "word", "word", "word", "word", "word", "word"
+ };
+
public static Token[] testTokenWithHoles;
public static final Token[] BI_GRAM_TOKENS = new Token[] {
@@ -1018,15 +1026,44 @@ public class ShingleFilterTest extends B
new int[]{1,0,1,0,1,0,1}
);
}
-
+
+ public void testOutputUnigramsIfNoShinglesSingleTokenCase() throws IOException {
+ // Single token input with outputUnigrams==false is the primary case where
+ // enabling this option should alter program behavior.
+ this.shingleFilterTest(2, 2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
+ SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
+ false, true);
+ }
+
+ public void testOutputUnigramsIfNoShinglesWithSimpleBigram() throws IOException {
+ // Here we expect the same result as with testBiGramFilter().
+ this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS,
+ BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
+ true, true);
+ }
+
+ public void testOutputUnigramsIfNoShinglesWithSimpleUnigramlessBigram() throws IOException {
+ // Here we expect the same result as with testBiGramFilterWithoutUnigrams().
+ this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
+ BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
+ false, true);
+ }
+
+ public void testOutputUnigramsIfNoShinglesWithMultipleInputTokens() throws IOException {
+ // Test when the minimum shingle size is greater than the number of input tokens
+ this.shingleFilterTest(7, 7, TEST_TOKEN, TEST_TOKEN,
+ UNIGRAM_ONLY_POSITION_INCREMENTS, UNIGRAM_ONLY_TYPES,
+ false, true);
+ }
+
protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
int[] positionIncrements, String[] types,
boolean outputUnigrams)
throws IOException {
ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
- shingleFilterTestCommon
- (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+ filter.setOutputUnigrams(outputUnigrams);
+ shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
@@ -1035,8 +1072,20 @@ public class ShingleFilterTest extends B
throws IOException {
ShingleFilter filter
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
- shingleFilterTestCommon
- (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+ filter.setOutputUnigrams(outputUnigrams);
+ shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
+ }
+
+ protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
+ Token[] tokensToCompare, int[] positionIncrements,
+ String[] types, boolean outputUnigrams,
+ boolean outputUnigramsIfNoShingles)
+ throws IOException {
+ ShingleFilter filter
+ = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+ filter.setOutputUnigrams(outputUnigrams);
+ filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
+ shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle,
@@ -1046,18 +1095,15 @@ public class ShingleFilterTest extends B
ShingleFilter filter
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
filter.setTokenSeparator(tokenSeparator);
- shingleFilterTestCommon
- (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+ filter.setOutputUnigrams(outputUnigrams);
+ shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTestCommon(ShingleFilter filter,
Token[] tokensToCompare,
int[] positionIncrements,
- String[] types, boolean outputUnigrams)
+ String[] types)
throws IOException {
-
- filter.setOutputUnigrams(outputUnigrams);
-
String text[] = new String[tokensToCompare.length];
int startOffsets[] = new int[tokensToCompare.length];
int endOffsets[] = new int[tokensToCompare.length];