You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2010/10/09 18:56:43 UTC
svn commit: r1006191 - in /lucene/dev/trunk/solr: CHANGES.txt
src/java/org/apache/solr/analysis/ShingleFilterFactory.java
src/test/org/apache/solr/analysis/TestShingleFilterFactory.java
Author: sarowe
Date: Sat Oct 9 16:56:43 2010
New Revision: 1006191
URL: http://svn.apache.org/viewvc?rev=1006191&view=rev
Log:
SOLR-744: Added option to ShingleFilterFactory to output unigrams if no shingles can be generated.
Modified:
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java
lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1006191&r1=1006190&r2=1006191&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Sat Oct 9 16:56:43 2010
@@ -168,6 +168,11 @@ New Features
parameters for controlling the minimum shingle size produced by the filter, and
the separator string that it uses, respectively. (Steven Rowe via rmuir)
+* SOLR-744: ShingleFilterFactory supports the "outputUnigramsIfNoShingles"
+ parameter, to output unigrams if the number of input tokens is fewer than
+ minShingleSize, and no shingles can be generated.
+ (Chris Harris via Steven Rowe)
+
* SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now
supports "percentages" which get evaluated relative to the current size of
the cache when warming happens.
Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java?rev=1006191&r1=1006190&r2=1006191&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java Sat Oct 9 16:56:43 2010
@@ -31,6 +31,7 @@ public class ShingleFilterFactory extend
private int minShingleSize;
private int maxShingleSize;
private boolean outputUnigrams;
+ private boolean outputUnigramsIfNoShingles;
private String tokenSeparator;
public void init(Map<String, String> args) {
@@ -56,6 +57,7 @@ public class ShingleFilterFactory extend
+ maxShingleSize + ")");
}
outputUnigrams = getBoolean("outputUnigrams", true);
+ outputUnigramsIfNoShingles = getBoolean("outputUnigramsIfNoShingles", false);
tokenSeparator = args.containsKey("tokenSeparator")
? args.get("tokenSeparator")
: ShingleFilter.TOKEN_SEPARATOR;
@@ -63,6 +65,7 @@ public class ShingleFilterFactory extend
public ShingleFilter create(TokenStream input) {
ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
r.setOutputUnigrams(outputUnigrams);
+ r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
r.setTokenSeparator(tokenSeparator);
return r;
}
Modified: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java?rev=1006191&r1=1006190&r2=1006191&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java (original)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java Sat Oct 9 16:56:43 2010
@@ -216,4 +216,23 @@ public class TestShingleFilterFactory ex
new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test",
"is=BLAH=a=BLAH=test", });
}
+
+ /**
+ * Test with unigrams disabled except when there are no shingles, with
+ * a single input token. Using default min/max shingle sizes: 2/2. No
+ * shingles will be created, since there are fewer input tokens than
+ * min shingle size. However, because outputUnigramsIfNoShingles is
+ * set to true, even though outputUnigrams is set to false, one
+ * unigram should be output.
+ */
+ public void testOutputUnigramsIfNoShingles() throws Exception {
+ Reader reader = new StringReader("test");
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("outputUnigrams", "false");
+ args.put("outputUnigramsIfNoShingles", "true");
+ ShingleFilterFactory factory = new ShingleFilterFactory();
+ factory.init(args);
+ TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+ assertTokenStreamContents(stream, new String[] { "test" });
+ }
}