You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2010/10/09 18:56:43 UTC

svn commit: r1006191 - in /lucene/dev/trunk/solr: CHANGES.txt src/java/org/apache/solr/analysis/ShingleFilterFactory.java src/test/org/apache/solr/analysis/TestShingleFilterFactory.java

Author: sarowe
Date: Sat Oct  9 16:56:43 2010
New Revision: 1006191

URL: http://svn.apache.org/viewvc?rev=1006191&view=rev
Log:
SOLR-744: Added option to ShingleFilterFactory to output unigrams if no shingles can be generated.  

Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java
    lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1006191&r1=1006190&r2=1006191&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Sat Oct  9 16:56:43 2010
@@ -168,6 +168,11 @@ New Features
   parameters for controlling the minimum shingle size produced by the filter, and
   the separator string that it uses, respectively.  (Steven Rowe via rmuir)
   
+* SOLR-744: ShingleFilterFactory supports the "outputUnigramsIfNoShingles"
+  parameter, to output unigrams if the number of input tokens is fewer than
+  minShingleSize, and no shingles can be generated.  
+  (Chris Harris via Steven Rowe)
+  
 * SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now 
   supports "percentages" which get evaluated  relative the current size of 
   the cache when warming happens. 

Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java?rev=1006191&r1=1006190&r2=1006191&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java Sat Oct  9 16:56:43 2010
@@ -31,6 +31,7 @@ public class ShingleFilterFactory extend
   private int minShingleSize;
   private int maxShingleSize;
   private boolean outputUnigrams;
+  private boolean outputUnigramsIfNoShingles;
   private String tokenSeparator;
 
   public void init(Map<String, String> args) {
@@ -56,6 +57,7 @@ public class ShingleFilterFactory extend
                               + maxShingleSize + ")");
     }
     outputUnigrams = getBoolean("outputUnigrams", true);
+    outputUnigramsIfNoShingles = getBoolean("outputUnigramsIfNoShingles", false);
     tokenSeparator = args.containsKey("tokenSeparator")
                      ? args.get("tokenSeparator")
                      : ShingleFilter.TOKEN_SEPARATOR;
@@ -63,6 +65,7 @@ public class ShingleFilterFactory extend
   public ShingleFilter create(TokenStream input) {
     ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
     r.setOutputUnigrams(outputUnigrams);
+    r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
     r.setTokenSeparator(tokenSeparator);
     return r;
   }

Modified: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java?rev=1006191&r1=1006190&r2=1006191&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java (original)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java Sat Oct  9 16:56:43 2010
@@ -216,4 +216,23 @@ public class TestShingleFilterFactory ex
         new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test", 
         "is=BLAH=a=BLAH=test", });
   }
+
+  /**
+   * Test with unigrams disabled except when there are no shingles, with
+   * a single input token. Using default min/max shingle sizes: 2/2.  No
+   * shingles will be created, since there are fewer input tokens than
+   * min shingle size.  However, because outputUnigramsIfNoShingles is
+   * set to true, even though outputUnigrams is set to false, one
+   * unigram should be output.
+   */
+  public void testOutputUnigramsIfNoShingles() throws Exception {
+    Reader reader = new StringReader("test");
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("outputUnigrams", "false");
+    args.put("outputUnigramsIfNoShingles", "true");
+    ShingleFilterFactory factory = new ShingleFilterFactory();
+    factory.init(args);
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+    assertTokenStreamContents(stream, new String[] { "test" });
+  }
 }