You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/04/02 07:00:53 UTC

svn commit: r930163 - in /lucene/dev/trunk/solr: CHANGES.txt src/java/org/apache/solr/analysis/ShingleFilterFactory.java src/test/org/apache/solr/analysis/TestShingleFilterFactory.java

Author: rmuir
Date: Fri Apr  2 05:00:53 2010
New Revision: 930163

URL: http://svn.apache.org/viewvc?rev=930163&view=rev
Log:
SOLR-1740: ShingleFilterFactory improvements

Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java
    lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=930163&r1=930162&r2=930163&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Fri Apr  2 05:00:53 2010
@@ -135,6 +135,10 @@ New Features
   TokenFilters now support custom Attributes, and some have improved performance: 
   especially WordDelimiterFilter and CommonGramsFilter.  (rmuir, cmale, uschindler)
 
+* SOLR-1740: ShingleFilterFactory supports the "minShingleSize" and "tokenSeparator"
+  parameters for controlling the minimum shingle size produced by the filter, and
+  the separator string that it uses, respectively.  (Steven Rowe via rmuir)
+   
 Optimizations
 ----------------------
 

Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java?rev=930163&r1=930162&r2=930163&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java Fri Apr  2 05:00:53 2010
@@ -21,21 +21,49 @@ package org.apache.solr.analysis;
 
 import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+
 import java.util.Map;
 
 /** Factory for {@link ShingleFilter} */
 public class ShingleFilterFactory extends BaseTokenFilterFactory {
+  private int minShingleSize;
   private int maxShingleSize;
   private boolean outputUnigrams;
+  private String tokenSeparator;
+
   public void init(Map<String, String> args) {
     super.init(args);
     maxShingleSize = getInt("maxShingleSize", 
                             ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
+    if (maxShingleSize < 2) {
+      throw new SolrException(ErrorCode.SERVER_ERROR,
+                              "Invalid maxShingleSize (" + maxShingleSize
+                              + ") - must be at least 2");
+    }
+    minShingleSize = getInt("minShingleSize",
+                            ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
+    if (minShingleSize < 2) {
+      throw new SolrException(ErrorCode.SERVER_ERROR,
+                              "Invalid minShingleSize (" + minShingleSize
+                              + ") - must be at least 2");
+    }
+    if (minShingleSize > maxShingleSize) {
+      throw new SolrException(ErrorCode.SERVER_ERROR,
+                              "Invalid minShingleSize (" + minShingleSize
+                              + ") - must be no greater than maxShingleSize ("
+                              + maxShingleSize + ")");
+    }
     outputUnigrams = getBoolean("outputUnigrams", true);
+    tokenSeparator = args.containsKey("tokenSeparator")
+                     ? args.get("tokenSeparator")
+                     : ShingleFilter.TOKEN_SEPARATOR;
   }
   public ShingleFilter create(TokenStream input) {
-    ShingleFilter r = new ShingleFilter(input,maxShingleSize);
+    ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
     r.setOutputUnigrams(outputUnigrams);
+    r.setTokenSeparator(tokenSeparator);
     return r;
   }
 }

Modified: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java?rev=930163&r1=930162&r2=930163&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java (original)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java Fri Apr  2 05:00:53 2010
@@ -70,4 +70,150 @@ public class TestShingleFilterFactory ex
         new String[] {"this", "this is", "this is a", "is",
         "is a", "is a test", "a", "a test", "test"});
   }
+
+  /**
+   * Test with higher min (and max) shingle size
+   */
+  public void testMinShingleSize() throws Exception {
+    Reader reader = new StringReader("this is a test");
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("minShingleSize", "3");
+    args.put("maxShingleSize", "4");
+    ShingleFilterFactory factory = new ShingleFilterFactory();
+    factory.init(args);
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+    assertTokenStreamContents(stream, 
+        new String[] { "this", "this is a", "this is a test",
+        "is", "is a test", "a", "test" });
+  }
+
+  /**
+   * Test with higher min (and max) shingle size and with unigrams disabled
+   */
+  public void testMinShingleSizeNoUnigrams() throws Exception {
+    Reader reader = new StringReader("this is a test");
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("minShingleSize", "3");
+    args.put("maxShingleSize", "4");
+    args.put("outputUnigrams", "false");
+    ShingleFilterFactory factory = new ShingleFilterFactory();
+    factory.init(args);
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+    assertTokenStreamContents(stream, 
+        new String[] { "this is a", "this is a test", "is a test" });
+  }
+
+  /**
+   * Test with higher same min and max shingle size
+   */
+  public void testEqualMinAndMaxShingleSize() throws Exception {
+    Reader reader = new StringReader("this is a test");
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("minShingleSize", "3");
+    args.put("maxShingleSize", "3");
+    ShingleFilterFactory factory = new ShingleFilterFactory();
+    factory.init(args);
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+    assertTokenStreamContents(stream, 
+         new String[] { "this", "this is a", "is", "is a test", "a", "test" });
+  }
+
+  /**
+   * Test with higher same min and max shingle size and with unigrams disabled
+   */
+  public void testEqualMinAndMaxShingleSizeNoUnigrams() throws Exception {
+    Reader reader = new StringReader("this is a test");
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("minShingleSize", "3");
+    args.put("maxShingleSize", "3");
+    args.put("outputUnigrams", "false");
+    ShingleFilterFactory factory = new ShingleFilterFactory();
+    factory.init(args);
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+    assertTokenStreamContents(stream,
+        new String[] { "this is a", "is a test" });
+  }
+    
+  /**
+   * Test with a non-default token separator
+   */
+  public void testTokenSeparator() throws Exception {
+    Reader reader = new StringReader("this is a test");
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("tokenSeparator", "=BLAH=");
+    ShingleFilterFactory factory = new ShingleFilterFactory();
+    factory.init(args);
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+    assertTokenStreamContents(stream, 
+        new String[] { "this", "this=BLAH=is", "is", "is=BLAH=a", 
+        "a", "a=BLAH=test", "test" });
+  }
+
+  /**
+   * Test with a non-default token separator and with unigrams disabled
+   */
+  public void testTokenSeparatorNoUnigrams() throws Exception {
+    Reader reader = new StringReader("this is a test");
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("tokenSeparator", "=BLAH=");
+    args.put("outputUnigrams", "false");
+    ShingleFilterFactory factory = new ShingleFilterFactory();
+    factory.init(args);
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+    assertTokenStreamContents(stream, 
+        new String[] { "this=BLAH=is", "is=BLAH=a", "a=BLAH=test" });
+  }
+
+  /**
+   * Test with an empty token separator
+   */
+  public void testEmptyTokenSeparator() throws Exception {
+    Reader reader = new StringReader("this is a test");
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("tokenSeparator", "");
+    ShingleFilterFactory factory = new ShingleFilterFactory();
+    factory.init(args);
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+    assertTokenStreamContents(stream, 
+        new String[] { "this", "thisis", "is", "isa", "a", "atest", "test" });
+  }
+    
+  /**
+   * Test with higher min (and max) shingle size 
+   * and with a non-default token separator
+   */
+  public void testMinShingleSizeAndTokenSeparator() throws Exception {
+    Reader reader = new StringReader("this is a test");
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("minShingleSize", "3");
+    args.put("maxShingleSize", "4");
+    args.put("tokenSeparator", "=BLAH=");
+    ShingleFilterFactory factory = new ShingleFilterFactory();
+    factory.init(args);
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+    assertTokenStreamContents(stream, 
+        new String[] { "this", "this=BLAH=is=BLAH=a", 
+        "this=BLAH=is=BLAH=a=BLAH=test", "is", 
+        "is=BLAH=a=BLAH=test", "a", "test" });
+  }
+
+  /**
+   * Test with higher min (and max) shingle size 
+   * and with a non-default token separator
+   * and with unigrams disabled
+   */
+  public void testMinShingleSizeAndTokenSeparatorNoUnigrams() throws Exception {
+    Reader reader = new StringReader("this is a test");
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("minShingleSize", "3");
+    args.put("maxShingleSize", "4");
+    args.put("tokenSeparator", "=BLAH=");
+    args.put("outputUnigrams", "false");
+    ShingleFilterFactory factory = new ShingleFilterFactory();
+    factory.init(args);
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+    assertTokenStreamContents(stream, 
+        new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test", 
+        "is=BLAH=a=BLAH=test", });
+  }
 }