You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/04/02 07:00:53 UTC
svn commit: r930163 - in /lucene/dev/trunk/solr: CHANGES.txt
src/java/org/apache/solr/analysis/ShingleFilterFactory.java
src/test/org/apache/solr/analysis/TestShingleFilterFactory.java
Author: rmuir
Date: Fri Apr 2 05:00:53 2010
New Revision: 930163
URL: http://svn.apache.org/viewvc?rev=930163&view=rev
Log:
SOLR-1740: ShingleFilterFactory improvements
Modified:
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java
lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=930163&r1=930162&r2=930163&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Fri Apr 2 05:00:53 2010
@@ -135,6 +135,10 @@ New Features
TokenFilters now support custom Attributes, and some have improved performance:
especially WordDelimiterFilter and CommonGramsFilter. (rmuir, cmale, uschindler)
+* SOLR-1740: ShingleFilterFactory supports the "minShingleSize" and "tokenSeparator"
+ parameters for controlling the minimum shingle size produced by the filter, and
+ the separator string that it uses, respectively. (Steven Rowe via rmuir)
+
Optimizations
----------------------
Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java?rev=930163&r1=930162&r2=930163&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java Fri Apr 2 05:00:53 2010
@@ -21,21 +21,49 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+
import java.util.Map;
/** Factory for {@link ShingleFilter} */
public class ShingleFilterFactory extends BaseTokenFilterFactory {
+ private int minShingleSize;
private int maxShingleSize;
private boolean outputUnigrams;
+ private String tokenSeparator;
+
public void init(Map<String, String> args) {
super.init(args);
maxShingleSize = getInt("maxShingleSize",
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
+ if (maxShingleSize < 2) {
+ throw new SolrException(ErrorCode.SERVER_ERROR,
+ "Invalid maxShingleSize (" + maxShingleSize
+ + ") - must be at least 2");
+ }
+ minShingleSize = getInt("minShingleSize",
+ ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
+ if (minShingleSize < 2) {
+ throw new SolrException(ErrorCode.SERVER_ERROR,
+ "Invalid minShingleSize (" + minShingleSize
+ + ") - must be at least 2");
+ }
+ if (minShingleSize > maxShingleSize) {
+ throw new SolrException(ErrorCode.SERVER_ERROR,
+ "Invalid minShingleSize (" + minShingleSize
+ + ") - must be no greater than maxShingleSize ("
+ + maxShingleSize + ")");
+ }
outputUnigrams = getBoolean("outputUnigrams", true);
+ tokenSeparator = args.containsKey("tokenSeparator")
+ ? args.get("tokenSeparator")
+ : ShingleFilter.TOKEN_SEPARATOR;
}
public ShingleFilter create(TokenStream input) {
- ShingleFilter r = new ShingleFilter(input,maxShingleSize);
+ ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
r.setOutputUnigrams(outputUnigrams);
+ r.setTokenSeparator(tokenSeparator);
return r;
}
}
Modified: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java?rev=930163&r1=930162&r2=930163&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java (original)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java Fri Apr 2 05:00:53 2010
@@ -70,4 +70,150 @@ public class TestShingleFilterFactory ex
new String[] {"this", "this is", "this is a", "is",
"is a", "is a test", "a", "a test", "test"});
}
+
+ /**
+ * Test with higher min (and max) shingle size
+ */
+ public void testMinShingleSize() throws Exception {
+ Reader reader = new StringReader("this is a test");
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("minShingleSize", "3");
+ args.put("maxShingleSize", "4");
+ ShingleFilterFactory factory = new ShingleFilterFactory();
+ factory.init(args);
+ TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+ assertTokenStreamContents(stream,
+ new String[] { "this", "this is a", "this is a test",
+ "is", "is a test", "a", "test" });
+ }
+
+ /**
+ * Test with higher min (and max) shingle size and with unigrams disabled
+ */
+ public void testMinShingleSizeNoUnigrams() throws Exception {
+ Reader reader = new StringReader("this is a test");
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("minShingleSize", "3");
+ args.put("maxShingleSize", "4");
+ args.put("outputUnigrams", "false");
+ ShingleFilterFactory factory = new ShingleFilterFactory();
+ factory.init(args);
+ TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+ assertTokenStreamContents(stream,
+ new String[] { "this is a", "this is a test", "is a test" });
+ }
+
+ /**
+ * Test with min and max shingle size set to the same, higher value
+ */
+ public void testEqualMinAndMaxShingleSize() throws Exception {
+ Reader reader = new StringReader("this is a test");
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("minShingleSize", "3");
+ args.put("maxShingleSize", "3");
+ ShingleFilterFactory factory = new ShingleFilterFactory();
+ factory.init(args);
+ TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+ assertTokenStreamContents(stream,
+ new String[] { "this", "this is a", "is", "is a test", "a", "test" });
+ }
+
+ /**
+ * Test with min and max shingle size set to the same, higher value and with unigrams disabled
+ */
+ public void testEqualMinAndMaxShingleSizeNoUnigrams() throws Exception {
+ Reader reader = new StringReader("this is a test");
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("minShingleSize", "3");
+ args.put("maxShingleSize", "3");
+ args.put("outputUnigrams", "false");
+ ShingleFilterFactory factory = new ShingleFilterFactory();
+ factory.init(args);
+ TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+ assertTokenStreamContents(stream,
+ new String[] { "this is a", "is a test" });
+ }
+
+ /**
+ * Test with a non-default token separator
+ */
+ public void testTokenSeparator() throws Exception {
+ Reader reader = new StringReader("this is a test");
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("tokenSeparator", "=BLAH=");
+ ShingleFilterFactory factory = new ShingleFilterFactory();
+ factory.init(args);
+ TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+ assertTokenStreamContents(stream,
+ new String[] { "this", "this=BLAH=is", "is", "is=BLAH=a",
+ "a", "a=BLAH=test", "test" });
+ }
+
+ /**
+ * Test with a non-default token separator and with unigrams disabled
+ */
+ public void testTokenSeparatorNoUnigrams() throws Exception {
+ Reader reader = new StringReader("this is a test");
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("tokenSeparator", "=BLAH=");
+ args.put("outputUnigrams", "false");
+ ShingleFilterFactory factory = new ShingleFilterFactory();
+ factory.init(args);
+ TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+ assertTokenStreamContents(stream,
+ new String[] { "this=BLAH=is", "is=BLAH=a", "a=BLAH=test" });
+ }
+
+ /**
+ * Test with an empty token separator
+ */
+ public void testEmptyTokenSeparator() throws Exception {
+ Reader reader = new StringReader("this is a test");
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("tokenSeparator", "");
+ ShingleFilterFactory factory = new ShingleFilterFactory();
+ factory.init(args);
+ TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+ assertTokenStreamContents(stream,
+ new String[] { "this", "thisis", "is", "isa", "a", "atest", "test" });
+ }
+
+ /**
+ * Test with higher min (and max) shingle size
+ * and with a non-default token separator
+ */
+ public void testMinShingleSizeAndTokenSeparator() throws Exception {
+ Reader reader = new StringReader("this is a test");
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("minShingleSize", "3");
+ args.put("maxShingleSize", "4");
+ args.put("tokenSeparator", "=BLAH=");
+ ShingleFilterFactory factory = new ShingleFilterFactory();
+ factory.init(args);
+ TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+ assertTokenStreamContents(stream,
+ new String[] { "this", "this=BLAH=is=BLAH=a",
+ "this=BLAH=is=BLAH=a=BLAH=test", "is",
+ "is=BLAH=a=BLAH=test", "a", "test" });
+ }
+
+ /**
+ * Test with higher min (and max) shingle size
+ * and with a non-default token separator
+ * and with unigrams disabled
+ */
+ public void testMinShingleSizeAndTokenSeparatorNoUnigrams() throws Exception {
+ Reader reader = new StringReader("this is a test");
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("minShingleSize", "3");
+ args.put("maxShingleSize", "4");
+ args.put("tokenSeparator", "=BLAH=");
+ args.put("outputUnigrams", "false");
+ ShingleFilterFactory factory = new ShingleFilterFactory();
+ factory.init(args);
+ TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+ assertTokenStreamContents(stream,
+ new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test",
+ "is=BLAH=a=BLAH=test", });
+ }
}