You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2011/01/30 19:30:35 UTC
svn commit: r1065343 - in /lucene/dev/trunk: ./ lucene/
modules/analysis/common/src/java/org/apache/lucene/analysis/core/
modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/
modules/analysis/common/src/java/org/apache/lucene/anal...
Author: uschindler
Date: Sun Jan 30 18:30:34 2011
New Revision: 1065343
URL: http://svn.apache.org/viewvc?rev=1065343&view=rev
Log:
LUCENE-1253: LengthFilter (and Solr's KeepWordFilter) now require up-front specification of enablePositionIncrements. Together with StopFilter they share a common base class (FilteringTokenFilter) that handles the position increments automatically. Implementors only need to override an accept() method that filters tokens.
Added:
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java
- copied, changed from r1065324, lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java
Modified:
lucene/dev/trunk/ (props changed)
lucene/dev/trunk/lucene/ (props changed)
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java
lucene/dev/trunk/solr/ (props changed)
lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java
lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java
lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/LengthFilterTest.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1065343&r1=1065342&r2=1065343&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Sun Jan 30 18:30:34 2011
@@ -643,6 +643,12 @@ API Changes
deletes remain buffered so that the next time you open an NRT reader
and pass true, all deletes will be a applied. (Mike McCandless)
+* LUCENE-1253: LengthFilter (and Solr's KeepWordTokenFilter) now
+ require up front specification of enablePositionIncrement. Together with
+ StopFilter they have a common base class (FilteringTokenFilter) that handles
+ the position increments automatically. Implementors only need to override an
+ accept() method that filters tokens. (Uwe Schindler, Robert Muir)
+
Bug fixes
* LUCENE-2249: ParallelMultiSearcher should shut down thread pool on
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java?rev=1065343&r1=1065342&r2=1065343&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java Sun Jan 30 18:30:34 2011
@@ -22,10 +22,9 @@ import java.util.Arrays;
import java.util.List;
import java.util.Set;
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.util.Version;
@@ -42,14 +41,10 @@ import org.apache.lucene.util.Version;
* increments are preserved
* </ul>
*/
-public final class StopFilter extends TokenFilter {
+public final class StopFilter extends FilteringTokenFilter {
private final CharArraySet stopWords;
- private boolean enablePositionIncrements = true;
-
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-
/**
* Construct a token stream filtering the given input. If
@@ -75,7 +70,7 @@ public final class StopFilter extends To
*/
public StopFilter(Version matchVersion, TokenStream input, Set<?> stopWords, boolean ignoreCase)
{
- super(input);
+ super(true, input);
this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet) stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase);
}
@@ -157,48 +152,8 @@ public final class StopFilter extends To
* Returns the next input Token whose term() is not a stop word.
*/
@Override
- public final boolean incrementToken() throws IOException {
- // return the first non-stop word found
- int skippedPositions = 0;
- while (input.incrementToken()) {
- if (!stopWords.contains(termAtt.buffer(), 0, termAtt.length())) {
- if (enablePositionIncrements) {
- posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
- }
- return true;
- }
- skippedPositions += posIncrAtt.getPositionIncrement();
- }
- // reached EOS -- return false
- return false;
- }
-
- /**
- * @see #setEnablePositionIncrements(boolean)
- */
- public boolean getEnablePositionIncrements() {
- return enablePositionIncrements;
+ protected boolean accept() throws IOException {
+ return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
}
- /**
- * If <code>true</code>, this StopFilter will preserve
- * positions of the incoming tokens (ie, accumulate and
- * set position increments of the removed stop tokens).
- * Generally, <code>true</code> is best as it does not
- * lose information (positions of the original tokens)
- * during indexing.
- *
- * Default is true.
- *
- * <p> When set, when a token is stopped
- * (omitted), the position increment of the following
- * token is incremented.
- *
- * <p> <b>NOTE</b>: be sure to also
- * set {@link QueryParser#setEnablePositionIncrements} if
- * you use QueryParser to create queries.
- */
- public void setEnablePositionIncrements(boolean enable) {
- this.enablePositionIncrements = enable;
- }
}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java?rev=1065343&r1=1065342&r2=1065343&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java Sun Jan 30 18:30:34 2011
@@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
@@ -30,22 +31,19 @@ import org.apache.lucene.analysis.util.C
*
* @since solr 1.3
*/
-public final class KeepWordFilter extends TokenFilter {
+public final class KeepWordFilter extends FilteringTokenFilter {
private final CharArraySet words;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/** The words set passed to this constructor will be directly used by this filter
* and should not be modified, */
- public KeepWordFilter(TokenStream in, CharArraySet words) {
- super(in);
+ public KeepWordFilter(boolean enablePositionIncrements, TokenStream in, CharArraySet words) {
+ super(enablePositionIncrements, in);
this.words = words;
}
@Override
- public boolean incrementToken() throws IOException {
- while (input.incrementToken()) {
- if (words.contains(termAtt.buffer(), 0, termAtt.length())) return true;
- }
- return false;
+ public boolean accept() throws IOException {
+ return words.contains(termAtt.buffer(), 0, termAtt.length());
}
}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java?rev=1065343&r1=1065342&r2=1065343&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java Sun Jan 30 18:30:34 2011
@@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
@@ -29,7 +30,7 @@ import org.apache.lucene.analysis.tokena
* Note: Length is calculated as the number of UTF-16 code units.
* </p>
*/
-public final class LengthFilter extends TokenFilter {
+public final class LengthFilter extends FilteringTokenFilter {
private final int min;
private final int max;
@@ -40,27 +41,15 @@ public final class LengthFilter extends
* Build a filter that removes words that are too long or too
* short from the text.
*/
- public LengthFilter(TokenStream in, int min, int max)
- {
- super(in);
+ public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
+ super(enablePositionIncrements, in);
this.min = min;
this.max = max;
}
- /**
- * Returns the next input Token whose term() is the right len
- */
@Override
- public final boolean incrementToken() throws IOException {
- // return the first non-stop word found
- while (input.incrementToken()) {
- int len = termAtt.length();
- if (len >= min && len <= max) {
- return true;
- }
- // note: else we ignore it but should we index each part of it?
- }
- // reached EOS -- return false
- return false;
+ public boolean accept() throws IOException {
+ final int len = termAtt.length();
+ return (len >= min && len <= max);
}
}
Copied: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java (from r1065324, lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java?p2=lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java&p1=lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java&r1=1065324&r2=1065343&rev=1065343&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java Sun Jan 30 18:30:34 2011
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis;
+package org.apache.lucene.analysis.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
import java.io.IOException;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.queryParser.QueryParser; // for javadoc
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java?rev=1065343&r1=1065342&r2=1065343&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java Sun Jan 30 18:30:34 2011
@@ -35,16 +35,26 @@ public class TestKeepWordFilter extends
words.add( "aaa" );
words.add( "bbb" );
- String input = "aaa BBB ccc ddd EEE";
+ String input = "xxx yyy aaa zzz BBB ccc ddd EEE";
// Test Stopwords
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
- stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
- assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
+ stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
+ assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });
// Now force case
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
- stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
- assertTokenStreamContents(stream, new String[] { "aaa" });
+ stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
+ assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });
+
+ // Test Stopwords
+ stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+ stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
+ assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 });
+
+ // Now force case
+ stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+ stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
+ assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
}
}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java?rev=1065343&r1=1065342&r2=1065343&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java Sun Jan 30 18:30:34 2011
@@ -24,19 +24,24 @@ import java.io.StringReader;
public class TestLengthFilter extends BaseTokenStreamTestCase {
- public void testFilter() throws Exception {
+ public void testFilterNoPosIncr() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
- LengthFilter filter = new LengthFilter(stream, 2, 6);
- CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+ LengthFilter filter = new LengthFilter(false, stream, 2, 6);
+ assertTokenStreamContents(filter,
+ new String[]{"short", "ab", "foo"},
+ new int[]{1, 1, 1}
+ );
+ }
- assertTrue(filter.incrementToken());
- assertEquals("short", termAtt.toString());
- assertTrue(filter.incrementToken());
- assertEquals("ab", termAtt.toString());
- assertTrue(filter.incrementToken());
- assertEquals("foo", termAtt.toString());
- assertFalse(filter.incrementToken());
+ public void testFilterWithPosIncr() throws Exception {
+ TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+ new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
+ LengthFilter filter = new LengthFilter(true, stream, 2, 6);
+ assertTokenStreamContents(filter,
+ new String[]{"short", "ab", "foo"},
+ new int[]{1, 4, 2}
+ );
}
}
Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java?rev=1065343&r1=1065342&r2=1065343&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java Sun Jan 30 18:30:34 2011
@@ -23,22 +23,27 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
import org.apache.lucene.analysis.util.CharArraySet;
+import java.util.Map;
import java.util.Set;
import java.io.IOException;
/**
* @version $Id$
- * @since solr 1.3
*/
public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
- private CharArraySet words;
- private boolean ignoreCase;
+ @Override
+ public void init(Map<String,String> args) {
+ super.init(args);
+ assureMatchVersion();
+ }
public void inform(ResourceLoader loader) {
String wordFiles = args.get("words");
ignoreCase = getBoolean("ignoreCase", false);
- if (wordFiles != null) {
+ enablePositionIncrements = getBoolean("enablePositionIncrements",false);
+
+ if (wordFiles != null) {
try {
words = getWordSet(loader, wordFiles, ignoreCase);
} catch (IOException e) {
@@ -47,6 +52,10 @@ public class KeepWordFilterFactory exten
}
}
+ private CharArraySet words;
+ private boolean ignoreCase;
+ private boolean enablePositionIncrements;
+
/**
* Set the keep word list.
* NOTE: if ignoreCase==true, the words are expected to be lowercase
@@ -62,15 +71,19 @@ public class KeepWordFilterFactory exten
this.ignoreCase = ignoreCase;
}
- public KeepWordFilter create(TokenStream input) {
- return new KeepWordFilter(input, words);
+ public boolean isEnablePositionIncrements() {
+ return enablePositionIncrements;
+ }
+
+ public boolean isIgnoreCase() {
+ return ignoreCase;
}
public CharArraySet getWords() {
return words;
}
- public boolean isIgnoreCase() {
- return ignoreCase;
+ public KeepWordFilter create(TokenStream input) {
+ return new KeepWordFilter(enablePositionIncrements, input, words);
}
}
Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java?rev=1065343&r1=1065342&r2=1065343&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java Sun Jan 30 18:30:34 2011
@@ -27,6 +27,7 @@ import java.util.Map;
*/
public class LengthFilterFactory extends BaseTokenFilterFactory {
int min,max;
+ boolean enablePositionIncrements;
public static final String MIN_KEY = "min";
public static final String MAX_KEY = "max";
@@ -35,8 +36,10 @@ public class LengthFilterFactory extends
super.init(args);
min=Integer.parseInt(args.get(MIN_KEY));
max=Integer.parseInt(args.get(MAX_KEY));
+ enablePositionIncrements = getBoolean("enablePositionIncrements",false);
}
+
public LengthFilter create(TokenStream input) {
- return new LengthFilter(input,min,max);
+ return new LengthFilter(enablePositionIncrements, input,min,max);
}
}
Modified: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/LengthFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/LengthFilterTest.java?rev=1065343&r1=1065342&r2=1065343&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/LengthFilterTest.java (original)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/LengthFilterTest.java Sun Jan 30 18:30:34 2011
@@ -31,9 +31,19 @@ public class LengthFilterTest extends Ba
Map<String, String> args = new HashMap<String, String>();
args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
+ // default: args.put("enablePositionIncrements", "false");
factory.init(args);
String test = "foo foobar super-duper-trooper";
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test)));
- assertTokenStreamContents(stream, new String[] { "foobar" });
+ assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 });
+
+ factory = new LengthFilterFactory();
+ args = new HashMap<String, String>();
+ args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
+ args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
+ args.put("enablePositionIncrements", "true");
+ factory.init(args);
+ stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test)));
+ assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 2 });
}
}
\ No newline at end of file