You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by ma...@apache.org on 2009/07/14 23:39:23 UTC
svn commit: r794078 - in /lucene/java/trunk: ./
contrib/analyzers/src/java/org/apache/lucene/analysis/th/
contrib/highlighter/src/test/org/apache/lucene/search/highlight/
contrib/memory/src/java/org/apache/lucene/index/memory/
contrib/memory/src/test/o...
Author: markrmiller
Date: Tue Jul 14 21:39:22 2009
New Revision: 794078
URL: http://svn.apache.org/viewvc?rev=794078&view=rev
Log:
LUCENE-1688: Deprecate the static final String stop word array in StopAnalyzer and replace it with an immutable implementation of CharArraySet.
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java
lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
lucene/java/trunk/src/test/org/apache/lucene/analysis/TestCharArraySet.java
lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStopAnalyzer.java
lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestQueryParser.java
lucene/java/trunk/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
lucene/java/trunk/src/test/org/apache/lucene/search/spans/TestSpans.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=794078&r1=794077&r2=794078&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Tue Jul 14 21:39:22 2009
@@ -309,6 +309,11 @@
all synchronization in TermInfosReader, which previously could
cause threads to pile up in certain cases. (Dan Rosher via Mike
McCandless)
+
+30. LUCENE-1688: Deprecate the static final String stop word array in
+ StopAnalyzer and replace it with an immutable implementation of
+ CharArraySet. (Simon Willnauer via Mark Miller)
+
Bug fixes
@@ -604,6 +609,11 @@
9. LUCENE-1653: Avoid creating a Calendar in every call to
DateTools#dateToString, DateTools#timeToString and
DateTools#round. (Shai Erera via Mark Miller)
+
+10. LUCENE-1688: Deprecate static final String stop word array and
+ replace it with an immutable implementation of CharArraySet.
+ Removes conversions between Set and array.
+ (Simon Willnauer via Mark Miller)
Documentation
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java?rev=794078&r1=794077&r2=794078&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java Tue Jul 14 21:39:22 2009
@@ -33,7 +33,7 @@
TokenStream ts = new StandardTokenizer(reader);
ts = new StandardFilter(ts);
ts = new ThaiWordFilter(ts);
- ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS);
+ ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
return ts;
}
}
Modified: lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java?rev=794078&r1=794077&r2=794078&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java Tue Jul 14 21:39:22 2009
@@ -23,9 +23,11 @@
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.StringTokenizer;
import javax.xml.parsers.DocumentBuilder;
@@ -982,7 +984,8 @@
public void run() throws Exception {
String goodWord = "goodtoken";
- String stopWords[] = { "stoppedtoken" };
+ Set stopWords = new HashSet(1);
+ stopWords.add("stoppedtoken");
TermQuery query = new TermQuery(new Term("data", goodWord));
@@ -991,7 +994,8 @@
sb.append(goodWord);
for (int i = 0; i < 10000; i++) {
sb.append(" ");
- sb.append(stopWords[0]);
+ // only one stopword
+ sb.append(stopWords.iterator().next());
}
SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
Highlighter hg = getHighlighter(query, "data", new StandardAnalyzer(stopWords).tokenStream(
@@ -1024,7 +1028,9 @@
public void testMaxSizeEndHighlight() throws Exception {
TestHighlightRunner helper = new TestHighlightRunner() {
public void run() throws Exception {
- String stopWords[] = { "in", "it" };
+ Set stopWords = new HashSet();
+ stopWords.add("in");
+ stopWords.add("it");
TermQuery query = new TermQuery(new Term("text", "searchterm"));
String text = "this is a text with searchterm in it";
Modified: lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java?rev=794078&r1=794077&r2=794078&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java (original)
+++ lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java Tue Jul 14 21:39:22 2009
@@ -70,55 +70,60 @@
/** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
- private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] {
- "a", "about", "above", "across", "adj", "after", "afterwards",
- "again", "against", "albeit", "all", "almost", "alone", "along",
- "already", "also", "although", "always", "among", "amongst", "an",
- "and", "another", "any", "anyhow", "anyone", "anything",
- "anywhere", "are", "around", "as", "at", "be", "became", "because",
- "become", "becomes", "becoming", "been", "before", "beforehand",
- "behind", "being", "below", "beside", "besides", "between",
- "beyond", "both", "but", "by", "can", "cannot", "co", "could",
- "down", "during", "each", "eg", "either", "else", "elsewhere",
- "enough", "etc", "even", "ever", "every", "everyone", "everything",
- "everywhere", "except", "few", "first", "for", "former",
- "formerly", "from", "further", "had", "has", "have", "he", "hence",
- "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
- "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
- "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
- "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
- "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
- "must", "my", "myself", "namely", "neither", "never",
- "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
- "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
- "once one", "only", "onto", "or", "other", "others", "otherwise",
- "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
- "rather", "s", "same", "seem", "seemed", "seeming", "seems",
- "several", "she", "should", "since", "so", "some", "somehow",
- "someone", "something", "sometime", "sometimes", "somewhere",
- "still", "such", "t", "than", "that", "the", "their", "them",
- "themselves", "then", "thence", "there", "thereafter", "thereby",
- "therefor", "therein", "thereupon", "these", "they", "this",
- "those", "though", "through", "throughout", "thru", "thus", "to",
- "together", "too", "toward", "towards", "under", "until", "up",
- "upon", "us", "very", "via", "was", "we", "well", "were", "what",
- "whatever", "whatsoever", "when", "whence", "whenever",
- "whensoever", "where", "whereafter", "whereas", "whereat",
- "whereby", "wherefrom", "wherein", "whereinto", "whereof",
- "whereon", "whereto", "whereunto", "whereupon", "wherever",
- "wherewith", "whether", "which", "whichever", "whichsoever",
- "while", "whilst", "whither", "who", "whoever", "whole", "whom",
- "whomever", "whomsoever", "whose", "whosoever", "why", "will",
- "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
- "xother ", "xnote", "yet", "you", "your", "yours", "yourself",
- "yourselves"});
+ private static final Set EXTENDED_ENGLISH_STOP_WORDS;
+ static {
+ EXTENDED_ENGLISH_STOP_WORDS = new HashSet();
+
+ EXTENDED_ENGLISH_STOP_WORDS.addAll(Arrays.asList(new String[] {
+ "a", "about", "above", "across", "adj", "after", "afterwards",
+ "again", "against", "albeit", "all", "almost", "alone", "along",
+ "already", "also", "although", "always", "among", "amongst", "an",
+ "and", "another", "any", "anyhow", "anyone", "anything",
+ "anywhere", "are", "around", "as", "at", "be", "became", "because",
+ "become", "becomes", "becoming", "been", "before", "beforehand",
+ "behind", "being", "below", "beside", "besides", "between",
+ "beyond", "both", "but", "by", "can", "cannot", "co", "could",
+ "down", "during", "each", "eg", "either", "else", "elsewhere",
+ "enough", "etc", "even", "ever", "every", "everyone", "everything",
+ "everywhere", "except", "few", "first", "for", "former",
+ "formerly", "from", "further", "had", "has", "have", "he", "hence",
+ "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
+ "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
+ "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
+ "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
+ "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
+ "must", "my", "myself", "namely", "neither", "never",
+ "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
+ "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
+ "once one", "only", "onto", "or", "other", "others", "otherwise",
+ "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
+ "rather", "s", "same", "seem", "seemed", "seeming", "seems",
+ "several", "she", "should", "since", "so", "some", "somehow",
+ "someone", "something", "sometime", "sometimes", "somewhere",
+ "still", "such", "t", "than", "that", "the", "their", "them",
+ "themselves", "then", "thence", "there", "thereafter", "thereby",
+ "therefor", "therein", "thereupon", "these", "they", "this",
+ "those", "though", "through", "throughout", "thru", "thus", "to",
+ "together", "too", "toward", "towards", "under", "until", "up",
+ "upon", "us", "very", "via", "was", "we", "well", "were", "what",
+ "whatever", "whatsoever", "when", "whence", "whenever",
+ "whensoever", "where", "whereafter", "whereas", "whereat",
+ "whereby", "wherefrom", "wherein", "whereinto", "whereof",
+ "whereon", "whereto", "whereunto", "whereupon", "wherever",
+ "wherewith", "whether", "which", "whichever", "whichsoever",
+ "while", "whilst", "whither", "who", "whoever", "whole", "whom",
+ "whomever", "whomsoever", "whose", "whosoever", "why", "will",
+ "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
+ "xother ", "xnote", "yet", "you", "your", "yours", "yourself",
+ "yourselves"}));
+ }
/**
* A lower-casing word analyzer with English stop words (can be shared
* freely across threads without harm); global per class loader.
*/
public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
- NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));
+ NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
/**
* A lower-casing word analyzer with <b>extended </b> English stop words
@@ -191,7 +196,7 @@
}
else {
stream = new PatternTokenizer(text, pattern, toLowerCase);
- if (stopWords != null) stream = new StopFilter(stream, stopWords);
+ if (stopWords != null) stream = new StopFilter(false, stream, stopWords);
}
return stream;
@@ -304,9 +309,9 @@
}
/** somewhat oversized to minimize hash collisions */
- private static Set makeStopSet(String[] stopWords) {
- Set stops = new HashSet(stopWords.length * 2, 0.3f);
- stops.addAll(Arrays.asList(stopWords));
+ private static Set makeStopSet(Set stopWords) {
+ Set stops = new HashSet(stopWords.size() * 2, 0.3f);
+ stops.addAll(stopWords);
return stops;
// return Collections.unmodifiableSet(stops);
}
Modified: lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java?rev=794078&r1=794077&r2=794078&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (original)
+++ lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java Tue Jul 14 21:39:22 2009
@@ -271,7 +271,7 @@
boolean toLowerCase = true;
// boolean toLowerCase = false;
// Set stopWords = null;
- Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
+ Set stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
Analyzer[] analyzers = new Analyzer[] {
new SimpleAnalyzer(),
Modified: lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java?rev=794078&r1=794077&r2=794078&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java (original)
+++ lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java Tue Jul 14 21:39:22 2009
@@ -135,7 +135,7 @@
for (int stops=0; stops < maxStops; stops++) {
Set stopWords = null;
- if (stops != 0) stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
+ if (stops != 0) stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
for (int toLower=0; toLower < maxToLower; toLower++) {
boolean toLowerCase = toLower != 0;
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java?rev=794078&r1=794077&r2=794078&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java Tue Jul 14 21:39:22 2009
@@ -2,6 +2,7 @@
import java.util.AbstractSet;
import java.util.Collection;
+import java.util.Collections;
import java.util.Iterator;
/**
@@ -53,6 +54,12 @@
this(c.size(), ignoreCase);
addAll(c);
}
+ /** Create set from entries */
+ private CharArraySet(char[][] entries, boolean ignoreCase, int count){
+ this.entries = entries;
+ this.ignoreCase = ignoreCase;
+ this.count = count;
+ }
/** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
* are in the set */
@@ -100,7 +107,7 @@
public boolean add(CharSequence text) {
return add(text.toString()); // could be more efficient
}
-
+
/** Add this String into the set */
public boolean add(String text) {
return add(text.toCharArray());
@@ -228,6 +235,26 @@
}
return add(o.toString());
}
+
+ /**
+ * Returns an unmodifiable {@link CharArraySet}. This makes it possible to
+ * provide unmodifiable views of internal sets for "read-only" use.
+ *
+ * @param set
+ * a set for which the unmodifiable set is returned.
+ * @return a new unmodifiable {@link CharArraySet}.
+ * @throws NullPointerException
+ * if the given set is <code>null</code>.
+ */
+ public static CharArraySet unmodifiableSet(CharArraySet set) {
+ if (set == null)
+ throw new NullPointerException("Given set is null");
+ /*
+ * Instead of delegating calls to the given set copy the low-level values to
+ * the unmodifiable Subclass
+ */
+ return new UnmodifiableCharArraySet(set.entries, set.ignoreCase, set.count);
+ }
/** The Iterator<String> for this set. Strings are constructed on the fly, so
* use <code>nextCharArray</code> for more efficient access. */
@@ -270,5 +297,40 @@
public Iterator iterator() {
return new CharArraySetIterator();
}
+
+ /**
+ * Efficient unmodifiable {@link CharArraySet}. This implementation does not
+ * delegate calls to a given {@link CharArraySet} like
+ * {@link Collections#unmodifiableSet(java.util.Set)} does. Instead it passes
+ * the internal representation of a {@link CharArraySet} to a super
+ * constructor and overrides all mutators.
+ */
+ private static final class UnmodifiableCharArraySet extends CharArraySet {
+
+ private UnmodifiableCharArraySet(char[][] entries, boolean ignoreCase,
+ int count) {
+ super(entries, ignoreCase, count);
+ }
+
+ public boolean add(Object o){
+ throw new UnsupportedOperationException();
+ }
+
+ public boolean addAll(Collection coll) {
+ throw new UnsupportedOperationException();
+ }
+
+ public boolean add(char[] text) {
+ throw new UnsupportedOperationException();
+ }
+
+ public boolean add(CharSequence text) {
+ throw new UnsupportedOperationException();
+ }
+
+ public boolean add(String text) {
+ throw new UnsupportedOperationException();
+ }
+ }
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java?rev=794078&r1=794077&r2=794078&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java Tue Jul 14 21:39:22 2009
@@ -20,18 +20,20 @@
import java.io.File;
import java.io.IOException;
import java.io.Reader;
+import java.util.Arrays;
import java.util.Set;
/** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
public final class StopAnalyzer extends Analyzer {
- private Set stopWords;
+ private final Set/*<String>*/ stopWords;
// @deprecated
- private boolean useDefaultStopPositionIncrement;
- private boolean enablePositionIncrements;
+ private final boolean useDefaultStopPositionIncrement;
+ private final boolean enablePositionIncrements;
/** An array containing some common English words that are not usually useful
- for searching. */
+ for searching.
+ @deprecated Use {@link #ENGLISH_STOP_WORDS_SET} instead */
public static final String[] ENGLISH_STOP_WORDS = {
"a", "an", "and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
@@ -39,13 +41,31 @@
"that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
};
-
+
+ /** An unmodifiable set containing some common English words that are not usually useful
+ for searching.*/
+ public static final Set/*<String>*/ ENGLISH_STOP_WORDS_SET;
+
+ static {
+ final String[] stopWords = new String[]{
+ "a", "an", "and", "are", "as", "at", "be", "but", "by",
+ "for", "if", "in", "into", "is", "it",
+ "no", "not", "of", "on", "or", "such",
+ "that", "the", "their", "then", "there", "these",
+ "they", "this", "to", "was", "will", "with"
+ };
+ final CharArraySet stopSet = new CharArraySet(stopWords.length, false);
+ stopSet.addAll(Arrays.asList(stopWords));
+ ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
+ }
+
/** Builds an analyzer which removes words in
* ENGLISH_STOP_WORDS.
* @deprecated Use {@link #StopAnalyzer(boolean)} instead */
public StopAnalyzer() {
- stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
+ stopWords = ENGLISH_STOP_WORDS_SET;
useDefaultStopPositionIncrement = true;
+ enablePositionIncrements = false;
}
/** Builds an analyzer which removes words in
@@ -53,8 +73,9 @@
* @param enablePositionIncrements See {@link
* StopFilter#setEnablePositionIncrements} */
public StopAnalyzer(boolean enablePositionIncrements) {
- stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
+ stopWords = ENGLISH_STOP_WORDS_SET;
this.enablePositionIncrements = enablePositionIncrements;
+ useDefaultStopPositionIncrement = false;
}
/** Builds an analyzer with the stop words from the given set.
@@ -62,6 +83,7 @@
public StopAnalyzer(Set stopWords) {
this.stopWords = stopWords;
useDefaultStopPositionIncrement = true;
+ enablePositionIncrements = false;
}
/** Builds an analyzer with the stop words from the given set.
@@ -71,22 +93,26 @@
public StopAnalyzer(Set stopWords, boolean enablePositionIncrements) {
this.stopWords = stopWords;
this.enablePositionIncrements = enablePositionIncrements;
+ useDefaultStopPositionIncrement = false;
}
/** Builds an analyzer which removes words in the provided array.
- * @deprecated Use {@link #StopAnalyzer(String[], boolean)} instead */
+ * @deprecated Use {@link #StopAnalyzer(Set, boolean)} instead */
public StopAnalyzer(String[] stopWords) {
this.stopWords = StopFilter.makeStopSet(stopWords);
useDefaultStopPositionIncrement = true;
+ enablePositionIncrements = false;
}
/** Builds an analyzer which removes words in the provided array.
* @param stopWords Array of stop words
* @param enablePositionIncrements See {@link
- * StopFilter#setEnablePositionIncrements} */
+ * StopFilter#setEnablePositionIncrements}
+ * @deprecated Use {@link #StopAnalyzer(Set, boolean)} instead */
public StopAnalyzer(String[] stopWords, boolean enablePositionIncrements) {
this.stopWords = StopFilter.makeStopSet(stopWords);
this.enablePositionIncrements = enablePositionIncrements;
+ useDefaultStopPositionIncrement = false;
}
/** Builds an analyzer with the stop words from the given file.
@@ -95,6 +121,7 @@
public StopAnalyzer(File stopwordsFile) throws IOException {
stopWords = WordlistLoader.getWordSet(stopwordsFile);
useDefaultStopPositionIncrement = true;
+ enablePositionIncrements = false;
}
/** Builds an analyzer with the stop words from the given file.
@@ -105,6 +132,7 @@
public StopAnalyzer(File stopwordsFile, boolean enablePositionIncrements) throws IOException {
stopWords = WordlistLoader.getWordSet(stopwordsFile);
this.enablePositionIncrements = enablePositionIncrements;
+ useDefaultStopPositionIncrement = false;
}
/** Builds an analyzer with the stop words from the given reader.
@@ -114,6 +142,7 @@
public StopAnalyzer(Reader stopwords) throws IOException {
stopWords = WordlistLoader.getWordSet(stopwords);
useDefaultStopPositionIncrement = true;
+ enablePositionIncrements = false;
}
/** Builds an analyzer with the stop words from the given reader.
@@ -124,6 +153,7 @@
public StopAnalyzer(Reader stopwords, boolean enablePositionIncrements) throws IOException {
stopWords = WordlistLoader.getWordSet(stopwords);
this.enablePositionIncrements = enablePositionIncrements;
+ useDefaultStopPositionIncrement = false;
}
/** Filters LowerCaseTokenizer with StopFilter. */
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java?rev=794078&r1=794077&r2=794078&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java Tue Jul 14 21:39:22 2009
@@ -55,6 +55,7 @@
* @param enablePositionIncrements true if token positions should record the removed stop words
* @param input input TokenStream
* @param stopWords array of stop words
+ * @deprecated Use {@link #StopFilter(boolean, TokenStream, Set)} instead.
*/
public StopFilter(boolean enablePositionIncrements, TokenStream input, String [] stopWords)
{
@@ -77,6 +78,7 @@
* @param in input TokenStream
* @param stopWords array of stop words
* @param ignoreCase true if case is ignored
+ * @deprecated Use {@link #StopFilter(boolean, TokenStream, Set, boolean)} instead.
*/
public StopFilter(boolean enablePositionIncrements, TokenStream in, String[] stopWords, boolean ignoreCase) {
super(in);
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java?rev=794078&r1=794077&r2=794078&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java Tue Jul 14 21:39:22 2009
@@ -101,15 +101,19 @@
/** An array containing some common English words that are usually not
- useful for searching. */
+ useful for searching.
+ @deprecated Use {@link #STOP_WORDS_SET} instead */
public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
+
+ /** An unmodifiable set containing some common English words that are usually not
+ useful for searching. */
+ public static final Set/*<String>*/ STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
/** Builds an analyzer with the default stop words ({@link
- * #STOP_WORDS}).
- * @deprecated Use {@link #StandardAnalyzer(Version)},
- * instead. */
+ * #STOP_WORDS_SET}).
+ * @deprecated Use {@link #StandardAnalyzer(Version)} instead. */
public StandardAnalyzer() {
- this(Version.LUCENE_24, STOP_WORDS);
+ this(Version.LUCENE_24, STOP_WORDS_SET);
}
/** Builds an analyzer with the default stop words ({@link
@@ -118,7 +122,7 @@
* <a href="#version">above</a>}
*/
public StandardAnalyzer(Version matchVersion) {
- this(matchVersion, STOP_WORDS);
+ this(matchVersion, STOP_WORDS_SET);
}
/** Builds an analyzer with the given stop words.
@@ -138,22 +142,9 @@
}
/** Builds an analyzer with the given stop words.
- * @deprecated Use {@link #StandardAnalyzer(Version,
- * String[])} instead */
+ * @deprecated Use {@link #StandardAnalyzer(Version, Set)} instead */
public StandardAnalyzer(String[] stopWords) {
- this(Version.LUCENE_24, stopWords);
- }
-
- /** Builds an analyzer with the given stop words.
- * @param matchVersion Lucene version to match See {@link
- * <a href="#version">above</a>}
- * @param stopWords Array of stop words */
- public StandardAnalyzer(Version matchVersion, String[] stopWords) {
- if (stopWords == null) {
- stopWords = STOP_WORDS;
- }
- stopSet = StopFilter.makeStopSet(stopWords);
- init(matchVersion);
+ this(Version.LUCENE_24, StopFilter.makeStopSet(stopWords));
}
/** Builds an analyzer with the stop words from the given file.
@@ -203,8 +194,9 @@
* @deprecated Remove in 3.X and make true the only valid value
*/
public StandardAnalyzer(boolean replaceInvalidAcronym) {
- this(Version.LUCENE_24, STOP_WORDS);
+ this(Version.LUCENE_24, STOP_WORDS_SET);
this.replaceInvalidAcronym = replaceInvalidAcronym;
+ useDefaultStopPositionIncrements = true;
}
/**
@@ -243,7 +235,7 @@
* @deprecated Remove in 3.X and make true the only valid value
*/
public StandardAnalyzer(String [] stopwords, boolean replaceInvalidAcronym) throws IOException{
- this(Version.LUCENE_24, stopwords);
+ this(Version.LUCENE_24, StopFilter.makeStopSet(stopwords));
this.replaceInvalidAcronym = replaceInvalidAcronym;
}
Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestCharArraySet.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestCharArraySet.java?rev=794078&r1=794077&r2=794078&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestCharArraySet.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestCharArraySet.java Tue Jul 14 21:39:22 2009
@@ -23,13 +23,22 @@
public class TestCharArraySet extends LuceneTestCase {
+ static final String[] TEST_STOP_WORDS = {
+ "a", "an", "and", "are", "as", "at", "be", "but", "by",
+ "for", "if", "in", "into", "is", "it",
+ "no", "not", "of", "on", "or", "such",
+ "that", "the", "their", "then", "there", "these",
+ "they", "this", "to", "was", "will", "with"
+ };
+
+
public void testRehash() throws Exception {
CharArraySet cas = new CharArraySet(0, true);
- for(int i=0;i<StopAnalyzer.ENGLISH_STOP_WORDS.length;i++)
- cas.add(StopAnalyzer.ENGLISH_STOP_WORDS[i]);
- assertEquals(StopAnalyzer.ENGLISH_STOP_WORDS.length, cas.size());
- for(int i=0;i<StopAnalyzer.ENGLISH_STOP_WORDS.length;i++)
- assertTrue(cas.contains(StopAnalyzer.ENGLISH_STOP_WORDS[i]));
+ for(int i=0;i<TEST_STOP_WORDS.length;i++)
+ cas.add(TEST_STOP_WORDS[i]);
+ assertEquals(TEST_STOP_WORDS.length, cas.size());
+ for(int i=0;i<TEST_STOP_WORDS.length;i++)
+ assertTrue(cas.contains(TEST_STOP_WORDS[i]));
}
public void testNonZeroOffset() {
@@ -39,6 +48,11 @@
set.addAll(Arrays.asList(words));
assertTrue(set.contains(findme, 1, 4));
assertTrue(set.contains(new String(findme,1,4)));
+
+ // test unmodifiable
+ set = CharArraySet.unmodifiableSet(set);
+ assertTrue(set.contains(findme, 1, 4));
+ assertTrue(set.contains(new String(findme,1,4)));
}
public void testObjectContains() {
@@ -47,5 +61,118 @@
set.add(val);
assertTrue(set.contains(val));
assertTrue(set.contains(new Integer(1)));
+ // test unmodifiable
+ set = CharArraySet.unmodifiableSet(set);
+ assertTrue(set.contains(val));
+ assertTrue(set.contains(new Integer(1)));
+ }
+
+ public void testClear(){
+ CharArraySet set=new CharArraySet(10,true);
+ set.addAll(Arrays.asList(TEST_STOP_WORDS));
+ assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
+ try{
+ set.clear();
+ fail("remove is not supported");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
+ }
+ }
+
+ public void testModifyOnUnmodifiable(){
+ CharArraySet set=new CharArraySet(10,true);
+ set.addAll(Arrays.asList(TEST_STOP_WORDS));
+ final int size = set.size();
+ set = CharArraySet.unmodifiableSet(set);
+ assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
+ String NOT_IN_SET = "SirGallahad";
+ assertFalse("Test String already exists in set", set.contains(NOT_IN_SET));
+
+ try{
+ set.add(NOT_IN_SET.toCharArray());
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ try{
+ set.add(NOT_IN_SET);
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ try{
+ set.add(new StringBuffer(NOT_IN_SET));
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ try{
+ set.clear();
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Changed unmodifiable set", set.contains(NOT_IN_SET));
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+ try{
+ set.add((Object) NOT_IN_SET);
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+ try{
+ set.removeAll(Arrays.asList(TEST_STOP_WORDS));
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ try{
+ set.retainAll(Arrays.asList(new String[]{NOT_IN_SET}));
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ try{
+ set.addAll(Arrays.asList(new String[]{NOT_IN_SET}));
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+ }
+
+ for (int i = 0; i < TEST_STOP_WORDS.length; i++) {
+ assertTrue(set.contains(TEST_STOP_WORDS[i]));
+ }
+ }
+
+ public void testUnmodifiableSet(){
+ CharArraySet set=new CharArraySet(10,true);
+ set.addAll(Arrays.asList(TEST_STOP_WORDS));
+ final int size = set.size();
+ set = CharArraySet.unmodifiableSet(set);
+ assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
+
+ try{
+ CharArraySet.unmodifiableSet(null);
+ fail("can not make null unmodifiable");
+ }catch (NullPointerException e) {
+ // expected
+ }
}
}
Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStopAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStopAnalyzer.java?rev=794078&r1=794077&r2=794078&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStopAnalyzer.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStopAnalyzer.java Tue Jul 14 21:39:22 2009
@@ -23,12 +23,13 @@
import java.io.StringReader;
import java.io.IOException;
+import java.util.Iterator;
import java.util.Set;
import java.util.HashSet;
public class TestStopAnalyzer extends LuceneTestCase {
- private StopAnalyzer stop = new StopAnalyzer();
+ private StopAnalyzer stop = new StopAnalyzer(false);
private Set inValidTokens = new HashSet();
public TestStopAnalyzer(String s) {
@@ -37,8 +38,10 @@
protected void setUp() throws Exception {
super.setUp();
- for (int i = 0; i < StopAnalyzer.ENGLISH_STOP_WORDS.length; i++) {
- inValidTokens.add(StopAnalyzer.ENGLISH_STOP_WORDS[i]);
+
+ Iterator it = StopAnalyzer.ENGLISH_STOP_WORDS_SET.iterator();
+ while(it.hasNext()) {
+ inValidTokens.add(it.next());
}
}
Modified: lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestQueryParser.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestQueryParser.java?rev=794078&r1=794077&r2=794078&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestQueryParser.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestQueryParser.java Tue Jul 14 21:39:22 2009
@@ -23,7 +23,9 @@
import java.text.DateFormat;
import java.util.Calendar;
import java.util.Date;
+import java.util.HashSet;
import java.util.Locale;
+import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
@@ -768,7 +770,9 @@
public void testBoost()
throws Exception {
- StandardAnalyzer oneStopAnalyzer = new StandardAnalyzer(new String[]{"on"});
+ Set stopWords = new HashSet(1);
+ stopWords.add("on");
+ StandardAnalyzer oneStopAnalyzer = new StandardAnalyzer(stopWords);
QueryParser qp = new QueryParser("field", oneStopAnalyzer);
Query q = qp.parse("on^1.0");
assertNotNull(q);
Modified: lucene/java/trunk/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java?rev=794078&r1=794077&r2=794078&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java Tue Jul 14 21:39:22 2009
@@ -31,6 +31,7 @@
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
+import java.util.HashSet;
import java.util.LinkedList;
/**
@@ -169,7 +170,7 @@
public void testPhrasePrefixWithBooleanQuery() throws IOException {
RAMDirectory indexStore = new RAMDirectory();
- IndexWriter writer = new IndexWriter(indexStore, new StandardAnalyzer(new String[]{}), true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(indexStore, new StandardAnalyzer(new HashSet(0)), true, IndexWriter.MaxFieldLength.LIMITED);
add("This is a test", "object", writer);
add("a note", "note", writer);
writer.close();
Modified: lucene/java/trunk/src/test/org/apache/lucene/search/spans/TestSpans.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/spans/TestSpans.java?rev=794078&r1=794077&r2=794078&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/spans/TestSpans.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/spans/TestSpans.java Tue Jul 14 21:39:22 2009
@@ -39,6 +39,7 @@
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
+import java.util.HashSet;
public class TestSpans extends LuceneTestCase {
private IndexSearcher searcher;
@@ -448,7 +449,7 @@
// LUCENE-1404
public void testNPESpanQuery() throws Throwable {
final Directory dir = new MockRAMDirectory();
- final IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(new String[0]), IndexWriter.MaxFieldLength.LIMITED);
+ final IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(new HashSet(0)), IndexWriter.MaxFieldLength.LIMITED);
// Add documents
addDoc(writer, "1", "the big dogs went running to the market");