You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by yo...@apache.org on 2009/09/22 23:56:02 UTC
svn commit: r817859 - in /lucene/solr/trunk: ./
src/java/org/apache/solr/analysis/ src/test/org/apache/solr/analysis/
Author: yonik
Date: Tue Sep 22 21:55:57 2009
New Revision: 817859
URL: http://svn.apache.org/viewvc?rev=817859&view=rev
Log:
SOLR-908: add CommonGramsFilterFactory CommonGramsQueryFilterFactory
Added:
lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsFilter.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java (with props)
lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java (with props)
lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java (with props)
lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java (with props)
Modified:
lucene/solr/trunk/CHANGES.txt
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=817859&r1=817858&r2=817859&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Tue Sep 22 21:55:57 2009
@@ -321,6 +321,11 @@
80. SOLR-1447 : Simple property injection. <mergePolicy> & <mergeSceduler> syntaxes are deprecated ( Jason rutherglen noble)
+82. SOLR-908 : CommonGramsFilterFactory/CommonGramsQueryFilterFactory for
+ speeding up phrase queries containing common words by indexing
+ n-grams and using them at query time.
+ (Tom Burton-West, Jason Rutherglen via yonik)
+
Optimizations
----------------------
1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsFilter.java?rev=817859&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsFilter.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsFilter.java Tue Sep 22 21:55:57 2009
@@ -0,0 +1,223 @@
+/*
+ * Licensed under the Apache License,
+ * Version 2.0 (the "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/*
+ * TODO: Rewrite to use new TokenStream api from lucene 2.9 when BufferedTokenStream uses it.
+ * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and
+ * associated constructors
+ */
+
+/**
+ * Construct bigrams for frequently occurring terms while indexing. Single terms
+ * are still indexed too, with bigrams overlaid. This is achieved through the
+ * use of {@link Token#setPositionIncrement(int)}. Bigrams have a type
+ * of "gram". Example:
+ * <ul>
+ * <li>input:"the quick brown fox"</li>
+ * <li>output:|"the","the_quick"|"quick"|"brown"|"fox"|</li>
+ * <li>"the_quick" has a position increment of 0 so it is in the same position
+ * as "the". "the_quick" has a term.type() of "gram"</li>
+ *
+ * </ul>
+ */
+
+/*
+ * Constructors and makeCommonSet based on similar code in StopFilter
+ */
+
+public class CommonGramsFilter extends BufferedTokenStream {
+
+ private static final char SEPARATOR = '_';
+
+ private final CharArraySet commonWords;
+
+ private StringBuilder buffer = new StringBuilder();
+
+ /**
+ * Construct a token stream filtering the given input using a Set of common
+ * words to create bigrams. Outputs both unigrams with position increment and
+ * bigrams with position increment 0 type=gram where one or both of the words
+ * in a potential bigram are in the set of common words .
+ *
+ * @param input TokenStream input in filter chain
+ * @param commonWords The set of common words.
+ *
+ */
+ public CommonGramsFilter(TokenStream input, Set commonWords) {
+ this(input, commonWords, false);
+ }
+
+ /**
+ * Construct a token stream filtering the given input using a Set of common
+ * words to create bigrams, case-sensitive if ignoreCase is false (unless Set
+ * is CharArraySet). If <code>commonWords</code> is an instance of
+ * {@link CharArraySet} (true if <code>makeCommonSet()</code> was used to
+ * construct the set) it will be directly used and <code>ignoreCase</code>
+ * will be ignored since <code>CharArraySet</code> directly controls case
+ * sensitivity.
+ * <p/>
+ * If <code>commonWords</code> is not an instance of {@link CharArraySet}, a
+ * new CharArraySet will be constructed and <code>ignoreCase</code> will be
+ * used to specify the case sensitivity of that set.
+ *
+ * @param input TokenStream input in filter chain.
+ * @param commonWords The set of common words.
+ * @param ignoreCase -Ignore case when constructing bigrams for common words.
+ */
+ public CommonGramsFilter(TokenStream input, Set commonWords,
+ boolean ignoreCase) {
+ super(input);
+ if (commonWords instanceof CharArraySet) {
+ this.commonWords = (CharArraySet) commonWords;
+ } else {
+ this.commonWords = new CharArraySet(commonWords.size(), ignoreCase);
+ this.commonWords.addAll(commonWords);
+ }
+ init();
+ }
+
+ /**
+ * Construct a token stream filtering the given input using an Array of common
+ * words to create bigrams.
+ *
+ * @param input Tokenstream in filter chain
+ * @param commonWords words to be used in constructing bigrams
+ */
+ public CommonGramsFilter(TokenStream input, String[] commonWords) {
+   // The delegated constructor already calls init(); it must not run twice.
+   this(input, commonWords, false);
+ }
+
+ /**
+ * Construct a token stream filtering the given input using an Array of common
+ * words to create bigrams and is case-sensitive if ignoreCase is false.
+ *
+ * @param input Tokenstream in filter chain
+ * @param commonWords words to be used in constructing bigrams
+ * @param ignoreCase -Ignore case when constructing bigrams for common words.
+ */
+ public CommonGramsFilter(TokenStream input, String[] commonWords,
+ boolean ignoreCase) {
+ super(input);
+ this.commonWords = (CharArraySet) makeCommonSet(commonWords, ignoreCase);
+ init();
+ }
+
+ // Here for future moving to 2.9 api See StopFilter code
+
+ public void init() {
+ /**
+ * termAtt = (TermAttribute) addAttribute(TermAttribute.class); posIncrAtt
+ * =(PositionIncrementAttribute)
+ * addAttribute(PositionIncrementAttribute.class); typeAdd =(TypeAttribute)
+ * addAttribute(TypeAttribute.class);
+ */
+ }
+
+ /**
+ * Build a CharArraySet from an array of common words, appropriate for passing
+ * into the CommonGramsFilter constructor. This permits this commonWords
+ * construction to be cached once when an Analyzer is constructed.
+ *
+ * @see #makeCommonSet(java.lang.String[], boolean) passing false to
+ * ignoreCase
+ */
+ public static final CharArraySet makeCommonSet(String[] commonWords) {
+ return makeCommonSet(commonWords, false);
+ }
+
+ /**
+ * Build a CharArraySet from an array of common words, appropriate for passing
+ * into the CommonGramsFilter constructor,case-sensitive if ignoreCase is
+ * false.
+ *
+ * @param commonWords
+ * @param ignoreCase If true, all words are lower cased first.
+ * @return a Set containing the words
+ */
+ public static final CharArraySet makeCommonSet(String[] commonWords,
+ boolean ignoreCase) {
+ CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
+ commonSet.addAll(Arrays.asList(commonWords));
+ return commonSet;
+ }
+
+ /**
+ * Inserts bigrams for common words into a token stream. For each input token,
+ * output the token. If the token and/or the following token are in the list
+ * of common words also output a bigram with position increment 0 and
+ * type="gram"
+ */
+ /*
+ * TODO: implement new lucene 2.9 API incrementToken() instead of deprecated
+ * Token.next() TODO:Consider adding an option to not emit unigram stopwords
+ * as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be
+ * changed to work with this. TODO: Consider optimizing for the case of three
+ * commongrams i.e "man of the year" normally produces 3 bigrams: "man-of",
+ * "of-the", "the-year" but with proper management of positions we could
+ * eliminate the middle bigram "of-the"and save a disk seek and a whole set of
+ * position lookups.
+ */
+ public Token process(Token token) throws IOException {
+ Token next = peek(1);
+ // if this is the last token just spit it out. Any commongram would have
+ // been output in the previous call
+ if (next == null) {
+ return token;
+ }
+
+ /**
+ * if this token or next are common then construct a bigram with type="gram"
+ * position increment = 0, and put it in the output queue. It will be
+ * returned when super.next() is called, before this method gets called with
+ * a new token from the input stream See implementation of next() in
+ * BufferedTokenStream
+ */
+
+ if (isCommon(token) || isCommon(next)) {
+ Token gram = gramToken(token, next);
+ write(gram);
+ }
+ // we always return the unigram token
+ return token;
+ }
+
+ /** True if token is for a common term. */
+ private boolean isCommon(Token token) {
+ return commonWords != null
+ && commonWords.contains(token.termBuffer(), 0, token.termLength());
+ }
+
+ /** Construct a compound token. */
+ private Token gramToken(Token first, Token second) {
+ buffer.setLength(0);
+ buffer.append(first.termText());
+ buffer.append(SEPARATOR);
+ buffer.append(second.termText());
+ Token result = new Token(buffer.toString(), first.startOffset(), second
+ .endOffset(), "gram");
+ result.setPositionIncrement(0);
+ return result;
+ }
+
+ public void reset() throws IOException {
+ super.reset();
+ buffer.setLength(0);
+ }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsFilter.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java?rev=817859&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java Tue Sep 22 21:55:57 2009
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * Constructs a CommonGramsFilter
+ */
+
+/*
+ * This is pretty close to a straight copy from StopFilterFactory
+ */
+public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
+ ResourceLoaderAware {
+
+ public void inform(ResourceLoader loader) {
+ String commonWordFiles = args.get("words");
+ ignoreCase = getBoolean("ignoreCase", false);
+ enablePositionIncrements = getBoolean("enablePositionIncrements", false);
+
+ if (commonWordFiles != null) {
+ try {
+ List<String> files = StrUtils.splitFileNames(commonWordFiles);
+ if (commonWords == null && files.size() > 0){
+ //default stopwords list has 35 or so words, but maybe don't make it that big to start
+ commonWords = new CharArraySet(files.size() * 10, ignoreCase);
+ }
+ for (String file : files) {
+ List<String> wlist = loader.getLines(file.trim());
+ //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
+ commonWords.addAll(CommonGramsFilter.makeCommonSet((String[])wlist.toArray(new String[0]), ignoreCase));
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ } else {
+ commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
+ }
+ }
+
+ //Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it. See SOLR-1095
+ private CharArraySet commonWords;
+ private boolean ignoreCase;
+ private boolean enablePositionIncrements;
+
+ public boolean isEnablePositionIncrements() {
+ return enablePositionIncrements;
+ }
+
+ public boolean isIgnoreCase() {
+ return ignoreCase;
+ }
+
+ public Set getCommonWords() {
+ return commonWords;
+ }
+
+ public CommonGramsFilter create(TokenStream input) {
+ CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords, ignoreCase);
+ return commonGrams;
+ }
+}
+
+
+
\ No newline at end of file
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java?rev=817859&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java Tue Sep 22 21:55:57 2009
@@ -0,0 +1,138 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.Token;
+
+/**
+ * Wrap a CommonGramsFilter optimizing phrase queries by only returning single
+ * words when they are not a member of a bigram.
+ *
+ * Example:
+ * <ul>
+ * <li>query input to CommonGramsFilter: "the rain in spain falls mainly"
+ * <li>output of CommonGramsFilter/input to CommonGramsQueryFilter:
+ * |"the", "the-rain"|"rain", "rain-in"|"in", "in-spain"|"spain"|"falls"|"mainly"
+ * <li>output of CommonGramsQueryFilter:"the-rain", "rain-in" ,"in-spain",
+ * "falls", "mainly"
+ * </ul>
+ */
+
+/*
+ * TODO: When org.apache.solr.analysis.BufferedTokenStream is changed to use the
+ * 2.9 lucene TokenStream api, make necessary changes here.
+ * See:http://hudson.zones
+ * .apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache
+ * /lucene/analysis/TokenStream.html and
+ * http://svn.apache.org/viewvc/lucene/java
+ * /trunk/src/java/org/apache/lucene/analysis/package.html?revision=718798
+ */
+public class CommonGramsQueryFilter extends BufferedTokenStream {
+ //private CharArraySet commonWords;
+ private Token prev;
+
+ /**
+ * Constructor
+ *
+ * @param input must be a CommonGramsFilter!
+ *
+ */
+
+ public CommonGramsQueryFilter(CommonGramsFilter input) {
+ super(input);
+ prev = new Token();
+ }
+
+ public void reset() throws IOException {
+ super.reset();
+ prev = new Token();
+ }
+
+ /**
+ * Output bigrams whenever possible to optimize queries. Only output unigrams
+ * when they are not a member of a bigram. Example:
+ * <ul>
+ * <li>input: "the rain in spain falls mainly"
+ * <li>output:"the-rain", "rain-in" ,"in-spain", "falls", "mainly"
+ */
+
+ public Token process(Token token) throws IOException {
+   Token next = peek(1);
+   /*
+    * Deal with the last token (next == null when the current token is the
+    * last word). The last token is always a unigram. If the previous token
+    * was a bigram, then the last token was already output as part of that
+    * bigram and should not additionally be output as a unigram. <p> Example:
+    * If the end of the input to the CommonGramsFilter is "...the plain" <ul>
+    * <li>current token = "plain"</li> <li>next token = null</li>
+    * <li>previous token = "the_plain" (bigram)</li> <li>Since the word
+    * "plain" was already output as part of the bigram we don't output
+    * it.</li> </ul> Example: If the end of the input to the CommonGramsFilter
+    * is "falls mainly" <ul> <li>current token = "mainly"</li> <li>next token
+    * = null</li> <li>previous token = "falls" (unigram)</li> <li>Since we
+    * haven't yet output the current token, we output it</li> </ul>
+    */
+
+   // Deal with special case of last token
+   if (next == null) {
+     if (prev == null) {
+       // Defensive only: prev is assigned in the constructor and in reset()
+       return token;
+     }
+     if (!"gram".equals(prev.type())) {
+       // If previous token was a unigram, output the current token
+       return token;
+     } else {
+       // If previous token was a bigram, we already output it and this token
+       // was output as part of the bigram so we are done.
+       return null;
+     }
+   }
+
+   /*
+    * Possible cases are: |token |next 1|word |gram 2|word |word The
+    * CommonGramsFilter we are wrapping always outputs the unigram word prior
+    * to outputting an optional bigram: "the sound of" gets output as |"the",
+    * "the_sound"|"sound", "sound_of" For case 1 we consume the gram from the
+    * input stream and output it rather than the current token. This means the
+    * call to super.next(), which reads a token from input and passes it on to
+    * this process method, always hands over a unigram, never a "gram" token.
+    */
+   if ("gram".equals(next.type())) {
+     // consume "next" token from list and output it
+     token = read();
+     // Remember the emitted bigram in prev. reinit() copies its term, offsets
+     // and type; the original author notes that clone() "requires all these
+     // args but won't take the token.type", see
+     // http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/Token.html
+     prev.reinit(token.termBuffer(), 0, token.termLength(), token
+         .startOffset(), token.endOffset(), token.type());
+     token.setPositionIncrement(1);
+     return token;
+   }
+
+   // next is not a bigram, so output the token itself. It must be a unigram,
+   // though not necessarily of type "word" (tokenizers may set other types).
+   prev.reinit(token.termBuffer(), 0, token.termLength(), token.startOffset(),
+       token.endOffset(), token.type());
+   assert !"gram".equals(token.type());
+   return token;
+ }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java?rev=817859&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java Tue Sep 22 21:55:57 2009
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * Construct CommonGramsQueryFilter
+ *
+ * This is pretty close to a straight copy from StopFilterFactory
+ *
+ */
+public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
+ implements ResourceLoaderAware {
+
+ public void inform(ResourceLoader loader) {
+ String commonWordFiles = args.get("words");
+ ignoreCase = getBoolean("ignoreCase", false);
+ enablePositionIncrements = getBoolean("enablePositionIncrements", false);
+
+ if (commonWordFiles != null) {
+ try {
+ List<String> files = StrUtils.splitFileNames(commonWordFiles);
+ if (commonWords == null && files.size() > 0) {
+ // default stopwords list has 35 or so words, but maybe don't make it
+ // that big to start
+ commonWords = new CharArraySet(files.size() * 10, ignoreCase);
+ }
+ for (String file : files) {
+ List<String> wlist = loader.getLines(file.trim());
+ // TODO: once StopFilter.makeStopSet(List) method is available, switch
+ // to using that so we can avoid a toArray() call
+ commonWords.addAll(CommonGramsFilter.makeCommonSet((String[]) wlist
+ .toArray(new String[0]), ignoreCase));
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ } else {
+ commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet(
+ StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
+ }
+ }
+
+ // Force the use of a char array set, as it is the most performant, although
+ // this may break things if Lucene ever goes away from it. See SOLR-1095
+ private CharArraySet commonWords;
+
+ private boolean ignoreCase;
+
+ private boolean enablePositionIncrements;
+
+ public boolean isEnablePositionIncrements() {
+ return enablePositionIncrements;
+ }
+
+ public boolean isIgnoreCase() {
+ return ignoreCase;
+ }
+
+ public Set getCommonWords() {
+ return commonWords;
+ }
+
+ /**
+ * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
+ */
+ public CommonGramsQueryFilter create(TokenStream input) {
+ CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords,
+ ignoreCase);
+ CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter(
+ commonGrams);
+ return commonGramsQuery;
+ }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java?rev=817859&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java Tue Sep 22 21:55:57 2009
@@ -0,0 +1,69 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.solr.common.ResourceLoader;
+
+import java.util.Set;
+import java.util.Map;
+import java.util.HashMap;
+
+/**
+ * Tests pretty much copied from StopFilterFactoryTest We use the test files
+ * used by the StopFilterFactoryTest TODO: consider creating separate test files
+ * so this won't break if stop filter test files change
+ **/
+public class CommonGramsFilterFactoryTest extends AbstractSolrTestCase {
+ public String getSchemaFile() {
+ return "schema-stop-keep.xml";
+ }
+
+ public String getSolrConfigFile() {
+ return "solrconfig.xml";
+ }
+
+ public void testInform() throws Exception {
+ ResourceLoader loader = solrConfig.getResourceLoader();
+ assertTrue("loader is null and it shouldn't be", loader != null);
+ CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
+ Map<String, String> args = new HashMap<String, String>();
+ args.put("words", "stop-1.txt");
+ args.put("ignoreCase", "true");
+ factory.init(args);
+ factory.inform(loader);
+ Set words = factory.getCommonWords();
+ assertTrue("words is null and it shouldn't be", words != null);
+ assertTrue("words Size: " + words.size() + " is not: " + 2,
+ words.size() == 2);
+ assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
+ .isIgnoreCase() == true);
+
+ factory = new CommonGramsFilterFactory();
+ args.put("words", "stop-1.txt, stop-2.txt");
+ factory.init(args);
+ factory.inform(loader);
+ words = factory.getCommonWords();
+ assertTrue("words is null and it shouldn't be", words != null);
+ assertTrue("words Size: " + words.size() + " is not: " + 4,
+ words.size() == 4);
+ assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
+ .isIgnoreCase() == true);
+
+ }
+}
Propchange: lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java?rev=817859&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java Tue Sep 22 21:55:57 2009
@@ -0,0 +1,265 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.StringTokenizer;
+import java.util.Map.Entry;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.solr.analysis.TestBufferedTokenStream.AB_AAB_Stream;
+
+/**
+ * Tests CommonGramsFilter and CommonGramsQueryFilter
+ */
+public class CommonGramsFilterTest extends TestCase {
+ /** Common words handed to every CommonGramsFilter constructed by the tests in this class. */
+ private static final String[] commonWords = { "s", "a", "b", "c", "d", "the",
+ "of" };
+
+  /**
+   * Verifies that a CommonGramsFilter produces its output from the beginning
+   * again once the underlying tokenizer has been handed a fresh reader and the
+   * filter itself has been reset.
+   */
+  public void testReset() throws Exception {
+    final String text = "How the s a brown s cow d like A B thing?";
+    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader(text));
+    CommonGramsFilter filter = new CommonGramsFilter(tokenizer, commonWords);
+    TermAttribute termAtt = (TermAttribute) filter.addAttribute(TermAttribute.class);
+
+    // Consume the first few tokens so the filter is part-way through the input.
+    for (String expected : new String[] { "How", "How_the", "the", "the_s" }) {
+      assertTrue(filter.incrementToken());
+      assertEquals(expected, termAtt.term());
+    }
+
+    // Rewind: new reader for the tokenizer, then reset the filter on top of it.
+    tokenizer.reset(new StringReader(text));
+    filter.reset();
+    assertTrue(filter.incrementToken());
+    assertEquals("How", termAtt.term());
+  }
+
+  /**
+   * Runs every input from {@link #initQueryMap()} through the "query" filter
+   * chain of {@link #testFilter(String, String)} and compares the rendered
+   * tokens with the expected string.
+   */
+  public void testCommonGramsQueryFilter() throws Exception {
+    for (Map.Entry<String, String> testCase : initQueryMap().entrySet()) {
+      String query = testCase.getKey();
+      assertEquals("message: input value is: " + query, testCase.getValue(),
+          testFilter(query, "query"));
+    }
+  }
+
+  /**
+   * Verifies that a CommonGramsQueryFilter stacked on a CommonGramsFilter
+   * produces its output from the beginning again once the tokenizer has been
+   * handed a fresh reader and the query filter has been reset.
+   */
+  public void testQueryReset() throws Exception {
+    final String text = "How the s a brown s cow d like A B thing?";
+    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader(text));
+    CommonGramsFilter gramsFilter = new CommonGramsFilter(tokenizer, commonWords);
+    CommonGramsQueryFilter queryFilter = new CommonGramsQueryFilter(gramsFilter);
+
+    // The term attribute is requested from the tokenizer; the assertions below
+    // read the query filter's output through that same attribute.
+    TermAttribute termAtt = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
+    for (String expected : new String[] { "How_the", "the_s" }) {
+      assertTrue(queryFilter.incrementToken());
+      assertEquals(expected, termAtt.term());
+    }
+
+    // Rewind: new reader for the tokenizer, then reset the outermost filter.
+    tokenizer.reset(new StringReader(text));
+    queryFilter.reset();
+    assertTrue(queryFilter.incrementToken());
+    assertEquals("How_the", termAtt.term());
+  }
+
+  /**
+   * Runs every input from {@link #initMap()} through the "common" filter chain
+   * of {@link #testFilter(String, String)} and compares the rendered tokens
+   * with the expected string.
+   */
+  public void testCommonGramsFilter() throws Exception {
+    for (Map.Entry<String, String> testCase : initMap().entrySet()) {
+      String text = testCase.getKey();
+      assertEquals("message: input value is: " + text, testCase.getValue(),
+          testFilter(text, "common"));
+    }
+  }
+
+ /**
+ * This is for testing CommonGramsQueryFilter which outputs a set of tokens
+ * optimized for querying with only one token at each position, either a
+ * unigram or a bigram It also will not return a token for the final position
+ * if the final word is already in the preceding bigram Example:(three
+ * tokens/positions in)
+ * "foo bar the"=>"foo:1|bar:2,bar-the:2|the:3=> "foo" "bar-the" (2 tokens
+ * out)
+ *
+ * @return Map<String,String>
+ */
+ private static Map<String, String> initQueryMap() {
+ Map<String, String> input2expected = new LinkedHashMap<String, String>();
+
+ // Stop words used below are "of" "the" and "s"
+
+ // two word queries
+ input2expected.put("brown fox", "/brown/fox");
+ input2expected.put("the fox", "/the_fox");
+ input2expected.put("fox of", "/fox_of");
+ input2expected.put("of the", "/of_the");
+
+ // one word queries
+ input2expected.put("the", "/the");
+ input2expected.put("foo", "/foo");
+
+ // 3 word combinations s=stopword/common word n=not a stop word
+ input2expected.put("n n n", "/n/n/n");
+ input2expected.put("quick brown fox", "/quick/brown/fox");
+
+ input2expected.put("n n s", "/n/n_s");
+ input2expected.put("quick brown the", "/quick/brown_the");
+
+ input2expected.put("n s n", "/n_s/s_n");
+ input2expected.put("quick the brown", "/quick_the/the_brown");
+
+ input2expected.put("n s s", "/n_s/s_s");
+ input2expected.put("fox of the", "/fox_of/of_the");
+
+ input2expected.put("s n n", "/s_n/n/n");
+ input2expected.put("the quick brown", "/the_quick/quick/brown");
+
+ input2expected.put("s n s", "/s_n/n_s");
+ input2expected.put("the fox of", "/the_fox/fox_of");
+
+ input2expected.put("s s n", "/s_s/s_n");
+ input2expected.put("of the fox", "/of_the/the_fox");
+
+ input2expected.put("s s s", "/s_s/s_s");
+ input2expected.put("of the of", "/of_the/the_of");
+
+ return input2expected;
+ }
+
+  /**
+   * Builds the input-to-expected-output table for testing CommonGramsFilter.
+   * <p>
+   * Expected strings are in the format rendered by
+   * {@link #testFilter(String, String)}: one '/' per position increment before
+   * a token, and a ',' before a token whose position increment is 0 (here, a
+   * bigram stacked on the preceding unigram).
+   * <p>
+   * A LinkedHashMap is used, as in {@link #initQueryMap()}, so the cases are
+   * always exercised in the order they are listed and a failure is reported
+   * for the same case on every run.
+   *
+   * @return insertion-ordered map from input text to expected rendered output
+   */
+  private static Map<String, String> initMap() {
+    Map<String, String> input2expected = new LinkedHashMap<String, String>();
+
+    // Of the common words, only "of", "the" and "s" appear below
+    // one word queries
+    input2expected.put("the", "/the");
+    input2expected.put("foo", "/foo");
+
+    // two word queries
+    input2expected.put("brown fox", "/brown/fox");
+    input2expected.put("the fox", "/the,the_fox/fox");
+    input2expected.put("fox of", "/fox,fox_of/of");
+    input2expected.put("of the", "/of,of_the/the");
+
+    // 3 word combinations s=stopword/common word n=not a stop word
+    input2expected.put("n n n", "/n/n/n");
+    input2expected.put("quick brown fox", "/quick/brown/fox");
+
+    input2expected.put("n n s", "/n/n,n_s/s");
+    input2expected.put("quick brown the", "/quick/brown,brown_the/the");
+
+    input2expected.put("n s n", "/n,n_s/s,s_n/n");
+    input2expected.put("quick the fox", "/quick,quick_the/the,the_fox/fox");
+
+    input2expected.put("n s s", "/n,n_s/s,s_s/s");
+    input2expected.put("fox of the", "/fox,fox_of/of,of_the/the");
+
+    input2expected.put("s n n", "/s,s_n/n/n");
+    input2expected.put("the quick brown", "/the,the_quick/quick/brown");
+
+    input2expected.put("s n s", "/s,s_n/n,n_s/s");
+    input2expected.put("the fox of", "/the,the_fox/fox,fox_of/of");
+
+    input2expected.put("s s n", "/s,s_s/s,s_n/n");
+    input2expected.put("of the fox", "/of,of_the/the,the_fox/fox");
+
+    input2expected.put("s s s", "/s,s_s/s,s_s/s");
+    input2expected.put("of the of", "/of,of_the/the,the_of/of");
+
+    return input2expected;
+  }
+
+ /*
+ * Helper methods copied from CDL XTF BigramsStopFilter.java and slightly
+ * modified for use with CommonGrams. http://xtf.wiki.sourceforge.net/
+ */
+ /**
+ * Very simple tokenizer that breaks up a string into a series of Lucene
+ * {@link Token Token}s.
+ */
+ static class StringTokenStream extends TokenStream {
+ private String str;
+
+ private int prevEnd = 0;
+
+ private StringTokenizer tok;
+
+ private int count = 0;
+
+ public StringTokenStream(String str, String delim) {
+ this.str = str;
+ tok = new StringTokenizer(str, delim);
+ }
+
+ public Token next() {
+ if (!tok.hasMoreTokens())
+ return null;
+ count++;
+ String term = tok.nextToken();
+ Token t = new Token(term, str.indexOf(term, prevEnd), str.indexOf(term,
+ prevEnd)
+ + term.length(), "word");
+ prevEnd = t.endOffset();
+ return t;
+ }
+ }
+
+ public static String testFilter(String in, String type) throws IOException {
+ TokenStream nsf;
+ StringTokenStream ts = new StringTokenStream(in, " .");
+ if (type.equals("query")) {
+ CommonGramsFilter cgf = new CommonGramsFilter(ts, commonWords);
+ nsf = new CommonGramsQueryFilter(cgf);
+ } else {
+ nsf = new CommonGramsFilter(ts, commonWords);
+ }
+
+ StringBuffer outBuf = new StringBuffer();
+ while (true) {
+ Token t = nsf.next();
+ if (t == null)
+ break;
+ for (int i = 0; i < t.getPositionIncrement(); i++)
+ outBuf.append('/');
+ if (t.getPositionIncrement() == 0)
+ outBuf.append(',');
+ outBuf.append(t.term());
+ }
+
+ String out = outBuf.toString();
+ out = out.replaceAll(" ", "");
+ return out;
+ }
+}
Propchange: lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java?rev=817859&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java Tue Sep 22 21:55:57 2009
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.solr.common.ResourceLoader;
+
+import java.util.Set;
+import java.util.Map;
+import java.util.HashMap;
+
+/**
+ * Tests pretty much copied from StopFilterFactoryTest. We use the test files
+ * used by the StopFilterFactoryTest. TODO: consider creating separate test files
+ * so this won't break if stop filter test files change.
+ **/
+public class CommonGramsQueryFilterFactoryTest extends AbstractSolrTestCase {
+ public String getSchemaFile() {
+ return "schema-stop-keep.xml";
+ }
+
+ public String getSolrConfigFile() {
+ return "solrconfig.xml";
+ }
+
+ public void testInform() throws Exception {
+ ResourceLoader loader = solrConfig.getResourceLoader();
+ assertTrue("loader is null and it shouldn't be", loader != null);
+ CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
+ Map<String, String> args = new HashMap<String, String>();
+ args.put("words", "stop-1.txt");
+ args.put("ignoreCase", "true");
+ factory.init(args);
+ factory.inform(loader);
+ Set words = factory.getCommonWords();
+ assertTrue("words is null and it shouldn't be", words != null);
+ assertTrue("words Size: " + words.size() + " is not: " + 2,
+ words.size() == 2);
+ assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
+ .isIgnoreCase() == true);
+
+ factory = new CommonGramsQueryFilterFactory();
+ args.put("words", "stop-1.txt, stop-2.txt");
+ factory.init(args);
+ factory.inform(loader);
+ words = factory.getCommonWords();
+ assertTrue("words is null and it shouldn't be", words != null);
+ assertTrue("words Size: " + words.size() + " is not: " + 4,
+ words.size() == 4);
+ assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
+ .isIgnoreCase() == true);
+
+ }
+}
Propchange: lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL