You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2013/05/30 09:53:46 UTC
svn commit: r1487777 [5/50] - in /lucene/dev/branches/security: ./
dev-tools/ dev-tools/eclipse/dot.settings/ dev-tools/idea/.idea/
dev-tools/idea/.idea/libraries/ dev-tools/idea/lucene/replicator/
dev-tools/maven/ dev-tools/maven/lucene/ dev-tools/mav...
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java Thu May 30 07:53:18 2013
@@ -22,10 +22,9 @@ import java.io.InputStream;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.hunspell.HunspellDictionary;
-import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
@@ -34,7 +33,7 @@ import org.apache.lucene.util.IOUtils;
/**
* TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}.
* Example config for British English including a custom dictionary, case insensitive matching:
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <filter class="solr.HunspellStemFilterFactory"
* dictionary="en_GB.dic,my_custom.dic"
* affix="en_GB.aff"
@@ -51,16 +50,29 @@ import org.apache.lucene.util.IOUtils;
* See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
*/
public class HunspellStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
-
private static final String PARAM_DICTIONARY = "dictionary";
private static final String PARAM_AFFIX = "affix";
private static final String PARAM_IGNORE_CASE = "ignoreCase";
private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing";
- private static final String TRUE = "true";
- private static final String FALSE = "false";
-
+
+ private final String dictionaryArg;
+ private final String affixFile;
+ private final boolean ignoreCase;
+ private final boolean strictAffixParsing;
private HunspellDictionary dictionary;
- private boolean ignoreCase = false;
+
+ /** Creates a new HunspellStemFilterFactory */
+ public HunspellStemFilterFactory(Map<String,String> args) {
+ super(args);
+ assureMatchVersion();
+ dictionaryArg = require(args, PARAM_DICTIONARY);
+ affixFile = get(args, PARAM_AFFIX);
+ ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false);
+ strictAffixParsing = getBoolean(args, PARAM_STRICT_AFFIX_PARSING, true);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
/**
* Loads the hunspell dictionary and affix files defined in the configuration
@@ -69,27 +81,7 @@ public class HunspellStemFilterFactory e
*/
@Override
public void inform(ResourceLoader loader) throws IOException {
- assureMatchVersion();
- String dictionaryArg = args.get(PARAM_DICTIONARY);
- if (dictionaryArg == null) {
- throw new IllegalArgumentException("Parameter " + PARAM_DICTIONARY + " is mandatory.");
- }
- String dictionaryFiles[] = args.get(PARAM_DICTIONARY).split(",");
- String affixFile = args.get(PARAM_AFFIX);
- String pic = args.get(PARAM_IGNORE_CASE);
- if(pic != null) {
- if(pic.equalsIgnoreCase(TRUE)) ignoreCase = true;
- else if(pic.equalsIgnoreCase(FALSE)) ignoreCase = false;
- else throw new IllegalArgumentException("Unknown value for " + PARAM_IGNORE_CASE + ": " + pic + ". Must be true or false");
- }
-
- String strictAffixParsingParam = args.get(PARAM_STRICT_AFFIX_PARSING);
- boolean strictAffixParsing = true;
- if(strictAffixParsingParam != null) {
- if(strictAffixParsingParam.equalsIgnoreCase(FALSE)) strictAffixParsing = false;
- else if(strictAffixParsingParam.equalsIgnoreCase(TRUE)) strictAffixParsing = true;
- else throw new IllegalArgumentException("Unknown value for " + PARAM_STRICT_AFFIX_PARSING + ": " + strictAffixParsingParam + ". Must be true or false");
- }
+ String dictionaryFiles[] = dictionaryArg.split(",");
InputStream affix = null;
List<InputStream> dictionaries = new ArrayList<InputStream>();
@@ -103,7 +95,7 @@ public class HunspellStemFilterFactory e
this.dictionary = new HunspellDictionary(affix, dictionaries, luceneMatchVersion, ignoreCase, strictAffixParsing);
} catch (ParseException e) {
- throw new IOException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e);
+ throw new IOException("Unable to load hunspell data! [dictionary=" + dictionaryArg + ",affix=" + affixFile + "]", e);
} finally {
IOUtils.closeWhileHandlingException(affix);
IOUtils.closeWhileHandlingException(dictionaries);
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java Thu May 30 07:53:18 2013
@@ -175,10 +175,7 @@ public class HunspellStemmer {
@SuppressWarnings("unchecked")
public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
if(dictionary.isIgnoreCase()) {
- for(int i=0;i<strippedWord.length;){
- i += Character.toChars(
- Character.toLowerCase(charUtils.codePointAt(strippedWord, i)), strippedWord, i);
- }
+ charUtils.toLowerCase(strippedWord, 0, strippedWord.length);
}
segment.setLength(0);
segment.append(strippedWord, 0, length);
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java Thu May 30 07:53:18 2013
@@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@@ -89,7 +89,7 @@ public final class ArmenianAnalyzer exte
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
- * provided this analyzer will add a {@link KeywordMarkerFilter} before
+ * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@@ -111,7 +111,7 @@ public final class ArmenianAnalyzer exte
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
- * , {@link KeywordMarkerFilter} if a stem exclusion set is
+ * , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SnowballFilter}.
*/
@Override
@@ -122,7 +122,7 @@ public final class ArmenianAnalyzer exte
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
- result = new KeywordMarkerFilter(result, stemExclusionSet);
+ result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new ArmenianStemmer());
return new TokenStreamComponents(source, result);
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java Thu May 30 07:53:18 2013
@@ -22,7 +22,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
@@ -87,7 +87,7 @@ public final class IndonesianAnalyzer ex
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
- * provided this analyzer will add a {@link KeywordMarkerFilter} before
+ * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* {@link IndonesianStemFilter}.
*
* @param matchVersion
@@ -111,7 +111,7 @@ public final class IndonesianAnalyzer ex
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter},
- * {@link StopFilter}, {@link KeywordMarkerFilter}
+ * {@link StopFilter}, {@link SetKeywordMarkerFilter}
* if a stem exclusion set is provided and {@link IndonesianStemFilter}.
*/
@Override
@@ -122,7 +122,7 @@ public final class IndonesianAnalyzer ex
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if (!stemExclusionSet.isEmpty()) {
- result = new KeywordMarkerFilter(result, stemExclusionSet);
+ result = new SetKeywordMarkerFilter(result, stemExclusionSet);
}
return new TokenStreamComponents(source, new IndonesianStemFilter(result));
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilterFactory.java Thu May 30 07:53:18 2013
@@ -25,7 +25,7 @@ import org.apache.lucene.analysis.util.T
/**
* Factory for {@link IndonesianStemFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_idstem" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.StandardTokenizerFactory"/>
@@ -33,15 +33,17 @@ import org.apache.lucene.analysis.util.T
* <filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/>
* </analyzer>
* </fieldType></pre>
- *
*/
public class IndonesianStemFilterFactory extends TokenFilterFactory {
- private boolean stemDerivational = true;
+ private final boolean stemDerivational;
- @Override
- public void init(Map<String, String> args) {
- super.init(args);
- stemDerivational = getBoolean("stemDerivational", true);
+ /** Creates a new IndonesianStemFilterFactory */
+ public IndonesianStemFilterFactory(Map<String,String> args) {
+ super(args);
+ stemDerivational = getBoolean(args, "stemDerivational", true);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
}
@Override
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilterFactory.java Thu May 30 07:53:18 2013
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.in;
* limitations under the License.
*/
+import java.util.Map;
+
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
@@ -25,16 +27,24 @@ import org.apache.lucene.analysis.util.T
/**
* Factory for {@link IndicNormalizationFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_innormal" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.StandardTokenizerFactory"/>
* <filter class="solr.IndicNormalizationFilterFactory"/>
* </analyzer>
* </fieldType></pre>
- *
*/
public class IndicNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+
+ /** Creates a new IndicNormalizationFilterFactory */
+ public IndicNormalizationFilterFactory(Map<String,String> args) {
+ super(args);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
@Override
public TokenStream create(TokenStream input) {
return new IndicNormalizationFilter(input);
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java Thu May 30 07:53:18 2013
@@ -24,7 +24,7 @@ import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@@ -99,7 +100,7 @@ public final class ItalianAnalyzer exten
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
- * provided this analyzer will add a {@link KeywordMarkerFilter} before
+ * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@@ -121,7 +122,7 @@ public final class ItalianAnalyzer exten
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter}, {@link StopFilter}
- * , {@link KeywordMarkerFilter} if a stem exclusion set is
+ * , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link ItalianLightStemFilter}.
*/
@Override
@@ -133,7 +134,7 @@ public final class ItalianAnalyzer exten
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
- result = new KeywordMarkerFilter(result, stemExclusionSet);
+ result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new ItalianLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java Thu May 30 07:53:18 2013
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokena
* words.
* <p>
* To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilterFactory.java Thu May 30 07:53:18 2013
@@ -17,13 +17,15 @@ package org.apache.lucene.analysis.it;
* limitations under the License.
*/
+import java.util.Map;
+
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.it.ItalianLightStemFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link ItalianLightStemFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_itlgtstem" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.StandardTokenizerFactory"/>
@@ -31,9 +33,17 @@ import org.apache.lucene.analysis.util.T
* <filter class="solr.ItalianLightStemFilterFactory"/>
* </analyzer>
* </fieldType></pre>
- *
*/
public class ItalianLightStemFilterFactory extends TokenFilterFactory {
+
+ /** Creates a new ItalianLightStemFilterFactory */
+ public ItalianLightStemFilterFactory(Map<String,String> args) {
+ super(args);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
@Override
public TokenStream create(TokenStream input) {
return new ItalianLightStemFilter(input);
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java Thu May 30 07:53:18 2013
@@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
@@ -89,7 +89,7 @@ public final class LatvianAnalyzer exten
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
- * provided this analyzer will add a {@link KeywordMarkerFilter} before
+ * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@@ -111,7 +111,7 @@ public final class LatvianAnalyzer exten
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
- * , {@link KeywordMarkerFilter} if a stem exclusion set is
+ * , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link LatvianStemFilter}.
*/
@Override
@@ -122,7 +122,7 @@ public final class LatvianAnalyzer exten
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
- result = new KeywordMarkerFilter(result, stemExclusionSet);
+ result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new LatvianStemFilter(result);
return new TokenStreamComponents(source, result);
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java Thu May 30 07:53:18 2013
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokena
* words.
* <p>
* To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilterFactory.java Thu May 30 07:53:18 2013
@@ -17,13 +17,15 @@ package org.apache.lucene.analysis.lv;
* limitations under the License.
*/
+import java.util.Map;
+
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.lv.LatvianStemFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link LatvianStemFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_lvstem" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.StandardTokenizerFactory"/>
@@ -33,6 +35,15 @@ import org.apache.lucene.analysis.util.T
* </fieldType></pre>
*/
public class LatvianStemFilterFactory extends TokenFilterFactory {
+
+ /** Creates a new LatvianStemFilterFactory */
+ public LatvianStemFilterFactory(Map<String,String> args) {
+ super(args);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
@Override
public TokenStream create(TokenStream input) {
return new LatvianStemFilter(input);
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java Thu May 30 07:53:18 2013
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.misce
* limitations under the License.
*/
+import java.util.Map;
+
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
@@ -25,16 +27,24 @@ import org.apache.lucene.analysis.TokenS
/**
* Factory for {@link ASCIIFoldingFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_ascii" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
* <filter class="solr.ASCIIFoldingFilterFactory"/>
* </analyzer>
* </fieldType></pre>
- *
*/
public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+
+ /** Creates a new ASCIIFoldingFilterFactory */
+ public ASCIIFoldingFilterFactory(Map<String,String> args) {
+ super(args);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
@Override
public ASCIIFoldingFilter create(TokenStream input) {
return new ASCIIFoldingFilter(input);
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java Thu May 30 07:53:18 2013
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.misce
*/
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.TokenFilterFactory;
@@ -26,7 +25,7 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Map;
-import java.util.StringTokenizer;
+import java.util.Set;
/**
* Factory for {@link CapitalizationFilter}.
@@ -44,7 +43,7 @@ import java.util.StringTokenizer;
* "maxWordCount" - if the token contains more than maxWordCount words, the capitalization is
* assumed to be correct.<br/>
*
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_cptlztn" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
@@ -54,7 +53,6 @@ import java.util.StringTokenizer;
* </analyzer>
* </fieldType></pre>
*
- *
* @since solr 1.3
*/
public class CapitalizationFilterFactory extends TokenFilterFactory {
@@ -67,69 +65,42 @@ public class CapitalizationFilterFactory
public static final String ONLY_FIRST_WORD = "onlyFirstWord";
public static final String FORCE_FIRST_LETTER = "forceFirstLetter";
- //Map<String,String> keep = new HashMap<String, String>(); // not synchronized because it is only initialized once
CharArraySet keep;
Collection<char[]> okPrefix = Collections.emptyList(); // for Example: McK
- int minWordLength = 0; // don't modify capitalization for words shorter then this
- int maxWordCount = CapitalizationFilter.DEFAULT_MAX_WORD_COUNT;
- int maxTokenLength = CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH;
- boolean onlyFirstWord = true;
- boolean forceFirstLetter = true; // make sure the first letter is capitol even if it is in the keep list
-
- @Override
- public void init(Map<String, String> args) {
- super.init(args);
+ final int minWordLength; // don't modify capitalization for words shorter than this
+ final int maxWordCount;
+ final int maxTokenLength;
+ final boolean onlyFirstWord;
+ final boolean forceFirstLetter; // make sure the first letter is capital even if it is in the keep list
+
+ /** Creates a new CapitalizationFilterFactory */
+ public CapitalizationFilterFactory(Map<String, String> args) {
+ super(args);
assureMatchVersion();
-
- String k = args.get(KEEP);
+ boolean ignoreCase = getBoolean(args, KEEP_IGNORE_CASE, false);
+ Set<String> k = getSet(args, KEEP);
if (k != null) {
- StringTokenizer st = new StringTokenizer(k);
- boolean ignoreCase = false;
- String ignoreStr = args.get(KEEP_IGNORE_CASE);
- if ("true".equalsIgnoreCase(ignoreStr)) {
- ignoreCase = true;
- }
keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase);
- while (st.hasMoreTokens()) {
- k = st.nextToken().trim();
- keep.add(k.toCharArray());
- }
+ keep.addAll(k);
}
- k = args.get(OK_PREFIX);
+ k = getSet(args, OK_PREFIX);
if (k != null) {
okPrefix = new ArrayList<char[]>();
- StringTokenizer st = new StringTokenizer(k);
- while (st.hasMoreTokens()) {
- okPrefix.add(st.nextToken().trim().toCharArray());
+ for (String item : k) {
+ okPrefix.add(item.toCharArray());
}
}
- k = args.get(MIN_WORD_LENGTH);
- if (k != null) {
- minWordLength = Integer.valueOf(k);
- }
-
- k = args.get(MAX_WORD_COUNT);
- if (k != null) {
- maxWordCount = Integer.valueOf(k);
- }
-
- k = args.get(MAX_TOKEN_LENGTH);
- if (k != null) {
- maxTokenLength = Integer.valueOf(k);
- }
-
- k = args.get(ONLY_FIRST_WORD);
- if (k != null) {
- onlyFirstWord = Boolean.valueOf(k);
- }
-
- k = args.get(FORCE_FIRST_LETTER);
- if (k != null) {
- forceFirstLetter = Boolean.valueOf(k);
+ minWordLength = getInt(args, MIN_WORD_LENGTH, 0);
+ maxWordCount = getInt(args, MAX_WORD_COUNT, CapitalizationFilter.DEFAULT_MAX_WORD_COUNT);
+ maxTokenLength = getInt(args, MAX_TOKEN_LENGTH, CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
+ onlyFirstWord = getBoolean(args, ONLY_FIRST_WORD, true);
+ forceFirstLetter = getBoolean(args, FORCE_FIRST_LETTER, true);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilterFactory.java Thu May 30 07:53:18 2013
@@ -17,22 +17,32 @@ package org.apache.lucene.analysis.misce
* limitations under the License.
*/
+import java.util.Map;
+
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link HyphenatedWordsFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_hyphn" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
* <filter class="solr.HyphenatedWordsFilterFactory"/>
* </analyzer>
- * </fieldType></pre>
- *
+ * </fieldType></pre>
*/
public class HyphenatedWordsFilterFactory extends TokenFilterFactory {
+
+ /** Creates a new HyphenatedWordsFilterFactory */
+ public HyphenatedWordsFilterFactory(Map<String,String> args) {
+ super(args);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
@Override
public HyphenatedWordsFilter create(TokenStream input) {
return new HyphenatedWordsFilter(input);
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java Thu May 30 07:53:18 2013
@@ -21,6 +21,7 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
/**
* A TokenFilter that only keeps tokens with text contained in the
@@ -32,10 +33,16 @@ public final class KeepWordFilter extend
private final CharArraySet words;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- /** The words set passed to this constructor will be directly used by this filter
- * and should not be modified, */
- public KeepWordFilter(boolean enablePositionIncrements, TokenStream in, CharArraySet words) {
- super(enablePositionIncrements, in);
+ /**
+ * Create a new {@link KeepWordFilter}.
+ * <p><b>NOTE</b>: The words set passed to this constructor will be directly
+ * used by this filter and should not be modified.
+ * @param version the Lucene match version
+ * @param in the {@link TokenStream} to consume
+ * @param words the words to keep
+ */
+ public KeepWordFilter(Version version, TokenStream in, CharArraySet words) {
+ super(version, in);
this.words = words;
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java Thu May 30 07:53:18 2013
@@ -17,67 +17,48 @@ package org.apache.lucene.analysis.misce
* limitations under the License.
*/
-import org.apache.lucene.analysis.util.*;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
import java.util.Map;
-import java.util.Set;
import java.io.IOException;
/**
* Factory for {@link KeepWordFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_keepword" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- * <filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="false" enablePositionIncrements="false"/>
+ * <filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="false"/>
* </analyzer>
- * </fieldType></pre>
- *
+ * </fieldType></pre>
*/
public class KeepWordFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
-
- @Override
- public void init(Map<String,String> args) {
- super.init(args);
+ private final boolean ignoreCase;
+ private final String wordFiles;
+ private CharArraySet words;
+
+ /** Creates a new KeepWordFilterFactory */
+ public KeepWordFilterFactory(Map<String,String> args) {
+ super(args);
assureMatchVersion();
+ wordFiles = get(args, "words");
+ ignoreCase = getBoolean(args, "ignoreCase", false);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
}
@Override
public void inform(ResourceLoader loader) throws IOException {
- String wordFiles = args.get("words");
- ignoreCase = getBoolean("ignoreCase", false);
- enablePositionIncrements = getBoolean("enablePositionIncrements",false);
-
if (wordFiles != null) {
words = getWordSet(loader, wordFiles, ignoreCase);
}
}
- private CharArraySet words;
- private boolean ignoreCase;
- private boolean enablePositionIncrements;
-
- /**
- * Set the keep word list.
- * NOTE: if ignoreCase==true, the words are expected to be lowercase
- */
- public void setWords(Set<String> words) {
- this.words = new CharArraySet(luceneMatchVersion, words, ignoreCase);
- }
-
- public void setIgnoreCase(boolean ignoreCase) {
- if (words != null && this.ignoreCase != ignoreCase) {
- words = new CharArraySet(luceneMatchVersion, words, ignoreCase);
- }
- this.ignoreCase = ignoreCase;
- }
-
- public boolean isEnablePositionIncrements() {
- return enablePositionIncrements;
- }
-
public boolean isIgnoreCase() {
return ignoreCase;
}
@@ -89,6 +70,11 @@ public class KeepWordFilterFactory exten
@Override
public TokenStream create(TokenStream input) {
// if the set is null, it means it was empty
- return words == null ? input : new KeepWordFilter(enablePositionIncrements, input, words);
+ if (words == null) {
+ return input;
+ } else {
+ final TokenStream filter = new KeepWordFilter(luceneMatchVersion, input, words);
+ return filter;
+ }
}
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.java Thu May 30 07:53:18 2013
@@ -22,41 +22,28 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
/**
- * Marks terms as keywords via the {@link KeywordAttribute}. Each token
- * contained in the provided is marked as a keyword by setting
- * {@link KeywordAttribute#setKeyword(boolean)} to <code>true</code>.
+ * Marks terms as keywords via the {@link KeywordAttribute}.
*
* @see KeywordAttribute
*/
-public final class KeywordMarkerFilter extends TokenFilter {
+public abstract class KeywordMarkerFilter extends TokenFilter {
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final CharArraySet keywordSet;
/**
- * Create a new KeywordMarkerFilter, that marks the current token as a
- * keyword if the tokens term buffer is contained in the given set via the
- * {@link KeywordAttribute}.
- *
- * @param in
- * TokenStream to filter
- * @param keywordSet
- * the keywords set to lookup the current termbuffer
+ * Creates a new {@link KeywordMarkerFilter}
+ * @param in the input stream
*/
- public KeywordMarkerFilter(final TokenStream in, final CharArraySet keywordSet) {
+ protected KeywordMarkerFilter(TokenStream in) {
super(in);
- this.keywordSet = keywordSet;
}
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
- if (keywordSet.contains(termAtt.buffer(), 0, termAtt.length())) {
+ if (isKeyword()) {
keywordAttr.setKeyword(true);
}
return true;
@@ -64,4 +51,7 @@ public final class KeywordMarkerFilter e
return false;
}
}
+
+ protected abstract boolean isKeyword();
+
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java Thu May 30 07:53:18 2013
@@ -18,34 +18,53 @@ package org.apache.lucene.analysis.misce
*/
import java.io.IOException;
+import java.util.Map;
+import java.util.regex.Pattern;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
-import org.apache.lucene.analysis.util.*;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link KeywordMarkerFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- * <filter class="solr.KeywordMarkerFilterFactory" protected="protectedkeyword.txt" ignoreCase="false"/>
+ * <filter class="solr.KeywordMarkerFilterFactory" protected="protectedkeyword.txt" pattern="^.+er$" ignoreCase="false"/>
* </analyzer>
- * </fieldType></pre>
- *
+ * </fieldType></pre>
*/
public class KeywordMarkerFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
public static final String PROTECTED_TOKENS = "protected";
+ public static final String PATTERN = "pattern";
+ private final String wordFiles;
+ private final String stringPattern;
+ private final boolean ignoreCase;
+ private Pattern pattern;
private CharArraySet protectedWords;
- private boolean ignoreCase;
+
+ /** Creates a new KeywordMarkerFilterFactory */
+ public KeywordMarkerFilterFactory(Map<String,String> args) {
+ super(args);
+ wordFiles = get(args, PROTECTED_TOKENS);
+ stringPattern = get(args, PATTERN);
+ ignoreCase = getBoolean(args, "ignoreCase", false);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
@Override
public void inform(ResourceLoader loader) throws IOException {
- String wordFiles = args.get(PROTECTED_TOKENS);
- ignoreCase = getBoolean("ignoreCase", false);
if (wordFiles != null) {
protectedWords = getWordSet(loader, wordFiles, ignoreCase);
}
+ if (stringPattern != null) {
+ pattern = ignoreCase ? Pattern.compile(stringPattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE) : Pattern.compile(stringPattern);
+ }
}
public boolean isIgnoreCase() {
@@ -54,6 +73,12 @@ public class KeywordMarkerFilterFactory
@Override
public TokenStream create(TokenStream input) {
- return protectedWords == null ? input : new KeywordMarkerFilter(input, protectedWords);
+ if (pattern != null) {
+ input = new PatternKeywordMarkerFilter(input, pattern);
+ }
+ if (protectedWords != null) {
+ input = new SetKeywordMarkerFilter(input, protectedWords);
+ }
+ return input;
}
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java Thu May 30 07:53:18 2013
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.misce
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
/**
* Removes words that are too long or too short from the stream.
@@ -35,15 +36,20 @@ public final class LengthFilter extends
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
- * Build a filter that removes words that are too long or too
- * short from the text.
+ * Create a new {@link LengthFilter}. This will filter out tokens whose
+ * {@link CharTermAttribute} is either too short ({@link CharTermAttribute#length()}
+ * < min) or too long ({@link CharTermAttribute#length()} > max).
+ * @param version the Lucene match version
+ * @param in the {@link TokenStream} to consume
+ * @param min the minimum length
+ * @param max the maximum length
*/
- public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
- super(enablePositionIncrements, in);
+ public LengthFilter(Version version, TokenStream in, int min, int max) {
+ super(version, in);
this.min = min;
this.max = max;
}
-
+
@Override
public boolean accept() {
final int len = termAtt.length();
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java Thu May 30 07:53:18 2013
@@ -17,44 +17,40 @@ package org.apache.lucene.analysis.misce
* limitations under the License.
*/
+import java.util.Map;
+
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
-import java.util.Map;
-
/**
* Factory for {@link LengthFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_lngth" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- * <filter class="solr.LengthFilterFactory" min="0" max="1" enablePositionIncrements="false"/>
+ * <filter class="solr.LengthFilterFactory" min="0" max="1" />
* </analyzer>
- * </fieldType></pre>
- *
+ * </fieldType></pre>
*/
public class LengthFilterFactory extends TokenFilterFactory {
- int min,max;
- boolean enablePositionIncrements;
+ final int min;
+ final int max;
public static final String MIN_KEY = "min";
public static final String MAX_KEY = "max";
- @Override
- public void init(Map<String, String> args) {
- super.init(args);
- String minKey = args.get(MIN_KEY);
- String maxKey = args.get(MAX_KEY);
- if (minKey == null || maxKey == null) {
- throw new IllegalArgumentException("Both " + MIN_KEY + " and " + MAX_KEY + " are mandatory");
+ /** Creates a new LengthFilterFactory */
+ public LengthFilterFactory(Map<String, String> args) {
+ super(args);
+ min = requireInt(args, MIN_KEY);
+ max = requireInt(args, MAX_KEY);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
}
- min=Integer.parseInt(minKey);
- max=Integer.parseInt(maxKey);
- enablePositionIncrements = getBoolean("enablePositionIncrements",false);
}
@Override
public LengthFilter create(TokenStream input) {
- return new LengthFilter(enablePositionIncrements, input,min,max);
+ final LengthFilter filter = new LengthFilter(luceneMatchVersion, input,min,max);
+ return filter;
}
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilterFactory.java Thu May 30 07:53:18 2013
@@ -20,33 +20,36 @@ package org.apache.lucene.analysis.misce
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link LimitTokenCountFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_lngthcnt" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
* <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="10" consumeAllTokens="false" />
* </analyzer>
- * </fieldType></pre>
+ * </fieldType></pre>
* <p>
- * The {@code consumeAllTokens} property is optional and defaults to {@code false}. See {@link LimitTokenCountFilter} for an explanation of it's use.
+ * The {@code consumeAllTokens} property is optional and defaults to {@code false}.
+ * See {@link LimitTokenCountFilter} for an explanation of its use.
*/
public class LimitTokenCountFilterFactory extends TokenFilterFactory {
public static final String MAX_TOKEN_COUNT_KEY = "maxTokenCount";
public static final String CONSUME_ALL_TOKENS_KEY = "consumeAllTokens";
- int maxTokenCount;
- boolean consumeAllTokens;
+ final int maxTokenCount;
+ final boolean consumeAllTokens;
- @Override
- public void init(Map<String, String> args) {
- super.init( args );
- maxTokenCount = getInt(MAX_TOKEN_COUNT_KEY);
- consumeAllTokens = getBoolean(CONSUME_ALL_TOKENS_KEY, false);
+ /** Creates a new LimitTokenCountFilterFactory */
+ public LimitTokenCountFilterFactory(Map<String, String> args) {
+ super(args);
+ maxTokenCount = requireInt(args, MAX_TOKEN_COUNT_KEY);
+ consumeAllTokens = getBoolean(args, CONSUME_ALL_TOKENS_KEY, false);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
}
@Override
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilterFactory.java Thu May 30 07:53:18 2013
@@ -17,22 +17,32 @@ package org.apache.lucene.analysis.misce
* limitations under the License.
*/
+import java.util.Map;
+
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link RemoveDuplicatesTokenFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_rmdup" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
* <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
* </analyzer>
* </fieldType></pre>
- *
*/
public class RemoveDuplicatesTokenFilterFactory extends TokenFilterFactory {
+
+ /** Creates a new RemoveDuplicatesTokenFilterFactory */
+ public RemoveDuplicatesTokenFilterFactory(Map<String,String> args) {
+ super(args);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
@Override
public RemoveDuplicatesTokenFilter create(TokenStream input) {
return new RemoveDuplicatesTokenFilter(input);
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java Thu May 30 07:53:18 2013
@@ -18,23 +18,34 @@ package org.apache.lucene.analysis.misce
*/
import java.io.IOException;
+import java.util.ArrayList;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArrayMap;
-import org.apache.lucene.util.Version;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FST.Arc;
+import org.apache.lucene.util.fst.FST.BytesReader;
/**
* Provides the ability to override any {@link KeywordAttribute} aware stemmer
* with custom dictionary-based stemming.
*/
public final class StemmerOverrideFilter extends TokenFilter {
- private final CharArrayMap<String> dictionary;
+ private final StemmerOverrideMap stemmerOverrideMap;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+ private final BytesReader fstReader;
+ private final Arc<BytesRef> scratchArc = new FST.Arc<BytesRef>();
+ private final CharsRef spare = new CharsRef();
/**
* Create a new StemmerOverrideFilter, performing dictionary-based stemming
@@ -44,19 +55,28 @@ public final class StemmerOverrideFilter
* so that they will not be stemmed with stemmers down the chain.
* </p>
*/
- public StemmerOverrideFilter(Version matchVersion, TokenStream input,
- CharArrayMap<String> dictionary) {
+ public StemmerOverrideFilter(final TokenStream input, final StemmerOverrideMap stemmerOverrideMap) {
super(input);
- this.dictionary = CharArrayMap.copy(matchVersion, dictionary);
+ this.stemmerOverrideMap = stemmerOverrideMap;
+ fstReader = stemmerOverrideMap.getBytesReader();
}
-
+
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
+ if (fstReader == null) {
+ // No overrides
+ return true;
+ }
if (!keywordAtt.isKeyword()) { // don't muck with already-keyworded terms
- String stem = dictionary.get(termAtt.buffer(), 0, termAtt.length());
+ final BytesRef stem = stemmerOverrideMap.get(termAtt.buffer(), termAtt.length(), scratchArc, fstReader);
if (stem != null) {
- termAtt.setEmpty().append(stem);
+ final char[] buffer = spare.chars = termAtt.buffer();
+ UnicodeUtil.UTF8toUTF16(stem.bytes, stem.offset, stem.length, spare);
+ if (spare.chars != buffer) {
+ termAtt.copyBuffer(spare.chars, spare.offset, spare.length);
+ }
+ termAtt.setLength(spare.length);
keywordAtt.setKeyword(true);
}
}
@@ -65,4 +85,134 @@ public final class StemmerOverrideFilter
return false;
}
}
+
+ /**
+ * A read-only 4-byte FST backed map that allows fast case-insensitive key
+ * value lookups for {@link StemmerOverrideFilter}
+ */
+ // TODO maybe we can generalize this and reuse this map somehow?
+ public final static class StemmerOverrideMap {
+ private final FST<BytesRef> fst;
+ private final boolean ignoreCase;
+
+ /**
+ * Creates a new {@link StemmerOverrideMap}
+ * @param fst the fst to lookup the overrides
+ * @param ignoreCase if the key's case should be ignored
+ */
+ StemmerOverrideMap(FST<BytesRef> fst, boolean ignoreCase) {
+ this.fst = fst;
+ this.ignoreCase = ignoreCase;
+ }
+
+ /**
+ * Returns a {@link BytesReader} to pass to the {@link #get(char[], int, Arc, BytesReader)} method.
+ */
+ BytesReader getBytesReader() {
+ if (fst == null) {
+ return null;
+ } else {
+ return fst.getBytesReader();
+ }
+ }
+
+ /**
+ * Returns the value mapped to the given key or <code>null</code> if the key is not in the FST dictionary.
+ */
+ BytesRef get(char[] buffer, int bufferLen, Arc<BytesRef> scratchArc, BytesReader fstReader) throws IOException {
+ BytesRef pendingOutput = fst.outputs.getNoOutput();
+ BytesRef matchOutput = null;
+ int bufUpto = 0;
+ fst.getFirstArc(scratchArc);
+ while (bufUpto < bufferLen) {
+ final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
+ if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
+ return null;
+ }
+ pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
+ bufUpto += Character.charCount(codePoint);
+ }
+ if (scratchArc.isFinal()) {
+ matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
+ }
+ return matchOutput;
+ }
+
+ }
+ /**
+ * This builder builds an {@link FST} for the {@link StemmerOverrideFilter}
+ */
+ public static class Builder {
+ private final BytesRefHash hash = new BytesRefHash();
+ private final BytesRef spare = new BytesRef();
+ private final ArrayList<CharSequence> outputValues = new ArrayList<CharSequence>();
+ private final boolean ignoreCase;
+ private final CharsRef charsSpare = new CharsRef();
+
+ /**
+ * Creates a new {@link Builder} with ignoreCase set to <code>false</code>
+ */
+ public Builder() {
+ this(false);
+ }
+
+ /**
+ * Creates a new {@link Builder}
+ * @param ignoreCase if the input case should be ignored.
+ */
+ public Builder(boolean ignoreCase) {
+ this.ignoreCase = ignoreCase;
+ }
+
+ /**
+ * Adds an input string and it's stemmer override output to this builder.
+ *
+ * @param input the input char sequence
+ * @param output the stemmer override output char sequence
+ * @return <code>false</code> iff the input has already been added to this builder otherwise <code>true</code>.
+ */
+ public boolean add(CharSequence input, CharSequence output) {
+ final int length = input.length();
+ if (ignoreCase) {
+ // convert on the fly to lowercase
+ charsSpare.grow(length);
+ final char[] buffer = charsSpare.chars;
+ for (int i = 0; i < length; ) {
+ i += Character.toChars(
+ Character.toLowerCase(
+ Character.codePointAt(input, i)), buffer, i);
+ }
+ UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
+ } else {
+ UnicodeUtil.UTF16toUTF8(input, 0, length, spare);
+ }
+ if (hash.add(spare) >= 0) {
+ outputValues.add(output);
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Returns a {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
+ * @return a {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
+ * @throws IOException if an {@link IOException} occurs
+ */
+ public StemmerOverrideMap build() throws IOException {
+ ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+ org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<BytesRef>(
+ FST.INPUT_TYPE.BYTE4, outputs);
+ final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+ IntsRef intsSpare = new IntsRef();
+ final int size = hash.size();
+ for (int i = 0; i < size; i++) {
+ int id = sort[i];
+ BytesRef bytesRef = hash.get(id, spare);
+ UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare);
+ builder.add(intsSpare, new BytesRef(outputValues.get(id)));
+ }
+ return new StemmerOverrideMap(builder.finish(), ignoreCase);
+ }
+
+ }
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java Thu May 30 07:53:18 2013
@@ -19,43 +19,54 @@ package org.apache.lucene.analysis.misce
import java.io.IOException;
import java.util.List;
+import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
-import org.apache.lucene.analysis.util.*;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link StemmerOverrideFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_dicstem" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
* <filter class="solr.StemmerOverrideFilterFactory" dictionary="dictionary.txt" ignoreCase="false"/>
* </analyzer>
* </fieldType></pre>
- *
*/
public class StemmerOverrideFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
- private CharArrayMap<String> dictionary = null;
- private boolean ignoreCase;
+ private StemmerOverrideMap dictionary;
+ private final String dictionaryFiles;
+ private final boolean ignoreCase;
+
+ /** Creates a new StemmerOverrideFilterFactory */
+ public StemmerOverrideFilterFactory(Map<String,String> args) {
+ super(args);
+ dictionaryFiles = get(args, "dictionary");
+ ignoreCase = getBoolean(args, "ignoreCase", false);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
@Override
public void inform(ResourceLoader loader) throws IOException {
- String dictionaryFiles = args.get("dictionary");
- ignoreCase = getBoolean("ignoreCase", false);
if (dictionaryFiles != null) {
assureMatchVersion();
List<String> files = splitFileNames(dictionaryFiles);
if (files.size() > 0) {
- dictionary = new CharArrayMap<String>(luceneMatchVersion,
- files.size() * 10, ignoreCase);
+ StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(ignoreCase);
for (String file : files) {
List<String> list = getLines(loader, file.trim());
for (String line : list) {
String[] mapping = line.split("\t", 2);
- dictionary.put(mapping[0], mapping[1]);
+ builder.add(mapping[0], mapping[1]);
}
}
+ dictionary = builder.build();
}
}
}
@@ -66,6 +77,6 @@ public class StemmerOverrideFilterFactor
@Override
public TokenStream create(TokenStream input) {
- return dictionary == null ? input : new StemmerOverrideFilter(luceneMatchVersion, input, dictionary);
+ return dictionary == null ? input : new StemmerOverrideFilter(input, dictionary);
}
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java Thu May 30 07:53:18 2013
@@ -21,22 +21,26 @@ import org.apache.lucene.analysis.TokenF
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.util.Version;
import java.io.IOException;
/**
* Trims leading and trailing whitespace from Tokens in the stream.
+ * <p>As of Lucene 4.4, this filter does not support updateOffsets=true anymore
+ * as it can lead to broken token streams.
*/
public final class TrimFilter extends TokenFilter {
- final boolean updateOffsets;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-
- public TrimFilter(TokenStream in, boolean updateOffsets) {
+ /**
+ * Create a new {@link TrimFilter}.
+ * @param version the Lucene match version
+ * @param in the stream to consume
+ */
+ public TrimFilter(Version version, TokenStream in) {
super(in);
- this.updateOffsets = updateOffsets;
}
@Override
@@ -52,15 +56,12 @@ public final class TrimFilter extends To
}
int start = 0;
int end = 0;
- int endOff = 0;
// eat the first characters
- //QUESTION: Should we use Character.isWhitespace() instead?
- for (start = 0; start < len && termBuffer[start] <= ' '; start++) {
+ for (start = 0; start < len && Character.isWhitespace(termBuffer[start]); start++) {
}
// eat the end characters
- for (end = len; end >= start && termBuffer[end - 1] <= ' '; end--) {
- endOff++;
+ for (end = len; end >= start && Character.isWhitespace(termBuffer[end - 1]); end--) {
}
if (start > 0 || end < len) {
if (start < end) {
@@ -68,11 +69,6 @@ public final class TrimFilter extends To
} else {
termAtt.setEmpty();
}
- if (updateOffsets && len == offsetAtt.endOffset() - offsetAtt.startOffset()) {
- int newStart = offsetAtt.startOffset()+start;
- int newEnd = offsetAtt.endOffset() - (start<end ? endOff:0);
- offsetAtt.setOffset(newStart, newEnd);
- }
}
return true;
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java Thu May 30 07:53:18 2013
@@ -25,11 +25,11 @@ import org.apache.lucene.analysis.util.T
/**
* Factory for {@link TrimFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_trm" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.NGramTokenizerFactory"/>
- * <filter class="solr.TrimFilterFactory" updateOffsets="false"/>
+ * <filter class="solr.TrimFilterFactory" />
* </analyzer>
* </fieldType></pre>
*
@@ -37,20 +37,17 @@ import org.apache.lucene.analysis.util.T
*/
public class TrimFilterFactory extends TokenFilterFactory {
- protected boolean updateOffsets = false;
-
- @Override
- public void init(Map<String,String> args) {
- super.init( args );
-
- String v = args.get( "updateOffsets" );
- if (v != null) {
- updateOffsets = Boolean.valueOf( v );
+ /** Creates a new TrimFilterFactory */
+ public TrimFilterFactory(Map<String,String> args) {
+ super(args);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public TrimFilter create(TokenStream input) {
- return new TrimFilter(input, updateOffsets);
+ final TrimFilter filter = new TrimFilter(luceneMatchVersion, input);
+ return filter;
}
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java Thu May 30 07:53:18 2013
@@ -18,9 +18,10 @@ package org.apache.lucene.analysis.misce
*/
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
-import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
-import org.apache.lucene.analysis.util.*;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
import java.util.ArrayList;
import java.util.List;
@@ -33,10 +34,9 @@ import java.io.IOException;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
-
/**
* Factory for {@link WordDelimiterFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_wd" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
@@ -46,65 +46,71 @@ import static org.apache.lucene.analysis
* generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1"
* types="wdfftypes.txt" />
* </analyzer>
- * </fieldType></pre>
- *
+ * </fieldType></pre>
*/
public class WordDelimiterFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
public static final String PROTECTED_TOKENS = "protected";
public static final String TYPES = "types";
-
- @Override
- public void inform(ResourceLoader loader) throws IOException {
- String wordFiles = args.get(PROTECTED_TOKENS);
- if (wordFiles != null) {
- protectedWords = getWordSet(loader, wordFiles, false);
- }
- String types = args.get(TYPES);
- if (types != null) {
- List<String> files = splitFileNames( types );
- List<String> wlist = new ArrayList<String>();
- for( String file : files ){
- List<String> lines = getLines(loader, file.trim());
- wlist.addAll( lines );
- }
- typeTable = parseTypes(wlist);
- }
- }
- private CharArraySet protectedWords = null;
- private int flags;
+ private final String wordFiles;
+ private final String types;
+ private final int flags;
byte[] typeTable = null;
-
- @Override
- public void init(Map<String, String> args) {
- super.init(args);
- if (getInt("generateWordParts", 1) != 0) {
+ private CharArraySet protectedWords = null;
+
+ /** Creates a new WordDelimiterFilterFactory */
+ public WordDelimiterFilterFactory(Map<String, String> args) {
+ super(args);
+ int flags = 0;
+ if (getInt(args, "generateWordParts", 1) != 0) {
flags |= GENERATE_WORD_PARTS;
}
- if (getInt("generateNumberParts", 1) != 0) {
+ if (getInt(args, "generateNumberParts", 1) != 0) {
flags |= GENERATE_NUMBER_PARTS;
}
- if (getInt("catenateWords", 0) != 0) {
+ if (getInt(args, "catenateWords", 0) != 0) {
flags |= CATENATE_WORDS;
}
- if (getInt("catenateNumbers", 0) != 0) {
+ if (getInt(args, "catenateNumbers", 0) != 0) {
flags |= CATENATE_NUMBERS;
}
- if (getInt("catenateAll", 0) != 0) {
+ if (getInt(args, "catenateAll", 0) != 0) {
flags |= CATENATE_ALL;
}
- if (getInt("splitOnCaseChange", 1) != 0) {
+ if (getInt(args, "splitOnCaseChange", 1) != 0) {
flags |= SPLIT_ON_CASE_CHANGE;
}
- if (getInt("splitOnNumerics", 1) != 0) {
+ if (getInt(args, "splitOnNumerics", 1) != 0) {
flags |= SPLIT_ON_NUMERICS;
}
- if (getInt("preserveOriginal", 0) != 0) {
+ if (getInt(args, "preserveOriginal", 0) != 0) {
flags |= PRESERVE_ORIGINAL;
}
- if (getInt("stemEnglishPossessive", 1) != 0) {
+ if (getInt(args, "stemEnglishPossessive", 1) != 0) {
flags |= STEM_ENGLISH_POSSESSIVE;
}
+ wordFiles = get(args, PROTECTED_TOKENS);
+ types = get(args, TYPES);
+ this.flags = flags;
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ @Override
+ public void inform(ResourceLoader loader) throws IOException {
+ if (wordFiles != null) {
+ protectedWords = getWordSet(loader, wordFiles, false);
+ }
+ if (types != null) {
+ List<String> files = splitFileNames( types );
+ List<String> wlist = new ArrayList<String>();
+ for( String file : files ){
+ List<String> lines = getLines(loader, file.trim());
+ wlist.addAll( lines );
+ }
+ typeTable = parseTypes(wlist);
+ }
}
@Override
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java Thu May 30 07:53:18 2013
@@ -19,46 +19,34 @@ package org.apache.lucene.analysis.ngram
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Creates new instances of {@link EdgeNGramTokenFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- * <filter class="solr.EdgeNGramFilterFactory" side="front" minGramSize="1" maxGramSize="1"/>
+ * <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/>
* </analyzer>
* </fieldType></pre>
- *
*/
public class EdgeNGramFilterFactory extends TokenFilterFactory {
- private int maxGramSize = 0;
+ private final int maxGramSize;
+ private final int minGramSize;
- private int minGramSize = 0;
-
- private String side;
-
- @Override
- public void init(Map<String, String> args) {
- super.init(args);
- String maxArg = args.get("maxGramSize");
- maxGramSize = (maxArg != null ? Integer.parseInt(maxArg)
- : EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
-
- String minArg = args.get("minGramSize");
- minGramSize = (minArg != null ? Integer.parseInt(minArg)
- : EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
-
- side = args.get("side");
- if (side == null) {
- side = EdgeNGramTokenFilter.Side.FRONT.getLabel();
+ /** Creates a new EdgeNGramFilterFactory */
+ public EdgeNGramFilterFactory(Map<String, String> args) {
+ super(args);
+ minGramSize = getInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
+ maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public EdgeNGramTokenFilter create(TokenStream input) {
- return new EdgeNGramTokenFilter(input, side, minGramSize, maxGramSize);
+ return new EdgeNGramTokenFilter(luceneMatchVersion, input, minGramSize, maxGramSize);
}
}