You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cm...@apache.org on 2013/08/11 14:19:39 UTC
svn commit: r1512909 [3/38] - in /lucene/dev/branches/lucene4956: ./
dev-tools/ dev-tools/eclipse/ dev-tools/idea/.idea/libraries/
dev-tools/idea/lucene/suggest/ dev-tools/idea/solr/contrib/dataimporthandler/
dev-tools/idea/solr/core/src/test/ dev-tool...
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java Sun Aug 11 12:19:13 2013
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.compo
import java.io.IOException;
import java.util.LinkedList;
-import java.util.Set;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -41,6 +40,7 @@ import org.apache.lucene.util.Version;
* <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
* supplementary characters in strings and char arrays provided as compound word
* dictionaries.
+ * <li>As of 4.4, {@link CompoundWordTokenFilterBase} doesn't update offsets.
* </ul>
*/
public abstract class CompoundWordTokenFilterBase extends TokenFilter {
@@ -58,7 +58,8 @@ public abstract class CompoundWordTokenF
* The default for maximal length of subwords that get propagated to the output of this filter
*/
public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
-
+
+ protected final Version matchVersion;
protected final CharArraySet dictionary;
protected final LinkedList<CompoundToken> tokens;
protected final int minWordSize;
@@ -82,7 +83,7 @@ public abstract class CompoundWordTokenF
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(input);
-
+ this.matchVersion = matchVersion;
this.tokens=new LinkedList<CompoundToken>();
if (minWordSize < 0) {
throw new IllegalArgumentException("minWordSize cannot be negative");
@@ -156,7 +157,8 @@ public abstract class CompoundWordTokenF
int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
- if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
+ if (matchVersion.onOrAfter(Version.LUCENE_44) ||
+ endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
// if length by start + end offsets doesn't match the term text then assume
// this is a synonym and don't adjust the offsets.
this.startOffset = startOff;
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java Sun Aug 11 12:19:13 2013
@@ -57,7 +57,7 @@ public final class GreekLowerCaseFilter
int chLen = termAtt.length();
for (int i = 0; i < chLen;) {
i += Character.toChars(
- lowerCase(charUtils.codePointAt(chArray, i)), chArray, i);
+ lowerCase(charUtils.codePointAt(chArray, i, chLen)), chArray, i);
}
return true;
} else {
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java Sun Aug 11 12:19:13 2013
@@ -378,17 +378,14 @@ public class HunspellDictionary {
wordForm = new HunspellWord(flagParsingStrategy.parseFlags(flagPart));
Arrays.sort(wordForm.getFlags());
entry = line.substring(0, flagSep);
- if(ignoreCase) {
- entry = entry.toLowerCase(Locale.ROOT);
- }
}
-
- List<HunspellWord> entries = words.get(entry);
- if (entries == null) {
- entries = new ArrayList<HunspellWord>();
- words.put(entry, entries);
+ if(ignoreCase) {
+ entry = entry.toLowerCase(Locale.ROOT);
}
+
+ List<HunspellWord> entries = new ArrayList<HunspellWord>();
entries.add(wordForm);
+ words.put(entry, entries);
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java Sun Aug 11 12:19:13 2013
@@ -55,17 +55,31 @@ public final class HunspellStemFilter ex
private final boolean dedup;
+ /** Create a {@link HunspellStemFilter} which deduplicates stems and has a maximum
+ * recursion level of 2.
+ * @see #HunspellStemFilter(TokenStream, HunspellDictionary, int) */
+ public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) {
+ this(input, dictionary, 2);
+ }
+
/**
* Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
* HunspellDictionary
*
* @param input TokenStream whose tokens will be stemmed
* @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
+   * @param recursionCap maximum level of recursion the stemmer can go into, defaults to <code>2</code>
*/
- public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) {
- this(input, dictionary, true);
+ public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, int recursionCap) {
+ this(input, dictionary, true, recursionCap);
}
-
+
+ /** Create a {@link HunspellStemFilter} which has a maximum recursion level of 2.
+ * @see #HunspellStemFilter(TokenStream, HunspellDictionary, boolean, int) */
+ public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) {
+ this(input, dictionary, dedup, 2);
+ }
+
/**
* Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
* HunspellDictionary
@@ -73,11 +87,12 @@ public final class HunspellStemFilter ex
* @param input TokenStream whose tokens will be stemmed
* @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
* @param dedup true if only unique terms should be output.
+   * @param recursionCap maximum level of recursion the stemmer can go into, defaults to <code>2</code>
*/
- public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) {
+ public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup, int recursionCap) {
super(input);
this.dedup = dedup;
- this.stemmer = new HunspellStemmer(dictionary);
+ this.stemmer = new HunspellStemmer(dictionary, recursionCap);
}
/**
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java Sun Aug 11 12:19:13 2013
@@ -54,12 +54,14 @@ public class HunspellStemFilterFactory e
private static final String PARAM_AFFIX = "affix";
private static final String PARAM_IGNORE_CASE = "ignoreCase";
private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing";
+ private static final String PARAM_RECURSION_CAP = "recursionCap";
private final String dictionaryArg;
private final String affixFile;
private final boolean ignoreCase;
private final boolean strictAffixParsing;
private HunspellDictionary dictionary;
+ private int recursionCap;
/** Creates a new HunspellStemFilterFactory */
public HunspellStemFilterFactory(Map<String,String> args) {
@@ -69,6 +71,7 @@ public class HunspellStemFilterFactory e
affixFile = get(args, PARAM_AFFIX);
ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false);
strictAffixParsing = getBoolean(args, PARAM_STRICT_AFFIX_PARSING, true);
+ recursionCap = getInt(args, PARAM_RECURSION_CAP, 2);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -111,6 +114,6 @@ public class HunspellStemFilterFactory e
*/
@Override
public TokenStream create(TokenStream tokenStream) {
- return new HunspellStemFilter(tokenStream, dictionary);
+ return new HunspellStemFilter(tokenStream, dictionary, recursionCap);
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java Sun Aug 11 12:19:13 2013
@@ -17,16 +17,10 @@ package org.apache.lucene.analysis.hunsp
* limitations under the License.
*/
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
-import java.util.Scanner;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.CharacterUtils;
@@ -37,23 +31,33 @@ import org.apache.lucene.util.Version;
* conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping.
*/
public class HunspellStemmer {
-
- private static final int RECURSION_CAP = 2;
-
+ private final int recursionCap;
private final HunspellDictionary dictionary;
private final StringBuilder segment = new StringBuilder();
private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_40);
/**
- * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
+ * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems. Uses the
+ * default recursion cap of <code>2</code> (based on Hunspell documentation).
*
* @param dictionary HunspellDictionary that will be used to create the stems
*/
public HunspellStemmer(HunspellDictionary dictionary) {
- this.dictionary = dictionary;
+ this(dictionary, 2);
}
/**
+ * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
+ *
+ * @param dictionary HunspellDictionary that will be used to create the stems
+   * @param recursionCap maximum level of recursion the stemmer can go into
+ */
+ public HunspellStemmer(HunspellDictionary dictionary, int recursionCap) {
+ this.dictionary = dictionary;
+ this.recursionCap = recursionCap;
+ }
+
+ /**
* Find the stem(s) of the provided word
*
* @param word Word to find the stems for
@@ -194,7 +198,7 @@ public class HunspellStemmer {
}
}
- if (affix.isCrossProduct() && recursionDepth < RECURSION_CAP) {
+ if (affix.isCrossProduct() && recursionDepth < recursionCap) {
stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth));
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.java Sun Aug 11 12:19:13 2013
@@ -84,11 +84,6 @@ public final class PerFieldAnalyzerWrapp
}
@Override
- protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
- return components;
- }
-
- @Override
public String toString() {
return "PerFieldAnalyzerWrapper(" + fieldAnalyzers + ", default=" + defaultAnalyzer + ")";
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java Sun Aug 11 12:19:13 2013
@@ -25,21 +25,26 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
* Tokenizes the given token into n-grams of given size(s).
* <p>
 * This {@link TokenFilter} creates n-grams from the beginning edge of an input token.
+ * <p><a name="match_version" />As of Lucene 4.4, this filter correctly handles
+ * supplementary characters.
*/
public final class EdgeNGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
+ private final CharacterUtils charUtils;
private final int minGram;
private final int maxGram;
private char[] curTermBuffer;
private int curTermLength;
+ private int curCodePointCount;
private int curGramSize;
private int tokStart;
private int tokEnd; // only used if the length changed before this filter
@@ -74,6 +79,9 @@ public final class EdgeNGramTokenFilter
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}
+ this.charUtils = version.onOrAfter(Version.LUCENE_44)
+ ? CharacterUtils.getInstance(version)
+ : CharacterUtils.getJava4Instance();
this.minGram = minGram;
this.maxGram = maxGram;
}
@@ -87,6 +95,7 @@ public final class EdgeNGramTokenFilter
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
+ curCodePointCount = charUtils.codePointCount(termAtt);
curGramSize = minGram;
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
@@ -95,7 +104,7 @@ public final class EdgeNGramTokenFilter
}
}
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
- if (curGramSize <= curTermLength) { // if the remaining input is too short, we can't generate any n-grams
+ if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
// grab gramSize chars from front or back
clearAttributes();
offsetAtt.setOffset(tokStart, tokEnd);
@@ -107,7 +116,8 @@ public final class EdgeNGramTokenFilter
posIncrAtt.setPositionIncrement(0);
}
posLenAtt.setPositionLength(savePosLen);
- termAtt.copyBuffer(curTermBuffer, 0, curGramSize);
+ final int charLength = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
+ termAtt.copyBuffer(curTermBuffer, 0, charLength);
curGramSize++;
return true;
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java Sun Aug 11 12:19:13 2013
@@ -17,37 +17,23 @@ package org.apache.lucene.analysis.ngram
* limitations under the License.
*/
-import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Version;
/**
* Tokenizes the input from an edge into n-grams of given size(s).
* <p>
 * This {@link Tokenizer} creates n-grams from the beginning edge of an input token.
+ * <p><a name="match_version" />As of Lucene 4.4, this class supports
+ * {@link #isTokenChar(int) pre-tokenization} and correctly handles
+ * supplementary characters.
*/
-public final class EdgeNGramTokenizer extends Tokenizer {
+public class EdgeNGramTokenizer extends NGramTokenizer {
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-
- private int minGram;
- private int maxGram;
- private int gramSize;
- private boolean started;
- private int inLen; // length of the input AFTER trim()
- private int charsRead; // length of the input
- private String inStr;
-
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
@@ -57,8 +43,7 @@ public final class EdgeNGramTokenizer ex
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
- super(input);
- init(version, minGram, maxGram);
+ super(version, input, minGram, maxGram, true);
}
/**
@@ -71,102 +56,7 @@ public final class EdgeNGramTokenizer ex
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
- super(factory, input);
- init(version, minGram, maxGram);
- }
-
- private void init(Version version, int minGram, int maxGram) {
- if (version == null) {
- throw new IllegalArgumentException("version must not be null");
- }
-
- if (minGram < 1) {
- throw new IllegalArgumentException("minGram must be greater than zero");
- }
-
- if (minGram > maxGram) {
- throw new IllegalArgumentException("minGram must not be greater than maxGram");
- }
-
- this.minGram = minGram;
- this.maxGram = maxGram;
+ super(version, factory, input, minGram, maxGram, true);
}
- /** Returns the next token in the stream, or null at EOS. */
- @Override
- public boolean incrementToken() throws IOException {
- clearAttributes();
- // if we are just starting, read the whole input
- if (!started) {
- started = true;
- gramSize = minGram;
- char[] chars = new char[Math.min(1024, maxGram)];
- charsRead = 0;
- // TODO: refactor to a shared readFully somewhere:
- boolean exhausted = false;
- while (charsRead < maxGram) {
- final int inc = input.read(chars, charsRead, chars.length-charsRead);
- if (inc == -1) {
- exhausted = true;
- break;
- }
- charsRead += inc;
- if (charsRead == chars.length && charsRead < maxGram) {
- chars = ArrayUtil.grow(chars);
- }
- }
-
- inStr = new String(chars, 0, charsRead);
-
- if (!exhausted) {
- // Read extra throwaway chars so that on end() we
- // report the correct offset:
- char[] throwaway = new char[1024];
- while(true) {
- final int inc = input.read(throwaway, 0, throwaway.length);
- if (inc == -1) {
- break;
- }
- charsRead += inc;
- }
- }
-
- inLen = inStr.length();
- if (inLen == 0) {
- return false;
- }
- posIncrAtt.setPositionIncrement(1);
- } else {
- posIncrAtt.setPositionIncrement(1);
- }
-
- // if the remaining input is too short, we can't generate any n-grams
- if (gramSize > inLen) {
- return false;
- }
-
- // if we have hit the end of our n-gram size range, quit
- if (gramSize > maxGram || gramSize > inLen) {
- return false;
- }
-
- // grab gramSize chars from front or back
- termAtt.setEmpty().append(inStr, 0, gramSize);
- offsetAtt.setOffset(correctOffset(0), correctOffset(gramSize));
- gramSize++;
- return true;
- }
-
- @Override
- public void end() {
- // set final offset
- final int finalOffset = correctOffset(charsRead);
- this.offsetAtt.setOffset(finalOffset, finalOffset);
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- started = false;
- }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java Sun Aug 11 12:19:13 2013
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
@@ -33,6 +34,7 @@ import org.apache.lucene.util.Version;
* <a name="version"/>
* <p>You must specify the required {@link Version} compatibility when
 * creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filter:<ul>
+ * <li>handles supplementary characters correctly,</li>
* <li>emits all n-grams for the same token at the same position,</li>
* <li>does not modify offsets,</li>
* <li>sorts n-grams by their offset in the original token first, then
@@ -42,6 +44,10 @@ import org.apache.lucene.util.Version;
* {@link Version#LUCENE_44} in the constructor but this is not recommended as
* it will lead to broken {@link TokenStream}s that will cause highlighting
* bugs.
+ * <p>If you were using this {@link TokenFilter} to perform partial highlighting,
+ * this won't work anymore since this filter doesn't update offsets. You should
+ * modify your analysis chain to use {@link NGramTokenizer}, and potentially
+ * override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization.
*/
public final class NGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
@@ -51,6 +57,7 @@ public final class NGramTokenFilter exte
private char[] curTermBuffer;
private int curTermLength;
+ private int curCodePointCount;
private int curGramSize;
private int curPos;
private int curPosInc, curPosLen;
@@ -59,6 +66,7 @@ public final class NGramTokenFilter exte
private boolean hasIllegalOffsets; // only if the length changed before this filter
private final Version version;
+ private final CharacterUtils charUtils;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt;
private final PositionLengthAttribute posLenAtt;
@@ -75,6 +83,9 @@ public final class NGramTokenFilter exte
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
super(new LengthFilter(version, input, minGram, Integer.MAX_VALUE));
this.version = version;
+ this.charUtils = version.onOrAfter(Version.LUCENE_44)
+ ? CharacterUtils.getInstance(version)
+ : CharacterUtils.getJava4Instance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@@ -126,6 +137,7 @@ public final class NGramTokenFilter exte
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
+ curCodePointCount = charUtils.codePointCount(termAtt);
curGramSize = minGram;
curPos = 0;
curPosInc = posIncAtt.getPositionIncrement();
@@ -138,13 +150,15 @@ public final class NGramTokenFilter exte
}
}
if (version.onOrAfter(Version.LUCENE_44)) {
- if (curGramSize > maxGram || curPos + curGramSize > curTermLength) {
+ if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
++curPos;
curGramSize = minGram;
}
- if (curPos + curGramSize <= curTermLength) {
+ if ((curPos + curGramSize) <= curCodePointCount) {
clearAttributes();
- termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
+ final int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
+ final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
+ termAtt.copyBuffer(curTermBuffer, start, end - start);
posIncAtt.setPositionIncrement(curPosInc);
curPosInc = 0;
posLenAtt.setPositionLength(curPosLen);
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java Sun Aug 11 12:19:13 2013
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
@@ -40,29 +41,47 @@ import org.apache.lucene.util.Version;
* <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
* </table>
* <a name="version"/>
- * <p>Before Lucene 4.4, this class had a different behavior:<ul>
- * <li>It didn't support more than 1024 chars of input, the rest was trashed.</li>
- * <li>The last whitespaces of the 1024 chars block were trimmed.</li>
- * <li>Tokens were emitted in a different order (by increasing lengths).</li></ul>
- * <p>Although highly discouraged, it is still possible to use the old behavior
- * through {@link Lucene43NGramTokenizer}.
+ * <p>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
+ * <li>tokenize in a streaming fashion to support streams which are larger
+ * than 1024 chars (limit of the previous version),
+ * <li>count grams based on unicode code points instead of java chars (and
+ * never split in the middle of surrogate pairs),
+ * <li>give the ability to {@link #isTokenChar(int) pre-tokenize} the stream
+ * before computing n-grams.</ul>
+ * <p>Additionally, this class doesn't trim trailing whitespace and emits
+ * tokens in a different order: tokens are now emitted by increasing start
+ * offsets while they used to be emitted by increasing lengths (which
+ * prevented supporting large input streams).
+ * <p>Although <b style="color:red">highly</b> discouraged, it is still possible
+ * to use the old behavior through {@link Lucene43NGramTokenizer}.
*/
-public final class NGramTokenizer extends Tokenizer {
+// non-final to allow for overriding isTokenChar, but all other methods should be final
+public class NGramTokenizer extends Tokenizer {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
- private char[] buffer;
- private int bufferStart, bufferEnd; // remaining slice of the buffer
+ private CharacterUtils charUtils;
+ private CharacterUtils.CharacterBuffer charBuffer;
+ private int[] buffer; // like charBuffer, but converted to code points
+ private int bufferStart, bufferEnd; // remaining slice in buffer
private int offset;
private int gramSize;
private int minGram, maxGram;
private boolean exhausted;
+ private int lastCheckedChar; // last offset in the buffer that we checked
+ private int lastNonTokenChar; // last offset that we found to not be a token char
+ private boolean edgesOnly; // leading edges n-grams only
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ NGramTokenizer(Version version, Reader input, int minGram, int maxGram, boolean edgesOnly) {
+ super(input);
+ init(version, minGram, maxGram, edgesOnly);
+ }
+
/**
* Creates NGramTokenizer with given min and max n-grams.
* @param version the lucene compatibility <a href="#version">version</a>
@@ -71,8 +90,12 @@ public final class NGramTokenizer extend
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
- super(input);
- init(version, minGram, maxGram);
+ this(version, input, minGram, maxGram, false);
+ }
+
+ NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram, boolean edgesOnly) {
+ super(factory, input);
+ init(version, minGram, maxGram, edgesOnly);
}
/**
@@ -84,8 +107,7 @@ public final class NGramTokenizer extend
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
- super(factory, input);
- init(version, minGram, maxGram);
+ this(version, factory, input, minGram, maxGram, false);
}
/**
@@ -97,10 +119,13 @@ public final class NGramTokenizer extend
this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
}
- private void init(Version version, int minGram, int maxGram) {
- if (!version.onOrAfter(Version.LUCENE_44)) {
+ private void init(Version version, int minGram, int maxGram, boolean edgesOnly) {
+ if (!edgesOnly && !version.onOrAfter(Version.LUCENE_44)) {
throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer");
}
+ charUtils = version.onOrAfter(Version.LUCENE_44)
+ ? CharacterUtils.getInstance(version)
+ : CharacterUtils.getJava4Instance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@@ -109,66 +134,107 @@ public final class NGramTokenizer extend
}
this.minGram = minGram;
this.maxGram = maxGram;
- buffer = new char[maxGram + 1024];
+ this.edgesOnly = edgesOnly;
+ charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
+ buffer = new int[charBuffer.getBuffer().length];
+ // Make the term att large enough
+ termAtt.resizeBuffer(2 * maxGram);
}
- /** Returns the next token in the stream, or null at EOS. */
@Override
- public boolean incrementToken() throws IOException {
+ public final boolean incrementToken() throws IOException {
clearAttributes();
- // compact
- if (bufferStart >= buffer.length - maxGram) {
- System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
- bufferEnd -= bufferStart;
- bufferStart = 0;
-
- // fill in remaining space
- if (!exhausted) {
- // TODO: refactor to a shared readFully
- while (bufferEnd < buffer.length) {
- final int read = input.read(buffer, bufferEnd, buffer.length - bufferEnd);
- if (read == -1) {
- exhausted = true;
- break;
- }
- bufferEnd += read;
+ // termination of this loop is guaranteed by the fact that every iteration
+ // either advances the buffer (calls consumes()) or increases gramSize
+ while (true) {
+ // compact
+ if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted) {
+ System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
+ bufferEnd -= bufferStart;
+ lastCheckedChar -= bufferStart;
+ lastNonTokenChar -= bufferStart;
+ bufferStart = 0;
+
+ // fill in remaining space
+ exhausted = !charUtils.fill(charBuffer, input, buffer.length - bufferEnd);
+ // convert to code points
+ bufferEnd += charUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
+ }
+
+ // should we go to the next offset?
+ if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd) {
+ if (bufferStart + 1 + minGram > bufferEnd) {
+ assert exhausted;
+ return false;
}
+ consume();
+ gramSize = minGram;
}
+
+ updateLastNonTokenChar();
+
+ // retry if the token to be emitted was going to not only contain token chars
+ final boolean termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
+ final boolean isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
+ if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar) {
+ consume();
+ gramSize = minGram;
+ continue;
+ }
+
+ final int length = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
+ termAtt.setLength(length);
+ posIncAtt.setPositionIncrement(1);
+ posLenAtt.setPositionLength(1);
+ offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length));
+ ++gramSize;
+ return true;
}
+ }
- // should we go to the next offset?
- if (gramSize > maxGram || bufferStart + gramSize > bufferEnd) {
- bufferStart++;
- offset++;
- gramSize = minGram;
- }
-
- // are there enough chars remaining?
- if (bufferStart + gramSize > bufferEnd) {
- return false;
- }
-
- termAtt.copyBuffer(buffer, bufferStart, gramSize);
- posIncAtt.setPositionIncrement(1);
- posLenAtt.setPositionLength(1);
- offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + gramSize));
- ++gramSize;
+ private void updateLastNonTokenChar() {
+ final int termEnd = bufferStart + gramSize - 1;
+ if (termEnd > lastCheckedChar) {
+ for (int i = termEnd; i > lastCheckedChar; --i) {
+ if (!isTokenChar(buffer[i])) {
+ lastNonTokenChar = i;
+ break;
+ }
+ }
+ lastCheckedChar = termEnd;
+ }
+ }
+
+ /** Consume one code point. */
+ private void consume() {
+ offset += Character.charCount(buffer[bufferStart++]);
+ }
+
+ /** Only collect characters which satisfy this condition. */
+ protected boolean isTokenChar(int chr) {
return true;
}
@Override
- public void end() {
- final int endOffset = correctOffset(offset + bufferEnd - bufferStart);
+ public final void end() {
+ assert bufferStart <= bufferEnd;
+ int endOffset = offset;
+ for (int i = bufferStart; i < bufferEnd; ++i) {
+ endOffset += Character.charCount(buffer[i]);
+ }
+ endOffset = correctOffset(endOffset);
offsetAtt.setOffset(endOffset, endOffset);
}
@Override
- public void reset() throws IOException {
+ public final void reset() throws IOException {
super.reset();
bufferStart = bufferEnd = buffer.length;
+ lastNonTokenChar = lastCheckedChar = bufferStart - 1;
offset = 0;
gramSize = minGram;
exhausted = false;
+ charBuffer.reset();
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java Sun Aug 11 12:19:13 2013
@@ -35,12 +35,26 @@ import org.apache.lucene.analysis.tokena
* </p>
*/
public final class NorwegianLightStemFilter extends TokenFilter {
- private final NorwegianLightStemmer stemmer = new NorwegianLightStemmer();
+ private final NorwegianLightStemmer stemmer;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
-
+
+ /**
+ * Calls {@link #NorwegianLightStemFilter(TokenStream, int)
+ * NorwegianLightStemFilter(input, BOKMAAL)}
+ */
public NorwegianLightStemFilter(TokenStream input) {
+ this(input, NorwegianLightStemmer.BOKMAAL);
+ }
+
+ /**
+ * Creates a new NorwegianLightStemFilter
+ * @param flags set to {@link NorwegianLightStemmer#BOKMAAL},
+ * {@link NorwegianLightStemmer#NYNORSK}, or both.
+ */
+ public NorwegianLightStemFilter(TokenStream input, int flags) {
super(input);
+ stemmer = new NorwegianLightStemmer(flags);
}
@Override
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilterFactory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilterFactory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilterFactory.java Sun Aug 11 12:19:13 2013
@@ -23,6 +23,9 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.no.NorwegianLightStemFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
+import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
+import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
+
/**
* Factory for {@link NorwegianLightStemFilter}.
* <pre class="prettyprint">
@@ -30,15 +33,27 @@ import org.apache.lucene.analysis.util.T
* <analyzer>
* <tokenizer class="solr.StandardTokenizerFactory"/>
* <filter class="solr.LowerCaseFilterFactory"/>
- * <filter class="solr.NorwegianLightStemFilterFactory"/>
+ * <filter class="solr.NorwegianLightStemFilterFactory" variant="nb"/>
* </analyzer>
* </fieldType></pre>
*/
public class NorwegianLightStemFilterFactory extends TokenFilterFactory {
+ private final int flags;
+
/** Creates a new NorwegianLightStemFilterFactory */
public NorwegianLightStemFilterFactory(Map<String,String> args) {
super(args);
+ String variant = get(args, "variant");
+ if (variant == null || "nb".equals(variant)) {
+ flags = BOKMAAL;
+ } else if ("nn".equals(variant)) {
+ flags = NYNORSK;
+ } else if ("no".equals(variant)) {
+ flags = BOKMAAL | NYNORSK;
+ } else {
+ throw new IllegalArgumentException("invalid variant: " + variant);
+ }
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -46,6 +61,6 @@ public class NorwegianLightStemFilterFac
@Override
public TokenStream create(TokenStream input) {
- return new NorwegianLightStemFilter(input);
+ return new NorwegianLightStemFilter(input, flags);
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java Sun Aug 11 12:19:13 2013
@@ -62,50 +62,106 @@ import static org.apache.lucene.analysis
* corpus to validate against whereas the Norwegian one is hand crafted.
*/
public class NorwegianLightStemmer {
+ /** Constant to remove Bokmål-specific endings */
+ public static final int BOKMAAL = 1;
+ /** Constant to remove Nynorsk-specific endings */
+ public static final int NYNORSK = 2;
+ final boolean useBokmaal;
+ final boolean useNynorsk;
+
+ /**
+ * Creates a new NorwegianLightStemmer
+ * @param flags set to {@link #BOKMAAL}, {@link #NYNORSK}, or both.
+ */
+ public NorwegianLightStemmer(int flags) {
+ if (flags <= 0 || flags > BOKMAAL + NYNORSK) {
+ throw new IllegalArgumentException("invalid flags");
+ }
+ useBokmaal = (flags & BOKMAAL) != 0;
+ useNynorsk = (flags & NYNORSK) != 0;
+ }
+
public int stem(char s[], int len) {
// Remove possessive -s (bilens -> bilen) and continue checking
if (len > 4 && s[len-1] == 's')
len--;
// Remove common endings, single-pass
- if (len > 7 &&
- (endsWith(s, len, "heter") || // general ending (hemmelig-heter -> hemmelig)
- endsWith(s, len, "heten"))) // general ending (hemmelig-heten -> hemmelig)
+ if (len > 7 &&
+ ((endsWith(s, len, "heter") &&
+ useBokmaal) || // general ending (hemmelig-heter -> hemmelig)
+ (endsWith(s, len, "heten") &&
+ useBokmaal) || // general ending (hemmelig-heten -> hemmelig)
+ (endsWith(s, len, "heita") &&
+ useNynorsk))) // general ending (hemmeleg-heita -> hemmeleg)
return len - 5;
+
+ // Remove Nynorsk common endings, single-pass
+ if (len > 8 && useNynorsk &&
+ (endsWith(s, len, "heiter") || // general ending (hemmeleg-heiter -> hemmeleg)
+ endsWith(s, len, "leiken") || // general ending (trygg-leiken -> trygg)
+ endsWith(s, len, "leikar"))) // general ending (trygg-leikar -> trygg)
+ return len - 6;
if (len > 5 &&
- (endsWith(s, len, "dom") || // general ending (kristen-dom -> kristen)
- endsWith(s, len, "het"))) // general ending (hemmelig-het -> hemmelig)
+ (endsWith(s, len, "dom") || // general ending (kristen-dom -> kristen)
+ (endsWith(s, len, "het") &&
+ useBokmaal))) // general ending (hemmelig-het -> hemmelig)
return len - 3;
+ if (len > 6 && useNynorsk &&
+ (endsWith(s, len, "heit") || // general ending (hemmeleg-heit -> hemmeleg)
+ endsWith(s, len, "semd") || // general ending (verk-semd -> verk)
+ endsWith(s, len, "leik"))) // general ending (trygg-leik -> trygg)
+ return len - 4;
+
if (len > 7 &&
(endsWith(s, len, "elser") || // general ending (føl-elser -> føl)
endsWith(s, len, "elsen"))) // general ending (føl-elsen -> føl)
return len - 5;
if (len > 6 &&
- (endsWith(s, len, "ende") || // (sov-ende -> sov)
+ ((endsWith(s, len, "ende") &&
+ useBokmaal) || // (sov-ende -> sov)
+ (endsWith(s, len, "ande") &&
+ useNynorsk) || // (sov-ande -> sov)
endsWith(s, len, "else") || // general ending (føl-else -> føl)
- endsWith(s, len, "este") || // adj (fin-este -> fin)
- endsWith(s, len, "eren"))) // masc
+ (endsWith(s, len, "este") &&
+ useBokmaal) || // adj (fin-este -> fin)
+ (endsWith(s, len, "aste") &&
+ useNynorsk) || // adj (fin-aste -> fin)
+ (endsWith(s, len, "eren") &&
+ useBokmaal) || // masc
+ (endsWith(s, len, "aren") &&
+ useNynorsk))) // masc
return len - 4;
if (len > 5 &&
- (endsWith(s, len, "ere") || // adj (fin-ere -> fin)
- endsWith(s, len, "est") || // adj (fin-est -> fin)
- endsWith(s, len, "ene") // masc/fem/neutr pl definite (hus-ene)
- ))
+ ((endsWith(s, len, "ere") &&
+ useBokmaal) || // adj (fin-ere -> fin)
+ (endsWith(s, len, "are") &&
+ useNynorsk) || // adj (fin-are -> fin)
+ (endsWith(s, len, "est") &&
+ useBokmaal) || // adj (fin-est -> fin)
+ (endsWith(s, len, "ast") &&
+ useNynorsk) || // adj (fin-ast -> fin)
+ endsWith(s, len, "ene") || // masc/fem/neutr pl definite (hus-ene)
+ (endsWith(s, len, "ane") &&
+ useNynorsk))) // masc pl definite (gut-ane)
return len - 3;
if (len > 4 &&
(endsWith(s, len, "er") || // masc/fem indefinite
endsWith(s, len, "en") || // masc/fem definite
endsWith(s, len, "et") || // neutr definite
- endsWith(s, len, "st") || // adj (billig-st -> billig)
+ (endsWith(s, len, "ar") &&
+ useNynorsk) || // masc pl indefinite
+ (endsWith(s, len, "st") &&
+ useBokmaal) || // adj (billig-st -> billig)
endsWith(s, len, "te")))
return len - 2;
-
+
if (len > 3)
switch(s[len-1]) {
case 'a': // fem definite
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilter.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilter.java Sun Aug 11 12:19:13 2013
@@ -35,12 +35,26 @@ import org.apache.lucene.analysis.tokena
* </p>
*/
public final class NorwegianMinimalStemFilter extends TokenFilter {
- private final NorwegianMinimalStemmer stemmer = new NorwegianMinimalStemmer();
+ private final NorwegianMinimalStemmer stemmer;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+ /**
+ * Calls {@link #NorwegianMinimalStemFilter(TokenStream, int)
+ * NorwegianMinimalStemFilter(input, BOKMAAL)}
+ */
public NorwegianMinimalStemFilter(TokenStream input) {
+ this(input, NorwegianLightStemmer.BOKMAAL);
+ }
+
+ /**
+ * Creates a new NorwegianLightStemFilter
+ * @param flags set to {@link NorwegianLightStemmer#BOKMAAL},
+ * {@link NorwegianLightStemmer#NYNORSK}, or both.
+ */
+ public NorwegianMinimalStemFilter(TokenStream input, int flags) {
super(input);
+ this.stemmer = new NorwegianMinimalStemmer(flags);
}
@Override
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilterFactory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilterFactory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilterFactory.java Sun Aug 11 12:19:13 2013
@@ -23,6 +23,9 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.no.NorwegianMinimalStemFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
+import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
+import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
+
/**
* Factory for {@link NorwegianMinimalStemFilter}.
* <pre class="prettyprint">
@@ -30,15 +33,27 @@ import org.apache.lucene.analysis.util.T
* <analyzer>
* <tokenizer class="solr.StandardTokenizerFactory"/>
* <filter class="solr.LowerCaseFilterFactory"/>
- * <filter class="solr.NorwegianMinimalStemFilterFactory"/>
+ * <filter class="solr.NorwegianMinimalStemFilterFactory" variant="nb"/>
* </analyzer>
* </fieldType></pre>
*/
public class NorwegianMinimalStemFilterFactory extends TokenFilterFactory {
+ private final int flags;
+
/** Creates a new NorwegianMinimalStemFilterFactory */
public NorwegianMinimalStemFilterFactory(Map<String,String> args) {
super(args);
+ String variant = get(args, "variant");
+ if (variant == null || "nb".equals(variant)) {
+ flags = BOKMAAL;
+ } else if ("nn".equals(variant)) {
+ flags = NYNORSK;
+ } else if ("no".equals(variant)) {
+ flags = BOKMAAL | NYNORSK;
+ } else {
+ throw new IllegalArgumentException("invalid variant: " + variant);
+ }
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -46,6 +61,6 @@ public class NorwegianMinimalStemFilterF
@Override
public TokenStream create(TokenStream input) {
- return new NorwegianMinimalStemFilter(input);
+ return new NorwegianMinimalStemFilter(input, flags);
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.java Sun Aug 11 12:19:13 2013
@@ -53,31 +53,52 @@ package org.apache.lucene.analysis.no;
*/
import static org.apache.lucene.analysis.util.StemmerUtil.*;
+import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
+import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
/**
- * Minimal Stemmer for Norwegian bokmål (no-nb)
+ * Minimal Stemmer for Norwegian Bokmål (no-nb) and Nynorsk (no-nn)
* <p>
* Stems known plural forms for Norwegian nouns only, together with genitiv -s
*/
public class NorwegianMinimalStemmer {
+ final boolean useBokmaal;
+ final boolean useNynorsk;
- public int stem(char s[], int len) {
+ /**
+ * Creates a new NorwegianMinimalStemmer
+ * @param flags set to {@link NorwegianLightStemmer#BOKMAAL},
+ * {@link NorwegianLightStemmer#NYNORSK}, or both.
+ */
+ public NorwegianMinimalStemmer(int flags) {
+ if (flags <= 0 || flags > BOKMAAL + NYNORSK) {
+ throw new IllegalArgumentException("invalid flags");
+ }
+ useBokmaal = (flags & BOKMAAL) != 0;
+ useNynorsk = (flags & NYNORSK) != 0;
+ }
+
+ public int stem(char s[], int len) {
// Remove genitiv s
if (len > 4 && s[len-1] == 's')
len--;
if (len > 5 &&
- endsWith(s, len, "ene") // masc/fem/neutr pl definite (hus-ene)
- )
+ (endsWith(s, len, "ene") || // masc/fem/neutr pl definite (hus-ene)
+ (endsWith(s, len, "ane") &&
+ useNynorsk // masc pl definite (gut-ane)
+ )))
return len - 3;
-
+
if (len > 4 &&
- (endsWith(s, len, "er") || // masc/fem indefinite
- endsWith(s, len, "en") || // masc/fem definite
- endsWith(s, len, "et") // neutr definite
- ))
+ (endsWith(s, len, "er") || // masc/fem indefinite
+ endsWith(s, len, "en") || // masc/fem definite
+ endsWith(s, len, "et") || // neutr definite
+ (endsWith(s, len, "ar") &&
+ useNynorsk // masc pl indefinite
+ )))
return len - 2;
-
+
if (len > 3)
switch(s[len-1]) {
case 'a': // fem definite
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java Sun Aug 11 12:19:13 2013
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/19/12 6:23 PM */
+/* The following code was generated by JFlex. */
package org.apache.lucene.analysis.standard;
@@ -29,13 +29,10 @@ WARNING: if you change ClassicTokenizerI
import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-
/**
- * This class is a scanner generated by
- * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 9/19/12 6:23 PM from the specification file
- * <tt>C:/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
+ * This class implements the classic lucene StandardTokenizer up until 3.0
*/
+
class ClassicTokenizerImpl implements StandardTokenizerInterface {
/** This character denotes the end of file */
@@ -359,7 +356,6 @@ public static final int ACRONYM_DEP
public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
-@Override
public final int yychar()
{
return yychar;
@@ -368,7 +364,6 @@ public final int yychar()
/**
* Fills CharTermAttribute with the current token text.
*/
-@Override
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
@@ -484,7 +479,6 @@ public final void getText(CharTermAttrib
*
* @param reader the new input stream
*/
- @Override
public final void yyreset(java.io.Reader reader) {
zzReader = reader;
zzAtBOL = true;
@@ -544,7 +538,6 @@ public final void getText(CharTermAttrib
/**
* Returns the length of the matched text region.
*/
- @Override
public final int yylength() {
return zzMarkedPos-zzStartRead;
}
@@ -600,7 +593,6 @@ public final void getText(CharTermAttrib
* @return the next token
* @exception java.io.IOException if any I/O-Error occurs
*/
- @Override
public int getNextToken() throws java.io.IOException {
int zzInput;
int zzAction;
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex Sun Aug 11 12:19:13 2013
@@ -27,6 +27,9 @@ WARNING: if you change ClassicTokenizerI
import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+/**
+ * This class implements the classic lucene StandardTokenizer up until 3.0
+ */
%%
%class ClassicTokenizerImpl
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro Sun Aug 11 12:19:13 2013
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-// Generated using ICU4J 49.1.0.0 on Wednesday, September 19, 2012 10:23:34 PM UTC
+// Generated using ICU4J 49.1.0.0
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java Sun Aug 11 12:19:13 2013
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/19/12 6:23 PM */
+/* The following code was generated by JFlex. */
package org.apache.lucene.analysis.standard;
@@ -843,7 +843,6 @@ public final class StandardTokenizerImpl
public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
- @Override
public final int yychar()
{
return yychar;
@@ -852,7 +851,6 @@ public final class StandardTokenizerImpl
/**
* Fills CharTermAttribute with the current token text.
*/
- @Override
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
@@ -967,7 +965,6 @@ public final class StandardTokenizerImpl
*
* @param reader the new input stream
*/
- @Override
public final void yyreset(java.io.Reader reader) {
zzReader = reader;
zzAtBOL = true;
@@ -1027,7 +1024,6 @@ public final class StandardTokenizerImpl
/**
* Returns the length of the matched text region.
*/
- @Override
public final int yylength() {
return zzMarkedPos-zzStartRead;
}
@@ -1083,7 +1079,6 @@ public final class StandardTokenizerImpl
* @return the next token
* @exception java.io.IOException if any I/O-Error occurs
*/
- @Override
public int getNextToken() throws java.io.IOException {
int zzInput;
int zzAction;
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java Sun Aug 11 12:19:13 2013
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/19/12 6:23 PM */
+/* The following code was generated by JFlex. */
package org.apache.lucene.analysis.standard;
@@ -4033,7 +4033,6 @@ public final class UAX29URLEmailTokenize
public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;
- @Override
public final int yychar()
{
return yychar;
@@ -4042,7 +4041,6 @@ public final class UAX29URLEmailTokenize
/**
* Fills CharTermAttribute with the current token text.
*/
- @Override
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
@@ -4157,7 +4155,6 @@ public final class UAX29URLEmailTokenize
*
* @param reader the new input stream
*/
- @Override
public final void yyreset(java.io.Reader reader) {
zzReader = reader;
zzAtBOL = true;
@@ -4217,7 +4214,6 @@ public final class UAX29URLEmailTokenize
/**
* Returns the length of the matched text region.
*/
- @Override
public final int yylength() {
return zzMarkedPos-zzStartRead;
}
@@ -4273,7 +4269,6 @@ public final class UAX29URLEmailTokenize
* @return the next token
* @exception java.io.IOException if any I/O-Error occurs
*/
- @Override
public int getNextToken() throws java.io.IOException {
int zzInput;
int zzAction;
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java Sun Aug 11 12:19:13 2013
@@ -68,6 +68,7 @@ public class SynonymFilterFactory extend
private final String synonyms;
private final String format;
private final boolean expand;
+ private final String analyzerName;
private final Map<String, String> tokArgs = new HashMap<String, String>();
private SynonymMap map;
@@ -79,7 +80,13 @@ public class SynonymFilterFactory extend
format = get(args, "format");
expand = getBoolean(args, "expand", true);
+ analyzerName = get(args, "analyzer");
tokenizerFactory = get(args, "tokenizerFactory");
+ if (analyzerName != null && tokenizerFactory != null) {
+ throw new IllegalArgumentException("Analyzer and TokenizerFactory can't be specified both: " +
+ analyzerName + " and " + tokenizerFactory);
+ }
+
if (tokenizerFactory != null) {
assureMatchVersion();
tokArgs.put("luceneMatchVersion", getLuceneMatchVersion().toString());
@@ -104,15 +111,20 @@ public class SynonymFilterFactory extend
@Override
public void inform(ResourceLoader loader) throws IOException {
final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);
+ Analyzer analyzer;
- Analyzer analyzer = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_50, reader) : factory.create(reader);
- TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_50, tokenizer) : tokenizer;
- return new TokenStreamComponents(tokenizer, stream);
- }
- };
+ if (analyzerName != null) {
+ analyzer = loadAnalyzer(loader, analyzerName);
+ } else {
+ analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_50, reader) : factory.create(reader);
+ TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_50, tokenizer) : tokenizer;
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+ };
+ }
try {
if (format == null || format.equals("solr")) {
@@ -188,4 +200,17 @@ public class SynonymFilterFactory extend
throw new RuntimeException(e);
}
}
+
+ private Analyzer loadAnalyzer(ResourceLoader loader, String cname) throws IOException {
+ Class<? extends Analyzer> clazz = loader.findClass(cname, Analyzer.class);
+ try {
+ Analyzer analyzer = clazz.getConstructor(Version.class).newInstance(Version.LUCENE_50);
+ if (analyzer instanceof ResourceLoaderAware) {
+ ((ResourceLoaderAware) analyzer).inform(loader);
+ }
+ return analyzer;
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java Sun Aug 11 12:19:13 2013
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.synon
*/
import java.io.IOException;
-import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
@@ -112,7 +111,7 @@ public class SynonymMap {
* separates by {@link SynonymMap#WORD_SEPARATOR}.
* reuse and its chars must not be null. */
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
- TokenStream ts = analyzer.tokenStream("", new StringReader(text));
+ TokenStream ts = analyzer.tokenStream("", text);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
ts.reset();
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java Sun Aug 11 12:19:13 2013
@@ -57,7 +57,7 @@ public final class TurkishLowerCaseFilte
final char[] buffer = termAtt.buffer();
int length = termAtt.length();
for (int i = 0; i < length;) {
- final int ch = Character.codePointAt(buffer, i);
+ final int ch = Character.codePointAt(buffer, i, length);
iOrAfter = (ch == LATIN_CAPITAL_LETTER_I ||
(iOrAfter && Character.getType(ch) == Character.NON_SPACING_MARK));
@@ -100,7 +100,7 @@ public final class TurkishLowerCaseFilte
*/
private boolean isBeforeDot(char s[], int pos, int len) {
for (int i = pos; i < len;) {
- final int ch = Character.codePointAt(s, i);
+ final int ch = Character.codePointAt(s, i, len);
if (Character.getType(ch) != Character.NON_SPACING_MARK)
return false;
if (ch == COMBINING_DOT_ABOVE)
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java Sun Aug 11 12:19:13 2013
@@ -262,7 +262,7 @@ public class CharArrayMap<V> extends Abs
if (ignoreCase) {
for(int i=0;i<len;) {
final int codePointAt = charUtils.codePointAt(text1, off+i, limit);
- if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
+ if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
return false;
i += Character.charCount(codePointAt);
}
@@ -282,7 +282,7 @@ public class CharArrayMap<V> extends Abs
if (ignoreCase) {
for(int i=0;i<len;) {
final int codePointAt = charUtils.codePointAt(text1, i);
- if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
+ if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
return false;
i += Character.charCount(codePointAt);
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java Sun Aug 11 12:19:13 2013
@@ -100,7 +100,8 @@ public abstract class CharTokenizer exte
while (true) {
if (bufferIndex >= dataLen) {
offset += dataLen;
- if(!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils
+ charUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
+ if (ioBuffer.getLength() == 0) {
dataLen = 0; // so next offset += dataLen won't decrement offset
if (length > 0) {
break;
@@ -113,7 +114,7 @@ public abstract class CharTokenizer exte
bufferIndex = 0;
}
// use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
- final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);
+ final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
final int charCount = Character.charCount(c);
bufferIndex += charCount;
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java Sun Aug 11 12:19:13 2013
@@ -52,27 +52,6 @@ public abstract class CharacterUtils {
}
/**
- * Returns the code point at the given index of the char array.
- * Depending on the {@link Version} passed to
- * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
- * of {@link Character#codePointAt(char[], int)} as it would have been
- * available on a Java 1.4 JVM or on a later virtual machine version.
- *
- * @param chars
- * a character array
- * @param offset
- * the offset to the char values in the chars array to be converted
- *
- * @return the Unicode code point at the given index
- * @throws NullPointerException
- * - if the array is null.
- * @throws IndexOutOfBoundsException
- * - if the value offset is negative or not less than the length of
- * the char array.
- */
- public abstract int codePointAt(final char[] chars, final int offset);
-
- /**
* Returns the code point at the given index of the {@link CharSequence}.
* Depending on the {@link Version} passed to
* {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
@@ -116,7 +95,10 @@ public abstract class CharacterUtils {
* the char array.
*/
public abstract int codePointAt(final char[] chars, final int offset, final int limit);
-
+
+ /** Return the number of characters in <code>seq</code>. */
+ public abstract int codePointCount(CharSequence seq);
+
/**
* Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
* of the given bufferSize.
@@ -140,51 +122,105 @@ public abstract class CharacterUtils {
* @param offset the offset to start at
* @param limit the max char in the buffer to lower case
*/
- public void toLowerCase(final char[] buffer, final int offset, final int limit) {
+ public final void toLowerCase(final char[] buffer, final int offset, final int limit) {
assert buffer.length >= limit;
assert offset <=0 && offset <= buffer.length;
for (int i = offset; i < limit;) {
i += Character.toChars(
Character.toLowerCase(
- codePointAt(buffer, i)), buffer, i);
+ codePointAt(buffer, i, limit)), buffer, i);
}
}
-
+
+ /** Converts a sequence of Java characters to a sequence of unicode code points.
+ * @return the number of code points written to the destination buffer */
+ public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
+ if (srcLen < 0) {
+ throw new IllegalArgumentException("srcLen must be >= 0");
+ }
+ int codePointCount = 0;
+ for (int i = 0; i < srcLen; ) {
+ final int cp = codePointAt(src, srcOff + i, srcOff + srcLen);
+ final int charCount = Character.charCount(cp);
+ dest[destOff + codePointCount++] = cp;
+ i += charCount;
+ }
+ return codePointCount;
+ }
+
+ /** Converts a sequence of unicode code points to a sequence of Java characters.
+ * @return the number of chars written to the destination buffer */
+ public final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
+ if (srcLen < 0) {
+ throw new IllegalArgumentException("srcLen must be >= 0");
+ }
+ int written = 0;
+ for (int i = 0; i < srcLen; ++i) {
+ written += Character.toChars(src[srcOff + i], dest, destOff + written);
+ }
+ return written;
+ }
+
/**
* Fills the {@link CharacterBuffer} with characters read from the given
- * reader {@link Reader}. This method tries to read as many characters into
- * the {@link CharacterBuffer} as possible, each call to fill will start
- * filling the buffer from offset <code>0</code> up to the length of the size
- * of the internal character array.
+ * reader {@link Reader}. This method tries to read <code>numChars</code>
+ * characters into the {@link CharacterBuffer}, each call to fill will start
+ * filling the buffer from offset <code>0</code> up to <code>numChars</code>.
+ * In case code points can span across 2 java characters, this method may
+ * only fill <code>numChars - 1</code> characters in order not to split in
+ * the middle of a surrogate pair, even if there are remaining characters in
+ * the {@link Reader}.
* <p>
* Depending on the {@link Version} passed to
* {@link CharacterUtils#getInstance(Version)} this method implements
* supplementary character awareness when filling the given buffer. For all
- * {@link Version} > 3.0 {@link #fill(CharacterBuffer, Reader)} guarantees
+ * {@link Version} > 3.0 {@link #fill(CharacterBuffer, Reader, int)} guarantees
* that the given {@link CharacterBuffer} will never contain a high surrogate
* character as the last element in the buffer unless it is the last available
* character in the reader. In other words, high and low surrogate pairs will
* always be preserved across buffer boarders.
* </p>
+ * <p>
+ * A return value of <code>false</code> means that this method call exhausted
+ * the reader, but there may be some bytes which have been read, which can be
+ * verified by checking whether <code>buffer.getLength() > 0</code>.
+ * </p>
*
* @param buffer
* the buffer to fill.
* @param reader
* the reader to read characters from.
- * @return <code>true</code> if and only if no more characters are available
- * in the reader, otherwise <code>false</code>.
+ * @param numChars
+ * the number of chars to read
+ * @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
* @throws IOException
* if the reader throws an {@link IOException}.
*/
- public abstract boolean fill(CharacterBuffer buffer, Reader reader) throws IOException;
+ public abstract boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException;
- private static final class Java5CharacterUtils extends CharacterUtils {
- Java5CharacterUtils() {
+ /** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
+ public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
+ return fill(buffer, reader, buffer.buffer.length);
+ }
+
+ /** Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
+ * code points from <code>index</code>. */
+ public abstract int offsetByCodePoints(char[] buf, int start, int count, int index, int offset);
+
+ static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
+ int read = 0;
+ while (read < len) {
+ final int r = reader.read(dest, offset + read, len - read);
+ if (r == -1) {
+ break;
+ }
+ read += r;
}
+ return read;
+ }
- @Override
- public int codePointAt(final char[] chars, final int offset) {
- return Character.codePointAt(chars, offset);
+ private static final class Java5CharacterUtils extends CharacterUtils {
+ Java5CharacterUtils() {
}
@Override
@@ -198,7 +234,11 @@ public abstract class CharacterUtils {
}
@Override
- public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
+ public boolean fill(final CharacterBuffer buffer, final Reader reader, int numChars) throws IOException {
+ assert buffer.buffer.length >= 2;
+ if (numChars < 2 || numChars > buffer.buffer.length) {
+ throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
+ }
final char[] charBuffer = buffer.buffer;
buffer.offset = 0;
final int offset;
@@ -206,47 +246,36 @@ public abstract class CharacterUtils {
// Install the previously saved ending high surrogate:
if (buffer.lastTrailingHighSurrogate != 0) {
charBuffer[0] = buffer.lastTrailingHighSurrogate;
+ buffer.lastTrailingHighSurrogate = 0;
offset = 1;
} else {
offset = 0;
}
- final int read = reader.read(charBuffer,
- offset,
- charBuffer.length - offset);
- if (read == -1) {
- buffer.length = offset;
- buffer.lastTrailingHighSurrogate = 0;
- return offset != 0;
- }
- assert read > 0;
- buffer.length = read + offset;
-
- // If we read only a single char, and that char was a
- // high surrogate, read again:
- if (buffer.length == 1
- && Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
- final int read2 = reader.read(charBuffer,
- 1,
- charBuffer.length - 1);
- if (read2 == -1) {
- // NOTE: mal-formed input (ended on a high
- // surrogate)! Consumer must deal with it...
- return true;
- }
- assert read2 > 0;
+ final int read = readFully(reader, charBuffer, offset, numChars - offset);
- buffer.length += read2;
+ buffer.length = offset + read;
+ final boolean result = buffer.length == numChars;
+ if (buffer.length < numChars) {
+ // We failed to fill the buffer. Even if the last char is a high
+ // surrogate, there is nothing we can do
+ return result;
}
- if (buffer.length > 1
- && Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
+ if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
- } else {
- buffer.lastTrailingHighSurrogate = 0;
}
+ return result;
+ }
- return true;
+ @Override
+ public int codePointCount(CharSequence seq) {
+ return Character.codePointCount(seq, 0, seq.length());
+ }
+
+ @Override
+ public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
+ return Character.offsetByCodePoints(buf, start, count, index, offset);
}
}
@@ -255,11 +284,6 @@ public abstract class CharacterUtils {
}
@Override
- public int codePointAt(final char[] chars, final int offset) {
- return chars[offset];
- }
-
- @Override
public int codePointAt(final CharSequence seq, final int offset) {
return seq.charAt(offset);
}
@@ -272,13 +296,31 @@ public abstract class CharacterUtils {
}
@Override
- public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
+ public boolean fill(CharacterBuffer buffer, Reader reader, int numChars)
+ throws IOException {
+ assert buffer.buffer.length >= 1;
+ if (numChars < 1 || numChars > buffer.buffer.length) {
+ throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size");
+ }
buffer.offset = 0;
- final int read = reader.read(buffer.buffer);
- if(read == -1)
- return false;
+ final int read = readFully(reader, buffer.buffer, 0, numChars);
buffer.length = read;
- return true;
+ buffer.lastTrailingHighSurrogate = 0;
+ return read == numChars;
+ }
+
+ @Override
+ public int codePointCount(CharSequence seq) {
+ return seq.length();
+ }
+
+ @Override
+ public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
+ final int result = index + offset;
+ if (result < 0 || result > count) {
+ throw new IndexOutOfBoundsException();
+ }
+ return result;
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java Sun Aug 11 12:19:13 2013
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/19/12 6:23 PM */
+/* The following code was generated by JFlex. */
package org.apache.lucene.analysis.wikipedia;
@@ -21,13 +21,10 @@ package org.apache.lucene.analysis.wikip
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-
/**
- * This class is a scanner generated by
- * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 9/19/12 6:23 PM from the specification file
- * <tt>C:/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
+ * JFlex-generated tokenizer that is aware of Wikipedia syntax.
*/
+
class WikipediaTokenizerImpl {
/** This character denotes the end of file */