You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by ma...@apache.org on 2010/03/14 21:58:34 UTC
svn commit: r922957 [2/3] - in /lucene/solr/branches/solr: ./ lib/
src/common/org/apache/solr/common/util/ src/java/org/apache/solr/analysis/
src/java/org/apache/solr/handler/ src/java/org/apache/solr/handler/admin/
src/java/org/apache/solr/handler/com...
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java Sun Mar 14 20:58:32 2010
@@ -19,12 +19,14 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.ArrayUtil;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
/**
* Splits words into subwords and performs optional transformations on subword groups.
@@ -50,118 +52,108 @@ import java.util.List;
* - "Super-Duper-XL500-42-AutoCoder!" -> 0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"
*
* One use for WordDelimiterFilter is to help match words with different subword delimiters.
- * For example, if the source text contained "wi-fi" one may want "wifi" "WiFi" "wi-fi" "wi+fi"
- * queries to all match.
- * One way of doing so is to specify combinations="1" in the analyzer
- * used for indexing, and combinations="0" (the default) in the analyzer
- * used for querying. Given that the current StandardTokenizer
- * immediately removes many intra-word delimiters, it is recommended that
- * this filter be used after a tokenizer that does not do this
- * (such as WhitespaceTokenizer).
+ * For example, if the source text contained "wi-fi" one may want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match.
+ * One way of doing so is to specify combinations="1" in the analyzer used for indexing, and combinations="0" (the default)
+ * in the analyzer used for querying. Given that the current StandardTokenizer immediately removes many intra-word
+ * delimiters, it is recommended that this filter be used after a tokenizer that does not do this (such as WhitespaceTokenizer).
*
* @version $Id$
*/
final class WordDelimiterFilter extends TokenFilter {
- private final byte[] charTypeTable;
-
- public static final int LOWER=0x01;
- public static final int UPPER=0x02;
- public static final int DIGIT=0x04;
- public static final int SUBWORD_DELIM=0x08;
+
+ public static final int LOWER = 0x01;
+ public static final int UPPER = 0x02;
+ public static final int DIGIT = 0x04;
+ public static final int SUBWORD_DELIM = 0x08;
// combinations: for testing, not for setting bits
- public static final int ALPHA=0x03;
- public static final int ALPHANUM=0x07;
-
- // TODO: should there be a WORD_DELIM category for
- // chars that only separate words (no catenation of subwords
- // will be done if separated by these chars?)
- // "," would be an obvious candidate...
-
- static byte[] defaultWordDelimTable;
- static {
- byte[] tab = new byte[256];
- for (int i=0; i<256; i++) {
- byte code = 0;
- if (Character.isLowerCase(i)) code |= LOWER;
- else if (Character.isUpperCase(i)) code |= UPPER;
- else if (Character.isDigit(i)) code |= DIGIT;
- if (code==0) code=SUBWORD_DELIM;
- tab[i]=code;
- }
- defaultWordDelimTable = tab;
- }
+ public static final int ALPHA = 0x03;
+ public static final int ALPHANUM = 0x07;
/**
- * If 1, causes parts of words to be generated:
+ * If true, causes parts of words to be generated:
* <p/>
* "PowerShot" => "Power" "Shot"
*/
- final int generateWordParts;
+ final boolean generateWordParts;
/**
- * If 1, causes number subwords to be generated:
+ * If true, causes number subwords to be generated:
* <p/>
* "500-42" => "500" "42"
*/
- final int generateNumberParts;
+ final boolean generateNumberParts;
/**
- * If 1, causes maximum runs of word parts to be catenated:
+ * If true, causes maximum runs of word parts to be catenated:
* <p/>
* "wi-fi" => "wifi"
*/
- final int catenateWords;
+ final boolean catenateWords;
/**
- * If 1, causes maximum runs of number parts to be catenated:
+ * If true, causes maximum runs of number parts to be catenated:
* <p/>
* "500-42" => "50042"
*/
- final int catenateNumbers;
+ final boolean catenateNumbers;
/**
- * If 1, causes all subword parts to be catenated:
+ * If true, causes all subword parts to be catenated:
* <p/>
* "wi-fi-4000" => "wifi4000"
*/
- final int catenateAll;
+ final boolean catenateAll;
/**
- * If 0, causes case changes to be ignored (subwords will only be generated
- * given SUBWORD_DELIM tokens). (Defaults to 1)
- */
- final int splitOnCaseChange;
-
- /**
- * If 1, original words are preserved and added to the subword list (Defaults to 0)
+ * If true, original words are preserved and added to the subword list (Defaults to false)
* <p/>
* "500-42" => "500" "42" "500-42"
*/
- final int preserveOriginal;
-
- /**
- * If 0, causes numeric changes to be ignored (subwords will only be generated
- * given SUBWORD_DELIM tokens). (Defaults to 1)
- */
- final int splitOnNumerics;
-
- /**
- * If 1, causes trailing "'s" to be removed for each subword. (Defaults to 1)
- * <p/>
- * "O'Neil's" => "O", "Neil"
- */
- final int stemEnglishPossessive;
+ final boolean preserveOriginal;
/**
* If not null is the set of tokens to protect from being delimited
*
*/
final CharArraySet protWords;
+
+ private final TermAttribute termAtttribute = (TermAttribute) addAttribute(TermAttribute.class);
+ private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
+
+ // used for iterating word delimiter breaks
+ private final WordDelimiterIterator iterator;
+
+ // used for concatenating runs of similar typed subwords (word,number)
+ private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
+ // number of subwords last output by concat.
+ private int lastConcatCount = 0;
+
+ // used for catenate all
+ private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation();
+
+ // used for accumulating position increment gaps
+ private int accumPosInc = 0;
+
+ private char savedBuffer[] = new char[1024];
+ private int savedStartOffset;
+ private int savedEndOffset;
+ private String savedType;
+ private boolean hasSavedState = false;
+ // if length by start + end offsets doesn't match the term text then assume
+ // this is a synonym and don't adjust the offsets.
+ private boolean hasIllegalOffsets = false;
+
+ // for a run of the same subword type within a word, have we output anything?
+ private boolean hasOutputToken = false;
+ // when preserve original is on, have we output any token following it?
+ // this token must have posInc=0!
+ private boolean hasOutputFollowingOriginal = false;
/**
- *
* @param in Token stream to be filtered.
* @param charTypeTable
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
@@ -175,19 +167,27 @@ final class WordDelimiterFilter extends
* @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords) {
+ public WordDelimiterFilter(TokenStream in,
+ byte[] charTypeTable,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal,
+ int splitOnNumerics,
+ int stemEnglishPossessive,
+ CharArraySet protWords) {
super(in);
- this.generateWordParts = generateWordParts;
- this.generateNumberParts = generateNumberParts;
- this.catenateWords = catenateWords;
- this.catenateNumbers = catenateNumbers;
- this.catenateAll = catenateAll;
- this.splitOnCaseChange = splitOnCaseChange;
- this.preserveOriginal = preserveOriginal;
- this.charTypeTable = charTypeTable;
- this.splitOnNumerics = splitOnNumerics;
- this.stemEnglishPossessive = stemEnglishPossessive;
+ this.generateWordParts = generateWordParts != 0;
+ this.generateNumberParts = generateNumberParts != 0;
+ this.catenateWords = catenateWords != 0;
+ this.catenateNumbers = catenateNumbers != 0;
+ this.catenateAll = catenateAll != 0;
+ this.preserveOriginal = preserveOriginal != 0;
this.protWords = protWords;
+ this.iterator = new WordDelimiterIterator(charTypeTable, splitOnCaseChange != 0, splitOnNumerics != 0, stemEnglishPossessive != 0);
}
/**
@@ -198,8 +198,18 @@ final class WordDelimiterFilter extends
* instead.
*/
@Deprecated
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
- this(in,charTypeTable,generateWordParts,generateNumberParts,catenateWords,catenateNumbers,catenateAll,splitOnCaseChange,preserveOriginal, 1, 1, null);
+ public WordDelimiterFilter(TokenStream in,
+ byte[] charTypeTable,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal,
+ int splitOnNumerics,
+ CharArraySet protWords) {
+ this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, 1, 1, null);
}
/**
@@ -210,8 +220,16 @@ final class WordDelimiterFilter extends
* instead.
*/
@Deprecated
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
- this(in,charTypeTable,generateWordParts,generateNumberParts,catenateWords,catenateNumbers,catenateAll,splitOnCaseChange,preserveOriginal, 1, null);
+ public WordDelimiterFilter(TokenStream in,
+ byte[] charTypeTable,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal) {
+ this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, 1, null);
}
/**
@@ -227,8 +245,18 @@ final class WordDelimiterFilter extends
* @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords) {
- this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protWords);
+ public WordDelimiterFilter(TokenStream in,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal,
+ int splitOnNumerics,
+ int stemEnglishPossessive,
+ CharArraySet protWords) {
+ this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protWords);
}
/**
@@ -237,8 +265,17 @@ final class WordDelimiterFilter extends
* instead.
*/
@Deprecated
- public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
- this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, 1, protWords);
+ public WordDelimiterFilter(TokenStream in,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal,
+ int splitOnNumerics,
+ CharArraySet protWords) {
+ this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, 1, protWords);
}
/** * Compatibility constructor
@@ -248,8 +285,15 @@ final class WordDelimiterFilter extends
* instead.
*/
@Deprecated
- public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
- this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal);
+ public WordDelimiterFilter(TokenStream in,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal) {
+ this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal);
}
/**
* Compatibility constructor
@@ -259,7 +303,13 @@ final class WordDelimiterFilter extends
* instead.
*/
@Deprecated
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
+ public WordDelimiterFilter(TokenStream in,
+ byte[] charTypeTable,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll) {
this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
}
/**
@@ -270,407 +320,365 @@ final class WordDelimiterFilter extends
* instead.
*/
@Deprecated
- public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
- this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
+ public WordDelimiterFilter(TokenStream in,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll) {
+ this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
}
+
+ public boolean incrementToken() throws IOException {
+ while (true) {
+ if (!hasSavedState) {
+ // process a new input word
+ if (!input.incrementToken()) {
+ return false;
+ }
+ int termLength = termAtttribute.termLength();
+ char[] termBuffer = termAtttribute.termBuffer();
+
+ accumPosInc += posIncAttribute.getPositionIncrement();
+
+ iterator.setText(termBuffer, termLength);
+ iterator.next();
+
+ // word of no delimiters, or protected word: just return it
+ if ((iterator.current == 0 && iterator.end == termLength) ||
+ (protWords != null && protWords.contains(termBuffer, 0, termLength))) {
+ posIncAttribute.setPositionIncrement(accumPosInc);
+ accumPosInc = 0;
+ return true;
+ }
+
+ // word of simply delimiters
+ if (iterator.end == WordDelimiterIterator.DONE && !preserveOriginal) {
+ // if the posInc is 1, simply ignore it in the accumulation
+ if (posIncAttribute.getPositionIncrement() == 1) {
+ accumPosInc--;
+ }
+ continue;
+ }
- int charType(int ch) {
- if (ch<charTypeTable.length) {
- return charTypeTable[ch];
- }
- switch (Character.getType(ch)) {
- case Character.UPPERCASE_LETTER: return UPPER;
- case Character.LOWERCASE_LETTER: return LOWER;
-
- case Character.TITLECASE_LETTER:
- case Character.MODIFIER_LETTER:
- case Character.OTHER_LETTER:
- case Character.NON_SPACING_MARK:
- case Character.ENCLOSING_MARK: // depends what it encloses?
- case Character.COMBINING_SPACING_MARK:
- return ALPHA;
-
- case Character.DECIMAL_DIGIT_NUMBER:
- case Character.LETTER_NUMBER:
- case Character.OTHER_NUMBER:
- return DIGIT;
-
- // case Character.SPACE_SEPARATOR:
- // case Character.LINE_SEPARATOR:
- // case Character.PARAGRAPH_SEPARATOR:
- // case Character.CONTROL:
- // case Character.FORMAT:
- // case Character.PRIVATE_USE:
-
- case Character.SURROGATE: // prevent splitting
- return ALPHA|DIGIT;
-
- // case Character.DASH_PUNCTUATION:
- // case Character.START_PUNCTUATION:
- // case Character.END_PUNCTUATION:
- // case Character.CONNECTOR_PUNCTUATION:
- // case Character.OTHER_PUNCTUATION:
- // case Character.MATH_SYMBOL:
- // case Character.CURRENCY_SYMBOL:
- // case Character.MODIFIER_SYMBOL:
- // case Character.OTHER_SYMBOL:
- // case Character.INITIAL_QUOTE_PUNCTUATION:
- // case Character.FINAL_QUOTE_PUNCTUATION:
+ saveState();
- default: return SUBWORD_DELIM;
+ hasOutputToken = false;
+ hasOutputFollowingOriginal = !preserveOriginal;
+ lastConcatCount = 0;
+
+ if (preserveOriginal) {
+ posIncAttribute.setPositionIncrement(accumPosInc);
+ accumPosInc = 0;
+ return true;
+ }
+ }
+
+ // at the end of the string, output any concatenations
+ if (iterator.end == WordDelimiterIterator.DONE) {
+ if (!concat.isEmpty()) {
+ if (flushConcatenation(concat)) {
+ return true;
+ }
+ }
+
+ if (!concatAll.isEmpty()) {
+ // only if we haven't output this same combo above!
+ if (concatAll.subwordCount > lastConcatCount) {
+ concatAll.writeAndClear();
+ return true;
+ }
+ concatAll.clear();
+ }
+
+ // no saved concatenations, on to the next input word
+ hasSavedState = false;
+ continue;
+ }
+
+ // word surrounded by delimiters: always output
+ if (iterator.isSingleWord()) {
+ generatePart(true);
+ iterator.next();
+ return true;
+ }
+
+ int wordType = iterator.type();
+
+ // do we already have queued up incompatible concatenations?
+ if (!concat.isEmpty() && (concat.type & wordType) == 0) {
+ if (flushConcatenation(concat)) {
+ hasOutputToken = false;
+ return true;
+ }
+ hasOutputToken = false;
+ }
+
+ // add subwords depending upon options
+ if (shouldConcatenate(wordType)) {
+ if (concat.isEmpty()) {
+ concat.type = wordType;
+ }
+ concatenate(concat);
+ }
+
+ // add all subwords (catenateAll)
+ if (catenateAll) {
+ concatenate(concatAll);
+ }
+
+ // if we should output the word or number part
+ if (shouldGenerateParts(wordType)) {
+ generatePart(false);
+ iterator.next();
+ return true;
+ }
+
+ iterator.next();
}
}
- // use the type of the first char as the type
- // of the token.
- private int tokType(Token t) {
- return charType(t.termBuffer()[0]);
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ hasSavedState = false;
+ concat.clear();
+ concatAll.clear();
+ accumPosInc = 0;
}
- // There isn't really an efficient queue class, so we will
- // just use an array for now.
- private ArrayList<Token> queue = new ArrayList<Token>(4);
- private int queuePos=0;
-
- // temporary working queue
- private ArrayList<Token> tlist = new ArrayList<Token>(4);
+ // ================================================= Helper Methods ================================================
+ /**
+ * Saves the existing attribute states
+ */
+ private void saveState() {
+ // otherwise, we have delimiters, save state
+ savedStartOffset = offsetAttribute.startOffset();
+ savedEndOffset = offsetAttribute.endOffset();
+ // if length by start + end offsets doesn't match the term text then assume this is a synonym and don't adjust the offsets.
+ hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAtttribute.termLength());
+ savedType = typeAttribute.type();
- private Token newTok(Token orig, int start, int end) {
- int startOff = orig.startOffset();
- int endOff = orig.endOffset();
- // if length by start + end offsets doesn't match the term text then assume
- // this is a synonym and don't adjust the offsets.
- if (orig.termLength() == endOff-startOff) {
- endOff = startOff + end;
- startOff += start;
+ if (savedBuffer.length < termAtttribute.termLength()) {
+ savedBuffer = new char[ArrayUtil.getNextSize(termAtttribute.termLength())];
}
- return (Token)orig.clone(orig.termBuffer(), start, (end - start), startOff, endOff);
- }
-
-
- public final Token next(Token in) throws IOException {
+ System.arraycopy(termAtttribute.termBuffer(), 0, savedBuffer, 0, termAtttribute.termLength());
+ iterator.text = savedBuffer;
- // check the queue first
- if (queuePos<queue.size()) {
- return queue.get(queuePos++);
- }
+ hasSavedState = true;
+ }
- // reset the queue if it had been previously used
- if (queuePos!=0) {
- queuePos=0;
- queue.clear();
+ /**
+ * Flushes the given WordDelimiterConcatenation by either writing its concat and then clearing, or just clearing.
+ *
+ * @param concatenation WordDelimiterConcatenation that will be flushed
+ * @return {@code true} if the concatenation was written before it was cleared, {@code} false otherwise
+ */
+ private boolean flushConcatenation(WordDelimiterConcatenation concatenation) {
+ lastConcatCount = concatenation.subwordCount;
+ if (concatenation.subwordCount != 1 || !shouldGenerateParts(concatenation.type)) {
+ concatenation.writeAndClear();
+ return true;
}
+ concatenation.clear();
+ return false;
+ }
+ /**
+ * Determines whether to concatenate a word or number if the current word is the given type
+ *
+ * @param wordType Type of the current word used to determine if it should be concatenated
+ * @return {@code true} if concatenation should occur, {@code false} otherwise
+ */
+ private boolean shouldConcatenate(int wordType) {
+ return (catenateWords && isAlpha(wordType)) || (catenateNumbers && isDigit(wordType));
+ }
- // optimize for the common case: assume there will be
- // no subwords (just a simple word)
- //
- // Would it actually be faster to check for the common form
- // of isLetter() isLower()*, and then backtrack if it doesn't match?
-
- int origPosIncrement = 0;
- Token t;
- while(true) {
- // t is either returned, or a new token is made from it, so it should
- // be safe to use the next(Token) method.
- t = input.next(in);
- if (t == null) return null;
-
- char [] termBuffer = t.termBuffer();
- int len = t.termLength();
- int start=0;
- if (len ==0) continue;
-
- int posInc = t.getPositionIncrement();
- origPosIncrement += posInc;
-
- //skip protected tokens
- if (protWords != null && protWords.contains(termBuffer, 0, len)) {
- t.setPositionIncrement(origPosIncrement);
- return t;
- }
-
- // Avoid calling charType more than once for each char (basically
- // avoid any backtracking).
- // makes code slightly more difficult, but faster.
- int ch=termBuffer[start];
- int type=charType(ch);
-
- int numWords=0;
-
- while (start< len) {
- // first eat delimiters at the start of this subword
- while ((type & SUBWORD_DELIM)!=0 && ++start< len) {
- ch=termBuffer[start];
- type=charType(ch);
- }
-
- int pos=start;
-
- // save the type of the first char of the subword
- // as a way to tell what type of subword token this is (number, word, etc)
- int firstType=type;
- int lastType=type; // type of the previously read char
-
-
- while (pos< len) {
-
- if ((type & lastType)==0) { // no overlap in character type
- // check and remove "'s" from the end of a token.
- // the pattern to check for is
- // ALPHA "'" ("s"|"S") (SUBWORD_DELIM | END)
- if (stemEnglishPossessive != 0 && ((lastType & ALPHA)!=0)) {
- if (ch=='\'' && pos+1< len
- && (termBuffer[pos+1]=='s' || termBuffer[pos+1]=='S'))
- {
- int subWordEnd=pos;
- if (pos+2>= len) {
- // end of string detected after "'s"
- pos+=2;
- } else {
- // make sure that a delimiter follows "'s"
- int ch2 = termBuffer[pos+2];
- int type2 = charType(ch2);
- if ((type2 & SUBWORD_DELIM)!=0) {
- // if delimiter, move position pointer
- // to it (skipping over "'s"
- ch=ch2;
- type=type2;
- pos+=2;
- }
- }
-
- queue.add(newTok(t,start,subWordEnd));
- if ((firstType & ALPHA)!=0) numWords++;
- break;
- }
- }
-
- // For case changes, only split on a transition from
- // lower to upper case, not vice-versa.
- // That will correctly handle the
- // case of a word starting with a capital (won't split).
- // It will also handle pluralization of
- // an uppercase word such as FOOs (won't split).
-
- if (splitOnCaseChange == 0 &&
- (lastType & ALPHA) != 0 && (type & ALPHA) != 0) {
- // ALPHA->ALPHA: always ignore if case isn't considered.
- } else if ((lastType & UPPER)!=0 && (type & ALPHA)!=0) {
- // UPPER->letter: Don't split
- } else if(splitOnNumerics == 0 &&
- ( ((lastType & ALPHA) != 0 && (type & DIGIT) != 0) || ((lastType & DIGIT) != 0 && (type & ALPHA) != 0) ) ) {
- // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
- } else {
- // NOTE: this code currently assumes that only one flag
- // is set for each character now, so we don't have
- // to explicitly check for all the classes of transitions
- // listed below.
-
- // LOWER->UPPER
- // ALPHA->NUMERIC
- // NUMERIC->ALPHA
- // *->DELIMITER
- queue.add(newTok(t,start,pos));
- if ((firstType & ALPHA)!=0) numWords++;
- break;
- }
- }
-
- if (++pos >= len) {
- if (start==0) {
- // the subword is the whole original token, so
- // return it unchanged.
- t.setPositionIncrement(origPosIncrement);
- return t;
- }
-
- // optimization... if this is the only token,
- // return it immediately.
- if (queue.size()==0 && preserveOriginal == 0) {
- // just adjust the text w/o changing the rest
- // of the original token
- t.setTermBuffer(termBuffer, start, len-start);
- t.setStartOffset(t.startOffset() + start);
- t.setPositionIncrement(origPosIncrement);
- return t;
- }
-
- Token newtok = newTok(t,start,pos);
-
- queue.add(newtok);
- if ((firstType & ALPHA)!=0) numWords++;
- break;
- }
-
- lastType = type;
- ch = termBuffer[pos];
- type = charType(ch);
- }
-
- // start of the next subword is the current position
- start=pos;
- }
-
- // System.out.println("##########TOKEN=" + s + " ######### WORD DELIMITER QUEUE=" + str(queue));
-
- final int numtok = queue.size();
-
- // We reached the end of the current token.
- // If the queue is empty, we should continue by reading
- // the next token
- if (numtok==0) {
- // the token might have been all delimiters, in which
- // case return it if we're meant to preserve it
- if (preserveOriginal != 0) {
- return t;
- }
-
- // if this token had a "normal" gap of 1, remove it.
- if (posInc==1) origPosIncrement-=1;
- continue;
- }
-
- // if number of tokens is 1, there are no catenations to be done.
- if (numtok==1) {
- break;
- }
-
-
- final int numNumbers = numtok - numWords;
+ /**
+ * Determines whether a word/number part should be generated for a word of the given type
+ *
+ * @param wordType Type of the word used to determine if a word/number part should be generated
+ * @return {@code true} if a word/number part should be generated, {@code false} otherwise
+ */
+ private boolean shouldGenerateParts(int wordType) {
+ return (generateWordParts && isAlpha(wordType)) || (generateNumberParts && isDigit(wordType));
+ }
- // check conditions under which the current token
- // queue may be used as-is (no catenations needed)
- if (catenateAll==0 // no "everything" to catenate
- && (catenateWords==0 || numWords<=1) // no words to catenate
- && (catenateNumbers==0 || numNumbers<=1) // no numbers to catenate
- && (generateWordParts!=0 || numWords==0) // word generation is on
- && (generateNumberParts!=0 || numNumbers==0)) // number generation is on
- {
- break;
- }
+ /**
+ * Concatenates the saved buffer to the given WordDelimiterConcatenation
+ *
+ * @param concatenation WordDelimiterConcatenation to concatenate the buffer to
+ */
+ private void concatenate(WordDelimiterConcatenation concatenation) {
+ if (concatenation.isEmpty()) {
+ concatenation.startOffset = savedStartOffset + iterator.current;
+ }
+ concatenation.append(savedBuffer, iterator.current, iterator.end - iterator.current);
+ concatenation.endOffset = savedStartOffset + iterator.end;
+ }
+ /**
+ * Generates a word/number part, updating the appropriate attributes
+ *
+ * @param isSingleWord {@code true} if the generation is occurring from a single word, {@code false} otherwise
+ */
+ private void generatePart(boolean isSingleWord) {
+ clearAttributes();
+ termAtttribute.setTermBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
- // swap queue and the temporary working list, then clear the
- // queue in preparation for adding all combinations back to it.
- ArrayList<Token> tmp=tlist;
- tlist=queue;
- queue=tmp;
- queue.clear();
-
- if (numWords==0) {
- // all numbers
- addCombos(tlist,0,numtok,generateNumberParts!=0,catenateNumbers!=0 || catenateAll!=0, 1);
- if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
- } else if (numNumbers==0) {
- // all words
- addCombos(tlist,0,numtok,generateWordParts!=0,catenateWords!=0 || catenateAll!=0, 1);
- if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
- } else if (generateNumberParts==0 && generateWordParts==0 && catenateNumbers==0 && catenateWords==0) {
- // catenate all *only*
- // OPT:could be optimized to add to current queue...
- addCombos(tlist,0,numtok,false,catenateAll!=0, 1);
- if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
- }
+ int startOffSet = (isSingleWord || !hasIllegalOffsets) ? savedStartOffset + iterator.current : savedStartOffset;
+ int endOffSet = (hasIllegalOffsets) ? savedEndOffset : savedStartOffset + iterator.end;
- //
- // Find all adjacent tokens of the same type.
- //
- Token tok = tlist.get(0);
- boolean isWord = (tokType(tok) & ALPHA) != 0;
- boolean wasWord=isWord;
-
- for(int i=0; i<numtok;) {
- int j;
- for (j=i+1; j<numtok; j++) {
- wasWord=isWord;
- tok = tlist.get(j);
- isWord = (tokType(tok) & ALPHA) != 0;
- if (isWord != wasWord) break;
- }
- if (wasWord) {
- addCombos(tlist,i,j,generateWordParts!=0,catenateWords!=0,1);
- } else {
- addCombos(tlist,i,j,generateNumberParts!=0,catenateNumbers!=0,1);
- }
- i=j;
- }
+ offsetAttribute.setOffset(startOffSet, endOffSet);
+ posIncAttribute.setPositionIncrement(position(false));
+ typeAttribute.setType(savedType);
+ }
- // take care catenating all subwords
- if (catenateAll!=0) {
- addCombos(tlist,0,numtok,false,true,0);
- }
+ /**
+ * Get the position increment gap for a subword or concatenation
+ *
+ * @param inject true if this token wants to be injected
+ * @return position increment gap
+ */
+ private int position(boolean inject) {
+ int posInc = accumPosInc;
- // NOTE: in certain cases, queue may be empty (for instance, if catenate
- // and generate are both set to false). Only exit the loop if the queue
- // is not empty.
- if (queue.size() > 0 || preserveOriginal!=0) break;
+ if (hasOutputToken) {
+ accumPosInc = 0;
+ return inject ? 0 : Math.max(1, posInc);
}
- // System.out.println("##########AFTER COMBINATIONS:"+ str(queue));
-
- if (preserveOriginal != 0) {
- queuePos = 0;
- if (queue.size() > 0) {
- // overlap first token with the original
- queue.get(0).setPositionIncrement(0);
- }
- return t; // return the original
- } else {
- queuePos=1;
- Token tok = queue.get(0);
- tok.setPositionIncrement(origPosIncrement);
- return tok;
+ hasOutputToken = true;
+
+ if (!hasOutputFollowingOriginal) {
+ // the first token following the original is 0 regardless
+ hasOutputFollowingOriginal = true;
+ return 0;
}
+ // clear the accumulated position increment
+ accumPosInc = 0;
+ return Math.max(1, posInc);
}
+ /**
+ * Checks if the given word type includes {@link #ALPHA}
+ *
+ * @param type Word type to check
+ * @return {@code true} if the type contains ALPHA, {@code false} otherwise
+ */
+ static boolean isAlpha(int type) {
+ return (type & ALPHA) != 0;
+ }
- // index "a","b","c" as pos0="a", pos1="b", pos2="c", pos2="abc"
- private void addCombos(List<Token> lst, int start, int end, boolean generateSubwords, boolean catenateSubwords, int posOffset) {
- if (end-start==1) {
- // always generate a word alone, even if generateSubwords=0 because
- // the catenation of all the subwords *is* the subword.
- queue.add(lst.get(start));
- return;
- }
-
- StringBuilder sb = null;
- if (catenateSubwords) sb=new StringBuilder();
- Token firstTok=null;
- Token tok=null;
- for (int i=start; i<end; i++) {
- tok = lst.get(i);
- if (catenateSubwords) {
- if (i==start) firstTok=tok;
- sb.append(tok.termBuffer(), 0, tok.termLength());
- }
- if (generateSubwords) {
- queue.add(tok);
- }
- }
+ /**
+ * Checks if the given word type includes {@link #DIGIT}
+ *
+ * @param type Word type to check
+ * @return {@code true} if the type contains DIGIT, {@code false} otherwise
+ */
+ static boolean isDigit(int type) {
+ return (type & DIGIT) != 0;
+ }
- if (catenateSubwords) {
- Token concatTok = new Token(sb.toString(),
- firstTok.startOffset(),
- tok.endOffset(),
- firstTok.type());
- // if we indexed some other tokens, then overlap concatTok with the last.
- // Otherwise, use the value passed in as the position offset.
- concatTok.setPositionIncrement(generateSubwords==true ? 0 : posOffset);
- queue.add(concatTok);
- }
+ /**
+ * Checks if the given word type includes {@link #SUBWORD_DELIM}
+ *
+ * @param type Word type to check
+ * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
+ */
+ static boolean isSubwordDelim(int type) {
+ return (type & SUBWORD_DELIM) != 0;
}
- @Override
- public void reset() throws IOException {
- input.reset();
- queuePos=0;
- queue.clear();
+ /**
+ * Checks if the given word type includes {@link #UPPER}
+ *
+ * @param type Word type to check
+ * @return {@code true} if the type contains UPPER, {@code false} otherwise
+ */
+ static boolean isUpper(int type) {
+ return (type & UPPER) != 0;
}
+ // ================================================= Inner Classes =================================================
+
+ /**
+ * A WDF concatenated 'run'
+ */
+ final class WordDelimiterConcatenation {
+ final StringBuilder buffer = new StringBuilder();
+ int startOffset;
+ int endOffset;
+ int type;
+ int subwordCount;
+
+ /**
+     * Appends the given text of the given length, to the concatenation at the given offset
+ *
+ * @param text Text to append
+     * @param offset Offset in the concatenation to add the text
+ * @param length Length of the text to append
+ */
+ void append(char text[], int offset, int length) {
+ buffer.append(text, offset, length);
+ subwordCount++;
+ }
+
+ /**
+ * Writes the concatenation to the attributes
+ */
+ void write() {
+ clearAttributes();
+ if (termAtttribute.termLength() < buffer.length()) {
+ termAtttribute.resizeTermBuffer(buffer.length());
+ }
+ char termbuffer[] = termAtttribute.termBuffer();
+
+ buffer.getChars(0, buffer.length(), termbuffer, 0);
+ termAtttribute.setTermLength(buffer.length());
+
+ if (hasIllegalOffsets) {
+ offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
+ }
+ else {
+ offsetAttribute.setOffset(startOffset, endOffset);
+ }
+ posIncAttribute.setPositionIncrement(position(true));
+ typeAttribute.setType(savedType);
+ accumPosInc = 0;
+ }
+
+ /**
+ * Determines if the concatenation is empty
+ *
+ * @return {@code true} if the concatenation is empty, {@code false} otherwise
+ */
+ boolean isEmpty() {
+ return buffer.length() == 0;
+ }
+
+ /**
+ * Clears the concatenation and resets its state
+ */
+ void clear() {
+ buffer.setLength(0);
+ startOffset = endOffset = type = subwordCount = 0;
+ }
+
+ /**
+     * Convenience method for the common scenario of having to write the concatenation and then clearing its state
+ */
+ void writeAndClear() {
+ write();
+ clear();
+ }
+ }
// questions:
// negative numbers? -42 indexed as just 42?
// dollar sign? $42
Added: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java?rev=922957&view=auto
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java (added)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java Sun Mar 14 20:58:32 2010
@@ -0,0 +1,315 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.solr.analysis.WordDelimiterFilter.*;
+
+/**
+ * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
+ */
+final class WordDelimiterIterator {
+
+ /** Indicates the end of iteration */
+ public static final int DONE = -1;
+
+ public static final byte[] DEFAULT_WORD_DELIM_TABLE;
+
+ char text[];
+ int length;
+
+ /** start position of text, excluding leading delimiters */
+ int startBounds;
+ /** end position of text, excluding trailing delimiters */
+ int endBounds;
+
+ /** Beginning of subword */
+ int current;
+ /** End of subword */
+ int end;
+
+ /* does this string end with a possessive such as 's */
+ private boolean hasFinalPossessive = false;
+
+ /**
+ * If false, causes case changes to be ignored (subwords will only be generated
+ * given SUBWORD_DELIM tokens). (Defaults to true)
+ */
+ final boolean splitOnCaseChange;
+
+ /**
+ * If false, causes numeric changes to be ignored (subwords will only be generated
+ * given SUBWORD_DELIM tokens). (Defaults to true)
+ */
+ final boolean splitOnNumerics;
+
+ /**
+ * If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
+ * <p/>
+ * "O'Neil's" => "O", "Neil"
+ */
+ final boolean stemEnglishPossessive;
+
+ private final byte[] charTypeTable;
+
+ /** if true, need to skip over a possessive found in the last call to next() */
+ private boolean skipPossessive = false;
+
+ // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
+ // done if separated by these chars?) "," would be an obvious candidate...
+ static {
+ byte[] tab = new byte[256];
+ for (int i = 0; i < 256; i++) {
+ byte code = 0;
+ if (Character.isLowerCase(i)) {
+ code |= LOWER;
+ }
+ else if (Character.isUpperCase(i)) {
+ code |= UPPER;
+ }
+ else if (Character.isDigit(i)) {
+ code |= DIGIT;
+ }
+ if (code == 0) {
+ code = SUBWORD_DELIM;
+ }
+ tab[i] = code;
+ }
+ DEFAULT_WORD_DELIM_TABLE = tab;
+ }
+
+ /**
+ * Create a new WordDelimiterIterator operating with the supplied rules.
+ *
+ * @param charTypeTable table containing character types
+ * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
+ * @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se"
+ * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
+ */
+ WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
+ this.charTypeTable = charTypeTable;
+ this.splitOnCaseChange = splitOnCaseChange;
+ this.splitOnNumerics = splitOnNumerics;
+ this.stemEnglishPossessive = stemEnglishPossessive;
+ }
+
+ /**
+ * Advance to the next subword in the string.
+ *
+ * @return index of the next subword, or {@link #DONE} if all subwords have been returned
+ */
+ int next() {
+ current = end;
+ if (current == DONE) {
+ return DONE;
+ }
+
+ if (skipPossessive) {
+ current += 2;
+ skipPossessive = false;
+ }
+
+ int lastType = 0;
+
+ while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
+ current++;
+ }
+
+ if (current >= endBounds) {
+ return end = DONE;
+ }
+
+ for (end = current + 1; end < endBounds; end++) {
+ int type = charType(text[end]);
+ if (isBreak(lastType, type)) {
+ break;
+ }
+ lastType = type;
+ }
+
+ if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
+ skipPossessive = true;
+ }
+
+ return end;
+ }
+
+
+ /**
+ * Return the type of the current subword.
+ * This currently uses the type of the first character in the subword.
+ *
+ * @return type of the current word
+ */
+ int type() {
+ if (end == DONE) {
+ return 0;
+ }
+
+ int type = charType(text[current]);
+ switch (type) {
+ // return ALPHA word type for both lower and upper
+ case LOWER:
+ case UPPER:
+ return ALPHA;
+ default:
+ return type;
+ }
+ }
+
+ /**
+ * Reset the text to a new value, and reset all state
+ *
+ * @param text New text
+ * @param length length of the text
+ */
+ void setText(char text[], int length) {
+ this.text = text;
+ this.length = this.endBounds = length;
+ current = startBounds = end = 0;
+ skipPossessive = hasFinalPossessive = false;
+ setBounds();
+ }
+
+ // ================================================= Helper Methods ================================================
+
+ /**
+ * Determines whether the transition from lastType to type indicates a break
+ *
+ * @param lastType Last subword type
+ * @param type Current subword type
+ * @return {@code true} if the transition indicates a break, {@code false} otherwise
+ */
+ private boolean isBreak(int lastType, int type) {
+ if ((type & lastType) != 0) {
+ return false;
+ }
+
+ if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) {
+ // ALPHA->ALPHA: always ignore if case isn't considered.
+ return false;
+ } else if (isUpper(lastType) && isAlpha(type)) {
+ // UPPER->letter: Don't split
+ return false;
+ } else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) {
+ // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * Determines if the current word contains only one subword. Note, it could be potentially surrounded by delimiters
+ *
+ * @return {@code true} if the current word contains only one subword, {@code false} otherwise
+ */
+ boolean isSingleWord() {
+ if (hasFinalPossessive) {
+ return current == startBounds && end == endBounds - 2;
+ }
+ else {
+ return current == startBounds && end == endBounds;
+ }
+ }
+
+ /**
+ * Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
+ * it yet, simply note it.
+ */
+ private void setBounds() {
+ while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) {
+ startBounds++;
+ }
+
+ while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) {
+ endBounds--;
+ }
+ if (endsWithPossessive(endBounds)) {
+ hasFinalPossessive = true;
+ }
+ current = startBounds;
+ }
+
+ /**
+ * Determines if the text at the given position indicates an English possessive which should be removed
+ *
+ * @param pos Position in the text to check if it indicates an English possessive
+   * @return {@code true} if the text at the position indicates an English possessive, {@code false} otherwise
+ */
+ private boolean endsWithPossessive(int pos) {
+ return (stemEnglishPossessive &&
+ pos > 2 &&
+ text[pos - 2] == '\'' &&
+ (text[pos - 1] == 's' || text[pos - 1] == 'S') &&
+ isAlpha(charType(text[pos - 3])) &&
+ (pos == endBounds || isSubwordDelim(charType(text[pos]))));
+ }
+
+ /**
+ * Determines the type of the given character
+ *
+ * @param ch Character whose type is to be determined
+ * @return Type of the character
+ */
+ private int charType(int ch) {
+ if (ch < charTypeTable.length) {
+ return charTypeTable[ch];
+ }
+ switch (Character.getType(ch)) {
+ case Character.UPPERCASE_LETTER: return UPPER;
+ case Character.LOWERCASE_LETTER: return LOWER;
+
+ case Character.TITLECASE_LETTER:
+ case Character.MODIFIER_LETTER:
+ case Character.OTHER_LETTER:
+ case Character.NON_SPACING_MARK:
+ case Character.ENCLOSING_MARK: // depends what it encloses?
+ case Character.COMBINING_SPACING_MARK:
+ return ALPHA;
+
+ case Character.DECIMAL_DIGIT_NUMBER:
+ case Character.LETTER_NUMBER:
+ case Character.OTHER_NUMBER:
+ return DIGIT;
+
+ // case Character.SPACE_SEPARATOR:
+ // case Character.LINE_SEPARATOR:
+ // case Character.PARAGRAPH_SEPARATOR:
+ // case Character.CONTROL:
+ // case Character.FORMAT:
+ // case Character.PRIVATE_USE:
+
+ case Character.SURROGATE: // prevent splitting
+ return ALPHA|DIGIT;
+
+ // case Character.DASH_PUNCTUATION:
+ // case Character.START_PUNCTUATION:
+ // case Character.END_PUNCTUATION:
+ // case Character.CONNECTOR_PUNCTUATION:
+ // case Character.OTHER_PUNCTUATION:
+ // case Character.MATH_SYMBOL:
+ // case Character.CURRENCY_SYMBOL:
+ // case Character.MODIFIER_SYMBOL:
+ // case Character.OTHER_SYMBOL:
+ // case Character.INITIAL_QUOTE_PUNCTUATION:
+ // case Character.FINAL_QUOTE_PUNCTUATION:
+
+ default: return SUBWORD_DELIM;
+ }
+ }
+}
\ No newline at end of file
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java Sun Mar 14 20:58:32 2010
@@ -20,6 +20,12 @@ import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
@@ -132,15 +138,20 @@ public class AnalysisRequestHandler exte
static NamedList<NamedList<Object>> getTokens(TokenStream tstream) throws IOException {
// outer is namedList since order of tokens is important
NamedList<NamedList<Object>> tokens = new NamedList<NamedList<Object>>();
- Token t = null;
- while (((t = tstream.next()) != null)) {
+ // TODO: support custom attributes
+ TermAttribute termAtt = (TermAttribute) tstream.addAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) tstream.addAttribute(OffsetAttribute.class);
+ TypeAttribute typeAtt = (TypeAttribute) tstream.addAttribute(TypeAttribute.class);
+ PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) tstream.addAttribute(PositionIncrementAttribute.class);
+
+ while (tstream.incrementToken()) {
NamedList<Object> token = new SimpleOrderedMap<Object>();
tokens.add("token", token);
- token.add("value", new String(t.termBuffer(), 0, t.termLength()));
- token.add("start", t.startOffset());
- token.add("end", t.endOffset());
- token.add("posInc", t.getPositionIncrement());
- token.add("type", t.type());
+ token.add("value", new String(termAtt.termBuffer(), 0, termAtt.termLength()));
+ token.add("start", offsetAtt.startOffset());
+ token.add("end", offsetAtt.endOffset());
+ token.add("posInc", posIncAtt.getPositionIncrement());
+ token.add("type", typeAtt.type());
//TODO: handle payloads
}
return tokens;
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java Sun Mar 14 20:58:32 2010
@@ -22,6 +22,12 @@ import org.apache.lucene.analysis.CharRe
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.solr.analysis.CharFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
@@ -141,11 +147,30 @@ public abstract class AnalysisRequestHan
*/
private List<Token> analyzeTokenStream(TokenStream tokenStream) {
List<Token> tokens = new ArrayList<Token>();
- Token reusableToken = new Token();
- Token token = null;
+ // TODO change this API to support custom attributes
+ TermAttribute termAtt = (TermAttribute)
+ tokenStream.addAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute)
+ tokenStream.addAttribute(OffsetAttribute.class);
+ TypeAttribute typeAtt = (TypeAttribute)
+ tokenStream.addAttribute(TypeAttribute.class);
+ FlagsAttribute flagsAtt = (FlagsAttribute)
+ tokenStream.addAttribute(FlagsAttribute.class);
+ PayloadAttribute payloadAtt = (PayloadAttribute)
+ tokenStream.addAttribute(PayloadAttribute.class);
+ PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute)
+ tokenStream.addAttribute(PositionIncrementAttribute.class);
+
try {
- while ((token = tokenStream.next(reusableToken)) != null) {
+ while (tokenStream.incrementToken()) {
+ Token token = new Token();
+ token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
+ token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
+ token.setType(typeAtt.type());
+ token.setFlags(flagsAtt.getFlags());
+ token.setPayload(payloadAtt.getPayload());
+ token.setPositionIncrement(posIncAtt.getPositionIncrement());
tokens.add((Token) token.clone());
}
} catch (IOException ioe) {
@@ -229,16 +254,30 @@ public abstract class AnalysisRequestHan
/**
* TokenStream that iterates over a list of pre-existing Tokens
*/
+ // TODO refactor to support custom attributes
protected static class ListBasedTokenStream extends TokenStream {
+ private final List<Token> tokens;
+ private Iterator<Token> tokenIterator;
- private final Iterator<Token> tokenIterator;
-
+ private final TermAttribute termAtt = (TermAttribute)
+ addAttribute(TermAttribute.class);
+ private final OffsetAttribute offsetAtt = (OffsetAttribute)
+ addAttribute(OffsetAttribute.class);
+ private final TypeAttribute typeAtt = (TypeAttribute)
+ addAttribute(TypeAttribute.class);
+ private final FlagsAttribute flagsAtt = (FlagsAttribute)
+ addAttribute(FlagsAttribute.class);
+ private final PayloadAttribute payloadAtt = (PayloadAttribute)
+ addAttribute(PayloadAttribute.class);
+ private final PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute)
+ addAttribute(PositionIncrementAttribute.class);
/**
* Creates a new ListBasedTokenStream which uses the given tokens as its token source.
*
* @param tokens Source of tokens to be used
*/
ListBasedTokenStream(List<Token> tokens) {
+ this.tokens = tokens;
tokenIterator = tokens.iterator();
}
@@ -246,8 +285,25 @@ public abstract class AnalysisRequestHan
* {@inheritDoc}
*/
@Override
- public Token next(Token token) throws IOException {
- return (tokenIterator.hasNext()) ? tokenIterator.next() : null;
+ public boolean incrementToken() throws IOException {
+ if (tokenIterator.hasNext()) {
+ Token next = tokenIterator.next();
+ termAtt.setTermBuffer(next.termBuffer(), 0, next.termLength());
+ typeAtt.setType(next.type());
+ offsetAtt.setOffset(next.startOffset(), next.endOffset());
+ flagsAtt.setFlags(next.getFlags());
+ payloadAtt.setPayload(next.getPayload());
+ posIncAtt.setPositionIncrement(next.getPositionIncrement());
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ tokenIterator = tokens.iterator();
}
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/SpellCheckerRequestHandler.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/SpellCheckerRequestHandler.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/SpellCheckerRequestHandler.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/SpellCheckerRequestHandler.java Sun Mar 14 20:58:32 2010
@@ -258,7 +258,7 @@ public class SpellCheckerRequestHandler
}
dirDescription = f.getAbsolutePath();
log.info("using spell directory: " + dirDescription);
- spellcheckerIndexDir = FSDirectory.getDirectory(f);
+ spellcheckerIndexDir = FSDirectory.open(f);
} else {
log.info("using RAM based spell directory");
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java Sun Mar 14 20:58:32 2010
@@ -40,7 +40,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ConstantScoreRangeQuery;
+import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.PriorityQueue;
@@ -172,7 +172,8 @@ public class LukeRequestHandler extends
flags.append( (f != null && f.getOmitNorms()) ? FieldFlag.OMIT_NORMS.getAbbreviation() : '-' );
flags.append( (f != null && f.isLazy()) ? FieldFlag.LAZY.getAbbreviation() : '-' );
flags.append( (f != null && f.isBinary()) ? FieldFlag.BINARY.getAbbreviation() : '-' );
- flags.append( (f != null && f.isCompressed()) ? FieldFlag.COMPRESSED.getAbbreviation() : '-' );
+ //nocommit: handle compressed
+ //flags.append( (f != null && f.isCompressed()) ? FieldFlag.COMPRESSED.getAbbreviation() : '-' );
flags.append( (false) ? FieldFlag.SORT_MISSING_FIRST.getAbbreviation() : '-' ); // SchemaField Specific
flags.append( (false) ? FieldFlag.SORT_MISSING_LAST.getAbbreviation() : '-' ); // SchemaField Specific
return flags.toString();
@@ -312,7 +313,7 @@ public class LukeRequestHandler extends
// If numTerms==0, the call is just asking for a quick field list
if( ttinfo != null && sfield != null && sfield.indexed() ) {
- Query q = new ConstantScoreRangeQuery(fieldName,null,null,false,false);
+ Query q = new TermRangeQuery(fieldName,null,null,false,false);
TopDocs top = searcher.search( q, 1 );
if( top.totalHits > 0 ) {
// Find a document with this field
@@ -652,7 +653,7 @@ public class LukeRequestHandler extends
}
if( terms.docFreq() > tiq.minFreq ) {
- tiq.put(new TopTermQueue.TermInfo(terms.term(), terms.docFreq()));
+ tiq.add(new TopTermQueue.TermInfo(terms.term(), terms.docFreq()));
if (tiq.size() > numTerms) { // if tiq full
tiq.pop(); // remove lowest in tiq
tiq.minFreq = ((TopTermQueue.TermInfo)tiq.top()).docFreq; // reset minFreq
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryComponent.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryComponent.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryComponent.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryComponent.java Sun Mar 14 20:58:32 2010
@@ -33,6 +33,7 @@ import org.apache.solr.common.params.Sha
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
@@ -305,7 +306,6 @@ public class QueryComponent extends Sear
public void handleResponses(ResponseBuilder rb, ShardRequest sreq) {
if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_IDS) != 0) {
mergeIds(rb, sreq);
- return;
}
if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0) {
@@ -399,7 +399,8 @@ public class QueryComponent extends Sear
// Merge the docs via a priority queue so we don't have to sort *all* of the
// documents... we only need to order the top (rows+start)
- ShardFieldSortedHitQueue queue = new ShardFieldSortedHitQueue(sortFields, ss.getOffset() + ss.getCount());
+ ShardFieldSortedHitQueue queue;
+ queue = new ShardFieldSortedHitQueue(sortFields, ss.getOffset() + ss.getCount());
long numFound = 0;
Float maxScore=null;
@@ -451,7 +452,7 @@ public class QueryComponent extends Sear
shardDoc.sortFieldValues = sortFieldValues;
- queue.insert(shardDoc);
+ queue.insertWithOverflow(shardDoc);
} // end for-each-doc-in-response
} // end for-each-response
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryElevationComponent.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryElevationComponent.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryElevationComponent.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryElevationComponent.java Sun Mar 14 20:58:32 2010
@@ -38,8 +38,8 @@ import javax.xml.xpath.XPathExpressionEx
import javax.xml.xpath.XPathFactory;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
@@ -298,10 +298,9 @@ public class QueryElevationComponent ext
TokenStream tokens = analyzer.reusableTokenStream( "", new StringReader( query ) );
tokens.reset();
- Token token = tokens.next();
- while( token != null ) {
- norm.append( new String(token.termBuffer(), 0, token.termLength()) );
- token = tokens.next();
+ TermAttribute termAtt = (TermAttribute) tokens.addAttribute(TermAttribute.class);
+ while( tokens.incrementToken() ) {
+ norm.append( termAtt.termBuffer(), 0, termAtt.termLength() );
}
return norm.toString();
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java Sun Mar 14 20:58:32 2010
@@ -33,6 +33,12 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
@@ -332,7 +338,7 @@ public class SpellCheckComponent extends
// create token
SpellCheckResponse.Suggestion suggestion = origVsSuggestion.get(original);
Token token = new Token();
- token.setTermText(original);
+ token.setTermBuffer(original);
token.setStartOffset(suggestion.getStartOffset());
token.setEndOffset(suggestion.getEndOffset());
@@ -364,10 +370,24 @@ public class SpellCheckComponent extends
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
Collection<Token> result = new ArrayList<Token>();
- Token token = null;
TokenStream ts = analyzer.reusableTokenStream("", new StringReader(q));
ts.reset();
- while ((token = ts.next()) != null){
+ // TODO: support custom attributes
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
+ TypeAttribute typeAtt = (TypeAttribute) ts.addAttribute(TypeAttribute.class);
+ FlagsAttribute flagsAtt = (FlagsAttribute) ts.addAttribute(FlagsAttribute.class);
+ PayloadAttribute payloadAtt = (PayloadAttribute) ts.addAttribute(PayloadAttribute.class);
+ PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
+
+ while (ts.incrementToken()){
+ Token token = new Token();
+ token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
+ token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
+ token.setType(typeAtt.type());
+ token.setFlags(flagsAtt.getFlags());
+ token.setPayload(payloadAtt.getPayload());
+ token.setPositionIncrement(posIncAtt.getPositionIncrement());
result.add(token);
}
return result;
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java Sun Mar 14 20:58:32 2010
@@ -113,7 +113,7 @@ public class TermVectorComponent extends
IndexSchema schema = rb.req.getSchema();
String uniqFieldName = schema.getUniqueKeyField().getName();
//Only load the id field
- SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(Collections.singleton(uniqFieldName), Collections.emptySet());
+ SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(Collections.singleton(uniqFieldName), Collections.<String>emptySet());
while (iter.hasNext()) {
Integer docId = iter.next();
NamedList docNL = new NamedList();
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java Sun Mar 14 20:58:32 2010
@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.Cachin
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.*;
@@ -39,6 +40,7 @@ import org.apache.lucene.search.vectorhi
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.FragListBuilder;
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
+import org.apache.lucene.util.AttributeSource.State;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
@@ -512,28 +514,28 @@ public class DefaultSolrHighlighter exte
*/
class TokenOrderingFilter extends TokenFilter {
private final int windowSize;
- private final LinkedList<Token> queue = new LinkedList<Token>();
+ private final LinkedList<OrderedToken> queue = new LinkedList<OrderedToken>();
private boolean done=false;
-
+ private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+
protected TokenOrderingFilter(TokenStream input, int windowSize) {
super(input);
this.windowSize = windowSize;
}
@Override
- public Token next() throws IOException {
+ public boolean incrementToken() throws IOException {
while (!done && queue.size() < windowSize) {
- Token newTok = input.next();
- if (newTok==null) {
- done=true;
+ if (!input.incrementToken()) {
+ done = true;
break;
}
// reverse iterating for better efficiency since we know the
// list is already sorted, and most token start offsets will be too.
- ListIterator<Token> iter = queue.listIterator(queue.size());
+ ListIterator<OrderedToken> iter = queue.listIterator(queue.size());
while(iter.hasPrevious()) {
- if (newTok.startOffset() >= iter.previous().startOffset()) {
+ if (offsetAtt.startOffset() >= iter.previous().startOffset) {
// insertion will be before what next() would return (what
// we just compared against), so move back one so the insertion
// will be after.
@@ -541,50 +543,82 @@ class TokenOrderingFilter extends TokenF
break;
}
}
- iter.add(newTok);
+ OrderedToken ot = new OrderedToken();
+ ot.state = captureState();
+ ot.startOffset = offsetAtt.startOffset();
+ iter.add(ot);
}
- return queue.isEmpty() ? null : queue.removeFirst();
+ if (queue.isEmpty()) {
+ return false;
+ } else {
+ restoreState(queue.removeFirst().state);
+ return true;
+ }
}
}
+// for TokenOrderingFilter, so it can easily sort by startOffset
+class OrderedToken {
+ State state;
+ int startOffset;
+}
+
class TermOffsetsTokenStream {
TokenStream bufferedTokenStream = null;
- Token bufferedToken;
+ OffsetAttribute bufferedOffsetAtt;
+ State bufferedToken;
+ int bufferedStartOffset;
+ int bufferedEndOffset;
int startOffset;
int endOffset;
public TermOffsetsTokenStream( TokenStream tstream ){
bufferedTokenStream = tstream;
+ bufferedOffsetAtt = (OffsetAttribute) bufferedTokenStream.addAttribute(OffsetAttribute.class);
startOffset = 0;
bufferedToken = null;
}
public TokenStream getMultiValuedTokenStream( final int length ){
endOffset = startOffset + length;
- return new TokenStream(){
- Token token;
- public Token next() throws IOException {
+ return new MultiValuedStream(length);
+ }
+
+ class MultiValuedStream extends TokenStream {
+ private final int length;
+ OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+
+ MultiValuedStream(int length) {
+ super(bufferedTokenStream.cloneAttributes());
+ this.length = length;
+ }
+
+ public boolean incrementToken() throws IOException {
while( true ){
- if( bufferedToken == null )
- bufferedToken = bufferedTokenStream.next();
- if( bufferedToken == null ) return null;
- if( startOffset <= bufferedToken.startOffset() &&
- bufferedToken.endOffset() <= endOffset ){
- token = bufferedToken;
+ if( bufferedToken == null ) {
+ if (!bufferedTokenStream.incrementToken())
+ return false;
+ bufferedToken = bufferedTokenStream.captureState();
+ bufferedStartOffset = bufferedOffsetAtt.startOffset();
+ bufferedEndOffset = bufferedOffsetAtt.endOffset();
+ }
+
+ if( startOffset <= bufferedStartOffset &&
+ bufferedEndOffset <= endOffset ){
+ restoreState(bufferedToken);
bufferedToken = null;
- token.setStartOffset( token.startOffset() - startOffset );
- token.setEndOffset( token.endOffset() - startOffset );
- return token;
+ offsetAtt.setOffset( offsetAtt.startOffset() - startOffset, offsetAtt.endOffset() - startOffset );
+ return true;
}
- else if( bufferedToken.endOffset() > endOffset ){
+ else if( bufferedEndOffset > endOffset ){
startOffset += length + 1;
- return null;
+ return false;
}
bufferedToken = null;
}
}
- };
- }
-}
+
+ };
+};
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/response/BaseResponseWriter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/response/BaseResponseWriter.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/response/BaseResponseWriter.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/response/BaseResponseWriter.java Sun Mar 14 20:58:32 2010
@@ -176,7 +176,7 @@ public abstract class BaseResponseWriter
Object val = null;
if (ft == null) { // handle fields not in the schema
if (f.isBinary())
- val = f.binaryValue();
+ val = f.getBinaryValue();
else
val = f.stringValue();
} else {
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/response/BinaryResponseWriter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/response/BinaryResponseWriter.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/response/BinaryResponseWriter.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/response/BinaryResponseWriter.java Sun Mar 14 20:58:32 2010
@@ -140,7 +140,7 @@ public class BinaryResponseWriter implem
if(sf != null) ft =sf.getType();
Object val;
if (ft == null) { // handle fields not in the schema
- if (f.isBinary()) val = f.binaryValue();
+ if (f.isBinary()) val = f.getBinaryValue();
else val = f.stringValue();
} else {
try {
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/schema/CompressableField.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/schema/CompressableField.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/schema/CompressableField.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/schema/CompressableField.java Sun Mar 14 20:58:32 2010
@@ -58,8 +58,10 @@ public abstract class CompressableField
String internalVal) {
/* compress field if length exceeds threshold */
if(field.isCompressed()) {
- return internalVal.length() >= compressThreshold ?
- Field.Store.COMPRESS : Field.Store.YES;
+ // nocommit: handle compression
+ //return internalVal.length() >= compressThreshold ?
+ // Field.Store.COMPRESS : Field.Store.YES;
+ return Field.Store.YES;
} else
return super.getFieldStore(field, internalVal);
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/schema/FieldType.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/schema/FieldType.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/schema/FieldType.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/schema/FieldType.java Sun Mar 14 20:58:32 2010
@@ -302,8 +302,8 @@ public abstract class FieldType extends
}
protected Field.Index getFieldIndex(SchemaField field,
String internalVal) {
- return field.indexed() ? (isTokenized() ? Field.Index.TOKENIZED :
- Field.Index.UN_TOKENIZED) : Field.Index.NO;
+ return field.indexed() ? (isTokenized() ? Field.Index.ANALYZED :
+ Field.Index.NOT_ANALYZED) : Field.Index.NO;
}
/**
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieDateField.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieDateField.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieDateField.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieDateField.java Sun Mar 14 20:58:32 2010
@@ -63,7 +63,7 @@ public class TrieDateField extends DateF
@Override
public Date toObject(Fieldable f) {
- byte[] arr = f.binaryValue();
+ byte[] arr = f.getBinaryValue();
if (arr==null) throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,TrieField.badFieldString(f));
return new Date(TrieField.toLong(arr));
}
@@ -85,7 +85,7 @@ public class TrieDateField extends DateF
@Override
public void write(XMLWriter xmlWriter, String name, Fieldable f) throws IOException {
- byte[] arr = f.binaryValue();
+ byte[] arr = f.getBinaryValue();
if (arr==null) {
xmlWriter.writeStr(name, TrieField.badFieldString(f));
return;
@@ -96,7 +96,7 @@ public class TrieDateField extends DateF
@Override
public void write(TextResponseWriter writer, String name, Fieldable f) throws IOException {
- byte[] arr = f.binaryValue();
+ byte[] arr = f.getBinaryValue();
if (arr==null) {
writer.writeStr(name, TrieField.badFieldString(f),true);
return;
@@ -136,7 +136,7 @@ public class TrieDateField extends DateF
@Override
public String toExternal(Fieldable f) {
- byte[] arr = f.binaryValue();
+ byte[] arr = f.getBinaryValue();
if (arr==null) return TrieField.badFieldString(f);
return super.toExternal(new Date(TrieField.toLong(arr)));
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieField.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieField.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieField.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieField.java Sun Mar 14 20:58:32 2010
@@ -93,7 +93,7 @@ public class TrieField extends FieldType
@Override
public Object toObject(Fieldable f) {
- byte[] arr = f.binaryValue();
+ byte[] arr = f.getBinaryValue();
if (arr==null) return badFieldString(f);
switch (type) {
case INTEGER:
@@ -145,7 +145,7 @@ public class TrieField extends FieldType
}
public void write(XMLWriter xmlWriter, String name, Fieldable f) throws IOException {
- byte[] arr = f.binaryValue();
+ byte[] arr = f.getBinaryValue();
if (arr==null) {
xmlWriter.writeStr(name, badFieldString(f));
return;
@@ -173,7 +173,7 @@ public class TrieField extends FieldType
}
public void write(TextResponseWriter writer, String name, Fieldable f) throws IOException {
- byte[] arr = f.binaryValue();
+ byte[] arr = f.getBinaryValue();
if (arr==null) {
writer.writeStr(name, badFieldString(f),true);
return;
@@ -352,7 +352,7 @@ public class TrieField extends FieldType
@Override
public String toExternal(Fieldable f) {
- byte[] arr = f.binaryValue();
+ byte[] arr = f.getBinaryValue();
if (arr==null) return badFieldString(f);
switch (type) {
case INTEGER:
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/search/DocSetHitCollector.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/search/DocSetHitCollector.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/search/DocSetHitCollector.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/search/DocSetHitCollector.java Sun Mar 14 20:58:32 2010
@@ -17,7 +17,6 @@
package org.apache.solr.search;
-import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.OpenBitSet;
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/search/PrefixFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/search/PrefixFilter.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/search/PrefixFilter.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/search/PrefixFilter.java Sun Mar 14 20:58:32 2010
@@ -25,7 +25,6 @@ import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.util.OpenBitSet;
-import java.util.BitSet;
import java.io.IOException;
/**
@@ -40,17 +39,6 @@ public class PrefixFilter extends Filter
Term getPrefix() { return prefix; }
- @Override
- public BitSet bits(IndexReader reader) throws IOException {
- final BitSet bitSet = new BitSet(reader.maxDoc());
- new PrefixGenerator(prefix) {
- public void handleDoc(int doc) {
- bitSet.set(doc);
- }
- }.generate(reader);
- return bitSet;
- }
-
@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());