You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2011/10/25 13:12:14 UTC
svn commit: r1188604 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/backwards/src/test/ lucene/contrib/ lucene/contrib/analyzers/common/
lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/
lucene/contrib/analyzers/comm...
Author: uschindler
Date: Tue Oct 25 11:12:14 2011
New Revision: 1188604
URL: http://svn.apache.org/viewvc?rev=1188604&view=rev
Log:
LUCENE-3508: Decompounders based on CompoundWordTokenFilterBase can now be used with custom attributes. All those attributes are preserved and set on all added decompounded tokens
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/backwards/src/test/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
lucene/dev/branches/branch_3x/solr/ (props changed)
Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1188604&r1=1188603&r2=1188604&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Tue Oct 25 11:12:14 2011
@@ -10,6 +10,9 @@ Changes in backwards compatibility polic
* LUCENE-3446: Removed BooleanFilter.finalResult() due to change to
FixedBitSet. (Uwe Schindler)
+ * LUCENE-3508: Changed some method signatures in decompounding TokenFilters
+ to make them no longer use the Token class. (Uwe Schindler)
+
New Features
* LUCENE-1824: Add BoundaryScanner interface and its implementation classes,
@@ -79,6 +82,10 @@ Bug Fixes
Replaced with RandomSampler. For previous behavior use RepeatableSampler.
(Gilad Barkai, Shai Erera, Doron Cohen)
+ * LUCENE-3508: Decompounders based on CompoundWordTokenFilterBase can now be
+ used with custom attributes. All those attributes are preserved and set on all
+ added decompounded tokens. (Spyros Kapnissis, Uwe Schindler)
+
API Changes
* LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java?rev=1188604&r1=1188603&r2=1188604&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java Tue Oct 25 11:12:14 2011
@@ -25,19 +25,16 @@ import java.util.Locale;
import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
/**
- * Base class for decomposition token filters. <a name="version"/>
+ * Base class for decomposition token filters.
* <p>
* You must specify the required {@link Version} compatibility when creating
* CompoundWordTokenFilterBase:
@@ -46,6 +43,13 @@ import org.apache.lucene.util.Version;
* supplementary characters in strings and char arrays provided as compound word
* dictionaries.
* </ul>
+ * <p>If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
+ * it should be case-insensitive unless it contains only lowercased entries and you
+ * have {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in your analysis chain.
+ * For optimal performance (as this filter does lots of lookups to the dictionary),
+ * you should use the latter analysis chain/CharArraySet. Be aware: If you supply arbitrary
+ * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
+ * transformed to case-insensitive!
*/
public abstract class CompoundWordTokenFilterBase extends TokenFilter {
/**
@@ -64,26 +68,24 @@ public abstract class CompoundWordTokenF
public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
protected final CharArraySet dictionary;
- protected final LinkedList<Token> tokens;
+ protected final LinkedList<CompoundToken> tokens;
protected final int minWordSize;
protected final int minSubwordSize;
protected final int maxSubwordSize;
protected final boolean onlyLongestMatch;
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+ protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
- private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
- private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
- private final Token wrapper = new Token();
+ private AttributeSource.State current;
+
/**
* @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], int, int, int, boolean)} instead
*/
@Deprecated
protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
- this(Version.LUCENE_30, input, makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
+ this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
}
/**
@@ -91,7 +93,7 @@ public abstract class CompoundWordTokenF
*/
@Deprecated
protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
- this(Version.LUCENE_30, input, makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+ this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
}
/**
@@ -107,7 +109,7 @@ public abstract class CompoundWordTokenF
*/
@Deprecated
protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) {
- this(Version.LUCENE_30, input, makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
+ this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
}
/**
@@ -127,11 +129,11 @@ public abstract class CompoundWordTokenF
}
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
- this(matchVersion, input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
+ this(matchVersion, input,makeDictionary(matchVersion, dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
}
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
- this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+ this(matchVersion, input,makeDictionary(matchVersion, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
}
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
@@ -139,7 +141,7 @@ public abstract class CompoundWordTokenF
}
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
- this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
+ this(matchVersion, input,makeDictionary(matchVersion, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
}
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary) {
@@ -149,7 +151,7 @@ public abstract class CompoundWordTokenF
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(input);
- this.tokens=new LinkedList<Token>();
+ this.tokens=new LinkedList<CompoundToken>();
this.minWordSize=minWordSize;
this.minSubwordSize=minSubwordSize;
this.maxSubwordSize=maxSubwordSize;
@@ -158,113 +160,77 @@ public abstract class CompoundWordTokenF
if (dictionary==null || dictionary instanceof CharArraySet) {
this.dictionary = (CharArraySet) dictionary;
} else {
- this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
- addAllLowerCase(this.dictionary, dictionary);
+ this.dictionary = new CharArraySet(matchVersion, dictionary, true);
}
}
-
- /**
- * Create a set of words from an array
- * The resulting Set does case insensitive matching
- * TODO We should look for a faster dictionary lookup approach.
- * @param dictionary
- * @return {@link Set} of lowercased terms
- */
- public static final Set<?> makeDictionary(final String[] dictionary) {
- return makeDictionary(Version.LUCENE_30, dictionary);
- }
- public static final Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
+ /** @deprecated Only available for backwards compatibility. */
+ @Deprecated
+ public static CharArraySet makeDictionary(final Version matchVersion, final String[] dictionary) {
if (dictionary == null) {
return null;
}
- // is the below really case insensitive?
- CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
- addAllLowerCase(dict, Arrays.asList(dictionary));
- return dict;
- }
-
- private final void setToken(final Token token) throws IOException {
- clearAttributes();
- termAtt.copyBuffer(token.buffer(), 0, token.length());
- flagsAtt.setFlags(token.getFlags());
- typeAtt.setType(token.type());
- offsetAtt.setOffset(token.startOffset(), token.endOffset());
- posIncAtt.setPositionIncrement(token.getPositionIncrement());
- payloadAtt.setPayload(token.getPayload());
+ return new CharArraySet(matchVersion, Arrays.asList(dictionary), true);
}
@Override
public final boolean incrementToken() throws IOException {
- if (tokens.size() > 0) {
- setToken(tokens.removeFirst());
+ if (!tokens.isEmpty()) {
+ assert current != null;
+ CompoundToken token = tokens.removeFirst();
+ restoreState(current); // keep all other attributes untouched
+ termAtt.setEmpty().append(token.txt);
+ offsetAtt.setOffset(token.startOffset, token.endOffset);
+ posIncAtt.setPositionIncrement(0);
return true;
}
- if (!input.incrementToken())
- return false;
-
- wrapper.copyBuffer(termAtt.buffer(), 0, termAtt.length());
- wrapper.setStartOffset(offsetAtt.startOffset());
- wrapper.setEndOffset(offsetAtt.endOffset());
- wrapper.setFlags(flagsAtt.getFlags());
- wrapper.setType(typeAtt.type());
- wrapper.setPositionIncrement(posIncAtt.getPositionIncrement());
- wrapper.setPayload(payloadAtt.getPayload());
-
- decompose(wrapper);
-
- if (tokens.size() > 0) {
- setToken(tokens.removeFirst());
+ current = null; // not really needed, but for safety
+ if (input.incrementToken()) {
+ // Only words longer than minWordSize get processed
+ if (termAtt.length() >= this.minWordSize) {
+ decompose();
+ // only capture the state if we really need it for producing new tokens
+ if (!tokens.isEmpty()) {
+ current = captureState();
+ }
+ }
+ // return original token:
return true;
} else {
return false;
}
}
-
- protected static final void addAllLowerCase(CharArraySet target, Collection<?> col) {
- for (Object obj : col) {
- String string = (String) obj;
- target.add(string.toLowerCase(Locale.ENGLISH));
- }
- }
-
- protected static char[] makeLowerCaseCopy(final char[] buffer) {
- char[] result=new char[buffer.length];
- System.arraycopy(buffer, 0, result, 0, buffer.length);
-
- for (int i=0;i<buffer.length;++i) {
- result[i]=Character.toLowerCase(buffer[i]);
- }
-
- return result;
- }
-
- protected final Token createToken(final int offset, final int length,
- final Token prototype) {
- int newStart = prototype.startOffset() + offset;
- Token t = prototype.clone(prototype.buffer(), offset, length, newStart, newStart+length);
- t.setPositionIncrement(0);
- return t;
- }
-
- protected void decompose(final Token token) {
- // In any case we give the original token back
- tokens.add((Token) token.clone());
- // Only words longer than minWordSize get processed
- if (token.length() < this.minWordSize) {
- return;
- }
-
- decomposeInternal(token);
- }
-
- protected abstract void decomposeInternal(final Token token);
+ /** Decomposes the current {@link #termAtt} and places {@link CompoundToken} instances in the {@link #tokens} list.
+ * The original token may not be placed in the list, as it is automatically passed through this filter.
+ */
+ protected abstract void decompose();
@Override
public void reset() throws IOException {
super.reset();
tokens.clear();
+ current = null;
}
+
+ /**
+ * Helper class to hold decompounded token information
+ */
+ protected class CompoundToken {
+ public final CharSequence txt;
+ public final int startOffset, endOffset;
+
+ /** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
+ public CompoundToken(int offset, int length) {
+ final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
+ this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
+ // TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
+ // chars from the term, offsets may not match correctly (other filters producing tokens
+ // may also have this problem):
+ this.startOffset = newStart;
+ this.endOffset = newStart + length;
+ }
+
+ }
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java?rev=1188604&r1=1188603&r2=1188604&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java Tue Oct 25 11:12:14 2011
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.compo
import java.util.Set;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter; // for javadocs
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
@@ -31,13 +30,26 @@ import org.apache.lucene.util.Version;
* "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
* "Donaudampfschiff" even when you only enter "schiff".
* It uses a brute-force algorithm to achieve this.
- * </p>
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * CompoundWordTokenFilterBase:
+ * <ul>
+ * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ * supplementary characters in strings and char arrays provided as compound word
+ * dictionaries.
+ * </ul>
+ * <p>If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
+ * it should be case-insensitive unless it contains only lowercased entries and you
+ * have {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in your analysis chain.
+ * For optimal performance (as this filter does lots of lookups to the dictionary),
+ * you should use the latter analysis chain/CharArraySet. Be aware: If you supply arbitrary
+ * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
+ * transformed to case-insensitive!
*/
public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
/**
- * Creates a new {@link DictionaryCompoundWordTokenFilter}
- *
+ * Creates a new {@link DictionaryCompoundWordTokenFilter}.
* @param input the {@link TokenStream} to process
* @param dictionary the word dictionary to match against
* @param minWordSize only words longer than this get processed
@@ -115,7 +127,9 @@ public class DictionaryCompoundWordToken
* only subwords shorter than this get to the output stream
* @param onlyLongestMatch
* Add only the longest matching subword to the stream
+ * @deprecated Use the constructors taking {@link Set}
*/
+ @Deprecated
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary,
int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
@@ -134,7 +148,9 @@ public class DictionaryCompoundWordToken
* the {@link TokenStream} to process
* @param dictionary
* the word dictionary to match against
+ * @deprecated Use the constructors taking {@link Set}
*/
+ @Deprecated
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary) {
super(matchVersion, input, dictionary);
}
@@ -150,12 +166,9 @@ public class DictionaryCompoundWordToken
* @param input
* the {@link TokenStream} to process
* @param dictionary
- * the word dictionary to match against. If this is a
- * {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
- * must have set ignoreCase=false and only contain lower case
- * strings.
+ * the word dictionary to match against.
*/
- public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary) {
+ public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary) {
super(matchVersion, input, dictionary);
}
@@ -170,10 +183,7 @@ public class DictionaryCompoundWordToken
* @param input
* the {@link TokenStream} to process
* @param dictionary
- * the word dictionary to match against. If this is a
- * {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
- * must have set ignoreCase=false and only contain lower case
- * strings.
+ * the word dictionary to match against.
* @param minWordSize
* only words longer than this get processed
* @param minSubwordSize
@@ -183,37 +193,31 @@ public class DictionaryCompoundWordToken
* @param onlyLongestMatch
* Add only the longest matching subword to the stream
*/
- public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary,
+ public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary,
int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
@Override
- protected void decomposeInternal(final Token token) {
- // Only words longer than minWordSize get processed
- if (token.length() < this.minWordSize) {
- return;
- }
-
- char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
-
- for (int i=0;i<=token.length()-this.minSubwordSize;++i) {
- Token longestMatchToken=null;
+ protected void decompose() {
+ final int len = termAtt.length();
+ for (int i=0;i<=len-this.minSubwordSize;++i) {
+ CompoundToken longestMatchToken=null;
for (int j=this.minSubwordSize;j<=this.maxSubwordSize;++j) {
- if(i+j>token.length()) {
+ if(i+j>len) {
break;
}
- if(dictionary.contains(lowerCaseTermBuffer, i, j)) {
+ if(dictionary.contains(termAtt.buffer(), i, j)) {
if (this.onlyLongestMatch) {
if (longestMatchToken!=null) {
- if (longestMatchToken.length()<j) {
- longestMatchToken=createToken(i,j,token);
+ if (longestMatchToken.txt.length()<j) {
+ longestMatchToken=new CompoundToken(i,j);
}
} else {
- longestMatchToken=createToken(i,j,token);
+ longestMatchToken=new CompoundToken(i,j);
}
} else {
- tokens.add(createToken(i,j,token));
+ tokens.add(new CompoundToken(i,j));
}
}
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java?rev=1188604&r1=1188603&r2=1188604&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java Tue Oct 25 11:12:14 2011
@@ -21,7 +21,6 @@ import java.io.File;
import java.io.Reader;
import java.util.Set;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter; // for javadocs
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
@@ -35,7 +34,21 @@ import org.xml.sax.InputSource;
* "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
* "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation
* grammar and a word dictionary to achieve this.
- * </p>
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * CompoundWordTokenFilterBase:
+ * <ul>
+ * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ * supplementary characters in strings and char arrays provided as compound word
+ * dictionaries.
+ * </ul>
+ * <p>If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
+ * it should be case-insensitive unless it contains only lowercased entries and you
+ * have {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in your analysis chain.
+ * For optimal performance (as this filter does lots of lookups to the dictionary),
+ * you should use the latter analysis chain/CharArraySet. Be aware: If you supply arbitrary
+ * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
+ * transformed to case-insensitive!
*/
public class HyphenationCompoundWordTokenFilter extends
CompoundWordTokenFilterBase {
@@ -63,11 +76,13 @@ public class HyphenationCompoundWordToke
* only subwords shorter than this get to the output stream
* @param onlyLongestMatch
* Add only the longest matching subword to the stream
+ * @deprecated Use the constructors taking {@link Set}
*/
+ @Deprecated
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, String[] dictionary, int minWordSize,
int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
- this(input, hyphenator, makeDictionary(dictionary), minWordSize,
+ this(matchVersion, input, hyphenator, makeDictionary(matchVersion, dictionary), minWordSize,
minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
@@ -85,10 +100,12 @@ public class HyphenationCompoundWordToke
* the hyphenation pattern tree to use for hyphenation
* @param dictionary
* the word dictionary to match against
+ * @deprecated Use the constructors taking {@link Set}
*/
+ @Deprecated
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, String[] dictionary) {
- this(input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
+ this(Version.LUCENE_30, input, hyphenator, makeDictionary(Version.LUCENE_30,dictionary), DEFAULT_MIN_WORD_SIZE,
DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
@@ -105,10 +122,7 @@ public class HyphenationCompoundWordToke
* @param hyphenator
* the hyphenation pattern tree to use for hyphenation
* @param dictionary
- * the word dictionary to match against. If this is a
- * {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
- * must have set ignoreCase=false and only contain lower case
- * strings.
+ * the word dictionary to match against.
*/
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, Set<?> dictionary) {
@@ -129,10 +143,7 @@ public class HyphenationCompoundWordToke
* @param hyphenator
* the hyphenation pattern tree to use for hyphenation
* @param dictionary
- * the word dictionary to match against. If this is a
- * {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
- * must have set ignoreCase=false and only contain lower case
- * strings.
+ * the word dictionary to match against.
* @param minWordSize
* only words longer than this get processed
* @param minSubwordSize
@@ -196,7 +207,7 @@ public class HyphenationCompoundWordToke
public HyphenationCompoundWordTokenFilter(TokenStream input,
HyphenationTree hyphenator, String[] dictionary, int minWordSize,
int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
- this(Version.LUCENE_30, input, hyphenator, makeDictionary(dictionary), minWordSize,
+ this(Version.LUCENE_30, input, hyphenator, makeDictionary(Version.LUCENE_30, dictionary), minWordSize,
minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
@@ -211,7 +222,7 @@ public class HyphenationCompoundWordToke
@Deprecated
public HyphenationCompoundWordTokenFilter(TokenStream input,
HyphenationTree hyphenator, String[] dictionary) {
- this(Version.LUCENE_30, input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
+ this(Version.LUCENE_30, input, hyphenator, makeDictionary(Version.LUCENE_30, dictionary), DEFAULT_MIN_WORD_SIZE,
DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
@@ -316,22 +327,20 @@ public class HyphenationCompoundWordToke
}
@Override
- protected void decomposeInternal(final Token token) {
+ protected void decompose() {
// get the hyphenation points
- Hyphenation hyphens = hyphenator.hyphenate(token.buffer(), 0, token
- .length(), 1, 1);
+ Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
// No hyphen points found -> exit
if (hyphens == null) {
return;
}
final int[] hyp = hyphens.getHyphenationPoints();
- char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
for (int i = 0; i < hyp.length; ++i) {
int remaining = hyp.length - i;
int start = hyp[i];
- Token longestMatchToken = null;
+ CompoundToken longestMatchToken = null;
for (int j = 1; j < remaining; j++) {
int partLength = hyp[i + j] - start;
@@ -348,34 +357,33 @@ public class HyphenationCompoundWordToke
}
// check the dictionary
- if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
+ if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
if (this.onlyLongestMatch) {
if (longestMatchToken != null) {
- if (longestMatchToken.length() < partLength) {
- longestMatchToken = createToken(start, partLength, token);
+ if (longestMatchToken.txt.length() < partLength) {
+ longestMatchToken = new CompoundToken(start, partLength);
}
} else {
- longestMatchToken = createToken(start, partLength, token);
+ longestMatchToken = new CompoundToken(start, partLength);
}
} else {
- tokens.add(createToken(start, partLength, token));
+ tokens.add(new CompoundToken(start, partLength));
}
- } else if (dictionary.contains(lowerCaseTermBuffer, start,
- partLength - 1)) {
+ } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
// check the dictionary again with a word that is one character
// shorter
// to avoid problems with genitive 's characters and other binding
// characters
if (this.onlyLongestMatch) {
if (longestMatchToken != null) {
- if (longestMatchToken.length() < partLength - 1) {
- longestMatchToken = createToken(start, partLength - 1, token);
+ if (longestMatchToken.txt.length() < partLength - 1) {
+ longestMatchToken = new CompoundToken(start, partLength - 1);
}
} else {
- longestMatchToken = createToken(start, partLength - 1, token);
+ longestMatchToken = new CompoundToken(start, partLength - 1);
}
} else {
- tokens.add(createToken(start, partLength - 1, token));
+ tokens.add(new CompoundToken(start, partLength - 1));
}
}
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java?rev=1188604&r1=1188603&r2=1188604&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java Tue Oct 25 11:12:14 2011
@@ -17,15 +17,20 @@ package org.apache.lucene.analysis.compo
* limitations under the License.
*/
+import java.io.IOException;
import java.io.StringReader;
-import org.xml.sax.InputSource;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.xml.sax.InputSource;
public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
public void testHyphenationCompoundWordsDA() throws Exception {
@@ -166,45 +171,45 @@ public class TestCompoundWordTokenFilter
String[] dict = {"ab", "cd", "ef"};
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
- new WhitespaceTokenizer(TEST_VERSION_CURRENT,
- new StringReader(
- "abcdef")
- ),
- dict,
- CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
- CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
- CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+ new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+ new StringReader(
+ "abcdef")
+ ),
+ dict,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+ CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
assertTokenStreamContents(tf,
- new String[] { "abcdef", "ab", "cd", "ef" },
- new int[] { 0, 0, 2, 4},
- new int[] { 6, 2, 4, 6},
- new int[] { 1, 0, 0, 0}
- );
+ new String[] { "abcdef", "ab", "cd", "ef" },
+ new int[] { 0, 0, 2, 4},
+ new int[] { 6, 2, 4, 6},
+ new int[] { 1, 0, 0, 0}
+ );
}
public void testWordComponentWithLessThanMinimumLength() throws Exception {
String[] dict = {"abc", "d", "efg"};
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
- new WhitespaceTokenizer(TEST_VERSION_CURRENT,
- new StringReader(
- "abcdefg")
- ),
- dict,
- CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
- CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
- CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+ new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+ new StringReader(
+ "abcdefg")
+ ),
+ dict,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+ CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
- // since "d" is shorter than the minimum subword size, it should not be added to the token stream
+ // since "d" is shorter than the minimum subword size, it should not be added to the token stream
assertTokenStreamContents(tf,
- new String[] { "abcdefg", "abc", "efg" },
- new int[] { 0, 0, 4},
- new int[] { 7, 3, 7},
- new int[] { 1, 0, 0}
- );
+ new String[] { "abcdefg", "abc", "efg" },
+ new int[] { 0, 0, 4},
+ new int[] { 7, 3, 7},
+ new int[] { 1, 0, 0}
+ );
}
-
+
public void testReset() throws Exception {
String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
"Aufgabe", "Überwachung" };
@@ -228,4 +233,64 @@ public class TestCompoundWordTokenFilter
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
}
+ public void testRetainMockAttribute() throws Exception {
+ String[] dict = { "abc", "d", "efg" };
+ Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+ new StringReader("abcdefg"));
+ TokenStream stream = new MockRetainAttributeFilter(tokenizer);
+ stream = new DictionaryCompoundWordTokenFilter(
+ TEST_VERSION_CURRENT, stream, dict,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+ CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+ MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
+ while (stream.incrementToken()) {
+ assertTrue("Custom attribute value was lost", retAtt.getRetain());
+ }
+
+ }
+
+ public static interface MockRetainAttribute extends Attribute {
+ void setRetain(boolean attr);
+ boolean getRetain();
+ }
+
+ public static final class MockRetainAttributeImpl extends AttributeImpl implements MockRetainAttribute {
+ private boolean retain = false;
+ @Override
+ public void clear() {
+ retain = false;
+ }
+ public boolean getRetain() {
+ return retain;
+ }
+ public void setRetain(boolean retain) {
+ this.retain = retain;
+ }
+ @Override
+ public void copyTo(AttributeImpl target) {
+ MockRetainAttribute t = (MockRetainAttribute) target;
+ t.setRetain(retain);
+ }
+ }
+
+ private static class MockRetainAttributeFilter extends TokenFilter {
+
+ MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class);
+
+ MockRetainAttributeFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()){
+ retainAtt.setRetain(true);
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+
}