You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by yo...@apache.org on 2009/08/13 20:10:53 UTC
svn commit: r803971 - /lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java
Author: yonik
Date: Thu Aug 13 18:10:52 2009
New Revision: 803971
URL: http://svn.apache.org/viewvc?rev=803971&view=rev
Log:
SOLR-1359: DoubleMetaphoneFilter could skip or reorder tokens; fix this and update the filter to the new attribute-based TokenStream API (incrementToken() instead of next(Token)).
Modified:
lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java?rev=803971&r1=803970&r2=803971&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java Thu Aug 13 18:10:52 2009
@@ -23,68 +23,89 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
public class DoubleMetaphoneFilter extends TokenFilter {
private static final String TOKEN_TYPE = "DoubleMetaphone";
- private final LinkedList<Token> remainingTokens = new LinkedList<Token>();
+ private final LinkedList<State> remainingTokens = new LinkedList<State>();
private final DoubleMetaphone encoder = new DoubleMetaphone();
private final boolean inject;
-
+ private final TermAttribute termAtt;
+ private final PositionIncrementAttribute posAtt;
+
protected DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
super(input);
this.encoder.setMaxCodeLen(maxCodeLength);
this.inject = inject;
+ this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
}
@Override
- public final Token next(Token in) throws IOException {
- if (!remainingTokens.isEmpty()) {
- return remainingTokens.removeFirst();
- }
+ public boolean incrementToken() throws IOException {
+ for(;;) {
- Token t = input.next(in);
- if (t != null && t.termLength() > 0) {
- if (inject) {
- remainingTokens.addLast(t);
+ if (!remainingTokens.isEmpty()) {
+ clearAttributes(); restoreState(remainingTokens.removeFirst());
+ return true;
}
- boolean isPhonetic = false;
- String v = new String(t.termBuffer(), 0, t.termLength());
+ if (!input.incrementToken()) return false;
+
+ int len = termAtt.termLength();
+ if (len==0) return true; // pass through zero length terms
+
+ int firstAlternativeIncrement = inject ? 0 : posAtt.getPositionIncrement();
+
+ String v = new String(termAtt.termBuffer(), 0, len);
String primaryPhoneticValue = encoder.doubleMetaphone(v);
- if (primaryPhoneticValue.length() > 0) {
- Token token = (Token) t.clone();
- if( inject ) {
- token.setPositionIncrement( 0 );
+ String alternatePhoneticValue = encoder.doubleMetaphone(v, true);
+
+ // a flag to lazily save state if needed... this avoids a save/restore when only
+ // one token will be generated.
+ boolean saveState=inject;
+
+ if (primaryPhoneticValue!=null && primaryPhoneticValue.length() > 0 && !primaryPhoneticValue.equals(v)) {
+ if (saveState) {
+ remainingTokens.addLast(captureState());
}
- token.setType( TOKEN_TYPE );
- token.setTermBuffer(primaryPhoneticValue);
- remainingTokens.addLast(token);
- isPhonetic = true;
+ posAtt.setPositionIncrement( firstAlternativeIncrement );
+ firstAlternativeIncrement = 0;
+ termAtt.setTermBuffer(primaryPhoneticValue);
+ saveState = true;
}
- String alternatePhoneticValue = encoder.doubleMetaphone(v, true);
- if (alternatePhoneticValue.length() > 0
- && !primaryPhoneticValue.equals(alternatePhoneticValue)) {
- Token token = (Token) t.clone();
- token.setPositionIncrement( 0 );
- token.setType( TOKEN_TYPE );
- token.setTermBuffer(alternatePhoneticValue);
- remainingTokens.addLast(token);
- isPhonetic = true;
- }
-
- // If we did not add something, then go to the next one...
- if( !isPhonetic ) {
- t = next(in);
- if( t != null ) {
- t.setPositionIncrement( t.getPositionIncrement()+1 );
+ if (alternatePhoneticValue!=null && alternatePhoneticValue.length() > 0
+ && !alternatePhoneticValue.equals(primaryPhoneticValue)
+ && !primaryPhoneticValue.equals(v)) {
+ if (saveState) {
+ remainingTokens.addLast(captureState());
+ saveState = false;
}
- return t;
+ posAtt.setPositionIncrement( firstAlternativeIncrement );
+ termAtt.setTermBuffer(alternatePhoneticValue);
+ saveState = true;
+ }
+
+ // Just one token to return, so no need to capture/restore
+ // any state, simply return it.
+ if (remainingTokens.isEmpty()) {
+ return true;
+ }
+
+ if (saveState) {
+ remainingTokens.addLast(captureState());
}
}
+ }
- return remainingTokens.isEmpty() ? null : remainingTokens.removeFirst();
+ @Override
+ public void reset() throws IOException {
+ input.reset();
+ remainingTokens.clear();
}
}