You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by yo...@apache.org on 2009/08/13 20:18:23 UTC
svn commit: r803977 - in /lucene/solr/trunk: CHANGES.txt
src/java/org/apache/solr/analysis/PhoneticFilter.java
src/test/org/apache/solr/analysis/TestPhoneticFilter.java
Author: yonik
Date: Thu Aug 13 18:18:22 2009
New Revision: 803977
URL: http://svn.apache.org/viewvc?rev=803977&view=rev
Log:
SOLR-1360: Prevent PhoneticFilter from producing duplicate tokens, update to new attribute API
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/analysis/PhoneticFilter.java
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPhoneticFilter.java
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=803977&r1=803976&r2=803977&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Thu Aug 13 18:18:22 2009
@@ -463,6 +463,10 @@
55. SOLR-1342: CapitalizationFilterFactory uses incorrect term length calculations.
(Robert Muir via Mark Miller)
+56. SOLR-1359: DoubleMetaphoneFilter didn't index original tokens if there was no
+ alternative, and could incorrectly skip or reorder tokens. (yonik)
+
+57. SOLR-1360: Prevent PhoneticFilter from producing duplicate tokens. (yonik)
Other Changes
----------------------
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/PhoneticFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/PhoneticFilter.java?rev=803977&r1=803976&r2=803977&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/PhoneticFilter.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/PhoneticFilter.java Thu Aug 13 18:18:22 2009
@@ -21,6 +21,8 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import java.io.IOException;
@@ -36,39 +38,63 @@
protected Encoder encoder = null;
protected String name = null;
- protected Token save = null;
+ protected State save = null;
+ private final TermAttribute termAtt;
+ private final PositionIncrementAttribute posAtt;
public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
super(in);
this.encoder = encoder;
this.name = name;
this.inject = inject;
+ this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
}
@Override
- public final Token next(Token in) throws IOException {
+ public boolean incrementToken() throws IOException {
if( save != null ) {
- Token temp = save;
+ clearAttributes(); restoreState(save);
save = null;
- return temp;
+ return true;
}
-
- Token t = input.next(in);
- if( t != null ) {
- String value = new String(t.termBuffer(), 0, t.termLength());
- try {
- value = encoder.encode(value).toString();
- }
- catch (Exception ignored) {} // just use the direct text
- //Token m = new Token(value, t.startOffset(), t.endOffset(), name );
- if( inject ) {
- save = (Token) t.clone();
- save.setPositionIncrement(0);
- save.setTermBuffer(value.toCharArray(), 0, value.length());
- } else {
- t.setTermBuffer(value.toCharArray(), 0, value.length());
- }
+
+ if (!input.incrementToken()) return false;
+
+ // pass through zero-length terms
+ if (termAtt.termLength()==0) return true;
+
+ String value = termAtt.term();
+ String phonetic = null;
+ try {
+ String v = encoder.encode(value).toString();
+ if (v.length() > 0 && !value.equals(v)) phonetic = v;
+ } catch (Exception ignored) {} // just use the direct text
+
+ if (phonetic == null) return true;
+
+ if (!inject) {
+ // just modify this token
+ termAtt.setTermBuffer(phonetic);
+ return true;
}
- return t;
+
+ // We need to return both the original and the phonetic tokens.
+ // To avoid an orig=captureState() change_to_phonetic() saved=captureState() restoreState(orig)
+ // sequence, we return the phonetic alternative first and save the original for the next call.
+
+ int origOffset = posAtt.getPositionIncrement();
+ posAtt.setPositionIncrement(0);
+ save = captureState();
+
+ posAtt.setPositionIncrement(origOffset);
+ termAtt.setTermBuffer(phonetic);
+ return true;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ input.reset();
+ save = null;
}
}
Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPhoneticFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPhoneticFilter.java?rev=803977&r1=803976&r2=803977&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPhoneticFilter.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPhoneticFilter.java Thu Aug 13 18:18:22 2009
@@ -71,17 +71,26 @@
ArrayList<Token> output = new ArrayList<Token>();
for( String s : input ) {
stream.add( new Token( s, 0, s.length() ) );
+
+ // phonetic token is added first in the current impl
+ output.add( new Token( enc.encode(s).toString(), 0, s.length() ) );
+
+ // add the original if applicable
if( inject ) {
output.add( new Token( s, 0, s.length() ) );
}
- output.add( new Token( enc.encode(s).toString(), 0, s.length() ) );
}
-
+
+ // System.out.println("###stream="+stream);
+ // System.out.println("###output="+output);
+
PhoneticFilter filter = new PhoneticFilter(
new IterTokenStream(stream.iterator()), enc, "text", inject );
for( Token t : output ) {
Token got = filter.next(t);
+ // System.out.println("##### got="+got);
+
assertEquals( new String(t.termBuffer(), 0, t.termLength()), new String(got.termBuffer(), 0, got.termLength()));
}
assertNull( filter.next() ); // no more tokens