You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by yo...@apache.org on 2009/08/13 20:18:23 UTC
svn commit: r803977 - in /lucene/solr/trunk: CHANGES.txt src/java/org/apache/solr/analysis/PhoneticFilter.java src/test/org/apache/solr/analysis/TestPhoneticFilter.java

Author: yonik
Date: Thu Aug 13 18:18:22 2009
New Revision: 803977

URL: http://svn.apache.org/viewvc?rev=803977&view=rev
Log:
SOLR-1360: Prevent PhoneticFilter from producing duplicate tokens, update to new attribute API

Modified:
    lucene/solr/trunk/CHANGES.txt
    lucene/solr/trunk/src/java/org/apache/solr/analysis/PhoneticFilter.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPhoneticFilter.java

Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=803977&r1=803976&r2=803977&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Thu Aug 13 18:18:22 2009
@@ -463,6 +463,10 @@
 55. SOLR-1342: CapitalizationFilterFactory uses incorrect term length calculations.
     (Robert Muir via Mark Miller)
 
+56. SOLR-1359: DoubleMetaphoneFilter didn't index original tokens if there was no
+    alternative, and could incorrectly skip or reorder tokens.  (yonik)
+
+57. SOLR-1360: Prevent PhoneticFilter from producing duplicate tokens. (yonik)
 
 Other Changes
 ----------------------

Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/PhoneticFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/PhoneticFilter.java?rev=803977&r1=803976&r2=803977&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/PhoneticFilter.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/PhoneticFilter.java Thu Aug 13 18:18:22 2009
@@ -21,6 +21,8 @@
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 
 import java.io.IOException;
 
@@ -36,39 +38,63 @@
   protected Encoder encoder = null;
   protected String name = null;
   
-  protected Token save = null;
+  protected State save = null;
+  private final TermAttribute termAtt;
+  private final PositionIncrementAttribute posAtt;
 
   public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
     super(in);
     this.encoder = encoder;
     this.name = name;
     this.inject = inject;
+    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);    
   }
 
   @Override
-  public final Token next(Token in) throws IOException {
+  public boolean incrementToken() throws IOException {
     if( save != null ) {
-      Token temp = save;
+      clearAttributes(); restoreState(save);
       save = null;
-      return temp;
+      return true;
     }
-    
-    Token t = input.next(in);
-    if( t != null ) {
-      String value = new String(t.termBuffer(), 0, t.termLength());
-      try {
-        value = encoder.encode(value).toString();
-      } 
-      catch (Exception ignored) {} // just use the direct text
-      //Token m = new Token(value, t.startOffset(), t.endOffset(), name );
-      if( inject ) {
-        save = (Token) t.clone();
-        save.setPositionIncrement(0);
-        save.setTermBuffer(value.toCharArray(), 0, value.length());
-      } else {
-        t.setTermBuffer(value.toCharArray(), 0, value.length());
-      }
+
+    if (!input.incrementToken()) return false;
+
+    // pass through zero-length terms
+    if (termAtt.termLength()==0) return true;
+
+    String value = termAtt.term();
+    String phonetic = null;
+    try {
+     String v = encoder.encode(value).toString();
+     if (v.length() > 0 && !value.equals(v)) phonetic = v;
+    } catch (Exception ignored) {} // just use the direct text
+
+    if (phonetic == null) return true;
+
+    if (!inject) {
+      // just modify this token
+      termAtt.setTermBuffer(phonetic);
+      return true;
     }
-    return t;
+
+    // We need to return both the original and the phonetic tokens.
+    // To avoid an orig=captureState(), change_to_phonetic(), saved=captureState(), restoreState(orig)
+    // sequence, we return the phonetic alternative first and save the original for the next call.
+
+    int origOffset = posAtt.getPositionIncrement();
+    posAtt.setPositionIncrement(0);
+    save = captureState();
+
+    posAtt.setPositionIncrement(origOffset);
+    termAtt.setTermBuffer(phonetic);
+    return true;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    input.reset();
+    save = null;
   }
 }

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPhoneticFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPhoneticFilter.java?rev=803977&r1=803976&r2=803977&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPhoneticFilter.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPhoneticFilter.java Thu Aug 13 18:18:22 2009
@@ -71,17 +71,26 @@
     ArrayList<Token> output = new ArrayList<Token>();
     for( String s : input ) {
       stream.add( new Token( s, 0, s.length() ) );
+
+      // phonetic token is added first in the current impl
+      output.add( new Token( enc.encode(s).toString(), 0, s.length() ) );
+
+      // add the original if applicable
       if( inject ) {
         output.add( new Token( s, 0, s.length() ) );
       }
-      output.add( new Token( enc.encode(s).toString(), 0, s.length() ) );
     }
-    
+
+    // System.out.println("###stream="+stream);
+    // System.out.println("###output="+output);
+
     PhoneticFilter filter = new PhoneticFilter( 
         new IterTokenStream(stream.iterator()), enc, "text", inject );
     
     for( Token t : output ) {
       Token got = filter.next(t);
+      // System.out.println("##### got="+got);
+
       assertEquals( new String(t.termBuffer(), 0, t.termLength()), new String(got.termBuffer(), 0, got.termLength()));
     }
     assertNull( filter.next() );  // no more tokens