You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by ry...@apache.org on 2008/10/18 20:38:24 UTC

svn commit: r705903 - in /lucene/solr/trunk: ./ example/solr/conf/ src/java/org/apache/solr/analysis/ src/test/org/apache/solr/analysis/

Author: ryan
Date: Sat Oct 18 11:38:24 2008
New Revision: 705903

URL: http://svn.apache.org/viewvc?rev=705903&view=rev
Log:
SOLR-813: Adding DoubleMetaphone Filter and Factory

Added:
    lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java
    lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java
Modified:
    lucene/solr/trunk/CHANGES.txt
    lucene/solr/trunk/example/solr/conf/schema.xml

Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=705903&r1=705902&r2=705903&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Sat Oct 18 11:38:24 2008
@@ -43,7 +43,7 @@
     See http://lucene.apache.org/java/2_3_2/api/org/apache/lucene/index/IndexDeletionPolicy.html
     (yonik, Noble Paul, Akshay Ukey via shalin)
  
- 3. SOLR-657: Replace deprecated calls with the non-deprecated equivalents
+ 3. SOLR-657: Replace many deprecated calls with non-deprecated equivalents
     (Lars Kotthoff via ryan)
 
  4. SOLR-658: Allow Solr to load index from arbitrary directory in dataDir
@@ -56,6 +56,10 @@
  6. SOLR-670: Add support for rollbacks in UpdateHandler. This allows user to rollback all changes
     since the last commit. (Noble Paul, koji via shalin)
     
+ 7. SOLR-813: Adding DoubleMetaphone Filter and Factory.  Similar to the PhoneticFilter, 
+    but this uses DoubleMetaphone specific calls (including alternate encoding)
+    (Todd Feak via ryan)   
+
 
 Optimizations
 ----------------------

Modified: lucene/solr/trunk/example/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/example/solr/conf/schema.xml?rev=705903&r1=705902&r2=705903&view=diff
==============================================================================
--- lucene/solr/trunk/example/solr/conf/schema.xml (original)
+++ lucene/solr/trunk/example/solr/conf/schema.xml Sat Oct 18 11:38:24 2008
@@ -247,6 +247,14 @@
         />
       </analyzer>
     </fieldType>
+    
+    <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+      </analyzer>
+    </fieldtype> 
+    
 
     <!-- since fields of this type are by default not stored or indexed, any data added to 
          them will be ignored outright 

Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java?rev=705903&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java Sat Oct 18 11:38:24 2008
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.LinkedList;
+
+import org.apache.commons.codec.language.DoubleMetaphone;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Token filter that encodes each term with the Double Metaphone algorithm and
+ * emits the primary code plus the alternate code when it differs from the primary.
+ * <p>
+ * With <code>inject</code> set, every original token is passed through and its
+ * codes follow it with a position increment of 0. Without it, only the codes are
+ * emitted and tokens that produce no code are dropped.
+ */
+public class DoubleMetaphoneFilter extends TokenFilter {
+
+  private static final String TOKEN_TYPE = "DoubleMetaphone";
+
+  /** Tokens already produced for the current input token but not yet returned. */
+  private final LinkedList<Token> remainingTokens = new LinkedList<Token>();
+  private final DoubleMetaphone encoder = new DoubleMetaphone();
+  private final boolean inject;
+
+  /**
+   * @param input         stream whose terms are encoded
+   * @param maxCodeLength handed to the encoder's <code>setMaxCodeLen</code>
+   * @param inject        whether the original tokens are kept in the output
+   */
+  protected DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
+    super(input);
+    this.encoder.setMaxCodeLen(maxCodeLength);
+    this.inject = inject;
+  }
+
+  /**
+   * Returns the next output token, or null once the input is exhausted.
+   *
+   * @param in reusable token handed on to the wrapped stream
+   */
+  @Override
+  public final Token next(Token in) throws IOException {
+    if (!remainingTokens.isEmpty()) {
+      return remainingTokens.removeFirst();
+    }
+
+    // Sum of the position increments of tokens dropped since the last returned
+    // token. Only grows when inject is false; injected tokens are never dropped.
+    int skippedIncrements = 0;
+
+    for (Token t = input.next(in); t != null; t = input.next(in)) {
+      String v = new String(t.termBuffer(), 0, t.termLength());
+      // The encoder returns null (not "") for an empty or whitespace-only term.
+      String primary = encoder.doubleMetaphone(v);
+      String alternate = encoder.doubleMetaphone(v, true);
+      boolean hasPrimary = primary != null && primary.length() > 0;
+      boolean hasAlternate = alternate != null && alternate.length() > 0
+          && !alternate.equals(primary);
+
+      if (!hasPrimary && !hasAlternate) {
+        if (inject) {
+          // Nothing to stack on it: pass the original through untouched.
+          return t;
+        }
+        // Read the increment now; t may be the reusable token that the next
+        // input.next(in) call overwrites.
+        skippedIncrements += t.getPositionIncrement();
+        continue;
+      }
+
+      // The first token emitted for t carries the position, the rest stack on it.
+      int increment = 0;
+      if (inject) {
+        remainingTokens.addLast(t);
+      } else {
+        increment = t.getPositionIncrement() + skippedIncrements;
+      }
+      if (hasPrimary) {
+        remainingTokens.addLast(phoneticToken(t, primary, increment));
+        increment = 0;
+      }
+      if (hasAlternate) {
+        remainingTokens.addLast(phoneticToken(t, alternate, increment));
+      }
+      return remainingTokens.removeFirst();
+    }
+
+    return null;
+  }
+
+  /** Clones source and overwrites its term, type and position increment. */
+  private Token phoneticToken(Token source, String code, int positionIncrement) {
+    Token token = (Token) source.clone();
+    token.setPositionIncrement(positionIncrement);
+    token.setType(TOKEN_TYPE);
+    token.setTermBuffer(code);
+    return token;
+  }
+}

Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java?rev=705903&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java Sat Oct 18 11:38:24 2008
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+
+public class DoubleMetaphoneFilterFactory extends BaseTokenFilterFactory 
+{
+  public static final String INJECT = "inject"; 
+  public static final String MAX_CODE_LENGTH = "maxCodeLength"; 
+
+  public static final int DEFAULT_MAX_CODE_LENGTH = 4;
+
+  private boolean inject = true;
+  private int maxCodeLength = DEFAULT_MAX_CODE_LENGTH;
+
+  @Override
+  public void init(Map<String, String> args) {
+    super.init(args);
+
+    if (args.get(INJECT) != null) {
+      inject = Boolean.getBoolean(args.get(INJECT));
+    }
+
+    if (args.get(MAX_CODE_LENGTH) != null) {
+      maxCodeLength = Integer.parseInt(args.get(MAX_CODE_LENGTH));
+    }
+  }
+
+  public DoubleMetaphoneFilter create(TokenStream input) {
+    return new DoubleMetaphoneFilter(input, maxCodeLength, inject);
+  }
+}

Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java?rev=705903&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java Sat Oct 18 11:38:24 2008
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
+
+/**
+ * Checks the filters built by {@link DoubleMetaphoneFilterFactory}, both with
+ * no init args and with "inject" and "maxCodeLength" set explicitly.
+ */
+public class DoubleMetaphoneFilterFactoryTest extends TestCase {
+
+  /** No init args: the original term comes out first, then a 4-character code. */
+  public void testDefaults() throws Exception {
+    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
+    factory.init(new HashMap<String, String>());
+
+    TokenStream filteredStream = factory.create(new IterTokenStream("international"));
+    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
+
+    assertTerm(13, "international", filteredStream.next(new Token()));
+    assertTerm(4, "ANTR", filteredStream.next(new Token()));
+    assertNull(filteredStream.next(new Token()));
+  }
+
+  /** inject=false with maxCodeLength=8: only an 8-character code comes out. */
+  public void testSettingSizeAndInject() throws Exception {
+    Map<String, String> parameters = new HashMap<String, String>();
+    parameters.put("inject", "false");
+    parameters.put("maxCodeLength", "8");
+
+    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
+    factory.init(parameters);
+
+    TokenStream filteredStream = factory.create(new IterTokenStream("international"));
+    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
+
+    assertTerm(8, "ANTRNXNL", filteredStream.next(new Token()));
+    assertNull(filteredStream.next(new Token()));
+  }
+
+  /** Asserts the length of a token's term, then its text. */
+  private static void assertTerm(int expectedLength, String expectedText, Token token) {
+    assertEquals(expectedLength, token.termLength());
+    assertEquals(expectedText, new String(token.termBuffer(), 0, token.termLength()));
+  }
+}

Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java?rev=705903&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java Sat Oct 18 11:38:24 2008
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
+
+public class DoubleMetaphoneFilterTest extends TestCase {
+
+  public void testSize4FalseInject() throws Exception {
+    TokenStream stream = new IterTokenStream("international");
+    TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
+
+    Token token = filter.next(new Token());
+    assertEquals(4, token.termLength());
+    assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
+
+    assertNull(filter.next(new Token()));
+  }
+
+  public void testSize4TrueInject() throws Exception {
+    TokenStream stream = new IterTokenStream("international");
+    TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
+
+    Token token = filter.next(new Token());
+    assertEquals(13, token.termLength());
+    assertEquals("international", new String(token.termBuffer(), 0, token
+        .termLength()));
+
+    token = filter.next(new Token());
+    assertEquals(4, token.termLength());
+    assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
+
+    assertNull(filter.next(new Token()));
+  }
+
+  public void testAlternateInjectFalse() throws Exception {
+    TokenStream stream = new IterTokenStream("Kuczewski");
+    TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
+
+    Token token = filter.next(new Token());
+    assertEquals(4, token.termLength());
+    assertEquals("KSSK", new String(token.termBuffer(), 0, token.termLength()));
+
+    token = filter.next(new Token());
+    assertEquals(4, token.termLength());
+    assertEquals("KXFS", new String(token.termBuffer(), 0, token.termLength()));
+    assertNull(filter.next(new Token()));
+  }
+
+  public void testSize8FalseInject() throws Exception {
+    TokenStream stream = new IterTokenStream("international");
+    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
+
+    Token token = filter.next(new Token());
+    assertEquals(8, token.termLength());
+    assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
+        .termLength()));
+
+    assertNull(filter.next(new Token()));
+  }
+
+  public void testNonConvertableStringsWithInject() throws Exception {
+    TokenStream stream = new IterTokenStream(
+        new String[] { "12345", "#$%@#^%&" });
+    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
+
+    Token token = filter.next(new Token());
+    assertEquals(5, token.termLength());
+    assertEquals("12345", new String(token.termBuffer(), 0, token.termLength()));
+
+    token = filter.next(new Token());
+    assertEquals(8, token.termLength());
+    assertEquals("#$%@#^%&", new String(token.termBuffer(), 0, token
+        .termLength()));
+  }
+
+  public void testNonConvertableStringsWithoutInject() throws Exception {
+    TokenStream stream = new IterTokenStream(
+        new String[] { "12345", "#$%@#^%&" });
+    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
+
+    assertNull(filter.next(new Token()));
+    
+    // should have something after the stream
+    stream = new IterTokenStream(
+        new String[] { "12345", "#$%@#^%&", "hello" });
+    filter = new DoubleMetaphoneFilter(stream, 8, false);
+    assertNotNull(filter.next(new Token()));
+  }
+
+}