You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2014/11/11 19:05:24 UTC

svn commit: r1638234 - in /lucene/dev/branches/branch_5x: ./ lucene/ lucene/analysis/ lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/ lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/

Author: mikemccand
Date: Tue Nov 11 18:05:23 2014
New Revision: 1638234

URL: http://svn.apache.org/r1638234
Log:
LUCENE-4400: Add NYSIIS Apache commons phonetic codec

Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/lucene/   (props changed)
    lucene/dev/branches/branch_5x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_5x/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java
    lucene/dev/branches/branch_5x/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java
    lucene/dev/branches/branch_5x/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilterFactory.java

Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1638234&r1=1638233&r2=1638234&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Tue Nov 11 18:05:23 2014
@@ -65,6 +65,9 @@ New Features
 
 * LUCENE-6053: Add Serbian analyzer.  (Nikola Smolenski via Robert Muir, Mike McCandless)
 
+* LUCENE-4400: Add support for new NYSIIS Apache commons phonetic
+  codec (Thomas Neidhart via Mike McCandless)
+
 API Changes
 
 * LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and

Modified: lucene/dev/branches/branch_5x/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java?rev=1638234&r1=1638233&r2=1638234&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java Tue Nov 11 18:05:23 2014
@@ -18,8 +18,8 @@ package org.apache.lucene.analysis.phone
  */
 
 import java.io.IOException;
-import java.lang.reflect.Method;
 import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
 import java.util.HashMap;
 import java.util.Locale;
 import java.util.Map;
@@ -29,6 +29,7 @@ import org.apache.commons.codec.language
 import org.apache.commons.codec.language.ColognePhonetic;
 import org.apache.commons.codec.language.DoubleMetaphone;
 import org.apache.commons.codec.language.Metaphone;
+import org.apache.commons.codec.language.Nysiis;
 import org.apache.commons.codec.language.RefinedSoundex;
 import org.apache.commons.codec.language.Soundex;
 import org.apache.lucene.analysis.TokenStream;
@@ -46,8 +47,8 @@ import org.apache.lucene.analysis.util.T
  * This takes one required argument, "encoder", and the rest are optional:
  * <dl>
  *  <dt>encoder</dt><dd> required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex", "Caverphone" (v2.0),
- *  or "ColognePhonetic" (case insensitive). If encoder isn't one of these, it'll be resolved as a class name either by
- *  itself if it already contains a '.' or otherwise as in the same package as these others.</dd>
+ *  "ColognePhonetic" or "Nysiis" (case insensitive). If encoder isn't one of these, it'll be resolved as a class name
+ *  either by itself if it already contains a '.' or otherwise as in the same package as these others.</dd>
  *  <dt>inject</dt><dd> (default=true) add tokens to the stream with the offset=0</dd>
  *  <dt>maxCodeLength</dt><dd>The maximum length of the phonetic codes, as defined by the encoder. If an encoder doesn't
  *  support this then specifying this is an error.</dd>
@@ -82,6 +83,7 @@ public class PhoneticFilterFactory exten
     registry.put("RefinedSoundex".toUpperCase(Locale.ROOT), RefinedSoundex.class);
     registry.put("Caverphone".toUpperCase(Locale.ROOT), Caverphone2.class);
     registry.put("ColognePhonetic".toUpperCase(Locale.ROOT), ColognePhonetic.class);
+    registry.put("Nysiis".toUpperCase(Locale.ROOT), Nysiis.class);
   }
 
   final boolean inject; //accessed by the test

Modified: lucene/dev/branches/branch_5x/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java?rev=1638234&r1=1638233&r2=1638234&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java Tue Nov 11 18:05:23 2014
@@ -21,7 +21,12 @@ import java.io.IOException;
 import java.io.StringReader;
 
 import org.apache.commons.codec.Encoder;
-import org.apache.commons.codec.language.*;
+import org.apache.commons.codec.language.Caverphone2;
+import org.apache.commons.codec.language.DoubleMetaphone;
+import org.apache.commons.codec.language.Metaphone;
+import org.apache.commons.codec.language.Nysiis;
+import org.apache.commons.codec.language.RefinedSoundex;
+import org.apache.commons.codec.language.Soundex;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
@@ -59,6 +64,11 @@ public class TestPhoneticFilter extends 
           "TTA1111111", "Datha", "KLN1111111", "Carlene" });
     assertAlgorithm(new Caverphone2(), false, "Darda Karleen Datha Carlene",
         new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" });
+
+    assertAlgorithm(new Nysiis(), true, "aaa bbb ccc easgasg",
+        new String[] { "A", "aaa", "B", "bbb", "C", "ccc", "EASGAS", "easgasg" });
+    assertAlgorithm(new Nysiis(), false, "aaa bbb ccc easgasg",
+        new String[] { "A", "B", "C", "EASGAS" });
   }
 
   

Modified: lucene/dev/branches/branch_5x/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilterFactory.java?rev=1638234&r1=1638233&r2=1638234&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilterFactory.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilterFactory.java Tue Nov 11 18:05:23 2014
@@ -18,14 +18,12 @@ package org.apache.lucene.analysis.phone
  */
 
 import java.io.IOException;
-import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;
 
-import org.apache.commons.codec.language.Metaphone;
 import org.apache.commons.codec.language.Caverphone2;
+import org.apache.commons.codec.language.Metaphone;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.util.ClasspathResourceLoader;
@@ -164,6 +162,12 @@ public class TestPhoneticFilterFactory e
           "67", "Meir", "862", "Schmidt" });
     assertAlgorithm("ColognePhonetic", "false", "Meier Schmitt Meir Schmidt",
         new String[] { "67", "862", "67", "862" });
+    
+    assertAlgorithm("Nysiis", "true", "Macintosh Knuth Bart Hurd",
+        new String[] { "MCANT", "Macintosh", "NAT", "Knuth", 
+          "BAD", "Bart", "HAD", "Hurd" });
+    assertAlgorithm("Nysiis", "false", "Macintosh Knuth Bart Hurd",
+        new String[] { "MCANT", "NAT", "BAD", "HAD" });
   }
   
   /** Test that bogus arguments result in exception */