You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by gg...@apache.org on 2012/03/08 14:29:24 UTC

svn commit: r1298387 - in /commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language: Nysiis.java NysiisTest.java

Author: ggregory
Date: Thu Mar  8 13:29:24 2012
New Revision: 1298387

URL: http://svn.apache.org/viewvc?rev=1298387&view=rev
Log:
[CODEC-63] Implement NYSIIS. Adding Encoder class and test both in the test directory until the encoder is fully baked. This should make it easier to patch further.

Added:
    commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/Nysiis.java
    commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java

Added: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/Nysiis.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/Nysiis.java?rev=1298387&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/Nysiis.java (added)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/Nysiis.java Thu Mar  8 13:29:24 2012
@@ -0,0 +1,255 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.codec.language;
+
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+
+/**
+ * THIS CLASS LIVES IN THE TEST DIRECTORY UNTIL IT IS FULLY BAKED. 
+ * 
+ * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
+ * general purpose scheme to find words with similar phonemes.
+ * 
+ * <p>
+ * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
+ * </p>
+ * 
+ * @see <a href="http://en.wikipedia.org/wiki/NYSIIS">http://en.wikipedia.org/wiki/NYSIIS</a>
+ * @see <a href="http://www.dropby.com/NYSIIS.html">http://www.dropby.com/NYSIIS.html</a>
+ * @see Soundex
+ * @version $Id: Nysiis.java 669755 2008-06-20 01:21:52Z sebb $
+ */
+public class Nysiis implements StringEncoder {
+
+    private static final char[] CHARS_A = new char[] { 'A' }; // replacement for A, E, I, O, U
+    private static final char[] CHARS_AF = new char[] { 'A', 'F' }; // replacement for EV (checked before the vowel rule)
+    private static final char[] CHARS_C = new char[] { 'C' }; // replacement for K not followed by N
+    private static final char[] CHARS_FF = new char[] { 'F', 'F' }; // replacement for PH
+    private static final char[] CHARS_G = new char[] { 'G' }; // replacement for Q
+    private static final char[] CHARS_N = new char[] { 'N' }; // replacement for M
+    private static final char[] CHARS_NN = new char[] { 'N', 'N' }; // replacement for KN
+    private static final char[] CHARS_S = new char[] { 'S' }; // replacement for Z
+    private static final char[] CHARS_SSS = new char[] { 'S', 'S', 'S' }; // replacement for SCH
+    private static final char SPACE = ' '; // passed to the transcoder for window positions past the end of the name
+    private static final int TRUE_LENGTH = 6; // maximum code length when trueLength is enabled
+
+    /**
+     * Checks whether a character is one of the five upper-case vowels recognised by this encoder.
+     *
+     * @param c
+     *            the character to examine
+     * @return <code>true</code> for 'A', 'E', 'I', 'O' or 'U'; <code>false</code> for anything else
+     */
+    private static boolean isVowel(final char c) {
+        return "AEIOU".indexOf(c) >= 0;
+    }
+
+    /**
+     * Transcodes the character at the current position, given a four-character window [i-1, i, i+1, i+2] that the
+     * caller slides over the name. Window positions past the end of the name are passed in as a space.
+     *
+     * @param prev
+     *            the character before the current one
+     * @param curr
+     *            the character being transcoded
+     * @param next
+     *            the character following the current one
+     * @param aNext
+     *            the character two positions after the current one
+     * @return the replacement characters, to be written over the name starting at the current position
+     */
+    private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
+        switch (curr) {
+        case 'E':
+            // 1. EV -> AF; any other E is folded like the remaining vowels
+            return next == 'V' ? CHARS_AF : CHARS_A;
+        case 'A':
+        case 'I':
+        case 'O':
+        case 'U':
+            // A, E, I, O, U -> A
+            return CHARS_A;
+        // 2. Q -> G, Z -> S, M -> N
+        case 'Q':
+            return CHARS_G;
+        case 'Z':
+            return CHARS_S;
+        case 'M':
+            return CHARS_N;
+        case 'K':
+            // 3. KN -> NN else K -> C
+            return next == 'N' ? CHARS_NN : CHARS_C;
+        case 'S':
+            // 4. SCH -> SSS
+            if (next == 'C' && aNext == 'H') {
+                return CHARS_SSS;
+            }
+            break;
+        case 'P':
+            // PH -> FF
+            if (next == 'H') {
+                return CHARS_FF;
+            }
+            break;
+        case 'H':
+            // 5. H -> previous character, unless both neighbours are vowels
+            if (!(isVowel(prev) && isVowel(next))) {
+                return new char[] { prev };
+            }
+            break;
+        case 'W':
+            // 6. W -> previous character, if that one is a vowel
+            if (isVowel(prev)) {
+                return new char[] { prev };
+            }
+            break;
+        default:
+            break;
+        }
+        // anything that no rule claimed is kept unchanged
+        return new char[] { curr };
+    }
+
+    private final boolean trueLength; // true: codes are cut to at most TRUE_LENGTH characters
+
+    public Nysiis() { // default encoder: codes are truncated
+        this(true);
+    }
+
+    public Nysiis(boolean trueLength) { // pass false to get codes at their full length
+        this.trueLength = trueLength;
+    }
+
+    /**
+     * Encodes an Object using the NYSIIS algorithm. This method exists to satisfy the requirements of the Encoder
+     * interface: only {@link String} arguments can be encoded, and the actual work is delegated to
+     * {@link #nysiis(String)}.
+     *
+     * @param pObject
+     *            the object to encode; must be a {@link String}
+     * @return the NYSIIS code of the supplied String
+     * @throws EncoderException
+     *             if the supplied object is not a {@link String}; because <code>null</code> is not an instance of
+     *             String, a <code>null</code> argument is rejected as well, unlike in {@link #encode(String)}
+     *             where <code>null</code> is passed through
+     */
+    public Object encode(Object pObject) throws EncoderException {
+        if (pObject instanceof String) {
+            return this.nysiis((String) pObject);
+        }
+        throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
+    }
+
+    /**
+     * Encodes a String using the NYSIIS algorithm. The call is forwarded unchanged to {@link #nysiis(String)},
+     * which does the actual work.
+     *
+     * @param pString
+     *            the String to encode; may be <code>null</code>
+     * @return the NYSIIS code of the supplied String: <code>null</code> when the argument is <code>null</code>, and
+     *         an empty String when nothing is left of the argument after cleaning
+     */
+    public String encode(String pString) {
+        return nysiis(pString);
+    }
+
+    public boolean isTrueLength() { // tells whether codes are truncated to TRUE_LENGTH characters
+        return trueLength;
+    }
+
+    /**
+     * Retrieves the NYSIIS code for a given String object: the name is cleaned like in Soundex, its first and last
+     * characters are translated, the rest is transcoded one position at a time, and the key is trimmed at the end.
+     *
+     * @param str
+     *            String to encode using the NYSIIS algorithm, may be <code>null</code>
+     * @return the NYSIIS code for the String supplied; <code>null</code> if <code>str</code> is <code>null</code>,
+     *         and the cleaned String itself if nothing is left after cleaning
+     */
+    public String nysiis(String str) {
+        if (str == null) {
+            return null;
+        }
+
+        // Use the same clean rules as Soundex
+        str = SoundexUtils.clean(str);
+        if (str.length() == 0) {
+            return str;
+        }
+
+        // Translate first characters of name:
+        // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
+        str = str.replaceFirst("^MAC", "MCC");
+        str = str.replaceFirst("^KN", "NN");
+        str = str.replaceFirst("^K", "C");
+        str = str.replaceFirst("^(PH|PF)", "FF");
+        str = str.replaceFirst("^SCH", "SSS");
+
+        // Translate last characters of name:
+        // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
+        str = str.replaceFirst("(EE|IE)$", "Y");
+        str = str.replaceFirst("(DT|RT|RD|NT|ND)$", "D");
+
+        // First character of key = first character of name.
+        final StringBuilder key = new StringBuilder(str.length());
+        key.append(str.charAt(0));
+
+        // Transcode remaining characters, incrementing by one character each time
+        final char[] chars = str.toCharArray();
+        final int len = chars.length;
+        for (int i = 1; i < len; i++) {
+            final char next = i < len - 1 ? chars[i + 1] : SPACE;
+            final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
+            final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
+            // write the replacement back so that the following window positions see the transcoded characters
+            System.arraycopy(transcoded, 0, chars, i, transcoded.length);
+            // only append the current char to the key if it is different from the last one
+            if (chars[i] != chars[i - 1]) {
+                key.append(chars[i]);
+            }
+        }
+
+        if (key.length() > 1) {
+            char lastChar = key.charAt(key.length() - 1);
+
+            // If last character is S, remove it.
+            if (lastChar == 'S') {
+                key.deleteCharAt(key.length() - 1);
+                lastChar = key.charAt(key.length() - 1);
+            }
+
+            if (key.length() > 2) {
+                final char last2Char = key.charAt(key.length() - 2);
+                // If last characters are AY, replace with Y.
+                if (last2Char == 'A' && lastChar == 'Y') {
+                    key.deleteCharAt(key.length() - 2);
+                }
+            }
+
+            // If last character is A, remove it - unless it is the only one left, e.g. "AS" after its S was removed.
+            if (lastChar == 'A' && key.length() > 1) {
+                key.deleteCharAt(key.length() - 1);
+            }
+        }
+
+        final String string = key.toString();
+        return this.isTrueLength() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
+    }
+
+}

Added: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java?rev=1298387&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java (added)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java Thu Mar  8 13:29:24 2012
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.codec.language;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+import org.apache.commons.codec.StringEncoderAbstractTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests {@link Nysiis}
+ * 
+ * @version $Id: NysiisTest.java 658834 2008-05-21 19:57:51Z niallp $
+ */
+public class NysiisTest extends StringEncoderAbstractTest {
+
+    @Override
+    protected StringEncoder createStringEncoder() {
+        return new Nysiis(); // default constructor, i.e. trueLength == true
+    }
+
+    protected StringEncoder createStringEncoder(boolean trueLength) {
+        return new Nysiis(trueLength); // lets a test choose truncated (true) or full-length (false) codes
+    }
+
+    private void encodeAll(String[] strings, String expectedEncoding) throws EncoderException {
+        for (final String name : strings) { // every name must produce the same expected code
+            Assert.assertEquals("Problem with " + name, expectedEncoding, getStringEncoder().encode(name));
+        }
+    }
+
+    @Test
+    public void testDropBy() throws EncoderException {
+        // Each entry pairs a name with its expected code; codes are checked untruncated (trueLength == false).
+        final List<String[]> nameCodePairs = Arrays.asList(
+                new String[] { "MACINTOSH", "MCANT" },
+                new String[] { "KNUTH", "NAT" },
+                new String[] { "KOEHN", "CAN" },
+                new String[] { "PHILLIPSON", "FALAPSAN" },
+                new String[] { "PFEISTER", "FASTAR" },
+                new String[] { "MCKEE", "MCY" },
+                new String[] { "MACKIE", "MCY" },
+                new String[] { "HEITSCHMIDT", "HATSNAD" },
+                new String[] { "BART", "BAD" },
+                new String[] { "HURD", "HAD" },
+                new String[] { "HUNT", "HAD" },
+                new String[] { "WESTERLUND", "WASTARLAD" },
+                new String[] { "CASSTEVENS", "CASTAFAN" },
+                new String[] { "VASQUEZ", "VASG" },
+                new String[] { "FRAZIER", "FRASAR" },
+                new String[] { "BOWMAN", "BANAN" },
+                new String[] { "RICKERT", "RACAD" },
+                new String[] { "DEUTSCH", "DAT" },
+                new String[] { "WESTPHAL", "WASTFAL" },
+                new String[] { "SHRIVER", "SRAVAR" },
+                new String[] { "KUHL", "CAL" },
+                new String[] { "RAWSON", "RASAN" },
+                new String[] { "JILES", "JAL" },
+                new String[] { "CARRAWAY", "CARY" },
+                new String[] { "YAMADA", "YANAD" });
+
+        for (final String[] pair : nameCodePairs) {
+            Assert.assertEquals("Problem with " + pair[0], pair[1], createStringEncoder(false).encode(pair[0]));
+        }
+    }
+
+    /**
+     * Tests name/code pairs gathered from several sources on the internet; all codes are checked at full length.
+     *
+     * @throws EncoderException if thrown by the encoder
+     */
+    @Test
+    public void testOthers() throws EncoderException {
+        List<String[]> testValues =
+                Arrays.asList(
+                        // http://www.dropby.com/indexLF.html?content=/NYSIIS.html
+                        // 1. Transcode first characters of name
+                        new String[] { "MACINTOSH", "MCANT" },
+                        //new String[] { "KNUTH", "NNATH" }, // Original: NNAT; modified: NATH
+                        //new String[] { "KOEHN", "C" },
+                        //new String[] { "PHILLIPSON", "FFALAP" },
+                        //new String[] { "PFEISTER", "FFASTA" },
+                        //new String[] { "SCHOENHOEFT", "SSANAF" },
+                        // http://www.dropby.com/indexLF.html?content=/NYSIIS.html
+                        // 2. Transcode last characters of name:
+                        new String[] { "MCKEE", "MCY" },
+                        new String[] { "MACKIE", "MCY" },
+                        new String[] { "HEITSCHMIDT", "HATSNAD" },
+                        new String[] { "BART", "BAD" },
+                        new String[] { "HURD", "HAD" },
+                        new String[] { "HUNT", "HAD" },
+                        new String[] { "WESTERLUND", "WASTARLAD" },
+                        // http://www.dropby.com/indexLF.html?content=/NYSIIS.html
+                        // 4. Transcode remaining characters by following these rules, incrementing by one character each time:
+                        new String[] { "CASSTEVENS", "CASTAFAN" },
+                        new String[] { "VASQUEZ", "VASG" },
+                        new String[] { "FRAZIER", "FRASAR" },
+                        new String[] { "BOWMAN", "BANAN" },
+                        new String[] { "MCKNIGHT", "MCNAGT" },
+                        new String[] { "RICKERT", "RACAD" },
+                        //new String[] { "DEUTSCH", "DATS" },
+                        new String[] { "WESTPHAL", "WASTFAL" },
+                        //new String[] { "SHRIVER", "SHRAVA" },
+                        //new String[] { "KUHL", "C" },
+                        new String[] { "RAWSON", "RASAN" },
+                        // If last character is S, remove it
+                        new String[] { "JILES", "JAL" },
+                        //new String[] { "CARRAWAY", "CARAY" },
+                        new String[] { "YAMADA", "YANAD" },
+                        // Others
+                        new String[] { "O'Daniel", "ODANAL" },
+                        new String[] { "O'Donnel", "ODANAL" },
+                        new String[] { "Cory", "CARY" },
+                        new String[] { "Corey", "CARY" },
+                        new String[] { "Kory", "CARY" },
+                        // Z -> S, with the doubled letter collapsed
+                        new String[] { "FUZZY", "FASY" });
+
+        for (String[] arr : testValues) {
+            Assert.assertEquals("Problem with " + arr[0], arr[1], createStringEncoder(false).encode(arr[0]));
+        }
+    }
+
+    @Test
+    public void testBran() throws EncoderException { // vowel folding plus the W-after-vowel rule (Brown)
+        encodeAll(new String[] { "Brian", "Brown", "Brun" }, "BRAN");
+    }
+
+    @Test
+    public void testCap() throws EncoderException { // leading K -> C (Kipp) and trailing-A removal (Cope)
+        this.encodeAll(new String[] { "Capp", "Cope", "Copp", "Kipp" }, "CAP");
+    }
+
+    @Test
+    public void testDan() throws EncoderException { // vowels fold to A, repeats collapse, a trailing A is dropped
+        this.encodeAll(new String[] { "Dane", "Dean", "Dionne" }, "DAN");
+    }
+
+    @Test
+    public void testDad() throws EncoderException {
+        // Data Quality and Record Linkage Techniques, p. 121, gives DAN for this name, but the
+        // trailing NT -> D translation yields DAD, which was also verified with dropby.com
+        this.encodeAll(new String[] { "Dent" }, "DAD");
+    }
+
+    @Test
+    public void testSnat() throws EncoderException { // M -> N; the leading SCH of Schmit collapses to one S
+        this.encodeAll(new String[] { "Smith", "Schmit" }, "SNAT");
+    }
+
+    @Test
+    public void testSnad() throws EncoderException {
+        // Data Quality and Record Linkage Techniques, p. 121, gives SNAT for this name, but the
+        // trailing DT -> D translation yields SNAD
+        this.encodeAll(new String[] { "Schmidt" }, "SNAD");
+    }
+
+    @Test
+    public void testFal() throws EncoderException { // covers the leading PH -> FF translation
+        this.encodeAll(new String[] { "Phil" }, "FAL");
+    }
+
+    @Test
+    public void testTranan() throws EncoderException { // both spellings must yield the same six-character code
+        this.encodeAll(new String[] { "Trueman", "Truman" }, "TRANAN");
+    }
+
+}