You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by gg...@apache.org on 2012/03/08 14:29:24 UTC
svn commit: r1298387 - in
/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language:
Nysiis.java NysiisTest.java
Author: ggregory
Date: Thu Mar 8 13:29:24 2012
New Revision: 1298387
URL: http://svn.apache.org/viewvc?rev=1298387&view=rev
Log:
[CODEC-63] Implement NYSIIS. Adding Encoder class and test both in the test directory until the encoder is fully baked. This should make it easier to patch further.
Added:
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/Nysiis.java
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java
Added: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/Nysiis.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/Nysiis.java?rev=1298387&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/Nysiis.java (added)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/Nysiis.java Thu Mar 8 13:29:24 2012
@@ -0,0 +1,255 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.codec.language;
+
+import java.util.regex.Pattern;
+
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+
+/**
+ * THIS CLASS LIVES IN THE TEST DIRECTORY UNTIL IT IS FULLY BAKED.
+ *
+ * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
+ * general purpose scheme to find words with similar phonemes.
+ *
+ * <p>
+ * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
+ * </p>
+ *
+ * <p>
+ * By default the generated key is truncated to {@value #TRUE_LENGTH} characters; use {@link #Nysiis(boolean)} with
+ * <code>false</code> to obtain the untruncated key.
+ * </p>
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/NYSIIS">http://en.wikipedia.org/wiki/NYSIIS</a>
+ * @see <a href="http://www.dropby.com/NYSIIS.html">http://www.dropby.com/NYSIIS.html</a>
+ * @see Soundex
+ * @version $Id: Nysiis.java 669755 2008-06-20 01:21:52Z sebb $
+ */
+public class Nysiis implements StringEncoder {
+
+    // Replacement sequences returned by transcodeRemaining(). They are shared and must never be modified.
+    private static final char[] CHARS_A = new char[] { 'A' };
+    private static final char[] CHARS_AF = new char[] { 'A', 'F' };
+    private static final char[] CHARS_C = new char[] { 'C' };
+    private static final char[] CHARS_FF = new char[] { 'F', 'F' };
+    private static final char[] CHARS_G = new char[] { 'G' };
+    private static final char[] CHARS_N = new char[] { 'N' };
+    private static final char[] CHARS_NN = new char[] { 'N', 'N' };
+    private static final char[] CHARS_S = new char[] { 'S' };
+    private static final char[] CHARS_SSS = new char[] { 'S', 'S', 'S' };
+
+    // Prefix and suffix rewrites, compiled once instead of on every call to nysiis().
+    private static final Pattern PAT_MAC = Pattern.compile("^MAC");
+    private static final Pattern PAT_KN = Pattern.compile("^KN");
+    private static final Pattern PAT_K = Pattern.compile("^K");
+    private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)");
+    private static final Pattern PAT_SCH = Pattern.compile("^SCH");
+    private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$");
+    private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
+
+    /** Placeholder used for the look-ahead characters once the end of the name has been reached. */
+    private static final char SPACE = ' ';
+
+    /** Maximum key length when {@link #isTrueLength()} is <code>true</code>. */
+    private static final int TRUE_LENGTH = 6;
+
+    /**
+     * Tests if the given character is a vowel.
+     *
+     * @param c
+     *            the character to test
+     * @return <code>true</code> if the character is one of the upper-case vowels A, E, I, O, U, <code>false</code>
+     *         otherwise
+     */
+    private static boolean isVowel(final char c) {
+        return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
+    }
+
+    /**
+     * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at
+     * a time: [i-1, i, i+1, i+2].
+     *
+     * @param prev
+     *            the previous character
+     * @param curr
+     *            the current character
+     * @param next
+     *            the next character
+     * @param aNext
+     *            the after next character
+     * @return a transcoded array of characters, starting from the current position; the array may be one of the shared
+     *         constants and must not be modified by the caller
+     */
+    private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
+        // 1. EV -> AF
+        if (curr == 'E' && next == 'V') {
+            return CHARS_AF;
+        }
+
+        // A, E, I, O, U -> A
+        if (isVowel(curr)) {
+            return CHARS_A;
+        }
+
+        // 2. Q -> G, Z -> S, M -> N
+        if (curr == 'Q') {
+            return CHARS_G;
+        } else if (curr == 'Z') {
+            return CHARS_S;
+        } else if (curr == 'M') {
+            return CHARS_N;
+        }
+
+        // 3. KN -> NN else K -> C
+        if (curr == 'K') {
+            if (next == 'N') {
+                return CHARS_NN;
+            } else {
+                return CHARS_C;
+            }
+        }
+
+        // 4. SCH -> SSS
+        if (curr == 'S' && next == 'C' && aNext == 'H') {
+            return CHARS_SSS;
+        }
+
+        // PH -> FF
+        if (curr == 'P' && next == 'H') {
+            return CHARS_FF;
+        }
+
+        // 5. H -> If previous or next is a non vowel, previous.
+        if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
+            return new char[] { prev };
+        }
+
+        // 6. W -> If previous is vowel, previous.
+        if (curr == 'W' && isVowel(prev)) {
+            return new char[] { prev };
+        }
+
+        return new char[] { curr };
+    }
+
+    /** When <code>true</code>, generated keys are cut to at most {@link #TRUE_LENGTH} characters. */
+    private final boolean trueLength;
+
+    /**
+     * Creates an encoder that truncates keys to {@value #TRUE_LENGTH} characters.
+     */
+    public Nysiis() {
+        this(true);
+    }
+
+    /**
+     * Creates an encoder with explicit control over key truncation.
+     *
+     * @param trueLength
+     *            <code>true</code> to truncate keys to {@value #TRUE_LENGTH} characters, <code>false</code> to return
+     *            the complete key
+     */
+    public Nysiis(boolean trueLength) {
+        this.trueLength = trueLength;
+    }
+
+    /**
+     * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the
+     * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type
+     * {@link String}.
+     *
+     * @param pObject
+     *            Object to encode
+     * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String.
+     * @throws EncoderException
+     *             if the parameter supplied is not of a {@link String}; this includes <code>null</code>
+     */
+    public Object encode(Object pObject) throws EncoderException {
+        if (!(pObject instanceof String)) {
+            throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
+        }
+        return this.nysiis((String) pObject);
+    }
+
+    /**
+     * Encodes a String using the NYSIIS algorithm.
+     *
+     * @param pString
+     *            A String object to encode
+     * @return A Nysiis code corresponding to the String supplied, or <code>null</code> if the argument is
+     *         <code>null</code>
+     */
+    public String encode(String pString) {
+        return this.nysiis(pString);
+    }
+
+    /**
+     * Indicates whether this encoder truncates keys to {@value #TRUE_LENGTH} characters.
+     *
+     * @return <code>true</code> if keys are truncated, <code>false</code> if the complete key is returned
+     */
+    public boolean isTrueLength() {
+        return trueLength;
+    }
+
+    /**
+     * Retrieves the NYSIIS code for a given String object.
+     *
+     * @param str
+     *            String to encode using the NYSIIS algorithm
+     * @return A NYSIIS code for the String supplied; <code>null</code> if <code>str</code> is <code>null</code>, and
+     *         the cleaned (empty) string if nothing is left after cleaning
+     */
+    public String nysiis(String str) {
+        if (str == null) {
+            return null;
+        }
+
+        // Use the same clean rules as Soundex
+        str = SoundexUtils.clean(str);
+
+        if (str.length() == 0) {
+            return str;
+        }
+
+        // Translate first characters of name:
+        // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
+        str = PAT_MAC.matcher(str).replaceFirst("MCC");
+        str = PAT_KN.matcher(str).replaceFirst("NN");
+        str = PAT_K.matcher(str).replaceFirst("C");
+        str = PAT_PH_PF.matcher(str).replaceFirst("FF");
+        str = PAT_SCH.matcher(str).replaceFirst("SSS");
+
+        // Translate last characters of name:
+        // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
+        str = PAT_EE_IE.matcher(str).replaceFirst("Y");
+        str = PAT_DT_ETC.matcher(str).replaceFirst("D");
+
+        // First character of key = first character of name.
+        // The builder never leaves this method, so the unsynchronized StringBuilder is sufficient.
+        final StringBuilder key = new StringBuilder(str.length());
+        key.append(str.charAt(0));
+
+        // Transcode remaining characters, incrementing by one character each time
+        final char[] chars = str.toCharArray();
+        final int len = chars.length;
+
+        for (int i = 1; i < len; i++) {
+            final char next = i < len - 1 ? chars[i + 1] : SPACE;
+            final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
+            final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
+            // Multi-character results are written back in place so that later iterations see the transcoded
+            // characters. A result longer than one character is only returned when the matching look-ahead
+            // characters exist, so the copy never runs past the end of the array.
+            System.arraycopy(transcoded, 0, chars, i, transcoded.length);
+
+            // only append the current char to the key if it is different from the last one
+            if (chars[i] != chars[i - 1]) {
+                key.append(chars[i]);
+            }
+        }
+
+        if (key.length() > 1) {
+            char lastChar = key.charAt(key.length() - 1);
+
+            // If last character is S, remove it.
+            if (lastChar == 'S') {
+                key.deleteCharAt(key.length() - 1);
+                lastChar = key.charAt(key.length() - 1);
+            }
+
+            if (key.length() > 2) {
+                final char last2Char = key.charAt(key.length() - 2);
+                // If last characters are AY, replace with Y.
+                if (last2Char == 'A' && lastChar == 'Y') {
+                    key.deleteCharAt(key.length() - 2);
+                }
+            }
+
+            // If last character is A, remove it.
+            if (lastChar == 'A') {
+                key.deleteCharAt(key.length() - 1);
+            }
+        }
+
+        final String string = key.toString();
+        return this.isTrueLength() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
+    }
+
+}
Added: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java?rev=1298387&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java (added)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java Thu Mar 8 13:29:24 2012
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.codec.language;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+import org.apache.commons.codec.StringEncoderAbstractTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests {@link Nysiis}
+ *
+ * @version $Id: NysiisTest.java 658834 2008-05-21 19:57:51Z niallp $
+ */
+public class NysiisTest extends StringEncoderAbstractTest {
+
+    @Override
+    protected StringEncoder createStringEncoder() {
+        return new Nysiis();
+    }
+
+    /**
+     * Creates a {@link Nysiis} encoder with the given truncation setting.
+     *
+     * @param trueLength
+     *            value passed straight to {@link Nysiis#Nysiis(boolean)}
+     * @return a new encoder
+     */
+    protected StringEncoder createStringEncoder(boolean trueLength) {
+        return new Nysiis(trueLength);
+    }
+
+    /**
+     * Asserts that every given name is encoded to the same key by the encoder obtained from
+     * <code>getStringEncoder()</code>.
+     *
+     * @param strings
+     *            the names to encode
+     * @param expectedEncoding
+     *            the key each name must produce
+     * @throws EncoderException
+     *             if the encoder reports a failure
+     */
+    private void encodeAll(String[] strings, String expectedEncoding) throws EncoderException {
+        for (final String name : strings) {
+            final String failureMessage = "Problem with " + name;
+            Assert.assertEquals(failureMessage, expectedEncoding, getStringEncoder().encode(name));
+        }
+    }
+
+    /**
+     * Asserts a table of <code>{ name, expected key }</code> pairs against an encoder created with
+     * <code>createStringEncoder(false)</code>; a fresh encoder is created for every pair.
+     *
+     * @param expectations
+     *            the pairs to check; element 0 is the name, element 1 the expected key
+     * @throws EncoderException
+     *             if the encoder reports a failure
+     */
+    private void assertUntruncatedEncodings(List<String[]> expectations) throws EncoderException {
+        for (final String[] pair : expectations) {
+            final String name = pair[0];
+            final String expectedKey = pair[1];
+            Assert.assertEquals("Problem with " + name, expectedKey, createStringEncoder(false).encode(name));
+        }
+    }
+
+    @Test
+    public void testDropBy() throws EncoderException {
+        assertUntruncatedEncodings(
+                Arrays.asList(
+                        new String[] { "MACINTOSH", "MCANT" },
+                        new String[] { "KNUTH", "NAT" },
+                        new String[] { "KOEHN", "CAN" },
+                        new String[] { "PHILLIPSON", "FALAPSAN" },
+                        new String[] { "PFEISTER", "FASTAR" },
+                        new String[] { "MCKEE", "MCY" },
+                        new String[] { "MACKIE", "MCY" },
+                        new String[] { "HEITSCHMIDT", "HATSNAD" },
+                        new String[] { "BART", "BAD" },
+                        new String[] { "HURD", "HAD" },
+                        new String[] { "HUNT", "HAD" },
+                        new String[] { "WESTERLUND", "WASTARLAD" },
+                        new String[] { "CASSTEVENS", "CASTAFAN" },
+                        new String[] { "VASQUEZ", "VASG" },
+                        new String[] { "FRAZIER", "FRASAR" },
+                        new String[] { "BOWMAN", "BANAN" },
+                        new String[] { "RICKERT", "RACAD" },
+                        new String[] { "DEUTSCH", "DAT" },
+                        new String[] { "WESTPHAL", "WASTFAL" },
+                        new String[] { "SHRIVER", "SRAVAR" },
+                        new String[] { "KUHL", "CAL" },
+                        new String[] { "RAWSON", "RASAN" },
+                        new String[] { "JILES", "JAL" },
+                        new String[] { "CARRAWAY", "CARY" },
+                        new String[] { "YAMADA", "YANAD" }));
+    }
+
+    /**
+     * Checks name/key pairs collected from several online sources.
+     *
+     * <p>
+     * The commented-out entries were already disabled in the original commit; presumably their published keys differ
+     * from what this implementation produces (TODO confirm).
+     * </p>
+     *
+     * @throws EncoderException
+     *             if the encoder reports a failure
+     */
+    @Test
+    public void testOthers() throws EncoderException {
+        assertUntruncatedEncodings(
+                Arrays.asList(
+                        // http://www.dropby.com/indexLF.html?content=/NYSIIS.html
+                        // 1. Transcode first characters of name
+                        new String[] { "MACINTOSH", "MCANT" },
+                        //new String[] { "KNUTH", "NNATH" }, // Original: NNAT; modified: NATH
+                        //new String[] { "KOEHN", "C" },
+                        //new String[] { "PHILLIPSON", "FFALAP" },
+                        //new String[] { "PFEISTER", "FFASTA" },
+                        //new String[] { "SCHOENHOEFT", "SSANAF" },
+                        // http://www.dropby.com/indexLF.html?content=/NYSIIS.html
+                        // 2.Transcode last characters of name:
+                        new String[] { "MCKEE", "MCY" },
+                        new String[] { "MACKIE", "MCY" },
+                        new String[] { "HEITSCHMIDT", "HATSNAD" },
+                        new String[] { "BART", "BAD" },
+                        new String[] { "HURD", "HAD" },
+                        new String[] { "HUNT", "HAD" },
+                        new String[] { "WESTERLUND", "WASTARLAD" },
+                        // http://www.dropby.com/indexLF.html?content=/NYSIIS.html
+                        // 4. Transcode remaining characters by following these rules, incrementing by one character each time:
+                        new String[] { "CASSTEVENS", "CASTAFAN" },
+                        new String[] { "VASQUEZ", "VASG" },
+                        new String[] { "FRAZIER", "FRASAR" },
+                        new String[] { "BOWMAN", "BANAN" },
+                        new String[] { "MCKNIGHT", "MCNAGT" },
+                        new String[] { "RICKERT", "RACAD" },
+                        //new String[] { "DEUTSCH", "DATS" },
+                        new String[] { "WESTPHAL", "WASTFAL" },
+                        //new String[] { "SHRIVER", "SHRAVA" },
+                        //new String[] { "KUHL", "C" },
+                        new String[] { "RAWSON", "RASAN" },
+                        // If last character is S, remove it
+                        new String[] { "JILES", "JAL" },
+                        //new String[] { "CARRAWAY", "CARAY" },
+                        new String[] { "YAMADA", "YANAD" },
+                        // Others
+                        new String[] { "O'Daniel", "ODANAL" },
+                        new String[] { "O'Donnel", "ODANAL" },
+                        new String[] { "Cory", "CARY" },
+                        new String[] { "Corey", "CARY" },
+                        new String[] { "Kory", "CARY" },
+                        //
+                        new String[] { "FUZZY", "FASY" }));
+    }
+
+    @Test
+    public void testBran() throws EncoderException {
+        encodeAll(new String[] { "Brian", "Brown", "Brun" }, "BRAN");
+    }
+
+    @Test
+    public void testCap() throws EncoderException {
+        encodeAll(new String[] { "Capp", "Cope", "Copp", "Kipp" }, "CAP");
+    }
+
+    @Test
+    public void testDan() throws EncoderException {
+        encodeAll(new String[] { "Dane", "Dean", "Dionne" }, "DAN");
+    }
+
+    @Test
+    public void testDad() throws EncoderException {
+        // "Data Quality and Record Linkage Techniques", p. 121, lists DAN for this name;
+        // DAD is the expected key here and was also verified with dropby.com
+        encodeAll(new String[] { "Dent" }, "DAD");
+    }
+
+    @Test
+    public void testSnat() throws EncoderException {
+        encodeAll(new String[] { "Smith", "Schmit" }, "SNAT");
+    }
+
+    @Test
+    public void testSnad() throws EncoderException {
+        // "Data Quality and Record Linkage Techniques", p. 121, lists SNAT for this name;
+        // SNAD is the expected key here
+        encodeAll(new String[] { "Schmidt" }, "SNAD");
+    }
+
+    @Test
+    public void testFal() throws EncoderException {
+        encodeAll(new String[] { "Phil" }, "FAL");
+    }
+
+    @Test
+    public void testTranan() throws EncoderException {
+        encodeAll(new String[] { "Trueman", "Truman" }, "TRANAN");
+    }
+
+}