You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by to...@apache.org on 2003/02/03 16:00:12 UTC
cvs commit: jakarta-commons-sandbox/codec/src/test/org/apache/commons/codec/language TestDoubleMetaphone.java
tobrien 2003/02/03 07:00:12
Modified: codec/src/test/org/apache/commons/codec TestAll.java
Added: codec TODO
codec/src/java/org/apache/commons/codec/language
DoubleMetaphone.java
codec/src/test/org/apache/commons/codec/language
TestDoubleMetaphone.java
Log:
Added DoubleMetaphone and associated JUnit test
Revision Changes Path
1.1 jakarta-commons-sandbox/codec/TODO
Index: TODO
===================================================================
This is a list of action items to be finished in the [codec] project.
This TODO list reflects the current direction of development, and
should be updated by all committers whenever a known issue or task
is identified.
This TODO list should be periodically sync'd with the content on
http://nagoya.apache.org/wiki/apachewiki.cgi?CodecProjectPages - this
WIKI page is provided as a tool for volunteers to comment on the
current TODO list and to suggest tasks.
When a task in the TODO list is done, move the entry to the DONE list
below, and note who made the change and when.
** TODO List
* Add a Hex implementation
* Add a Rot13 implementation
* Move phonetic encoders into dedicated package.
* Add a Decoder interface
* Refactor Base64 to implement both Encoder and Decoder
* Documentation! Create Forrest documentation for Codec AFTER documentation has evolved in Wiki
* Integrate Patches:
** Patch submitted by Iulian Musat for Base64
** Add DoubleMetaphone and Nysiis implementation from KyleBurton
* DoubleMetaphone
** Modify DoubleMetaphone implementation - make it thread safe(r).
** Figure out why algorithm fails to properly code "bryce" and "maurice".
** DONE
2/3/03 - TOB - Integrated DoubleMetaphone and Test from Kyle Burton
2/2/03 - TOB - "language" package created to hold language and phonetic encodings
2/2/03 - TOB - All CRLF issues resolved in codec
1/31/03 - TOB - Patch submitted fixing CRLF problems in Soundex.java
1/31/03 - TOB - Patch submitted fixing CRLF problems in RefinedSoundex.java
1.1 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/language/DoubleMetaphone.java
Index: DoubleMetaphone.java
===================================================================
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001-2002 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Commons" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package org.apache.commons.codec.language;
import org.apache.commons.codec.Encoder;
/**
* A class to generate phonetic codings based on the double metaphone
* algorithm. This module is based on example code by Ed Parrish.
*
* <b>PLEASE NOTE:</b> This implementation is not thread-safe. Please
* see TODO list for [codec] - Tim O'Brien
*
* @see http://www.cse.ucsc.edu/~eparrish/toolbox/search.html
*
* @version $Revision: 1.1 $
* @author Ed Parish
* @author <a href="mortis@voicenet.com">Kyle R. Burton</a>
*/
public class DoubleMetaphone implements Encoder {
/** The current character position in the string being encoded. */
private int current;
/** The maximum size of the phonetic encoding to compute. */
private int encodeLimit = 4;
/** Buffer for the primary encoding */
private StringBuffer primary = new StringBuffer();
/** Buffer for the alternate encoding */
private StringBuffer alternate = new StringBuffer();
/** Holder for the input being parsed. */
private String input = null;
/**
* These structures are used to make the code easier to understand, modify,
* debug, and otherwise maintain.
*/
private final static char[] vowels = {'A', 'E', 'I', 'O', 'U', 'Y'};
private final static char[] AEOU = {'A', 'E', 'O', 'U'};
private final static char[] AO = "AO".toCharArray();
private final static char[] BDH = {'B', 'D', 'H'};
private final static char[] BFHLMNRVW_ = "BFHLMNRVW ".toCharArray();
private final static char[] BH = {'B', 'H'};
private final static char[] BKLMNSTZ = "LTKSNMBZ".toCharArray();
private final static char[] BP = "BP".toCharArray();
private final static char[] CGQ = {'C', 'G', 'Q'};
private final static char[] CGLRT = {'C', 'G', 'L', 'R', 'T'};
private final static char[] CKQ = {'C', 'K', 'Q'};
private final static char[] CX = "CX".toCharArray();
private final static char[] DT = "DT".toCharArray();
private final static char[] EI = {'E', 'I'};
private final static char[] EIY = {'E', 'I', 'Y'};
private final static char[] EHI = {'I', 'E', 'H'};
private final static char[] KLS = "KLS".toCharArray();
private final static char[] LMNW = "LMNW".toCharArray();
private final static char[] ST = {'S', 'T'};
private final static char[] SZ = "SZ".toCharArray();
private final static String[] AggiOggi = {"AGGI", "OGGI"};
private final static String[] AiOi = {"AI", "OI"};
private final static String[] AlleIllaIllo = {"ILLO", "ILLA", "ALLE"};
private final static String[] AmOm = {"OM", "AM"};
private final static String[] AsOs = {"AS", "OS"};
private final static String[] ArchitOrchesOrchid = {"ARCHIT", "ORCHES", "ORCHID"};
private final static String[] AuOu = {"AU", "OU"};
private final static String[] BacherMacher = {"BACHER", "MACHER"};
private final static String[] CeCiCy = {"CI", "CE", "CY"};
private final static String[] CeCi = {"CE", "CI"};
private final static String[] CiaCieCio = {"CIO", "CIE", "CIA"};
private final static String[] CkCgCq = {"CK", "CG", "CQ"};
private final static String[] DangerMangerRanger = {"DANGER", "RANGER", "MANGER"};
private final static String[] DdDt = {"DD", "DT"};
private final static String[] EauIau = {"IAU", "EAU"};
private final static String[] EbEiElEpErEsEyIbIlInIe = {"ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER"};
private final static String[] EdEmEnErOoUy = {"OO", "ER", "EN", "UY", "ED", "EM"};
private final static String[] EnEr = {"ER", "EN"};
private final static String[] EwskiEwskyOwskiOwsky = {"EWSKI", "EWSKY", "OWSKI", "OWSKY"};
private final static String[] GnKnPnPsWr = {"GN", "KN", "PN", "WR", "PS"};
private final static String[] HaracHaris = {"HARAC", "HARIS"};
private final static String[] HeimHoekHolmHolz = {"HEIM", "HOEK", "HOLM", "HOLZ"};
private final static String[] HemHiaHorHym = {"HOR", "HYM", "HIA", "HEM"};
private final static String[] IslYsl = {"ISL", "YSL"};
private final static String[] MaMe = {"ME", "MA"};
private final static String[] OgyRgy = {"RGY", "OGY"};
private final static String[] SiaSio = {"SIO", "SIA"};
private final static String[] TiaTch = {"TIA", "TCH"};
private final static String[] UcceeUcces = {"UCCEE", "UCCES"};
private final static String[] Van_Von_ = {"VAN ", "VON "};
private final static String[] WiczWitz = {"WICZ", "WITZ"};
private final static String[] ZaZiZo = {"ZO", "ZI", "ZA"};
/**
* Default constructor.
*/
public DoubleMetaphone() {
}
/**
* Parameterized constructor.
*/
public DoubleMetaphone( String in ) {
setInput( in );
}
/**
* Accessor for the primary encoding. The primary encoding will not be set
* until after encode is invoked with a non-null string.
* @return the primary encoding.
*/
public String getPrimary() {
return primary.toString();
}
/**
* Accessor for the primary encoding as a StringBuffer.
* @return the string buffer for the primary encoding
*/
public StringBuffer getPrimaryBuffer() {
return primary;
}
/**
* Accessor for the alternate encoding. The alternate encoding will not be
* set untill after encode is invoked with a non-null string.
* @return the alternate encoding.
*/
public String getAlternate() {
return alternate.toString();
}
/**
* Accessor for the alternate encoding as a StringBuffer.
* @return the string buffer for the alternate encoding
*/
public StringBuffer getAlternateBuffer() {
return alternate;
}
/**
* Accessor for the maximum encoding length for both the primary and
* alternate encodings. Once either encoding reaches this limit, the
* encoding loop will return.
* @return the encoding limit
*/
public int getEncodeLimit() {
return encodeLimit;
}
/**
* Accessor for the maximum encoding length for both the primary and
* alternate encodings. Once either encoding reaches this limit, the
* encoding loop will return.
* @return the encoding limit
*/
public boolean setEncodeLimit(int newLimit) {
if (newLimit < 1) {
throw new IllegalArgumentException("Error, limit [" + newLimit + "] must be a positive integer.");
}
encodeLimit = newLimit;
return true;
}
/**
* Accessor for storing the input to be encoded.
* @param in the input to be encoded.
*/
private void setInput(String in) {
if (in != null) {
input = in.toUpperCase() + " ";
} else {
input = "";
}
}
/**
* Append a phonetic encoded character to both the primary and alternate
* encodings.
* @param ch the character to append.
*/
private void add(char ch) {
add(ch, ch);
}
/**
* Append a phonetic encoded character to both the primary and alternate
* encodings.
* @param primaryChar the character to append to the primary encoding.
* @param alternateChar the character to append to the alternate encoding.
*/
private void add(char primaryChar, char alternateChar) {
primary.append(primaryChar);
alternate.append(alternateChar);
}
/**
* Is the character in the input string at the given index in the list of
* characters?
* @param index
* @param list
* @return true/false
*/
private boolean charAt(int index, char[] list) {
if (index < 0 || index >= input.length()) return false;
char value = input.charAt(index);
for (int i = 0; i < list.length; i++) {
if (value == list[i]) return true;
}
return false;
}
/**
* Is the string at the given starting index matches the given pattern.
* @param start the index where to begin the comparison
* @param length the number of characters to compare
* @param str the pattern string to be located
* @return true/false
*/
private boolean stringAt(int start, int length, String str) {
String[] list = new String[1];
list[0] = str;
return stringAt(start, length, list);
}
/**
* Is the string at the given starting index matches any of the given pattern
* strings.
* @param start the index where to begin the comparison
* @param length the number of characters to compare
* @param list the strings to search for.
* @return true/false
*/
private boolean stringAt(int start, int length, String[] list) {
if (length <= 0) return false;
for (int i = 0; i < list.length; i++) {
if (input.regionMatches(start, list[i], 0, length)) return true;
}
return false;
}
/**
* Test the character in the input string at index to see if it is a vowel.
* @param index the location of the character to test
* @return true/false
*/
private boolean isVowel(int index) {
return charAt(index, vowels);
}
/**
* Test the input string to see if it is likely to be categorizeable
* as Slavo-Germanic in nature. This effects some of the encoding
* descisions as far as the phonetic pronounciations of portions of
* the name.
* @return true/false
*/
private boolean isSlavoGermanic() {
if((input.indexOf('W') > -1) || (input.indexOf('K') > -1)
|| (input.indexOf("CZ") > -1) || (input.indexOf("WITZ") > -1)) {
return true;
}
return false;
}
/**
* Append the given coding to both the primary and alternate encodings.
* @param ch
* @param code
*/
private void addCode(char ch, char code) {
add(code);
current++;
if(input.charAt(current) == ch) current++;
}
/**
* Static version of encode that first constructs a new DoubleMetaphone
* object, and then invokes encode on it. Note that by using this method you
* are sacrificing the abilty to access the alternate encoding. Also, since
* this method merely creates a new DoubleMetaphone to handle the encoding,
* it is effectivly thread-safe.
*
* This method was originaly created to allow this encoder to be used as a
* Java Stored Procedure in Oracle.
*
* @param in the string to encode
* @return the encoded string
*/
public static String sencode( String in ) {
DoubleMetaphone dm = new DoubleMetaphone();
return dm.encode(in);
}
/**
* Encode the given string using the Double Metaphone algorithm. Double
* Metaphone produces two encodings, a primary and a secondary. The encode
* method returns the primary encoding. To access the secondary encoding,
* call getAlternate.
* @param in the input string to encode
* @return the primary encoding.
*/
public String encode( String in ) {
setInput(in);
return encode();
}
/**
* Encode the already set input string using the Double Metaphone algorithm.
* Double Metaphone produces two encodings, a primary and a secondary. The
* encode method returns the primary encoding. To access the secondary
* encoding, call getAlternate.
* @return the primary encoding.
*/
public String encode() {
if (input == null) return "";
primary.delete(0, primary.length());
alternate.delete(0, alternate.length());
int length = input.length();
if (length < 1) return "";
int last = length - 1; //zero based index
current = 0;
//skip these when at start of word
if (stringAt(0, 2, GnKnPnPsWr)) current++;
//Initial 'X' is pronounced 'Z' e.g. 'Xavier'
if(input.startsWith("X")) {
add('S'); //'Z' maps to 'S'
current++;
}
while (primary.length() < encodeLimit || alternate.length() < encodeLimit) {
if(current >= length) break;
// this is coded as a huge switch statement for performance
switch(input.charAt(current)) {
case 'A':
case 'E':
case 'I':
case 'O':
case 'U':
case 'Y':
if (current == 0) add('A'); // all init vowels map to 'A'
current++;
break;
case 'B':
// "-mb", e.g "dumb", already skipped over...
addCode('B', 'P');
break;
case '�':
add('S');
current++;
// Note: no doublecheck
break;
case 'C':
// various germanic
if((current > 1) && !isVowel(current - 2)
&& input.regionMatches(current - 1, "ACH", 0, 3)
&& (input.charAt(current + 2) != 'I'
&& input.charAt(current + 2) != 'E'
|| stringAt(current - 2, 6, BacherMacher) )) {
add('K');
current +=2;
break;
}
// special case 'caesar'
if (current == 0
&& input.regionMatches(current, "CAESAR", 0, 6)) {
add('S');
current +=2;
break;
}
//italian 'chianti'
if (input.regionMatches(current, "CHIA", 0, 4)) {
add('K');
current +=2;
break;
}
if (input.regionMatches(current, "CH", 0, 2)) {
//find 'michael'
if(current > 0
&& input.regionMatches(current, "CHAE", 0, 4)) {
add('K', 'X');
current +=2;
break;
}
// greek roots e.g. 'chemistry', 'chorus'
if (current == 0
&& (stringAt(current + 1, 5, HaracHaris)
|| stringAt((current + 1), 3, HemHiaHorHym))
&& !input.regionMatches(0, "CHORE", 0, 5)) {
add('K');
current +=2;
break;
}
// germanic, greek, or otherwise 'ch' for 'kh' sound
if ((stringAt(0, 4, Van_Von_)
|| input.regionMatches(0, "SCH ", 0, 3))
// 'architect' but not 'arch', 'orchestra', 'orchid'
|| stringAt(0, 6, ArchitOrchesOrchid)
|| charAt(current + 2, ST)
|| ((charAt(current - 1, AEOU)
|| current == 0)
// e.g. 'wachtler', 'wechsler', but not 'tichner'
&& charAt(current + 2, BFHLMNRVW_))) {
add('K');
} else {
if (current > 0) {
if (input.regionMatches(0, "MC", 0, 2)) {
// e.g. "McHugh"
add('K');
} else {
add('X', 'K');
}
} else {
add('X');
}
}
current +=2;
break;
}
// e.g. 'czerny'
if (input.regionMatches(current, "CZ", 0, 2)
&& !input.regionMatches(current - 2, "WICZ", 0, 4)) {
add('S', 'X');
current += 2;
break;
}
// e.g. 'focaccia'
if (input.regionMatches(current + 1, "CIA", 0, 3)) {
add('X');
current += 3;
break;
}
// double 'C', but not if e.g. 'McClellan'
if (input.regionMatches(current, "CC", 0, 2)
&& !((current == 1) && (input.charAt(0) == 'M'))) {
// 'bellocchio' but not 'bacchus'
if (charAt(current + 2, EHI)
&& !input.regionMatches(current + 2, "HU", 0, 2)) {
// 'accident', 'accede' 'succeed'
if(((current == 1) && (input.charAt(current - 1) == 'A'))
|| stringAt(current - 1, 5, UcceeUcces)) {
add('K');
add('S');
} else { // 'bacci', 'bertucci', other italian
add('X');
}
current += 3;
break;
} else { // Pierce's rule
add('K');
current += 2;
break;
}
}
if (stringAt(0, 2, CkCgCq)) {
add('K');
current += 2;
break;
}
if (stringAt(0, 2, CeCiCy)) {
// italian vs. english
if (stringAt(0, 3, CiaCieCio)) {
add('S', 'X');
} else {
add('S');
}
current += 2;
break;
}
// else
add('K');
// name sent in 'mac caffrey', 'mac gregor'
if (charAt(current + 1, CGQ)) {
current += 3;
} else {
if (charAt(current + 1, CKQ)
&& !stringAt(current + 1, 2, CeCi)) {
current += 2;
} else {
current++;
}
}
break;
case 'D':
if(input.regionMatches(current, "DG", 0, 2)) {
if (charAt(current + 2, EIY)) {
//e.g. 'edge'
add('J');
current += 3;
break;
} else {
//e.g. 'edgar'
add('T');
add('K');
current += 2;
break;
}
}
if (stringAt(current, 2, DdDt)) {
add('T');
current += 2;
break;
}
//else
add('T');
current++;
break;
case 'F': // NTR: this is typical default behavior
addCode('F', 'F');
break;
case 'G':
if (input.charAt(current + 1) == 'H') {
if (current > 0 && !isVowel(current - 1)) {
add('K');
current += 2;
break;
}
if (current < 3) {
// 'ghislane', 'ghiradelli'
if (current == 0) {
if (input.charAt(current + 2) == 'I') {
add('J');
} else {
add('K');
}
current += 2;
break;
}
}
//Parker's rule (with some further refinements) - e.g., 'hugh'
if((current > 1 && charAt(current - 2, BDH))
//e.g., 'bough'
|| (current > 2 && charAt(current - 3, BDH ))
//e.g., 'broughton'
|| (current > 3 && charAt(current - 4, BH)) ) {
current += 2;
break;
} else {
//e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
if (current > 2 && input.charAt(current - 1) == 'U'
&& charAt(current - 3, CGLRT) ) {
add('F');
} else {
if (current > 0 && input.charAt(current - 1) != 'I') {
add('K');
}
}
current += 2;
break;
}
}
boolean slavoGermanic = isSlavoGermanic();
if (input.charAt(current + 1) == 'N') {
if (current == 1 && isVowel(0) && !slavoGermanic) {
primary.append('K');
add('N');
} else {
//not e.g. 'cagney'
if (!input.regionMatches(current + 2, "EY", 0, 2)
&& (input.charAt(current + 1) != 'Y')
&& !slavoGermanic) {
alternate.append('K');
add('N');
} else {
add('K');
add('N');
}
current += 2;
break;
}
}
//'tagliaro'
if (input.regionMatches(current + 1, "LI", 0, 2)
&& !slavoGermanic) {
primary.append('K');
add('L');
current += 2;
break;
}
//-ges-,-gep-,-gel-, -gie- at beginning
if((current == 0)
&& (input.charAt(current + 1) == 'Y'
|| stringAt(current + 1, 2, EbEiElEpErEsEyIbIlInIe)) ) {
add('K', 'J');
current += 2;
break;
}
// -ger-, -gy-
if ((input.regionMatches(current + 1, "ER", 0, 2)
|| input.charAt(current + 1) == 'Y')
&& !stringAt(0, 6, DangerMangerRanger)
&& !charAt(current - 1, EI)
&& !stringAt(current - 1, 3, OgyRgy) ) {
add('K', 'J');
current += 2;
break;
}
// italian e.g, 'biaggi'
if (charAt(current + 1, EIY)
|| stringAt(current - 1, 4, AggiOggi)) {
//obvious germanic
if ((stringAt(0, 4, Van_Von_)
|| input.regionMatches(0, "SCH", 0, 3))
|| input.regionMatches(current + 1, "ET", 0, 2)) {
add('K');
} else {
//always soft if french ending
if (input.regionMatches(current + 1, "IER ", 0, 4)) {
add('J');
} else {
add('J', 'K');
}
current += 2;
break;
}
}
if (input.charAt(current + 1) == 'G') {
current += 2;
} else {
current++;
}
add('K');
break;
case 'H':
// only keep if first & before vowel or btw. 2 vowels
if ((current == 0 || isVowel(current - 1))
&& isVowel(current + 1)) {
add('H');
current += 2;
} else { // also takes care of 'HH'
current++;
}
break;
case 'J':
//obvious spanish, 'jose', 'san jacinto'
if (stringAt(current, 4, "JOSE") || stringAt(0, 4, "SAN ")) {
if ((current == 0 && (input.charAt(current + 4) == ' '))
|| stringAt(0, 4, "SAN ")) {
add('H');
} else {
add('J', 'H');
}
current +=1;
break;
}
if (current == 0 && !stringAt(current, 4, "JOSE")) {
add('J', 'A'); // Yankelovich/Jankelowicz
} else {
// spanish pron. of e.g. 'bajador'
if (isVowel(current - 1) && !isSlavoGermanic()
&& ((input.charAt(current + 1) == 'A')
|| (input.charAt(current + 1) == 'O'))) {
add('J', 'H');
} else {
if (current == last) {
add('J', ' ');
} else {
if (!charAt(current + 1, BKLMNSTZ)
&& !charAt(current - 1, KLS)) {
add('J');
}
}
}
}
current++;
if(input.charAt(current) == 'J') current++; // doublecheck
break;
case 'K': // NTR: this is typical default behavior
addCode('K', 'K');
break;
case 'L':
if (input.charAt(current + 1) == 'L') {
//spanish e.g. 'cabrillo', 'gallegos'
if (((current == (length - 3))
&& stringAt(current - 1, 4, AlleIllaIllo))
|| ((stringAt((last - 1), 2, AsOs)
|| charAt(last, AO))
&& stringAt(current - 1, 4, "ALLE")) ) {
primary.append('L');
current += 2;
break;
}
current += 2;
} else {
current++;
}
add('L');
break;
case 'M':
if ((stringAt(current - 1, 3, "UMB")
&& (((current + 1) == last)
|| stringAt(current + 2, 2, "ER")))
//'dumb','thumb'
|| (input.charAt(current + 1) == 'M') ) {
current += 2;
} else {
current++;
}
add('M');
break;
case 'N': // NTR: this is typical default behavior
addCode('N', 'N');
break;
case '�':
current++;
add('N');
break;
case 'P':
if (input.charAt(current + 1) == 'H') {
add('F');
current += 2;
break;
}
//also account for 'campbell', 'raspberry'
if (charAt(current + 1, BP))
current += 2;
else
current++;
add('P');
break;
case 'Q': // NTR: this is typical default behavior
addCode('Q', 'K');
break;
case 'R':
//french e.g. 'rogier', but exclude 'hochmeier'
if ((current == last)
&& !isSlavoGermanic()
&& stringAt(current - 2, 2, "IE")
&& !stringAt(current - 4, 2, MaMe)) {
alternate.append('R');
} else {
add('R');
}
current++;
if(input.charAt(current) == 'R') current++; // doublecheck
break;
case 'S':
//special cases 'island', 'isle', 'carlisle', 'carlysle'
if (stringAt(current - 1, 3, IslYsl)) {
current++;
break;
}
//special case 'sugar-'
if ((current == 0) && stringAt(current, 5, "SUGAR")) {
add('X', 'S');
current++;
break;
}
if (stringAt(current, 2, "SH")) {
//germanic
if (stringAt(current + 1, 4, HeimHoekHolmHolz)) {
add('S');
} else {
add('X');
}
current += 2;
break;
}
//italian & armenian
if (stringAt(current, 3, SiaSio)
|| stringAt(current, 4, "SIAN")) {
if (!isSlavoGermanic()) {
add('S', 'X');
} else {
add('S');
}
current += 3;
break;
}
//german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
//also, -sz- in slavic language altho in hungarian it is pronounced 's'
if ((current == 0 && charAt(current + 1, LMNW))
|| input.charAt(current + 1) == 'Z') {
add('S', 'X');
if (input.charAt(current + 1) == 'Z') {
current += 2;
} else {
current++;
}
break;
}
if (stringAt(current, 2, "SC")) {
//Schlesinger's rule
if (input.charAt(current + 2) == 'H') {
//dutch origin, e.g. 'school', 'schooner'
if (stringAt(current + 3, 2, EdEmEnErOoUy)) {
//'schermerhorn', 'schenker'
if (stringAt((current + 3), 2, EnEr)) {
add('X', 'S');
alternate.append('K');
} else {
add('S');
add('K');
}
current += 3;
break;
} else {
if (current == 0 && !isVowel(3)
&& input.charAt(3) != 'W') {
add('X', 'S');
} else {
add('X');
}
current += 3;
break;
}
}
if (charAt(current + 2, EIY)) {
add('S');
current += 3;
break;
}
//else
add('S');
add('K');
current += 3;
break;
}
//french e.g. 'resnais', 'artois'
if (current == last && stringAt(current - 2, 2, AiOi)) {
alternate.append('S');
} else {
add('S');
}
if (charAt(current + 1, SZ)) {
current += 2;
} else {
current++;
}
break;
case 'T':
if (stringAt(current, 4, "TION")) {
add('X');
current += 3;
break;
}
if (stringAt(current, 3, TiaTch)) {
add('X');
current += 3;
break;
}
if (stringAt(current, 2, "TH") || stringAt(current, 3, "TTH")) {
//special case 'thomas', 'thames' or germanic
if (stringAt(current + 2, 2, AmOm)
|| stringAt(0, 4, Van_Von_)
|| stringAt(0, 3, "SCH")) {
add('T');
} else {
add('0', 'T');
}
current += 2;
break;
}
if (charAt(current + 1, DT))
current += 2;
else
current++;
add('T');
break;
case 'V': // NTR: this is typical default behavior
addCode('V', 'F');
break;
case 'W':
//can also be in middle of word
if (stringAt(current, 2, "WR")) {
add('R');
current += 2;
break;
}
if (current == 0 && (isVowel(current + 1)
|| stringAt(current, 2, "WH"))) {
//Wasserman should match Vasserman
if (isVowel(current + 1)) {
add('A', 'F');
} else {
//need 'Uomo' to match 'Womo'
add('A');
}
}
//'Arnow' should match 'Arnoff'
if ((current == last && isVowel(current - 1))
|| stringAt(current - 1, 5, EwskiEwskyOwskiOwsky)
|| stringAt(0, 3, "SCH")) {
alternate.append('F');
current +=1;
break;
}
//polish e.g. 'filipowicz'
if (stringAt(current, 4, WiczWitz)) {
add('T', 'F');
add('S', 'X');
current +=4;
break;
}
//else skip it
current +=1;
break;
case 'X':
//french e.g. breaux
if (!(current == last && (stringAt((current - 3), 3, EauIau)
|| stringAt((current - 2), 2, AuOu))) ) {
add('K');
add('S');
}
if (charAt(current + 1, CX)) {
current += 2;
} else {
current++;
}
break;
case 'Z':
//chinese pinyin e.g. 'zhao'
if (input.charAt(current + 1) == 'H') {
add('J');
current += 2;
break;
} else {
if (stringAt(current + 1, 2, ZaZiZo)
|| (isSlavoGermanic() && (current > 0
&& input.charAt(current - 1) != 'T'))) {
alternate.append('T');
add('S');
} else {
add('S');
}
}
if (input.charAt(current + 1) == 'Z') {
current += 2;
} else {
current++;
}
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
add(input.charAt(current));
current++;
break;
default:
current++;
} // switch
} // while
// Only give back the specified length
if (primary.length() > encodeLimit) {
primary.delete(encodeLimit, primary.length());
}
if (alternate.length() > encodeLimit) {
alternate.delete(encodeLimit, alternate.length());
}
return primary.toString();
}
/**
* Check if the two strings encode to the same primary or alternate encodings
* using the Double Metaphone algorithm.
* @param s1
* @param s2
* @return true/false
*/
public static boolean isEncodeEqual( String s1, String s2 ) {
DoubleMetaphone dm1 = new DoubleMetaphone( s1 );
DoubleMetaphone dm2 = new DoubleMetaphone( s2 );
dm1.encode();
dm2.encode();
return dm1.getPrimary().equals( dm2.getPrimary() )
|| dm1.getPrimary().equals( dm2.getAlternate() )
|| dm1.getAlternate().equals( dm2.getPrimary() );
}
}
1.3 +7 -4 jakarta-commons-sandbox/codec/src/test/org/apache/commons/codec/TestAll.java
Index: TestAll.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/codec/src/test/org/apache/commons/codec/TestAll.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- TestAll.java 18 Nov 2002 13:00:26 -0000 1.2
+++ TestAll.java 3 Feb 2003 15:00:12 -0000 1.3
@@ -61,6 +61,8 @@
package org.apache.commons.codec;
+import org.apache.commons.codec.language.TestDoubleMetaphone;
+
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
@@ -81,6 +83,7 @@
suite.addTest(TestMetaphone.suite());
suite.addTest(TestSoundex.suite());
suite.addTest(TestRefinedSoundex.suite());
+ suite.addTest(TestDoubleMetaphone.suite());
return suite;
}
1.1 jakarta-commons-sandbox/codec/src/test/org/apache/commons/codec/language/TestDoubleMetaphone.java
Index: TestDoubleMetaphone.java
===================================================================
/*
* $Header: /home/cvs/jakarta-commons-sandbox/codec/src/test/org/apache/commons/codec/language/TestDoubleMetaphone.java,v 1.1 2003/02/03 15:00:12 tobrien Exp $
* $Revision: 1.1 $
* $Date: 2003/02/03 15:00:12 $
*
* ====================================================================
*
* The Apache Software License, Version 1.1
*
* Copyright (c) 2002 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution, if
* any, must include the following acknowlegement:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowlegement may appear in the software itself,
* if and wherever such third-party acknowlegements normally appear.
*
* 4. The names "The Jakarta Project", "Commons", and "Apache Software
* Foundation" must not be used to endorse or promote products derived
* from this software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache"
* nor may "Apache" appear in their names without prior written
* permission of the Apache Group.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*
*/
package org.apache.commons.codec.language;
import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.TestEncoder;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
/**
 * JUnit test case that runs a table of sample words through
 * DoubleMetaphone and compares the primary encodings.
 *
 * @version $Revision: 1.1 $ $Date: 2003/02/03 15:00:12 $
 * @author <a href="mortic@voicenet.com">Kyle R. Burton</a>
 */
public class TestDoubleMetaphone extends TestEncoder {

    /** Encoder under test; created in setUp and released in tearDown. */
    private DoubleMetaphone _encoder = null;

    // These tests were taken from the Text::DoubleMetaphone
    // Perl module available from CPAN.
    // Each row is { word, expected primary encoding }.
    // Two further samples, "maurice" (MRS) and "bryce" (PRS), are left
    // out here, as they were disabled in the original tables.
    private String[][] samples = {
        {"aubrey",    "APR"},
        {"cambrillo", "KMPR"},
        {"heidi",     "HT"},
        {"katherine", "K0RN"},
        {"catherine", "K0RN"},
        {"richard",   "RXRT"},
        {"bob",       "PP"},
        {"eric",      "ARK"},
        {"geoff",     "JF"},
        {"dave",      "TF"},
        {"ray",       "R"},
        {"steven",    "STFN"},
        {"randy",     "RNT"},
        {"bryan",     "PRN"},
        {"brian",     "PRN"},
        {"otto",      "AT"},
        {"auto",      "AT"},
    };

    public TestDoubleMetaphone(String name) {
        super(name);
    }

    /** Builds the suite holding every test method of this class. */
    public static Test suite() {
        TestSuite all = new TestSuite(TestDoubleMetaphone.class);
        return all;
    }

    public void setUp() throws Exception {
        super.setUp();
        _encoder = new DoubleMetaphone();
    }

    public void tearDown() throws Exception {
        super.tearDown();
        _encoder = null;
    }

    /** Supplies a fresh encoder instance to the TestEncoder base class. */
    protected Encoder makeEncoder() {
        return new DoubleMetaphone();
    }

    // ------------------------------------------------------------------------

    /** Encodes every sample word and checks the primary encoding. */
    public void testDoubleMetaphone() {
        int row = 0;
        while (row < samples.length) {
            String word = samples[row][0];
            String expected = samples[row][1];
            assertEquals("encoding: " + word, expected, _encoder.encode(word));
            ++row;
        }
    }

    public void testIsDoubleMetaphoneEqual() {
        // need good examples of when two strings should encode to
        // the same values...
    }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org