You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by ba...@apache.org on 2002/02/22 06:20:00 UTC
cvs commit: jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec Metaphone.java Soundex.java SoundexComparator.java
bayard 02/02/21 21:20:00
Added: codec/src/java/org/apache/commons/codec Metaphone.java
Soundex.java SoundexComparator.java
Log:
Moved here from util package.
Revision Changes Path
1.1 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Metaphone.java
Index: Metaphone.java
===================================================================
// Permission given by wbrogden for code to be used anywhere.
package org.apache.commons.codec;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Commons" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/* Metaphone.java
* A class to generate phonetic code and keep lists of objects
* retrievable by a phonetic code.
* reference: Computer Language of Dec. 1990, p 39
* "Hanging on the Metaphone" by Lawrence Philips
*
* This Java implementation, Copyright 1997, William B. Brogden
* is hereby released for all uses. I would appreciate hearing about it
* if you find a good use for the class. December, 1997
*
* @author wbrogden@bga.com CompuServe 75415,610
*/
/*
* List functionality removed: 2001-06-21 bayard@generationjava.com
*/
/*
* Notes:
* The static method metaphone converts an input String into a code.
* All input is converted to upper case.
* Limitations: Input format is expected to be a single ASCII word
* with only characters in the A - Z range, no punctuation or numbers.
*
*/
class Metaphone {
static String vowels = "AEIOU" ;
static String frontv = "EIY" ;
static String varson = "CSPTG" ;
static final int maxCodeLen = 4 ;
/**
* Are the metaphones of two strings the same.
*/
static public boolean isMetaphoneEqual(String str1, String str2) {
return metaphone(str1).equals(metaphone(str2));
}
/**
* Find the metaphone value of a String. This is similar to the
* soundex algorithm, but better at finding similar sounding words.
*/
static public String metaphone( String txt ){
int mtsz = 0 ;
boolean hard = false ;
if(( txt == null ) ||
( txt.length() == 0 )) return "" ;
// single character is itself
if( txt.length() == 1 ) return txt.toUpperCase() ;
char[] inwd = txt.toUpperCase().toCharArray() ;
String tmpS ;
StringBuffer local = new StringBuffer( 40 ); // manipulate
StringBuffer code = new StringBuffer( 10 ) ; // output
// handle initial 2 characters exceptions
switch( inwd[0] ){
case 'K': case 'G' : case 'P' : /* looking for KN, etc*/
if( inwd[1] == 'N')local.append(inwd, 1, inwd.length - 1 );
else local.append( inwd );
break;
case 'A': /* looking for AE */
if( inwd[1] == 'E' )local.append(inwd, 1, inwd.length - 1 );
else local.append( inwd );
break;
case 'W' : /* looking for WR or WH */
if( inwd[1] == 'R' ){ // WR -> R
local.append(inwd, 1, inwd.length - 1 ); break ;
}
if( inwd[1] == 'H'){
local.append(inwd, 1, inwd.length - 1 );
local.setCharAt( 0,'W'); // WH -> W
}
else local.append( inwd );
break;
case 'X' : /* initial X becomes S */
inwd[0] = 'S' ;local.append( inwd );
break ;
default :
local.append( inwd );
} // now local has working string with initials fixed
int wdsz = local.length();
int n = 0 ;
while((mtsz < maxCodeLen ) && // max code size of 4 works well
(n < wdsz ) ){
char symb = local.charAt(n) ;
// remove duplicate letters except C
if(( symb != 'C' ) &&
(n > 0 ) && ( local.charAt(n - 1 ) == symb )) n++ ;
else{ // not dup
switch( symb ){
case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
if( n == 0 ) { code.append(symb );mtsz++;
}
break ; // only use vowel if leading char
case 'B' :
if( (n > 0 ) &&
!(n + 1 == wdsz ) && // not MB at end of word
( local.charAt(n - 1) == 'M')) {
code.append(symb);
}
else code.append(symb);
mtsz++ ;
break ;
case 'C' : // lots of C special cases
/* discard if SCI, SCE or SCY */
if( ( n > 0 ) &&
( local.charAt(n-1) == 'S' ) &&
( n + 1 < wdsz ) &&
( frontv.indexOf( local.charAt(n + 1)) >= 0 )){ break ;}
tmpS = local.toString();
if( tmpS.indexOf("CIA", n ) == n ) { // "CIA" -> X
code.append('X' ); mtsz++; break ;
}
if( ( n + 1 < wdsz ) &&
(frontv.indexOf( local.charAt(n+1) )>= 0 )){
code.append('S');mtsz++; break ; // CI,CE,CY -> S
}
if(( n > 0) &&
( tmpS.indexOf("SCH",n-1 )== n-1 )){ // SCH->sk
code.append('K') ; mtsz++;break ;
}
if( tmpS.indexOf("CH", n ) == n ){ // detect CH
if((n == 0 ) &&
(wdsz >= 3 ) && // CH consonant -> K consonant
(vowels.indexOf( local.charAt( 2) ) < 0 )){
code.append('K');
}
else { code.append('X'); // CHvowel -> X
}
mtsz++;
}
else { code.append('K' );mtsz++;
}
break ;
case 'D' :
if(( n + 2 < wdsz )&& // DGE DGI DGY -> J
( local.charAt(n+1) == 'G' )&&
(frontv.indexOf( local.charAt(n+2) )>= 0)){
code.append('J' ); n += 2 ;
}
else { code.append( 'T' );
}
mtsz++;
break ;
case 'G' : // GH silent at end or before consonant
if(( n + 2 == wdsz )&&
(local.charAt(n+1) == 'H' )) break ;
if(( n + 2 < wdsz ) &&
(local.charAt(n+1) == 'H' )&&
(vowels.indexOf( local.charAt(n+2)) < 0 )) break ;
tmpS = local.toString();
if((n > 0) &&
( tmpS.indexOf("GN", n ) == n)||
( tmpS.indexOf("GNED",n) == n )) break ; // silent G
if(( n > 0 ) &&
(local.charAt(n-1) == 'G')) hard = true ;
else hard = false ;
if((n+1 < wdsz) &&
(frontv.indexOf( local.charAt(n+1) ) >= 0 )&&
(!hard) ) code.append( 'J' );
else code.append('K');
mtsz++;
break ;
case 'H':
if( n + 1 == wdsz ) break ; // terminal H
if((n > 0) &&
(varson.indexOf( local.charAt(n-1)) >= 0)) break ;
if( vowels.indexOf( local.charAt(n+1)) >=0 ){
code.append('H') ; mtsz++;// Hvowel
}
break;
case 'F': case 'J' : case 'L' :
case 'M': case 'N' : case 'R' :
code.append( symb ); mtsz++; break ;
case 'K' :
if( n > 0 ){ // not initial
if( local.charAt( n -1) != 'C' ) {
code.append(symb );
}
}
else code.append( symb ); // initial K
mtsz++ ;
break ;
case 'P' :
if((n + 1 < wdsz) && // PH -> F
(local.charAt( n+1) == 'H'))code.append('F');
else code.append( symb );
mtsz++;
break ;
case 'Q' :
code.append('K' );mtsz++; break ;
case 'S' :
tmpS = local.toString();
if((tmpS.indexOf("SH", n )== n) ||
(tmpS.indexOf("SIO",n )== n) ||
(tmpS.indexOf("SIA",n )== n)) code.append('X');
else code.append( 'S' );
mtsz++ ;
break ;
case 'T' :
tmpS = local.toString(); // TIA TIO -> X
if((tmpS.indexOf("TIA",n )== n)||
(tmpS.indexOf("TIO",n )== n) ){
code.append('X'); mtsz++; break;
}
if( tmpS.indexOf("TCH",n )==n) break;
// substitute numeral 0 for TH (resembles theta after all)
if( tmpS.indexOf("TH", n )==n) code.append('0');
else code.append( 'T' );
mtsz++ ;
break ;
case 'V' :
code.append('F'); mtsz++;break ;
case 'W' : case 'Y' : // silent if not followed by vowel
if((n+1 < wdsz) &&
(vowels.indexOf( local.charAt(n+1))>=0)){
code.append( symb );mtsz++;
}
break ;
case 'X' :
code.append('K'); code.append('S');mtsz += 2;
break ;
case 'Z' :
code.append('S'); mtsz++; break ;
} // end switch
n++ ;
} // end else from symb != 'C'
if( mtsz > 4 )code.setLength( 4);
}
return code.toString();
} // end static method metaPhone()
}
1.1 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Soundex.java
Index: Soundex.java
===================================================================
package org.apache.commons.codec;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Commons" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
* Find the Soundex of a string. Needs internationalisation in a
* future release.
*
* @author bayard@generationjava.com
* @version $Id: Soundex.java,v 1.1 2002/02/22 05:20:00 bayard Exp $
*/
public class Soundex {
static public final char[] US_ENGLISH_MAPPING =
"01230120022455012623010202".toCharArray();
static public final Soundex US_ENGLISH = new Soundex();
private char[] soundexMapping;
public Soundex() {
this(US_ENGLISH_MAPPING);
}
public Soundex(char[] mapping) {
this.soundexMapping = mapping;
}
/**
* Get the SoundEx value of a string.
* This implementation is taken from the code-snippers on
* http://www.sourceforge.net/
*/
public String soundex(String str) {
char out[] = { '0', '0', '0', '0' };
char last, mapped;
int incount = 1, count = 1;
out[0] = Character.toUpperCase( str.charAt(0) );
last = getMappingCode( str.charAt(0) );
while( (incount < str.length() ) &&
(mapped = getMappingCode(str.charAt(incount++))) != 0 &&
(count < 4) )
{
if( (mapped != '0') && (mapped != last) ) {
out[count++] = mapped;
}
last = mapped;
}
return new String(out);
}
/**
* Used internally by the SoundEx algorithm.
*/
private char getMappingCode(char c) {
if( !Character.isLetter(c) ) {
return 0;
} else {
return soundexMapping[Character.toUpperCase(c) - 'A'];
}
}
}
1.1 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/SoundexComparator.java
Index: SoundexComparator.java
===================================================================
package org.apache.commons.codec;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Commons" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.util.Comparator;
/**
* Compare using the Soundex algorithm.
*
* @author bayard@generationjava.com
* @version $Id: SoundexComparator.java,v 1.1 2002/02/22 05:20:00 bayard Exp $
*/
public class SoundexComparator implements Comparator {
private Soundex soundex;
/**
* Use the default soundex algorithm, US_ENGLISH.
*/
public SoundexComparator() {
this(Soundex.US_ENGLISH);
}
/**
* Use the provided soundex algorithm.
*/
public SoundexComparator(Soundex sx) {
this.soundex = sx;
}
public int compare(Object o1, Object o2) {
String s1 = soundex.soundex(o1.toString());
String s2 = soundex.soundex(o2.toString());
return s1.compareTo(s2);
}
}
--
To unsubscribe, e-mail: <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>