You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by to...@apache.org on 2003/02/03 03:19:41 UTC
cvs commit: jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec Encoder.java EncoderComparator.java Metaphone.java RefinedSoundex.java Soundex.java
tobrien 2003/02/02 18:19:41
Modified: codec/src/java/org/apache/commons/codec Encoder.java
EncoderComparator.java Metaphone.java
RefinedSoundex.java Soundex.java
Log:
Fixed a number of CR/LF problems
Revision Changes Path
1.3 +34 -2 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Encoder.java
Index: Encoder.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Encoder.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Encoder.java 18 Nov 2002 12:41:24 -0000 1.2
+++ Encoder.java 3 Feb 2003 02:19:41 -0000 1.3
@@ -1,4 +1,17 @@
-/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2002 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2002 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
@@ -31,4 +44,23 @@
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
- * ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package org.apache.commons.codec;
/**
* Encoder is an interface, which is implemented by Soundex,
* Metaphone, Soundex2, etc.
*
* @author tobrien@transolutions.net
* @version $Revision$ $Date$
*/
public interface Encoder {
String encode(String str);
}
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+package org.apache.commons.codec;
+
+/**
+ * Encoder is an interface, which is implemented by Soundex,
+ * Metaphone, Soundex2, etc.
+ *
+ * @author tobrien@transolutions.net
+ * @version $Revision$ $Date$
+ */
+public interface Encoder {
+ String encode(String str);
+}
+
1.3 +89 -2 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/EncoderComparator.java
Index: EncoderComparator.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/EncoderComparator.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- EncoderComparator.java 18 Nov 2002 12:41:24 -0000 1.2
+++ EncoderComparator.java 3 Feb 2003 02:19:41 -0000 1.3
@@ -1,2 +1,89 @@
-/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2002 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Commons" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package org.apache.commons.codec;
-import java.util.Comparator;
/**
* Compare using an Encoder.
*
* @author tobrien@transolutions.net
* @version $Revision$ $Date$
*/
public class EncoderComparator implements Comparator {
private Encoder encoder;
/**
* Use the default soundex algorithm, US_ENGLISH.
*/
public EncoderComparator() {
this(RefinedSoundex.US_ENGLISH);
}
/**
* Use the provided soundex algorithm.
*/
public EncoderComparator(Encoder en) {
this.encoder = en;
}
public int compare(Object o1, Object o2) {
String s1 = encoder.encode(o1.toString());
String s2 = encoder.encode(o2.toString());
return s1.compareTo(s2);
}
}
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2002 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Commons" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Turbine", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+package org.apache.commons.codec;
+
+import java.util.Comparator;
+
+/**
+ * Compare using an Encoder.
+ *
+ * @author tobrien@transolutions.net
+ * @version $Revision$ $Date$
+ */
+public class EncoderComparator implements Comparator {
+
+ private Encoder encoder;
+
+ /**
+ * Use the default soundex algorithm, US_ENGLISH.
+ */
+ public EncoderComparator() {
+ this(RefinedSoundex.US_ENGLISH);
+ }
+
+ /**
+ * Use the provided soundex algorithm.
+ */
+ public EncoderComparator(Encoder en) {
+ this.encoder = en;
+ }
+
+ public int compare(Object o1, Object o2) {
+ String s1 = encoder.encode(o1.toString());
+ String s2 = encoder.encode(o2.toString());
+ return s1.compareTo(s2);
+ }
+
+}
+
1.5 +306 -2 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Metaphone.java
Index: Metaphone.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Metaphone.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- Metaphone.java 18 Nov 2002 12:41:24 -0000 1.4
+++ Metaphone.java 3 Feb 2003 02:19:41 -0000 1.5
@@ -1,2 +1,306 @@
-/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001-2002 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Commons" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package org.apache.commons.codec;
/**
* A class to generate phonetic code.
* The initial Java implementation, William B. Brogden. December, 1997
* Permission given by wbrogden for code to be used anywhere.
*
* @see "Hanging on the Metaphone" by Lawrence Philips
* <i>Computer Language</i> of Dec. 1990, p 39
*
* @version $Revision$ $Date$
* @author wbrogden@bga.com
* @author bayard@generationjava.com
* @author tobrien@transolutions.net
*/
public class Metaphone implements Encoder {
- private String vowels = "AEIOU" ;
private String frontv = "EIY" ;
private String varson = "CSPTG" ;
private int maxCodeLen = 4 ;
public Metaphone() {
super();
}
/**
* Find the metaphone value of a String. This is similar to the
* soundex algorithm, but better at finding similar sounding words.
* All input is converted to upper case.
* Limitations: Input format is expected to be a single ASCII word
* with only characters in the A - Z range, no punctuation or numbers.
*/
public String metaphone( String txt ){
int mtsz = 0 ;
boolean hard = false ;
if(( txt == null ) ||
( txt.length() == 0 )) return "" ;
// single character is itself
if( txt.length() == 1 ) return txt.toUpperCase() ;
char[] inwd = txt.toUpperCase().toCharArray() ;
String tmpS ;
StringBuffer local = new StringBuffer( 40 ); // manipulate
StringBuffer code = new StringBuffer( 10 ) ; // output
// handle initial 2 characters exceptions
switch( inwd[0] ){
case 'K': case 'G' : case 'P' : /* looking for KN, etc*/
if( inwd[1] == 'N')local.append(inwd, 1, inwd.length - 1 );
else local.append( inwd );
break;
case 'A': /* looking for AE */
if( inwd[1] == 'E' )local.append(inwd, 1, inwd.length - 1 );
else local.append( inwd );
break;
case 'W' : /* looking for WR or WH */
if( inwd[1] == 'R' ){ // WR -> R
local.append(inwd, 1, inwd.length - 1 ); break ;
}
if( inwd[1] == 'H'){
local.append(inwd, 1, inwd.length - 1 );
local.setCharAt( 0,'W'); // WH -> W
}
else local.append( inwd );
break;
case 'X' : /* initial X becomes S */
inwd[0] = 'S' ;local.append( inwd );
break ;
default :
local.append( inwd );
} // now local has working string with initials fixed
int wdsz = local.length();
int n = 0 ;
while((mtsz < maxCodeLen ) && // max code size of 4 works well
(n < wdsz ) ){
char symb = local.charAt(n) ;
// remove duplicate letters except C
if(( symb != 'C' ) &&
(n > 0 ) && ( local.charAt(n - 1 ) == symb )) n++ ;
else{ // not dup
switch( symb ){
case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
if( n == 0 ) { code.append(symb );mtsz++;
}
break ; // only use vowel if leading char
case 'B' :
if( (n > 0 ) &&
!(n + 1 == wdsz ) && // not MB at end of word
( local.charAt(n - 1) == 'M')) {
code.append(symb);
}
else code.append(symb);
mtsz++ ;
break ;
case 'C' : // lots of C special cases
/* discard if SCI, SCE or SCY */
if( ( n > 0 ) &&
( local.charAt(n-1) == 'S' ) &&
( n + 1 < wdsz ) &&
( frontv.indexOf( local.charAt(n + 1)) >= 0 )){ break ;}
tmpS = local.toString();
if( tmpS.indexOf("CIA", n ) == n ) { // "CIA" -> X
code.append('X' ); mtsz++; break ;
}
if( ( n + 1 < wdsz ) &&
(frontv.indexOf( local.charAt(n+1) )>= 0 )){
code.append('S');mtsz++; break ; // CI,CE,CY -> S
}
if(( n > 0) &&
( tmpS.indexOf("SCH",n-1 )== n-1 )){ // SCH->sk
code.append('K') ; mtsz++;break ;
}
if( tmpS.indexOf("CH", n ) == n ){ // detect CH
if((n == 0 ) &&
(wdsz >= 3 ) && // CH consonant -> K consonant
(vowels.indexOf( local.charAt( 2) ) < 0 )){
code.append('K');
}
else { code.append('X'); // CHvowel -> X
}
mtsz++;
}
else { code.append('K' );mtsz++;
}
break ;
case 'D' :
if(( n + 2 < wdsz )&& // DGE DGI DGY -> J
( local.charAt(n+1) == 'G' )&&
(frontv.indexOf( local.charAt(n+2) )>= 0)){
code.append('J' ); n += 2 ;
}
else { code.append( 'T' );
}
mtsz++;
break ;
case 'G' : // GH silent at end or before consonant
if(( n + 2 == wdsz )&&
(local.charAt(n+1) == 'H' )) break ;
if(( n + 2 < wdsz ) &&
(local.charAt(n+1) == 'H' )&&
(vowels.indexOf( local.charAt(n+2)) < 0 )) break ;
tmpS = local.toString();
if((n > 0) &&
( tmpS.indexOf("GN", n ) == n)||
( tmpS.indexOf("GNED",n) == n )) break ; // silent G
if(( n > 0 ) &&
(local.charAt(n-1) == 'G')) hard = true ;
else hard = false ;
if((n+1 < wdsz) &&
(frontv.indexOf( local.charAt(n+1) ) >= 0 )&&
(!hard) ) code.append( 'J' );
else code.append('K');
mtsz++;
break ;
case 'H':
if( n + 1 == wdsz ) break ; // terminal H
if((n > 0) &&
(varson.indexOf( local.charAt(n-1)) >= 0)) break ;
if( vowels.indexOf( local.charAt(n+1)) >=0 ){
code.append('H') ; mtsz++;// Hvowel
}
break;
case 'F': case 'J' : case 'L' :
case 'M': case 'N' : case 'R' :
code.append( symb ); mtsz++; break ;
case 'K' :
if( n > 0 ){ // not initial
if( local.charAt( n -1) != 'C' ) {
code.append(symb );
}
}
else code.append( symb ); // initial K
mtsz++ ;
break ;
case 'P' :
if((n + 1 < wdsz) && // PH -> F
(local.charAt( n+1) == 'H'))code.append('F');
else code.append( symb );
mtsz++;
break ;
case 'Q' :
code.append('K' );mtsz++; break ;
case 'S' :
tmpS = local.toString();
if((tmpS.indexOf("SH", n )== n) ||
(tmpS.indexOf("SIO",n )== n) ||
(tmpS.indexOf("SIA",n )== n)) code.append('X');
else code.append( 'S' );
mtsz++ ;
break ;
case 'T' :
tmpS = local.toString(); // TIA TIO -> X
if((tmpS.indexOf("TIA",n )== n)||
(tmpS.indexOf("TIO",n )== n) ){
code.append('X'); mtsz++; break;
}
if( tmpS.indexOf("TCH",n )==n) break;
// substitute numeral 0 for TH (resembles theta after all)
if( tmpS.indexOf("TH", n )==n) code.append('0');
else code.append( 'T' );
mtsz++ ;
break ;
case 'V' :
code.append('F'); mtsz++;break ;
case 'W' : case 'Y' : // silent if not followed by vowel
if((n+1 < wdsz) &&
(vowels.indexOf( local.charAt(n+1))>=0)){
code.append( symb );mtsz++;
}
break ;
case 'X' :
code.append('K'); code.append('S');mtsz += 2;
break ;
case 'Z' :
code.append('S'); mtsz++; break ;
} // end switch
n++ ;
} // end else from symb != 'C'
if( mtsz > 4 )code.setLength( 4);
}
return code.toString();
} // end static method metaPhone()
public String encode(String pString) {
return( metaphone( pString ) );
}
/**
* Are the metaphones of two strings the same.
*/
public boolean isMetaphoneEqual(String str1, String str2) {
return metaphone(str1).equals(metaphone(str2));
}
/**
* Returns the maxCodeLen.
* @return int
*/
public int getMaxCodeLen() {
return maxCodeLen;
}
/**
* Sets the maxCodeLen.
* @param maxCodeLen The maxCodeLen to set
*/
public void setMaxCodeLen(int maxCodeLen) {
this.maxCodeLen = maxCodeLen;
}
}
\ No newline at end of file
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001-2002 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Commons" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Turbine", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+package org.apache.commons.codec;
+
+/**
+ * A class to generate phonetic code.
+ * The initial Java implementation, William B. Brogden. December, 1997
+ * Permission given by wbrogden for code to be used anywhere.
+ *
+ * @see "Hanging on the Metaphone" by Lawrence Philips
+ * <i>Computer Language</i> of Dec. 1990, p 39
+ *
+ * @version $Revision$ $Date$
+ * @author wbrogden@bga.com
+ * @author bayard@generationjava.com
+ * @author tobrien@transolutions.net
+ */
+public class Metaphone implements Encoder {
+
+ private String vowels = "AEIOU" ;
+ private String frontv = "EIY" ;
+ private String varson = "CSPTG" ;
+
+ private int maxCodeLen = 4 ;
+
+ public Metaphone() {
+ super();
+ }
+
+ /**
+ * Find the metaphone value of a String. This is similar to the
+ * soundex algorithm, but better at finding similar sounding words.
+ * All input is converted to upper case.
+ * Limitations: Input format is expected to be a single ASCII word
+ * with only characters in the A - Z range, no punctuation or numbers.
+ */
+ public String metaphone( String txt ){
+ int mtsz = 0 ;
+ boolean hard = false ;
+ if(( txt == null ) ||
+ ( txt.length() == 0 )) return "" ;
+ // single character is itself
+ if( txt.length() == 1 ) return txt.toUpperCase() ;
+
+ char[] inwd = txt.toUpperCase().toCharArray() ;
+
+ String tmpS ;
+ StringBuffer local = new StringBuffer( 40 ); // manipulate
+ StringBuffer code = new StringBuffer( 10 ) ; // output
+ // handle initial 2 characters exceptions
+ switch( inwd[0] ){
+ case 'K': case 'G' : case 'P' : /* looking for KN, etc*/
+ if( inwd[1] == 'N')local.append(inwd, 1, inwd.length - 1 );
+ else local.append( inwd );
+ break;
+ case 'A': /* looking for AE */
+ if( inwd[1] == 'E' )local.append(inwd, 1, inwd.length - 1 );
+ else local.append( inwd );
+ break;
+ case 'W' : /* looking for WR or WH */
+ if( inwd[1] == 'R' ){ // WR -> R
+ local.append(inwd, 1, inwd.length - 1 ); break ;
+ }
+ if( inwd[1] == 'H'){
+ local.append(inwd, 1, inwd.length - 1 );
+ local.setCharAt( 0,'W'); // WH -> W
+ }
+ else local.append( inwd );
+ break;
+ case 'X' : /* initial X becomes S */
+ inwd[0] = 'S' ;local.append( inwd );
+ break ;
+ default :
+ local.append( inwd );
+ } // now local has working string with initials fixed
+ int wdsz = local.length();
+ int n = 0 ;
+ while((mtsz < maxCodeLen ) && // max code size of 4 works well
+ (n < wdsz ) ){
+ char symb = local.charAt(n) ;
+ // remove duplicate letters except C
+ if(( symb != 'C' ) &&
+ (n > 0 ) && ( local.charAt(n - 1 ) == symb )) n++ ;
+ else{ // not dup
+ switch( symb ){
+ case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
+ if( n == 0 ) { code.append(symb );mtsz++;
+ }
+ break ; // only use vowel if leading char
+ case 'B' :
+ if( (n > 0 ) &&
+ !(n + 1 == wdsz ) && // not MB at end of word
+ ( local.charAt(n - 1) == 'M')) {
+ code.append(symb);
+ }
+ else code.append(symb);
+ mtsz++ ;
+ break ;
+ case 'C' : // lots of C special cases
+ /* discard if SCI, SCE or SCY */
+ if( ( n > 0 ) &&
+ ( local.charAt(n-1) == 'S' ) &&
+ ( n + 1 < wdsz ) &&
+ ( frontv.indexOf( local.charAt(n + 1)) >= 0 )){ break ;}
+ tmpS = local.toString();
+ if( tmpS.indexOf("CIA", n ) == n ) { // "CIA" -> X
+ code.append('X' ); mtsz++; break ;
+ }
+ if( ( n + 1 < wdsz ) &&
+ (frontv.indexOf( local.charAt(n+1) )>= 0 )){
+ code.append('S');mtsz++; break ; // CI,CE,CY -> S
+ }
+ if(( n > 0) &&
+ ( tmpS.indexOf("SCH",n-1 )== n-1 )){ // SCH->sk
+ code.append('K') ; mtsz++;break ;
+ }
+ if( tmpS.indexOf("CH", n ) == n ){ // detect CH
+ if((n == 0 ) &&
+ (wdsz >= 3 ) && // CH consonant -> K consonant
+ (vowels.indexOf( local.charAt( 2) ) < 0 )){
+ code.append('K');
+ }
+ else { code.append('X'); // CHvowel -> X
+ }
+ mtsz++;
+ }
+ else { code.append('K' );mtsz++;
+ }
+ break ;
+ case 'D' :
+ if(( n + 2 < wdsz )&& // DGE DGI DGY -> J
+ ( local.charAt(n+1) == 'G' )&&
+ (frontv.indexOf( local.charAt(n+2) )>= 0)){
+ code.append('J' ); n += 2 ;
+ }
+ else { code.append( 'T' );
+ }
+ mtsz++;
+ break ;
+ case 'G' : // GH silent at end or before consonant
+ if(( n + 2 == wdsz )&&
+ (local.charAt(n+1) == 'H' )) break ;
+ if(( n + 2 < wdsz ) &&
+ (local.charAt(n+1) == 'H' )&&
+ (vowels.indexOf( local.charAt(n+2)) < 0 )) break ;
+ tmpS = local.toString();
+ if((n > 0) &&
+ ( tmpS.indexOf("GN", n ) == n)||
+ ( tmpS.indexOf("GNED",n) == n )) break ; // silent G
+ if(( n > 0 ) &&
+ (local.charAt(n-1) == 'G')) hard = true ;
+ else hard = false ;
+ if((n+1 < wdsz) &&
+ (frontv.indexOf( local.charAt(n+1) ) >= 0 )&&
+ (!hard) ) code.append( 'J' );
+ else code.append('K');
+ mtsz++;
+ break ;
+ case 'H':
+ if( n + 1 == wdsz ) break ; // terminal H
+ if((n > 0) &&
+ (varson.indexOf( local.charAt(n-1)) >= 0)) break ;
+ if( vowels.indexOf( local.charAt(n+1)) >=0 ){
+ code.append('H') ; mtsz++;// Hvowel
+ }
+ break;
+ case 'F': case 'J' : case 'L' :
+ case 'M': case 'N' : case 'R' :
+ code.append( symb ); mtsz++; break ;
+ case 'K' :
+ if( n > 0 ){ // not initial
+ if( local.charAt( n -1) != 'C' ) {
+ code.append(symb );
+ }
+ }
+ else code.append( symb ); // initial K
+ mtsz++ ;
+ break ;
+ case 'P' :
+ if((n + 1 < wdsz) && // PH -> F
+ (local.charAt( n+1) == 'H'))code.append('F');
+ else code.append( symb );
+ mtsz++;
+ break ;
+ case 'Q' :
+ code.append('K' );mtsz++; break ;
+ case 'S' :
+ tmpS = local.toString();
+ if((tmpS.indexOf("SH", n )== n) ||
+ (tmpS.indexOf("SIO",n )== n) ||
+ (tmpS.indexOf("SIA",n )== n)) code.append('X');
+ else code.append( 'S' );
+ mtsz++ ;
+ break ;
+ case 'T' :
+ tmpS = local.toString(); // TIA TIO -> X
+ if((tmpS.indexOf("TIA",n )== n)||
+ (tmpS.indexOf("TIO",n )== n) ){
+ code.append('X'); mtsz++; break;
+ }
+ if( tmpS.indexOf("TCH",n )==n) break;
+ // substitute numeral 0 for TH (resembles theta after all)
+ if( tmpS.indexOf("TH", n )==n) code.append('0');
+ else code.append( 'T' );
+ mtsz++ ;
+ break ;
+ case 'V' :
+ code.append('F'); mtsz++;break ;
+ case 'W' : case 'Y' : // silent if not followed by vowel
+ if((n+1 < wdsz) &&
+ (vowels.indexOf( local.charAt(n+1))>=0)){
+ code.append( symb );mtsz++;
+ }
+ break ;
+ case 'X' :
+ code.append('K'); code.append('S');mtsz += 2;
+ break ;
+ case 'Z' :
+ code.append('S'); mtsz++; break ;
+ } // end switch
+ n++ ;
+ } // end else from symb != 'C'
+ if( mtsz > 4 )code.setLength( 4);
+ }
+ return code.toString();
+ } // end static method metaPhone()
+
+ public String encode(String pString) {
+ return( metaphone( pString ) );
+ }
+
+ /**
+ * Are the metaphones of two strings the same.
+ */
+ public boolean isMetaphoneEqual(String str1, String str2) {
+ return metaphone(str1).equals(metaphone(str2));
+ }
+
+ /**
+ * Returns the maxCodeLen.
+ * @return int
+ */
+ public int getMaxCodeLen() {
+ return maxCodeLen;
+ }
+
+ /**
+ * Sets the maxCodeLen.
+ * @param maxCodeLen The maxCodeLen to set
+ */
+ public void setMaxCodeLen(int maxCodeLen) {
+ this.maxCodeLen = maxCodeLen;
+ }
+
+}
1.4 +130 -2 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/RefinedSoundex.java
Index: RefinedSoundex.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/RefinedSoundex.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- RefinedSoundex.java 18 Nov 2002 13:00:25 -0000 1.3
+++ RefinedSoundex.java 3 Feb 2003 02:19:41 -0000 1.4
@@ -1,3 +1,131 @@
-/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2002 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Commons" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package org.apache.commons.codec;
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2002 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Commons" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Turbine", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+package org.apache.commons.codec;
-/**
* Encodes a string into a soundex value. Sounde is an encoding used to
* relate similar names, but can also be used as a general purpose
* scheme to find word with similar phonemes.
* More information may be found at: http://www.bluepoof.com/Soundex/info2.html
*
* @todo Needs internationalisation in a future release.
*
* @author tobrien@transolutions.net
* @version $Revision$ $Date$
*/
public class RefinedSoundex implements Encoder {
static public final char[] US_ENGLISH_MAPPING =
"01360240043788015936020505".toCharArray();
static public final RefinedSoundex US_ENGLISH = new RefinedSoundex();
private char[] soundexMapping;
public RefinedSoundex() {
this(US_ENGLISH_MAPPING);
}
public RefinedSoundex(char[] mapping) {
this.soundexMapping = mapping;
}
/**
* Get the SoundEx value of a string.
* This implementation is taken from the code-snippers on
* http://www.sourceforge.net/
*/
public String soundex(String str) {
if(null == str || str.length() == 0) { return str; }
StringBuffer sBuf = new StringBuffer();
str = str.toUpperCase();
sBuf.append( str.charAt(0) );
char last, mapped, current;
last = '*';
for( int i = 0; i < str.length(); i++ ) {
current = getMappingCode( str.charAt(i) );
if( current == last ) {
continue;
} else if( current != 0 ) {
sBuf.append( current );
}
last = current;
}
return sBuf.toString();
}
public String encode(String pString) {
return( soundex( pString ) );
}
/**
* Used internally by the SoundEx algorithm.
*/
private char getMappingCode(char c) {
if( !Character.isLetter(c) ) {
return 0;
} else {
return soundexMapping[Character.toUpperCase(c) - 'A'];
}
}
}
\ No newline at end of file
+
+/**
+ * Encodes a string into a refined soundex value. Soundex is an encoding
+ * used to relate similar names, but can also be used as a general purpose
+ * scheme to find words with similar phonemes.
+ * More information may be found at: http://www.bluepoof.com/Soundex/info2.html
+ *
+ * @todo Needs internationalisation in a future release.
+ *
+ * @author tobrien@transolutions.net
+ * @version $Revision$ $Date$
+ */
+public class RefinedSoundex implements Encoder {
+
+ static public final char[] US_ENGLISH_MAPPING =
+ "01360240043788015936020505".toCharArray();
+
+ static public final RefinedSoundex US_ENGLISH = new RefinedSoundex();
+
+ private char[] soundexMapping;
+
+ public RefinedSoundex() {
+ this(US_ENGLISH_MAPPING);
+ }
+
+ public RefinedSoundex(char[] mapping) {
+ this.soundexMapping = mapping;
+ }
+
+ /**
+ * Get the SoundEx value of a string.
+ * This implementation is taken from the code-snippets on
+ * http://www.sourceforge.net/
+ */
+ public String soundex(String str) {
+ if(null == str || str.length() == 0) { return str; }
+
+ StringBuffer sBuf = new StringBuffer();
+ str = str.toUpperCase();
+
+ sBuf.append( str.charAt(0) );
+
+ char last, mapped, current;
+ last = '*';
+
+ for( int i = 0; i < str.length(); i++ ) {
+
+ current = getMappingCode( str.charAt(i) );
+ if( current == last ) {
+ continue;
+ } else if( current != 0 ) {
+ sBuf.append( current );
+ }
+
+ last = current;
+
+ }
+
+ return sBuf.toString();
+ }
+
+ public String encode(String pString) {
+ return( soundex( pString ) );
+ }
+
+ /**
+ * Used internally by the SoundEx algorithm.
+ */
+ private char getMappingCode(char c) {
+ if( !Character.isLetter(c) ) {
+ return 0;
+ } else {
+ return soundexMapping[Character.toUpperCase(c) - 'A'];
+ }
+ }
+}
1.5 +145 -4 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Soundex.java
Index: Soundex.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Soundex.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- Soundex.java 18 Nov 2002 13:00:26 -0000 1.4
+++ Soundex.java 3 Feb 2003 02:19:41 -0000 1.5
@@ -1,4 +1,145 @@
-/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001-2002 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Commons" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package org.apache.commons.codec;
-/**
* Encodes a string into a refined soundex value.
* A refined soundex code is optimized for spell checking word.
* "Soundex" method originally developed by Margaret Odell and
* Robert Russell
*
* http://www.bluepoof.com/Soundex/info2.html
*
* @todo Needs internationalisation in a future release.
*
* @author bayard@generationjava.com
* @author tobrien@transolutions.net
* @version $Revision$ $Date$
*/
public class Soundex implements Encoder {
static public final char[] US_ENGLISH_MAPPING =
"01230120022455012623010202".toCharArray();
static public final Soundex US_ENGLISH = new Soundex();
private char[] soundexMapping;
private int maxLength = 4;
- public Soundex() {
this(US_ENGLISH_MAPPING);
}
- public Soundex(char[] mapping) {
this.soundexMapping = mapping;
}
/**
* Get the SoundEx value of a string.
* This implementation is taken from the code-snippers on
* http://www.sourceforge.net/
*/
public String soundex(String str) {
if(null == str || str.length() == 0) { return str; }
char out[] = { '0', '0', '0', '0' };
char last, mapped;
int incount = 1, count = 1;
out[0] = Character.toUpperCase( str.charAt(0) );
last = getMappingCode( str.charAt(0) );
while( (incount < str.length() ) &&
(mapped = getMappingCode(str.charAt(incount++))) != 0 &&
(count < maxLength) )
{
if( (mapped != '0') && (mapped != last) ) {
out[count++] = mapped;
}
last = mapped;
}
return new String(out);
}
public String encode(String pString) {
return( soundex( pString ) );
}
/**
* Used internally by the SoundEx algorithm.
*/
private char getMappingCode(char c) {
if( !Character.isLetter(c) ) {
return 0;
} else {
return soundexMapping[Character.toUpperCase(c) - 'A'];
}
}
/**
* Returns the maxLength. Standard Soundex
* @return int
*/
public int getMaxLength() {
return maxLength;
}
/**
* Sets the maxLength.
* @param maxLength The maxLength to set
*/
public void setMaxLength(int maxLength) {
this.maxLength = maxLength;
}
}
\ No newline at end of file
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001-2002 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Commons" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Turbine", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+package org.apache.commons.codec;
+
+/**
+ * Encodes a string into a soundex value.
+ * For a code optimized for spell checking words, see RefinedSoundex.
+ * "Soundex" method originally developed by Margaret Odell and
+ * Robert Russell
+ *
+ * http://www.bluepoof.com/Soundex/info2.html
+ *
+ * @todo Needs internationalisation in a future release.
+ *
+ * @author bayard@generationjava.com
+ * @author tobrien@transolutions.net
+ * @version $Revision$ $Date$
+ */
+public class Soundex implements Encoder {
+
+ static public final char[] US_ENGLISH_MAPPING =
+ "01230120022455012623010202".toCharArray();
+
+ static public final Soundex US_ENGLISH = new Soundex();
+
+ private char[] soundexMapping;
+ private int maxLength = 4;
+
+
+ public Soundex() {
+ this(US_ENGLISH_MAPPING);
+ }
+
+ public Soundex(char[] mapping) {
+ this.soundexMapping = mapping;
+ }
+
+ /**
+ * Get the SoundEx value of a string.
+ * This implementation is taken from the code-snippets on
+ * http://www.sourceforge.net/
+ */
+ public String soundex(String str) {
+ if(null == str || str.length() == 0) { return str; }
+
+ char out[] = { '0', '0', '0', '0' };
+ char last, mapped;
+ int incount = 1, count = 1;
+ out[0] = Character.toUpperCase( str.charAt(0) );
+ last = getMappingCode( str.charAt(0) );
+ while( (incount < str.length() ) &&
+ (mapped = getMappingCode(str.charAt(incount++))) != 0 &&
+ (count < maxLength) )
+ {
+ if( (mapped != '0') && (mapped != last) ) {
+ out[count++] = mapped;
+ }
+ last = mapped;
+ }
+ return new String(out);
+ }
+
+ public String encode(String pString) {
+ return( soundex( pString ) );
+ }
+
+ /**
+ * Used internally by the SoundEx algorithm.
+ */
+ private char getMappingCode(char c) {
+ if( !Character.isLetter(c) ) {
+ return 0;
+ } else {
+ return soundexMapping[Character.toUpperCase(c) - 'A'];
+ }
+ }
+
+ /**
+ * Returns the maxLength. Standard Soundex codes are 4 characters long, which is the default.
+ * @return int
+ */
+ public int getMaxLength() {
+ return maxLength;
+ }
+
+ /**
+ * Sets the maxLength.
+ * @param maxLength The maxLength to set
+ */
+ public void setMaxLength(int maxLength) {
+ this.maxLength = maxLength;
+ }
+
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org