You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by gg...@apache.org on 2003/12/12 00:44:11 UTC

cvs commit: jakarta-commons/codec/src/java/org/apache/commons/codec/language SoundexUtils.java RefinedSoundex.java Soundex.java

ggregory    2003/12/11 15:44:11

  Modified:    codec/src/test/org/apache/commons/codec/language
                        RefinedSoundexTest.java
               codec/src/java/org/apache/commons/codec/language
                        RefinedSoundex.java Soundex.java
  Added:       codec/src/java/org/apache/commons/codec/language
                        SoundexUtils.java
  Log:
  Refactor for implementing difference() API in both Soundex and RefinedSoundex.
  http://nagoya.apache.org/bugzilla/show_bug.cgi?id=25243
  
  Revision  Changes    Path
  1.5       +101 -76   jakarta-commons/codec/src/test/org/apache/commons/codec/language/RefinedSoundexTest.java
  
  Index: RefinedSoundexTest.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons/codec/src/test/org/apache/commons/codec/language/RefinedSoundexTest.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- RefinedSoundexTest.java	5 Oct 2003 21:45:49 -0000	1.4
  +++ RefinedSoundexTest.java	11 Dec 2003 23:44:11 -0000	1.5
  @@ -2,109 +2,134 @@
    * ====================================================================
    * 
    * The Apache Software License, Version 1.1
  - *
  - * Copyright (c) 2001-2003 The Apache Software Foundation.  All rights
  - * reserved.
  - *
  + * 
  + * Copyright (c) 2001-2003 The Apache Software Foundation. All rights reserved.
  + * 
    * Redistribution and use in source and binary forms, with or without
  - * modification, are permitted provided that the following conditions
  - * are met:
  - *
  - * 1. Redistributions of source code must retain the above copyright
  - *    notice, this list of conditions and the following disclaimer. 
  - *
  - * 2. Redistributions in binary form must reproduce the above copyright
  - *    notice, this list of conditions and the following disclaimer in
  - *    the documentation and/or other materials provided with the
  - *    distribution.
  - *
  - * 3. The end-user documentation included with the redistribution,
  - *    if any, must include the following acknowledgement:  
  - *       "This product includes software developed by the 
  - *        Apache Software Foundation (http://www.apache.org/)."
  - *    Alternately, this acknowledgement may appear in the software itself,
  - *    if and wherever such third-party acknowledgements normally appear.
  - *
  - * 4. The names "Apache", "The Jakarta Project", "Commons", and "Apache Software
  - *    Foundation" must not be used to endorse or promote products derived
  - *    from this software without prior written permission. For written 
  - *    permission, please contact apache@apache.org.
  - *
  - * 5. Products derived from this software may not be called "Apache",
  - *    "Apache" nor may "Apache" appear in their name without prior 
  - *    written permission of the Apache Software Foundation.
  - *
  - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  - * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  - * SUCH DAMAGE.
  + * modification, are permitted provided that the following conditions are met: 1.
  + * Redistributions of source code must retain the above copyright notice, this
  + * list of conditions and the following disclaimer. 2. Redistributions in
  + * binary form must reproduce the above copyright notice, this list of
  + * conditions and the following disclaimer in the documentation and/or other
  + * materials provided with the distribution. 3. The end-user documentation
  + * included with the redistribution, if any, must include the following
  + * acknowledgement: "This product includes software developed by the Apache
  + * Software Foundation (http://www.apache.org/)." Alternately, this
  + * acknowledgement may appear in the software itself, if and wherever such
  + * third-party acknowledgements normally appear. 4. The names "Apache", "The
  + * Jakarta Project", "Commons", and "Apache Software Foundation" must not be
  + * used to endorse or promote products derived from this software without prior
  + * written permission. For written permission, please contact
  + * apache@apache.org. 5. Products derived from this software may not be called
  + * "Apache", "Apache" nor may "Apache" appear in their name without prior
  + * written permission of the Apache Software Foundation.
  + * 
  + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
  + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  + * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    * ====================================================================
  - *
  - * This software consists of voluntary contributions made by many
  - * individuals on behalf of the Apache Software Foundation.  For more
  - * information on the Apache Software Foundation, please see
  - * <http://www.apache.org/>.
  - *
  - */ 
  + * 
  + * This software consists of voluntary contributions made by many individuals
  + * on behalf of the Apache Software Foundation. For more information on the
  + * Apache Software Foundation, please see <http://www.apache.org/> .
  + *  
  + */
   
   package org.apache.commons.codec.language;
   
   import junit.framework.Test;
   import junit.framework.TestSuite;
  -
  +import org.apache.commons.codec.EncoderException;
   import org.apache.commons.codec.StringEncoder;
   import org.apache.commons.codec.StringEncoderAbstractTest;
   
   /**
  - * @version $Revision$ $Date$
  + * Tests RefinedSoundex.
  + * 
  + * @version $Id$
    * @author Rodney Waldhoff
  + * @author Gary D. Gregory
    */
   public class RefinedSoundexTest extends StringEncoderAbstractTest {
   
  -    public RefinedSoundexTest(String name) {
  -        super(name);
  -    }
  -
       public static Test suite() {
           return (new TestSuite(RefinedSoundexTest.class));
       }
   
  -    public void setUp() throws Exception {        
  -        super.setUp();
  -        _encoder = new RefinedSoundex();
  +    private RefinedSoundex encoder = null;
  +
  +    public RefinedSoundexTest(String name) {
  +        super(name);
       }
   
  -    public void tearDown() throws Exception {
  -        super.tearDown();
  -        _encoder = null;
  +    /**
  +	 * @return Returns the encoder.
  +	 */
  +    private RefinedSoundex getEncoder() {
  +        return this.encoder;
       }
   
       protected StringEncoder makeEncoder() {
           return new RefinedSoundex();
       }
   
  -    // ------------------------------------------------------------------------
  +    /**
  +	 * @param encoder
  +	 *                  The encoder to set.
  +	 */
  +    private void setEncoder(RefinedSoundex encoder) {
  +        this.encoder = encoder;
  +    }
   
  -    public void testEncode() throws Exception {
  -        assertEquals("T6036084",_encoder.encode("testing"));
  -        assertEquals("T60",_encoder.encode("The"));
  -        assertEquals("Q503",_encoder.encode("quick"));
  -        assertEquals("B1908",_encoder.encode("brown"));
  -        assertEquals("F205",_encoder.encode("fox"));
  -        assertEquals("J408106",_encoder.encode("jumped"));
  -        assertEquals("O0209",_encoder.encode("over"));
  -        assertEquals("T60",_encoder.encode("the"));
  -        assertEquals("L7050",_encoder.encode("lazy"));
  -        assertEquals("D6043",_encoder.encode("dogs"));
  +    public void setUp() throws Exception {
  +        super.setUp();
  +        this.setEncoder(new RefinedSoundex());
       }
   
  -    private RefinedSoundex _encoder = null;
  -}
  +    public void tearDown() throws Exception {
  +        super.tearDown();
  +        this.setEncoder(null);
  +    }
  +
  +    public void testDifference() throws EncoderException {
  +        // Edge cases
  +        assertEquals(0, this.getEncoder().difference(null, null));
  +        assertEquals(0, this.getEncoder().difference("", ""));
  +        assertEquals(0, this.getEncoder().difference(" ", " "));
  +        // Normal cases
  +        assertEquals(6, this.getEncoder().difference("Smith", "Smythe"));
  +        assertEquals(3, this.getEncoder().difference("Ann", "Andrew"));
  +        assertEquals(1, this.getEncoder().difference("Margaret", "Andrew"));
  +        assertEquals(1, this.getEncoder().difference("Janet", "Margaret"));
  +        // Examples from
  +		// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp
  +        assertEquals(5, this.getEncoder().difference("Green", "Greene"));
  +        assertEquals(1, this.getEncoder().difference("Blotchet-Halls", "Greene"));
  +        // Examples from
  +		// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
  +        assertEquals(6, this.getEncoder().difference("Smith", "Smythe"));
  +        assertEquals(8, this.getEncoder().difference("Smithers", "Smythers"));
  +        assertEquals(5, this.getEncoder().difference("Anothers", "Brothers"));
  +    }
  +
  +    public void testEncode() throws EncoderException {
  +        assertEquals("T6036084", this.getEncoder().encode("testing"));
  +        assertEquals("T6036084", this.getEncoder().encode("TESTING"));
  +        assertEquals("T60", this.getEncoder().encode("The"));
  +        assertEquals("Q503", this.getEncoder().encode("quick"));
  +        assertEquals("B1908", this.getEncoder().encode("brown"));
  +        assertEquals("F205", this.getEncoder().encode("fox"));
  +        assertEquals("J408106", this.getEncoder().encode("jumped"));
  +        assertEquals("O0209", this.getEncoder().encode("over"));
  +        assertEquals("T60", this.getEncoder().encode("the"));
  +        assertEquals("L7050", this.getEncoder().encode("lazy"));
  +        assertEquals("D6043", this.getEncoder().encode("dogs"));
  +    }
  +}
  \ No newline at end of file
  
  
  
  1.13      +126 -97   jakarta-commons/codec/src/java/org/apache/commons/codec/language/RefinedSoundex.java
  
  Index: RefinedSoundex.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons/codec/src/java/org/apache/commons/codec/language/RefinedSoundex.java,v
  retrieving revision 1.12
  retrieving revision 1.13
  diff -u -r1.12 -r1.13
  --- RefinedSoundex.java	24 Nov 2003 00:11:56 -0000	1.12
  +++ RefinedSoundex.java	11 Dec 2003 23:44:11 -0000	1.13
  @@ -30,8 +30,8 @@
    *    from this software without prior written permission. For written 
    *    permission, please contact apache@apache.org.
    *
  - * 5. Products derived from this software may not be called "Apache",
  - *    "Apache" nor may "Apache" appear in their name without prior 
  + * 5. Products derived from this software may not be called "Apache"
  + *    nor may "Apache" appear in their name without prior 
    *    written permission of the Apache Software Foundation.
    *
    * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  @@ -61,120 +61,99 @@
   import org.apache.commons.codec.StringEncoder;
   
   /**
  - * Encodes a string into a refined soundex value.  
  - * A refined soundex code is optimized for spell checking word. 
  - * "Soundex" method originally developed by Margaret Odell and 
  - * Robert Russell
  + * Encodes a string into a refined soundex value. A refined soundex code is
  + * optimized for spell checking word. "Soundex" method originally developed by
  + * Margaret Odell and Robert Russell
    * 
    * @author Tim O'Brien
  - * @author ggregory@seagullsw.com
  + * @author Gary D. Gregory
    * @version $Id$
    */
   public class RefinedSoundex implements StringEncoder {
   
       /**
  -     * RefinedSoundex is *refined* for a number of
  -     * reasons one being that the mappings have been
  -     * altered.  This implementation contains default
  -     * mappings for US English.
  -     */
  -    public static final char[] US_ENGLISH_MAPPING =
  -        "01360240043788015936020505".toCharArray();
  +	 * This static variable contains an instance of the RefinedSoundex using
  +	 * the US_ENGLISH mapping.
  +	 */
  +    public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
   
       /**
  -     * This static variable contains an instance of the
  -     * RefinedSoundex using the US_ENGLISH mapping.
  -     */
  -    public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
  -    
  +	 * RefinedSoundex is *refined* for a number of reasons one being that the
  +	 * mappings have been altered. This implementation contains default
  +	 * mappings for US English.
  +	 */
  +    public static final char[] US_ENGLISH_MAPPING = "01360240043788015936020505".toCharArray();
  +
       /**
  -     * Every letter of the alphabet is "mapped" to a numerical 
  -     * value.  This char array holds the values to which each
  -     * letter is mapped.  This implementation contains a default
  -     * map for US_ENGLISH
  -     */
  +	 * Every letter of the alphabet is "mapped" to a numerical value. This char
  +	 * array holds the values to which each letter is mapped. This
  +	 * implementation contains a default map for US_ENGLISH
  +	 */
       private char[] soundexMapping;
   
       /**
  -     * Creates an instance of the RefinedSoundex object using the
  -     * default US English mapping.
  -     */
  +	 * Creates an instance of the RefinedSoundex object using the default US
  +	 * English mapping.
  +	 */
       public RefinedSoundex() {
           this(US_ENGLISH_MAPPING);
       }
   
       /**
  -     * Creates a refined soundex instance using a custom mapping.  This
  -     * constructor can be used to customize the mapping, and/or possibly
  -     * provide an internationalized mapping for a non-Western character
  -     * set.
  -     *
  -     * @param mapping Mapping array to use when finding the corresponding
  -     *                code for a given character
  -     */
  +	 * Creates a refined soundex instance using a custom mapping. This
  +	 * constructor can be used to customize the mapping, and/or possibly
  +	 * provide an internationalized mapping for a non-Western character set.
  +	 * 
  +	 * @param mapping
  +	 *                  Mapping array to use when finding the corresponding code for
  +	 *                  a given character
  +	 */
       public RefinedSoundex(char[] mapping) {
           this.soundexMapping = mapping;
       }
   
       /**
  -     * Retreives the Refined Soundex code for a given String object.
  -     *
  -     * @param str String to encode using the Refined Soundex algorithm
  -     * @return A soundex code for the String supplied
  -     */
  -    public String soundex(String str) {
  -        if (null == str || str.length() == 0) { return str; }
  -       
  -        StringBuffer sBuf = new StringBuffer();        
  -        str = str.toUpperCase();
  -
  -        sBuf.append(str.charAt(0));
  -
  -        char last, current;
  -        last = '*';
  -
  -        for (int i = 0; i < str.length(); i++) {
  -
  -            current = getMappingCode(str.charAt(i));
  -            if (current == last) {
  -                continue;
  -            } else if (current != 0) {
  -                sBuf.append(current);   
  -            }
  -            
  -            last = current;             
  -            
  -        }
  -        
  -        return sBuf.toString();
  -    }
  -
  -    /**
  -     * Encodes a String using the refined soundex algorithm. 
  -     *
  -     * @param pString A String object to encode
  -     * @return A Soundex code corresponding to the String supplied
  -     */
  -    public String encode(String pString) {
  -        return soundex(pString);   
  +	 * Returns the number of characters in the two encoded Strings that are the
  +	 * same. This return value ranges from 0 to the length of the shortest
  +	 * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for
  +	 * example) indicates strong similarity or identical values. For refined
  +	 * Soundex, the return value can be greater than 4.
  +	 * 
  +	 * @param s1
  +	 *                  A String that will be encoded and compared.
  +	 * @param s2
  +	 *                  A String that will be encoded and compared.
  +	 * @return The number of characters in the two encoded Strings that are the
  +	 *             same from 0 to to the length of the shortest encoded String.
  +	 * 
  +	 * @see SoundexUtils#difference(StringEncoder,String,String)
  +	 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
  +	 *          MS T-SQL DIFFERENCE</a>
  +	 * 
  +	 * @throws EncoderException
  +	 *                  if an error occurs encoding one of the strings
  +	 */
  +    public int difference(String s1, String s2) throws EncoderException {
  +        return SoundexUtils.difference(this, s1, s2);
       }
   
       /**
  -     * Encodes an Object using the refined soundex algorithm.  This method
  -     * is provided in order to satisfy the requirements of the
  -     * Encoder interface, and will throw an EncoderException if the
  -     * supplied object is not of type java.lang.String.
  -     *
  -     * @param pObject Object to encode
  -     * @return An object (or type java.lang.String) containing the 
  -     *         refined soundex code which corresponds to the String supplied.
  -     * @throws EncoderException if the parameter supplied is not
  -     *                          of type java.lang.String
  -     */
  +	 * Encodes an Object using the refined soundex algorithm. This method is
  +	 * provided in order to satisfy the requirements of the Encoder interface,
  +	 * and will throw an EncoderException if the supplied object is not of type
  +	 * java.lang.String.
  +	 * 
  +	 * @param pObject
  +	 *                  Object to encode
  +	 * @return An object (or type java.lang.String) containing the refined
  +	 *             soundex code which corresponds to the String supplied.
  +	 * @throws EncoderException
  +	 *                  if the parameter supplied is not of type java.lang.String
  +	 */
       public Object encode(Object pObject) throws EncoderException {
           Object result;
           if (!(pObject instanceof java.lang.String)) {
  -            throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String"); 
  +            throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
           } else {
               result = soundex((String) pObject);
           }
  @@ -182,18 +161,68 @@
       }
   
       /**
  -     * Returns the mapping code for a given character.  The mapping
  -     * codes are maintained in an internal char array named soundexMapping,
  -     * and the default values of these mappings are US English.
  -     *
  -     * @param c char to get mapping for
  -     * @return A character (really a numeral) to return for the given char
  -     */
  +	 * Encodes a String using the refined soundex algorithm.
  +	 * 
  +	 * @param pString
  +	 *                  A String object to encode
  +	 * @return A Soundex code corresponding to the String supplied
  +	 */
  +    public String encode(String pString) {
  +        return soundex(pString);
  +    }
  +
  +    /**
  +	 * Returns the mapping code for a given character. The mapping codes are
  +	 * maintained in an internal char array named soundexMapping, and the
  +	 * default values of these mappings are US English.
  +	 * 
  +	 * @param c
  +	 *                  char to get mapping for
  +	 * @return A character (really a numeral) to return for the given char
  +	 */
       private char getMappingCode(char c) {
           if (!Character.isLetter(c)) {
               return 0;
           } else {
  -            return soundexMapping[Character.toUpperCase(c) - 'A'];
  +            return this.soundexMapping[Character.toUpperCase(c) - 'A'];
  +        }
  +    }
  +
  +    /**
  +	 * Retreives the Refined Soundex code for a given String object.
  +	 * 
  +	 * @param str
  +	 *                  String to encode using the Refined Soundex algorithm
  +	 * @return A soundex code for the String supplied
  +	 */
  +    public String soundex(String str) {
  +        if (str == null) {
  +            return null;
  +        }
  +        str = SoundexUtils.clean(str);
  +        if (str.length() == 0) {
  +            return str;
           }
  +
  +        StringBuffer sBuf = new StringBuffer();
  +        sBuf.append(str.charAt(0));
  +
  +        char last, current;
  +        last = '*';
  +
  +        for (int i = 0; i < str.length(); i++) {
  +
  +            current = getMappingCode(str.charAt(i));
  +            if (current == last) {
  +                continue;
  +            } else if (current != 0) {
  +                sBuf.append(current);
  +            }
  +
  +            last = current;
  +
  +        }
  +
  +        return sBuf.toString();
       }
  -}
  +}
  \ No newline at end of file
  
  
  
  1.17      +19 -99    jakarta-commons/codec/src/java/org/apache/commons/codec/language/Soundex.java
  
  Index: Soundex.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons/codec/src/java/org/apache/commons/codec/language/Soundex.java,v
  retrieving revision 1.16
  retrieving revision 1.17
  diff -u -r1.16 -r1.17
  --- Soundex.java	11 Dec 2003 01:39:28 -0000	1.16
  +++ Soundex.java	11 Dec 2003 23:44:11 -0000	1.17
  @@ -68,6 +68,7 @@
    * @author bayard@generationjava.com
    * @author Tim O'Brien
    * @author Gary Gregory
  + * @see <a href="http://www.archives.gov/research_room/genealogy/census/soundex.html">NARA, Genealogy, Soundex Indexing</a>
    * @version $Id$
    */
   public class Soundex implements StringEncoder {
  @@ -85,19 +86,19 @@
       public static final char[] US_ENGLISH_MAPPING = "01230120022455012623010202".toCharArray();
   
       /**
  -	 * Returns the difference between the Soundex values of two Strings. For
  -	 * Soundex, this return value ranges from 0 through 4: 0 indicates little or
  -	 * no similarity, and 4 indicates strong similarity or identical values.
  -	 * 
  -     * @param s1
  -     *                  A String that will be encoded and compared.
  -     * @param s2
  -     *                  A String that will be encoded and compared.
  -	 * @return The return value ranges from 0 through 4: 0 indicates little or
  -	 *             no similarity, and 4 indicates strong similarity or identical
  -	 *             values.
  +	 * Encodes the Strings and returns the number of characters in the two
  +	 * encoded Strings that are the same. This return value ranges from 0
  +	 * through 4: 0 indicates little or no similarity, and 4 indicates strong
  +	 * similarity or identical values.
  +	 * 
  +	 * @param s1
  +	 *                  A String that will be encoded and compared.
  +	 * @param s2
  +	 *                  A String that will be encoded and compared.
  +	 * @return The number of characters in the two encoded Strings that are the
  +	 *             same from 0 to 4.
   	 * 
  -     * @see #difference(StringEncoder,String,String)
  +	 * @see SoundexUtils#difference(StringEncoder,String,String)
   	 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
   	 *          MS T-SQL DIFFERENCE</a>
   	 * 
  @@ -105,64 +106,7 @@
   	 *                  if an error occurs encoding one of the strings
   	 */
       public int difference(String s1, String s2) throws EncoderException {
  -        return difference(this, s1, s2);
  -    }
  -
  -    /**
  -	 * Returns the difference between the encoded values of two Strings. The
  -	 * higher the difference factor, the more similar the strings. For Soundex,
  -	 * this return value ranges from 0 through 4: 0 indicates little or no
  -	 * similarity, and 4 indicates strong similarity or identical values.
  -	 * 
  -	 * @param encoder
  -	 *                  The encoder to use to encode the String parameters.
  -     * @param s1
  -     *                  A String that will be encoded and compared.
  -     * @param s2
  -     *                  A String that will be encoded and compared.
  -	 * @return an integer from 0 to the length of the shorter string. The
  -	 *             smaller the number, the more different the strings are.
  -	 * 
  -     * @see #differenceEncoded(String,String)
  -	 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
  -	 *          MS T-SQL DIFFERENCE</a>
  -	 * 
  -	 * @throws EncoderException
  -	 *                  if an error occurs encoding one of the strings
  -	 */
  -    public static int difference(StringEncoder encoder, String s1, String s2) throws EncoderException {
  -        return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
  -    }
  -
  -    /**
  -	 * Returns the difference between the values of two encoded Strings. The
  -	 * higher the difference factor, the more similar the strings. For Soundex,
  -	 * this return value ranges from 0 through 4: 0 indicates little or no
  -	 * similarity, and 4 indicates strong similarity or identical values.
  -	 * 
  -	 * @param es1
  -	 *                  An encoded String.
  -	 * @param es2
  -	 *                  An encoded String.
  -	 * @return an integer from 0 to the length of the shorter string. The
  -	 *             smaller the number, the more different the strings are.
  -	 * 
  -	 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
  -	 *          MS T-SQL DIFFERENCE</a>
  -	 */
  -    public static int differenceEncoded(String es1, String es2) {
  -
  -        if (es1 == null || es2 == null) {
  -            return 0;
  -        }
  -        int lengthToMatch = Math.min(es1.length(), es2.length());
  -        int diff = 0;
  -        for (int i = 0; i < lengthToMatch; i++) {
  -            if (es1.charAt(i) == es2.charAt(i)) {
  -                diff++;
  -            }
  -        }
  -        return diff;
  +        return SoundexUtils.difference(this, s1, s2);
       }
   
       /**
  @@ -203,32 +147,6 @@
       }
   
       /**
  -	 * Cleans up the input string before Soundex processing by only returning
  -	 * upper case letters.
  -	 * 
  -	 * @param str
  -	 *                  The String to clean
  -	 * @return a clean String.
  -	 */
  -    private String clean(String str) {
  -        if (str == null || str.length() == 0) {
  -            return str;
  -        }
  -        int len = str.length();
  -        char[] chars = new char[len];
  -        int count = 0;
  -        for (int i = 0; i < len; i++) {
  -            if (Character.isLetter(str.charAt(i))) {
  -                chars[count++] = str.charAt(i);
  -            }
  -        }
  -        if (count == len) {
  -            return str.toUpperCase();
  -        }
  -        return new String(chars, 0, count).toUpperCase();
  -    }
  -
  -    /**
   	 * Encodes an Object using the soundex algorithm. This method is provided
   	 * in order to satisfy the requirements of the Encoder interface, and will
   	 * throw an EncoderException if the supplied object is not of type
  @@ -306,7 +224,8 @@
       }
   
       /**
  -	 * @return Returns the soundexMapping.
  +     * Returns the soundex mapping.
  +	 * @return soundexMapping.
   	 */
       private char[] getSoundexMapping() {
           return this.soundexMapping;
  @@ -336,6 +255,7 @@
       }
   
       /**
  +     * Sets the soundexMapping.
   	 * @param soundexMapping
   	 *                  The soundexMapping to set.
   	 */
  @@ -354,7 +274,7 @@
           if (str == null) {
               return null;
           }
  -        str = this.clean(str);
  +        str = SoundexUtils.clean(str);
           if (str.length() == 0) {
               return str;
           }
  
  
  
  1.1                  jakarta-commons/codec/src/java/org/apache/commons/codec/language/SoundexUtils.java
  
  Index: SoundexUtils.java
  ===================================================================
  /*
   * ====================================================================
   * 
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001-2003 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer. 
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgement:  
   *       "This product includes software developed by the 
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgement may appear in the software itself,
   *    if and wherever such third-party acknowledgements normally appear.
   *
   * 4. The names "Apache", "The Jakarta Project", "Commons", and "Apache Software
   *    Foundation" must not be used to endorse or promote products derived
   *    from this software without prior written permission. For written 
   *    permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache"
   *    nor may "Apache" appear in their name without prior 
   *    written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   *
   */ 
  
  package org.apache.commons.codec.language;
  
  import org.apache.commons.codec.EncoderException;
  import org.apache.commons.codec.StringEncoder;
  
  /**
   * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes.
   * 
   * @author Gary D. Gregory
   * @version $Id: SoundexUtils.java,v 1.1 2003/12/11 23:44:11 ggregory Exp $
   */
  final class SoundexUtils {
  
      /**
       * Cleans up the input string before Soundex processing by discarding
       * every character that is not a letter and upper-casing what remains.
       * <p>
       * Upper-casing is done with {@link java.util.Locale#ENGLISH} instead of
       * the JVM default locale so that the result is the same on every
       * platform. Under a Turkish default locale, for example, "i" would
       * otherwise be converted to the dotted capital I (U+0130) instead of
       * "I".
       * </p>
       * 
       * @param str
       *                  The String to clean, may be <code>null</code>.
       * @return <code>str</code> itself when it is <code>null</code> or
       *             empty; otherwise a String made of the letters of
       *             <code>str</code>, in their original order, upper-cased.
       */
      static String clean(String str) {
          if (str == null || str.length() == 0) {
              return str;
          }
          int len = str.length();
          char[] chars = new char[len];
          int count = 0;
          for (int i = 0; i < len; i++) {
              char c = str.charAt(i);
              if (Character.isLetter(c)) {
                  chars[count++] = c;
              }
          }
          if (count == len) {
              // Every character was a letter: no need to build a filtered copy.
              return str.toUpperCase(java.util.Locale.ENGLISH);
          }
          return new String(chars, 0, count).toUpperCase(java.util.Locale.ENGLISH);
      }
  
      /**
       * Encodes the Strings and returns the number of characters in the two
       * encoded Strings that are the same.
       * <ul>
       * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
       * little or no similarity, and 4 indicates strong similarity or identical
       * values.</li>
       * <li>For refined Soundex, the return value can be greater than 4.</li>
       * </ul>
       * 
       * @param encoder
       *                  The encoder to use to encode the Strings.
       * @param s1
       *                  A String that will be encoded and compared.
       * @param s2
       *                  A String that will be encoded and compared.
       * @return The number of characters in the two encoded Strings that are
       *             the same, as computed by
       *             {@link #differenceEncoded(String,String)}.
       * 
       * @see #differenceEncoded(String,String)
       * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
       *          MS T-SQL DIFFERENCE</a>
       * 
       * @throws EncoderException
       *                  if an error occurs encoding one of the strings
       */
      static int difference(StringEncoder encoder, String s1, String s2) throws EncoderException {
          return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
      }
  
      /**
       * Returns the number of characters in the two Soundex encoded Strings that
       * are the same.
       * <p>
       * The Strings are compared position by position, up to the length of the
       * shorter one; each position holding the same character in both Strings
       * adds one to the result.
       * </p>
       * <ul>
       * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
       * little or no similarity, and 4 indicates strong similarity or identical
       * values.</li>
       * <li>For refined Soundex, the return value can be greater than 4.</li>
       * </ul>
       * 
       * @param es1
       *                  An encoded String, may be <code>null</code>.
       * @param es2
       *                  An encoded String, may be <code>null</code>.
       * @return The number of positions at which the two encoded Strings hold
       *             the same character, or 0 when either String is
       *             <code>null</code>.
       * 
       * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
       *          MS T-SQL DIFFERENCE</a>
       */
      static int differenceEncoded(String es1, String es2) {
  
          if (es1 == null || es2 == null) {
              return 0;
          }
          // Only positions present in both Strings can match.
          int lengthToMatch = Math.min(es1.length(), es2.length());
          int diff = 0;
          for (int i = 0; i < lengthToMatch; i++) {
              if (es1.charAt(i) == es2.charAt(i)) {
                  diff++;
              }
          }
          return diff;
      }
  
  }
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org