You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by to...@apache.org on 2003/06/11 05:23:26 UTC

cvs commit: jakarta-commons/codec/src/java/org/apache/commons/codec/language DoubleMetaphone.java

tobrien     2003/06/10 20:23:26

  Modified:    codec    project.xml
  Added:       codec/src/java/org/apache/commons/codec/language
                        DoubleMetaphone.java
  Log:
  Added Benjamin Walstrum's contribution of DoubleMetaphone, added Benjamin to the contributors
  
  Revision  Changes    Path
  1.14      +4 -0      jakarta-commons/codec/project.xml
  
  Index: project.xml
  ===================================================================
  RCS file: /home/cvs/jakarta-commons/codec/project.xml,v
  retrieving revision 1.13
  retrieving revision 1.14
  diff -u -r1.13 -r1.14
  --- project.xml	29 May 2003 21:53:42 -0000	1.13
  +++ project.xml	11 Jun 2003 03:23:26 -0000	1.14
  @@ -93,6 +93,10 @@
        <email>steve.zimmermann@heii.com</email>
        <roles>documentation</roles>
      </contributor>
  +   <contributor>
  +     <name>Benjamin Walstrum</name>
  +     <email>ben@walstrum.com</email>
  +   </contributor>
     </contributors>  
   
     <dependencies>
  
  
  
  1.1                  jakarta-commons/codec/src/java/org/apache/commons/codec/language/DoubleMetaphone.java
  
  Index: DoubleMetaphone.java
  ===================================================================
  /*
   * $Header: /home/cvs/jakarta-commons/codec/src/java/org/apache/commons/codec/language/DoubleMetaphone.java,v 1.1 2003/06/11 03:23:26 tobrien Exp $
   * $Revision: 1.1 $
   * $Date: 2003/06/11 03:23:26 $
   *
   * ====================================================================
   *
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 1999-2003 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution, if
   *    any, must include the following acknowlegement:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowlegement may appear in the software itself,
   *    if and wherever such third-party acknowlegements normally appear.
   *
   * 4. The names "The Jakarta Project", "Commons", and "Apache Software
   *    Foundation" must not be used to endorse or promote products derived
   *    from this software without prior written permission. For written
   *    permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache"
   *    nor may "Apache" appear in their names without prior written
   *    permission of the Apache Group.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   *
   */
  package org.apache.commons.codec.language;
  
  import org.apache.commons.codec.EncoderException;
  import org.apache.commons.codec.StringEncoder;
  
  /**
   * Double Metaphone encoder: a Java implementation of the algorithm by
   * Lawrence Philips.
   * 
   * Original Article <a 
   * href="http://www.cuj.com/documents/s=8038/cuj0006philips/">
   * http://www.cuj.com/documents/s=8038/cuj0006philips/</a>
   * Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip">
   * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a>
   * 
   * @author <a href="mailto:ben@walstrum.com">Benjamin Walstrum</a>
   */
  public class DoubleMetaphone implements StringEncoder {
  
      /** Letters counted as vowels (includes 'Y'); presumably read by isVowel() -- confirm. */
      private static final String VOWELS = "AEIOUY";
      /** Two-letter word starts; presumably consulted by isSilentStart() -- confirm. */
      private static final String[] SILENT_START = 
      { "GN", "KN", "PN", "WR", "PS" };
      /** Single-character criteria set; its user is outside the visible handlers. */
      private static final String[] L_R_N_M_B_H_F_V_W_SPACE = 
      { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
      /** Two-letter criteria tested after a word-initial 'G' in handleG. */
      private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = 
      { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
      /** Single-letter criteria tested against the character after a 'J' in handleJ. */
      private static final String[] L_T_K_S_N_M_B_Z = 
      { "L", "T", "K", "S", "N", "M", "B", "Z" };
  
      /** Handed to each new DoubleMetaphoneResult; defaults to 4. */
      private int maxCodeLen = 4;
  
      public DoubleMetaphone() {
          super();
      }
      
      /**
       * Encodes a value with Double Metaphone and returns the primary 
       * encoding.
       *
       * @param value the text to encode
       * @return whatever <code>doubleMetaphone(value, false)</code> returns
       */
      public String doubleMetaphone(String value) {
          final boolean useAlternate = false;
          return doubleMetaphone(value, useAlternate);
      }
      
      /**
       * Encodes a value with Double Metaphone, optionally returning the 
       * alternate encoding instead of the primary one.
       * 
       * The input is first passed through <code>cleanInput</code>; the cleaned 
       * text is then scanned left to right, each letter being dispatched to a 
       * handler that appends to the result and returns the next index to read.
       * Scanning stops at the end of the text or once the result reports that 
       * it is complete.
       *
       * @param value the text to encode
       * @param alternate <code>true</code> to return the alternate encoding, 
       *                  <code>false</code> to return the primary encoding
       * @return the requested encoding, or <code>null</code> when the cleaned 
       *         input is <code>null</code>
       */
      public String doubleMetaphone(String value, boolean alternate) {
          value = cleanInput(value);
          if (value == null) {
              return null;
          }
          
          boolean slavoGermanic = isSlavoGermanic(value);
          //-- start at the second character when the word has a silent start --//
          int index = isSilentStart(value) ? 1 : 0;
          
          DoubleMetaphoneResult result = new DoubleMetaphoneResult(maxCodeLen);
          
          while (!result.isComplete() && index <= value.length() - 1) {
              switch (value.charAt(index)) {
              case 'A':
              case 'E':
              case 'I':
              case 'O':
              case 'U':
              case 'Y':
                  index = handleAEIOUY(value, result, index);
                  break;
              case 'B':
                  result.append('P');
                  index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
                  break;
              case '\u00C7':
                  //-- C with cedilla; written as a Unicode escape so the --//
                  //-- literal does not depend on the source file encoding --//
                  result.append('S');
                  index++;
                  break;
              case 'C':
                  index = handleC(value, result, index);
                  break;
              case 'D':
                  index = handleD(value, result, index);
                  break;
              case 'F':
                  result.append('F');
                  index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
                  break;
              case 'G':
                  index = handleG(value, result, index, slavoGermanic);
                  break;
              case 'H':
                  index = handleH(value, result, index);
                  break;
              case 'J':
                  index = handleJ(value, result, index, slavoGermanic);
                  break;
              case 'K':
                  result.append('K');
                  index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
                  break;
              case 'L':
                  index = handleL(value, result, index);
                  break;
              case 'M':
                  result.append('M');
                  index = conditionM0(value, index) ? index + 2 : index + 1;
                  break;
              case 'N':
                  result.append('N');
                  index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
                  break;
              case '\u00D1':
                  //-- N with tilde; Unicode escape for the same reason --//
                  result.append('N');
                  index++;
                  break;
              case 'P':
                  index = handleP(value, result, index);
                  break;
              case 'Q':
                  result.append('K');
                  index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
                  break;
              case 'R':
                  index = handleR(value, result, index, slavoGermanic);
                  break;
              case 'S':
                  index = handleS(value, result, index, slavoGermanic);
                  break;
              case 'T':
                  index = handleT(value, result, index);
                  break;
              case 'V':
                  result.append('F');
                  index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
                  break;
              case 'W':
                  index = handleW(value, result, index);
                  break;
              case 'X':
                  index = handleX(value, result, index);
                  break;
              case 'Z':
                  index = handleZ(value, result, index, slavoGermanic);
                  break;
              default:
                  //-- any other character contributes nothing to the code --//
                  index++;
                  break;
              }
          }
  
          return alternate ? result.getAlternate() : result.getPrimary();
      }
      
      /**
       * Encodes the value using DoubleMetaphone.  It will only work if 
       * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>).
       *
       * @param obj the object to encode; must be a <code>String</code>
       * @return the result of <code>doubleMetaphone((String) obj)</code>
       * @throws EncoderException if <code>obj</code> is not a 
       *         <code>String</code> (this includes <code>null</code>)
       */
      public Object encode(Object obj) throws EncoderException {
          if (!(obj instanceof String)) {
              throw new EncoderException("Parameter supplied to DoubleMetaphone " 
                                         + "encode is not of type " 
                                         + "java.lang.String"); 
          }
          return doubleMetaphone((String) obj);
      }
  
      /**
       * Encodes the value using DoubleMetaphone.
       *
       * @param value the text to encode
       * @return the result of <code>doubleMetaphone(value)</code>
       * @throws EncoderException declared by the signature; nothing in this 
       *         method body throws it
       */
      public String encode(String value) throws EncoderException {
          final String primary = doubleMetaphone(value);
          return primary;
      }
  
      /**
       * Checks whether the primary Double Metaphone values of two 
       * <code>String</code> values are equal.
       *
       * @param value1 the first text to compare
       * @param value2 the second text to compare
       * @return the result of the three-argument overload called with 
       *         <code>alternate</code> set to <code>false</code>
       */
      public boolean isDoubleMetaphoneEqual(String value1, String value2) {
          final boolean useAlternate = false;
          return isDoubleMetaphoneEqual(value1, value2, useAlternate);
      }
      
      /**
       * Checks whether the Double Metaphone values of two <code>String</code> 
       * values are equal, optionally using the alternate value.
       * 
       * The comparison is null-safe: <code>doubleMetaphone</code> returns 
       * <code>null</code> when the cleaned input is <code>null</code>, and two 
       * <code>null</code> encodings are treated as equal.
       *
       * @param value1 the first text to compare
       * @param value2 the second text to compare
       * @param alternate <code>true</code> to compare the alternate encodings, 
       *                  <code>false</code> to compare the primary encodings
       * @return <code>true</code> if both encodings are equal or both are 
       *         <code>null</code>
       */
      public boolean isDoubleMetaphoneEqual(String value1, String value2, 
                                            boolean alternate) {
          String code1 = doubleMetaphone(value1, alternate);
          String code2 = doubleMetaphone(value2, alternate);
          if (code1 == null) {
              //-- avoid calling equals() on a null encoding --//
              return code2 == null;
          }
          return code1.equals(code2);
      }
      
      /**
       * Returns the value handed to each new result as its maximum code 
       * length.
       * 
       * @return the current <code>maxCodeLen</code>
       */
      public int getMaxCodeLen() {
          return this.maxCodeLen;
      }
  
      /**
       * Replaces the value handed to each new result as its maximum code 
       * length.  The value is stored as given, without validation.
       * 
       * @param maxCodeLen the new <code>maxCodeLen</code>
       */
      public void setMaxCodeLen(int maxCodeLen) {
          this.maxCodeLen = maxCodeLen;
      }
  
      //-- BEGIN HANDLERS --//
  
      /**
       * Handles the 'A', 'E', 'I', 'O', 'U' and 'Y' cases: 'A' is appended 
       * only when the letter is the first character of the value.
       *
       * @param value the cleaned input being encoded (not read here)
       * @param result accumulator that receives the encoded characters
       * @param index position of the vowel within <code>value</code>
       * @return the index of the next character to process
       */
      private int handleAEIOUY(String value, DoubleMetaphoneResult result, 
                               int index) {
          final boolean leadingVowel = (index == 0);
          if (leadingVowel) {
              result.append('A');
          }
          return index + 1;
      }
      
      /**
       * Handles the 'C' cases.  The rules are tried in order and the first 
       * one that matches decides both what is appended and how far to advance.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'C' within <code>value</code>
       * @return the index of the next character to process
       */
      private int handleC(String value, DoubleMetaphoneResult result, int index) {
          if (conditionC0(value, index)) {
              //-- the intricate test lives in conditionC0 --//
              result.append('K');
              return index + 2;
          }
          if (index == 0 && contains(value, index, 6, "CAESAR")) {
              result.append('S');
              return index + 2;
          }
          if (contains(value, index, 2, "CH")) {
              return handleCH(value, result, index);
          }
          if (contains(value, index, 2, "CZ") 
              && !contains(value, index - 2, 4, "WICZ")) {
              //-- "Czerny" --//
              result.append('S', 'X');
              return index + 2;
          }
          if (contains(value, index + 1, 3, "CIA")) {
              //-- "focaccia" --//
              result.append('X');
              return index + 3;
          }
          if (contains(value, index, 2, "CC") 
              && (index != 1 || charAt(value, 0) != 'M')) {
              //-- double "cc" but not "McClelland" --//
              return handleCC(value, result, index);
          }
          if (contains(value, index, 2, "CK", "CG", "CQ")) {
              result.append('K');
              return index + 2;
          }
          if (contains(value, index, 2, "CI", "CE", "CY")) {
              //-- Italian vs. English --//
              if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
                  result.append('S', 'X');
              } else {
                  result.append('S');
              }
              return index + 2;
          }
  
          //-- no special rule matched: plain 'K' --//
          result.append('K');
          if (contains(value, index + 1, 2, " C", " Q", " G")) {
              //-- Mac Caffrey, Mac Gregor --//
              return index + 3;
          }
          boolean skipNext = contains(value, index + 1, 1, "C", "K", "Q") 
              && !contains(value, index + 1, 2, "CE", "CI");
          return skipNext ? index + 2 : index + 1;
      }
  
      /**
       * Handles the 'CC' cases.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the first 'C' within <code>value</code>
       * @return the index of the next character to process
       */
      private int handleCC(String value, DoubleMetaphoneResult result, int index) 
      {
          boolean softened = contains(value, index + 2, 1, "I", "E", "H") 
              && !contains(value, index + 2, 2, "HU");
          if (!softened) {
              //-- Pierce's rule --//
              result.append('K');
              return index + 2;
          }
  
          //-- "bellocchio" but not "bacchus" --//
          boolean ksSound = (index == 1 && charAt(value, index - 1) == 'A') 
              || contains(value, index - 1, 5, "UCCEE", "UCCES");
          if (ksSound) {
              //-- "accident", "accede", "succeed" --//
              result.append("KS");
          } else {
              //-- "bacci", "bertucci", other Italian --//
              result.append('X');
          }
          return index + 3;
      }
      
      /**
       * Handles the 'CH' cases.  Every rule consumes exactly two characters.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'C' within <code>value</code>
       * @return <code>index + 2</code>
       */
      private int handleCH(String value, DoubleMetaphoneResult result, int index) 
      {
          if (index > 0 && contains(value, index, 4, "CHAE")) {
              //-- Michael --//
              result.append('K', 'X');
          } else if (conditionCH0(value, index) || conditionCH1(value, index)) {
              //-- Greek roots ("chemistry", "chorus", etc.), or --//
              //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
              result.append('K');
          } else if (index > 0) {
              if (contains(value, 0, 2, "MC")) {
                  result.append('K');
              } else {
                  result.append('X', 'K');
              }
          } else {
              result.append('X');
          }
          return index + 2;
      }
  
      /**
       * Handles the 'D' cases.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'D' within <code>value</code>
       * @return the index of the next character to process
       */
      private int handleD(String value, DoubleMetaphoneResult result, int index) {
          if (contains(value, index, 2, "DG")) {
              if (contains(value, index + 2, 1, "I", "E", "Y")) {
                  //-- "Edge" --//
                  result.append('J');
                  return index + 3;
              }
              //-- "Edgar" --//
              result.append("TK");
              return index + 2;
          }
  
          //-- "DT" and "DD" collapse into a single 'T' --//
          int step = contains(value, index, 2, "DT", "DD") ? 2 : 1;
          result.append('T');
          return index + step;
      }
  
      /**
       * Handles the 'G' cases.  The rules are tried in order and the first 
       * one that matches decides both what is appended and how far to advance.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'G' within <code>value</code>
       * @param slavoGermanic the outcome of <code>isSlavoGermanic</code> for 
       *                      <code>value</code>
       * @return the index of the next character to process
       */
      private int handleG(String value, DoubleMetaphoneResult result, int index, 
                          boolean slavoGermanic) {
          if (charAt(value, index + 1) == 'H') {
              index = handleGH(value, result, index);
          } else if (charAt(value, index + 1) == 'N') {
              if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
                  result.append("KN", "N");
              } else if (!contains(value, index + 2, 2, "EY") && 
                         //-- always true here: the next character is 'N' --//
                         charAt(value, index + 1) != 'Y' && !slavoGermanic) {
                  result.append("N", "KN");
              } else {
                  result.append("KN");
              }
              index = index + 2;
          } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
              result.append("KL", "L");
              index += 2;
          } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains
                                    (value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
              //-- -ges-, -gep-, -gel-, -gie- at beginning --//
              result.append('K', 'J');
              index += 2;
          } else if ((contains(value, index + 1, 2, "ER") || charAt(value, index 
                                                                    + 1) == 'Y') &&
                     !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
                     !contains(value, index - 1, 1, "E", "I") && 
                     !contains(value, index - 1, 3, "RGY", "OGY")) {
              //-- -ger-, -gy- --//
              result.append('K', 'J');
              index += 2;
          } else if (contains(value, index + 1, 1, "E", "I", "Y") || 
                     contains(value, index - 1, 4, "AGGI", "OGGI")) {
              //-- Italian "biaggi" --//
              if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 
                                                                     3, "SCH")) ||
                  contains(value, index + 1, 2, "ET")) {
                  //-- obvious germanic --//
                  result.append('K');
              } else if (contains(value, index + 1, 3, "IER")) {
                  //-- window length now equals the criterion length, as at --//
                  //-- the other contains() calls in these handlers; it was --//
                  //-- 4, which presumably could never match "IER" --//
                  result.append('J');
              } else {
                  result.append('J', 'K');
              }
              index += 2;
          } else if (charAt(value, index + 1) == 'G') {
              index += 2;
              result.append('K');
          } else {
              index++;
              result.append('K');
          }
          return index;
      }
      
      /**
       * Handles the 'GH' cases.  Every rule consumes exactly two characters; 
       * the rules differ only in what, if anything, is appended.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'G' within <code>value</code>
       * @return <code>index + 2</code>
       */
      private int handleGH(String value, DoubleMetaphoneResult result, int index) 
      {
          if (index > 0 && !isVowel(charAt(value, index - 1))) {
              //-- preceded by a non-vowel --//
              result.append('K');
          } else if (index == 0) {
              //-- at the start of the word --//
              result.append(charAt(value, index + 2) == 'I' ? 'J' : 'K');
          } else {
              //-- Parker's rule (with some further refinements) - "hugh" --//
              boolean parkersRule = 
                  (index > 1 && contains(value, index - 2, 1, "B", "H", "D")) 
                  || (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) 
                  || (index > 3 && contains(value, index - 4, 1, "B", "H"));
              if (!parkersRule) {
                  if (index > 2 && charAt(value, index - 1) == 'U' 
                      && contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
                      //-- "laugh", "McLaughlin", "cough", "gough", --//
                      //-- "rough", "tough" --//
                      result.append('F');
                  } else if (index > 0 && charAt(value, index - 1) != 'I') {
                      result.append('K');
                  }
              }
          }
          return index + 2;
      }
  
      /**
       * Handles the 'H' cases.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'H' within <code>value</code>
       * @return the index of the next character to process
       */
      private int handleH(String value, DoubleMetaphoneResult result, int index) {
          //-- only keep if first & before vowel or between 2 vowels --//
          boolean leadInOk = index == 0 || isVowel(charAt(value, index - 1));
          if (leadInOk && isVowel(charAt(value, index + 1))) {
              result.append('H');
              //-- also takes care of "HH" --//
              return index + 2;
          }
          return index + 1;
      }
      
      /**
       * Handles the 'J' cases.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'J' within <code>value</code>
       * @param slavoGermanic the outcome of <code>isSlavoGermanic</code> for 
       *                      <code>value</code>
       * @return the index of the next character to process
       */
      private int handleJ(String value, DoubleMetaphoneResult result, int index, 
                          boolean slavoGermanic) {
          if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
              //-- obvious Spanish, "Jose", "San Jacinto" --//
              boolean joseAlone = (index == 0 && charAt(value, index + 4) == ' ') 
                  || value.length() == 4;
              if (joseAlone || contains(value, 0, 4, "SAN ")) {
                  result.append('H');
              } else {
                  result.append('J', 'H');
              }
              return index + 1;
          }
  
          if (index == 0 && !contains(value, index, 4, "JOSE")) {
              result.append('J', 'A');
          } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic 
                     && (charAt(value, index + 1) == 'A' 
                         || charAt(value, index + 1) == 'O')) {
              result.append('J', 'H');
          } else if (index == value.length() - 1) {
              result.append('J', ' ');
          } else if (!(contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) 
                       || contains(value, index - 1, 1, "S", "K", "L"))) {
              result.append('J');
          }
  
          //-- a doubled 'J' is consumed together with the first one --//
          return charAt(value, index + 1) == 'J' ? index + 2 : index + 1;
      }
      
      /**
       * Handles the 'L' cases.  'L' is always appended; for a doubled 'L' 
       * that satisfies <code>conditionL0</code> a blank is additionally 
       * appended to the alternate encoding.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'L' within <code>value</code>
       * @return the index of the next character to process
       */
      private int handleL(String value, DoubleMetaphoneResult result, int index) {
          result.append('L');
          if (charAt(value, index + 1) != 'L') {
              return index + 1;
          }
          if (conditionL0(value, index)) {
              result.appendAlternate(' ');
          }
          return index + 2;
      }
  
      /**
       * Handles the 'P' cases: "PH" is encoded as 'F', anything else as 'P'.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'P' within <code>value</code>
       * @return the index of the next character to process
       */
      private int handleP(String value, DoubleMetaphoneResult result, int index) {
          if (charAt(value, index + 1) == 'H') {
              result.append('F');
              return index + 2;
          }
          result.append('P');
          //-- a following 'P' or 'B' is consumed together with this 'P' --//
          if (contains(value, index + 1, 1, "P", "B")) {
              return index + 2;
          }
          return index + 1;
      }
  
      /**
       * Handles the 'R' cases.  A word-final 'R' that follows "IE" (but not 
       * "ME" or "MA" before that) in a non-SlavoGermanic value is appended to 
       * the alternate encoding only; every other 'R' is appended normally.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'R' within <code>value</code>
       * @param slavoGermanic the outcome of <code>isSlavoGermanic</code> for 
       *                      <code>value</code>
       * @return the index of the next character to process
       */
      private int handleR(String value, DoubleMetaphoneResult result, int index, 
                          boolean slavoGermanic) {
          boolean alternateOnly = index == value.length() - 1 
              && !slavoGermanic 
              && contains(value, index - 2, 2, "IE") 
              && !contains(value, index - 4, 2, "ME", "MA");
          if (alternateOnly) {
              result.appendAlternate('R');
          } else {
              result.append('R');
          }
  
          if (charAt(value, index + 1) == 'R') {
              return index + 2;
          }
          return index + 1;
      }
  
      /**
       * Handles the 'S' cases.  The rules are tried in order and the first 
       * one that matches decides both what is appended and how far to advance.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'S' within <code>value</code>
       * @param slavoGermanic the outcome of <code>isSlavoGermanic</code> for 
       *                      <code>value</code>
       * @return the index of the next character to process
       */
      private int handleS(String value, DoubleMetaphoneResult result, int index, 
                          boolean slavoGermanic) {
          if (contains(value, index - 1, 3, "ISL", "YSL")) {
              //-- special cases "island", "isle", "carlisle", "carlysle" --//
              return index + 1;
          }
          if (index == 0 && contains(value, index, 5, "SUGAR")) {
              //-- special case "sugar-" --//
              result.append('X', 'S');
              return index + 1;
          }
          if (contains(value, index, 2, "SH")) {
              if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) {
                  //-- germanic --//
                  result.append('S');
              } else {
                  result.append('X');
              }
              return index + 2;
          }
          if (contains(value, index, 3, "SIO", "SIA") 
              || contains(value, index, 4, "SIAN")) {
              //-- Italian and Armenian --//
              if (slavoGermanic) {
                  result.append('S');
              } else {
                  result.append('S', 'X');
              }
              return index + 3;
          }
          if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) 
              || contains(value, index + 1, 1, "Z")) {
              //-- german & anglicisations, e.g. "smith" match "schmidt", --//
              //-- "snider" match "schneider"; also, -sz- in slavic --//
              //-- language altho in hungarian it is pronounced "s" --//
              result.append('S', 'X');
              return contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
          }
          if (contains(value, index, 2, "SC")) {
              return handleSC(value, result, index);
          }
  
          boolean frenchEnding = index == value.length() - 1 
              && contains(value, index - 2, 2, "AI", "OI");
          if (frenchEnding) {
              //-- french e.g. "resnais", "artois" --//
              result.appendAlternate('S');
          } else {
              result.append('S');
          }
          return contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
      }
  
      /**
       * Handles the 'SC' cases.  Every rule consumes exactly three characters.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'S' within <code>value</code>
       * @return <code>index + 3</code>
       */
      private int handleSC(String value, DoubleMetaphoneResult result, int index) 
      {
          if (charAt(value, index + 2) != 'H') {
              if (contains(value, index + 2, 1, "I", "E", "Y")) {
                  result.append('S');
              } else {
                  result.append("SK");
              }
          } else if (contains(value, index + 3, 
                              2, "OO", "ER", "EN", "UY", "ED", "EM")) {
              //-- Schlesinger's rule --//
              //-- Dutch origin, e.g. "school", "schooner" --//
              if (contains(value, index + 3, 2, "ER", "EN")) {
                  //-- "schermerhorn", "schenker" --//
                  result.append("X", "SK");
              } else {
                  result.append("SK");
              }
          } else if (index == 0 && !isVowel(charAt(value, 3)) 
                     && charAt(value, 3) != 'W') {
              result.append('X', 'S');
          } else {
              result.append('X');
          }
          return index + 3;
      }
  
      /**
       * Handles the 'T' cases.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'T' within <code>value</code>
       * @return the index of the next character to process
       */
      private int handleT(String value, DoubleMetaphoneResult result, int index) {
          if (contains(value, index, 4, "TION") 
              || contains(value, index, 3, "TIA", "TCH")) {
              result.append('X');
              return index + 3;
          }
          if (contains(value, index, 2, "TH") 
              || contains(value, index, 3, "TTH")) {
              //-- special case "thomas", "thames" or germanic --//
              boolean plainT = contains(value, index + 2, 2, "OM", "AM") 
                  || contains(value, 0, 4, "VAN ", "VON ") 
                  || contains(value, 0, 3, "SCH");
              if (plainT) {
                  result.append('T');
              } else {
                  result.append('0', 'T');
              }
              return index + 2;
          }
  
          result.append('T');
          //-- a following 'T' or 'D' is consumed together with this 'T' --//
          if (contains(value, index + 1, 1, "T", "D")) {
              return index + 2;
          }
          return index + 1;
      }
  
      /**
       * Handles the 'W' cases.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'W' within <code>value</code>
       * @return the index of the next character to process
       */
      private int handleW(String value, DoubleMetaphoneResult result, int index) {
          if (contains(value, index, 2, "WR")) {
              //-- can also be in middle of word --//
              result.append('R');
              return index + 2;
          }
  
          if (index == 0 && (isVowel(charAt(value, index + 1)) 
                             || contains(value, index, 2, "WH"))) {
              if (isVowel(charAt(value, index + 1))) {
                  //-- Wasserman should match Vasserman --//
                  result.append('A', 'F');
              } else {
                  //-- need Uomo to match Womo --//
                  result.append('A');
              }
              return index + 1;
          }
  
          boolean arnowLike = 
              (index == value.length() - 1 && isVowel(charAt(value, index - 1))) 
              || contains(value, index - 1, 
                          5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") 
              || contains(value, 0, 3, "SCH");
          if (arnowLike) {
              //-- Arnow should match Arnoff --//
              result.appendAlternate('F');
              return index + 1;
          }
  
          if (contains(value, index, 4, "WICZ", "WITZ")) {
              //-- Polish e.g. "filipowicz" --//
              result.append("TS", "FX");
              return index + 4;
          }
  
          //-- any other 'W' contributes nothing --//
          return index + 1;
      }
      
      /**
       * Handles the 'X' cases.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'X' within <code>value</code>
       * @return the index of the next character to process
       */
      private int handleX(String value, DoubleMetaphoneResult result, int index) {
          if (index == 0) {
              result.append('S');
              return index + 1;
          }
  
          //-- French e.g. breaux: nothing is appended for such a final 'X' --//
          boolean frenchFinalX = (index == value.length() - 1) 
              && (contains(value, index - 3, 3, "IAU", "EAU") 
                  || contains(value, index - 2, 2, "AU", "OU"));
          if (!frenchFinalX) {
              result.append("KS");
          }
  
          if (contains(value, index + 1, 1, "C", "X")) {
              return index + 2;
          }
          return index + 1;
      }
  
      /**
       * Handles the 'Z' cases.
       *
       * @param value the cleaned input being encoded
       * @param result accumulator that receives the encoded characters
       * @param index position of the 'Z' within <code>value</code>
       * @param slavoGermanic the outcome of <code>isSlavoGermanic</code> for 
       *                      <code>value</code>
       * @return the index of the next character to process
       */
      private int handleZ(String value, DoubleMetaphoneResult result, int index, 
                          boolean slavoGermanic) {
          if (charAt(value, index + 1) == 'H') {
              //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
              result.append('J');
              return index + 2;
          }
  
          boolean withTsAlternate = 
              contains(value, index + 1, 2, "ZO", "ZI", "ZA") 
              || (slavoGermanic && index > 0 
                  && charAt(value, index - 1) != 'T');
          if (withTsAlternate) {
              result.append("S", "TS");
          } else {
              result.append('S');
          }
  
          //-- a doubled 'Z' is consumed together with the first one --//
          if (charAt(value, index + 1) == 'Z') {
              return index + 2;
          }
          return index + 1;
      }
  
      //-- BEGIN CONDITIONS --//
  
      /**
       * Complex condition 0 for 'C'.
       * <p>
       * Holds when "CHIA" starts at <code>index</code>, or when all of the
       * following are true: <code>index</code> is greater than 1, the character
       * two positions back is not a vowel, "ACH" spans
       * <code>index - 1</code> to <code>index + 1</code>, and the character at
       * <code>index + 2</code> is neither 'I' nor 'E' (or the six characters
       * starting at <code>index - 2</code> are "BACHER" or "MACHER").
       *
       * @param value the value being examined
       * @param index the position of the 'C' within <code>value</code>
       * @return <code>true</code> if the condition holds
       */
      private boolean conditionC0(String value, int index) {
          if (contains(value, index, 4, "CHIA")) {
              return true;
          }
  
          boolean consonantThenAch = index > 1 && 
              !isVowel(charAt(value, index - 2)) && 
              contains(value, index - 1, 3, "ACH");
          if (!consonantThenAch) {
              return false;
          }
  
          char afterCh = charAt(value, index + 2);
          return (afterCh != 'I' && afterCh != 'E') || 
              contains(value, index - 2, 6, "BACHER", "MACHER");
      }
      
      /**
       * Complex condition 0 for 'CH'.
       * <p>
       * Holds only at the very start of the value, when the 'C' is followed by
       * "HARAC", "HARIS", "HOR", "HYM", "HIA" or "HEM" and the value does not
       * begin with "CHORE".
       *
       * @param value the value being examined
       * @param index the position of the 'C' within <code>value</code>
       * @return <code>true</code> if the condition holds
       */
      private boolean conditionCH0(String value, int index) {
          if (index != 0) {
              return false;
          }
          boolean listedPrefix = 
              contains(value, index + 1, 5, "HARAC", "HARIS") || 
              contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM");
          return listedPrefix && !contains(value, 0, 5, "CHORE");
      }
      
      /**
       * Complex condition 1 for 'CH'.
       * <p>
       * Holds when any of the following is true:
       * <ul>
       * <li>the value starts with "VAN ", "VON " or "SCH";</li>
       * <li>the six characters starting at <code>index - 2</code> are "ORCHES",
       *     "ARCHIT" or "ORCHID";</li>
       * <li>the character at <code>index + 2</code> is 'T' or 'S';</li>
       * <li>the "CH" is at the start or preceded by 'A', 'O', 'U' or 'E', and
       *     it is either followed by one of the entries of
       *     <code>L_R_N_M_B_H_F_V_W_SPACE</code> or the 'H' is the last
       *     character of the value.</li>
       * </ul>
       *
       * @param value the value being examined
       * @param index the position of the 'C' within <code>value</code>
       * @return <code>true</code> if the condition holds
       */
      private boolean conditionCH1(String value, int index) {
          if (contains(value, 0, 4, "VAN ", "VON ") || 
              contains(value, 0, 3, "SCH")) {
              return true;
          }
          if (contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID")) {
              return true;
          }
          if (contains(value, index + 2, 1, "T", "S")) {
              return true;
          }
  
          boolean precededOk = 
              contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0;
          if (!precededOk) {
              return false;
          }
          return contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || 
              index + 1 == value.length() - 1;
      }
      
      /**
       * Complex condition 0 for 'L'.
       * <p>
       * Holds when the 'L' is the third character from the end and sits inside
       * "ILLO", "ILLA" or "ALLE", or when the 'L' sits inside "ALLE" and the
       * whole value ends in "AS", "OS", "A" or "O".
       * <p>
       * The "AS"/"OS" test is anchored at the end of the value.  It was
       * previously anchored at <code>index - 1</code>, the same position where
       * "ALLE" has to match, so that alternative could never be satisfied.
       *
       * @param value the value being examined
       * @param index the position of the first 'L' within <code>value</code>
       * @return <code>true</code> if the condition holds
       */
      private boolean conditionL0(String value, int index) {
          if (index == value.length() - 3 && 
              contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
              return true;
          }
  
          int last = value.length() - 1;
          // Suffix checks look at the end of the word, not at the 'L' itself.
          boolean listedEnding = contains(value, last - 1, 2, "AS", "OS") || 
              contains(value, last, 1, "A", "O");
          return listedEnding && contains(value, index - 1, 4, "ALLE");
      }
      
      /**
       * Complex condition 0 for 'M'.
       * <p>
       * Holds when the next character is also 'M', or when the 'M' is the
       * middle of "UMB" and that "UMB" either ends the value or is followed by
       * "ER".
       *
       * @param value the value being examined
       * @param index the position of the 'M' within <code>value</code>
       * @return <code>true</code> if the condition holds
       */
      private boolean conditionM0(String value, int index) {
          if (charAt(value, index + 1) == 'M') {
              return true;
          }
          if (!contains(value, index - 1, 3, "UMB")) {
              return false;
          }
          boolean bIsLastChar = (index + 1) == value.length() - 1;
          return bIsLastChar || contains(value, index + 2, 2, "ER");
      }
      
      //-- BEGIN HELPER FUNCTIONS --//
  
      /**
       * Determines whether or not a value is of slavo-germanic origin.  A value
       * is treated as slavo-germanic if it contains any of 'W', 'K', "CZ" or
       * "WITZ".
       *
       * @param value the value to inspect
       * @return <code>true</code> if any of the markers occurs in
       *         <code>value</code>
       */
      private boolean isSlavoGermanic(String value) {
          if (value.indexOf('W') >= 0 || value.indexOf('K') >= 0) {
              return true;
          }
          return value.indexOf("CZ") >= 0 || value.indexOf("WITZ") >= 0;
      }
  
      /**
       * Determines whether or not a character is a vowel, i.e. whether it
       * occurs in <code>VOWELS</code>.
       *
       * @param ch the character to test
       * @return <code>true</code> if <code>ch</code> is found in
       *         <code>VOWELS</code>
       */
      private boolean isVowel(char ch) {
          int position = VOWELS.indexOf(ch);
          return position >= 0;
      }
  
      /**
       * Determines whether or not the value starts with a silent letter, i.e.
       * with one of the prefixes listed in <code>SILENT_START</code> (described
       * by the original author as 'GN', 'KN', 'PN', 'WR' and 'PS').
       *
       * @param value the value to inspect
       * @return <code>true</code> if <code>value</code> starts with any entry
       *         of <code>SILENT_START</code>
       */    
      private boolean isSilentStart(String value) {
          for (int i = 0; i < SILENT_START.length; i++) {
              String prefix = SILENT_START[i];
              if (value.startsWith(prefix)) {
                  return true;
              }
          }
          return false;
      }
  
      /**
       * Cleans the input: trims surrounding whitespace and converts the value
       * to upper case.
       * <p>
       * The conversion uses English casing rules rather than the JVM default
       * locale.  The encoding rules compare against plain ASCII upper-case
       * letters, and a default locale such as Turkish would turn 'i' into a
       * dotted capital 'I' (U+0130) that matches none of them.
       *
       * @param input the raw value, may be <code>null</code>
       * @return the trimmed, upper-cased value, or <code>null</code> if
       *         <code>input</code> is <code>null</code> or contains only
       *         whitespace
       */    
      private String cleanInput(String input) {
          if (input == null) {
              return null;
          }
          String trimmed = input.trim();
          if (trimmed.length() == 0) {
              return null;
          }
          return trimmed.toUpperCase(java.util.Locale.ENGLISH);
      }
  
      /**
       * Gets the character at index <code>index</code> if that index lies
       * inside <code>value</code>; otherwise returns
       * <code>Character.MIN_VALUE</code> as a default, so callers can look
       * before the start or past the end without a bounds check.
       *
       * @param value the value to read from
       * @param index the position to read, may be out of range
       * @return the character at <code>index</code>, or
       *         <code>Character.MIN_VALUE</code> if <code>index</code> is
       *         negative or not less than <code>value.length()</code>
       */    
      protected char charAt(String value, int index) {
          boolean inRange = index >= 0 && index < value.length();
          return inRange ? value.charAt(index) : Character.MIN_VALUE;
      }
  
      /**
       * Shortcut method with 1 criteria: packs the criterion into an array and
       * delegates to the array-based <code>contains</code>.
       *
       * @param value    the value to look into
       * @param start    the index in <code>value</code> where the comparison starts
       * @param length   the number of characters to compare
       * @param criteria the candidate to compare against
       * @return the result of the array-based <code>contains</code>
       */    
      private static boolean contains(String value, int start, int length, 
                                      String criteria) {
          String[] candidates = { criteria };
          return contains(value, start, length, candidates);
      }
  
      /**
       * Shortcut method with 2 criteria: packs the criteria into an array and
       * delegates to the array-based <code>contains</code>.
       *
       * @param value     the value to look into
       * @param start     the index in <code>value</code> where the comparison starts
       * @param length    the number of characters to compare
       * @param criteria1 first candidate
       * @param criteria2 second candidate
       * @return the result of the array-based <code>contains</code>
       */    
      private static boolean contains(String value, int start, int length, 
                                      String criteria1, String criteria2) {
          String[] candidates = { criteria1, criteria2 };
          return contains(value, start, length, candidates);
      }
  
      /**
       * Shortcut method with 3 criteria: packs the criteria into an array and
       * delegates to the array-based <code>contains</code>.
       *
       * @param value     the value to look into
       * @param start     the index in <code>value</code> where the comparison starts
       * @param length    the number of characters to compare
       * @param criteria1 first candidate
       * @param criteria2 second candidate
       * @param criteria3 third candidate
       * @return the result of the array-based <code>contains</code>
       */    
      private static boolean contains(String value, int start, int length, 
                                      String criteria1, String criteria2, 
                                      String criteria3) {
          String[] candidates = { criteria1, criteria2, criteria3 };
          return contains(value, start, length, candidates);
      }
  
      /**
       * Shortcut method with 4 criteria: packs the criteria into an array and
       * delegates to the array-based <code>contains</code>.
       *
       * @param value     the value to look into
       * @param start     the index in <code>value</code> where the comparison starts
       * @param length    the number of characters to compare
       * @param criteria1 first candidate
       * @param criteria2 second candidate
       * @param criteria3 third candidate
       * @param criteria4 fourth candidate
       * @return the result of the array-based <code>contains</code>
       */    
      private static boolean contains(String value, int start, int length, 
                                      String criteria1, String criteria2, 
                                      String criteria3, String criteria4) {
          String[] candidates = { 
              criteria1, criteria2, criteria3, criteria4 
          };
          return contains(value, start, length, candidates);
      }
  
      /**
       * Shortcut method with 5 criteria: packs the criteria into an array and
       * delegates to the array-based <code>contains</code>.
       *
       * @param value     the value to look into
       * @param start     the index in <code>value</code> where the comparison starts
       * @param length    the number of characters to compare
       * @param criteria1 first candidate
       * @param criteria2 second candidate
       * @param criteria3 third candidate
       * @param criteria4 fourth candidate
       * @param criteria5 fifth candidate
       * @return the result of the array-based <code>contains</code>
       */    
      private static boolean contains(String value, int start, int length, 
                                      String criteria1, String criteria2, 
                                      String criteria3, String criteria4, 
                                      String criteria5) {
          String[] candidates = { 
              criteria1, criteria2, criteria3, criteria4, criteria5 
          };
          return contains(value, start, length, candidates);
      }
  
      /**
       * Shortcut method with 6 criteria: packs the criteria into an array and
       * delegates to the array-based <code>contains</code>.
       *
       * @param value     the value to look into
       * @param start     the index in <code>value</code> where the comparison starts
       * @param length    the number of characters to compare
       * @param criteria1 first candidate
       * @param criteria2 second candidate
       * @param criteria3 third candidate
       * @param criteria4 fourth candidate
       * @param criteria5 fifth candidate
       * @param criteria6 sixth candidate
       * @return the result of the array-based <code>contains</code>
       */    
      private static boolean contains(String value, int start, int length, 
                                      String criteria1, String criteria2, 
                                      String criteria3, String criteria4, 
                                      String criteria5, String criteria6) {
          String[] candidates = { 
              criteria1, criteria2, criteria3, criteria4, criteria5, criteria6 
          };
          return contains(value, start, length, candidates);
      }
      
      /**
       * Determines whether the <code>length</code> characters of
       * <code>value</code> starting at index <code>start</code> are equal to
       * any of the criteria.  A window that begins before the start of
       * <code>value</code> or runs past its end never matches.
       *
       * @param value    the value to look into
       * @param start    the index of the first character of the window
       * @param length   the number of characters in the window
       * @param criteria the candidates the window is compared against
       * @return <code>true</code> if the window lies inside <code>value</code>
       *         and equals at least one candidate
       */    
      protected static boolean contains(String value, int start, int length, 
                                        String[] criteria) {
          if (start < 0 || start + length > value.length()) {
              return false;
          }
  
          String window = value.substring(start, start + length);
          for (int i = 0; i < criteria.length; i++) {
              if (window.equals(criteria[i])) {
                  return true;
              }
          }
          return false;
      }
      
      //-- BEGIN INNER CLASSES --//
      
      /**
       * Inner class for storing results, since there is the optional alternate
       * encoding.
       * <p>
       * Both the primary and the alternate encoding are capped at the
       * <code>maxLength</code> given to the constructor; anything appended
       * beyond that limit is silently dropped.  The buffers are sized from that
       * same <code>maxLength</code>, and a non-positive limit simply yields
       * empty encodings instead of an exception.
       */
      public class DoubleMetaphoneResult {
  
          private final StringBuffer primary;
          private final StringBuffer alternate;
          private final int maxLength;
  
          /**
           * Creates an empty result.
           *
           * @param maxLength the maximum number of characters kept in each of
           *                  the two encodings
           */
          public DoubleMetaphoneResult(int maxLength) {
              this.maxLength = maxLength;
              // A negative capacity would make the StringBuffer constructor throw.
              int capacity = Math.max(maxLength, 0);
              this.primary = new StringBuffer(capacity);
              this.alternate = new StringBuffer(capacity);
          }
  
          /**
           * Appends the same character to both encodings.
           *
           * @param value the character to append
           */
          public void append(char value) {
              appendPrimary(value);
              appendAlternate(value);
          }
  
          /**
           * Appends one character to each encoding.
           *
           * @param primary   the character for the primary encoding
           * @param alternate the character for the alternate encoding
           */
          public void append(char primary, char alternate) {
              appendPrimary(primary);
              appendAlternate(alternate);
          }
  
          /**
           * Appends a character to the primary encoding if there is room left.
           *
           * @param value the character to append
           */
          public void appendPrimary(char value) {
              if (primary.length() < maxLength) {
                  primary.append(value);
              }
          }
  
          /**
           * Appends a character to the alternate encoding if there is room left.
           *
           * @param value the character to append
           */
          public void appendAlternate(char value) {
              if (alternate.length() < maxLength) {
                  alternate.append(value);
              }
          }
  
          /**
           * Appends the same string to both encodings.
           *
           * @param value the characters to append
           */
          public void append(String value) {
              appendPrimary(value);
              appendAlternate(value);
          }
  
          /**
           * Appends one string to each encoding.
           *
           * @param primary   the characters for the primary encoding
           * @param alternate the characters for the alternate encoding
           */
          public void append(String primary, String alternate) {
              appendPrimary(primary);
              appendAlternate(alternate);
          }
  
          /**
           * Appends as much of <code>value</code> to the primary encoding as
           * still fits.
           *
           * @param value the characters to append
           */
          public void appendPrimary(String value) {
              appendLimited(primary, value);
          }
  
          /**
           * Appends as much of <code>value</code> to the alternate encoding as
           * still fits.
           *
           * @param value the characters to append
           */
          public void appendAlternate(String value) {
              appendLimited(alternate, value);
          }
  
          /**
           * Returns the primary encoding built so far.
           *
           * @return the primary encoding
           */
          public String getPrimary() {
              return primary.toString();
          }
  
          /**
           * Returns the alternate encoding built so far.
           *
           * @return the alternate encoding
           */
          public String getAlternate() {
              return alternate.toString();
          }
  
          /**
           * Tells whether both encodings have reached the maximum length.
           *
           * @return <code>true</code> if neither encoding can take more
           *         characters
           */
          public boolean isComplete() {
              return primary.length() >= maxLength && 
                  alternate.length() >= maxLength;
          }
  
          /**
           * Appends <code>value</code> to <code>target</code>, truncated to the
           * room left below <code>maxLength</code>.
           */
          private void appendLimited(StringBuffer target, String value) {
              int room = maxLength - target.length();
              if (room <= 0) {
                  // Full, or the limit is non-positive: substring(0, room)
                  // would throw for a negative room, so drop the input.
                  return;
              }
              if (value.length() <= room) {
                  target.append(value);
              } else {
                  target.append(value.substring(0, room));
              }
          }
      }
  }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org