You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by to...@apache.org on 2004/04/18 23:33:38 UTC

cvs commit: jakarta-commons/codec/src/java/org/apache/commons/codec/language Metaphone.java

tobrien     2004/04/18 14:33:38

  Modified:    codec/src/java/org/apache/commons/codec/language
                        Metaphone.java
  Log:
  Fixed the ending-MB bug (Bugzilla #28457). Also refactored some of the index arithmetic and content tests into functions for readability.
  
  Revision  Changes    Path
  1.16      +98 -56    jakarta-commons/codec/src/java/org/apache/commons/codec/language/Metaphone.java
  
  Index: Metaphone.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons/codec/src/java/org/apache/commons/codec/language/Metaphone.java,v
  retrieving revision 1.15
  retrieving revision 1.16
  diff -u -r1.15 -r1.16
  --- Metaphone.java	29 Feb 2004 04:08:31 -0000	1.15
  +++ Metaphone.java	18 Apr 2004 21:33:38 -0000	1.16
  @@ -70,7 +70,6 @@
        * @return A metaphone code corresponding to the String supplied
        */
       public String metaphone(String txt) {
  -        int mtsz = 0  ;
           boolean hard = false ;
           if ((txt == null) || (txt.length() == 0)) {
               return "" ;
  @@ -126,99 +125,109 @@
           int wdsz = local.length();
           int n = 0 ;
   
  -        while ((mtsz < this.getMaxCodeLen()) && (n < wdsz)) { // max code size of 4 works well
  +        while ((code.length() < this.getMaxCodeLen()) && 
  +        	   (n < wdsz) ) { // max code size of 4 works well
               char symb = local.charAt(n) ;
               // remove duplicate letters except C
  -            if ((symb != 'C') && (n > 0) && (local.charAt(n - 1) == symb)) {
  +            if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
                   n++ ;
               } else { // not dup
                   switch(symb) {
                   case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
                       if (n == 0) { 
                           code.append(symb);
  -                        mtsz++;
                       }
                       break ; // only use vowel if leading char
                   case 'B' :
  -                    if ((n > 0) && !(n + 1 == wdsz) && (local.charAt(n - 1) == 'M')) { // not MB at end of word 
  -                        code.append(symb);
  +                    if ( isPreviousChar(local, n, 'M') && 
  +                         isLastChar(wdsz, n) ) { // B is silent if word ends in MB
  +						break;
                       } else {
                           code.append(symb);
                       }
  -                    mtsz++;
                       break;
                   case 'C' : // lots of C special cases
                       /* discard if SCI, SCE or SCY */
  -                    if ((n > 0) && (local.charAt(n - 1) == 'S') && (n + 1 < wdsz) && (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) { 
  -                        break ;
  +                    if ( isPreviousChar(local, n, 'S') && 
  +                         !isLastChar(wdsz, n) && 
  +                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) { 
  +                        break;
                       }
                       tmpS = local.toString();
  -                    if (tmpS.indexOf("CIA", n) == n) { // "CIA" -> X
  -                        code.append('X'); mtsz++; break ;
  +                    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
  +                        code.append('X'); 
  +                        break;
                       }
  -                    if ((n + 1 < wdsz) && (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) {
  +                    if (!isLastChar(wdsz, n) && 
  +                        (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) {
                           code.append('S');
  -                        mtsz++; 
  -                        break ; // CI,CE,CY -> S
  +                        break; // CI,CE,CY -> S
                       }
  -                    if ((n > 0) && (tmpS.indexOf("SCH", n - 1) == n - 1)) { // SCH->sk
  +                    if (isPreviousChar(local, n, 'S') &&
  +						isNextChar(local, n, 'H') ) { // SCH->sk
                           code.append('K') ; 
  -                        mtsz++;
                           break ;
                       }
  -                    if (tmpS.indexOf("CH", n) == n) { // detect CH
  -                        if ((n == 0) && (wdsz >= 3) && (this.vowels.indexOf(local.charAt(2)) < 0)) { // CH consonant -> K consonant
  +                    if (isNextChar(local, n, 'H')) { // detect CH
  +                        if ((n == 0) && 
  +                        	(wdsz >= 3) && 
  +                            isVowel(local,2) ) { // CH consonant -> K consonant
                               code.append('K');
                           } else { 
                               code.append('X'); // CHvowel -> X
                           }
  -                        mtsz++;
                       } else { 
                           code.append('K');
  -                        mtsz++;
                       }
                       break ;
                   case 'D' :
  -                    if ((n + 2 < wdsz)   && (local.charAt(n + 1) == 'G') && (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J 
  +                    if (!isLastChar(wdsz, n + 1) && 
  +                        isNextChar(local, n, 'G') && 
  +                        (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J 
                           code.append('J'); n += 2 ;
                       } else { 
                           code.append('T');
                       }
  -                    mtsz++;
                       break ;
                   case 'G' : // GH silent at end or before consonant
  -                    if ((n + 2 == wdsz) && (local.charAt(n + 1) == 'H')) {
  +                    if (isLastChar(wdsz, n + 1) && 
  +                        isNextChar(local, n, 'H')) {
                           break;
                       }
  -                    if ((n + 2 < wdsz) && (local.charAt(n + 1) == 'H') && (this.vowels.indexOf(local.charAt(n + 2)) < 0)) {
  +                    if (!isLastChar(wdsz, n + 1) &&  
  +                        isNextChar(local,n,'H') && 
  +                        !isVowel(local,n+2)) {
                           break;
                       }
                       tmpS = local.toString();
  -                    if ((n > 0) && (tmpS.indexOf("GN", n) == n) || (tmpS.indexOf("GNED", n) == n)) {
  +                    if ((n > 0) && 
  +                    	( regionMatch(local, n, "GN") ||
  +					      regionMatch(local, n, "GNED") ) ) {
                           break; // silent G
                       }
  -                    if ((n > 0) && (local.charAt(n - 1) == 'G')) {
  +                    if (isPreviousChar(local, n, 'G')) {
                           hard = true ;
                       } else {
                           hard = false ;
                       }
  -                    if ((n + 1 < wdsz) && (this.frontv.indexOf(local.charAt(n + 1)) >= 0) && (!hard)) {
  +                    if (!isLastChar(wdsz, n) && 
  +                        (this.frontv.indexOf(local.charAt(n + 1)) >= 0) && 
  +                        (!hard)) {
                           code.append('J');
                       } else {
                           code.append('K');
                       }
  -                    mtsz++;
                       break ;
                   case 'H':
  -                    if (n + 1 == wdsz) {
  +                    if (isLastChar(wdsz, n)) {
                           break ; // terminal H
                       }
  -                    if ((n > 0) && (this.varson.indexOf(local.charAt(n - 1)) >= 0)) {
  +                    if ((n > 0) && 
  +                        (this.varson.indexOf(local.charAt(n - 1)) >= 0)) {
                           break;
                       }
  -                    if (this.vowels.indexOf(local.charAt(n + 1)) >= 0) {
  -                        code.append('H'); 
  -                        mtsz++;// Hvowel
  +                    if (isVowel(local,n+1)) {
  +                        code.append('H'); // Hvowel
                       }
                       break;
                   case 'F': 
  @@ -228,78 +237,111 @@
                   case 'N' : 
                   case 'R' :
                       code.append(symb); 
  -                    mtsz++; 
                       break;
                   case 'K' :
                       if (n > 0) { // not initial
  -                        if (local.charAt(n - 1) != 'C') {
  +                        if (!isPreviousChar(local, n, 'C')) {
                               code.append(symb);
                           }
                       } else {
                           code.append(symb); // initial K
                       }
  -                    mtsz++ ;
                       break ;
                   case 'P' :
  -                    if ((n + 1 < wdsz) && (local.charAt(n + 1) == 'H')) {
  +                    if (isNextChar(local,n,'H')) {
                           // PH -> F
                           code.append('F');
                       } else {
                           code.append(symb);
                       }
  -                    mtsz++;
                       break ;
                   case 'Q' :
                       code.append('K');
  -                    mtsz++; 
                       break;
                   case 'S' :
  -                    tmpS = local.toString();
  -                    if ((tmpS.indexOf("SH", n) == n) || (tmpS.indexOf("SIO", n) == n) || (tmpS.indexOf("SIA", n) == n)) {
  +                    if (regionMatch(local,n,"SH") || 
  +					    regionMatch(local,n,"SIO") || 
  +					    regionMatch(local,n,"SIA")) {
                           code.append('X');
                       } else {
                           code.append('S');
                       }
  -                    mtsz++;
                       break;
                   case 'T' :
  -                    tmpS = local.toString(); // TIA TIO -> X
  -                    if ((tmpS.indexOf("TIA", n) == n) || (tmpS.indexOf("TIO", n) == n)) {
  +                    if (regionMatch(local,n,"TIA") || 
  +						regionMatch(local,n,"TIO")) {
                           code.append('X'); 
  -                        mtsz++; 
                           break;
                       }
  -                    if (tmpS.indexOf("TCH", n) == n) {
  +                    if (regionMatch(local,n,"TCH")) {
  +						// Silent if in "TCH"
                           break;
                       }
                       // substitute numeral 0 for TH (resembles theta after all)
  -                    if (tmpS.indexOf("TH", n) == n) {
  +                    if (regionMatch(local,n,"TH")) {
                           code.append('0');
                       } else {
                           code.append('T');
                       }
  -                    mtsz++ ;
                       break ;
                   case 'V' :
  -                    code.append('F'); mtsz++;break ;
  +                    code.append('F'); break ;
                   case 'W' : case 'Y' : // silent if not followed by vowel
  -                    if ((n + 1 < wdsz) && (this.vowels.indexOf(local.charAt(n + 1)) >= 0)) {
  +                    if (!isLastChar(wdsz,n) && 
  +                    	isVowel(local,n+1)) {
                           code.append(symb);
  -                        mtsz++;
                       }
                       break ;
                   case 'X' :
  -                    code.append('K'); code.append('S');mtsz += 2;
  +                    code.append('K'); code.append('S');
                       break ;
                   case 'Z' :
  -                    code.append('S'); mtsz++; break ;
  +                    code.append('S'); break ;
                   } // end switch
                   n++ ;
               } // end else from symb != 'C'
  -            if (mtsz > this.getMaxCodeLen()) { code.setLength(this.getMaxCodeLen()); }
  +            if (code.length() > this.getMaxCodeLen()) { 
  +            	code.setLength(this.getMaxCodeLen()); 
  +            }
           }
           return code.toString();
  -    } 
  +    }
  +
  +	private boolean isVowel(StringBuffer string, int index) {
  +		return (this.vowels.indexOf(string.charAt(index)) >= 0);
  +	}
  +
  +	private boolean isPreviousChar(StringBuffer string, int index, char c) {
  +		boolean matches = false;
  +		if( index > 0 &&
  +		    index < string.length() ) {
  +			matches = string.charAt(index - 1) == c;
  +		}
  +		return matches;
  +	}
  +
  +	private boolean isNextChar(StringBuffer string, int index, char c) {
  +		boolean matches = false;
  +		if( index >= 0 &&
  +		    index < string.length() - 1 ) {
  +			matches = string.charAt(index + 1) == c;
  +		}
  +		return matches;
  +	}
  +
  +	private boolean regionMatch(StringBuffer string, int index, String test) {
  +		boolean matches = false;
  +		if( index >= 0 &&
  +		    (index + test.length() - 1) < string.length() ) {
  +			String substring = string.substring( index, index + test.length());
  +			matches = substring.equals( test );
  +		}
  +		return matches;
  +	}
  +
  +	private boolean isLastChar(int wdsz, int n) {
  +		return n + 1 == wdsz;
  +	} 
       
       
       /**
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org