You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by gg...@apache.org on 2014/06/18 21:57:30 UTC

svn commit: r1603614 - in /commons/proper/codec/trunk/src: main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java

Author: ggregory
Date: Wed Jun 18 19:57:30 2014
New Revision: 1603614

URL: http://svn.apache.org/r1603614
Log:
[CODEC-187] Beider Morse Phonetic Matching producing incorrect tokens. Revert changes from r1603573.

Modified:
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt
    commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java

Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt?rev=1603614&r1=1603613&r2=1603614&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt Wed Jun 18 19:57:30 2014
@@ -15,141 +15,139 @@
  * limitations under the License.
  */
 
-// ASHKENAZIC
+// CONSONANTS
+"ph"    ""  ""  "f" // foreign
+"sh"    ""  ""  "S" // foreign
+"kh"    ""  ""  "x" // foreign
 
-// A, E, I, O, P, U should create variants, but a, e, i, o, u should not create any new variant
-// Q = ü ; Y = ä = ö
-// H = initial "H" in German/English
-
-// CONSONANTS
-"b" "" "" "(b|v[spanish])"
-"J"  ""  ""  "z" // Argentina Spanish: "ll" = /Z/, but approximately /Z/ = /z/
-    
-// VOWELS
-// "ALL" DIPHTHONGS are interchangeable BETWEEN THEM and with monophthongs of which they are composed ("D" means "diphthong")
-//  {a,o} are totally interchangeable if non-stressed; in German "a/o" can actually be from "ä/ö" (that are equivalent to "e")
-//  {i,e} are interchangeable if non-stressed, while in German "u" can actually be from "ü" (that is equivalent to "i")
-    
-"aiB" "" "[bp]" "(D|Dm)"
-"AiB" "" "[bp]" "(D|Dm)"
-"oiB" "" "[bp]" "(D|Dm)"
-"OiB" "" "[bp]" "(D|Dm)"
-"uiB" "" "[bp]" "(D|Dm)"
-"UiB" "" "[bp]" "(D|Dm)"
-"eiB" "" "[bp]" "(D|Dm)"
-"EiB" "" "[bp]" "(D|Dm)"
-"iiB" "" "[bp]" "(D|Dm)"
-"IiB" "" "[bp]" "(D|Dm)"
-    
-"aiB" "" "[dgkstvz]" "(D|Dn)"
-"AiB" "" "[dgkstvz]" "(D|Dn)"
-"oiB" "" "[dgkstvz]" "(D|Dn)"
-"OiB" "" "[dgkstvz]" "(D|Dn)"
-"uiB" "" "[dgkstvz]" "(D|Dn)"
-"UiB" "" "[dgkstvz]" "(D|Dn)"
-"eiB" "" "[dgkstvz]" "(D|Dn)"
-"EiB" "" "[dgkstvz]" "(D|Dn)"
-"iiB" "" "[dgkstvz]" "(D|Dn)"
-"IiB" "" "[dgkstvz]" "(D|Dn)"
-      
-"B" "" "[bp]" "(o|om[polish]|im[polish])"
-"B" "" "[dgkstvz]" "(a|o|on[polish]|in[polish])"
-"B"  ""  ""  "(a|o)"
-    
-"aiF" "" "[bp]" "(D|Dm)"
-"AiF" "" "[bp]" "(D|Dm)"
-"oiF" "" "[bp]" "(D|Dm)"
-"OiF" "" "[bp]" "(D|Dm)"
-"uiF" "" "[bp]" "(D|Dm)"
-"UiF" "" "[bp]" "(D|Dm)"
-"eiF" "" "[bp]" "(D|Dm)"
-"EiF" "" "[bp]" "(D|Dm)"
-"iiF" "" "[bp]" "(D|Dm)"
-"IiF" "" "[bp]" "(D|Dm)"
-        
-"aiF" "" "[dgkstvz]" "(D|Dn)"
-"AiF" "" "[dgkstvz]" "(D|Dn)"
-"oiF" "" "[dgkstvz]" "(D|Dn)"
-"OiF" "" "[dgkstvz]" "(D|Dn)"
-"uiF" "" "[dgkstvz]" "(D|Dn)"
-"UiF" "" "[dgkstvz]" "(D|Dn)"
-"eiF" "" "[dgkstvz]" "(D|Dn)"
-"EiF" "" "[dgkstvz]" "(D|Dn)"
-"iiF" "" "[dgkstvz]" "(D|Dn)"
-"IiF" "" "[dgkstvz]" "(D|Dn)"
-            
-"F" "" "[bp]" "(i|im[polish]|om[polish])"
-"F" "" "[dgkstvz]" "(i|in[polish]|on[polish])"
-"F"  ""  ""  "i"
-        
-"P"  ""  ""  "(o|u)"
-        
-"I"  "[aeiouAEIBFOUQY]"  ""  "i"
-"I" "" "[^aeiouAEBFIOU]e" "(Q[german]|i|D[english])"  // "line"
-"I"  ""  "$"  "i"
-"I"  ""  "[^k]$"  "i"
-"Ik"  "[lr]"  "$"  "(ik|Qk[german])"
-"Ik"  ""  "$"  "ik"
-"sIts"  ""  "$"  "(sits|sQts[german])"
-"Its"  ""  "$"  "its"
-"I"  ""  ""  "(Q[german]|i)"
-   
-"lE" "[bdfgkmnprsStvzZ]" "$" "(li|il[english])"  // Apple < Appel
-"lE" "[bdfgkmnprsStvzZ]" "" "(li|il[english]|lY[german])"  // Applebaum < Appelbaum
-    
-"au" "" "" "(D|a|u)"
-"ou" "" "" "(D|o|u)"
-    
-"ai" "" "" "(D|a|i)"
-"Ai" "" "" "(D|a|i)"
-"oi" "" "" "(D|o|i)"
-"Oi" "" "" "(D|o|i)"
-"ui" "" "" "(D|u|i)"
-"Ui" "" "" "(D|u|i)"
-"ei" "" "" "(D|i)"
-"Ei" "" "" "(D|i)"
-    
-"iA" "" "$" "(ia|io)"
-"iA" "" "" "(ia|io|iY[german])"
-"A" "" "[^aeiouAEBFIOU]e" "(a|o|Y[german]|D[english])" // "plane"
-        
-"E" "i[^aeiouAEIOU]" "" "(i|Y[german]|[english])" // Wineberg (vineberg/vajneberg) --> vajnberg
-"E" "a[^aeiouAEIOU]" "" "(i|Y[german]|[english])" //  Shaneberg (shaneberg/shejneberg) --> shejnberg
-        
-"e"  ""  "[fklmnprstv]$"  "i"
-"e"  ""  "ts$"  "i"
-"e"  ""  "$"  "i"
-"e"  "[DaoiuAOIUQY]"  ""  "i"
-"e"  ""  "[aoAOQY]"  "i"
-"e"  ""  ""  "(i|Y[german])"
-    
-"E"  ""  "[fklmnprst]$"  "i"
-"E"  ""  "ts$"  "i"
-"E"  ""  "$"  "i"
-"E"  "[DaoiuAOIUQY]"  ""  "i"
-"E"  ""  "[aoAOQY]"  "i"
-"E"  ""  ""  "(i|Y[german])"
-        
-"a"  ""  ""  "(a|o)"
-    
-"O"  ""  "[fklmnprstv]$"  "o"
-"O"  ""  "ts$"  "o"
-"O"  ""  "$"  "o"
-"O"  "[oeiuQY]"  ""  "o"
-"O"  ""  ""  "(o|Y[german])"
-    
-"A"  ""  "[fklmnprst]$"  "(a|o)"
-"A"  ""  "ts$"  "(a|o)"
-"A"  ""  "$"  "(a|o)"
-"A"  "[oeiuQY]"  ""  "(a|o)"
-"A"  ""  ""  "(a|o|Y[german])"
-
-"U"  ""  "$"  "u"
-"U"  "[DoiuQY]"  ""  "u"
-"U"  ""  "[^k]$"  "u"
-"Uk"  "[lr]"  "$"  "(uk|Qk[german])"
-"Uk"  ""  "$"  "uk"
-  
-"sUts"  ""  "$"  "(suts|sQts[german])"
-"Uts"  ""  "$"  "uts"
-"U"  ""  ""  "(u|Q[german])"
+"gli"   ""  ""  "(gli|l[italian])"
+"gni"   ""  ""  "(gni|ni[italian+french])"
+"gn"    ""  "[aeou]"    "(n[italian+french]|nj[italian+french]|gn)"
+"gh"    ""  ""  "g" // It + translit. from Arabic
+"dh"    ""  ""  "d" // translit. from Arabic
+"bh"    ""  ""  "b" // translit. from Arabic
+"th"    ""  ""  "t" // translit. from Arabic
+"lh"    ""  ""  "l" // Port
+"nh"    ""  ""  "nj" // Port
+
+"ig"    "[aeiou]"   ""  "(ig|tS[spanish])"
+"ix"    "[aeiou]"   ""  "S" // Sp
+"tx"    ""  ""  "tS" // Sp
+"tj"    ""  "$"  "tS" // Sp
+"tj"    ""  ""  "dZ" // Sp
+"tg"    ""  ""  "(tg|dZ[spanish])"
+
+"gi"    ""  "[aeou]"    "dZ" // Italian
+"g" ""  "y" "Z" // French
+"gg"    ""  "[ei]"  "(gZ[portuguese+french]|dZ[italian+spanish]|x[spanish])"
+"g" ""  "[ei]"  "(Z[portuguese+french]|dZ[italian+spanish]|x[spanish])"
+
+"guy"   ""  ""  "gi"
+"gue"   ""  "$" "(k[french]|ge)"
+"gu"    ""  "[ei]"  "(g|gv)" // not It
+"gu"    ""  "[ao]"  "gv" // not It
+
+"ñ" ""  ""  "(n|nj)"
+"ny"    ""  ""  "nj"
+
+"sc"    ""  "[ei]"  "(s|S[italian])"
+"sç"    ""  "[aeiou]"   "s" // not It
+"ss"    ""  ""  "s"
+"ç" ""  ""  "s"   // not It
+
+"ch"    ""  "[ei]"  "(k[italian]|S[portuguese+french]|tS[spanish]|dZ[spanish])"
+"ch"    ""  ""  "(S|tS[spanish]|dZ[spanish])"
+
+"ci"    ""  "[aeou]"    "(tS[italian]|si)"
+"cc"	""	"[eiyéèê]"	"(tS[italian]|ks[portuguese+french+spanish])"
+"c"	""	"[eiyéèê]"	"(tS[italian]|s[portuguese+french+spanish])"
+   //array("c"	""	"[aou]"	"(k|C[".($portuguese+$spanish)."])" // "C" means that the actual letter could be "ç" (cedille omitted)
+
+"s"	"^"	""	"s"
+"s"	"[aáuiíoóeéêy]"	"[aáuiíoóeéêy]"	"(s[spanish]|z[portuguese+french+italian])"
+"s"	""	"[dglmnrv]"	"(z|Z[portuguese])"
+
+"z"	""	"$"	"(s|ts[italian]|S[portuguese])" // ts It, s/S/Z Port, s in Sp, z Fr
+"z"	""	"[bdgv]"	"(z|dz[italian]|Z[portuguese])" // dz It, Z/z Port, z Sp & Fr
+"z"	""	"[ptckf]"	"(s|ts[italian]|S[portuguese])" // ts It, s/S/z Port, z/s Sp
+"z"	""	""	"(z|dz[italian]|ts[italian]|s[spanish])" // ts/dz It, z Port & Fr, z/s Sp
+
+"que"	""	"$"	"(k[french]|ke)"
+"qu"	""	"[eiu]"	"k"
+"qu"	""	"[ao]"	"(kv|k)" // k is It
+
+"ex"	""	"[aáuiíoóeéêy]"	"(ez[portuguese]|eS[portuguese]|eks|egz)"
+"ex"	""	"[cs]"	"(e[portuguese]|ek)"
+
+"m"	""	"[cdglnrst]"	"(m|n[portuguese])"
+"m"	""	"[bfpv]"	"(m|n[portuguese+spanish])"
+"m"	""	"$"	"(m|n[portuguese])"
+
+"b"	"^"	""	"(b|V[spanish])"
+"v"	"^"	""	"(v|B[spanish])"
+
+ // VOWELS
+"eau"	""	""	"o" // Fr
+
+"ouh"	""	"[aioe]"	"(v[french]|uh)"
+"uh"	""	"[aioe]"	"(v|uh)"
+"ou"	""	"[aioe]"	"v" // french
+"uo"	""	""	"(vo|o)"
+"u"	""	"[aie]"	"v"
+
+"i"	"[aáuoóeéê]"	""	"j"
+"i"	""	"[aeou]"	"j"
+"y"	"[aáuiíoóeéê]"	""	"j"
+"y"	""	"[aeiíou]"	"j"
+"e"	""	"$"	"(e|E[french])"
+
+"ão"	""	""	"(au|an)" // Port
+"ãe"	""	""	"(aj|an)" // Port
+"ãi"	""	""	"(aj|an)" // Port
+"õe"	""	""	"(oj|on)" // Port
+"où"	""	""	"u" // Fr
+"ou"	""	""	"(ou|u[french])"
+
+"â"	""	""	"a" // Port & Fr
+"à"	""	""	"a" // Port
+"á"	""	""	"a" // Port & Sp
+"ã"	""	""	"(a|an)" // Port
+"é"	""	""	"e"
+"ê"	""	""	"e" // Port & Fr
+"è"	""	""	"e" // Sp & Fr & It
+"í"	""	""	"i" // Port & Sp
+"î"	""	""	"i" // Fr
+"ô"	""	""	"o" // Port & Fr
+"ó"	""	""	"o" // Port & Sp & It
+"õ"	""	""	"(o|on)" // Port
+"ò"	""	""	"o"  // Sp & It
+"ú"	""	""	"u" // Port & Sp
+"ü"	""	""	"u" // Port & Sp
+
+ // LATIN ALPHABET
+"a"	""	""	"a"
+"b"	""	""	"(b|v[spanish])"
+"c"	""	""	"k"
+"d"	""	""	"d"
+"e"	""	""	"e"
+"f"	""	""	"f"
+"g"	""	""	"g"
+"h"	""	""	"h"
+"i"	""	""	"i"
+"j"	""	""	"(x[spanish]|Z)" // not It
+"k"	""	""	"k"
+"l"	""	""	"l"
+"m"	""	""	"m"
+"n"	""	""	"n"
+"o"	""	""	"o"
+"p"	""	""	"p"
+"q"	""	""	"k"
+"r"	""	""	"r"
+"s"	""	""	"(s|S[portuguese])"
+"t"	""	""	"t"
+"u"	""	""	"u"
+"v"	""	""	"(v|b[spanish])"
+"w"	""	""	"v"    // foreign
+"x"	""	""	"(ks|gz|S[portuguese+spanish])"   // S/ks Port & Sp, gz Sp, It only ks
+"y"	""	""	"i"
+"z"	""	""	"z"

Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java?rev=1603614&r1=1603613&r2=1603614&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java (original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java Wed Jun 18 19:57:30 2014
@@ -186,10 +186,6 @@ public class PhoneticEngineRegressionTes
         args.put("nameType", "GENERIC");
         args.put("ruleType", "APPROX");
         assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abran|abrom|abron|avram|avrom|obram|obran|obrom|obron|ovram|ovrom");
-
-        args.put("nameType", "ASHKENAZI");
-        args.put("ruleType", "APPROX");
-        assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abrom|avram|avrom|imbram|imbrom|obram|obrom|ombram|ombrom|ovram|ovrom");
     }
 
     /**