You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by gg...@apache.org on 2014/06/18 21:57:30 UTC
svn commit: r1603614 - in /commons/proper/codec/trunk/src:
main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt
test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
Author: ggregory
Date: Wed Jun 18 19:57:30 2014
New Revision: 1603614
URL: http://svn.apache.org/r1603614
Log:
[CODEC-187] Beider Morse Phonetic Matching producing incorrect tokens. Revert changes from r1603573.
Modified:
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt?rev=1603614&r1=1603613&r2=1603614&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt Wed Jun 18 19:57:30 2014
@@ -15,141 +15,139 @@
* limitations under the License.
*/
-// ASHKENAZIC
+// CONSONANTS
+"ph" "" "" "f" // foreign
+"sh" "" "" "S" // foreign
+"kh" "" "" "x" // foreign
-// A, E, I, O, P, U should create variants, but a, e, i, o, u should not create any new variant
-// Q = ü ; Y = ä = ö
-// H = initial "H" in German/English
-
-// CONSONANTS
-"b" "" "" "(b|v[spanish])"
-"J" "" "" "z" // Argentina Spanish: "ll" = /Z/, but approximately /Z/ = /z/
-
-// VOWELS
-// "ALL" DIPHTHONGS are interchangeable BETWEEN THEM and with monophthongs of which they are composed ("D" means "diphthong")
-// {a,o} are totally interchangeable if non-stressed; in German "a/o" can actually be from "ä/ö" (that are equivalent to "e")
-// {i,e} are interchangeable if non-stressed, while in German "u" can actually be from "ü" (that is equivalent to "i")
-
-"aiB" "" "[bp]" "(D|Dm)"
-"AiB" "" "[bp]" "(D|Dm)"
-"oiB" "" "[bp]" "(D|Dm)"
-"OiB" "" "[bp]" "(D|Dm)"
-"uiB" "" "[bp]" "(D|Dm)"
-"UiB" "" "[bp]" "(D|Dm)"
-"eiB" "" "[bp]" "(D|Dm)"
-"EiB" "" "[bp]" "(D|Dm)"
-"iiB" "" "[bp]" "(D|Dm)"
-"IiB" "" "[bp]" "(D|Dm)"
-
-"aiB" "" "[dgkstvz]" "(D|Dn)"
-"AiB" "" "[dgkstvz]" "(D|Dn)"
-"oiB" "" "[dgkstvz]" "(D|Dn)"
-"OiB" "" "[dgkstvz]" "(D|Dn)"
-"uiB" "" "[dgkstvz]" "(D|Dn)"
-"UiB" "" "[dgkstvz]" "(D|Dn)"
-"eiB" "" "[dgkstvz]" "(D|Dn)"
-"EiB" "" "[dgkstvz]" "(D|Dn)"
-"iiB" "" "[dgkstvz]" "(D|Dn)"
-"IiB" "" "[dgkstvz]" "(D|Dn)"
-
-"B" "" "[bp]" "(o|om[polish]|im[polish])"
-"B" "" "[dgkstvz]" "(a|o|on[polish]|in[polish])"
-"B" "" "" "(a|o)"
-
-"aiF" "" "[bp]" "(D|Dm)"
-"AiF" "" "[bp]" "(D|Dm)"
-"oiF" "" "[bp]" "(D|Dm)"
-"OiF" "" "[bp]" "(D|Dm)"
-"uiF" "" "[bp]" "(D|Dm)"
-"UiF" "" "[bp]" "(D|Dm)"
-"eiF" "" "[bp]" "(D|Dm)"
-"EiF" "" "[bp]" "(D|Dm)"
-"iiF" "" "[bp]" "(D|Dm)"
-"IiF" "" "[bp]" "(D|Dm)"
-
-"aiF" "" "[dgkstvz]" "(D|Dn)"
-"AiF" "" "[dgkstvz]" "(D|Dn)"
-"oiF" "" "[dgkstvz]" "(D|Dn)"
-"OiF" "" "[dgkstvz]" "(D|Dn)"
-"uiF" "" "[dgkstvz]" "(D|Dn)"
-"UiF" "" "[dgkstvz]" "(D|Dn)"
-"eiF" "" "[dgkstvz]" "(D|Dn)"
-"EiF" "" "[dgkstvz]" "(D|Dn)"
-"iiF" "" "[dgkstvz]" "(D|Dn)"
-"IiF" "" "[dgkstvz]" "(D|Dn)"
-
-"F" "" "[bp]" "(i|im[polish]|om[polish])"
-"F" "" "[dgkstvz]" "(i|in[polish]|on[polish])"
-"F" "" "" "i"
-
-"P" "" "" "(o|u)"
-
-"I" "[aeiouAEIBFOUQY]" "" "i"
-"I" "" "[^aeiouAEBFIOU]e" "(Q[german]|i|D[english])" // "line"
-"I" "" "$" "i"
-"I" "" "[^k]$" "i"
-"Ik" "[lr]" "$" "(ik|Qk[german])"
-"Ik" "" "$" "ik"
-"sIts" "" "$" "(sits|sQts[german])"
-"Its" "" "$" "its"
-"I" "" "" "(Q[german]|i)"
-
-"lE" "[bdfgkmnprsStvzZ]" "$" "(li|il[english])" // Apple < Appel
-"lE" "[bdfgkmnprsStvzZ]" "" "(li|il[english]|lY[german])" // Applebaum < Appelbaum
-
-"au" "" "" "(D|a|u)"
-"ou" "" "" "(D|o|u)"
-
-"ai" "" "" "(D|a|i)"
-"Ai" "" "" "(D|a|i)"
-"oi" "" "" "(D|o|i)"
-"Oi" "" "" "(D|o|i)"
-"ui" "" "" "(D|u|i)"
-"Ui" "" "" "(D|u|i)"
-"ei" "" "" "(D|i)"
-"Ei" "" "" "(D|i)"
-
-"iA" "" "$" "(ia|io)"
-"iA" "" "" "(ia|io|iY[german])"
-"A" "" "[^aeiouAEBFIOU]e" "(a|o|Y[german]|D[english])" // "plane"
-
-"E" "i[^aeiouAEIOU]" "" "(i|Y[german]|[english])" // Wineberg (vineberg/vajneberg) --> vajnberg
-"E" "a[^aeiouAEIOU]" "" "(i|Y[german]|[english])" // Shaneberg (shaneberg/shejneberg) --> shejnberg
-
-"e" "" "[fklmnprstv]$" "i"
-"e" "" "ts$" "i"
-"e" "" "$" "i"
-"e" "[DaoiuAOIUQY]" "" "i"
-"e" "" "[aoAOQY]" "i"
-"e" "" "" "(i|Y[german])"
-
-"E" "" "[fklmnprst]$" "i"
-"E" "" "ts$" "i"
-"E" "" "$" "i"
-"E" "[DaoiuAOIUQY]" "" "i"
-"E" "" "[aoAOQY]" "i"
-"E" "" "" "(i|Y[german])"
-
-"a" "" "" "(a|o)"
-
-"O" "" "[fklmnprstv]$" "o"
-"O" "" "ts$" "o"
-"O" "" "$" "o"
-"O" "[oeiuQY]" "" "o"
-"O" "" "" "(o|Y[german])"
-
-"A" "" "[fklmnprst]$" "(a|o)"
-"A" "" "ts$" "(a|o)"
-"A" "" "$" "(a|o)"
-"A" "[oeiuQY]" "" "(a|o)"
-"A" "" "" "(a|o|Y[german])"
-
-"U" "" "$" "u"
-"U" "[DoiuQY]" "" "u"
-"U" "" "[^k]$" "u"
-"Uk" "[lr]" "$" "(uk|Qk[german])"
-"Uk" "" "$" "uk"
-
-"sUts" "" "$" "(suts|sQts[german])"
-"Uts" "" "$" "uts"
-"U" "" "" "(u|Q[german])"
+"gli" "" "" "(gli|l[italian])"
+"gni" "" "" "(gni|ni[italian+french])"
+"gn" "" "[aeou]" "(n[italian+french]|nj[italian+french]|gn)"
+"gh" "" "" "g" // It + translit. from Arabic
+"dh" "" "" "d" // translit. from Arabic
+"bh" "" "" "b" // translit. from Arabic
+"th" "" "" "t" // translit. from Arabic
+"lh" "" "" "l" // Port
+"nh" "" "" "nj" // Port
+
+"ig" "[aeiou]" "" "(ig|tS[spanish])"
+"ix" "[aeiou]" "" "S" // Sp
+"tx" "" "" "tS" // Sp
+"tj" "" "$" "tS" // Sp
+"tj" "" "" "dZ" // Sp
+"tg" "" "" "(tg|dZ[spanish])"
+
+"gi" "" "[aeou]" "dZ" // Italian
+"g" "" "y" "Z" // French
+"gg" "" "[ei]" "(gZ[portuguese+french]|dZ[italian+spanish]|x[spanish])"
+"g" "" "[ei]" "(Z[portuguese+french]|dZ[italian+spanish]|x[spanish])"
+
+"guy" "" "" "gi"
+"gue" "" "$" "(k[french]|ge)"
+"gu" "" "[ei]" "(g|gv)" // not It
+"gu" "" "[ao]" "gv" // not It
+
+"ñ" "" "" "(n|nj)"
+"ny" "" "" "nj"
+
+"sc" "" "[ei]" "(s|S[italian])"
+"sç" "" "[aeiou]" "s" // not It
+"ss" "" "" "s"
+"ç" "" "" "s" // not It
+
+"ch" "" "[ei]" "(k[italian]|S[portuguese+french]|tS[spanish]|dZ[spanish])"
+"ch" "" "" "(S|tS[spanish]|dZ[spanish])"
+
+"ci" "" "[aeou]" "(tS[italian]|si)"
+"cc" "" "[eiyéèê]" "(tS[italian]|ks[portuguese+french+spanish])"
+"c" "" "[eiyéèê]" "(tS[italian]|s[portuguese+french+spanish])"
+ //array("c" "" "[aou]" "(k|C[".($portuguese+$spanish)."])" // "C" means that the actual letter could be "ç" (cedille omitted)
+
+"s" "^" "" "s"
+"s" "[aáuiíoóeéêy]" "[aáuiíoóeéêy]" "(s[spanish]|z[portuguese+french+italian])"
+"s" "" "[dglmnrv]" "(z|Z[portuguese])"
+
+"z" "" "$" "(s|ts[italian]|S[portuguese])" // ts It, s/S/Z Port, s in Sp, z Fr
+"z" "" "[bdgv]" "(z|dz[italian]|Z[portuguese])" // dz It, Z/z Port, z Sp & Fr
+"z" "" "[ptckf]" "(s|ts[italian]|S[portuguese])" // ts It, s/S/z Port, z/s Sp
+"z" "" "" "(z|dz[italian]|ts[italian]|s[spanish])" // ts/dz It, z Port & Fr, z/s Sp
+
+"que" "" "$" "(k[french]|ke)"
+"qu" "" "[eiu]" "k"
+"qu" "" "[ao]" "(kv|k)" // k is It
+
+"ex" "" "[aáuiíoóeéêy]" "(ez[portuguese]|eS[portuguese]|eks|egz)"
+"ex" "" "[cs]" "(e[portuguese]|ek)"
+
+"m" "" "[cdglnrst]" "(m|n[portuguese])"
+"m" "" "[bfpv]" "(m|n[portuguese+spanish])"
+"m" "" "$" "(m|n[portuguese])"
+
+"b" "^" "" "(b|V[spanish])"
+"v" "^" "" "(v|B[spanish])"
+
+ // VOWELS
+"eau" "" "" "o" // Fr
+
+"ouh" "" "[aioe]" "(v[french]|uh)"
+"uh" "" "[aioe]" "(v|uh)"
+"ou" "" "[aioe]" "v" // french
+"uo" "" "" "(vo|o)"
+"u" "" "[aie]" "v"
+
+"i" "[aáuoóeéê]" "" "j"
+"i" "" "[aeou]" "j"
+"y" "[aáuiíoóeéê]" "" "j"
+"y" "" "[aeiíou]" "j"
+"e" "" "$" "(e|E[french])"
+
+"ão" "" "" "(au|an)" // Port
+"ãe" "" "" "(aj|an)" // Port
+"ãi" "" "" "(aj|an)" // Port
+"õe" "" "" "(oj|on)" // Port
+"où" "" "" "u" // Fr
+"ou" "" "" "(ou|u[french])"
+
+"â" "" "" "a" // Port & Fr
+"à" "" "" "a" // Port
+"á" "" "" "a" // Port & Sp
+"ã" "" "" "(a|an)" // Port
+"é" "" "" "e"
+"ê" "" "" "e" // Port & Fr
+"è" "" "" "e" // Sp & Fr & It
+"í" "" "" "i" // Port & Sp
+"î" "" "" "i" // Fr
+"ô" "" "" "o" // Port & Fr
+"ó" "" "" "o" // Port & Sp & It
+"õ" "" "" "(o|on)" // Port
+"ò" "" "" "o" // Sp & It
+"ú" "" "" "u" // Port & Sp
+"ü" "" "" "u" // Port & Sp
+
+ // LATIN ALPHABET
+"a" "" "" "a"
+"b" "" "" "(b|v[spanish])"
+"c" "" "" "k"
+"d" "" "" "d"
+"e" "" "" "e"
+"f" "" "" "f"
+"g" "" "" "g"
+"h" "" "" "h"
+"i" "" "" "i"
+"j" "" "" "(x[spanish]|Z)" // not It
+"k" "" "" "k"
+"l" "" "" "l"
+"m" "" "" "m"
+"n" "" "" "n"
+"o" "" "" "o"
+"p" "" "" "p"
+"q" "" "" "k"
+"r" "" "" "r"
+"s" "" "" "(s|S[portuguese])"
+"t" "" "" "t"
+"u" "" "" "u"
+"v" "" "" "(v|b[spanish])"
+"w" "" "" "v" // foreign
+"x" "" "" "(ks|gz|S[portuguese+spanish])" // S/ks Port & Sp, gz Sp, It only ks
+"y" "" "" "i"
+"z" "" "" "z"
Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java?rev=1603614&r1=1603613&r2=1603614&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java (original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java Wed Jun 18 19:57:30 2014
@@ -186,10 +186,6 @@ public class PhoneticEngineRegressionTes
args.put("nameType", "GENERIC");
args.put("ruleType", "APPROX");
assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abran|abrom|abron|avram|avrom|obram|obran|obrom|obron|ovram|ovrom");
-
- args.put("nameType", "ASHKENAZI");
- args.put("ruleType", "APPROX");
- assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abrom|avram|avrom|imbram|imbrom|obram|obrom|ombram|ombrom|ovram|ovrom");
}
/**