You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by gg...@apache.org on 2014/06/19 02:41:40 UTC
svn commit: r1603689 - in /commons/proper/codec/trunk/src:
main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt
test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
Author: ggregory
Date: Thu Jun 19 00:41:39 2014
New Revision: 1603689
URL: http://svn.apache.org/r1603689
Log:
[CODEC-187] Beider Morse Phonetic Matching producing incorrect tokens. Apply patch https://issues.apache.org/jira/secure/attachment/12651251/CODEC-187_ashkenazi_approx_any_v2.patch
Modified:
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt?rev=1603689&r1=1603688&r2=1603689&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt Thu Jun 19 00:41:39 2014
@@ -15,139 +15,141 @@
* limitations under the License.
*/
-// CONSTONANTS
-"ph" "" "" "f" // foreign
-"sh" "" "" "S" // foreign
-"kh" "" "" "x" // foreign
+// ASHKENAZIC
-"gli" "" "" "(gli|l[italian])"
-"gni" "" "" "(gni|ni[italian+french])"
-"gn" "" "[aeou]" "(n[italian+french]|nj[italian+french]|gn)
-"gh" "" "" "g" // It + translit. from Arabic
-"dh" "" "" "d" // translit. from Arabic
-"bh" "" "" "d" // translit. from Arabic
-"th" "" "" "t" // translit. from Arabic
-"lh" "" "" "l" // Port
-"nh" "" "" "nj" // Port
-
-"ig" "[aeiou]" "" "(ig|tS[spanish])"
-"ix" "[aeiou]" "" "S" // Sp
-"tx" "" "" "tS" // Sp
-"tj" "" "$" "tS" // Sp
-"tj" "" "" "dZ" // Sp
-"tg" "" "" "(tg|dZ[spanish])"
-
-"gi" "" "[aeou]" "dZ" // Italian
-"g" "" "y" "Z" // French
-"gg" "" "[ei]" "(gZ[portuguese+french]|dZ[italian+spanish]|x[spanish])"
-"g" "" "[ei]" "(Z[portuguese+french]|dZ[italian+spanish]|x[spanish])"
-
-"guy" "" "" "gi"
-"gue" "" "$" "(k[french]|ge)"
-"gu" "" "[ei]" "(g|gv") // not It
-"gu" "" "[ao]" "gv" // not It
-
-"ñ" "" "" "(n|nj)"
-"ny" "" "" "nj"
-
-"sc" "" "[ei]" "(s|S[italian])"
-"sç" "" "[aeiou]" "s" // not It
-"ss" "" "" "s"
-"ç" "" "" "s" // not It
-
-"ch" "" "[ei]" "(k[italian]|S[portuguese+french]|tS[spanish]|dZ[spanish])"
-"ch" "" "" "(S|tS[spanish]|dZ[spanish])"
-
-"ci" "" "[aeou]" "(tS[italian]|si)"
-"cc" "" "[eiyéèê]" "(tS[italian]|ks[portuguese+french+spanish])"
-"c" "" "[eiyéèê]" "(tS[italian]|s[portuguese+french+spanish])"
- //array("c" "" "[aou]" "(k|C[".($portuguese+$spanish)."])" // "C" means that the actual letter could be "ç" (cedille omitted)
-
-"s" "^" "" "s"
-"s" "[aáuiíoóeéêy]" "[aáuiíoóeéêy]" "(s[spanish]|z[portuguese+french+italian])"
-"s" "" "[dglmnrv]" "(z|Z[portuguese])"
-
-"z" "" "$" "(s|ts[italian]|S[portuguese])" // ts It, s/S/Z Port, s in Sp, z Fr
-"z" "" "[bdgv]" "(z|dz[italian]|Z[portuguese])" // dz It, Z/z Port, z Sp & Fr
-"z" "" "[ptckf]" "(s|ts[italian]|S[portuguese])" // ts It, s/S/z Port, z/s Sp
-"z" "" "" "(z|dz[italian]|ts[italian]|s[spanish])" // ts/dz It, z Port & Fr, z/s Sp
-
-"que" "" "$" "(k[french]|ke)"
-"qu" "" "[eiu]" "k"
-"qu" "" "[ao]" "(kv|k)" // k is It
-
-"ex" "" "[aáuiíoóeéêy]" "(ez[portuguese]|eS[portuguese]|eks|egz)"
-"ex" "" "[cs]" "(e[portuguese]|ek)"
-
-"m" "" "[cdglnrst]" "(m|n[portuguese])"
-"m" "" "[bfpv]" "(m|n[portuguese+spanish])"
-"m" "" "$" "(m|n[portuguese])"
-
-"b" "^" "" "(b|V[spanish])"
-"v" "^" "" "(v|B[spanish])"
-
- // VOWELS
-"eau" "" "" "o" // Fr
-
-"ouh" "" "[aioe]" "(v[french]|uh)"
-"uh" "" "[aioe]" "(v|uh)"
-"ou" "" "[aioe]" "v" // french
-"uo" "" "" "(vo|o)"
-"u" "" "[aie]" "v"
-
-"i" "[aáuoóeéê]" "" "j"
-"i" "" "[aeou]" "j"
-"y" "[aáuiíoóeéê]" "" "j"
-"y" "" "[aeiíou]" "j"
-"e" "" "$" "(e|E[french])"
-
-"ão" "" "" "(au|an)" // Port
-"ãe" "" "" "(aj|an)" // Port
-"ãi" "" "" "(aj|an)" // Port
-"õe" "" "" "(oj|on)" // Port
-"où" "" "" "u" // Fr
-"ou" "" "" "(ou|u[french])"
-
-"â" "" "" "a" // Port & Fr
-"à" "" "" "a" // Port
-"á" "" "" "a" // Port & Sp
-"ã" "" "" "(a|an)" // Port
-"é" "" "" "e"
-"ê" "" "" "e" // Port & Fr
-"è" "" "" "e" // Sp & Fr & It
-"í" "" "" "i" // Port & Sp
-"î" "" "" "i" // Fr
-"ô" "" "" "o" // Port & Fr
-"ó" "" "" "o" // Port & Sp & It
-"õ" "" "" "(o|on)" // Port
-"ò" "" "" "o" // Sp & It
-"ú" "" "" "u" // Port & Sp
-"ü" "" "" "u" // Port & Sp
-
- // LATIN ALPHABET
-"a" "" "" "a"
-"b" "" "" "(b|v[spanish])"
-"c" "" "" "k"
-"d" "" "" "d"
-"e" "" "" "e"
-"f" "" "" "f"
-"g" "" "" "g"
-"h" "" "" "h"
-"i" "" "" "i"
-"j" "" "" "(x[spanish]|Z)" // not It
-"k" "" "" "k"
-"l" "" "" "l"
-"m" "" "" "m"
-"n" "" "" "n"
-"o" "" "" "o"
-"p" "" "" "p"
-"q" "" "" "k"
-"r" "" "" "r"
-"s" "" "" "(s|S[portuguese])"
-"t" "" "" "t"
-"u" "" "" "u"
-"v" "" "" "(v|b[spanish])"
-"w" "" "" "v" // foreign
-"x" "" "" "(ks|gz|S[portuguese+spanish])" // S/ks Port & Sp, gz Sp, It only ks
-"y" "" "" "i"
-"z" "" "" "z"
+// A, E, I, O, P, U should create variants, but a, e, i, o, u should not create any new variant
+// Q = ü ; Y = ä = ö
+// H = initial "H" in German/English
+
+// CONSONANTS
+"b" "" "" "(b|v[spanish])"
+"J" "" "" "z" // Argentina Spanish: "ll" = /Z/, but approximately /Z/ = /z/
+
+// VOWELS
+// "ALL" DIPHTHONGS are interchangeable BETWEEN THEM and with monophthongs of which they are composed ("D" means "diphthong")
+// {a,o} are totally interchangeable if non-stressed; in German "a/o" can actually be from "ä/ö" (that are equivalent to "e")
+// {i,e} are interchangeable if non-stressed, while in German "u" can actually be from "ü" (that is equivalent to "i")
+
+"aiB" "" "[bp]" "(D|Dm)"
+"AiB" "" "[bp]" "(D|Dm)"
+"oiB" "" "[bp]" "(D|Dm)"
+"OiB" "" "[bp]" "(D|Dm)"
+"uiB" "" "[bp]" "(D|Dm)"
+"UiB" "" "[bp]" "(D|Dm)"
+"eiB" "" "[bp]" "(D|Dm)"
+"EiB" "" "[bp]" "(D|Dm)"
+"iiB" "" "[bp]" "(D|Dm)"
+"IiB" "" "[bp]" "(D|Dm)"
+
+"aiB" "" "[dgkstvz]" "(D|Dn)"
+"AiB" "" "[dgkstvz]" "(D|Dn)"
+"oiB" "" "[dgkstvz]" "(D|Dn)"
+"OiB" "" "[dgkstvz]" "(D|Dn)"
+"uiB" "" "[dgkstvz]" "(D|Dn)"
+"UiB" "" "[dgkstvz]" "(D|Dn)"
+"eiB" "" "[dgkstvz]" "(D|Dn)"
+"EiB" "" "[dgkstvz]" "(D|Dn)"
+"iiB" "" "[dgkstvz]" "(D|Dn)"
+"IiB" "" "[dgkstvz]" "(D|Dn)"
+
+"B" "" "[bp]" "(o|om[polish]|im[polish])"
+"B" "" "[dgkstvz]" "(a|o|on[polish]|in[polish])"
+"B" "" "" "(a|o)"
+
+"aiF" "" "[bp]" "(D|Dm)"
+"AiF" "" "[bp]" "(D|Dm)"
+"oiF" "" "[bp]" "(D|Dm)"
+"OiF" "" "[bp]" "(D|Dm)"
+"uiF" "" "[bp]" "(D|Dm)"
+"UiF" "" "[bp]" "(D|Dm)"
+"eiF" "" "[bp]" "(D|Dm)"
+"EiF" "" "[bp]" "(D|Dm)"
+"iiF" "" "[bp]" "(D|Dm)"
+"IiF" "" "[bp]" "(D|Dm)"
+
+"aiF" "" "[dgkstvz]" "(D|Dn)"
+"AiF" "" "[dgkstvz]" "(D|Dn)"
+"oiF" "" "[dgkstvz]" "(D|Dn)"
+"OiF" "" "[dgkstvz]" "(D|Dn)"
+"uiF" "" "[dgkstvz]" "(D|Dn)"
+"UiF" "" "[dgkstvz]" "(D|Dn)"
+"eiF" "" "[dgkstvz]" "(D|Dn)"
+"EiF" "" "[dgkstvz]" "(D|Dn)"
+"iiF" "" "[dgkstvz]" "(D|Dn)"
+"IiF" "" "[dgkstvz]" "(D|Dn)"
+
+"F" "" "[bp]" "(i|im[polish]|om[polish])"
+"F" "" "[dgkstvz]" "(i|in[polish]|on[polish])"
+"F" "" "" "i"
+
+"P" "" "" "(o|u)"
+
+"I" "[aeiouAEIBFOUQY]" "" "i"
+"I" "" "[^aeiouAEBFIOU]e" "(Q[german]|i|D[english])" // "line"
+"I" "" "$" "i"
+"I" "" "[^k]$" "i"
+"Ik" "[lr]" "$" "(ik|Qk[german])"
+"Ik" "" "$" "ik"
+"sIts" "" "$" "(sits|sQts[german])"
+"Its" "" "$" "its"
+"I" "" "" "(Q[german]|i)"
+
+"lE" "[bdfgkmnprsStvzZ]" "$" "(li|il[english])" // Apple < Appel
+"lE" "[bdfgkmnprsStvzZ]" "" "(li|il[english]|lY[german])" // Applebaum < Appelbaum
+
+"au" "" "" "(D|a|u)"
+"ou" "" "" "(D|o|u)"
+
+"ai" "" "" "(D|a|i)"
+"Ai" "" "" "(D|a|i)"
+"oi" "" "" "(D|o|i)"
+"Oi" "" "" "(D|o|i)"
+"ui" "" "" "(D|u|i)"
+"Ui" "" "" "(D|u|i)"
+"ei" "" "" "(D|i)"
+"Ei" "" "" "(D|i)"
+
+"iA" "" "$" "(ia|io)"
+"iA" "" "" "(ia|io|iY[german])"
+"A" "" "[^aeiouAEBFIOU]e" "(a|o|Y[german]|D[english])" // "plane"
+
+"E" "i[^aeiouAEIOU]" "" "(i|Y[german]|[english])" // Wineberg (vineberg/vajneberg) --> vajnberg
+"E" "a[^aeiouAEIOU]" "" "(i|Y[german]|[english])" // Shaneberg (shaneberg/shejneberg) --> shejnberg
+
+"e" "" "[fklmnprstv]$" "i"
+"e" "" "ts$" "i"
+"e" "" "$" "i"
+"e" "[DaoiuAOIUQY]" "" "i"
+"e" "" "[aoAOQY]" "i"
+"e" "" "" "(i|Y[german])"
+
+"E" "" "[fklmnprst]$" "i"
+"E" "" "ts$" "i"
+"E" "" "$" "i"
+"E" "[DaoiuAOIUQY]" "" "i"
+"E" "" "[aoAOQY]" "i"
+"E" "" "" "(i|Y[german])"
+
+"a" "" "" "(a|o)"
+
+"O" "" "[fklmnprstv]$" "o"
+"O" "" "ts$" "o"
+"O" "" "$" "o"
+"O" "[oeiuQY]" "" "o"
+"O" "" "" "(o|Y[german])"
+
+"A" "" "[fklmnprst]$" "(a|o)"
+"A" "" "ts$" "(a|o)"
+"A" "" "$" "(a|o)"
+"A" "[oeiuQY]" "" "(a|o)"
+"A" "" "" "(a|o|Y[german])"
+
+"U" "" "$" "u"
+"U" "[DoiuQY]" "" "u"
+"U" "" "[^k]$" "u"
+"Uk" "[lr]" "$" "(uk|Qk[german])"
+"Uk" "" "$" "uk"
+
+"sUts" "" "$" "(suts|sQts[german])"
+"Uts" "" "$" "uts"
+"U" "" "" "(u|Q[german])"
Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java?rev=1603689&r1=1603688&r2=1603689&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java (original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java Thu Jun 19 00:41:39 2014
@@ -86,7 +86,7 @@ public class PhoneticEngineRegressionTes
// concat is true, ruleType is EXACT
args = new TreeMap<String, String>();
args.put("nameType", "ASHKENAZI");
- assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
+ assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
args.put("ruleType", "EXACT");
assertEquals(encode(args, true, "Angelo"), "andZelo|angelo|anhelo|anxelo");
assertEquals(encode(args, true, "D'Angelo"), "dandZelo|dangelo|danhelo|danxelo");
@@ -97,7 +97,7 @@ public class PhoneticEngineRegressionTes
// concat is false, ruleType is EXACT
args = new TreeMap<String, String>();
args.put("nameType", "ASHKENAZI");
- assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
+ assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
args.put("ruleType", "EXACT");
assertEquals(encode(args, false, "Angelo"), "andZelo|angelo|anhelo|anxelo");
assertEquals(encode(args, false, "D'Angelo"), "dandZelo|dangelo|danhelo|danxelo");
@@ -108,23 +108,23 @@ public class PhoneticEngineRegressionTes
// concat is true, ruleType is APPROX
args = new TreeMap<String, String>();
args.put("nameType", "ASHKENAZI");
- assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
+ assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
args.put("ruleType", "APPROX");
- assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
- assertEquals(encode(args, true, "D'Angelo"), "dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO");
+ assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
+ assertEquals(encode(args, true, "D'Angelo"), "dYngYlo|dYngilo|dangYlo|dangilo|danilo|danxilo|danzilo|dongYlo|dongilo|donilo|donxilo|donzilo");
args.put("languageSet", "italian,greek,spanish");
- assertEquals(encode(args, true, "Angelo"), "AnSelO|AngElO|AngzelO|AnkselO");
+ assertEquals(encode(args, true, "Angelo"), "angilo|anxilo|ongilo|onxilo");
assertEquals(encode(args, true, "1234"), "");
// concat is false, ruleType is APPROX
args = new TreeMap<String, String>();
args.put("nameType", "ASHKENAZI");
- assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
+ assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
args.put("ruleType", "APPROX");
- assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
- assertEquals(encode(args, false, "D'Angelo"), "dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO");
+ assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
+ assertEquals(encode(args, false, "D'Angelo"), "dYngYlo|dYngilo|dangYlo|dangilo|danilo|danxilo|danzilo|dongYlo|dongilo|donilo|donxilo|donzilo");
args.put("languageSet", "italian,greek,spanish");
- assertEquals(encode(args, false, "Angelo"), "AnSelO|AngElO|AngzelO|AnkselO");
+ assertEquals(encode(args, false, "Angelo"), "angilo|anxilo|ongilo|onxilo");
assertEquals(encode(args, false, "1234"), "");
}
@@ -186,6 +186,10 @@ public class PhoneticEngineRegressionTes
args.put("nameType", "GENERIC");
args.put("ruleType", "APPROX");
assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abran|abrom|abron|avram|avrom|obram|obran|obrom|obron|ovram|ovrom");
+
+ args.put("nameType", "ASHKENAZI");
+ args.put("ruleType", "APPROX");
+ assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abrom|avram|avrom|imbram|imbrom|obram|obrom|ombram|ombrom|ovram|ovrom");
}
/**