You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by tn...@apache.org on 2014/07/05 21:58:39 UTC
svn commit: r1608115 - in /commons/proper/codec/trunk/src:
main/java/org/apache/commons/codec/language/bm/
main/resources/org/apache/commons/codec/language/bm/
test/java/org/apache/commons/codec/language/bm/
Author: tn
Date: Sat Jul 5 19:58:38 2014
New Revision: 1608115
URL: http://svn.apache.org/r1608115
Log:
[CODEC-187] Apply patch to make BeiderMorse phonetic engine compatible with v3.3 of the reference implementation.
Added:
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt (with props)
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt (with props)
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt (with props)
Modified:
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_exact_russian.txt
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_hebrew_common.txt
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_german.txt
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_arabic.txt
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_common.txt
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_any.txt
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_arabic.txt
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_hebrew_common.txt
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_any.txt
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_arabic.txt
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_german.txt
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/lang.txt
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java Sat Jul 5 19:58:38 2014
@@ -64,9 +64,12 @@ import org.apache.commons.codec.StringEn
* encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
* Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
* splitting on pipe (<code>|</code>) and indexing under each of these alternatives.
+ * <p>
+ * <b>Note</b>: this version of the Beider-Morse encoding is equivalent with v3.3 of the reference implementation.
*
* @see <a href="http://stevemorse.org/phonetics/bmpm.htm">Beider-Morse Phonetic Matching</a>
* @see <a href="http://stevemorse.org/phoneticinfo.htm">Reference implementation</a>
+ *
* @since 1.6
* @version $Id$
*/
Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java Sat Jul 5 19:58:38 2014
@@ -95,11 +95,11 @@ public class Lang {
private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class);
- private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt";
+ private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/%s_lang.txt";
static {
for (final NameType s : NameType.values()) {
- Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s)));
+ Langs.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s)));
}
}
Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java Sat Jul 5 19:58:38 2014
@@ -73,6 +73,8 @@ public class Languages {
public abstract boolean isSingleton();
public abstract LanguageSet restrictTo(LanguageSet other);
+
+ public abstract LanguageSet merge(LanguageSet other);
}
/**
@@ -128,6 +130,22 @@ public class Languages {
}
@Override
+ public LanguageSet merge(final LanguageSet other) {
+ if (other == NO_LANGUAGES) {
+ return this;
+ } else if (other == ANY_LANGUAGE) {
+ return other;
+ } else {
+ final SomeLanguages sl = (SomeLanguages) other;
+ final Set<String> ls = new HashSet<String>(languages);
+ for (String lang : sl.languages) {
+ ls.add(lang);
+ }
+ return from(ls);
+ }
+ }
+
+ @Override
public String toString() {
return "Languages(" + languages.toString() + ")";
}
@@ -217,6 +235,11 @@ public class Languages {
}
@Override
+ public LanguageSet merge(final LanguageSet other) {
+ return other;
+ }
+
+ @Override
public String toString() {
return "NO_LANGUAGES";
}
@@ -252,6 +275,11 @@ public class Languages {
}
@Override
+ public LanguageSet merge(final LanguageSet other) {
+ return other;
+ }
+
+ @Override
public String toString() {
return "ANY_LANGUAGE";
}
Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java Sat Jul 5 19:58:38 2014
@@ -28,7 +28,7 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
-import java.util.TreeSet;
+import java.util.TreeMap;
import org.apache.commons.codec.language.bm.Languages.LanguageSet;
import org.apache.commons.codec.language.bm.Rule.Phoneme;
@@ -335,7 +335,8 @@ public class PhoneticEngine {
return phonemeBuilder;
}
- final Set<Rule.Phoneme> phonemes = new TreeSet<Rule.Phoneme>(Rule.Phoneme.COMPARATOR);
+ final Map<Rule.Phoneme, Rule.Phoneme> phonemes =
+ new TreeMap<Rule.Phoneme, Rule.Phoneme>(Rule.Phoneme.COMPARATOR);
for (final Rule.Phoneme phoneme : phonemeBuilder.getPhonemes()) {
PhonemeBuilder subBuilder = PhonemeBuilder.empty(phoneme.getLanguages());
@@ -355,10 +356,21 @@ public class PhoneticEngine {
i = rulesApplication.getI();
}
- phonemes.addAll(subBuilder.getPhonemes());
+ // the phonemes map orders the phonemes only based on their text, but ignores the language set
+ // when adding new phonemes, check for equal phonemes and merge their language set, otherwise
+ // phonemes with the same text but different language set get lost
+ for (final Rule.Phoneme newPhoneme : subBuilder.getPhonemes()) {
+ if (phonemes.containsKey(newPhoneme)) {
+ final Rule.Phoneme oldPhoneme = phonemes.remove(newPhoneme);
+ final Rule.Phoneme mergedPhoneme = oldPhoneme.mergeWithLanguage(newPhoneme.getLanguages());
+ phonemes.put(mergedPhoneme, mergedPhoneme);
+ } else {
+ phonemes.put(newPhoneme, newPhoneme);
+ }
+ }
}
- return new PhonemeBuilder(phonemes);
+ return new PhonemeBuilder(phonemes.keySet());
}
/**
Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java Sat Jul 5 19:58:38 2014
@@ -147,6 +147,22 @@ public class Rule {
return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
this.languages.restrictTo(right.languages));
}
+
+ /**
+ * Returns a new Phoneme with the same text but a union of its
+ * current language set and the given one.
+ *
+ * @param lang the language set to merge
+ * @return a new Phoneme
+ */
+ public Phoneme mergeWithLanguage(final LanguageSet lang) {
+ return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
+ }
+
+ @Override
+ public String toString() {
+ return phonemeText.toString() + "[" + languages + "]";
+ }
}
public interface PhonemeExpr {
@@ -442,6 +458,9 @@ public class Rule {
sb.append("Rule");
sb.append("{line=").append(myLine);
sb.append(", loc='").append(loc).append('\'');
+ sb.append(", pat='").append(pat).append('\'');
+ sb.append(", lcon='").append(lCon).append('\'');
+ sb.append(", rcon='").append(rCon).append('\'');
sb.append('}');
return sb.toString();
}
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_exact_russian.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_exact_russian.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_exact_russian.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_exact_russian.txt Sat Jul 5 19:58:38 2014
@@ -16,4 +16,4 @@
*/
"E" "" "" "e"
-"I "" "" "i"
\ No newline at end of file
+"I" "" "" "i"
\ No newline at end of file
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_hebrew_common.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_hebrew_common.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_hebrew_common.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_hebrew_common.txt Sat Jul 5 19:58:38 2014
@@ -24,6 +24,7 @@
"b" "^" "" "b"
"b" "" "" "(b|v)"
+"J" "" "" "l"
"ja" "" "" "i"
"jA" "" "" "i"
"jB" "" "" "i"
@@ -75,17 +76,20 @@
"ou" "^" "" "(u|v|1)"
"o" "^" "" "(u|v|1)"
"O" "^" "" "(u|v|1)"
+"P" "^" "" "(u|v|1)"
"U" "^" "" "(u|v|1)"
"u" "^" "" "(u|v|1)"
"o" "" "$" "(u|1)"
"O" "" "$" "(u|1)"
+"P" "" "$" "(u|1)"
"u" "" "$" "(u|1)"
"U" "" "$" "(u|1)"
"ou" "" "" "u"
"o" "" "" "u"
"O" "" "" "u"
+"P" "" "" "u"
"U" "" "" "u"
"VV" "" "" "u" // alef/ayin + vov from ruleshebrew
@@ -102,8 +106,9 @@
//"z" "" "" "(z|Z)"
//"d" "" "" "(d|dZ)"
-"TB" "" "$" "(t|s)" // tav from ruleshebrew; only Ashkenazic
-"TB" "" "" "t" // tav from ruleshebrew; only Ashkenazic
+"TB" "^" "" "t" // tav from ruleshebrew; only Ashkenazic
+"TB" "" "$" "s" // tav from ruleshebrew; only Ashkenazic
+"TB" "" "" "(t|s)" // tav from ruleshebrew; only Ashkenazic
"T" "" "" "t" // tet from ruleshebrew
//"k" "" "" "(k|x)"
Added: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt?rev=1608115&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt (added)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt Sat Jul 5 19:58:38 2014
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// ASHKENAZIC
+
+// 1. following are rules to accept the language
+// 1.1 Special letter combinations
+zh polish+russian+german+english true
+eau french true
+[aoeiuäöü]h german true
+^vogel german true
+vogel$ german true
+witz german true
+tz$ german+russian+english true
+^tz russian+english true
+güe spanish true
+güi spanish true
+ghe romanian true
+ghi romanian true
+vici$ romanian true
+schi$ romanian true
+chsch german true
+tsch german true
+ssch german true
+sch$ german+russian true
+^sch german+russian true
+^rz polish true
+rz$ polish+german true
+[^aoeiuäöü]rz polish true
+rz[^aoeiuäöü] polish true
+cki$ polish true
+ska$ polish true
+cka$ polish true
+ue german+russian true
+ae german+russian+english true
+oe german+french+russian+english true
+th$ german true
+^th german true
+th[^aoeiu] german true
+mann german true
+cz polish true
+cy polish true
+niew polish true
+stein german true
+heim$ german true
+heimer$ german true
+ii$ russian true
+iy$ russian true
+yy$ russian true
+yi$ russian true
+yj$ russian true
+ij$ russian true
+gaus$ russian true
+gauz$ russian true
+gauz$ russian true
+goltz$ russian true
+gol'tz$ russian true
+golts$ russian true
+gol'ts$ russian true
+^goltz russian true
+^gol'tz russian true
+^golts russian true
+^gol'ts russian true
+gendler$ russian true
+gejmer$ russian true
+gejm$ russian true
+geimer$ russian true
+geim$ russian true
+geymer russian true
+geym$ russian true
+gof$ russian true
+thal german true
+zweig german true
+ck$ german+english true
+c$ polish+romanian+hungarian true
+sz polish+hungarian true
+gue spanish+french true
+gui spanish+french true
+guy french true
+cs$ hungarian true
+^cs hungarian true
+dzs hungarian true
+zs$ hungarian true
+^zs hungarian true
+^wl polish true
+^wr polish+english+german true
+
+gy$ hungarian true
+gy[aeou] hungarian true
+gy hungarian+russian true
+ly hungarian+russian+polish true
+ny hungarian+russian+polish true
+ty hungarian+russian+polish true
+
+// 1.2 special characters
+â romanian+french true
+Ä romanian true
+Ã french true
+ä german true
+á hungarian+spanish true
+Ä
polish true
+Ä polish true
+ç french true
+Ä polish true
+é french+hungarian+spanish true
+è french true
+ê french true
+Ã hungarian+spanish true
+î romanian+french true
+Å polish true
+Å polish true
+ñ spanish true
+ó polish+hungarian+spanish true
+ö german+hungarian true
+õ hungarian true
+Å romanian true
+Å polish true
+Å£ romanian true
+ü german+hungarian true
+ù french true
+ű hungarian true
+ú hungarian+spanish true
+ź polish true
+ż polish true
+
+Ã german true
+
+// Every Cyrillic word has at least one Cyrillic vowel (аёеоиуыэюя)
+а cyrillic true
+ё cyrillic true
+о cyrillic true
+е cyrillic true
+и cyrillic true
+у cyrillic true
+ы cyrillic true
+э cyrillic true
+ю cyrillic true
+я cyrillic true
+
+// Hebrew
+א hebrew true
+ב hebrew true
+ג hebrew true
+ד hebrew true
+ה hebrew true
+ו hebrew true
+ז hebrew true
+ח hebrew true
+ט hebrew true
+י hebrew true
+כ hebrew true
+ל hebrew true
+מ hebrew true
+נ hebrew true
+ס hebrew true
+ע hebrew true
+פ hebrew true
+צ hebrew true
+ק hebrew true
+ר hebrew true
+ש hebrew true
+ת hebrew true
+
+
+// 2. following are rules to reject the language
+// Every Latin character word has at least one Latin vowel
+a cyrillic+hebrew false
+o cyrillic+hebrew false
+e cyrillic+hebrew false
+i cyrillic+hebrew false
+y cyrillic+hebrew+romanian false
+u cyrillic+hebrew false
+
+v[^aoeiuäüö] german false // in german "v" can be found before a vowel only
+y[^aoeiu] german false // in german "y" usually appears only in the last position; sometimes before a vowel
+c[^aohk] german false
+dzi german+english+french false
+ou german false
+aj german+english+french false
+ej german+english+french false
+oj german+english+french false
+uj german+english+french false
+k romanian false
+v polish false
+ky polish false
+eu russian+polish false
+w french+romanian+spanish+hungarian+russian false
+kie french+spanish false
+gie french+romanian+spanish false
+q hungarian+polish+russian+romanian false
+sch hungarian+polish+french+spanish false
+^h russian false
Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt
------------------------------------------------------------------------------
svn:keywords = Id Revision HeadURL
Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt Sat Jul 5 19:58:38 2014
@@ -55,11 +55,11 @@
"sh" "" "[äöü]" "sh" // german
"sh" "" "[aeiou]" "(S[russian+english]|sh)"
"sh" "" "" "S" // russian+english
-
+
"kh" "" "" "(x[russian+english]|kh)"
-
+
"chs" "" "" "(ks[german]|xs|tSs[russian+english])"
-
+
// French "ch" is currently disabled
//array("ch" "" "[ei]" "(x|tS|k[romanian]|S[french])"
//array("ch" "" "" "(x|tS[russian+english]|S[french])"
@@ -212,8 +212,8 @@
"v" "^" "" "(v|f[german])"
"h" "[aeiouyäöü]" "" "" //german
-"h" "" "" "(h|x[".(romanian+polish)."])"
-"h" "^" "" "(h|H[".(english+german)."])" // H can be exact "h" or approximate "kh"
+"h" "" "" "(h|x[romanian+polish])"
+"h" "^" "" "(h|H[english+german])" // H can be exact "h" or approximate "kh"
// VOWELS
"yi" "^" "" "i"
@@ -275,7 +275,7 @@
"ą" "" "[bp]" "om" // polish
"ą" "" "" "on" // polish
-"ä" "" "" "Y" // german
+"ä" "" "" "(Y|e)" // german
"á" "" "" "a" // hungarian
"ă" "" "" "(e[romanian]|a)" //romanian
"à" "" "" "a" // french
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_german.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_german.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_german.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_german.txt Sat Jul 5 19:58:38 2014
@@ -81,7 +81,7 @@
"ae" "" "" "Y"
"oe" "" "" "Y"
"ü" "" "" "Q"
-"ä" "" "" "Y"
+"ä" "" "" "(Y|e)"
"ö" "" "" "Y"
"ei" "" "" "aj"
"ey" "" "" "aj"
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt Sat Jul 5 19:58:38 2014
@@ -29,6 +29,12 @@
"mp" "" "" "(mp|b[greeklatin])"
"ng" "" "" "(ng|g[greeklatin])"
+"B" "" "[fktSs]" "(p|f[spanish])"
+"B" "" "p" ""
+"B" "" "$" "(p|f[spanish])"
+"V" "" "[pktSs]" "(f|p[spanish])"
+"V" "" "f" ""
+"V" "" "$" "(f|p[spanish])"
"B" "" "" "(b|v[spanish])"
"V" "" "" "(v|b[spanish])"
@@ -58,6 +64,7 @@
"lE" "[bdfgkmnprsStvzZ]" "" "(li|il[english]|lY[german])" // Applebaum < Appelbaum
"rE" "[bdfgkmnprsStvzZ]" "" "(ri|ir[english]|rY[german])"
+"EE" "" "" "(i|)"
"ea" "" "" "(D|a|i)"
"au" "" "" "(D|a|u)"
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_arabic.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_arabic.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_arabic.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_arabic.txt Sat Jul 5 19:58:38 2014
@@ -15,6 +15,9 @@
* limitations under the License.
*/
+"1a" "" "" "(D|a)"
+"1i" "" "" "(D|i|e)"
+"1u" "" "" "(D|u|o)"
"j1" "" "" "(ja|je|jo|ju|j)"
"1" "" "" "(a|e|i|o|u|)"
"u" "" "" "(o|u)"
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_common.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_common.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_common.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_common.txt Sat Jul 5 19:58:38 2014
@@ -37,19 +37,19 @@
"sEn" "[rmnl]" "$" "(zn|zon)"
"sEn" "" "$" "(sn|son)"
-"e" "[bdfgklmnprsStvzZ]" "[ln]$" ""
-"i" "[bdfgklmnprsStvzZ]" "[ln]$" ""
-"E" "[bdfgklmnprsStvzZ]" "[ln]$" ""
-"I" "[bdfgklmnprsStvzZ]" "[ln]$" ""
-"Q" "[bdfgklmnprsStvzZ]" "[ln]$" ""
-"Y" "[bdfgklmnprsStvzZ]" "[ln]$" ""
+"e" "[BbdfgklmnprsStvzZ]" "[ln]$" ""
+"i" "[BbdfgklmnprsStvzZ]" "[ln]$" ""
+"E" "[BbdfgklmnprsStvzZ]" "[ln]$" ""
+"I" "[BbdfgklmnprsStvzZ]" "[ln]$" ""
+"Q" "[BbdfgklmnprsStvzZ]" "[ln]$" ""
+"Y" "[BbdfgklmnprsStvzZ]" "[ln]$" ""
-"e" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
-"i" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
-"E" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
-"I" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
-"Q" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
-"Y" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
+"e" "[BbdfgklmnprsStvzZ]" "[ln][BbdfgklmnprsStvzZ]" ""
+"i" "[BbdfgklmnprsStvzZ]" "[ln][BbdfgklmnprsStvzZ]" ""
+"E" "[BbdfgklmnprsStvzZ]" "[ln][BbdfgklmnprsStvzZ]" ""
+"I" "[BbdfgklmnprsStvzZ]" "[ln][BbdfgklmnprsStvzZ]" ""
+"Q" "[BbdfgklmnprsStvzZ]" "[ln][BbdfgklmnprsStvzZ]" ""
+"Y" "[BbdfgklmnprsStvzZ]" "[ln][BbdfgklmnprsStvzZ]" ""
"lEs" "" "" "(lEs|lz)" // Applebaum < Appelbaum (English + blend English-something forms as Finklestein)
"lE" "[bdfgkmnprStvzZ]" "" "(lE|l)" // Applebaum < Appelbaum (English + blend English-something forms as Finklestein)
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_any.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_any.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_any.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_any.txt Sat Jul 5 19:58:38 2014
@@ -28,6 +28,13 @@
"O" "" "" "o"
"P" "" "" "o"
"U" "" "" "u"
-
+
+"B" "" "[fktSs]" "p"
+"B" "" "p" ""
+"B" "" "$" "p"
+"V" "" "[pktSs]" "f"
+"V" "" "f" ""
+"V" "" "$" "f"
+
"B" "" "" "b"
"V" "" "" "v"
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_arabic.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_arabic.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_arabic.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_arabic.txt Sat Jul 5 19:58:38 2014
@@ -15,4 +15,4 @@
* limitations under the License.
*/
-"l" "" "" ""
\ No newline at end of file
+"1" "" "" ""
\ No newline at end of file
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_hebrew_common.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_hebrew_common.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_hebrew_common.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_hebrew_common.txt Sat Jul 5 19:58:38 2014
@@ -23,7 +23,10 @@
"p" "" "" "f"
"b" "^" "" "b"
"b" "" "" "(b|v)"
-
+"B" "" "" "(b|v)" // Spanish "b"
+"V" "" "" "v" // Spanish "v"
+"EE" "" "" "(1|)" // final "e" (english & french)
+
"ja" "" "" "i"
"jA" "" "" "i"
"je" "" "" "i"
@@ -64,17 +67,20 @@
"ou" "^" "" "(u|v|1)"
"o" "^" "" "(u|v|1)"
"O" "^" "" "(u|v|1)"
+"P" "^" "" "(u|v|1)"
"U" "^" "" "(u|v|1)"
"u" "^" "" "(u|v|1)"
"o" "" "$" "(u|1)"
"O" "" "$" "(u|1)"
+"P" "" "$" "(u|1)"
"u" "" "$" "(u|1)"
"U" "" "$" "(u|1)"
"ou" "" "" "u"
"o" "" "" "u"
"O" "" "" "u"
+"P" "" "" "u"
"U" "" "" "u"
"VV" "" "" "u" // alef/ayin + vov from ruleshebrew
@@ -91,8 +97,8 @@
//"z" "" "" "(z|Z)"
//"d" "" "" "(d|dZ)"
-"TB" "" "$" "(t|s)" // tav from ruleshebrew; only Ashkenazic
-"TB" "" "" "t" // tav from ruleshebrew; only Ashkenazic
+"TB" "^" "" "t" // tav from ruleshebrew
+"TB" "" "" "(t|s)" // tav from ruleshebrew; s is only Ashkenazic
"T" "" "" "t" // tet from ruleshebrew
//"k" "" "" "(k|x)"
Added: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt?rev=1608115&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt (added)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt Sat Jul 5 19:58:38 2014
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// GENERIC
+
+// 1. following are rules to accept the language
+// 1.1 Special letter combinations
+^o’ english true
+^o' english true
+^mc english true
+^fitz english true
+ceau french+romanian true
+eau romanian true
+eau$ french true // mp: I've added this
+eaux$ french true // mp: I've added this
+ault$ french true
+oult$ french true
+eux$ french true
+eix$ french true
+glou$ greeklatin true
+uu dutch true
+tx spanish true
+witz german true
+tz$ german+russian+english true
+^tz russian+english true
+poulos$ greeklatin true
+pulos$ greeklatin true
+iou greeklatin true
+sj$ dutch true
+^sj dutch true
+güe spanish true
+güi spanish true
+ghe romanian+greeklatin true
+ghi romanian+greeklatin true
+escu$ romanian true
+esco$ romanian true
+vici$ romanian true
+schi$ romanian true
+ii$ russian true
+iy$ russian true
+yy$ russian true
+yi$ russian true
+^rz polish true
+rz$ polish+german true
+[bcdfgklmnpstwz]rz polish true
+rz[bcdfghklmnpstw] polish true
+cki$ polish true
+ska$ polish true
+cka$ polish true
+ae german+russian+english true
+oe german+french+russian+english+dutch true
+th$ german+english true
+^th german+english+greeklatin true
+mann german true
+cz polish true
+cy polish+greeklatin true
+niew polish true
+etti$ italian true
+eti$ italian true
+ati$ italian true
+ato$ italian true
+[aoei]no$ italian true
+[aoei]ni$ italian true
+esi$ italian true
+oli$ italian true
+field$ english true
+stein german true
+heim$ german true
+heimer$ german true
+thal german true
+zweig german true
+[aeou]h german true
+äh german true
+öh german true
+üh german true
+[ln]h[ao]$ portuguese true
+[ln]h[aou] portuguese+french+german+dutch+czech+spanish+turkish true
+chsch german true
+tsch german true
+sch$ german+russian true
+^sch german+russian true
+ck$ german+english true
+c$ polish+romanian+hungarian+czech+turkish true
+sz polish+hungarian true
+cs$ hungarian true
+^cs hungarian true
+dzs hungarian true
+zs$ hungarian true
+^zs hungarian true
+^wl polish true
+^wr polish+english+german+dutch true
+
+gy$ hungarian true
+gy[aeou] hungarian true
+gy hungarian+russian+french+greeklatin true
+guy french true
+gu[ei] spanish+french+portuguese true
+gu[ao] spanish+portuguese true
+gi[aou] italian+greeklatin true
+
+ly hungarian+russian+polish+greeklatin true
+ny hungarian+russian+polish+spanish+greeklatin true
+ty hungarian+russian+polish+greeklatin true
+
+// 1.2 special characters
+ć polish true
+ç french+spanish+portuguese+turkish true
+č czech true
+ď czech true
+ğ turkish true
+ł polish true
+ń polish true
+ñ spanish true
+ň czech true
+ř czech true
+ś polish true
+ş romanian+turkish true
+š czech true
+ţ romanian true
+ť czech true
+ź polish true
+ż polish true
+
+ß german true
+
+ä german true
+á hungarian+spanish+portuguese+czech+greeklatin true
+â romanian+french+portuguese true
+ă romanian true
+ą polish true
+à portuguese true
+ã portuguese true
+ę polish true
+é french+hungarian+czech+greeklatin true
+è french+spanish+italian true
+ê french true
+ě czech true
+ê french+portuguese true
+í hungarian+spanish+portuguese+czech+greeklatin true
+î romanian+french true
+ı turkish true
+ó polish+hungarian+spanish+italian+portuguese+czech+greeklatin true
+ö german+hungarian+turkish true
+ô french+portuguese true
+õ portuguese+hungarian true
+ò italian+spanish true
+ű hungarian true
+ú hungarian+spanish+portuguese+czech+greeklatin true
+ü german+hungarian+spanish+portuguese+turkish true
+ù french true
+ů czech true
+ý czech+greeklatin true
+
+// Every Cyrillic word has at least one Cyrillic vowel (аÑеоиÑÑÑÑÑ)
+а cyrillic true
+Ñ cyrillic true
+о cyrillic true
+е cyrillic true
+и cyrillic true
+Ñ cyrillic true
+Ñ cyrillic true
+Ñ cyrillic true
+Ñ cyrillic true
+Ñ cyrillic true
+
+// Every Greek word has at least one Greek vowel
+α greek true
+ε greek true
+η greek true
+ι greek true
+ο greek true
+Ï
greek true
+Ï greek true
+
+// Arabic (only initial)
+ا arabic true // alif (isol + init)
+ب arabic true // ba'
+ت arabic true // ta'
+Ø« arabic true // tha'
+ج arabic true // jim
+Ø arabic true // h.a'
+Ø®' arabic true // kha'
+د arabic true // dal (isol + init)
+Ø° arabic true // dhal (isol + init)
+ر arabic true // ra' (isol + init)
+ز arabic true // za' (isol + init)
+س arabic true // sin
+Ø´ arabic true // shin
+ص arabic true // s.ad
+ض arabic true // d.ad
+Ø· arabic true // t.a'
+ظ arabic true // z.a'
+ع arabic true // 'ayn
+غ arabic true // ghayn
+Ù arabic true // fa'
+Ù arabic true // qaf
+Ù arabic true // kaf
+Ù arabic true // lam
+Ù
arabic true // mim
+Ù arabic true // nun
+Ù arabic true // ha'
+Ù arabic true // waw (isol + init)
+Ù arabic true // ya'
+
+Ø¢ arabic true // alif madda
+Ø¥ arabic true // alif + diacritic
+Ø£ arabic true // alif + hamza
+ؤ arabic true // waw + hamza
+ئ arabic true // ya' + hamza
+Ùا arabic true // ligature l+a
+
+// Hebrew
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+ס hebrew true
+×¢ hebrew true
+פ hebrew true
+צ hebrew true
+ק hebrew true
+ר hebrew true
+ש hebrew true
+ת hebrew true
+
+// 2. following are rules to reject the language
+
+// Every Latin character word has at least one Latin vowel
+a cyrillic+hebrew+greek+arabic false
+o cyrillic+hebrew+greek+arabic false
+e cyrillic+hebrew+greek+arabic false
+i cyrillic+hebrew+greek+arabic false
+y cyrillic+hebrew+greek+arabic+romanian+dutch false
+u cyrillic+hebrew+greek+arabic false
+
+j italian false
+j[^aoeiuy] french+spanish+portuguese+greeklatin false
+g czech false
+k romanian+spanish+portuguese+french+italian false
+q hungarian+polish+russian+romanian+czech+dutch+turkish+greeklatin false
+v polish false
+w french+romanian+spanish+hungarian+russian+czech+turkish+greeklatin false
+x czech+hungarian+dutch+turkish false // polish excluded from the list
+
+dj spanish+turkish false
+v[^aoeiu] german false // in german, "v" can be found before a vowel only
+y[^aoeiu] german false // in german, "y" usually appears only in the last position; sometimes before a vowel
+c[^aohk] german false
+dzi german+english+french+turkish false
+ou german false
+a[eiou] turkish false // no diphthongs in Turkish
+ö[eaiou] turkish false
+ü[eaiou] turkish false
+e[aiou] turkish false
+i[aeou] turkish false
+o[aieu] turkish false
+u[aieo] turkish false
+aj german+english+french+dutch false
+ej german+english+french+dutch false
+oj german+english+french+dutch false
+uj german+english+french+dutch false
+eu russian+polish false
+ky polish false
+kie french+spanish+greeklatin false
+gie portuguese+romanian+spanish+greeklatin false
+ch[aou] italian false
+ch turkish false
+son$ german false
+sc[ei] french false
+sch hungarian+polish+french+spanish false
+^h russian false
Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt
------------------------------------------------------------------------------
svn:keywords = Id Revision HeadURL
Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_any.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_any.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_any.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_any.txt Sat Jul 5 19:58:38 2014
@@ -81,7 +81,7 @@
"ck" "" "" "(k|tsk[polish+czech])"
"cz" "" "" "(tS|tsz[czech])" // Polish
- //Proceccing of "h" in various combinations
+ //Processing of "h" in various combinations
"rh" "^" "" "r"
"dh" "^" "" "d"
"bh" "^" "" "b"
@@ -124,7 +124,7 @@
"ouh" "" "[aioe]" "(v[french]|uh)"
"uh" "" "[aioe]" "(v|uh)"
-"h" "." "$" "" // match h at the end of words, but not as a single letter
+"h" "." "$" "" // match h at the end of words, but not as a single letter: difference to the original version
"h" "[aeiouyäöü]" "" "" // german
"h" "^" "" "(h|x[romanian+greeklatin]|H[english+romanian+polish+french+portuguese+italian+spanish])"
@@ -288,7 +288,7 @@
// LANGUAGE SPECIFIC CHARACTERS
"ą" "" "[bp]" "om" // polish
"ą" "" "" "on" // polish
-"ä" "" "" "Y"
+"ä" "" "" "(Y|e)"
"á" "" "" "a" // Port & Sp
"à" "" "" "a"
"â" "" "" "a"
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_arabic.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_arabic.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_arabic.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_arabic.txt Sat Jul 5 19:58:38 2014
@@ -15,60 +15,62 @@
* limitations under the License.
*/
-
-"ا" "" "" "a" // alif isol & init
-
+// General
+"ا" "" "" "a" // alif isol & init
+"ب" "" "$" "b"
"ب" "" "" "b1" // ba' isol
-
+"ت" "" "$" "t"
"ت" "" "" "t1" // ta' isol
-
+"Ø«" "" "$" "t"
"Ø«" "" "" "t1" // tha' isol
-
+"ج" "" "$" "(dZ|Z)"
"ج" "" "" "(dZ1|Z1)" // jim isol
-
+"Ø" "^" "" "1"
+"Ø" "" "$" "1"
"Ø" "" "" "(h1|1)" // h.a' isol
-
+"Ø®" "" "$" "x"
"Ø®" "" "" "x1" // kha' isol
-
+"د" "" "$" "d"
"د" "" "" "d1" // dal isol & init
-
+"Ø°" "" "$" "d"
"Ø°" "" "" "d1" // dhal isol & init
-
-"ر" "" "" "r1" // dhal isol & init
-
+"ر" "" "$" "r"
+"ر" "" "" "r1" // ra' isol & init
+"ز" "" "$" "z"
"ز" "" "" "z1" // za' isol & init
-
+"س" "" "$" "s"
"س" "" "" "s1" // sin isol
-
+"Ø´" "" "$" "S"
"Ø´" "" "" "S1" // shin isol
-
+"ص" "" "$" "s"
"ص" "" "" "s1" // s.ad isol
-
+"ض" "" "$" "d"
"ض" "" "" "d1" // d.ad isol
-
+"Ø·" "" "$" "t"
"Ø·" "" "" "t1" // t.a' isol
-
+"ظ" "" "$" "z"
"ظ" "" "" "z1" // z.a' isol
-
-"ع" "" "" "(h1|1)" // ayin isol
-
+"ع" "^" "" "1"
+"ع" "" "$" "1"
+"ع" "" "" "(h1|1)" // ayin isol
+"غ" "" "$" "g"
"غ" "" "" "g1" // ghayin isol
-
+"Ù" "" "$" "f"
"Ù" "" "" "f1" // fa' isol
-
+"Ù" "" "$" "k"
"Ù" "" "" "k1" // qaf isol
-
+"Ù" "" "$" "k"
"Ù" "" "" "k1" // kaf isol
-
+"Ù" "" "$" "l"
"Ù" "" "" "l1" // lam isol
-
+"Ù
" "" "$" "m"
"Ù
" "" "" "m1" // mim isol
-
+"Ù" "" "$" "n"
"Ù" "" "" "n1" // nun isol
-
+"Ù" "^" "" "1"
+"Ù" "" "$" "1"
"Ù" "" "" "(h1|1)" // h isol
-
+"Ù" "" "$" "(u|v)"
"Ù" "" "" "(u|v1)" // waw, isol + init
-
-
+"Ùâ" "" "$" "(i|j)"
"Ùâ" "" "" "(i|j1)" // ya' isol
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_german.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_german.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_german.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_german.txt Sat Jul 5 19:58:38 2014
@@ -82,7 +82,7 @@
"ae" "" "" "Y"
"oe" "" "" "Y"
"ü" "" "" "Q"
-"ä" "" "" "Y"
+"ä" "" "" "(Y|e)"
"ö" "" "" "Y"
"ei" "" "" "(aj|ej)"
"ey" "" "" "(aj|ej)"
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/lang.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/lang.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/lang.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/lang.txt Sat Jul 5 19:58:38 2014
@@ -1,293 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// 1. following are rules to accept the language
-// 1.1 Special letter combinations
-^o’ english true
-^o' english true
-^mc english true
-^fitz english true
-ceau french+romanian true
-eau$ french true // mp: I've added this
-eaux$ french true // mp: I've added this
-ault$ french true
-oult$ french true
-eux$ french true
-eix$ french true
-glou$ greeklatin true
-uu dutch true
-tx spanish true
-witz german true
-tz$ german+russian+english true
-^tz russian+english true
-poulos$ greeklatin true
-pulos$ greeklatin true
-iou greeklatin true
-sj$ dutch true
-^sj dutch true
-güe spanish true
-güi spanish true
-ghe romanian+greeklatin true
-ghi romanian+greeklatin true
-escu$ romanian true
-esco$ romanian true
-vici$ romanian true
-schi$ romanian true
-ii$ russian true
-iy$ russian true
-yy$ russian true
-yi$ russian true
-^rz polish true
-rz$ polish+german true
-[bcdfgklmnpstwz]rz polish true
-rz[bcdfghklmnpstw] polish true
-etti$ italian true
-eti$ italian true
-ati$ italian true
-ato$ italian true
-[aoei]no$ italian true
-[aoei]ni$ italian true
-esi$ italian true
-oli$ italian true
-field$ english true
-cki$ polish true
-ska$ polish true
-cka$ polish true
-ae german+russian+english true
-oe german+french+russian+english+dutch true
-th$ german+english true
-^th german+english+greeklatin true
-mann german true
-cz polish true
-cy polish+greeklatin true
-niew polish true
-stein german true
-heim$ german true
-heimer$ german true
-thal german true
-zweig german true
-[aeou]h german true
-äh german true
-öh german true
-üh german true
-[ln]h[ao]$ portuguese true
-[ln]h[aou] portuguese+french+german+dutch+czech+spanish+turkish true
-chsch german true
-tsch german true
-sch$ german+russian true
-^sch german+russian true
-ck$ german+english true
-c$ polish+romanian+hungarian+czech+turkish true
-sz polish+hungarian true
-cs$ hungarian true
-^cs hungarian true
-dzs hungarian true
-zs$ hungarian true
-^zs hungarian true
-^wl polish true
-^wr polish+english+german+dutch true
-
-gy$ hungarian true
-gy[aeou] hungarian true
-gy hungarian+russian+french+greeklatin true
-guy french true
-gu[ei] spanish+french+portuguese true
-gu[ao] spanish+portuguese true
-gi[aou] italian+greeklatin true
-
-ly hungarian+russian+polish+greeklatin true
-ny hungarian+russian+polish+spanish+greeklatin true
-ty hungarian+russian+polish+greeklatin true
-
-// 1.2 special characters
-ć polish true
-ç french+spanish+portuguese+turkish true
-č czech true
-ď czech true
-ğ turkish true
-ł polish true
-ń polish true
-ñ spanish true
-ň czech true
-ř czech true
-ś polish true
-ş romanian+turkish true
-š czech true
-ţ romanian true
-ť czech true
-ź polish true
-ż polish true
-
-ß german true
-
-ä german true
-á hungarian+spanish+portuguese+czech+greeklatin true
-â romanian+french+portuguese true
-ă romanian true
-ą polish true
-à portuguese true
-ã portuguese true
-ę polish true
-é french+hungarian+czech+greeklatin true
-è french+spanish+italian true
-ê french true
-ě czech true
-ê french+portuguese true
-í hungarian+spanish+portuguese+czech+greeklatin true
-î romanian+french true
-ı turkish true
-ó polish+hungarian+spanish+italian+portuguese+czech+greeklatin true
-ö german+hungarian+turkish true
-ô french+portuguese true
-õ portuguese+hungarian true
-ò italian+spanish true
-ű hungarian true
-ú hungarian+spanish+portuguese+czech+greeklatin true
-ü german+hungarian+spanish+portuguese+turkish true
-ù french true
-ů czech true
-ý czech+greeklatin true
-
-// Every Cyrillic word has at least one Cyrillic vowel (аÑеоиÑÑÑÑÑ)
-а cyrillic true
-Ñ cyrillic true
-о cyrillic true
-е cyrillic true
-и cyrillic true
-Ñ cyrillic true
-Ñ cyrillic true
-Ñ cyrillic true
-Ñ cyrillic true
-Ñ cyrillic true
-
-// Every Greek word has at least one Greek vowel
-α greek true
-ε greek true
-η greek true
-ι greek true
-ο greek true
-Ï
greek true
-Ï greek true
-
-// Arabic (only initial)
-ا arabic true // alif (isol + init)
-ب arabic true // ba'
-ت arabic true // ta'
-Ø« arabic true // tha'
-ج arabic true // jim
-Ø arabic true // h.a'
-Ø®' arabic true // kha'
-د arabic true // dal (isol + init)
-Ø° arabic true // dhal (isol + init)
-ر arabic true // ra' (isol + init)
-ز arabic true // za' (isol + init)
-س arabic true // sin
-Ø´ arabic true // shin
-ص arabic true // s.ad
-ض arabic true // d.ad
-Ø· arabic true // t.a'
-ظ arabic true // z.a'
-ع arabic true // 'ayn
-غ arabic true // ghayn
-Ù arabic true // fa'
-Ù arabic true // qaf
-Ù arabic true // kaf
-Ù arabic true // lam
-Ù
arabic true // mim
-Ù arabic true // nun
-Ù arabic true // ha'
-Ù arabic true // waw (isol + init)
-Ù arabic true // ya'
-
-Ø¢ arabic true // alif madda
-Ø¥ arabic true // alif + diacritic
-Ø£ arabic true // alif + hamza
-ؤ arabic true // waw + hamza
-ئ arabic true // ya' + hamza
-
-
-// Hebrew
-× hebrew true
-× hebrew true
-× hebrew true
-× hebrew true
-× hebrew true
-× hebrew true
-× hebrew true
-× hebrew true
-× hebrew true
-× hebrew true
-× hebrew true
-× hebrew true
-× hebrew true
-× hebrew true
-ס hebrew true
-×¢ hebrew true
-פ hebrew true
-צ hebrew true
-ק hebrew true
-ר hebrew true
-ש hebrew true
-ת hebrew true
-
-// 2. following are rules to reject the language
-
-// Every Latin character word has at least one Latin vowel
-a cyrillic+hebrew+greek+arabic false
-o cyrillic+hebrew+greek+arabic false
-e cyrillic+hebrew+greek+arabic false
-i cyrillic+hebrew+greek+arabic false
-y cyrillic+hebrew+greek+arabic+romanian+dutch false
-u cyrillic+hebrew+greek+arabic false
-
-j italian false
-j[^aoeiuy] french+spanish+portuguese+greeklatin false
-g czech false
-k romanian+spanish+portuguese+french+italian false
-q hungarian+polish+russian+romanian+czech+dutch+turkish+greeklatin false
-v polish false
-w french+romanian+spanish+hungarian+russian+czech+turkish+greeklatin false
-x czech+hungarian+dutch+turkish false // polish excluded from the list
-
-dj spanish+turkish false
-v[^aoeiu] german false // in german, "v" can be found before a vowel only
-y[^aoeiu] german false // in german, "y" usually appears only in the last position; sometimes before a vowel
-c[^aohk] german false
-dzi german+english+french+turkish false
-ou german false
-a[eiou] turkish false // no diphthongs in Turkish
-ö[eaio] turkish false
-ü[eaio] turkish false
-e[aiou] turkish false
-i[aeou] turkish false
-o[aieu] turkish false
-u[aieo] turkish false
-aj german+english+french+dutch false
-ej german+english+french+dutch false
-oj german+english+french+dutch false
-uj german+english+french+dutch false
-eu russian+polish false
-ky polish false
-kie french+spanish+greeklatin false
-gie portuguese+romanian+spanish+greeklatin false
-ch[aou] italian false
-ch turkish false
-son$ german false
-sc[ei] french false
-sch hungarian+polish+french+spanish false
-^h russian false
-etti$ greeklatin false
Added: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt?rev=1608115&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt (added)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt Sat Jul 5 19:58:38 2014
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// SEPHARDIC
+
+// 1. following are rules to accept the language
+// 1.1 Special letter combinations
+eau french true
+ou french true
+gni italian+french true
+tx spanish true
+tj spanish true
+gy french true
+guy french true
+
+sh spanish+portuguese true // English, but no sign for /sh/ in these languages
+
+lh portuguese true
+nh portuguese true
+ny spanish true
+
+gue spanish+french true
+gui spanish+french true
+gia italian true
+gie italian true
+gio italian true
+giu italian true
+
+// 1.2 special characters
+ñ spanish true
+â portuguese+french true
+á portuguese+spanish true
+à portuguese true
+ã portuguese true
+ê french+portuguese true
+í portuguese+spanish true
+î french true
+ô french+portuguese true
+õ portuguese true
+ò italian+spanish true
+ú portuguese+spanish true
+ù french true
+ü portuguese+spanish true
+
+// Hebrew
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+× hebrew true
+ס hebrew true
+×¢ hebrew true
+פ hebrew true
+צ hebrew true
+ק hebrew true
+ר hebrew true
+ש hebrew true
+ת hebrew true
+
+// 2. following are rules to reject the language
+
+// Every Latin character word has at least one Latin vowel
+a hebrew false
+o hebrew false
+e hebrew false
+i hebrew false
+y hebrew false
+u hebrew false
+
+kh spanish false
+gua italian false
+guo italian false
+ç italian false
+cha italian false
+cho italian false
+chu italian false
+j italian false
+dj spanish false
+sce french false
+sci french false
+ó french false
+è portuguese false
Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt
------------------------------------------------------------------------------
svn:keywords = Id Revision HeadURL
Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java (original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java Sat Jul 5 19:58:38 2014
@@ -17,7 +17,7 @@
package org.apache.commons.codec.language.bm;
-import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.*;
import java.util.Arrays;
import java.util.HashSet;
@@ -185,11 +185,16 @@ public class PhoneticEngineRegressionTes
Map<String, String> args = new TreeMap<String, String>();
args.put("nameType", "GENERIC");
args.put("ruleType", "APPROX");
+
assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abran|abrom|abron|avram|avrom|obram|obran|obrom|obron|ovram|ovrom");
+ assertEquals(encode(args, true, "Bendzin"), "bndzn|bntsn|bnzn|vndzn|vntsn");
args.put("nameType", "ASHKENAZI");
args.put("ruleType", "APPROX");
+
assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abrom|avram|avrom|imbram|imbrom|obram|obrom|ombram|ombrom|ovram|ovrom");
+ assertEquals(encode(args, true, "Halpern"), "YlpYrn|Ylpirn|alpYrn|alpirn|olpYrn|olpirn|xalpirn|xolpirn");
+
}
/**
Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java (original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java Sat Jul 5 19:58:38 2014
@@ -17,8 +17,7 @@
package org.apache.commons.codec.language.bm;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.*;
import java.util.Arrays;
import java.util.List;
@@ -41,8 +40,8 @@ public class PhoneticEngineTest {
public static List<Object[]> data() {
return Arrays
.asList(new Object[] { "Renault", "rinD|rinDlt|rina|rinalt|rino|rinolt|rinu|rinult", NameType.GENERIC, RuleType.APPROX, Boolean.TRUE, TEN },
- new Object[] { "Renault", "rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult", NameType.ASHKENAZI, RuleType.APPROX, Boolean.TRUE, TEN },
- new Object[] { "Renault", "rYnDlt", NameType.ASHKENAZI, RuleType.APPROX, Boolean.TRUE, Integer.valueOf(1) },
+ new Object[] { "Renault", "rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinolt|rinult", NameType.ASHKENAZI, RuleType.APPROX, Boolean.TRUE, TEN },
+ new Object[] { "Renault", "rinDlt", NameType.ASHKENAZI, RuleType.APPROX, Boolean.TRUE, Integer.valueOf(1) },
new Object[] { "Renault", "rinDlt", NameType.SEPHARDIC, RuleType.APPROX, Boolean.TRUE, TEN },
new Object[] { "SntJohn-Smith", "sntjonsmit", NameType.GENERIC, RuleType.EXACT, Boolean.TRUE, TEN },
new Object[] { "d'ortley", "(ortlaj|ortlej)-(dortlaj|dortlej)", NameType.GENERIC, RuleType.EXACT, Boolean.TRUE, TEN },