You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by tn...@apache.org on 2014/07/05 21:58:39 UTC

svn commit: r1608115 - in /commons/proper/codec/trunk/src: main/java/org/apache/commons/codec/language/bm/ main/resources/org/apache/commons/codec/language/bm/ test/java/org/apache/commons/codec/language/bm/

Author: tn
Date: Sat Jul  5 19:58:38 2014
New Revision: 1608115

URL: http://svn.apache.org/r1608115
Log:
[CODEC-187] Apply patch to make BeiderMorse phonetic engine compatible with v3.3 of the reference implementation.

Added:
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt   (with props)
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt   (with props)
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt   (with props)
Modified:
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_exact_russian.txt
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_hebrew_common.txt
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_german.txt
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_arabic.txt
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_common.txt
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_any.txt
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_arabic.txt
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_hebrew_common.txt
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_any.txt
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_arabic.txt
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_german.txt
    commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/lang.txt
    commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
    commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java Sat Jul  5 19:58:38 2014
@@ -64,9 +64,12 @@ import org.apache.commons.codec.StringEn
  * encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
  * Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
  * splitting on pipe (<code>|</code>) and indexing under each of these alternatives.
+ * <p>
+ * <b>Note</b>: this version of the Beider-Morse encoding is equivalent with v3.3 of the reference implementation.
  *
  * @see <a href="http://stevemorse.org/phonetics/bmpm.htm">Beider-Morse Phonetic Matching</a>
  * @see <a href="http://stevemorse.org/phoneticinfo.htm">Reference implementation</a>
+ *
  * @since 1.6
  * @version $Id$
  */

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java Sat Jul  5 19:58:38 2014
@@ -95,11 +95,11 @@ public class Lang {
 
     private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class);
 
-    private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt";
+    private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/%s_lang.txt";
 
     static {
         for (final NameType s : NameType.values()) {
-            Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s)));
+            Langs.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s)));
         }
     }
 

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java Sat Jul  5 19:58:38 2014
@@ -73,6 +73,8 @@ public class Languages {
         public abstract boolean isSingleton();
 
         public abstract LanguageSet restrictTo(LanguageSet other);
+
+        public abstract LanguageSet merge(LanguageSet other);
     }
 
     /**
@@ -128,6 +130,22 @@ public class Languages {
         }
 
         @Override
+        public LanguageSet merge(final LanguageSet other) {
+            if (other == NO_LANGUAGES) {
+                return this;
+            } else if (other == ANY_LANGUAGE) {
+                return other;
+            } else {
+                final SomeLanguages sl = (SomeLanguages) other;
+                final Set<String> ls = new HashSet<String>(languages);
+                for (String lang : sl.languages) {
+                  ls.add(lang);
+                }
+                return from(ls);
+            }
+        }
+
+        @Override
         public String toString() {
             return "Languages(" + languages.toString() + ")";
         }
@@ -217,6 +235,11 @@ public class Languages {
         }
 
         @Override
+        public LanguageSet merge(final LanguageSet other) {
+            return other;
+        }
+
+        @Override
         public String toString() {
             return "NO_LANGUAGES";
         }
@@ -252,6 +275,11 @@ public class Languages {
         }
 
         @Override
+        public LanguageSet merge(final LanguageSet other) {
+            return other;
+        }
+
+        @Override
         public String toString() {
             return "ANY_LANGUAGE";
         }

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java Sat Jul  5 19:58:38 2014
@@ -28,7 +28,7 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
-import java.util.TreeSet;
+import java.util.TreeMap;
 
 import org.apache.commons.codec.language.bm.Languages.LanguageSet;
 import org.apache.commons.codec.language.bm.Rule.Phoneme;
@@ -335,7 +335,8 @@ public class PhoneticEngine {
             return phonemeBuilder;
         }
 
-        final Set<Rule.Phoneme> phonemes = new TreeSet<Rule.Phoneme>(Rule.Phoneme.COMPARATOR);
+        final Map<Rule.Phoneme, Rule.Phoneme> phonemes =
+            new TreeMap<Rule.Phoneme, Rule.Phoneme>(Rule.Phoneme.COMPARATOR);
 
         for (final Rule.Phoneme phoneme : phonemeBuilder.getPhonemes()) {
             PhonemeBuilder subBuilder = PhonemeBuilder.empty(phoneme.getLanguages());
@@ -355,10 +356,21 @@ public class PhoneticEngine {
                 i = rulesApplication.getI();
             }
 
-            phonemes.addAll(subBuilder.getPhonemes());
+            // the phonemes map orders the phonemes only based on their text, but ignores the language set
+            // when adding new phonemes, check for equal phonemes and merge their language set, otherwise
+            // phonemes with the same text but different language set get lost
+            for (final Rule.Phoneme newPhoneme : subBuilder.getPhonemes()) {
+                if (phonemes.containsKey(newPhoneme)) {
+                    final Rule.Phoneme oldPhoneme = phonemes.remove(newPhoneme);
+                    final Rule.Phoneme mergedPhoneme = oldPhoneme.mergeWithLanguage(newPhoneme.getLanguages());
+                    phonemes.put(mergedPhoneme, mergedPhoneme);
+                } else {
+                    phonemes.put(newPhoneme, newPhoneme);
+                }
+            }
         }
 
-        return new PhonemeBuilder(phonemes);
+        return new PhonemeBuilder(phonemes.keySet());
     }
 
     /**

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java Sat Jul  5 19:58:38 2014
@@ -147,6 +147,22 @@ public class Rule {
             return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
                                this.languages.restrictTo(right.languages));
         }
+
+        /**
+         * Returns a new Phoneme with the same text but a union of its
+         * current language set and the given one.
+         *
+         * @param lang the language set to merge
+         * @return a new Phoneme
+         */
+        public Phoneme mergeWithLanguage(final LanguageSet lang) {
+          return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
+        }
+
+        @Override
+        public String toString() {
+          return phonemeText.toString() + "[" + languages + "]";
+        }
     }
 
     public interface PhonemeExpr {
@@ -442,6 +458,9 @@ public class Rule {
                                         sb.append("Rule");
                                         sb.append("{line=").append(myLine);
                                         sb.append(", loc='").append(loc).append('\'');
+                                        sb.append(", pat='").append(pat).append('\'');
+                                        sb.append(", lcon='").append(lCon).append('\'');
+                                        sb.append(", rcon='").append(rCon).append('\'');
                                         sb.append('}');
                                         return sb.toString();
                                     }

Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_exact_russian.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_exact_russian.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_exact_russian.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_exact_russian.txt Sat Jul  5 19:58:38 2014
@@ -16,4 +16,4 @@
  */
 
 "E" "" "" "e"
-"I "" "" "i"
\ No newline at end of file
+"I" "" "" "i"
\ No newline at end of file

Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_hebrew_common.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_hebrew_common.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_hebrew_common.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_hebrew_common.txt Sat Jul  5 19:58:38 2014
@@ -24,6 +24,7 @@
 "b" "^" "" "b"    
 "b" "" "" "(b|v)"    
         
+"J" "" "" "l" 
 "ja" "" "" "i"
 "jA" "" "" "i"
 "jB" "" "" "i"
@@ -75,17 +76,20 @@
 "ou" "^" "" "(u|v|1)"
 "o" "^" "" "(u|v|1)"
 "O" "^" "" "(u|v|1)"
+"P" "^" "" "(u|v|1)" 
 "U" "^" "" "(u|v|1)"
 "u" "^" "" "(u|v|1)"
     
 "o" "" "$" "(u|1)"
 "O" "" "$" "(u|1)"
+"P" "" "$" "(u|1)" 
 "u" "" "$" "(u|1)"
 "U" "" "$" "(u|1)"
     
 "ou" "" "" "u"
 "o" "" "" "u"
 "O" "" "" "u"
+"P" "" "" "u" 
 "U" "" "" "u"
         
 "VV" "" "" "u" // alef/ayin + vov from ruleshebrew
@@ -102,8 +106,9 @@
     //"z" "" "" "(z|Z)"
     //"d" "" "" "(d|dZ)"
        
-"TB" "" "$" "(t|s)" // tav from ruleshebrew; only Ashkenazic
-"TB" "" "" "t" // tav from ruleshebrew; only Ashkenazic
+"TB" "^" "" "t" // tav from ruleshebrew; only Ashkenazic
+"TB" "" "$" "s" // tav from ruleshebrew; only Ashkenazic
+"TB" "" "" "(t|s)" // tav from ruleshebrew; only Ashkenazic
 "T" "" "" "t"   // tet from ruleshebrew
     
    //"k" "" "" "(k|x)"

Added: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt?rev=1608115&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt (added)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt Sat Jul  5 19:58:38 2014
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// ASHKENAZIC
+
+// 1. following are rules to accept the language
+// 1.1 Special letter combinations
+zh polish+russian+german+english true
+eau french true
+[aoeiuäöü]h german true
+^vogel german true
+vogel$ german true
+witz german true
+tz$ german+russian+english true
+^tz russian+english true
+güe spanish true
+güi spanish true
+ghe romanian true
+ghi romanian true
+vici$ romanian true
+schi$ romanian true
+chsch german true
+tsch german true
+ssch german true
+sch$ german+russian true
+^sch german+russian true
+^rz polish true
+rz$ polish+german true
+[^aoeiuäöü]rz polish true
+rz[^aoeiuäöü] polish true
+cki$ polish true
+ska$ polish true
+cka$ polish true
+ue german+russian true
+ae german+russian+english true
+oe german+french+russian+english true
+th$ german true
+^th german true
+th[^aoeiu] german true
+mann german true
+cz polish true
+cy polish true
+niew polish true
+stein german true
+heim$ german true
+heimer$ german true
+ii$ russian true
+iy$ russian true
+yy$ russian true
+yi$ russian true
+yj$ russian true
+ij$ russian true
+gaus$ russian true
+gauz$ russian true
+gauz$ russian true
+goltz$ russian true
+gol'tz$ russian true 
+golts$ russian true 
+gol'ts$ russian true 
+^goltz russian true
+^gol'tz russian true 
+^golts russian true 
+^gol'ts russian true 
+gendler$ russian true 
+gejmer$ russian true 
+gejm$ russian true 
+geimer$ russian true 
+geim$ russian true 
+geymer russian true 
+geym$ russian true 
+gof$ russian true 
+thal german true
+zweig german true
+ck$ german+english true
+c$ polish+romanian+hungarian true
+sz polish+hungarian true
+gue spanish+french true
+gui spanish+french true
+guy french true
+cs$ hungarian true
+^cs hungarian true
+dzs hungarian true
+zs$ hungarian true
+^zs hungarian true
+^wl polish true
+^wr polish+english+german true
+
+gy$ hungarian true
+gy[aeou] hungarian true
+gy hungarian+russian true
+ly hungarian+russian+polish true
+ny hungarian+russian+polish true
+ty hungarian+russian+polish true 
+
+// 1.2 special characters    
+â romanian+french true
+ă romanian true
+à french true
+ä german true
+á hungarian+spanish true
+ą polish true
+ć polish true
+ç french true
+ę polish true
+é french+hungarian+spanish true
+è french true
+ê french true
+í hungarian+spanish true
+î romanian+french true
+ł polish true
+ń polish true
+ñ spanish true
+ó polish+hungarian+spanish true
+ö german+hungarian true
+õ hungarian true
+ş romanian true
+ś polish true
+ţ romanian true
+ü german+hungarian true
+ù french true
+ű hungarian true
+ú hungarian+spanish true
+ź polish true
+ż polish true
+    
+ß german true
+
+// Every Cyrillic word has at least one Cyrillic vowel (аёеоиуыэюя) 
+а cyrillic true 
+ё cyrillic true 
+о cyrillic true 
+е cyrillic true 
+и cyrillic true 
+у cyrillic true 
+ы cyrillic true 
+э cyrillic true 
+ю cyrillic true 
+я cyrillic true 
+    
+// Hebrew 
+א hebrew true
+ב hebrew true
+ג hebrew true
+ד hebrew true
+ה hebrew true
+ו hebrew true
+ז hebrew true
+ח hebrew true
+ט hebrew true
+י hebrew true
+כ hebrew true
+ל hebrew true
+מ hebrew true
+נ hebrew true
+ס hebrew true
+ע hebrew true
+פ hebrew true
+צ hebrew true 
+ק hebrew true
+ר hebrew true
+ש hebrew true
+ת hebrew true
+    
+    
+// 2. following are rules to reject the language
+// Every Latin character word has at least one Latin vowel  
+a cyrillic+hebrew false 
+o cyrillic+hebrew false 
+e cyrillic+hebrew false 
+i cyrillic+hebrew false 
+y cyrillic+hebrew+romanian false 
+u cyrillic+hebrew false 
+  
+v[^aoeiuäüö] german false // in german "v" can be found before a vowel only
+y[^aoeiu] german false  // in german "y" usually appears only in the last position; sometimes before a vowel
+c[^aohk] german false
+dzi german+english+french false
+ou german false
+aj german+english+french false
+ej german+english+french false
+oj german+english+french false
+uj german+english+french false
+k romanian false
+v polish false
+ky polish false
+eu russian+polish false
+w french+romanian+spanish+hungarian+russian false
+kie french+spanish false
+gie french+romanian+spanish false
+q hungarian+polish+russian+romanian false
+sch hungarian+polish+french+spanish false
+^h russian false

Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt
------------------------------------------------------------------------------
    svn:keywords = Id Revision HeadURL

Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_lang.txt
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt Sat Jul  5 19:58:38 2014
@@ -55,11 +55,11 @@
 "sh" "" "[äöü]" "sh" // german
 "sh" "" "[aeiou]" "(S[russian+english]|sh)"
 "sh" "" "" "S" // russian+english
-    
+
 "kh" "" "" "(x[russian+english]|kh)"
-    
+
 "chs" "" "" "(ks[german]|xs|tSs[russian+english])"
-    
+
     // French "ch" is currently disabled
     //array("ch" "" "[ei]" "(x|tS|k[romanian]|S[french])"
     //array("ch" "" "" "(x|tS[russian+english]|S[french])"
@@ -212,8 +212,8 @@
 "v" "^" "" "(v|f[german])"
         
 "h" "[aeiouyäöü]" "" "" //german
-"h" "" "" "(h|x[".(romanian+polish)."])"
-"h" "^" "" "(h|H[".(english+german)."])" // H can be exact "h" or approximate "kh"
+"h" "" "" "(h|x[romanian+polish])"
+"h" "^" "" "(h|H[english+german])" // H can be exact "h" or approximate "kh"
     
  // VOWELS  
 "yi" "^" "" "i"
@@ -275,7 +275,7 @@
     
 "ą" "" "[bp]" "om"  // polish
 "ą" "" "" "on"  // polish
-"ä" "" "" "Y"  // german
+"ä" "" "" "(Y|e)" // german
 "á" "" "" "a" // hungarian
 "ă" "" "" "(e[romanian]|a)" //romanian
 "à" "" "" "a"  // french

Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_german.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_german.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_german.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_german.txt Sat Jul  5 19:58:38 2014
@@ -81,7 +81,7 @@
 "ae" "" "" "Y" 
 "oe" "" "" "Y" 
 "ü" "" "" "Q"
-"ä" "" "" "Y"
+"ä" "" "" "(Y|e)"
 "ö" "" "" "Y"
 "ei" "" "" "aj"
 "ey" "" "" "aj"

Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt Sat Jul  5 19:58:38 2014
@@ -29,6 +29,12 @@
 "mp" "" "" "(mp|b[greeklatin])"
 "ng" "" "" "(ng|g[greeklatin])"
 
+"B" "" "[fktSs]" "(p|f[spanish])" 
+"B" "" "p" "" 
+"B" "" "$" "(p|f[spanish])" 
+"V" "" "[pktSs]" "(f|p[spanish])" 
+"V" "" "f" "" 
+"V" "" "$" "(f|p[spanish])" 
 "B" "" "" "(b|v[spanish])"
 "V" "" "" "(v|b[spanish])"
     
@@ -58,6 +64,7 @@
 "lE" "[bdfgkmnprsStvzZ]" "" "(li|il[english]|lY[german])"  // Applebaum < Appelbaum
 "rE" "[bdfgkmnprsStvzZ]" "" "(ri|ir[english]|rY[german])"
     
+"EE" "" "" "(i|)" 
 "ea" "" "" "(D|a|i)"
     
 "au" "" "" "(D|a|u)"

Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_arabic.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_arabic.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_arabic.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_arabic.txt Sat Jul  5 19:58:38 2014
@@ -15,6 +15,9 @@
  * limitations under the License.
  */
 
+"1a" "" "" "(D|a)" 
+"1i" "" "" "(D|i|e)" 
+"1u" "" "" "(D|u|o)" 
 "j1" "" "" "(ja|je|jo|ju|j)"
 "1" "" "" "(a|e|i|o|u|)"
 "u" "" "" "(o|u)"

Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_common.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_common.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_common.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_common.txt Sat Jul  5 19:58:38 2014
@@ -37,19 +37,19 @@
 "sEn" "[rmnl]" "$" "(zn|zon)"
 "sEn" "" "$" "(sn|son)"
             
-"e" "[bdfgklmnprsStvzZ]" "[ln]$" ""
-"i" "[bdfgklmnprsStvzZ]" "[ln]$" ""
-"E" "[bdfgklmnprsStvzZ]" "[ln]$" ""
-"I" "[bdfgklmnprsStvzZ]" "[ln]$" ""
-"Q" "[bdfgklmnprsStvzZ]" "[ln]$" ""
-"Y" "[bdfgklmnprsStvzZ]" "[ln]$" ""
+"e" "[BbdfgklmnprsStvzZ]" "[ln]$" ""
+"i" "[BbdfgklmnprsStvzZ]" "[ln]$" ""
+"E" "[BbdfgklmnprsStvzZ]" "[ln]$" ""
+"I" "[BbdfgklmnprsStvzZ]" "[ln]$" ""
+"Q" "[BbdfgklmnprsStvzZ]" "[ln]$" ""
+"Y" "[BbdfgklmnprsStvzZ]" "[ln]$" ""
 
-"e" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
-"i" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
-"E" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
-"I" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
-"Q" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
-"Y" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
+"e" "[BbdfgklmnprsStvzZ]" "[ln][BbdfgklmnprsStvzZ]" ""
+"i" "[BbdfgklmnprsStvzZ]" "[ln][BbdfgklmnprsStvzZ]" ""
+"E" "[BbdfgklmnprsStvzZ]" "[ln][BbdfgklmnprsStvzZ]" ""
+"I" "[BbdfgklmnprsStvzZ]" "[ln][BbdfgklmnprsStvzZ]" ""
+"Q" "[BbdfgklmnprsStvzZ]" "[ln][BbdfgklmnprsStvzZ]" ""
+"Y" "[BbdfgklmnprsStvzZ]" "[ln][BbdfgklmnprsStvzZ]" ""
 
 "lEs" "" "" "(lEs|lz)"  // Applebaum < Appelbaum (English + blend English-something forms as Finklestein)
 "lE" "[bdfgkmnprStvzZ]" "" "(lE|l)"  // Applebaum < Appelbaum (English + blend English-something forms as Finklestein)

Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_any.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_any.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_any.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_any.txt Sat Jul  5 19:58:38 2014
@@ -28,6 +28,13 @@
 "O" "" "" "o"
 "P" "" "" "o"
 "U" "" "" "u"
-    
+
+"B" "" "[fktSs]" "p" 
+"B" "" "p" "" 
+"B" "" "$" "p" 
+"V" "" "[pktSs]" "f" 
+"V" "" "f" "" 
+"V" "" "$" "f" 
+
 "B" "" "" "b"
 "V" "" "" "v"

Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_arabic.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_arabic.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_arabic.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_exact_arabic.txt Sat Jul  5 19:58:38 2014
@@ -15,4 +15,4 @@
  * limitations under the License.
  */
 
-"l" "" "" ""
\ No newline at end of file
+"1" "" "" ""
\ No newline at end of file

Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_hebrew_common.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_hebrew_common.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_hebrew_common.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_hebrew_common.txt Sat Jul  5 19:58:38 2014
@@ -23,7 +23,10 @@
 "p" "" "" "f"   
 "b" "^" "" "b"    
 "b" "" "" "(b|v)"    
-        
+"B" "" "" "(b|v)" // Spanish "b"
+"V" "" "" "v" // Spanish "v"
+"EE" "" "" "(1|)" // final "e" (english & french)
+
 "ja" "" "" "i"
 "jA" "" "" "i"  
 "je" "" "" "i"
@@ -64,17 +67,20 @@
 "ou" "^" "" "(u|v|1)"
 "o" "^" "" "(u|v|1)"
 "O" "^" "" "(u|v|1)"
+"P" "^" "" "(u|v|1)" 
 "U" "^" "" "(u|v|1)"
 "u" "^" "" "(u|v|1)"
     
 "o" "" "$" "(u|1)"
 "O" "" "$" "(u|1)"
+"P" "" "$" "(u|1)" 
 "u" "" "$" "(u|1)"
 "U" "" "$" "(u|1)"
     
 "ou" "" "" "u"
 "o" "" "" "u"
 "O" "" "" "u"
+"P" "" "" "u" 
 "U" "" "" "u"
         
 "VV" "" "" "u" // alef/ayin + vov from ruleshebrew
@@ -91,8 +97,8 @@
     //"z" "" "" "(z|Z)"
     //"d" "" "" "(d|dZ)"
    
-"TB" "" "$" "(t|s)" // tav from ruleshebrew; only Ashkenazic
-"TB" "" "" "t" // tav from ruleshebrew; only Ashkenazic    
+"TB" "^" "" "t" // tav from ruleshebrew
+"TB" "" "" "(t|s)" // tav from ruleshebrew; s is only Ashkenazic
 "T" "" "" "t"   // tet from  ruleshebrew
     
    //"k" "" "" "(k|x)"

Added: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt?rev=1608115&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt (added)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt Sat Jul  5 19:58:38 2014
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// GENERIC
+
+// 1. following are rules to accept the language
+// 1.1 Special letter combinations
+^o’ english true
+^o' english true
+^mc english true
+^fitz english true
+ceau french+romanian true
+eau romanian true
+eau$ french true // mp: I've added this
+eaux$ french true // mp: I've added this
+ault$ french true
+oult$ french true
+eux$ french true
+eix$ french true
+glou$ greeklatin true
+uu dutch true
+tx spanish true
+witz german true
+tz$ german+russian+english true
+^tz russian+english true
+poulos$ greeklatin true
+pulos$ greeklatin true
+iou greeklatin true
+sj$ dutch true
+^sj dutch true
+güe spanish true
+güi spanish true
+ghe romanian+greeklatin true
+ghi romanian+greeklatin true
+escu$ romanian true
+esco$ romanian true
+vici$ romanian true
+schi$ romanian true
+ii$ russian true
+iy$ russian true
+yy$ russian true
+yi$ russian true
+^rz polish true
+rz$ polish+german true
+[bcdfgklmnpstwz]rz polish true
+rz[bcdfghklmnpstw] polish true
+cki$ polish true
+ska$ polish true
+cka$ polish true
+ae german+russian+english true
+oe german+french+russian+english+dutch true
+th$ german+english true
+^th german+english+greeklatin true
+mann german true
+cz polish true
+cy polish+greeklatin true
+niew polish true
+etti$ italian true
+eti$ italian true
+ati$ italian true
+ato$ italian true
+[aoei]no$ italian true
+[aoei]ni$ italian true
+esi$ italian true
+oli$ italian true
+field$ english true
+stein german true
+heim$ german true
+heimer$ german true
+thal german true
+zweig german true
+[aeou]h german true
+äh german true
+öh german true
+üh german true
+[ln]h[ao]$ portuguese true
+[ln]h[aou] portuguese+french+german+dutch+czech+spanish+turkish true
+chsch german true
+tsch german true
+sch$ german+russian true
+^sch german+russian true
+ck$ german+english true
+c$ polish+romanian+hungarian+czech+turkish true
+sz polish+hungarian true
+cs$ hungarian true
+^cs hungarian true
+dzs hungarian true
+zs$ hungarian true
+^zs hungarian true
+^wl polish true
+^wr polish+english+german+dutch true
+
+gy$ hungarian true
+gy[aeou] hungarian true
+gy hungarian+russian+french+greeklatin true
+guy french true
+gu[ei] spanish+french+portuguese true
+gu[ao] spanish+portuguese true
+gi[aou] italian+greeklatin true
+        
+ly hungarian+russian+polish+greeklatin true
+ny hungarian+russian+polish+spanish+greeklatin true
+ty hungarian+russian+polish+greeklatin true 
+
+// 1.2 special characters    
+ć polish true
+ç french+spanish+portuguese+turkish true
+č czech true
+ď czech true
+ğ turkish true
+ł polish true
+ń polish true
+ñ spanish true
+ň czech true
+ř czech true
+ś polish true
+ş romanian+turkish true
+š czech true
+ţ romanian true
+ť czech true
+ź polish true
+ż polish true
+        
+ß german true
+
+ä german true
+á hungarian+spanish+portuguese+czech+greeklatin true
+â romanian+french+portuguese true
+ă romanian true
+ą polish true
+à portuguese true
+ã portuguese true
+ę polish true
+é french+hungarian+czech+greeklatin true
+è french+spanish+italian true
+ê french true
+ě czech true
+ê french+portuguese true
+í hungarian+spanish+portuguese+czech+greeklatin true
+î romanian+french true
+ı turkish true
+ó polish+hungarian+spanish+italian+portuguese+czech+greeklatin true
+ö german+hungarian+turkish true
+ô french+portuguese true
+õ portuguese+hungarian true
+ò italian+spanish true
+ű hungarian true
+ú hungarian+spanish+portuguese+czech+greeklatin true
+ü german+hungarian+spanish+portuguese+turkish true
+ù french true
+ů czech true
+ý czech+greeklatin true
+   
+// Every Cyrillic word has at least one Cyrillic vowel (аёеоиуыэюя) 
+а cyrillic true 
+ё cyrillic true 
+о cyrillic true 
+е cyrillic true 
+и cyrillic true 
+у cyrillic true 
+ы cyrillic true 
+э cyrillic true 
+ю cyrillic true 
+я cyrillic true 
+ 
+// Every Greek word has at least one Greek vowel
+α greek true 
+ε greek true 
+η greek true 
+ι greek true 
+ο greek true 
+υ greek true 
+ω greek true 
+
+// Arabic (only initial)
+ا arabic true // alif (isol + init)   
+ب arabic true // ba' 
+ت arabic true // ta' 
+ث arabic true // tha'
+ج arabic true // jim
+ح arabic true // h.a' 
+خ' arabic true // kha' 
+د arabic true // dal (isol + init)
+ذ arabic true // dhal (isol + init)
+ر arabic true // ra' (isol + init)
+ز arabic true // za' (isol + init)
+س arabic true // sin 
+ش arabic true // shin 
+ص arabic true // s.ad 
+ض arabic true // d.ad 
+ط arabic true // t.a' 
+ظ arabic true // z.a' 
+ع arabic true // 'ayn
+غ arabic true // ghayn 
+ف arabic true // fa' 
+ق arabic true // qaf 
+ك arabic true // kaf  
+ل arabic true // lam 
+م arabic true // mim 
+ن arabic true // nun 
+ه arabic true // ha' 
+و arabic true // waw (isol + init)
+ي arabic true // ya' 
+    
+آ arabic true // alif madda  
+إ arabic true // alif + diacritic  
+أ arabic true // alif + hamza
+ؤ arabic true //  waw + hamza
+ئ arabic true //  ya' + hamza
+لا arabic true // ligature l+a
+                
+// Hebrew 
+א hebrew true
+ב hebrew true
+ג hebrew true
+ד hebrew true
+ה hebrew true
+ו hebrew true
+ז hebrew true
+ח hebrew true
+ט hebrew true
+י hebrew true
+כ hebrew true
+ל hebrew true
+מ hebrew true
+נ hebrew true
+ס hebrew true
+ע hebrew true
+פ hebrew true
+צ hebrew true 
+ק hebrew true
+ר hebrew true
+ש hebrew true
+ת hebrew true
+      
+// 2. following are rules to reject the language
+   
+// Every Latin character word has at least one Latin vowel  
+a cyrillic+hebrew+greek+arabic false 
+o cyrillic+hebrew+greek+arabic false 
+e cyrillic+hebrew+greek+arabic false 
+i cyrillic+hebrew+greek+arabic false 
+y cyrillic+hebrew+greek+arabic+romanian+dutch false 
+u cyrillic+hebrew+greek+arabic false 
+  
+j italian false
+j[^aoeiuy] french+spanish+portuguese+greeklatin false 
+g czech false
+k romanian+spanish+portuguese+french+italian false
+q hungarian+polish+russian+romanian+czech+dutch+turkish+greeklatin false
+v polish false
+w french+romanian+spanish+hungarian+russian+czech+turkish+greeklatin false
+x czech+hungarian+dutch+turkish false // polish excluded from the list
+    
+dj spanish+turkish false
+v[^aoeiu] german false // in german, "v" can be found before a vowel only
+y[^aoeiu] german false  // in german, "y" usually appears only in the last position; sometimes before a vowel
+c[^aohk] german false
+dzi german+english+french+turkish false
+ou german false
+a[eiou] turkish false // no diphthongs in Turkish
+ö[eaiou] turkish false 
+ü[eaiou] turkish false 
+e[aiou] turkish false 
+i[aeou] turkish false 
+o[aieu] turkish false 
+u[aieo] turkish false 
+aj german+english+french+dutch false
+ej german+english+french+dutch false
+oj german+english+french+dutch false
+uj german+english+french+dutch false
+eu russian+polish false
+ky polish false
+kie french+spanish+greeklatin false
+gie portuguese+romanian+spanish+greeklatin false
+ch[aou] italian false
+ch turkish false
+son$ german false
+sc[ei] french false
+sch hungarian+polish+french+spanish false
+^h russian false

Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt
------------------------------------------------------------------------------
    svn:keywords = Id Revision HeadURL

Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_lang.txt
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_any.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_any.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_any.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_any.txt Sat Jul  5 19:58:38 2014
@@ -81,7 +81,7 @@
 "ck" "" "" "(k|tsk[polish+czech])"
 "cz" "" "" "(tS|tsz[czech])" // Polish
    
-    //Proceccing of "h" in various combinations         
+    //Processing of "h" in various combinations         
 "rh" "^" "" "r"
 "dh" "^" "" "d"
 "bh" "^" "" "b"
@@ -124,7 +124,7 @@
           
 "ouh" "" "[aioe]" "(v[french]|uh)"
 "uh" "" "[aioe]" "(v|uh)"
-"h" "." "$" "" // match h at the end of words, but not as a single letter
+"h" "." "$" "" // match h at the end of words, but not as a single letter: difference to the original version
 "h" "[aeiouyäöü]" "" ""  // german
 "h" "^" "" "(h|x[romanian+greeklatin]|H[english+romanian+polish+french+portuguese+italian+spanish])" 
          
@@ -288,7 +288,7 @@
 // LANGUAGE SPECIFIC CHARACTERS 
 "ą" "" "[bp]" "om" // polish
 "ą" "" "" "on"  // polish
-"ä" "" "" "Y" 
+"ä" "" "" "(Y|e)" 
 "á" "" "" "a" // Port & Sp
 "à" "" "" "a" 
 "â" "" "" "a" 

Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_arabic.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_arabic.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_arabic.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_arabic.txt Sat Jul  5 19:58:38 2014
@@ -15,60 +15,62 @@
  * limitations under the License.
  */
 
-
-"ا" "" "" "a" // alif isol & init 
-                
+// General
+"ا" "" "" "a" // alif isol & init
+"ب" "" "$" "b" 
 "ب" "" "" "b1" // ba' isol
-        
+"ت" "" "$" "t" 
 "ت" "" "" "t1" // ta' isol
-        
+"ث" "" "$" "t" 
 "ث" "" "" "t1" // tha' isol
-
+"ج" "" "$" "(dZ|Z)" 
 "ج" "" "" "(dZ1|Z1)" // jim isol
-        
+"ح" "^" "" "1" 
+"ح" "" "$" "1" 
 "ح" "" "" "(h1|1)" // h.a' isol
-    
+"خ" "" "$" "x" 
 "خ" "" "" "x1" // kha' isol
-    
+"د" "" "$" "d" 
 "د" "" "" "d1" // dal isol & init
-           
+"ذ" "" "$" "d" 
 "ذ" "" "" "d1" // dhal isol & init
-        
-"ر" "" "" "r1" // dhal isol & init
-    
+"ر" "" "$" "r" 
+"ر" "" "" "r1" // ra' isol & init
+"ز" "" "$" "z" 
 "ز" "" "" "z1" // za' isol & init
-        
+"س" "" "$" "s" 
 "س" "" "" "s1" // sin isol
-    
+"ش" "" "$" "S" 
 "ش" "" "" "S1" // shin isol
-    
+"ص" "" "$" "s" 
 "ص" "" "" "s1" // s.ad isol
-    
+"ض" "" "$" "d" 
 "ض" "" "" "d1" // d.ad isol
-        
+"ط" "" "$" "t" 
 "ط" "" "" "t1" // t.a' isol
-        
+"ظ" "" "$" "z" 
 "ظ" "" "" "z1" // z.a' isol
-        
-"ع" "" "" "(h1|1)" // ayin isol 
-    
+"ع" "^" "" "1" 
+"ع" "" "$" "1" 
+"ع" "" "" "(h1|1)" // ayin isol
+"غ" "" "$" "g" 
 "غ" "" "" "g1" // ghayin isol
-    
+"ف" "" "$" "f" 
 "ف" "" "" "f1" // fa' isol
-    
+"ق" "" "$" "k" 
 "ق" "" "" "k1" // qaf isol
-    
+"ك" "" "$" "k" 
 "ك" "" "" "k1" // kaf isol
-    
+"ل" "" "$" "l" 
 "ل" "" "" "l1" // lam isol
-    
+"م" "" "$" "m" 
 "م" "" "" "m1" // mim isol
-    
+"ن" "" "$" "n" 
 "ن" "" "" "n1" // nun isol
-    
+"ه" "^" "" "1" 
+"ه" "" "$" "1" 
 "ه" "" "" "(h1|1)" // h isol
-        
+"و" "" "$" "(u|v)" 
 "و" "" "" "(u|v1)" // waw, isol + init
-               
-    
+"ي‎" "" "$" "(i|j)" 
 "ي‎" "" "" "(i|j1)" // ya' isol

Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_german.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_german.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_german.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_rules_german.txt Sat Jul  5 19:58:38 2014
@@ -82,7 +82,7 @@
 "ae" "" "" "Y"
 "oe" "" "" "Y"
 "ü" "" "" "Q"
-"ä" "" "" "Y"
+"ä" "" "" "(Y|e)"
 "ö" "" "" "Y"
 "ei" "" "" "(aj|ej)"
 "ey" "" "" "(aj|ej)"

Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/lang.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/lang.txt?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/lang.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/lang.txt Sat Jul  5 19:58:38 2014
@@ -1,293 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// 1. following are rules to accept the language
-// 1.1 Special letter combinations
-^o’ english true
-^o' english true
-^mc english true
-^fitz english true
-ceau french+romanian true
-eau$ french true // mp: I've added this
-eaux$ french true // mp: I've added this
-ault$ french true
-oult$ french true
-eux$ french true
-eix$ french true
-glou$ greeklatin true
-uu dutch true
-tx spanish true
-witz german true
-tz$ german+russian+english true
-^tz russian+english true
-poulos$ greeklatin true
-pulos$ greeklatin true
-iou greeklatin true
-sj$ dutch true
-^sj dutch true
-güe spanish true
-güi spanish true
-ghe romanian+greeklatin true
-ghi romanian+greeklatin true
-escu$ romanian true
-esco$ romanian true
-vici$ romanian true
-schi$ romanian true
-ii$ russian true
-iy$ russian true
-yy$ russian true
-yi$ russian true
-^rz polish true
-rz$ polish+german true
-[bcdfgklmnpstwz]rz polish true
-rz[bcdfghklmnpstw] polish true
-etti$ italian true
-eti$ italian true
-ati$ italian true
-ato$ italian true
-[aoei]no$ italian true
-[aoei]ni$ italian true
-esi$ italian true
-oli$ italian true
-field$ english true
-cki$ polish true
-ska$ polish true
-cka$ polish true
-ae german+russian+english true
-oe german+french+russian+english+dutch true
-th$ german+english true
-^th german+english+greeklatin true
-mann german true
-cz polish true
-cy polish+greeklatin true
-niew polish true
-stein german true
-heim$ german true
-heimer$ german true
-thal german true
-zweig german true
-[aeou]h german true
-äh german true
-öh german true
-üh german true
-[ln]h[ao]$ portuguese true
-[ln]h[aou] portuguese+french+german+dutch+czech+spanish+turkish true
-chsch german true
-tsch german true
-sch$ german+russian true
-^sch german+russian true
-ck$ german+english true
-c$ polish+romanian+hungarian+czech+turkish true
-sz polish+hungarian true
-cs$ hungarian true
-^cs hungarian true
-dzs hungarian true
-zs$ hungarian true
-^zs hungarian true
-^wl polish true
-^wr polish+english+german+dutch true
-
-gy$ hungarian true
-gy[aeou] hungarian true
-gy hungarian+russian+french+greeklatin true
-guy french true
-gu[ei] spanish+french+portuguese true
-gu[ao] spanish+portuguese true
-gi[aou] italian+greeklatin true
-
-ly hungarian+russian+polish+greeklatin true
-ny hungarian+russian+polish+spanish+greeklatin true
-ty hungarian+russian+polish+greeklatin true
-
-// 1.2 special characters
-ć polish true
-ç french+spanish+portuguese+turkish true
-č czech true
-ď czech true
-ğ turkish true
-ł polish true
-ń polish true
-ñ spanish true
-ň czech true
-ř czech true
-ś polish true
-ş romanian+turkish true
-š czech true
-ţ romanian true
-ť czech true
-ź polish true
-ż polish true
-
-ß german true
-
-ä german true
-á hungarian+spanish+portuguese+czech+greeklatin true
-â romanian+french+portuguese true
-ă romanian true
-ą polish true
-à portuguese true
-ã portuguese true
-ę polish true
-é french+hungarian+czech+greeklatin true
-è french+spanish+italian true
-ê french true
-ě czech true
-ê french+portuguese true
-í hungarian+spanish+portuguese+czech+greeklatin true
-î romanian+french true
-ı turkish true
-ó polish+hungarian+spanish+italian+portuguese+czech+greeklatin true
-ö german+hungarian+turkish true
-ô french+portuguese true
-õ portuguese+hungarian true
-ò italian+spanish true
-ű hungarian true
-ú hungarian+spanish+portuguese+czech+greeklatin true
-ü german+hungarian+spanish+portuguese+turkish true
-ù french true
-ů czech true
-ý czech+greeklatin true
-
-// Every Cyrillic word has at least one Cyrillic vowel (аёеоиуыэюя)
-а cyrillic true
-ё cyrillic true
-о cyrillic true
-е cyrillic true
-и cyrillic true
-у cyrillic true
-ы cyrillic true
-э cyrillic true
-ю cyrillic true
-я cyrillic true
-
-// Every Greek word has at least one Greek vowel
-α greek true
-ε greek true
-η greek true
-ι greek true
-ο greek true
-υ greek true
-ω greek true
-
-// Arabic (only initial)
-ا arabic true // alif (isol + init)   
-ب arabic true // ba' 
-ت arabic true // ta' 
-ث arabic true // tha'
-ج arabic true // jim
-ح arabic true // h.a' 
-خ' arabic true // kha' 
-د arabic true // dal (isol + init)
-ذ arabic true // dhal (isol + init)
-ر arabic true // ra' (isol + init)
-ز arabic true // za' (isol + init)
-س arabic true // sin 
-ش arabic true // shin 
-ص arabic true // s.ad 
-ض arabic true // d.ad 
-ط arabic true // t.a' 
-ظ arabic true // z.a' 
-ع arabic true // 'ayn
-غ arabic true // ghayn 
-ف arabic true // fa' 
-ق arabic true // qaf 
-ك arabic true // kaf  
-ل arabic true // lam 
-م arabic true // mim 
-ن arabic true // nun 
-ه arabic true // ha' 
-و arabic true // waw (isol + init)
-ي arabic true // ya' 
-    
-آ arabic true // alif madda  
-إ arabic true // alif + diacritic  
-أ arabic true // alif + hamza
-ؤ arabic true //  waw + hamza
-ئ arabic true //  ya' + hamza
-
-
-// Hebrew
-א hebrew true
-ב hebrew true
-ג hebrew true
-ד hebrew true
-ה hebrew true
-ו hebrew true
-ז hebrew true
-ח hebrew true
-ט hebrew true
-י hebrew true
-כ hebrew true
-ל hebrew true
-מ hebrew true
-נ hebrew true
-ס hebrew true
-ע hebrew true
-פ hebrew true
-צ hebrew true
-ק hebrew true
-ר hebrew true
-ש hebrew true
-ת hebrew true
-
-// 2. following are rules to reject the language
-
-// Every Latin character word has at least one Latin vowel
-a cyrillic+hebrew+greek+arabic false
-o cyrillic+hebrew+greek+arabic false
-e cyrillic+hebrew+greek+arabic false
-i cyrillic+hebrew+greek+arabic false
-y cyrillic+hebrew+greek+arabic+romanian+dutch false
-u cyrillic+hebrew+greek+arabic false
-
-j italian false
-j[^aoeiuy] french+spanish+portuguese+greeklatin false
-g czech false
-k romanian+spanish+portuguese+french+italian false
-q hungarian+polish+russian+romanian+czech+dutch+turkish+greeklatin false
-v polish false
-w french+romanian+spanish+hungarian+russian+czech+turkish+greeklatin false
-x czech+hungarian+dutch+turkish false // polish excluded from the list
-
-dj spanish+turkish false
-v[^aoeiu] german false // in german, "v" can be found before a vowel only
-y[^aoeiu] german false  // in german, "y" usually appears only in the last position; sometimes before a vowel
-c[^aohk] german false
-dzi german+english+french+turkish false
-ou german false
-a[eiou] turkish false // no diphthongs in Turkish
-ö[eaio] turkish false
-ü[eaio] turkish false
-e[aiou] turkish false
-i[aeou] turkish false
-o[aieu] turkish false
-u[aieo] turkish false
-aj german+english+french+dutch false
-ej german+english+french+dutch false
-oj german+english+french+dutch false
-uj german+english+french+dutch false
-eu russian+polish false
-ky polish false
-kie french+spanish+greeklatin false
-gie portuguese+romanian+spanish+greeklatin false
-ch[aou] italian false
-ch turkish false
-son$ german false
-sc[ei] french false
-sch hungarian+polish+french+spanish false
-^h russian false
-etti$ greeklatin false

Added: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt?rev=1608115&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt (added)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt Sat Jul  5 19:58:38 2014
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// SEPHARDIC
+
+// 1. following are rules to accept the language
+// 1.1 Special letter combinations
+eau french true
+ou french true
+gni italian+french true
+tx spanish true
+tj spanish true
+gy french true
+guy french true
+
+sh spanish+portuguese true // English, but no sign for /sh/ in these languages
+
+lh portuguese true
+nh portuguese true
+ny spanish true
+
+gue spanish+french true
+gui spanish+french true
+gia italian true
+gie italian true
+gio italian true
+giu italian true
+            
+// 1.2 special characters    
+ñ spanish true
+â portuguese+french true
+á portuguese+spanish true
+à portuguese true
+ã portuguese true
+ê french+portuguese true
+í portuguese+spanish true
+î french true
+ô french+portuguese true
+õ portuguese true
+ò italian+spanish true
+ú portuguese+spanish true
+ù french true
+ü portuguese+spanish true
+      
+// Hebrew 
+א hebrew true
+ב hebrew true
+ג hebrew true
+ד hebrew true
+ה hebrew true
+ו hebrew true
+ז hebrew true
+ח hebrew true
+ט hebrew true
+י hebrew true
+כ hebrew true
+ל hebrew true
+מ hebrew true
+נ hebrew true
+ס hebrew true
+ע hebrew true
+פ hebrew true
+צ hebrew true 
+ק hebrew true
+ר hebrew true
+ש hebrew true
+ת hebrew true
+        
+// 2. following are rules to reject the language
+    
+// Every Latin character word has at least one Latin vowel  
+a hebrew false 
+o hebrew false 
+e hebrew false 
+i hebrew false 
+y hebrew false 
+u hebrew false 
+      
+kh spanish false
+gua italian false
+guo italian false
+ç italian false
+cha italian false
+cho italian false
+chu italian false
+j italian false
+dj spanish false
+sce french false
+sci french false
+ó french false
+è portuguese false

Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt
------------------------------------------------------------------------------
    svn:keywords = Id Revision HeadURL

Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/sep_lang.txt
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java (original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java Sat Jul  5 19:58:38 2014
@@ -17,7 +17,7 @@
 
 package org.apache.commons.codec.language.bm;
 
-import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.*;
 
 import java.util.Arrays;
 import java.util.HashSet;
@@ -185,11 +185,16 @@ public class PhoneticEngineRegressionTes
         Map<String, String> args = new TreeMap<String, String>();
         args.put("nameType", "GENERIC");
         args.put("ruleType", "APPROX");
+
         assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abran|abrom|abron|avram|avrom|obram|obran|obrom|obron|ovram|ovrom");
+        assertEquals(encode(args, true, "Bendzin"), "bndzn|bntsn|bnzn|vndzn|vntsn");
 
         args.put("nameType", "ASHKENAZI");
         args.put("ruleType", "APPROX");
+
         assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abrom|avram|avrom|imbram|imbrom|obram|obrom|ombram|ombrom|ovram|ovrom");
+        assertEquals(encode(args, true, "Halpern"), "YlpYrn|Ylpirn|alpYrn|alpirn|olpYrn|olpirn|xalpirn|xolpirn");
+
     }
 
     /**

Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java?rev=1608115&r1=1608114&r2=1608115&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java (original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java Sat Jul  5 19:58:38 2014
@@ -17,8 +17,7 @@
 
 package org.apache.commons.codec.language.bm;
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.*;
 
 import java.util.Arrays;
 import java.util.List;
@@ -41,8 +40,8 @@ public class PhoneticEngineTest {
     public static List<Object[]> data() {
         return Arrays
                 .asList(new Object[] { "Renault", "rinD|rinDlt|rina|rinalt|rino|rinolt|rinu|rinult", NameType.GENERIC, RuleType.APPROX, Boolean.TRUE, TEN },
-                        new Object[] { "Renault", "rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult", NameType.ASHKENAZI, RuleType.APPROX, Boolean.TRUE, TEN },
-                        new Object[] { "Renault", "rYnDlt", NameType.ASHKENAZI, RuleType.APPROX, Boolean.TRUE, Integer.valueOf(1) },
+                        new Object[] { "Renault", "rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinolt|rinult", NameType.ASHKENAZI, RuleType.APPROX, Boolean.TRUE, TEN },
+                        new Object[] { "Renault", "rinDlt", NameType.ASHKENAZI, RuleType.APPROX, Boolean.TRUE, Integer.valueOf(1) },
                         new Object[] { "Renault", "rinDlt", NameType.SEPHARDIC, RuleType.APPROX, Boolean.TRUE, TEN },
                         new Object[] { "SntJohn-Smith", "sntjonsmit", NameType.GENERIC, RuleType.EXACT, Boolean.TRUE, TEN },
                         new Object[] { "d'ortley", "(ortlaj|ortlej)-(dortlaj|dortlej)", NameType.GENERIC, RuleType.EXACT, Boolean.TRUE, TEN },