You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/06/27 20:33:54 UTC
[09/15] lucenenet git commit: Added Lucene.Net.Analysis.Phonetic +
tests. Rather than porting over the entire commons-codec library,
only the language features were ported and added to this library.
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_french.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_french.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_french.txt
new file mode 100644
index 0000000..de636f8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_french.txt
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Sephardic
+
+// CONSONANTS
+"kh" "" "" "x" // foreign
+"ph" "" "" "f"
+
+"ç" "" "" "s"
+"x" "" "" "ks"
+"ch" "" "" "S"
+"c" "" "[eiyéèê]" "s"
+"c" "" "" "k"
+"gn" "" "" "(n|gn)"
+"g" "" "[eiy]" "Z"
+"gue" "" "$" "k"
+"gu" "" "[eiy]" "g"
+//"aill" "" "e" "aj" // non Jewish
+//"ll" "" "e" "(l|j)" // non Jewish
+"que" "" "$" "k"
+"qu" "" "" "k"
+"q" "" "" "k"
+"s" "[aeiouyéèê]" "[aeiouyéèê]" "z"
+"h" "[bdgt]" "" "" // translit from Arabic
+"h" "" "$" "" // foreign
+"j" "" "" "Z"
+"w" "" "" "v"
+"ouh" "" "[aioe]" "(v|uh)"
+"ou" "" "[aeio]" "v"
+"uo" "" "" "(vo|o)"
+"u" "" "[aeio]" "v"
+
+// VOWELS
+"aue" "" "" "aue"
+"eau" "" "" "o"
+//"au" "" "" "(o|au)" // non Jewish
+"ai" "" "" "aj" // [e] is non Jewish
+"ay" "" "" "aj" // [e] is non Jewish
+"é" "" "" "e"
+"ê" "" "" "e"
+"è" "" "" "e"
+"à" "" "" "a"
+"â" "" "" "a"
+"où" "" "" "u"
+"ou" "" "" "u"
+"oi" "" "" "oj" // [ua] is non Jewish
+"ei" "" "" "ej" // [e] is non Jewish, in Ashk should be aj
+"ey" "" "" "ej" // [e] non Jewish, in Ashk should be aj
+//"eu" "" "" "(e|o)" // non Jewish
+"y" "[ou]" "" "j"
+"e" "" "$" "(e|)"
+"i" "" "[aou]" "j"
+"y" "" "[aoeu]" "j"
+"y" "" "" "i"
+
+// TRIVIAL
+"a" "" "" "a"
+"b" "" "" "b"
+"d" "" "" "d"
+"e" "" "" "e"
+"f" "" "" "f"
+"g" "" "" "g"
+"h" "" "" "h"
+"i" "" "" "i"
+"k" "" "" "k"
+"l" "" "" "l"
+"m" "" "" "m"
+"n" "" "" "n"
+"o" "" "" "o"
+"p" "" "" "p"
+"r" "" "" "r"
+"s" "" "" "s"
+"t" "" "" "t"
+"u" "" "" "u"
+"v" "" "" "v"
+"z" "" "" "z"
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_hebrew.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_hebrew.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_hebrew.txt
new file mode 100644
index 0000000..91cf5ba
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_hebrew.txt
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Sephardic
+
+"אי" "" "" "i"
+"עי" "" "" "i"
+"עו" "" "" "VV"
+"או" "" "" "VV"
+
+"ג׳" "" "" "Z"
+"ד׳" "" "" "dZ"
+
+"א" "" "" "L"
+"ב" "" "" "b"
+"ג" "" "" "g"
+"ד" "" "" "d"
+
+"ה" "^" "" "1"
+"ה" "" "$" "1"
+"ה" "" "" ""
+
+"וו" "" "" "V"
+"וי" "" "" "WW"
+"ו" "" "" "W"
+"ז" "" "" "z"
+"ח" "" "" "X"
+"ט" "" "" "T"
+"יי" "" "" "i"
+"י" "" "" "i"
+"ך" "" "" "X"
+"כ" "^" "" "K"
+"כ" "" "" "k"
+"ל" "" "" "l"
+"ם" "" "" "m"
+"מ" "" "" "m"
+"ן" "" "" "n"
+"נ" "" "" "n"
+"ס" "" "" "s"
+"ע" "" "" "L"
+"ף" "" "" "f"
+"פ" "" "" "f"
+"ץ" "" "" "C"
+"צ" "" "" "C"
+"ק" "" "" "K"
+"ר" "" "" "r"
+"ש" "" "" "s"
+"ת" "" "" "T" // Special for Sephardim
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_italian.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_italian.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_italian.txt
new file mode 100644
index 0000000..76cf14b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_italian.txt
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+"kh" "" "" "x" // foreign
+
+"gli" "" "" "(l|gli)"
+"gn" "" "[aeou]" "(n|nj|gn)"
+"gni" "" "" "(ni|gni)"
+
+"gi" "" "[aeou]" "dZ"
+"gg" "" "[ei]" "dZ"
+"g" "" "[ei]" "dZ"
+"h" "[bdgt]" "" "g" // gh is It; others from Arabic translit
+
+"ci" "" "[aeou]" "tS"
+"ch" "" "[ei]" "k"
+"sc" "" "[ei]" "S"
+"cc" "" "[ei]" "tS"
+"c" "" "[ei]" "tS"
+"s" "[aeiou]" "[aeiou]" "z"
+
+"i" "[aeou]" "" "j"
+"i" "" "[aeou]" "j"
+"y" "[aeou]" "" "j" // foreign
+"y" "" "[aeou]" "j" // foreign
+
+"qu" "" "" "k"
+"uo" "" "" "(vo|o)"
+"u" "" "[aei]" "v"
+
+"è" "" "" "e"
+"é" "" "" "e"
+"ò" "" "" "o"
+"ó" "" "" "o"
+
+// LATIN ALPHABET
+"a" "" "" "a"
+"b" "" "" "b"
+"c" "" "" "k"
+"d" "" "" "d"
+"e" "" "" "e"
+"f" "" "" "f"
+"g" "" "" "g"
+"h" "" "" "h"
+"i" "" "" "i"
+"j" "" "" "(Z|dZ|j)" // foreign
+"k" "" "" "k"
+"l" "" "" "l"
+"m" "" "" "m"
+"n" "" "" "n"
+"o" "" "" "o"
+"p" "" "" "p"
+"q" "" "" "k"
+"r" "" "" "r"
+"s" "" "" "s"
+"t" "" "" "t"
+"u" "" "" "u"
+"v" "" "" "v"
+"w" "" "" "v" // foreign
+"x" "" "" "ks" // foreign
+"y" "" "" "i" // foreign
+"z" "" "" "(ts|dz)"
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_portuguese.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_portuguese.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_portuguese.txt
new file mode 100644
index 0000000..67cbd9b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_portuguese.txt
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+"kh" "" "" "x" // foreign
+"ch" "" "" "S"
+"ss" "" "" "s"
+"sc" "" "[ei]" "s"
+"sç" "" "[aou]" "s"
+"ç" "" "" "s"
+"c" "" "[ei]" "s"
+// "c" "" "[aou]" "(k|C)"
+
+"s" "^" "" "s"
+"s" "[aáuiíoóeéêy]" "[aáuiíoóeéêy]" "z"
+"s" "" "[dglmnrv]" "(Z|S)" // Z is Brazil
+
+"z" "" "$" "(Z|s|S)" // s and S in Brazil
+"z" "" "[bdgv]" "(Z|z)" // Z in Brazil
+"z" "" "[ptckf]" "(s|S|z)" // s and S in Brazil
+
+"gu" "" "[eiu]" "g"
+"gu" "" "[ao]" "gv"
+"g" "" "[ei]" "Z"
+"qu" "" "[eiu]" "k"
+"qu" "" "[ao]" "kv"
+
+"uo" "" "" "(vo|o|u)"
+"u" "" "[aei]" "v"
+
+"lh" "" "" "l"
+"nh" "" "" "nj"
+"h" "[bdgt]" "" "" // translit. from Arabic
+
+"ex" "" "[aáuiíoóeéêy]" "(ez|eS|eks)" // ez in Brazil
+"ex" "" "[cs]" "e"
+
+"y" "[aáuiíoóeéê]" "" "j"
+"y" "" "[aeiíou]" "j"
+"m" "" "[bcdfglnprstv]" "(m|n)" // maybe to add a rule for m/n before a consonant that disappears [preceding vowel becomes nasalized]
+"m" "" "$" "(m|n)" // maybe to add a rule for final m/n that disappears [preceding vowel becomes nasalized]
+
+"ão" "" "" "(au|an|on)"
+"ãe" "" "" "(aj|an)"
+"ãi" "" "" "(aj|an)"
+"õe" "" "" "(oj|on)"
+"i" "[aáuoóeéê]" "" "j"
+"i" "" "[aeou]" "j"
+
+"â" "" "" "a"
+"à" "" "" "a"
+"á" "" "" "a"
+"ã" "" "" "(a|an|on)"
+"é" "" "" "e"
+"ê" "" "" "e"
+"í" "" "" "i"
+"ô" "" "" "o"
+"ó" "" "" "o"
+"õ" "" "" "(o|on)"
+"ú" "" "" "u"
+"ü" "" "" "u"
+
+"aue" "" "" "aue"
+
+// LATIN ALPHABET
+"a" "" "" "a"
+"b" "" "" "b"
+"c" "" "" "k"
+"d" "" "" "d"
+"e" "" "" "(e|i)"
+"f" "" "" "f"
+"g" "" "" "g"
+"h" "" "" "h"
+"i" "" "" "i"
+"j" "" "" "Z"
+"k" "" "" "k"
+"l" "" "" "l"
+"m" "" "" "m"
+"n" "" "" "n"
+"o" "" "" "(o|u)"
+"p" "" "" "p"
+"q" "" "" "k"
+"r" "" "" "r"
+"s" "" "" "S"
+"t" "" "" "t"
+"u" "" "" "u"
+"v" "" "" "v"
+"w" "" "" "v"
+"x" "" "" "(S|ks)"
+"y" "" "" "i"
+"z" "" "" "z"
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_spanish.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_spanish.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_spanish.txt
new file mode 100644
index 0000000..b900e7e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_spanish.txt
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//Sephardic
+
+// Includes both Spanish (Castilian) & Catalan
+
+// CONSONANTS
+"ñ" "" "" "(n|nj)"
+"ny" "" "" "nj" // Catalan
+"ç" "" "" "s" // Catalan
+
+"ig" "[aeiou]" "" "(tS|ig)" // tS is Catalan
+"ix" "[aeiou]" "" "S" // Catalan
+"tx" "" "" "tS" // Catalan
+"tj" "" "$" "tS" // Catalan
+"tj" "" "" "dZ" // Catalan
+"tg" "" "" "(tg|dZ)" // dZ is Catalan
+"ch" "" "" "(tS|dZ)" // dZ is typical for Argentina
+"bh" "" "" "b" // translit. from Arabic
+"h" "[dgt]" "" "" // translit. from Arabic
+
+"j" "" "" "(x|Z)" // Z is Catalan
+"x" "" "" "(ks|gz|S)" // ks is Spanish, all are Catalan
+
+//"ll" "" "" "(l|Z)" // Z is typical for Argentina, only Ashkenazic
+"w" "" "" "v" // foreign words
+
+"v" "^" "" "(B|v)"
+"b" "^" "" "(b|V)"
+"v" "" "" "(b|v)"
+"b" "" "" "(b|v)"
+"m" "" "[bpvf]" "(m|n)"
+
+"c" "" "[ei]" "s"
+// "c" "" "[aou]" "(k|C)"
+"c" "" "" "k"
+
+"z" "" "" "(z|s)" // as "c" before "e" or "i", in Spain it is like unvoiced English "th"
+
+"gu" "" "[ei]" "(g|gv)" // "gv" because "u" can actually be "ü"
+"g" "" "[ei]" "(x|g|dZ)" // "g" only for foreign words; dZ is Catalan
+
+"qu" "" "" "k"
+"q" "" "" "k"
+
+"uo" "" "" "(vo|o)"
+"u" "" "[aei]" "v"
+
+// "y" "" "" "(i|j|S|Z)" // S or Z are peculiar to South America; only Ashkenazic
+"y" "" "" "(i|j)"
+
+// VOWELS
+"ü" "" "" "v"
+"á" "" "" "a"
+"é" "" "" "e"
+"í" "" "" "i"
+"ó" "" "" "o"
+"ú" "" "" "u"
+"à" "" "" "a" // Catalan
+"è" "" "" "e" // Catalan
+"ò" "" "" "o" // Catalan
+
+// TRIVIAL
+"a" "" "" "a"
+"d" "" "" "d"
+"e" "" "" "e"
+"f" "" "" "f"
+"g" "" "" "g"
+"h" "" "" "h"
+"i" "" "" "i"
+"k" "" "" "k"
+"l" "" "" "l"
+"m" "" "" "m"
+"n" "" "" "n"
+"o" "" "" "o"
+"p" "" "" "p"
+"r" "" "" "r"
+"s" "" "" "s"
+"t" "" "" "t"
+"u" "" "" "u"
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone1.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone1.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone1.cs
new file mode 100644
index 0000000..1abfcd1
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone1.cs
@@ -0,0 +1,131 @@
+// commons-codec version compatibility level: 1.9
+using System.Globalization;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Encodes a string into a Caverphone 1.0 value.
+ /// <para/>
+ /// Caverphone is an algorithm created by the Caversham Project at the University of Otago. This class implements
+ /// version 1.0 of it; every code it produces is exactly six characters long.
+ /// <para/>
+ /// See: <a href="http://en.wikipedia.org/wiki/Caverphone">Wikipedia - Caverphone</a>
+ /// <para/>
+ /// See: <a href="http://caversham.otago.ac.nz/files/working/ctp060902.pdf">Caverphone 1.0 specification</a>
+ /// <para/>
+ /// This class is immutable and thread-safe.
+ /// <para/>
+ /// since 1.5
+ /// </summary>
+ public class Caverphone1 : AbstractCaverphone
+ {
+     /// <summary>Padding appended before truncation; also the code returned for null or empty input.</summary>
+     private static readonly string SIX_1 = "111111";
+
+     /// <summary>
+     /// Encodes the given string into a Caverphone 1.0 value.
+     /// </summary>
+     /// <param name="source">The source string. May be <c>null</c> or empty, in which case <c>"111111"</c> is returned.</param>
+     /// <returns>A six-character Caverphone 1.0 code for the given string.</returns>
+     public override string Encode(string source)
+     {
+         string txt = source;
+         if (string.IsNullOrEmpty(txt))
+         {
+             return SIX_1;
+         }
+
+         // 1. Convert to lowercase
+         txt = txt.ToLowerInvariant(); // LUCENENET NOTE: This doesn't work right under "en" language, but does under invariant
+
+         // 2. Remove anything not a-z (the text is already lowercase at this point)
+         txt = Regex.Replace(txt, "[^a-z]", "");
+
+         // The rules below are order-dependent: each one operates on the output of the previous one.
+         // NOTE(review): this method uses far more distinct patterns than the default Regex.CacheSize (15)
+         // of the static Regex methods, so patterns are presumably re-parsed on every call; consider
+         // precompiled Regex instances if encoding ever shows up in profiles.
+
+         // 3. Handle various start options
+         // 2 is a temporary placeholder to indicate a consonant which we are no longer interested in.
+         txt = Regex.Replace(txt, "^cough", "cou2f");
+         txt = Regex.Replace(txt, "^rough", "rou2f");
+         txt = Regex.Replace(txt, "^tough", "tou2f");
+         txt = Regex.Replace(txt, "^enough", "enou2f");
+         txt = Regex.Replace(txt, "^gn", "2n");
+
+         // End
+         txt = Regex.Replace(txt, "mb$", "m2");
+
+         // 4. Handle replacements
+         txt = Regex.Replace(txt, "cq", "2q");
+         txt = Regex.Replace(txt, "ci", "si");
+         txt = Regex.Replace(txt, "ce", "se");
+         txt = Regex.Replace(txt, "cy", "sy");
+         txt = Regex.Replace(txt, "tch", "2ch");
+         txt = Regex.Replace(txt, "c", "k");
+         txt = Regex.Replace(txt, "q", "k");
+         txt = Regex.Replace(txt, "x", "k");
+         txt = Regex.Replace(txt, "v", "f");
+         txt = Regex.Replace(txt, "dg", "2g");
+         txt = Regex.Replace(txt, "tio", "sio");
+         txt = Regex.Replace(txt, "tia", "sia");
+         txt = Regex.Replace(txt, "d", "t");
+         txt = Regex.Replace(txt, "ph", "fh");
+         txt = Regex.Replace(txt, "b", "p");
+         txt = Regex.Replace(txt, "sh", "s2");
+         txt = Regex.Replace(txt, "z", "s");
+         txt = Regex.Replace(txt, "^[aeiou]", "A");
+         // 3 is a temporary placeholder marking a vowel
+         txt = Regex.Replace(txt, "[aeiou]", "3");
+         txt = Regex.Replace(txt, "3gh3", "3kh3");
+         txt = Regex.Replace(txt, "gh", "22");
+         txt = Regex.Replace(txt, "g", "k");
+         txt = Regex.Replace(txt, "s+", "S");
+         txt = Regex.Replace(txt, "t+", "T");
+         txt = Regex.Replace(txt, "p+", "P");
+         txt = Regex.Replace(txt, "k+", "K");
+         txt = Regex.Replace(txt, "f+", "F");
+         txt = Regex.Replace(txt, "m+", "M");
+         txt = Regex.Replace(txt, "n+", "N");
+         txt = Regex.Replace(txt, "w3", "W3");
+         txt = Regex.Replace(txt, "wy", "Wy"); // 1.0 only
+         txt = Regex.Replace(txt, "wh3", "Wh3");
+         txt = Regex.Replace(txt, "why", "Why"); // 1.0 only
+         txt = Regex.Replace(txt, "w", "2");
+         txt = Regex.Replace(txt, "^h", "A");
+         txt = Regex.Replace(txt, "h", "2");
+         txt = Regex.Replace(txt, "r3", "R3");
+         txt = Regex.Replace(txt, "ry", "Ry"); // 1.0 only
+         txt = Regex.Replace(txt, "r", "2");
+         txt = Regex.Replace(txt, "l3", "L3");
+         txt = Regex.Replace(txt, "ly", "Ly"); // 1.0 only
+         txt = Regex.Replace(txt, "l", "2");
+         txt = Regex.Replace(txt, "j", "y"); // 1.0 only
+         txt = Regex.Replace(txt, "y3", "Y3"); // 1.0 only
+         txt = Regex.Replace(txt, "y", "2"); // 1.0 only
+
+         // 5. Handle removals (drop the temporary placeholders 2 and 3)
+         txt = Regex.Replace(txt, "2", "");
+         txt = Regex.Replace(txt, "3", "");
+
+         // 6. put six 1s on the end, so that short results are padded to the full code length
+         txt = txt + SIX_1;
+
+         // 7. take the first six characters as the code
+         return txt.Substring(0, SIX_1.Length);
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone2.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone2.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone2.cs
new file mode 100644
index 0000000..cec7388
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone2.cs
@@ -0,0 +1,133 @@
+// commons-codec version compatibility level: 1.9
+using System.Globalization;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Phonetic encoder producing Caverphone 2.0 codes.
+ /// <para/>
+ /// Caverphone was created by the Caversham Project at the University of Otago; this class implements
+ /// revision 2.0 of the algorithm. Every code it returns is exactly ten characters long.
+ /// <para/>
+ /// See: <a href="http://en.wikipedia.org/wiki/Caverphone">Wikipedia - Caverphone</a>
+ /// <para/>
+ /// See: <a href="http://caversham.otago.ac.nz/files/working/ctp150804.pdf">Caverphone 2.0 specification</a>
+ /// <para/>
+ /// This class is immutable and thread-safe.
+ /// </summary>
+ public class Caverphone2 : AbstractCaverphone
+ {
+     private static readonly string TEN_1 = "1111111111";
+
+     // Rewrite rules as { regex pattern, replacement } pairs. They are applied strictly in this order,
+     // each to the output of the previous one. The digits 2 and 3 are temporary placeholders that are
+     // consumed again by the final rules of the table.
+     private static readonly string[][] REWRITE_RULES =
+     {
+         // 2. Remove anything not A-Z
+         new[] { "[^a-z]", "" },
+
+         // 2.5. Remove final e
+         new[] { "e$", "" }, // 2.0 only
+
+         // 3. Handle various start options
+         new[] { "^cough", "cou2f" },
+         new[] { "^rough", "rou2f" },
+         new[] { "^tough", "tou2f" },
+         new[] { "^enough", "enou2f" }, // 2.0 only
+         new[] { "^trough", "trou2f" }, // 2.0 only
+         // note the spec says ^enough here again, c+p error I assume
+         new[] { "^gn", "2n" },
+
+         // End
+         new[] { "mb$", "m2" },
+
+         // 4. Handle replacements
+         new[] { "cq", "2q" },
+         new[] { "ci", "si" },
+         new[] { "ce", "se" },
+         new[] { "cy", "sy" },
+         new[] { "tch", "2ch" },
+         new[] { "c", "k" },
+         new[] { "q", "k" },
+         new[] { "x", "k" },
+         new[] { "v", "f" },
+         new[] { "dg", "2g" },
+         new[] { "tio", "sio" },
+         new[] { "tia", "sia" },
+         new[] { "d", "t" },
+         new[] { "ph", "fh" },
+         new[] { "b", "p" },
+         new[] { "sh", "s2" },
+         new[] { "z", "s" },
+         new[] { "^[aeiou]", "A" },
+         new[] { "[aeiou]", "3" },
+         new[] { "j", "y" }, // 2.0 only
+         new[] { "^y3", "Y3" }, // 2.0 only
+         new[] { "^y", "A" }, // 2.0 only
+         new[] { "y", "3" }, // 2.0 only
+         new[] { "3gh3", "3kh3" },
+         new[] { "gh", "22" },
+         new[] { "g", "k" },
+         new[] { "s+", "S" },
+         new[] { "t+", "T" },
+         new[] { "p+", "P" },
+         new[] { "k+", "K" },
+         new[] { "f+", "F" },
+         new[] { "m+", "M" },
+         new[] { "n+", "N" },
+         new[] { "w3", "W3" },
+         new[] { "wh3", "Wh3" },
+         new[] { "w$", "3" }, // 2.0 only
+         new[] { "w", "2" },
+         new[] { "^h", "A" },
+         new[] { "h", "2" },
+         new[] { "r3", "R3" },
+         new[] { "r$", "3" }, // 2.0 only
+         new[] { "r", "2" },
+         new[] { "l3", "L3" },
+         new[] { "l$", "3" }, // 2.0 only
+         new[] { "l", "2" },
+
+         // 5. Handle removals
+         new[] { "2", "" },
+         new[] { "3$", "A" }, // 2.0 only
+         new[] { "3", "" },
+     };
+
+     /// <summary>
+     /// Computes the Caverphone 2.0 code of a string.
+     /// </summary>
+     /// <param name="source">The source string; <c>null</c> or empty yields ten 1s.</param>
+     /// <returns>The ten-character Caverphone 2.0 code for the given string.</returns>
+     public override string Encode(string source)
+     {
+         if (source == null || source.Length == 0)
+         {
+             return TEN_1;
+         }
+
+         // 1. Convert to lowercase
+         CultureInfo english = new CultureInfo("en");
+         string code = english.TextInfo.ToLower(source);
+
+         // 2. - 5. Apply the rewrite rules in table order
+         foreach (string[] rule in REWRITE_RULES)
+         {
+             code = Regex.Replace(code, rule[0], rule[1]);
+         }
+
+         // 6. put ten 1s on the end
+         string padded = code + TEN_1;
+
+         // 7. take the first ten characters as the code
+         return padded.Substring(0, TEN_1.Length);
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/ColognePhonetic.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/ColognePhonetic.cs b/src/Lucene.Net.Analysis.Phonetic/Language/ColognePhonetic.cs
new file mode 100644
index 0000000..a4824b3
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/ColognePhonetic.cs
@@ -0,0 +1,501 @@
+// commons-codec version compatibility level: 1.9
+using System.Globalization;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Encodes a string into a Cologne Phonetic value.
+ /// </summary>
+ /// <remarks>
+ /// Implements the <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Kölner Phonetik</a>
+ /// (Cologne Phonetic) algorithm issued by Hans Joachim Postel in 1969.
+ /// <para/>
+ /// The <i>Kölner Phonetik</i> is a phonetic algorithm which is optimized for the German language.
+ /// It is related to the well-known soundex algorithm.
+ /// <para/>
+ /// <h2>Algorithm</h2>
+ /// <list type="bullet">
+ /// <item>
+ /// <term>Step 1:</term>
+ /// <description>
+ /// After preprocessing (conversion to upper case, transcription of <a
+ /// href="http://en.wikipedia.org/wiki/Germanic_umlaut">germanic umlauts</a>, removal of non alphabetical characters) the
+ /// letters of the supplied text are replaced by their phonetic code according to the following table.
+ /// <list type="table">
+ /// <listheader>
+ /// <term>Letter</term>
+ /// <term>Context</term>
+ /// <term>Code</term>
+ /// </listheader>
+ /// <item>
+ /// <term>A, E, I, J, O, U, Y</term>
+ /// <term></term>
+ /// <term>0</term>
+ /// </item>
+ /// <item>
+ /// <term>H</term>
+ /// <term></term>
+ /// <term>-</term>
+ /// </item>
+ /// <item>
+ /// <term>B</term>
+ /// <term></term>
+ /// <term>1</term>
+ /// </item>
+ /// <item>
+ /// <term>P</term>
+ /// <term>not before H</term>
+ /// <term>1</term>
+ /// </item>
+ /// <item>
+ /// <term>D, T</term>
+ /// <term>not before C, S, Z</term>
+ /// <term>2</term>
+ /// </item>
+ /// <item>
+ /// <term>F, V, W</term>
+ /// <term></term>
+ /// <term>3</term>
+ /// </item>
+ /// <item>
+ /// <term>P</term>
+ /// <term>before H</term>
+ /// <term>3</term>
+ /// </item>
+ /// <item>
+ /// <term>G, K, Q</term>
+ /// <term></term>
+ /// <term>4</term>
+ /// </item>
+ /// <item>
+ /// <term>C</term>
+ /// <term>at onset before A, H, K, L, O, Q, R, U, X <para>OR</para>
+ /// before A, H, K, O, Q, U, X except after S, Z</term>
+ /// <term>4</term>
+ /// </item>
+ /// <item>
+ /// <term>X</term>
+ /// <term>not after C, K, Q</term>
+ /// <term>48</term>
+ /// </item>
+ /// <item>
+ /// <term>L</term>
+ /// <term></term>
+ /// <term>5</term>
+ /// </item>
+ /// <item>
+ /// <term>M, N</term>
+ /// <term></term>
+ /// <term>6</term>
+ /// </item>
+ /// <item>
+ /// <term>R</term>
+ /// <term></term>
+ /// <term>7</term>
+ /// </item>
+ /// <item>
+ /// <term>S, Z</term>
+ /// <term></term>
+ /// <term>8</term>
+ /// </item>
+ /// <item>
+ /// <term>C</term>
+ /// <term>after S, Z <para>OR</para>
+ /// at onset except before A, H, K, L, O, Q, R, U, X <para>OR</para>
+ /// not before A, H, K, O, Q, U, X
+ /// </term>
+ /// <term>8</term>
+ /// </item>
+ /// <item>
+ /// <term>D, T</term>
+ /// <term>before C, S, Z</term>
+ /// <term>8</term>
+ /// </item>
+ /// <item>
+ /// <term>X</term>
+ /// <term>after C, K, Q</term>
+ /// <term>8</term>
+ /// </item>
+ /// </list>
+ /// <para>
+ /// <small><i>(Source: <a href= "http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik#Buchstabencodes" >Wikipedia (de):
+ /// Kölner Phonetik -- Buchstabencodes</a>)</i></small>
+ /// </para>
+ /// <h4>Example:</h4>
+ /// <c>"Müller-Lüdenscheidt" => "MULLERLUDENSCHEIDT" => "6005507500206880022"</c>
+ /// </description>
+ /// </item>
+ /// <item>
+ /// <term>Step 2:</term>
+ /// <description>
+ /// Collapse of all multiple consecutive code digits.
+ /// <h4>Example:</h4>
+ /// <c>"6005507500206880022" => "6050750206802"</c>
+ /// </description>
+ /// </item>
+ /// <item>
+ /// <term>Step 3:</term>
+ /// <description>
+ /// Removal of all codes "0" except at the beginning. This means that two or more identical consecutive digits can occur
+ /// if they occur after removing the "0" digits.
+ /// <h4>Example:</h4>
+ /// <c>"6050750206802" => "65752682"</c>
+ /// </description>
+ /// </item>
+ /// </list>
+ /// <para/>
+ /// This class is thread-safe.
+ /// <para/>
+ /// See: <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Wikipedia (de): Kölner Phonetik (in German)</a>
+ /// <para/>
+ /// since 1.5
+ /// </remarks>
+ public class ColognePhonetic : IStringEncoder
+ {
+ // Predefined letter groups, allocated once for better performance and less GC load.
+ // Each array holds exactly the letters spelled out in its name, in that order.
+ private static readonly char[] AEIJOUY = "AEIJOUY".ToCharArray();
+ private static readonly char[] SCZ = "SCZ".ToCharArray();
+ private static readonly char[] WFPV = "WFPV".ToCharArray();
+ private static readonly char[] GKQ = "GKQ".ToCharArray();
+ private static readonly char[] CKQ = "CKQ".ToCharArray();
+ private static readonly char[] AHKLOQRUX = "AHKLOQRUX".ToCharArray();
+ private static readonly char[] SZ = "SZ".ToCharArray();
+ private static readonly char[] AHOUKQX = "AHOUKQX".ToCharArray();
+ private static readonly char[] TDX = "TDX".ToCharArray();
+
+ /// <summary>
+ /// Common base of the character buffers used while encoding: a backing array plus a count of the
+ /// characters currently in use. The count (<see cref="length"/>) is mutable and there is no
+ /// synchronization, so this class is not thread-safe.
+ /// NOTE(review): per the original author, instances are constructed on demand by
+ /// <see cref="ColognePhonetic.GetColognePhonetic(string)"/> and are not shared between threads.
+ /// </summary>
+ private abstract class CologneBuffer
+ {
+     /// <summary>Backing storage; which part of it holds the content is decided by the subclass.</summary>
+     protected readonly char[] data;
+
+     /// <summary>Number of characters currently held in the buffer.</summary>
+     protected int length = 0;
+
+     /// <summary>Wraps an existing array and treats every element of it as content.</summary>
+     public CologneBuffer(char[] data)
+     {
+         this.data = data;
+         length = data.Length;
+     }
+
+     /// <summary>Creates an empty buffer backed by a new array of <paramref name="buffSize"/> elements.</summary>
+     public CologneBuffer(int buffSize)
+     {
+         data = new char[buffSize];
+         length = 0;
+     }
+
+     /// <summary>
+     /// Copies <paramref name="length"/> characters of the buffer content, starting at content offset
+     /// <paramref name="start"/>, into a new array.
+     /// </summary>
+     protected abstract char[] CopyData(int start, int length);
+
+     /// <summary>Gets the number of characters currently in the buffer.</summary>
+     public virtual int Length
+     {
+         get { return this.length; }
+     }
+
+     /// <summary>Returns the current buffer content as a string.</summary>
+     public override string ToString()
+     {
+         char[] content = CopyData(0, length);
+         return new string(content);
+     }
+ }
+
+ /// <summary>
+ /// Append-only buffer: the content occupies the front of the backing array and grows to the right.
+ /// </summary>
+ private class CologneOutputBuffer : CologneBuffer
+ {
+     /// <summary>Creates an empty output buffer with room for <paramref name="buffSize"/> characters.</summary>
+     public CologneOutputBuffer(int buffSize)
+         : base(buffSize)
+     {
+     }
+
+     /// <summary>Appends <paramref name="chr"/> after the current content.</summary>
+     public void AddRight(char chr)
+     {
+         // Store first, then grow: the count only changes once the write has succeeded.
+         int slot = length;
+         data[slot] = chr;
+         length = slot + 1;
+     }
+
+     /// <summary>Copies <paramref name="length"/> characters starting at index <paramref name="start"/> of the backing array.</summary>
+     protected override char[] CopyData(int start, int length)
+     {
+         char[] copy = new char[length];
+         System.Array.Copy(data, start, copy, 0, length);
+         return copy;
+     }
+ }
+
+ /// <summary>
+ /// Buffer that is read from the left: the content is always the last <c>length</c> elements of the
+ /// backing array, so removing the next character just shrinks the count.
+ /// </summary>
+ private class CologneInputBuffer : CologneBuffer
+ {
+     /// <summary>Wraps <paramref name="data"/>; initially the whole array is content.</summary>
+     public CologneInputBuffer(char[] data)
+         : base(data)
+     {
+     }
+
+     /// <summary>Puts <paramref name="ch"/> in front of the current content, making it the next character to be read.</summary>
+     public virtual void AddLeft(char ch)
+     {
+         // Grow first so that GetNextPos() addresses the slot just before the old head.
+         length += 1;
+         int head = GetNextPos();
+         data[head] = ch;
+     }
+
+     /// <summary>Copies <paramref name="length"/> content characters, starting <paramref name="start"/> positions after the head.</summary>
+     protected override char[] CopyData(int start, int length)
+     {
+         // The parameter "length" is the number of characters to copy; "this.length" is the content size.
+         char[] copy = new char[length];
+         int contentStart = data.Length - this.length;
+         System.Array.Copy(data, contentStart + start, copy, 0, length);
+         return copy;
+     }
+
+     /// <summary>Returns the character at the head of the content without removing it.</summary>
+     public virtual char GetNextChar()
+     {
+         int head = GetNextPos();
+         return data[head];
+     }
+
+     /// <summary>Returns the index in the backing array of the head of the content.</summary>
+     protected virtual int GetNextPos()
+     {
+         return data.Length - length;
+     }
+
+     /// <summary>Removes and returns the character at the head of the content.</summary>
+     public virtual char RemoveNext()
+     {
+         char next = GetNextChar();
+         length -= 1;
+         return next;
+     }
+ }
+
+ /// <summary>
+ /// Maps some Germanic characters to plain letters for internal processing. Each entry is a
+ /// two-element array: the character to look for, followed by the character that replaces it.
+ /// The following characters are mapped:
+ /// <list type="bullet">
+ /// <item><description>capital a, umlaut mark</description></item>
+ /// <item><description>capital u, umlaut mark</description></item>
+ /// <item><description>capital o, umlaut mark</description></item>
+ /// <item><description>small sharp s, German</description></item>
+ /// </list>
+ /// </summary>
+ private static readonly char[][] PREPROCESS_MAP = {
+ new char[] {'\u00C4', 'A'}, // capital a, umlaut mark
+ new char[] {'\u00DC', 'U'}, // capital u, umlaut mark
+ new char[] {'\u00D6', 'O'}, // capital o, umlaut mark
+ new char[] {'\u00DF', 'S'} // small sharp s, German
+ };
+
+ /// <summary>
+ /// Performs a linear search for <paramref name="key"/> in <paramref name="arr"/>.
+ /// </summary>
+ /// <param name="arr">The characters to search.</param>
+ /// <param name="key">The character to look for.</param>
+ /// <returns><c>true</c> if some element of the array equals the key; otherwise <c>false</c>.</returns>
+ private static bool ArrayContains(char[] arr, char key)
+ {
+     for (int i = 0; i < arr.Length; i++)
+     {
+         if (arr[i] == key)
+         {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /// <summary>
+ /// <para>
+ /// Encodes a text with the <i>Kölner Phonetik</i> algorithm.
+ /// </para>
+ /// <para>
+ /// Unlike the initial description of the algorithm, this implementation produces the encoding in a single pass.
+ /// </para>
+ /// </summary>
+ /// <param name="text">The text to encode; may be <c>null</c>.</param>
+ /// <returns>
+ /// The encoding of <paramref name="text"/> according to the <i>Kölner Phonetik</i> algorithm,
+ /// or <c>null</c> if <paramref name="text"/> is <c>null</c>.
+ /// </returns>
+ public virtual string GetColognePhonetic(string text)
+ {
+     if (text == null)
+     {
+         return null;
+     }
+
+     string prepared = Preprocess(text);
+
+     // An X can yield two codes, so reserve two output slots per input character.
+     CologneOutputBuffer output = new CologneOutputBuffer(prepared.Length * 2);
+     CologneInputBuffer input = new CologneInputBuffer(prepared.ToCharArray());
+
+     char previousChar = '-';
+     char previousCode = '/'; // '/' means: no character has been given a code yet
+
+     int remaining = input.Length;
+     while (remaining > 0)
+     {
+         char current = input.RemoveNext();
+         remaining = input.Length;
+         char following = remaining > 0 ? input.GetNextChar() : '-';
+
+         char code;
+         if (ArrayContains(AEIJOUY, current))
+         {
+             code = '0';
+         }
+         else if (current == 'H' || current < 'A' || current > 'Z')
+         {
+             if (previousCode == '/')
+             {
+                 // skipped entirely: neither previousChar nor previousCode is updated
+                 continue;
+             }
+             code = '-';
+         }
+         else if (current == 'B' || (current == 'P' && following != 'H'))
+         {
+             code = '1';
+         }
+         else if ((current == 'D' || current == 'T') && !ArrayContains(SCZ, following))
+         {
+             code = '2';
+         }
+         else if (ArrayContains(WFPV, current))
+         {
+             code = '3';
+         }
+         else if (ArrayContains(GKQ, current))
+         {
+             code = '4';
+         }
+         else if (current == 'X' && !ArrayContains(CKQ, previousChar))
+         {
+             // X is treated as "KS": emit '4' now and push an 'S' back to be coded next
+             code = '4';
+             input.AddLeft('S');
+             remaining++;
+         }
+         else if (current == 'S' || current == 'Z')
+         {
+             code = '8';
+         }
+         else if (current == 'C')
+         {
+             if (previousCode == '/')
+             {
+                 code = ArrayContains(AHKLOQRUX, following) ? '4' : '8';
+             }
+             else
+             {
+                 bool codedAsEight = ArrayContains(SZ, previousChar) || !ArrayContains(AHOUKQX, following);
+                 code = codedAsEight ? '8' : '4';
+             }
+         }
+         else if (ArrayContains(TDX, current))
+         {
+             code = '8';
+         }
+         else if (current == 'R')
+         {
+             code = '7';
+         }
+         else if (current == 'L')
+         {
+             code = '5';
+         }
+         else if (current == 'M' || current == 'N')
+         {
+             code = '6';
+         }
+         else
+         {
+             code = current;
+         }
+
+         // '-' is never written. A code in '0'..'8' is written only when it differs from the
+         // previous code, and '0' additionally only while nothing has been coded yet.
+         // Anything outside '0'..'8' is always written.
+         bool differsFromPrevious = previousCode != code && (code != '0' || previousCode == '/');
+         bool outsideCodeRange = code < '0' || code > '8';
+         if (code != '-' && (differsFromPrevious || outsideCodeRange))
+         {
+             output.AddRight(code);
+         }
+
+         previousChar = current;
+         previousCode = code;
+     }
+     return output.ToString();
+ }
+
+ // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
+ //@Override
+ // public Object encode(final Object object) throws EncoderException
+ //{
+ // if (!(object instanceof String)) {
+ // throw new EncoderException("This method's parameter was expected to be of the type " +
+ // String.class.getName() +
+ // ". But actually it was of the type " +
+ // object.getClass().getName() +
+ // ".");
+ // }
+ // return encode((String) object);
+ // }
+
+
+ /// <summary>
+ /// Encodes <paramref name="text"/> by forwarding to <see cref="GetColognePhonetic(string)"/>.
+ /// </summary>
+ /// <param name="text">The text to encode.</param>
+ /// <returns>Whatever <see cref="GetColognePhonetic(string)"/> returns for <paramref name="text"/>.</returns>
+ public virtual string Encode(string text)
+ {
+     string code = GetColognePhonetic(text);
+     return code;
+ }
+
+ /// <summary>
+ /// Tells whether two texts produce the same <i>Kölner Phonetik</i> code.
+ /// </summary>
+ /// <param name="text1">The first text; may be <c>null</c>.</param>
+ /// <param name="text2">The second text; may be <c>null</c>.</param>
+ /// <returns>
+ /// <c>true</c> if both encodings are equal (two <c>null</c> inputs count as equal);
+ /// otherwise <c>false</c>.
+ /// </returns>
+ public virtual bool IsEncodeEqual(string text1, string text2)
+ {
+     // GetColognePhonetic returns null for null input, so calling an instance method on the
+     // first result would throw. The string == operator compares by value and accepts null.
+     string code1 = GetColognePhonetic(text1);
+     string code2 = GetColognePhonetic(text2);
+     return code1 == code2;
+ }
+
+ /// <summary>
+ /// Upper-cases the text using the German ("de") culture and then replaces the Germanic
+ /// characters listed in <see cref="PREPROCESS_MAP"/> with their plain counterparts.
+ /// </summary>
+ private string Preprocess(string text)
+ {
+     TextInfo german = new CultureInfo("de").TextInfo;
+     char[] chrs = german.ToUpper(text).ToCharArray();
+
+     for (int i = 0; i < chrs.Length; i++)
+     {
+         char current = chrs[i];
+         if (current <= 'Z')
+         {
+             // every character in PREPROCESS_MAP sorts above 'Z', so nothing to look up
+             continue;
+         }
+
+         foreach (char[] mapping in PREPROCESS_MAP)
+         {
+             if (mapping[0] == current)
+             {
+                 chrs[i] = mapping[1];
+                 break;
+             }
+         }
+     }
+     return new string(chrs);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/DaitchMokotoffSoundex.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/DaitchMokotoffSoundex.cs b/src/Lucene.Net.Analysis.Phonetic/Language/DaitchMokotoffSoundex.cs
new file mode 100644
index 0000000..e72bc38
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/DaitchMokotoffSoundex.cs
@@ -0,0 +1,620 @@
+// commons-codec version compatibility level: 1.10
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Reflection;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Encodes a string into a Daitch-Mokotoff Soundex value.
+ /// </summary>
+ /// <remarks>
+ /// The Daitch-Mokotoff Soundex algorithm is a refinement of the Russell and American Soundex algorithms, yielding greater
+ /// accuracy in matching especially Slavic and Yiddish surnames with similar pronunciation but differences in spelling.
+ /// <para/>
+ /// The main differences compared to the other soundex variants are:
+ /// <list type="bullet">
+ /// <item><description>coded names are 6 digits long</description></item>
+ /// <item><description>the initial character of the name is coded</description></item>
+ /// <item><description>rules to encode multi-character n-grams</description></item>
+ /// <item><description>multiple possible encodings for the same name (branching)</description></item>
+ /// </list>
+ /// <para/>
+ /// This implementation supports branching, depending on the used method:
+ /// <list type="bullet">
+ /// <item><term><see cref="Encode(string)"/></term><description>branching disabled, only the first code will be returned</description></item>
+ /// <item><term><see cref="GetSoundex(string)"/></term><description>branching enabled, all codes will be returned, separated by '|'</description></item>
+ /// </list>
+ /// <para/>
+ /// Note: this implementation has additional branching rules compared to the original description of the algorithm. The
+ /// rules can be customized by overriding the default rules contained in the resource file
+ /// <c>Lucene.Net.Analysis.Phonetic.Language.dmrules.txt</c>.
+ /// <para/>
+ /// This class is thread-safe.
+ /// <para/>
+ /// See: <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
+ /// <para/>
+ /// See: <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
+ /// <para/>
+ /// since 1.10
+ /// </remarks>
+ /// <seealso cref="Soundex"/>
+ public class DaitchMokotoffSoundex : IStringEncoder
+ {
+ /// <summary>
+ /// One candidate code that is being built up while a name is encoded. Two branches are
+ /// considered equal when the codes built so far are equal.
+ /// </summary>
+ private sealed class Branch
+ {
+     private readonly StringBuilder builder = new StringBuilder();
+     // Result of the last ToString() call; reset to null whenever the builder is modified.
+     private string cachedString;
+     // The replacement most recently passed to ProcessNextReplacement, appended or not.
+     private string lastReplacement;
+
+     internal Branch()
+     {
+     }
+
+     /// <summary>
+     /// Creates a new branch that carries the same code and the same last replacement as this one.
+     /// </summary>
+     /// <returns>A new, identical branch.</returns>
+     public Branch CreateBranch()
+     {
+         Branch copy = new Branch();
+         copy.builder.Append(ToString());
+         copy.lastReplacement = lastReplacement;
+         return copy;
+     }
+
+     public override bool Equals(object other)
+     {
+         Branch that = other as Branch;
+         if (that == null)
+         {
+             return false;
+         }
+         if (ReferenceEquals(this, that))
+         {
+             return true;
+         }
+         return ToString().Equals(that.ToString());
+     }
+
+     /// <summary>
+     /// Pads the code with '0' characters until it is <c>MAX_LENGTH</c> characters long.
+     /// </summary>
+     public void Finish()
+     {
+         int missing = MAX_LENGTH - builder.Length;
+         if (missing > 0)
+         {
+             builder.Append('0', missing);
+             cachedString = null;
+         }
+     }
+
+     public override int GetHashCode()
+     {
+         string code = ToString();
+         return code.GetHashCode();
+     }
+
+     /// <summary>
+     /// Handles the next replacement for this branch. It is appended unless it merely repeats the
+     /// end of the previous replacement; nothing is appended once the code is already full.
+     /// </summary>
+     /// <param name="replacement">The next replacement to append.</param>
+     /// <param name="forceAppend">If <c>true</c>, append even when the replacement repeats the previous one.</param>
+     public void ProcessNextReplacement(string replacement, bool forceAppend)
+     {
+         bool repeatsPrevious = lastReplacement != null
+             && lastReplacement.EndsWith(replacement, StringComparison.Ordinal);
+         bool shouldAppend = !repeatsPrevious || forceAppend;
+
+         if (shouldAppend && builder.Length < MAX_LENGTH)
+         {
+             builder.Append(replacement);
+             if (builder.Length > MAX_LENGTH)
+             {
+                 // cut off everything beyond the maximum code length
+                 builder.Length = MAX_LENGTH;
+             }
+             cachedString = null;
+         }
+
+         lastReplacement = replacement;
+     }
+
+     /// <summary>Returns the code built so far; the string is cached until the code changes.</summary>
+     public override string ToString()
+     {
+         return cachedString ?? (cachedString = builder.ToString());
+     }
+ }
+
+ /// <summary>
+ /// A single transformation rule: a pattern plus the replacement alternatives to use at the
+ /// start of the input, before a vowel, and in every other position.
+ /// </summary>
+ private sealed class Rule
+ {
+     private readonly string pattern;
+     private readonly string[] replacementAtStart;
+     private readonly string[] replacementBeforeVowel;
+     private readonly string[] replacementDefault;
+
+     /// <summary>
+     /// Creates a rule. Each replacement argument may hold several alternatives separated by '|'.
+     /// </summary>
+     internal Rule(string pattern, string replacementAtStart, string replacementBeforeVowel,
+         string replacementDefault)
+     {
+         this.pattern = pattern;
+         this.replacementAtStart = SplitAlternatives(replacementAtStart);
+         this.replacementBeforeVowel = SplitAlternatives(replacementBeforeVowel);
+         this.replacementDefault = SplitAlternatives(replacementDefault);
+     }
+
+     // Splits a replacement specification at every '|'.
+     private static string[] SplitAlternatives(string alternatives)
+     {
+         return Regex.Split(alternatives, "\\|");
+     }
+
+     // LUCENENET specific - need read access to pattern
+     public string Pattern
+     {
+         get { return this.pattern; }
+     }
+
+     public int PatternLength
+     {
+         get { return this.pattern.Length; }
+     }
+
+     /// <summary>
+     /// Selects the replacement alternatives that apply to <paramref name="context"/>.
+     /// </summary>
+     /// <param name="context">The remaining input, beginning with the text this rule matched.</param>
+     /// <param name="atStart">Whether the match is at the start of the input.</param>
+     public string[] GetReplacements(string context, bool atStart)
+     {
+         if (atStart)
+         {
+             return replacementAtStart;
+         }
+
+         // the character right behind the matched pattern decides between the remaining two sets
+         int followingIndex = pattern.Length;
+         bool vowelFollows = followingIndex < context.Length && IsVowel(context[followingIndex]);
+         return vowelFollows ? replacementBeforeVowel : replacementDefault;
+     }
+
+     private bool IsVowel(char ch)
+     {
+         switch (ch)
+         {
+             case 'a':
+             case 'e':
+             case 'i':
+             case 'o':
+             case 'u':
+                 return true;
+             default:
+                 return false;
+         }
+     }
+
+     /// <summary>Tells whether <paramref name="context"/> begins with this rule's pattern.</summary>
+     public bool Matches(string context)
+     {
+         return context.StartsWith(pattern, StringComparison.Ordinal);
+     }
+
+     public override string ToString()
+     {
+         string atStart = Collections.ToString(replacementAtStart);
+         string beforeVowel = Collections.ToString(replacementBeforeVowel);
+         string otherwise = Collections.ToString(replacementDefault);
+         return string.Format("{0}=({1},{2},{3})", pattern, atStart, beforeVowel, otherwise);
+     }
+ }
+
+ /// <summary>Marker after which the rest of a rules line is discarded as a comment.</summary>
+ private static readonly string COMMENT = "//";
+ /// <summary>Quote character stripped from the start and end of each rule field.</summary>
+ private static readonly string DOUBLE_QUOTE = "\"";
+
+ /// <summary>A rules line ending with this marker closes a multi-line comment.</summary>
+ private static readonly string MULTILINE_COMMENT_END = "*/";
+
+ /// <summary>A rules line starting with this marker opens a multi-line comment.</summary>
+ private static readonly string MULTILINE_COMMENT_START = "/*";
+
+ /// <summary>The resource file containing the replacement and folding rules</summary>
+ private static readonly string RESOURCE_FILE = "dmrules.txt";
+
+ /// <summary>The code length of a DM soundex value.</summary>
+ private static readonly int MAX_LENGTH = 6;
+
+ /// <summary>Transformation rules indexed by the first character of their pattern.</summary>
+ private static readonly IDictionary<char, IList<Rule>> RULES = new Dictionary<char, IList<Rule>>();
+
+ /// <summary>Folding rules: maps a character to the character that replaces it during cleanup.</summary>
+ private static readonly IDictionary<char, char> FOLDINGS = new Dictionary<char, char>();
+
+ /// <summary>
+ /// Orders rules by pattern length, longest pattern first.
+ /// </summary>
+ private class DaitchMokotoffRuleComparer : IComparer<Rule>
+ {
+     public int Compare(Rule rule1, Rule rule2)
+     {
+         int secondLength = rule2.PatternLength;
+         int firstLength = rule1.PatternLength;
+         // positive when rule2 has the longer pattern, so rule2 sorts ahead of rule1
+         return secondLength - firstLength;
+     }
+ }
+
+ /// <summary>
+ /// Loads the embedded rules resource into <c>RULES</c> and <c>FOLDINGS</c>, then sorts
+ /// every rule list so that rules with longer patterns come first.
+ /// </summary>
+ static DaitchMokotoffSoundex()
+ {
+     Type owner = typeof(DaitchMokotoffSoundex);
+     Stream rulesIS = owner.GetTypeInfo().Assembly.FindAndGetManifestResourceStream(owner, RESOURCE_FILE);
+     if (rulesIS == null)
+     {
+         throw new ArgumentException("Unable to load resource: " + RESOURCE_FILE);
+     }
+
+     using (TextReader scanner = new StreamReader(rulesIS, Encoding.UTF8))
+     {
+         ParseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
+     }
+
+     // sort RULES by pattern length in descending order
+     DaitchMokotoffRuleComparer longestFirst = new DaitchMokotoffRuleComparer();
+     foreach (KeyValuePair<char, IList<Rule>> entry in RULES)
+     {
+         entry.Value.Sort(longestFirst);
+     }
+ }
+
+ /// <summary>
+ /// Reads rule and folding definitions line by line from <paramref name="scanner"/>.
+ /// <para/>
+ /// Recognized input:
+ /// <list type="bullet">
+ /// <item><description>everything from <c>//</c> to the end of a line is ignored</description></item>
+ /// <item><description>a line starting with <c>/*</c> opens a comment that runs until a line ending with <c>*/</c></description></item>
+ /// <item><description>a line containing <c>=</c> is a folding of the form <c>x=y</c>, both sides a single character</description></item>
+ /// <item><description>any other non-empty line is a rule made of four whitespace-separated, optionally
+ /// double-quoted fields: pattern, replacement at start, replacement before vowel, default replacement</description></item>
+ /// </list>
+ /// </summary>
+ /// <param name="scanner">Reader positioned at the start of the rules text.</param>
+ /// <param name="location">Name of the source, used in error messages only.</param>
+ /// <param name="ruleMapping">Receives the rules, grouped by the first character of their pattern.</param>
+ /// <param name="asciiFoldings">Receives the foldings.</param>
+ /// <exception cref="ArgumentException">If a folding or rule line does not have the expected shape.</exception>
+ /// <exception cref="InvalidOperationException">If creating or storing a rule throws an <see cref="ArgumentException"/>.</exception>
+ private static void ParseRules(TextReader scanner, string location,
+     IDictionary<char, IList<Rule>> ruleMapping, IDictionary<char, char> asciiFoldings)
+ {
+     int currentLine = 0;
+     bool inMultilineComment = false;
+
+     string rawLine;
+     while ((rawLine = scanner.ReadLine()) != null)
+     {
+         currentLine++;
+         string line = rawLine;
+
+         if (inMultilineComment)
+         {
+             if (line.EndsWith(MULTILINE_COMMENT_END, StringComparison.Ordinal))
+             {
+                 inMultilineComment = false;
+             }
+             continue;
+         }
+
+         if (line.StartsWith(MULTILINE_COMMENT_START, StringComparison.Ordinal))
+         {
+             inMultilineComment = true;
+             continue;
+         }
+
+         // discard comments; ordinal search so the result does not depend on the current culture
+         int cmtI = line.IndexOf(COMMENT, StringComparison.Ordinal);
+         if (cmtI >= 0)
+         {
+             line = line.Substring(0, cmtI);
+         }
+
+         // trim leading-trailing whitespace
+         line = line.Trim();
+
+         if (line.Length == 0)
+         {
+             continue; // empty lines can be safely skipped
+         }
+
+         if (line.Contains("="))
+         {
+             // folding
+             string[] parts = line.Split(new string[] { "=" }, StringSplitOptions.RemoveEmptyEntries);
+             if (parts.Length != 2)
+             {
+                 throw new ArgumentException("Malformed folding statement split into " + parts.Length +
+                     " parts: " + rawLine + " in " + location);
+             }
+
+             string leftCharacter = parts[0];
+             string rightCharacter = parts[1];
+
+             if (leftCharacter.Length != 1 || rightCharacter.Length != 1)
+             {
+                 throw new ArgumentException("Malformed folding statement - " +
+                     "patterns are not single characters: " + rawLine + " in " + location);
+             }
+
+             asciiFoldings[leftCharacter[0]] = rightCharacter[0];
+         }
+         else
+         {
+             // rule
+             string[] parts = Regex.Split(line, "\\s+");
+             if (parts.Length != 4)
+             {
+                 throw new ArgumentException("Malformed rule statement split into " + parts.Length +
+                     " parts: " + rawLine + " in " + location);
+             }
+
+             try
+             {
+                 string pattern = StripQuotes(parts[0]);
+                 string replacement1 = StripQuotes(parts[1]);
+                 string replacement2 = StripQuotes(parts[2]);
+                 string replacement3 = StripQuotes(parts[3]);
+
+                 Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
+                 char patternKey = r.Pattern[0];
+                 IList<Rule> rules;
+                 if (!ruleMapping.TryGetValue(patternKey, out rules) || rules == null)
+                 {
+                     rules = new List<Rule>();
+                     ruleMapping[patternKey] = rules;
+                 }
+                 rules.Add(r);
+             }
+             catch (ArgumentException e)
+             {
+                 throw new InvalidOperationException(
+                     "Problem parsing line '" + currentLine + "' in " + location, e);
+             }
+         }
+     }
+ }
+
+ /// <summary>
+ /// Removes one leading double quote, if present, and then one trailing double quote,
+ /// if the remaining text still ends with one.
+ /// </summary>
+ private static string StripQuotes(string str)
+ {
+     string withoutLeading = str.StartsWith(DOUBLE_QUOTE, StringComparison.Ordinal)
+         ? str.Substring(1)
+         : str;
+
+     // the trailing check runs on the already shortened text, so a lone quote becomes ""
+     return withoutLeading.EndsWith(DOUBLE_QUOTE, StringComparison.Ordinal)
+         ? withoutLeading.Substring(0, withoutLeading.Length - 1)
+         : withoutLeading;
+ }
+
+ /// <summary>Whether to use ASCII folding prior to encoding. Set once by the constructor.</summary>
+ private readonly bool folding;
+
+ /// <summary>
+ /// Creates a new instance with ASCII-folding enabled.
+ /// </summary>
+ public DaitchMokotoffSoundex()
+ : this(true)
+ {
+ }
+
+ /// <summary>
+ /// Creates a new instance.
+ /// <para/>
+ /// With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
+ /// è -> e. The characters affected are those listed as foldings in the rules resource.
+ /// </summary>
+ /// <param name="folding">If ASCII-folding shall be performed before encoding.</param>
+ public DaitchMokotoffSoundex(bool folding)
+ {
+ this.folding = folding;
+ }
+
+ /// <summary>
+ /// Performs a cleanup of the input string before the actual soundex transformation.
+ /// <para/>
+ /// Removes all whitespace characters, lower-cases the remaining characters with the
+ /// invariant culture, and performs ASCII folding if enabled.
+ /// </summary>
+ /// <param name="input">The input string to cleanup.</param>
+ /// <returns>A cleaned up string.</returns>
+ private string Cleanup(string input)
+ {
+     // the result is never longer than the input, so size the builder up front
+     StringBuilder sb = new StringBuilder(input.Length);
+     foreach (char c in input)
+     {
+         if (char.IsWhiteSpace(c))
+         {
+             continue;
+         }
+
+         char ch = char.ToLowerInvariant(c);
+
+         // single dictionary lookup instead of ContainsKey followed by the indexer
+         char folded;
+         if (folding && FOLDINGS.TryGetValue(ch, out folded))
+         {
+             ch = folded;
+         }
+         sb.Append(ch);
+     }
+     return sb.ToString();
+ }
+
+ // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
+ //**
+ // * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
+ // * <p>
+ // * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
+ // * EncoderException if the supplied object is not of type java.lang.String.
+ // * </p>
+ // *
+ // * @see #soundex(String)
+ // *
+ // * @param obj
+ // * Object to encode
+ // * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
+ // * supplied.
+ // * @throws EncoderException
+ // * if the parameter supplied is not of type java.lang.String
+ // * @throws IllegalArgumentException
+ // * if a character is not mapped
+ // */
+ //@Override
+ // public Object encode(object obj)
+ //{
+ // if (!(obj instanceof String)) {
+ // throw new EncoderException(
+ // "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
+ // }
+ // return encode((String) obj);
+ //}
+
+ /// <summary>
+ /// Encodes a string using the Daitch-Mokotoff soundex algorithm without branching,
+ /// i.e. only the first code is returned.
+ /// </summary>
+ /// <param name="source">A string to encode; may be <c>null</c>.</param>
+ /// <returns>
+ /// A DM Soundex code corresponding to the string supplied, or <c>null</c> if
+ /// <paramref name="source"/> is <c>null</c>.
+ /// </returns>
+ /// <seealso cref="GetSoundex(string)"/>
+ public virtual string Encode(string source)
+ {
+     if (source != null)
+     {
+         string[] codes = GetSoundex(source, false);
+         return codes[0];
+     }
+     return null;
+ }
+
+ /// <summary>
+ /// Encodes a string using the Daitch-Mokotoff soundex algorithm with branching.
+ /// <para/>
+ /// In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
+ /// separated by '|'.
+ /// <para/>
+ /// Example: the name "AUERBACH" is encoded as both
+ /// <list type="bullet">
+ /// <item><description>097400</description></item>
+ /// <item><description>097500</description></item>
+ /// </list>
+ /// <para/>
+ /// Thus the result will be "097400|097500".
+ /// </summary>
+ /// <param name="source">A string to encode; may be <c>null</c>.</param>
+ /// <returns>
+ /// A string containing a set of DM Soundex codes corresponding to the string supplied, or
+ /// <c>null</c> if <paramref name="source"/> is <c>null</c>.
+ /// </returns>
+ public virtual string GetSoundex(string source)
+ {
+     string[] branches = GetSoundex(source, true);
+     if (branches == null)
+     {
+         // the private overload returns null for a null source; mirror Encode instead of throwing
+         return null;
+     }
+     return string.Join("|", branches);
+ }
+
+ /// <summary>
+ /// Runs the actual DM Soundex algorithm on the input string. Characters for which no rule
+ /// list exists are skipped.
+ /// </summary>
+ /// <param name="source">A string to encode.</param>
+ /// <param name="branching">If branching shall be performed.</param>
+ /// <returns>
+ /// All DM Soundex codes for the string supplied (a single code when branching is off),
+ /// or <c>null</c> if <paramref name="source"/> is <c>null</c>.
+ /// </returns>
+ private string[] GetSoundex(string source, bool branching)
+ {
+     if (source == null)
+     {
+         return null;
+     }
+
+     string cleaned = Cleanup(source);
+
+     // LinkedHashSet preserves input order. In .NET we can use List for that purpose.
+     IList<Branch> activeBranches = new List<Branch>();
+     activeBranches.Add(new Branch());
+
+     char previous = '\0';
+     for (int pos = 0; pos < cleaned.Length; pos++)
+     {
+         char ch = cleaned[pos];
+
+         // ignore whitespace inside a name
+         if (char.IsWhiteSpace(ch))
+         {
+             continue;
+         }
+
+         string context = cleaned.Substring(pos);
+         IList<Rule> candidates;
+         if (!RULES.TryGetValue(ch, out candidates) || candidates == null)
+         {
+             // no rules for this character: skip it without recording it as the previous one
+             continue;
+         }
+
+         // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
+         IList<Branch> collected = branching ? new List<Branch>() : Collections.EmptyList<Branch>();
+
+         // special rule: occurrences of mn or nm are treated differently
+         bool force = (previous == 'm' && ch == 'n') || (previous == 'n' && ch == 'm');
+
+         foreach (Rule rule in candidates)
+         {
+             if (!rule.Matches(context))
+             {
+                 continue;
+             }
+
+             if (branching)
+             {
+                 collected.Clear();
+             }
+             string[] replacements = rule.GetReplacements(context, previous == '\0');
+             bool fork = branching && replacements.Length > 1;
+
+             foreach (Branch branch in activeBranches)
+             {
+                 foreach (string replacement in replacements)
+                 {
+                     // if we have multiple replacements, always create a new branch
+                     Branch target = fork ? branch.CreateBranch() : branch;
+                     target.ProcessNextReplacement(replacement, force);
+
+                     if (!branching)
+                     {
+                         // without branching only the first replacement is used
+                         break;
+                     }
+                     if (!collected.Contains(target))
+                     {
+                         collected.Add(target);
+                     }
+                 }
+             }
+
+             if (branching)
+             {
+                 activeBranches.Clear();
+                 activeBranches.AddRange(collected);
+             }
+
+             // jump over the rest of the matched pattern; only the first matching rule applies
+             pos += rule.PatternLength - 1;
+             break;
+         }
+
+         previous = ch;
+     }
+
+     string[] codes = new string[activeBranches.Count];
+     for (int i = 0; i < codes.Length; i++)
+     {
+         Branch finished = activeBranches[i];
+         finished.Finish();
+         codes[i] = finished.ToString();
+     }
+
+     return codes;
+ }
+ }
+}