You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/06/27 20:33:53 UTC
[08/15] lucenenet git commit: Added Lucene.Net.Analysis.Phonetic +
tests. Rather than porting over the entire commons-codec library,
only the language features were ported and added to this library.
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/DoubleMetaphone.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/DoubleMetaphone.cs b/src/Lucene.Net.Analysis.Phonetic/Language/DoubleMetaphone.cs
new file mode 100644
index 0000000..d54968d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/DoubleMetaphone.cs
@@ -0,0 +1,1280 @@
+// commons-codec version compatibility level: 1.9
+using System;
+using System.Globalization;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Encodes a string into a double metaphone value. This implementation is based on the algorithm by <c>Lawrence
+ /// Philips</c>.
+ /// <para/>
+ /// This class is conditionally thread-safe. The instance field <see cref="maxCodeLen"/> is mutable (through
+ /// <see cref="MaxCodeLen"/>) but is not volatile, and accesses are not synchronized. If an instance of the class is
+ /// shared between threads, the caller needs to ensure that suitable synchronization is used to ensure safe publication
+ /// of the value between threads, and must not set <see cref="MaxCodeLen"/> after initial setup.
+ /// <para/>
+ /// See <a href="http://drdobbs.com/184401251?pgno=2">Original Article</a>
+ /// <para/>
+ /// See <a href="http://en.wikipedia.org/wiki/Metaphone">http://en.wikipedia.org/wiki/Metaphone</a>
+ /// </summary>
+ public class DoubleMetaphone : IStringEncoder
+ {
+ /// <summary>
+ /// "Vowels" to test for. Note that 'Y' is treated as a vowel.
+ /// </summary>
+ private static readonly string VOWELS = "AEIOUY";
+
+ /// <summary>
+ /// Prefixes when present which are not pronounced. When the input starts with one of
+ /// these, encoding begins at the second character.
+ /// </summary>
+ private static readonly string[] SILENT_START =
+ { "GN", "KN", "PN", "WR", "PS" };
+ // Single characters tested two positions after a 'C' when deciding whether "CH" sounds like 'K'
+ // (see ConditionCH1).
+ private static readonly string[] L_R_N_M_B_H_F_V_W_SPACE =
+ { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
+ // Two-letter sequences tested right after a leading 'G' (see HandleG).
+ private static readonly string[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
+ { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
+ // Single characters tested right after a 'J'; when one follows, the default 'J' code is not appended
+ // (see HandleJ).
+ private static readonly string[] L_T_K_S_N_M_B_Z =
+ { "L", "T", "K", "S", "N", "M", "B", "Z" };
+
+ /// <summary>
+ /// Maximum length of an encoding, default is 4. Read and written through <see cref="MaxCodeLen"/>.
+ /// </summary>
+ private int maxCodeLen = 4;
+
+        /// <summary>
+        /// Initializes a new <see cref="DoubleMetaphone"/> encoder. The maximum code length
+        /// starts at its default of 4.
+        /// </summary>
+        public DoubleMetaphone()
+        {
+        }
+
+        /// <summary>
+        /// Computes the primary Double Metaphone code of <paramref name="value"/> by delegating to
+        /// <see cref="GetDoubleMetaphone(string, bool)"/> with the alternate flag switched off.
+        /// </summary>
+        /// <param name="value">The text to encode.</param>
+        /// <returns>The primary encoding as produced by the two-argument overload.</returns>
+        public virtual string GetDoubleMetaphone(string value)
+        {
+            const bool useAlternate = false;
+            return GetDoubleMetaphone(value, useAlternate);
+        }
+
+        /// <summary>
+        /// Computes the Double Metaphone code of <paramref name="value"/> and returns either the
+        /// primary or the alternate encoding.
+        /// </summary>
+        /// <param name="value">The text to encode; it is trimmed and upper-cased before encoding.</param>
+        /// <param name="alternate"><c>true</c> to return the alternate encoding, <c>false</c> for the primary one.</param>
+        /// <returns>
+        /// The requested encoding, or <c>null</c> when <paramref name="value"/> is <c>null</c>, empty
+        /// or consists only of whitespace.
+        /// </returns>
+        public virtual string GetDoubleMetaphone(string value, bool alternate)
+        {
+            string text = CleanInput(value);
+            if (text == null)
+            {
+                return null;
+            }
+
+            bool slavoGermanic = IsSlavoGermanic(text);
+            // A silent leading letter (the first letter of "GN", "KN", ...) is skipped outright.
+            int pos = IsSilentStart(text) ? 1 : 0;
+
+            DoubleMetaphoneResult codes = new DoubleMetaphoneResult(this.MaxCodeLen);
+
+            // Stop as soon as both codes are full or the input is exhausted.
+            while (!codes.IsComplete && pos < text.Length)
+            {
+                char current = text[pos];
+                switch (current)
+                {
+                    case 'A':
+                    case 'E':
+                    case 'I':
+                    case 'O':
+                    case 'U':
+                    case 'Y':
+                        pos = HandleAEIOUY(codes, pos);
+                        break;
+
+                    // Simple consonants: append one code and swallow a doubled letter.
+                    case 'B':
+                        codes.Append('P');
+                        pos += CharAt(text, pos + 1) == 'B' ? 2 : 1;
+                        break;
+                    case 'F':
+                        codes.Append('F');
+                        pos += CharAt(text, pos + 1) == 'F' ? 2 : 1;
+                        break;
+                    case 'K':
+                        codes.Append('K');
+                        pos += CharAt(text, pos + 1) == 'K' ? 2 : 1;
+                        break;
+                    case 'N':
+                        codes.Append('N');
+                        pos += CharAt(text, pos + 1) == 'N' ? 2 : 1;
+                        break;
+                    case 'Q':
+                        codes.Append('K');
+                        pos += CharAt(text, pos + 1) == 'Q' ? 2 : 1;
+                        break;
+                    case 'V':
+                        codes.Append('F');
+                        pos += CharAt(text, pos + 1) == 'V' ? 2 : 1;
+                        break;
+                    case 'M':
+                        codes.Append('M');
+                        pos += ConditionM0(text, pos) ? 2 : 1;
+                        break;
+
+                    // Accented letters with a fixed code.
+                    case '\u00C7':
+                        // C with a cedilla
+                        codes.Append('S');
+                        pos++;
+                        break;
+                    case '\u00D1':
+                        // N with a tilde (Spanish ene)
+                        codes.Append('N');
+                        pos++;
+                        break;
+
+                    // Letters whose code depends on context.
+                    case 'C':
+                        pos = HandleC(text, codes, pos);
+                        break;
+                    case 'D':
+                        pos = HandleD(text, codes, pos);
+                        break;
+                    case 'G':
+                        pos = HandleG(text, codes, pos, slavoGermanic);
+                        break;
+                    case 'H':
+                        pos = HandleH(text, codes, pos);
+                        break;
+                    case 'J':
+                        pos = HandleJ(text, codes, pos, slavoGermanic);
+                        break;
+                    case 'L':
+                        pos = HandleL(text, codes, pos);
+                        break;
+                    case 'P':
+                        pos = HandleP(text, codes, pos);
+                        break;
+                    case 'R':
+                        pos = HandleR(text, codes, pos, slavoGermanic);
+                        break;
+                    case 'S':
+                        pos = HandleS(text, codes, pos, slavoGermanic);
+                        break;
+                    case 'T':
+                        pos = HandleT(text, codes, pos);
+                        break;
+                    case 'W':
+                        pos = HandleW(text, codes, pos);
+                        break;
+                    case 'X':
+                        pos = HandleX(text, codes, pos);
+                        break;
+                    case 'Z':
+                        pos = HandleZ(text, codes, pos, slavoGermanic);
+                        break;
+
+                    default:
+                        // Anything else contributes nothing to the code.
+                        pos++;
+                        break;
+                }
+            }
+
+            if (alternate)
+            {
+                return codes.Alternate;
+            }
+            return codes.Primary;
+        }
+
+ // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
+ // /**
+ // * Encode the value using DoubleMetaphone. It will only work if
+ // * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>).
+ // *
+ // * @param obj Object to encode (should be of type String)
+ // * @return An encoded Object (will be of type String)
+ // * @throws EncoderException encode parameter is not of type String
+ // */
+
+ //public virtual object Encode(object obj)
+ // {
+ // if (!(obj is String)) {
+ // throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
+ // }
+ // return GetDoubleMetaphone((String) obj);
+ // }
+
+        /// <summary>
+        /// Encodes <paramref name="value"/> with Double Metaphone by delegating to
+        /// <see cref="GetDoubleMetaphone(string)"/>.
+        /// </summary>
+        /// <param name="value">The text to encode.</param>
+        /// <returns>The encoding returned by <see cref="GetDoubleMetaphone(string)"/>.</returns>
+        public virtual string Encode(string value)
+        {
+            string primaryCode = GetDoubleMetaphone(value);
+            return primaryCode;
+        }
+
+        /// <summary>
+        /// Compares the primary Double Metaphone codes of two values by delegating to
+        /// <see cref="IsDoubleMetaphoneEqual(string, string, bool)"/> with the alternate flag switched off.
+        /// </summary>
+        /// <param name="value1">The first value to encode and compare.</param>
+        /// <param name="value2">The second value to encode and compare.</param>
+        /// <returns><c>true</c> if both values produce the same primary code; <c>false</c> otherwise.</returns>
+        public virtual bool IsDoubleMetaphoneEqual(string value1, string value2)
+        {
+            const bool useAlternate = false;
+            return IsDoubleMetaphoneEqual(value1, value2, useAlternate);
+        }
+
+        /// <summary>
+        /// Compares the Double Metaphone codes of two values, using either the primary or the
+        /// alternate encoding.
+        /// <para/>
+        /// <see cref="GetDoubleMetaphone(string, bool)"/> returns <c>null</c> for <c>null</c> or blank input,
+        /// so the comparison is null-safe: two inputs that both encode to <c>null</c> are considered
+        /// equal, and a <c>null</c> code never equals a non-<c>null</c> one.
+        /// </summary>
+        /// <param name="value1">The first value to encode and compare.</param>
+        /// <param name="value2">The second value to encode and compare.</param>
+        /// <param name="alternate">Use the alternate encoding if <c>true</c>, the primary one otherwise.</param>
+        /// <returns><c>true</c> if the encoded <see cref="string"/>s are equal; <c>false</c> otherwise.</returns>
+        public virtual bool IsDoubleMetaphoneEqual(string value1, string value2, bool alternate)
+        {
+            string code1 = GetDoubleMetaphone(value1, alternate);
+            string code2 = GetDoubleMetaphone(value2, alternate);
+
+            // Static Equals tolerates null on either side; calling code1.Equals(...) threw a
+            // NullReferenceException whenever value1 was null or blank.
+            return string.Equals(code1, code2, StringComparison.Ordinal);
+        }
+
+        /// <summary>
+        /// Gets or sets the maximum length of the codes produced by this encoder. The value is
+        /// handed to each <see cref="DoubleMetaphoneResult"/> created while encoding.
+        /// </summary>
+        public virtual int MaxCodeLen
+        {
+            get
+            {
+                return this.maxCodeLen;
+            }
+            set
+            {
+                this.maxCodeLen = value;
+            }
+        }
+
+ //-- BEGIN HANDLERS --//
+
+        /// <summary>
+        /// Handles the vowels 'A', 'E', 'I', 'O', 'U' and 'Y'. A vowel contributes to the code only
+        /// when it is the very first character, and then always as 'A'.
+        /// </summary>
+        /// <param name="result">Accumulator for the primary and alternate codes.</param>
+        /// <param name="index">Position of the vowel being encoded.</param>
+        /// <returns>The index of the next character.</returns>
+        private int HandleAEIOUY(DoubleMetaphoneResult result, int index)
+        {
+            bool isLeading = index == 0;
+            if (isLeading)
+            {
+                result.Append('A');
+            }
+
+            int next = index + 1;
+            return next;
+        }
+
+ /// <summary>
+ /// Handles 'C' cases. The branches form one ordered chain: the first condition that
+ /// matches decides both what is appended and how far the index advances.
+ /// </summary>
+ /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+ /// <param name="result">Accumulator for the primary and alternate codes.</param>
+ /// <param name="index">Position of the 'C' currently being encoded.</param>
+ /// <returns>The index at which encoding resumes.</returns>
+ private int HandleC(string value, DoubleMetaphoneResult result, int index)
+ {
+ if (ConditionC0(value, index))
+ { // very confusing, moved out
+ result.Append('K');
+ index += 2;
+ }
+ else if (index == 0 && Contains(value, index, 6, "CAESAR"))
+ {
+ //-- leading "CAESAR": 'S', then resume after "CA" --//
+ result.Append('S');
+ index += 2;
+ }
+ else if (Contains(value, index, 2, "CH"))
+ {
+ //-- all "CH" rules live in HandleCH --//
+ index = HandleCH(value, result, index);
+ }
+ else if (Contains(value, index, 2, "CZ") &&
+ !Contains(value, index - 2, 4, "WICZ"))
+ {
+ //-- "Czerny" --//
+ result.Append('S', 'X');
+ index += 2;
+ }
+ else if (Contains(value, index + 1, 3, "CIA"))
+ {
+ //-- "focaccia": the test starts at index + 1, so this matches "CCIA" --//
+ result.Append('X');
+ index += 3;
+ }
+ else if (Contains(value, index, 2, "CC") &&
+ !(index == 1 && CharAt(value, 0) == 'M'))
+ {
+ //-- double "cc" but not "McClelland" --//
+ //-- returns directly: HandleCC computes the resume index itself --//
+ return HandleCC(value, result, index);
+ }
+ else if (Contains(value, index, 2, "CK", "CG", "CQ"))
+ {
+ result.Append('K');
+ index += 2;
+ }
+ else if (Contains(value, index, 2, "CI", "CE", "CY"))
+ {
+ //-- Italian vs. English --//
+ if (Contains(value, index, 3, "CIO", "CIE", "CIA"))
+ {
+ result.Append('S', 'X');
+ }
+ else
+ {
+ result.Append('S');
+ }
+ index += 2;
+ }
+ else
+ {
+ //-- default: hard 'K'; what follows only decides how many characters are consumed --//
+ result.Append('K');
+ if (Contains(value, index + 1, 2, " C", " Q", " G"))
+ {
+ //-- Mac Caffrey, Mac Gregor --//
+ index += 3;
+ }
+ else if (Contains(value, index + 1, 1, "C", "K", "Q") &&
+ !Contains(value, index + 1, 2, "CE", "CI"))
+ {
+ index += 2;
+ }
+ else
+ {
+ index++;
+ }
+ }
+
+ return index;
+ }
+
+        /// <summary>
+        /// Handles "CC". When the pair is followed by 'I', 'E' or 'H' (but not "HU") it is encoded as
+        /// "KS" or 'X' and three characters are consumed; otherwise it is a plain 'K' and two
+        /// characters are consumed.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="result">Accumulator for the primary and alternate codes.</param>
+        /// <param name="index">Position of the first 'C' of the pair.</param>
+        /// <returns>The index at which encoding resumes.</returns>
+        private int HandleCC(string value, DoubleMetaphoneResult result, int index)
+        {
+            bool softFollower = Contains(value, index + 2, 1, "I", "E", "H") &&
+                                !Contains(value, index + 2, 2, "HU");
+            if (!softFollower)
+            {
+                // Pierce's rule
+                result.Append('K');
+                return index + 2;
+            }
+
+            //-- "bellocchio" but not "bacchus" --//
+            bool soundsLikeKs = (index == 1 && CharAt(value, index - 1) == 'A') ||
+                                Contains(value, index - 1, 5, "UCCEE", "UCCES");
+            if (soundsLikeKs)
+            {
+                //-- "accident", "accede", "succeed" --//
+                result.Append("KS");
+            }
+            else
+            {
+                //-- "bacci", "bertucci", other Italian --//
+                result.Append('X');
+            }
+            return index + 3;
+        }
+
+        /// <summary>
+        /// Handles "CH". Every branch consumes exactly two characters; the branches differ only in
+        /// the code that is appended.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="result">Accumulator for the primary and alternate codes.</param>
+        /// <param name="index">Position of the 'C' of "CH".</param>
+        /// <returns>The index at which encoding resumes (<paramref name="index"/> + 2).</returns>
+        private int HandleCH(string value, DoubleMetaphoneResult result, int index)
+        {
+            if (index > 0 && Contains(value, index, 4, "CHAE"))
+            {
+                // Michael
+                result.Append('K', 'X');
+            }
+            else if (ConditionCH0(value, index) || ConditionCH1(value, index))
+            {
+                //-- Greek roots ("chemistry", "chorus", etc.), or --//
+                //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
+                result.Append('K');
+            }
+            else if (index <= 0)
+            {
+                // "CH" at the very start of the word
+                result.Append('X');
+            }
+            else if (Contains(value, 0, 2, "MC"))
+            {
+                // word starts with "MC"
+                result.Append('K');
+            }
+            else
+            {
+                result.Append('X', 'K');
+            }
+
+            return index + 2;
+        }
+
+        /// <summary>
+        /// Handles 'D' cases: "DG" becomes 'J' before 'I', 'E' or 'Y' and "TK" otherwise; any
+        /// other 'D' becomes 'T', with "DT" and "DD" consumed as a pair.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="result">Accumulator for the primary and alternate codes.</param>
+        /// <param name="index">Position of the 'D' being encoded.</param>
+        /// <returns>The index at which encoding resumes.</returns>
+        private int HandleD(string value, DoubleMetaphoneResult result, int index)
+        {
+            if (Contains(value, index, 2, "DG"))
+            {
+                if (Contains(value, index + 2, 1, "I", "E", "Y"))
+                {
+                    //-- "Edge" --//
+                    result.Append('J');
+                    return index + 3;
+                }
+
+                //-- "Edgar" --//
+                result.Append("TK");
+                return index + 2;
+            }
+
+            bool pairedWithTOrD = Contains(value, index, 2, "DT", "DD");
+            result.Append('T');
+            return pairedWithTOrD ? index + 2 : index + 1;
+        }
+
+ /// <summary>
+ /// Handles 'G' cases. The branches form one ordered chain: the first condition that
+ /// matches decides both what is appended and how far the index advances.
+ /// </summary>
+ /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+ /// <param name="result">Accumulator for the primary and alternate codes.</param>
+ /// <param name="index">Position of the 'G' currently being encoded.</param>
+ /// <param name="slavoGermanic">Whether <see cref="IsSlavoGermanic(string)"/> flagged the input.</param>
+ /// <returns>The index at which encoding resumes.</returns>
+ private int HandleG(string value, DoubleMetaphoneResult result, int index,
+ bool slavoGermanic)
+ {
+ if (CharAt(value, index + 1) == 'H')
+ {
+ //-- all "GH" rules live in HandleGH --//
+ index = HandleGH(value, result, index);
+ }
+ else if (CharAt(value, index + 1) == 'N')
+ {
+ //-- "GN": which of the two codes keeps the 'K' depends on position and origin --//
+ if (index == 1 && IsVowel(CharAt(value, 0)) && !slavoGermanic)
+ {
+ result.Append("KN", "N");
+ }
+ // NOTE(review): the CharAt(value, index + 1) != 'Y' test below is always true here,
+ // because this branch is only entered when that character is 'N'.
+ else if (!Contains(value, index + 2, 2, "EY") &&
+ CharAt(value, index + 1) != 'Y' && !slavoGermanic)
+ {
+ result.Append("N", "KN");
+ }
+ else
+ {
+ result.Append("KN");
+ }
+ index = index + 2;
+ }
+ else if (Contains(value, index + 1, 2, "LI") && !slavoGermanic)
+ {
+ //-- "GLI": primary "KL", alternate "L" --//
+ result.Append("KL", "L");
+ index += 2;
+ }
+ else if (index == 0 &&
+ (CharAt(value, index + 1) == 'Y' ||
+ Contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER)))
+ {
+ //-- -ges-, -gep-, -gel-, -gie- at beginning --//
+ result.Append('K', 'J');
+ index += 2;
+ }
+ else if ((Contains(value, index + 1, 2, "ER") ||
+ CharAt(value, index + 1) == 'Y') &&
+ !Contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
+ !Contains(value, index - 1, 1, "E", "I") &&
+ !Contains(value, index - 1, 3, "RGY", "OGY"))
+ {
+ //-- -ger-, -gy- --//
+ result.Append('K', 'J');
+ index += 2;
+ }
+ else if (Contains(value, index + 1, 1, "E", "I", "Y") ||
+ Contains(value, index - 1, 4, "AGGI", "OGGI"))
+ {
+ //-- Italian "biaggi" --//
+ if (Contains(value, 0, 4, "VAN ", "VON ") ||
+ Contains(value, 0, 3, "SCH") ||
+ Contains(value, index + 1, 2, "ET"))
+ {
+ //-- obvious germanic --//
+ result.Append('K');
+ }
+ else if (Contains(value, index + 1, 3, "IER"))
+ {
+ result.Append('J');
+ }
+ else
+ {
+ result.Append('J', 'K');
+ }
+ index += 2;
+ }
+ else if (CharAt(value, index + 1) == 'G')
+ {
+ //-- "GG": one 'K' for the pair --//
+ index += 2;
+ result.Append('K');
+ }
+ else
+ {
+ //-- default: plain 'K' --//
+ index++;
+ result.Append('K');
+ }
+ return index;
+ }
+
+ /// <summary>
+ /// Handles 'GH' cases. Every branch consumes two characters; the branches differ in
+ /// whether 'K', 'J', 'F' or nothing at all is appended.
+ /// </summary>
+ /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+ /// <param name="result">Accumulator for the primary and alternate codes.</param>
+ /// <param name="index">Position of the 'G' of "GH".</param>
+ /// <returns>The index at which encoding resumes (<paramref name="index"/> + 2).</returns>
+ private int HandleGH(string value, DoubleMetaphoneResult result, int index)
+ {
+ if (index > 0 && !IsVowel(CharAt(value, index - 1)))
+ {
+ //-- "GH" after a non-vowel --//
+ result.Append('K');
+ index += 2;
+ }
+ else if (index == 0)
+ {
+ //-- "GH" at the start: 'J' before 'I', otherwise 'K' --//
+ if (CharAt(value, index + 2) == 'I')
+ {
+ result.Append('J');
+ }
+ else
+ {
+ result.Append('K');
+ }
+ index += 2;
+ }
+ else if ((index > 1 && Contains(value, index - 2, 1, "B", "H", "D")) ||
+ (index > 2 && Contains(value, index - 3, 1, "B", "H", "D")) ||
+ (index > 3 && Contains(value, index - 4, 1, "B", "H")))
+ {
+ //-- Parker's rule (with some further refinements) - "hugh"
+ //-- nothing is appended: the "GH" is silent here --//
+ index += 2;
+ }
+ else
+ {
+ if (index > 2 && CharAt(value, index - 1) == 'U' &&
+ Contains(value, index - 3, 1, "C", "G", "L", "R", "T"))
+ {
+ //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
+ result.Append('F');
+ }
+ // NOTE(review): index > 0 looks always true at this point, since index == 0 is
+ // handled by an earlier branch; the effective test is "not preceded by 'I'".
+ else if (index > 0 && CharAt(value, index - 1) != 'I')
+ {
+ result.Append('K');
+ }
+ index += 2;
+ }
+ return index;
+ }
+
+        /// <summary>
+        /// Handles 'H' cases. An 'H' is kept only when it is followed by a vowel and is either the
+        /// first character or itself preceded by a vowel; otherwise it is dropped.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="result">Accumulator for the primary and alternate codes.</param>
+        /// <param name="index">Position of the 'H' being encoded.</param>
+        /// <returns>The index at which encoding resumes.</returns>
+        private int HandleH(string value, DoubleMetaphoneResult result, int index)
+        {
+            bool keep = (index == 0 || IsVowel(CharAt(value, index - 1))) &&
+                        IsVowel(CharAt(value, index + 1));
+            if (!keep)
+            {
+                return index + 1;
+            }
+
+            result.Append('H');
+            //-- skipping two characters also takes care of "HH" --//
+            return index + 2;
+        }
+
+ /// <summary>
+ /// Handles 'J' cases. Spanish-looking input ("JOSE" at the current position, or a word
+ /// starting with "SAN ") is treated separately from everything else.
+ /// </summary>
+ /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+ /// <param name="result">Accumulator for the primary and alternate codes.</param>
+ /// <param name="index">Position of the 'J' currently being encoded.</param>
+ /// <param name="slavoGermanic">Whether <see cref="IsSlavoGermanic(string)"/> flagged the input.</param>
+ /// <returns>The index at which encoding resumes.</returns>
+ private int HandleJ(string value, DoubleMetaphoneResult result, int index,
+ bool slavoGermanic)
+ {
+ if (Contains(value, index, 4, "JOSE") || Contains(value, 0, 4, "SAN "))
+ {
+ //-- obvious Spanish, "Jose", "San Jacinto" --//
+ // && binds tighter than ||, so this reads:
+ // (index == 0 and a space follows "JOSE") or the whole value has length 4
+ // or the value starts with "SAN ".
+ if ((index == 0 && (CharAt(value, index + 4) == ' ') ||
+ value.Length == 4) || Contains(value, 0, 4, "SAN "))
+ {
+ result.Append('H');
+ }
+ else
+ {
+ result.Append('J', 'H');
+ }
+ index++;
+ }
+ else
+ {
+ // NOTE(review): the !Contains(..., "JOSE") test below is always true here, because
+ // the outer condition already ruled out "JOSE" at this position.
+ if (index == 0 && !Contains(value, index, 4, "JOSE"))
+ {
+ result.Append('J', 'A');
+ }
+ else if (IsVowel(CharAt(value, index - 1)) && !slavoGermanic &&
+ (CharAt(value, index + 1) == 'A' || CharAt(value, index + 1) == 'O'))
+ {
+ result.Append('J', 'H');
+ }
+ else if (index == value.Length - 1)
+ {
+ // final 'J': the alternate code receives a literal space character
+ result.Append('J', ' ');
+ }
+ else if (!Contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) &&
+ !Contains(value, index - 1, 1, "S", "K", "L"))
+ {
+ result.Append('J');
+ }
+ // when none of the four conditions above holds, nothing is appended for this 'J'
+
+ // "JJ" is consumed as a pair
+ if (CharAt(value, index + 1) == 'J')
+ {
+ index += 2;
+ }
+ else
+ {
+ index++;
+ }
+ }
+ return index;
+ }
+
+        /// <summary>
+        /// Handles 'L' cases. A single 'L' is always encoded as 'L'. For "LL" the pair is consumed
+        /// together, and when <see cref="ConditionL0(string, int)"/> holds only the primary code
+        /// receives the 'L'.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="result">Accumulator for the primary and alternate codes.</param>
+        /// <param name="index">Position of the 'L' being encoded.</param>
+        /// <returns>The index at which encoding resumes.</returns>
+        private int HandleL(string value, DoubleMetaphoneResult result, int index)
+        {
+            if (CharAt(value, index + 1) != 'L')
+            {
+                result.Append('L');
+                return index + 1;
+            }
+
+            if (ConditionL0(value, index))
+            {
+                result.AppendPrimary('L');
+            }
+            else
+            {
+                result.Append('L');
+            }
+            return index + 2;
+        }
+
+        /// <summary>
+        /// Handles 'P' cases: "PH" is encoded as 'F'; any other 'P' as 'P', with a following 'P'
+        /// or 'B' consumed along with it.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="result">Accumulator for the primary and alternate codes.</param>
+        /// <param name="index">Position of the 'P' being encoded.</param>
+        /// <returns>The index at which encoding resumes.</returns>
+        private int HandleP(string value, DoubleMetaphoneResult result, int index)
+        {
+            if (CharAt(value, index + 1) == 'H')
+            {
+                result.Append('F');
+                return index + 2;
+            }
+
+            result.Append('P');
+            int step = Contains(value, index + 1, 1, "P", "B") ? 2 : 1;
+            return index + step;
+        }
+
+        /// <summary>
+        /// Handles 'R' cases. A final 'R' preceded by "IE" (but not by "ME" or "MA" two characters
+        /// earlier) in non-slavo-germanic input goes to the alternate code only; every other 'R'
+        /// goes to both codes. "RR" is consumed as a pair.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="result">Accumulator for the primary and alternate codes.</param>
+        /// <param name="index">Position of the 'R' being encoded.</param>
+        /// <param name="slavoGermanic">Whether <see cref="IsSlavoGermanic(string)"/> flagged the input.</param>
+        /// <returns>The index at which encoding resumes.</returns>
+        private int HandleR(string value, DoubleMetaphoneResult result, int index,
+            bool slavoGermanic)
+        {
+            bool alternateOnly = index == value.Length - 1 && !slavoGermanic &&
+                                 Contains(value, index - 2, 2, "IE") &&
+                                 !Contains(value, index - 4, 2, "ME", "MA");
+            if (alternateOnly)
+            {
+                result.AppendAlternate('R');
+            }
+            else
+            {
+                result.Append('R');
+            }
+
+            int step = CharAt(value, index + 1) == 'R' ? 2 : 1;
+            return index + step;
+        }
+
+ /// <summary>
+ /// Handles 'S' cases. The branches form one ordered chain: the first condition that
+ /// matches decides both what is appended and how far the index advances.
+ /// </summary>
+ /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+ /// <param name="result">Accumulator for the primary and alternate codes.</param>
+ /// <param name="index">Position of the 'S' currently being encoded.</param>
+ /// <param name="slavoGermanic">Whether <see cref="IsSlavoGermanic(string)"/> flagged the input.</param>
+ /// <returns>The index at which encoding resumes.</returns>
+ private int HandleS(string value, DoubleMetaphoneResult result, int index,
+ bool slavoGermanic)
+ {
+ if (Contains(value, index - 1, 3, "ISL", "YSL"))
+ {
+ //-- special cases "island", "isle", "carlisle", "carlysle" --//
+ //-- nothing is appended: the 'S' is silent --//
+ index++;
+ }
+ else if (index == 0 && Contains(value, index, 5, "SUGAR"))
+ {
+ //-- special case "sugar-" --//
+ result.Append('X', 'S');
+ index++;
+ }
+ else if (Contains(value, index, 2, "SH"))
+ {
+ if (Contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ"))
+ {
+ //-- germanic --//
+ result.Append('S');
+ }
+ else
+ {
+ result.Append('X');
+ }
+ index += 2;
+ }
+ // NOTE(review): the "SIAN" test is redundant; any "SIAN" already matches "SIA".
+ else if (Contains(value, index, 3, "SIO", "SIA") || Contains(value, index, 4, "SIAN"))
+ {
+ //-- Italian and Armenian --//
+ if (slavoGermanic)
+ {
+ result.Append('S');
+ }
+ else
+ {
+ result.Append('S', 'X');
+ }
+ index += 3;
+ }
+ else if ((index == 0 && Contains(value, index + 1, 1, "M", "N", "L", "W")) ||
+ Contains(value, index + 1, 1, "Z"))
+ {
+ //-- german & anglicisations, e.g. "smith" match "schmidt" //
+ // "snider" match "schneider" --//
+ //-- also, -sz- in slavic language although in hungarian it //
+ // is pronounced "s" --//
+ result.Append('S', 'X');
+ //-- "SZ" is consumed as a pair --//
+ index = Contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
+ }
+ else if (Contains(value, index, 2, "SC"))
+ {
+ //-- all "SC" rules live in HandleSC --//
+ index = HandleSC(value, result, index);
+ }
+ else
+ {
+ if (index == value.Length - 1 && Contains(value, index - 2, 2, "AI", "OI"))
+ {
+ //-- french e.g. "resnais", "artois" --//
+ //-- only the alternate code receives the 'S' --//
+ result.AppendAlternate('S');
+ }
+ else
+ {
+ result.Append('S');
+ }
+ //-- a following 'S' or 'Z' is consumed along with this 'S' --//
+ index = Contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
+ }
+ return index;
+ }
+
+        /// <summary>
+        /// Handles "SC". Three characters are always consumed; the appended code depends on the
+        /// character after "SC" and, for "SCH", on the two characters after that.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="result">Accumulator for the primary and alternate codes.</param>
+        /// <param name="index">Position of the 'S' of "SC".</param>
+        /// <returns>The index at which encoding resumes (<paramref name="index"/> + 3).</returns>
+        private int HandleSC(string value, DoubleMetaphoneResult result, int index)
+        {
+            if (CharAt(value, index + 2) != 'H')
+            {
+                // "SC" without a following 'H': soft before 'I', 'E', 'Y', hard otherwise.
+                if (Contains(value, index + 2, 1, "I", "E", "Y"))
+                {
+                    result.Append('S');
+                }
+                else
+                {
+                    result.Append("SK");
+                }
+            }
+            //-- Schlesinger's rule --//
+            else if (Contains(value, index + 3, 2, "ER", "EN"))
+            {
+                //-- "schermerhorn", "schenker" --//
+                result.Append("X", "SK");
+            }
+            else if (Contains(value, index + 3, 2, "OO", "UY", "ED", "EM"))
+            {
+                //-- Dutch origin, e.g. "school", "schooner" --//
+                result.Append("SK");
+            }
+            else if (index == 0 && !IsVowel(CharAt(value, 3)) && CharAt(value, 3) != 'W')
+            {
+                result.Append('X', 'S');
+            }
+            else
+            {
+                result.Append('X');
+            }
+
+            return index + 3;
+        }
+
+        /// <summary>
+        /// Handles 'T' cases: "TION", "TIA" and "TCH" become 'X'; "TH" and "TTH" become '0'
+        /// (alternate 'T') unless a special case forces a plain 'T'; everything else is 'T'.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="result">Accumulator for the primary and alternate codes.</param>
+        /// <param name="index">Position of the 'T' being encoded.</param>
+        /// <returns>The index at which encoding resumes.</returns>
+        private int HandleT(string value, DoubleMetaphoneResult result, int index)
+        {
+            if (Contains(value, index, 4, "TION") ||
+                Contains(value, index, 3, "TIA", "TCH"))
+            {
+                result.Append('X');
+                return index + 3;
+            }
+
+            if (Contains(value, index, 2, "TH") || Contains(value, index, 3, "TTH"))
+            {
+                //-- special case "thomas", "thames" or germanic --//
+                bool plainT = Contains(value, index + 2, 2, "OM", "AM") ||
+                              Contains(value, 0, 4, "VAN ", "VON ") ||
+                              Contains(value, 0, 3, "SCH");
+                if (plainT)
+                {
+                    result.Append('T');
+                }
+                else
+                {
+                    result.Append('0', 'T');
+                }
+                return index + 2;
+            }
+
+            result.Append('T');
+            // A following 'T' or 'D' is consumed together with this 'T'.
+            int step = Contains(value, index + 1, 1, "T", "D") ? 2 : 1;
+            return index + step;
+        }
+
+        /// <summary>
+        /// Handles 'W' cases. The rules are tried in order; a 'W' that matches none of them is
+        /// skipped without contributing to the code.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="result">Accumulator for the primary and alternate codes.</param>
+        /// <param name="index">Position of the 'W' being encoded.</param>
+        /// <returns>The index at which encoding resumes.</returns>
+        private int HandleW(string value, DoubleMetaphoneResult result, int index)
+        {
+            if (Contains(value, index, 2, "WR"))
+            {
+                //-- can also be in middle of word --//
+                result.Append('R');
+                return index + 2;
+            }
+
+            if (index == 0 && (IsVowel(CharAt(value, index + 1)) ||
+                               Contains(value, index, 2, "WH")))
+            {
+                if (IsVowel(CharAt(value, index + 1)))
+                {
+                    //-- Wasserman should match Vasserman --//
+                    result.Append('A', 'F');
+                }
+                else
+                {
+                    //-- need Uomo to match Womo --//
+                    result.Append('A');
+                }
+                return index + 1;
+            }
+
+            if ((index == value.Length - 1 && IsVowel(CharAt(value, index - 1))) ||
+                Contains(value, index - 1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
+                Contains(value, 0, 3, "SCH"))
+            {
+                //-- Arnow should match Arnoff --//
+                result.AppendAlternate('F');
+                return index + 1;
+            }
+
+            if (Contains(value, index, 4, "WICZ", "WITZ"))
+            {
+                //-- Polish e.g. "filipowicz" --//
+                result.Append("TS", "FX");
+                return index + 4;
+            }
+
+            return index + 1;
+        }
+
+        /// <summary>
+        /// Handles 'X' cases: a leading 'X' is encoded as 'S'; elsewhere it is "KS", except for a
+        /// final 'X' after "IAU", "EAU", "AU" or "OU", which is silent. A following 'C' or 'X' is
+        /// consumed along with it.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="result">Accumulator for the primary and alternate codes.</param>
+        /// <param name="index">Position of the 'X' being encoded.</param>
+        /// <returns>The index at which encoding resumes.</returns>
+        private int HandleX(string value, DoubleMetaphoneResult result, int index)
+        {
+            if (index == 0)
+            {
+                result.Append('S');
+                return index + 1;
+            }
+
+            //-- French e.g. breaux --//
+            bool silentFinalX = (index == value.Length - 1) &&
+                                (Contains(value, index - 3, 3, "IAU", "EAU") ||
+                                 Contains(value, index - 2, 2, "AU", "OU"));
+            if (!silentFinalX)
+            {
+                result.Append("KS");
+            }
+
+            int step = Contains(value, index + 1, 1, "C", "X") ? 2 : 1;
+            return index + step;
+        }
+
+        /// <summary>
+        /// Handles 'Z' cases: "ZH" is encoded as 'J'; otherwise 'Z' is 'S', with "TS" as the
+        /// alternate when followed by "ZO", "ZI" or "ZA", or when the input is slavo-germanic and
+        /// the 'Z' is neither first nor preceded by 'T'. "ZZ" is consumed as a pair.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="result">Accumulator for the primary and alternate codes.</param>
+        /// <param name="index">Position of the 'Z' being encoded.</param>
+        /// <param name="slavoGermanic">Whether <see cref="IsSlavoGermanic(string)"/> flagged the input.</param>
+        /// <returns>The index at which encoding resumes.</returns>
+        private int HandleZ(string value, DoubleMetaphoneResult result, int index,
+            bool slavoGermanic)
+        {
+            if (CharAt(value, index + 1) == 'H')
+            {
+                //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
+                result.Append('J');
+                return index + 2;
+            }
+
+            bool alternateTs = Contains(value, index + 1, 2, "ZO", "ZI", "ZA") ||
+                               (slavoGermanic && (index > 0 && CharAt(value, index - 1) != 'T'));
+            if (alternateTs)
+            {
+                result.Append("S", "TS");
+            }
+            else
+            {
+                result.Append('S');
+            }
+
+            int step = CharAt(value, index + 1) == 'Z' ? 2 : 1;
+            return index + step;
+        }
+
+ //-- BEGIN CONDITIONS --//
+
+        /// <summary>
+        /// Complex condition 0 for 'C'. Holds for "CHIA" at <paramref name="index"/>, and for "ACH"
+        /// around the 'C' when the character two positions before the 'C' is not a vowel and the
+        /// "ACH" is not followed by 'I' or 'E' (except in "BACHER" / "MACHER").
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="index">Position of the 'C' being examined.</param>
+        /// <returns><c>true</c> when the condition holds.</returns>
+        private bool ConditionC0(string value, int index)
+        {
+            if (Contains(value, index, 4, "CHIA"))
+            {
+                return true;
+            }
+
+            // All three must pass before the "ACH" follower is examined.
+            if (index <= 1 ||
+                IsVowel(CharAt(value, index - 2)) ||
+                !Contains(value, index - 1, 3, "ACH"))
+            {
+                return false;
+            }
+
+            char follower = CharAt(value, index + 2);
+            bool notSoftened = follower != 'I' && follower != 'E';
+            return notSoftened || Contains(value, index - 2, 6, "BACHER", "MACHER");
+        }
+
+        /// <summary>
+        /// Complex condition 0 for 'CH'. Holds only at the start of the value, when the 'C' is
+        /// followed by "HARAC", "HARIS", "HOR", "HYM", "HIA" or "HEM" and the value does not
+        /// start with "CHORE".
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="index">Position of the 'C' of "CH".</param>
+        /// <returns><c>true</c> when the condition holds.</returns>
+        private bool ConditionCH0(string value, int index)
+        {
+            if (index != 0)
+            {
+                return false;
+            }
+
+            bool knownRoot = Contains(value, index + 1, 5, "HARAC", "HARIS") ||
+                             Contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM");
+            return knownRoot && !Contains(value, 0, 5, "CHORE");
+        }
+
+        /// <summary>
+        /// Complex condition 1 for 'CH'. Holds when any one of the checks below matches.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="index">Position of the 'C' of "CH".</param>
+        /// <returns><c>true</c> when the condition holds.</returns>
+        private bool ConditionCH1(string value, int index)
+        {
+            // Value starts with "VAN ", "VON " or "SCH".
+            if (Contains(value, 0, 4, "VAN ", "VON ") || Contains(value, 0, 3, "SCH"))
+            {
+                return true;
+            }
+
+            // "CH" sits inside "ORCHES", "ARCHIT" or "ORCHID".
+            if (Contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID"))
+            {
+                return true;
+            }
+
+            // "CH" is followed by 'T' or 'S'.
+            if (Contains(value, index + 2, 1, "T", "S"))
+            {
+                return true;
+            }
+
+            // "CH" is at the start or after 'A', 'O', 'U', 'E', and is either at the end of the
+            // value or followed by one of the listed characters.
+            bool goodPredecessor = Contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0;
+            if (!goodPredecessor)
+            {
+                return false;
+            }
+            return Contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) ||
+                   index + 1 == value.Length - 1;
+        }
+
+        /// <summary>
+        /// Complex condition 0 for 'L'. Holds when the value ends in "ILLO", "ILLA" or "ALLE" at
+        /// this position, or when "ALLE" surrounds the 'L' and the value ends in "AS", "OS", "A"
+        /// or "O".
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="index">Position of the first 'L' of "LL".</param>
+        /// <returns><c>true</c> when the condition holds.</returns>
+        private bool ConditionL0(string value, int index)
+        {
+            int length = value.Length;
+
+            if (index == length - 3 &&
+                Contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE"))
+            {
+                return true;
+            }
+
+            bool listedEnding = Contains(value, length - 2, 2, "AS", "OS") ||
+                                Contains(value, length - 1, 1, "A", "O");
+            return listedEnding && Contains(value, index - 1, 4, "ALLE");
+        }
+
+        /// <summary>
+        /// Complex condition 0 for 'M'. Holds for "MM", and for "UMB" around the 'M' when the 'B'
+        /// is the last character or is followed by "ER". The caller then consumes two characters
+        /// instead of one.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input being encoded.</param>
+        /// <param name="index">Position of the 'M' being examined.</param>
+        /// <returns><c>true</c> when the condition holds.</returns>
+        private bool ConditionM0(string value, int index)
+        {
+            if (CharAt(value, index + 1) == 'M')
+            {
+                return true;
+            }
+
+            if (!Contains(value, index - 1, 3, "UMB"))
+            {
+                return false;
+            }
+
+            bool bIsLast = (index + 1) == value.Length - 1;
+            return bIsLast || Contains(value, index + 2, 2, "ER");
+        }
+
+ //-- BEGIN HELPER FUNCTIONS --//
+
+        /// <summary>
+        /// Determines whether or not a value is of slavo-germanic origin. A value is
+        /// of slavo-germanic origin if it contains any of 'W', 'K', 'CZ', or 'WITZ'.
+        /// <para/>
+        /// The multi-character searches are ordinal, so the result does not depend on the current
+        /// thread culture and ignorable characters (such as a soft hyphen) are never skipped.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input; must not be <c>null</c>.</param>
+        /// <returns><c>true</c> if any of the markers occurs in <paramref name="value"/>.</returns>
+        private bool IsSlavoGermanic(string value)
+        {
+            // IndexOf(char) is always ordinal; IndexOf(string) is culture-sensitive unless a
+            // StringComparison is supplied, so it is passed explicitly.
+            return value.IndexOf('W') > -1 || value.IndexOf('K') > -1 ||
+                value.IndexOf("CZ", StringComparison.Ordinal) > -1 ||
+                value.IndexOf("WITZ", StringComparison.Ordinal) > -1;
+        }
+
+        /// <summary>
+        /// Tells whether <paramref name="ch"/> is one of the characters listed in <see cref="VOWELS"/>.
+        /// </summary>
+        /// <param name="ch">The character to test.</param>
+        /// <returns><c>true</c> if the character is listed; <c>false</c> otherwise.</returns>
+        private bool IsVowel(char ch)
+        {
+            int position = VOWELS.IndexOf(ch);
+            return position >= 0;
+        }
+
+        /// <summary>
+        /// Determines whether the value starts with a silent letter, i.e. with one of the
+        /// prefixes listed in <see cref="SILENT_START"/> ('GN', 'KN', 'PN', 'WR' or 'PS').
+        /// The comparison is ordinal.
+        /// </summary>
+        /// <param name="value">The cleaned, upper-cased input; must not be <c>null</c>.</param>
+        /// <returns><c>true</c> if the value starts with one of the prefixes.</returns>
+        private bool IsSilentStart(string value)
+        {
+            foreach (string prefix in SILENT_START)
+            {
+                if (value.StartsWith(prefix, StringComparison.Ordinal))
+                {
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        /// <summary>
+        /// Normalizes the input for encoding: surrounding whitespace is trimmed and the rest is
+        /// upper-cased with the "en" culture.
+        /// </summary>
+        /// <param name="input">The raw value; may be <c>null</c>.</param>
+        /// <returns>
+        /// The trimmed, upper-cased text, or <c>null</c> when <paramref name="input"/> is
+        /// <c>null</c> or nothing is left after trimming.
+        /// </returns>
+        private string CleanInput(string input)
+        {
+            if (input == null)
+            {
+                return null;
+            }
+
+            string trimmed = input.Trim();
+            return trimmed.Length == 0
+                ? null
+                : new CultureInfo("en").TextInfo.ToUpper(trimmed);
+        }
+
+        /// <summary>
+        /// Reads the character at <paramref name="index"/>, or <see cref="char.MinValue"/> when the
+        /// index lies outside <paramref name="value"/>, so that callers can probe neighbouring
+        /// positions without bounds checks of their own.
+        /// </summary>
+        /// <param name="value">The text to read from; must not be <c>null</c>.</param>
+        /// <param name="index">The position to read; may be negative or past the end.</param>
+        /// <returns>The character at the position, or <see cref="char.MinValue"/> as a default.</returns>
+        protected virtual char CharAt(string value, int index)
+        {
+            bool inRange = index >= 0 && index < value.Length;
+            return inRange ? value[index] : char.MinValue;
+        }
+
+        /// <summary>
+        /// Determines whether the <paramref name="length"/> characters of <paramref name="value"/> starting
+        /// at index <paramref name="start"/> are exactly equal (ordinal comparison) to any of the
+        /// <paramref name="criteria"/>.
+        /// <para/>
+        /// A window that does not lie completely inside <paramref name="value"/> (negative
+        /// <paramref name="start"/>, or running past the end) never matches, so callers may probe
+        /// positions before the start or beyond the end of the value without bounds checks.
+        /// </summary>
+        /// <param name="value">The text to inspect; must not be <c>null</c>.</param>
+        /// <param name="start">Index of the first character of the window.</param>
+        /// <param name="length">Number of characters in the window.</param>
+        /// <param name="criteria">Candidate strings; only candidates of exactly <paramref name="length"/> characters can match.</param>
+        /// <returns><c>true</c> if one of the candidates equals the window; <c>false</c> otherwise.</returns>
+        protected static bool Contains(string value, int start, int length,
+            params string[] criteria)
+        {
+            if (start < 0 || start + length > value.Length)
+            {
+                return false;
+            }
+
+            foreach (string element in criteria)
+            {
+                // Compare in place instead of materializing the window with Substring: this method
+                // is probed many times per encoded character. The length check keeps the
+                // "whole window equals whole candidate" semantics.
+                if (element != null && element.Length == length &&
+                    string.CompareOrdinal(value, start, element, 0, length) == 0)
+                {
+                    return true;
+                }
+            }
+            return false;
+        }
+
+ //-- BEGIN INNER CLASSES --//
+
+        /// <summary>
+        /// Accumulates the primary and the alternate code while a value is being encoded. Each
+        /// code is capped at the maximum length given at construction: characters appended beyond
+        /// that length are silently dropped.
+        /// </summary>
+        public class DoubleMetaphoneResult
+        {
+            private readonly int maxLength;
+            private readonly StringBuilder primary;
+            private readonly StringBuilder alternate;
+
+            /// <summary>
+            /// Creates an empty result whose two codes may each grow to <paramref name="maxLength"/> characters.
+            /// </summary>
+            public DoubleMetaphoneResult(int maxLength)
+            {
+                this.maxLength = maxLength;
+                this.primary = new StringBuilder(maxLength);
+                this.alternate = new StringBuilder(maxLength);
+            }
+
+            /// <summary>Appends the same character to both codes.</summary>
+            public virtual void Append(char value)
+            {
+                AppendPrimary(value);
+                AppendAlternate(value);
+            }
+
+            /// <summary>Appends one character to the primary code and another to the alternate code.</summary>
+            public virtual void Append(char primary, char alternate)
+            {
+                AppendPrimary(primary);
+                AppendAlternate(alternate);
+            }
+
+            /// <summary>Appends a character to the primary code unless that code is already full.</summary>
+            public virtual void AppendPrimary(char value)
+            {
+                if (this.primary.Length >= this.maxLength)
+                {
+                    return;
+                }
+                this.primary.Append(value);
+            }
+
+            /// <summary>Appends a character to the alternate code unless that code is already full.</summary>
+            public virtual void AppendAlternate(char value)
+            {
+                if (this.alternate.Length >= this.maxLength)
+                {
+                    return;
+                }
+                this.alternate.Append(value);
+            }
+
+            /// <summary>Appends the same string to both codes.</summary>
+            public virtual void Append(string value)
+            {
+                AppendPrimary(value);
+                AppendAlternate(value);
+            }
+
+            /// <summary>Appends one string to the primary code and another to the alternate code.</summary>
+            public virtual void Append(string primary, string alternate)
+            {
+                AppendPrimary(primary);
+                AppendAlternate(alternate);
+            }
+
+            /// <summary>Appends as much of the string to the primary code as still fits.</summary>
+            public virtual void AppendPrimary(string value)
+            {
+                int room = this.maxLength - this.primary.Length;
+                string accepted = value.Length > room ? value.Substring(0, room) : value;
+                this.primary.Append(accepted);
+            }
+
+            /// <summary>Appends as much of the string to the alternate code as still fits.</summary>
+            public virtual void AppendAlternate(string value)
+            {
+                int room = this.maxLength - this.alternate.Length;
+                string accepted = value.Length > room ? value.Substring(0, room) : value;
+                this.alternate.Append(accepted);
+            }
+
+            /// <summary>Gets the primary code built so far.</summary>
+            public virtual string Primary
+            {
+                get { return this.primary.ToString(); }
+            }
+
+            /// <summary>Gets the alternate code built so far.</summary>
+            public virtual string Alternate
+            {
+                get { return this.alternate.ToString(); }
+            }
+
+            /// <summary>Gets whether both codes have reached the maximum length.</summary>
+            public virtual bool IsComplete
+            {
+                get
+                {
+                    bool primaryFull = this.primary.Length >= this.maxLength;
+                    bool alternateFull = this.alternate.Length >= this.maxLength;
+                    return primaryFull && alternateFull;
+                }
+            }
+        }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/MatchRatingApproachEncoder.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/MatchRatingApproachEncoder.cs b/src/Lucene.Net.Analysis.Phonetic/Language/MatchRatingApproachEncoder.cs
new file mode 100644
index 0000000..c30e571
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/MatchRatingApproachEncoder.cs
@@ -0,0 +1,425 @@
+// commons-codec version compatibility level: 1.9
+using System;
+using System.Globalization;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Match Rating Approach Phonetic Algorithm Developed by <c>Western Airlines</c> in 1977.
+ /// <para/>
+ /// This class is immutable and thread-safe.
+ /// <para/>
+ /// See: <a href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a>
+ /// <para/>
+ /// since 1.8
+ /// </summary>
+ public class MatchRatingApproachEncoder : IStringEncoder
+ {
+ /// <summary>
+ /// A single space character.
+ /// </summary>
+ private static readonly string SPACE = " ";
+
+ /// <summary>
+ /// The empty string; used both as a return value and as the replacement text when removing characters.
+ /// </summary>
+ private static readonly string EMPTY = "";
+
+ /// <summary>
+ /// Constants used mainly for the min rating value.
+ /// </summary>
+ private static readonly int ONE = 1, TWO = 2, THREE = 3, FOUR = 4, FIVE = 5, SIX = 6, SEVEN = 7, EIGHT = 8,
+ ELEVEN = 11, TWELVE = 12;
+
+ /// <summary>
+ /// The plain letter equivalent of the accented letters. The character at each index is the
+ /// replacement for the character at the same index of <see cref="UNICODE"/>, so the two strings
+ /// must be kept the same length and in the same order.
+ /// </summary>
+ private static readonly string PLAIN_ASCII = "AaEeIiOoUu" + // grave
+ "AaEeIiOoUuYy" + // acute
+ "AaEeIiOoUuYy" + // circumflex
+ "AaOoNn" + // tilde
+ "AaEeIiOoUuYy" + // umlaut
+ "Aa" + // ring
+ "Cc" + // cedilla
+ "OoUu"; // double acute
+
+ /// <summary>
+ /// Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc...
+ /// Grouped in the same order as <see cref="PLAIN_ASCII"/>: grave, acute, circumflex, tilde, umlaut,
+ /// ring, cedilla, double acute.
+ /// </summary>
+ private static readonly string UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
+ "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
+ "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" +
+ "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
+ "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" +
+ "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171";
+
+ /// <summary>
+ /// The upper-case doubled consonants that are collapsed to a single letter during encoding.
+ /// </summary>
+ private static readonly string[] DOUBLE_CONSONANT =
+ new string[] { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
+ "TT", "VV", "WW", "XX", "YY", "ZZ" };
+
+ /// <summary>
+ /// Normalizes a name ahead of encoding. In order: upper-cases it using the "en" culture, strips
+ /// hyphens, ampersands, apostrophes, periods and commas, replaces accented letters with their
+ /// plain equivalents, and finally drops all whitespace.
+ /// </summary>
+ /// <param name="name">The name to be cleaned.</param>
+ /// <returns>The cleaned name.</returns>
+ internal string CleanName(string name)
+ {
+     string cleaned = new CultureInfo("en").TextInfo.ToUpper(name);
+
+     // Each entry is a regular expression matching one punctuation character to discard.
+     string[] punctuationPatterns = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
+     for (int i = 0; i < punctuationPatterns.Length; i++)
+     {
+         cleaned = Regex.Replace(cleaned, punctuationPatterns[i], EMPTY);
+     }
+
+     string withoutAccents = RemoveAccents(cleaned);
+     return Regex.Replace(withoutAccents, "\\s+", EMPTY);
+ }
+
+ // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
+ // **
+ // * Encodes an Object using the Match Rating Approach algorithm. Method is here to satisfy the requirements of the
+ // * Encoder interface Throws an EncoderException if input object is not of type java.lang.string.
+ // *
+ // * @param pObject
+ // * Object to encode
+ // * @return An object (or type java.lang.string) containing the Match Rating Approach code which corresponds to the
+ // * string supplied.
+ // * @throws EncoderException
+ // * if the parameter supplied is not of type java.lang.string
+ // */
+ //public Object encode(Object pObject) throws EncoderException
+ //{
+ //if (!(pObject instanceof string)) {
+ // throw new EncoderException(
+ // "Parameter supplied to Match Rating Approach encoder is not of type java.lang.string");
+ // }
+ //return encode((string) pObject);
+ //}
+
+ /// <summary>
+ /// Encodes a string using the Match Rating Approach (MRA) algorithm.
+ /// </summary>
+ /// <param name="name">String to encode.</param>
+ /// <returns>
+ /// The MRA code corresponding to the string supplied, or an empty string when
+ /// <paramref name="name"/> is <c>null</c>, shorter than two characters, or contains nothing
+ /// encodable once it has been cleaned.
+ /// </returns>
+ public string Encode(string name)
+ {
+     // Bulletproof for trivial input - NINO. A length of at most one covers the empty string,
+     // a lone space and any other single character.
+     if (name == null || name.Length <= 1)
+     {
+         return EMPTY;
+     }
+
+     // Preprocessing
+     name = CleanName(name);
+
+     // Cleaning strips punctuation and whitespace, so input such as "--" or "  " is empty by
+     // now and there is nothing left to encode.
+     if (name.Length == 0)
+     {
+         return EMPTY;
+     }
+
+     // BEGIN: Actual encoding part of the algorithm...
+     // 1. Delete all vowels unless the vowel begins the word
+     name = RemoveVowels(name);
+
+     // 2. Remove second consonant from any double consonant
+     name = RemoveDoubleConsonants(name);
+
+     // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters
+     name = GetFirst3Last3(name);
+
+     return name;
+ }
+
+ /// <summary>
+ /// Joins the first three and the last three letters of a name that is longer than six
+ /// characters; shorter names are returned unchanged.
+ /// </summary>
+ /// <param name="name">The string to get the substrings from.</param>
+ /// <returns>The first three letters followed by the last three letters, or the input itself.</returns>
+ internal string GetFirst3Last3(string name)
+ {
+     if (name.Length <= SIX)
+     {
+         return name;
+     }
+
+     string head = name.Substring(0, THREE);
+     string tail = name.Substring(name.Length - THREE);
+     return head + tail;
+ }
+
+ /// <summary>
+ /// Maps the combined length of two encoded names to the minimum similarity rating they must
+ /// reach to count as a match. The longer the combined length, the lower the required rating.
+ /// </summary>
+ /// <param name="sumLength">The length of 2 strings sent down.</param>
+ /// <returns>The min rating value.</returns>
+ internal int GetMinRating(int sumLength)
+ {
+     // Bands are checked in ascending order, so each test only needs its upper bound.
+     if (sumLength <= FOUR)
+     {
+         return FIVE;
+     }
+
+     if (sumLength <= SEVEN)
+     {
+         return FOUR;
+     }
+
+     if (sumLength <= ELEVEN)
+     {
+         return THREE;
+     }
+
+     if (sumLength == TWELVE)
+     {
+         return TWO;
+     }
+
+     return ONE; // docs said little here.
+ }
+
+ /// <summary>
+ /// Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the
+ /// strings are cleaned in the same way as <see cref="Encode(string)"/>.
+ /// </summary>
+ /// <param name="name1">First of the 2 strings (names) to compare.</param>
+ /// <param name="name2">Second of the 2 names to compare.</param>
+ /// <returns>
+ /// <c>true</c> if the names are equal ignoring case, or if their similarity rating reaches the
+ /// minimum rating for their combined encoded length; <c>false</c> otherwise, including when either
+ /// name is <c>null</c>, shorter than two characters, or empty once cleaned.
+ /// </returns>
+ public virtual bool IsEncodeEquals(string name1, string name2)
+ {
+     // Bulletproof for trivial input - NINO. A length of at most one covers the empty string,
+     // a lone space and any other single character.
+     if (name1 == null || name2 == null || name1.Length <= 1 || name2.Length <= 1)
+     {
+         return false;
+     }
+
+     if (name1.Equals(name2, StringComparison.OrdinalIgnoreCase))
+     {
+         return true;
+     }
+
+     // Preprocessing
+     name1 = CleanName(name1);
+     name2 = CleanName(name2);
+
+     // Cleaning can strip a name down to nothing (e.g. "--"). An empty name carries no
+     // phonetic information, so it is never rated as a match.
+     if (name1.Length == 0 || name2.Length == 0)
+     {
+         return false;
+     }
+
+     // Actual MRA Algorithm
+
+     // 1. Remove vowels
+     name1 = RemoveVowels(name1);
+     name2 = RemoveVowels(name2);
+
+     // 2. Remove double consonants
+     name1 = RemoveDoubleConsonants(name1);
+     name2 = RemoveDoubleConsonants(name2);
+
+     // 3. Reduce down to 3 letters
+     name1 = GetFirst3Last3(name1);
+     name2 = GetFirst3Last3(name2);
+
+     // 4. Check for length difference - if 3 or greater then no similarity
+     // comparison is done
+     if (Math.Abs(name1.Length - name2.Length) >= THREE)
+     {
+         return false;
+     }
+
+     // 5. Obtain the minimum rating value by calculating the length sum of the
+     // encoded strings and sending it down.
+     int sumLength = name1.Length + name2.Length;
+     int minRating = GetMinRating(sumLength);
+
+     // 6. Process the encoded strings from left to right and remove any
+     // identical characters found from both strings respectively.
+     int count = LeftToRightThenRightToLeftProcessing(name1, name2);
+
+     // 7. Each PNI item that has a similarity rating equal to or greater than
+     // the min is considered to be a good candidate match
+     return count >= minRating;
+ }
+
+ /// <summary>
+ /// Determines if a letter is a vowel (A, E, I, O or U, in either case).
+ /// </summary>
+ /// <param name="letter">The letter under investigation.</param>
+ /// <returns><c>true</c> if a vowel, else <c>false</c>.</returns>
+ internal bool IsVowel(string letter)
+ {
+     string[] vowels = { "E", "A", "O", "I", "U" };
+     foreach (string vowel in vowels)
+     {
+         if (letter.Equals(vowel, StringComparison.OrdinalIgnoreCase))
+         {
+             return true;
+         }
+     }
+
+     return false;
+ }
+
+ /// <summary>
+ /// Walks both names from the left and from the right at the same time, blanking out letters
+ /// that are identical at the same position from either end. The length of the longer string
+ /// that remains is then subtracted from 6 and the absolute value of that difference is returned.
+ /// </summary>
+ /// <param name="name1">First encoded name.</param>
+ /// <param name="name2">Second encoded name.</param>
+ /// <returns>The similarity rating derived from the unmatched letters.</returns>
+ internal int LeftToRightThenRightToLeftProcessing(string name1, string name2)
+ {
+     char[] remaining1 = name1.ToCharArray();
+     char[] remaining2 = name2.ToCharArray();
+
+     int last1 = name1.Length - 1;
+     int last2 = name2.Length - 1;
+
+     // Only offsets that exist in both names can be compared.
+     int overlap = Math.Min(name1.Length, name2.Length);
+
+     for (int offset = 0; offset < overlap; offset++)
+     {
+         // Comparisons always read the untouched input strings, so a position blanked by
+         // one direction can still match in the other.
+         bool sameFromLeft = name1[offset] == name2[offset];
+         bool sameFromRight = name1[last1 - offset] == name2[last2 - offset];
+
+         // Left to right...
+         if (sameFromLeft)
+         {
+             remaining1[offset] = ' ';
+             remaining2[offset] = ' ';
+         }
+
+         // Right to left...
+         if (sameFromRight)
+         {
+             remaining1[last1 - offset] = ' ';
+             remaining2[last2 - offset] = ' ';
+         }
+     }
+
+     // Char arrays -> string & remove extraneous space
+     string leftover1 = Regex.Replace(new string(remaining1), "\\s+", EMPTY);
+     string leftover2 = Regex.Replace(new string(remaining2), "\\s+", EMPTY);
+
+     // Final bit - subtract longest string from 6 and return this int value
+     int longest = Math.Max(leftover1.Length, leftover2.Length);
+     return Math.Abs(SIX - longest);
+ }
+
+ /// <summary>
+ /// Replaces each accented letter listed in <c>UNICODE</c> with the plain letter at the same
+ /// position in <c>PLAIN_ASCII</c>; every other character is copied through unchanged, so case
+ /// is preserved.
+ /// http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
+ /// </summary>
+ /// <param name="accentedWord">The word that may have accents in it.</param>
+ /// <returns>De-accented word, or <c>null</c> when the input is <c>null</c>.</returns>
+ internal string RemoveAccents(string accentedWord)
+ {
+     if (accentedWord == null)
+     {
+         return null;
+     }
+
+     StringBuilder plain = new StringBuilder();
+     foreach (char ch in accentedWord)
+     {
+         int accentIndex = UNICODE.IndexOf(ch);
+         plain.Append(accentIndex >= 0 ? PLAIN_ASCII[accentIndex] : ch);
+     }
+
+     return plain.ToString();
+ }
+
+ /// <summary>
+ /// Upper-cases the name and collapses each doubled consonant pair into a single letter.
+ /// </summary>
+ /// <param name="name">String to have double consonants removed.</param>
+ /// <returns>Single consonant word.</returns>
+ internal string RemoveDoubleConsonants(string name)
+ {
+     string collapsed = name.ToUpperInvariant();
+     for (int i = 0; i < DOUBLE_CONSONANT.Length; i++)
+     {
+         string pair = DOUBLE_CONSONANT[i];
+
+         // Replace leaves the text untouched when the pair does not occur.
+         collapsed = collapsed.Replace(pair, pair[0].ToString());
+     }
+
+     return collapsed;
+ }
+
+ /// <summary>
+ /// Deletes all upper-case vowels (A, E, I, O, U) unless the vowel begins the word.
+ /// </summary>
+ /// <param name="name">The name to have vowels removed.</param>
+ /// <returns>De-voweled word; an empty input is returned unchanged.</returns>
+ internal string RemoveVowels(string name)
+ {
+     // Nothing to strip, and there is no first letter to extract below.
+     if (name.Length == 0)
+     {
+         return name;
+     }
+
+     // Extract first letter
+     string firstLetter = name.Substring(0, 1);
+
+     // Plain ordinal replacement; the vowels are literals, so no regular expression is needed.
+     string[] vowels = { "A", "E", "I", "O", "U" };
+     foreach (string vowel in vowels)
+     {
+         name = name.Replace(vowel, EMPTY);
+     }
+
+     name = Regex.Replace(name, "\\s{2,}\\b", SPACE);
+
+     // A leading vowel was removed along with the others above, so put it back.
+     return IsVowel(firstLetter) ? firstLetter + name : name;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Metaphone.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Metaphone.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Metaphone.cs
new file mode 100644
index 0000000..dd3038f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Metaphone.cs
@@ -0,0 +1,494 @@
+// commons-codec version compatibility level: 1.9
+using System.Globalization;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Encodes a string into a Metaphone value.
+ /// <para/>
+ /// Initial Java implementation by <c>William B. Brogden. December, 1997</c>.
+ /// Permission given by <c>wbrogden</c> for code to be used anywhere.
+ /// <para/>
+ /// <c>Hanging on the Metaphone</c> by <c>Lawrence Philips</c> in <c>Computer Language of Dec. 1990,
+ /// p 39.</c>
+ /// <para/>
+ /// Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
+ /// <para/>
+ /// <list type="bullet">
+ /// <item><description><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> (broken link 4/30/2013) </description></item>
+ /// <item><description><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> (link checked 4/30/2013) </description></item>
+ /// </list>
+ /// <para/>
+ /// They have had undocumented changes from the originally published algorithm.
+ /// For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
+ /// <para/>
+ /// This class is conditionally thread-safe.
+ /// The instance field <see cref="maxCodeLen"/> is mutable (it is set through <see cref="MaxCodeLen"/>)
+ /// but is not volatile, and accesses are not synchronized.
+ /// If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
+ /// is used to ensure safe publication of the value between threads, and must not set <see cref="MaxCodeLen"/>
+ /// after initial setup.
+ /// </summary>
+ public class Metaphone : IStringEncoder
+ {
+ /// <summary>
+ /// The five letters treated as vowels by the algorithm.
+ /// </summary>
+ private static readonly string VOWELS = "AEIOU";
+
+ /// <summary>
+ /// Letters that, when they follow C, G or DG, select the soft encoding
+ /// (CI, CE, CY -> S; DGE, DGI, DGY -> J; G before one of them -> J).
+ /// </summary>
+ private static readonly string FRONTV = "EIY";
+
+ /// <summary>
+ /// Letters after which a following H contributes nothing to the code.
+ /// </summary>
+ private static readonly string VARSON = "CSPTG";
+
+ /// <summary>
+ /// The maximum length of a generated code; defaults to 4.
+ /// </summary>
+ private int maxCodeLen = 4;
+
+ /// <summary>
+ /// Initializes a new <see cref="Metaphone"/> encoder.
+ /// </summary>
+ public Metaphone()
+ {
+ }
+
+ /// <summary>
+ /// Find the metaphone value of a string. This is similar to the
+ /// soundex algorithm, but better at finding similar sounding words.
+ /// All input is converted to upper case.
+ /// Limitations: Input format is expected to be a single ASCII word
+ /// with only characters in the A - Z range, no punctuation or numbers.
+ /// <para/>
+ /// A <c>null</c> or empty input yields an empty string, and a single-character input is
+ /// returned upper-cased without further encoding. Otherwise the code is built one symbol
+ /// at a time and never exceeds <see cref="MaxCodeLen"/> characters.
+ /// </summary>
+ /// <param name="txt">String to find the metaphone code for.</param>
+ /// <returns>A metaphone code corresponding to the string supplied.</returns>
+ public virtual string GetMetaphone(string txt)
+ {
+ // set while encoding 'G': true when the previous character is also 'G'
+ bool hard = false;
+ if (txt == null || txt.Length == 0)
+ {
+ return "";
+ }
+ // single character is itself
+ if (txt.Length == 1)
+ {
+ return new CultureInfo("en").TextInfo.ToUpper(txt);
+ }
+
+ // from here on the input has at least two characters, so inwd[1] is always valid
+ char[] inwd = new CultureInfo("en").TextInfo.ToUpper(txt).ToCharArray();
+
+ StringBuilder local = new StringBuilder(40); // manipulate
+ StringBuilder code = new StringBuilder(10); // output
+ // handle initial 2 characters exceptions
+ switch (inwd[0])
+ {
+ case 'K':
+ case 'G':
+ case 'P': /* looking for KN, etc*/
+ if (inwd[1] == 'N')
+ {
+ // drop the leading letter: KN, GN, PN -> N
+ local.Append(inwd, 1, inwd.Length - 1);
+ }
+ else
+ {
+ local.Append(inwd);
+ }
+ break;
+ case 'A': /* looking for AE */
+ if (inwd[1] == 'E')
+ {
+ // drop the leading A: AE -> E
+ local.Append(inwd, 1, inwd.Length - 1);
+ }
+ else
+ {
+ local.Append(inwd);
+ }
+ break;
+ case 'W': /* looking for WR or WH */
+ if (inwd[1] == 'R')
+ { // WR -> R
+ local.Append(inwd, 1, inwd.Length - 1);
+ break;
+ }
+ if (inwd[1] == 'H')
+ {
+ // copy from the H onwards, then overwrite that H with W
+ local.Append(inwd, 1, inwd.Length - 1);
+ local[0] = 'W'; // WH -> W
+ }
+ else
+ {
+ local.Append(inwd);
+ }
+ break;
+ case 'X': /* initial X becomes S */
+ inwd[0] = 'S';
+ local.Append(inwd);
+ break;
+ default:
+ local.Append(inwd);
+ break;
+ } // now local has working string with initials fixed
+
+ int wdsz = local.Length;
+ int n = 0;
+
+ // n indexes the working string; stop once the code is full or the word is consumed
+ while (code.Length < this.MaxCodeLen &&
+ n < wdsz)
+ { // max code size of 4 works well
+ char symb = local[n];
+ // remove duplicate letters except C
+ if (symb != 'C' && IsPreviousChar(local, n, symb))
+ {
+ n++;
+ }
+ else
+ { // not dup
+ switch (symb)
+ {
+ case 'A':
+ case 'E':
+ case 'I':
+ case 'O':
+ case 'U':
+ if (n == 0)
+ {
+ code.Append(symb);
+ }
+ break; // only use vowel if leading char
+ case 'B':
+ if (IsPreviousChar(local, n, 'M') &&
+ IsLastChar(wdsz, n))
+ { // B is silent if word ends in MB
+ break;
+ }
+ code.Append(symb);
+ break;
+ case 'C': // lots of C special cases
+ /* discard if SCI, SCE or SCY */
+ if (IsPreviousChar(local, n, 'S') &&
+ !IsLastChar(wdsz, n) &&
+ FRONTV.IndexOf(local[n + 1]) >= 0)
+ {
+ break;
+ }
+ if (RegionMatch(local, n, "CIA"))
+ { // "CIA" -> X
+ code.Append('X');
+ break;
+ }
+ if (!IsLastChar(wdsz, n) &&
+ FRONTV.IndexOf(local[n + 1]) >= 0)
+ {
+ code.Append('S');
+ break; // CI,CE,CY -> S
+ }
+ if (IsPreviousChar(local, n, 'S') &&
+ IsNextChar(local, n, 'H'))
+ { // SCH->sk
+ code.Append('K');
+ break;
+ }
+ if (IsNextChar(local, n, 'H'))
+ { // detect CH
+ // NOTE(review): the condition below tests for a vowel at index 2, while the
+ // inline comments describe the consonant case - confirm which is intended
+ if (n == 0 &&
+ wdsz >= 3 &&
+ IsVowel(local, 2))
+ { // CH consonant -> K consonant
+ code.Append('K');
+ }
+ else
+ {
+ code.Append('X'); // CHvowel -> X
+ }
+ }
+ else
+ {
+ code.Append('K');
+ }
+ break;
+ case 'D':
+ // the first test guarantees index n + 2 exists whenever the third test is reached
+ if (!IsLastChar(wdsz, n + 1) &&
+ IsNextChar(local, n, 'G') &&
+ FRONTV.IndexOf(local[n + 2]) >= 0)
+ { // DGE DGI DGY -> J
+ code.Append('J'); n += 2;
+ }
+ else
+ {
+ code.Append('T');
+ }
+ break;
+ case 'G': // GH silent at end or before consonant
+ if (IsLastChar(wdsz, n + 1) &&
+ IsNextChar(local, n, 'H'))
+ {
+ break;
+ }
+ if (!IsLastChar(wdsz, n + 1) &&
+ IsNextChar(local, n, 'H') &&
+ !IsVowel(local, n + 2))
+ {
+ break;
+ }
+ if (n > 0 &&
+ (RegionMatch(local, n, "GN") ||
+ RegionMatch(local, n, "GNED")))
+ {
+ break; // silent G
+ }
+ if (IsPreviousChar(local, n, 'G'))
+ {
+ // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
+ hard = true;
+ }
+ else
+ {
+ hard = false;
+ }
+ if (!IsLastChar(wdsz, n) &&
+ FRONTV.IndexOf(local[n + 1]) >= 0 &&
+ !hard)
+ {
+ code.Append('J');
+ }
+ else
+ {
+ code.Append('K');
+ }
+ break;
+ case 'H':
+ if (IsLastChar(wdsz, n))
+ {
+ break; // terminal H
+ }
+ if (n > 0 &&
+ VARSON.IndexOf(local[n - 1]) >= 0)
+ {
+ break;
+ }
+ if (IsVowel(local, n + 1))
+ {
+ code.Append('H'); // Hvowel
+ }
+ break;
+ case 'F':
+ case 'J':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'R':
+ code.Append(symb);
+ break;
+ case 'K':
+ if (n > 0)
+ { // not initial
+ // K directly after C adds nothing: CK -> K
+ if (!IsPreviousChar(local, n, 'C'))
+ {
+ code.Append(symb);
+ }
+ }
+ else
+ {
+ code.Append(symb); // initial K
+ }
+ break;
+ case 'P':
+ if (IsNextChar(local, n, 'H'))
+ {
+ // PH -> F
+ code.Append('F');
+ }
+ else
+ {
+ code.Append(symb);
+ }
+ break;
+ case 'Q':
+ code.Append('K');
+ break;
+ case 'S':
+ if (RegionMatch(local, n, "SH") ||
+ RegionMatch(local, n, "SIO") ||
+ RegionMatch(local, n, "SIA"))
+ {
+ code.Append('X');
+ }
+ else
+ {
+ code.Append('S');
+ }
+ break;
+ case 'T':
+ if (RegionMatch(local, n, "TIA") ||
+ RegionMatch(local, n, "TIO"))
+ {
+ code.Append('X');
+ break;
+ }
+ if (RegionMatch(local, n, "TCH"))
+ {
+ // Silent if in "TCH"
+ break;
+ }
+ // substitute numeral 0 for TH (resembles theta after all)
+ if (RegionMatch(local, n, "TH"))
+ {
+ code.Append('0');
+ }
+ else
+ {
+ code.Append('T');
+ }
+ break;
+ case 'V':
+ code.Append('F'); break;
+ case 'W':
+ case 'Y': // silent if not followed by vowel
+ if (!IsLastChar(wdsz, n) &&
+ IsVowel(local, n + 1))
+ {
+ code.Append(symb);
+ }
+ break;
+ case 'X':
+ code.Append('K');
+ code.Append('S');
+ break;
+ case 'Z':
+ code.Append('S');
+ break;
+ default:
+ // do nothing
+ break;
+ } // end switch
+ n++;
+ } // end else from symb != 'C'
+ // 'X' appends two characters at once, so the code can overshoot by one; trim it back
+ if (code.Length > this.MaxCodeLen)
+ {
+ code.Length = this.MaxCodeLen;
+ }
+ }
+ return code.ToString();
+ }
+
+ /// <summary>
+ /// Tests whether the character at <paramref name="index"/> in the buffer is one of the vowels.
+ /// </summary>
+ /// <param name="sb">Buffer holding the working string.</param>
+ /// <param name="index">Position to inspect; must be a valid index into the buffer.</param>
+ /// <returns><c>true</c> if the character is a vowel, else <c>false</c>.</returns>
+ private bool IsVowel(StringBuilder sb, int index)
+ {
+     char candidate = sb[index];
+     return VOWELS.IndexOf(candidate) != -1;
+ }
+
+ /// <summary>
+ /// Tests whether the character immediately before <paramref name="index"/> equals <paramref name="c"/>.
+ /// </summary>
+ /// <param name="sb">Buffer holding the working string.</param>
+ /// <param name="index">Position whose predecessor is inspected.</param>
+ /// <param name="c">Character to compare against.</param>
+ /// <returns>
+ /// <c>true</c> on a match; <c>false</c> on a mismatch, when <paramref name="index"/> is the
+ /// first position, or when it lies outside the buffer.
+ /// </returns>
+ private bool IsPreviousChar(StringBuilder sb, int index, char c)
+ {
+     if (index <= 0 || index >= sb.Length)
+     {
+         return false;
+     }
+
+     return sb[index - 1] == c;
+ }
+
+ /// <summary>
+ /// Tests whether the character immediately after <paramref name="index"/> equals <paramref name="c"/>.
+ /// </summary>
+ /// <param name="sb">Buffer holding the working string.</param>
+ /// <param name="index">Position whose successor is inspected.</param>
+ /// <param name="c">Character to compare against.</param>
+ /// <returns>
+ /// <c>true</c> on a match; <c>false</c> on a mismatch, when <paramref name="index"/> is negative,
+ /// or when there is no character after it.
+ /// </returns>
+ private bool IsNextChar(StringBuilder sb, int index, char c)
+ {
+     if (index < 0 || index >= sb.Length - 1)
+     {
+         return false;
+     }
+
+     return sb[index + 1] == c;
+ }
+
+ /// <summary>
+ /// Tests whether the buffer contains <paramref name="test"/> starting at <paramref name="index"/>.
+ /// </summary>
+ /// <param name="sb">Buffer holding the working string.</param>
+ /// <param name="index">Position at which the comparison starts.</param>
+ /// <param name="test">Text expected at that position.</param>
+ /// <returns>
+ /// <c>true</c> if the text is present there; <c>false</c> otherwise, including when
+ /// <paramref name="index"/> is negative or the text would run past the end of the buffer.
+ /// </returns>
+ private bool RegionMatch(StringBuilder sb, int index, string test)
+ {
+     if (index < 0 || index + test.Length - 1 >= sb.Length)
+     {
+         return false;
+     }
+
+     string region = sb.ToString(index, test.Length);
+     return region.Equals(test);
+ }
+
+ /// <summary>
+ /// Tests whether position <paramref name="n"/> is the final position of a word of length <paramref name="wdsz"/>.
+ /// </summary>
+ /// <param name="wdsz">Length of the word.</param>
+ /// <param name="n">Zero-based position to test.</param>
+ /// <returns><c>true</c> if <paramref name="n"/> is the last index, else <c>false</c>.</returns>
+ private bool IsLastChar(int wdsz, int n)
+ {
+     int lengthUpToAndIncludingN = n + 1;
+     return wdsz == lengthUpToAndIncludingN;
+ }
+
+ // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
+ // /**
+ // * Encodes an Object using the metaphone algorithm. This method
+ // * is provided in order to satisfy the requirements of the
+ // * Encoder interface, and will throw an EncoderException if the
+ // * supplied object is not of type java.lang.String.
+ // *
+ // * @param obj Object to encode
+ // * @return An object (or type java.lang.String) containing the
+ // * metaphone code which corresponds to the String supplied.
+ // * @throws EncoderException if the parameter supplied is not
+ // * of type java.lang.String
+ // */
+ // @Override
+ //public object encode(object obj)
+ // {
+ // if (!(obj is String)) {
+ // throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
+ // }
+ // return GetMetaphone((String) obj);
+ // }
+
+ /// <summary>
+ /// Encodes a string using the <see cref="Metaphone"/> algorithm. This simply forwards to
+ /// <see cref="GetMetaphone(string)"/>.
+ /// </summary>
+ /// <param name="str">String to encode.</param>
+ /// <returns>The metaphone code corresponding to the string supplied.</returns>
+ public virtual string Encode(string str)
+ {
+     string metaphone = GetMetaphone(str);
+     return metaphone;
+ }
+
+ /// <summary>
+ /// Tests if the metaphones of two strings are identical.
+ /// </summary>
+ /// <param name="str1">First of two strings to compare.</param>
+ /// <param name="str2">Second of two strings to compare.</param>
+ /// <returns><c>true</c> if the metaphones of these strings are identical, <c>false</c> otherwise.</returns>
+ public virtual bool IsMetaphoneEqual(string str1, string str2)
+ {
+     string first = GetMetaphone(str1);
+     string second = GetMetaphone(str2);
+     return first.Equals(second);
+ }
+
+ /// <summary>
+ /// Gets or sets the maximum length of the codes produced by this encoder
+ /// (backed by <see cref="maxCodeLen"/>).
+ /// </summary>
+ public virtual int MaxCodeLen
+ {
+     get
+     {
+         return this.maxCodeLen;
+     }
+     set
+     {
+         this.maxCodeLen = value;
+     }
+ }
+ }
+}