You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/06/27 20:33:58 UTC
[13/15] lucenenet git commit: Added Lucene.Net.Analysis.Phonetic +
tests. Rather than porting over the entire commons-codec library,
only the language features were ported and added to this library.
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/PhoneticEngine.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/PhoneticEngine.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/PhoneticEngine.cs
new file mode 100644
index 0000000..3cf5c7a
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/PhoneticEngine.cs
@@ -0,0 +1,578 @@
+// commons-codec version compatibility level: 1.9
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Converts words into potential phonetic representations.
+ /// </summary>
+ /// <remarks>
+ /// This is a two-stage process. Firstly, the word is converted into a phonetic representation that takes
+ /// into account the likely source language. Next, this phonetic representation is converted into a
+ /// pan-European 'average' representation, allowing comparison between different versions of essentially
+ /// the same word from different languages.
+ /// <para/>
+ /// This class is intentionally immutable and thread-safe.
+ /// If you wish to alter the settings for a PhoneticEngine, you
+ /// must make a new one with the updated settings.
+ /// <para/>
+ /// Ported from phoneticengine.php
+ /// <para/>
+ /// since 1.6
+ /// </remarks>
+ public class PhoneticEngine
+ {
+ // Pattern used to split the tidied input into words. A Regex instance is immutable once
+ // constructed, so one shared, readonly instance is enough and avoids compiling the
+ // pattern again for every PhoneticEngine that is created.
+ internal static readonly Regex WHITESPACE = new Regex("\\s+", RegexOptions.Compiled);
+
+ /// <summary>
+ /// Utility for manipulating a set of phonemes as they are being built up. Not intended for use outside
+ /// this package, and probably not outside the <see cref="PhoneticEngine"/> class.
+ /// <para/>
+ /// since 1.6
+ /// </summary>
+ internal sealed class PhonemeBuilder
+ {
+     /// <summary>
+     /// An empty builder where all phonemes must come from some set of languages. This will contain a single
+     /// phoneme of zero characters. This can then be appended to. This should be the only way to create a new
+     /// phoneme from scratch.
+     /// </summary>
+     /// <param name="languages">The set of languages.</param>
+     /// <returns>A new, empty phoneme builder.</returns>
+     public static PhonemeBuilder Empty(LanguageSet languages)
+     {
+         return new PhonemeBuilder(new Phoneme("", languages));
+     }
+
+     // Insertion-ordered collection of the phonemes built so far.
+     private readonly IList<Phoneme> phonemes;
+
+     /// <summary>
+     /// Creates a builder that holds exactly one phoneme.
+     /// </summary>
+     /// <param name="phoneme">The initial phoneme.</param>
+     private PhonemeBuilder(Phoneme phoneme)
+     {
+         // LUCENENET NOTE: LinkedHashSet cares about insertion order - in .NET, we can just use List<T> for that
+         this.phonemes = new List<Phoneme>();
+         this.phonemes.Add(phoneme);
+     }
+
+     /// <summary>
+     /// Creates a builder around an existing list of phonemes. The list is stored as-is (not copied),
+     /// so later calls to <see cref="Apply(IPhonemeExpr, int)"/> modify the list that was passed in.
+     /// </summary>
+     /// <param name="phonemes">The list of phonemes to wrap.</param>
+     internal PhonemeBuilder(IList<Phoneme> phonemes)
+     {
+         this.phonemes = phonemes;
+     }
+
+     /// <summary>
+     /// Calls <c>Append</c> with the text of <paramref name="str"/> on every phoneme held by this builder.
+     /// No new builder is created.
+     /// </summary>
+     /// <param name="str">The characters to append to the phonemes.</param>
+     public void Append(ICharSequence str)
+     {
+         foreach (Phoneme ph in this.phonemes)
+         {
+             ph.Append(str.ToString());
+         }
+     }
+
+     /// <summary>
+     /// Calls <c>Append</c> with <paramref name="str"/> on every phoneme held by this builder.
+     /// No new builder is created.
+     /// </summary>
+     /// <param name="str">The characters to append to the phonemes.</param>
+     // LUCENENET specific
+     public void Append(string str)
+     {
+         foreach (Phoneme ph in this.phonemes)
+         {
+             ph.Append(str);
+         }
+     }
+
+     /// <summary>
+     /// Calls <c>Append</c> with the text of <paramref name="str"/> on every phoneme held by this builder.
+     /// No new builder is created.
+     /// </summary>
+     /// <param name="str">The characters to append to the phonemes.</param>
+     // LUCENENET specific
+     public void Append(StringBuilder str)
+     {
+         foreach (Phoneme ph in this.phonemes)
+         {
+             ph.Append(str.ToString());
+         }
+     }
+
+     /// <summary>
+     /// Applies the given phoneme expression to all phonemes in this phoneme builder.
+     /// <para/>
+     /// This will lengthen phonemes that have compatible language sets to the expression, and drop those that are
+     /// incompatible. The builder's phoneme list is replaced in place with the result.
+     /// </summary>
+     /// <param name="phonemeExpr">The expression to apply.</param>
+     /// <param name="maxPhonemes">The maximum number of (distinct) phonemes to build up.</param>
+     public void Apply(IPhonemeExpr phonemeExpr, int maxPhonemes)
+     {
+         // LUCENENET NOTE: LinkedHashSet cares about insertion order - in .NET, we can just use List<T> for that
+         IList<Phoneme> newPhonemes = new List<Phoneme>(maxPhonemes);
+
+         foreach (Phoneme left in this.phonemes)
+         {
+             foreach (Phoneme right in phonemeExpr.Phonemes)
+             {
+                 LanguageSet languages = left.Languages.RestrictTo(right.Languages);
+                 if (languages.IsEmpty)
+                 {
+                     // incompatible language sets - this combination is dropped
+                     continue;
+                 }
+
+                 Phoneme join = new Phoneme(left, right, languages);
+
+                 // LUCENENET: We converted from LinkedHashSet to List, so duplicates are rejected
+                 // at insertion time. Doing it here (rather than after the loop) means a duplicate
+                 // never uses up one of the maxPhonemes slots.
+                 if (newPhonemes.Any(existing => existing.Equals(join)))
+                 {
+                     continue;
+                 }
+
+                 if (newPhonemes.Count < maxPhonemes)
+                 {
+                     newPhonemes.Add(join);
+                     if (newPhonemes.Count >= maxPhonemes)
+                     {
+                         // limit reached - stop combining altogether
+                         goto EXPR_break;
+                     }
+                 }
+             }
+         }
+         EXPR_break: { }
+
+         this.phonemes.Clear();
+         foreach (Phoneme ph in newPhonemes)
+         {
+             this.phonemes.Add(ph);
+         }
+     }
+
+     /// <summary>
+     /// Gets underlying phoneme set. Please don't mutate.
+     /// </summary>
+     public IList<Phoneme> Phonemes
+     {
+         get { return this.phonemes; }
+     }
+
+     /// <summary>
+     /// Stringifies the phoneme set. This produces a single string of the strings of each phoneme,
+     /// joined with a pipe. This is explicitly provided in place of <see cref="object.ToString()"/> as it is a potentially
+     /// expensive operation, which should be avoided when debugging.
+     /// </summary>
+     /// <returns>The stringified phoneme set.</returns>
+     public string MakeString()
+     {
+         StringBuilder sb = new StringBuilder();
+
+         foreach (Phoneme ph in this.phonemes)
+         {
+             // The separator is only written once something has been emitted, so
+             // leading phonemes with empty text do not produce a leading pipe.
+             if (sb.Length > 0)
+             {
+                 sb.Append("|");
+             }
+             sb.Append(ph.GetPhonemeText());
+         }
+
+         return sb.ToString();
+     }
+ }
+
+ /// <summary>
+ /// A function closure capturing the application of a list of rules to an input sequence at a particular offset.
+ /// After invocation, the values <c>i</c> and <c>found</c> are updated. <c>i</c> points to the
+ /// index of the next char in <c>input</c> that must be processed next (the input up to that index having been
+ /// processed already), and <c>found</c> indicates if a matching rule was found or not. In the case where a
+ /// matching rule was found, the rule's phoneme expression is applied to <c>phonemeBuilder</c>.
+ /// <para/>
+ /// Although this class is not thread-safe (it has mutable unprotected fields), it is not shared between threads
+ /// as it is constructed as needed by the calling methods.
+ /// <para/>
+ /// since 1.6
+ /// </summary>
+ private sealed class RulesApplication
+ {
+     private readonly IDictionary<string, IList<Rule>> finalRules;
+     private readonly string input;
+
+     private PhonemeBuilder phonemeBuilder;
+     private int i;
+     private readonly int maxPhonemes;
+     private bool found;
+
+     /// <summary>
+     /// Captures everything needed to apply <paramref name="finalRules"/> at one position of the input.
+     /// </summary>
+     /// <param name="finalRules">Rules keyed by a one-character string (the key looked up in <see cref="Invoke()"/>).</param>
+     /// <param name="input">The text the rules are matched against.</param>
+     /// <param name="phonemeBuilder">The builder that receives the phonemes of a matching rule.</param>
+     /// <param name="i">The index in <paramref name="input"/> at which to match.</param>
+     /// <param name="maxPhonemes">The maximum number of phonemes passed on to the builder.</param>
+     /// <exception cref="ArgumentNullException"><paramref name="finalRules"/> is <c>null</c>.</exception>
+     public RulesApplication(IDictionary<string, IList<Rule>> finalRules, string input,
+                             PhonemeBuilder phonemeBuilder, int i, int maxPhonemes)
+     {
+         if (finalRules == null)
+         {
+             // The single-string constructor of ArgumentNullException takes the parameter NAME,
+             // so the name and the message have to be passed separately.
+             throw new ArgumentNullException("finalRules", "The finalRules argument must not be null");
+         }
+         this.finalRules = finalRules;
+         this.phonemeBuilder = phonemeBuilder;
+         this.input = input;
+         this.i = i;
+         this.maxPhonemes = maxPhonemes;
+     }
+
+     /// <summary>
+     /// Gets the index of the next character of the input that still has to be processed.
+     /// </summary>
+     public int I
+     {
+         get { return this.i; }
+     }
+
+     /// <summary>
+     /// Gets the phoneme builder this application works on.
+     /// </summary>
+     public PhonemeBuilder PhonemeBuilder
+     {
+         get { return this.phonemeBuilder; }
+     }
+
+     /// <summary>
+     /// Invokes the rules. Loops over the rules list, stopping at the first one that has a matching context
+     /// and pattern. Then applies this rule to the phoneme builder to produce updated phonemes. If there was no
+     /// match, <c>i</c> is advanced one and the character is silently dropped from the phonetic spelling.
+     /// </summary>
+     /// <returns><c>this</c></returns>
+     public RulesApplication Invoke()
+     {
+         this.found = false;
+         int patternLength = 1;
+         IList<Rule> rules;
+         // Candidate rules are looked up by the single character at the current position.
+         if (this.finalRules.TryGetValue(input.Substring(i, patternLength), out rules) && rules != null)
+         {
+             foreach (Rule rule in rules)
+             {
+                 string pattern = rule.Pattern;
+                 patternLength = pattern.Length;
+                 if (rule.PatternAndContextMatches(this.input, this.i))
+                 {
+                     this.phonemeBuilder.Apply(rule.Phoneme, maxPhonemes);
+                     this.found = true;
+                     break;
+                 }
+             }
+         }
+
+         if (!this.found)
+         {
+             // patternLength may hold the length of the last rule that was tried; without a
+             // match only a single character is consumed.
+             patternLength = 1;
+         }
+
+         this.i += patternLength;
+         return this;
+     }
+
+     /// <summary>
+     /// Gets whether the last call to <see cref="Invoke()"/> found a matching rule.
+     /// </summary>
+     public bool IsFound
+     {
+         get { return this.found; }
+     }
+ }
+
+ // Name prefixes that get special treatment in Encode, keyed by name type.
+ // The table is filled once by the static constructor below.
+ private static readonly IDictionary<NameType, ISet<string>> NAME_PREFIXES = new Dictionary<NameType, ISet<string>>();
+
+ static PhoneticEngine()
+ {
+     string[] ashkenazi = { "bar", "ben", "da", "de", "van", "von" };
+     string[] sephardic =
+     {
+         "al", "el", "da", "dal", "de", "del", "dela", "de la",
+         "della", "des", "di", "do", "dos", "du", "van", "von"
+     };
+     string[] generic =
+     {
+         "da", "dal", "de", "del", "dela", "de la", "della",
+         "des", "di", "do", "dos", "du", "van", "von"
+     };
+
+     // Each set is wrapped so that it cannot be modified after registration.
+     NAME_PREFIXES[NameType.ASHKENAZI] = Collections.UnmodifiableSet(new HashSet<string>(ashkenazi));
+     NAME_PREFIXES[NameType.SEPHARDIC] = Collections.UnmodifiableSet(new HashSet<string>(sephardic));
+     NAME_PREFIXES[NameType.GENERIC] = Collections.UnmodifiableSet(new HashSet<string>(generic));
+ }
+
+ /// <summary>
+ /// Joins some strings with an internal separator.
+ /// </summary>
+ /// <param name="strings">Strings to join.</param>
+ /// <param name="sep">String to separate them with.</param>
+ /// <returns>A single string consisting of each element of <paramref name="strings"/> interleaved by <paramref name="sep"/>.</returns>
+ private static string Join(IEnumerable<string> strings, string sep)
+ {
+     // The framework already provides exactly this operation; no hand-written
+     // enumerator loop is needed.
+     return string.Join(sep, strings);
+ }
+
+ private static readonly int DEFAULT_MAX_PHONEMES = 20;
+
+ private readonly Lang lang;
+
+ private readonly NameType nameType;
+
+ private readonly RuleType ruleType;
+
+ private readonly bool concat;
+
+ private readonly int maxPhonemes;
+
+ /// <summary>
+ /// Creates a phonetic engine that uses the default limit for the number of phonemes.
+ /// </summary>
+ /// <param name="nameType">The type of names it will use.</param>
+ /// <param name="ruleType">The type of rules it will apply.</param>
+ /// <param name="concat">If it will concatenate multiple encodings.</param>
+ public PhoneticEngine(NameType nameType, RuleType ruleType, bool concat)
+     : this(nameType, ruleType, concat, DEFAULT_MAX_PHONEMES)
+ {
+ }
+
+ /// <summary>
+ /// Creates a phonetic engine with an explicit limit for the number of phonemes.
+ /// <para/>
+ /// since 1.7
+ /// </summary>
+ /// <param name="nameType">The type of names it will use.</param>
+ /// <param name="ruleType">The type of rules it will apply.</param>
+ /// <param name="concat">If it will concatenate multiple encodings.</param>
+ /// <param name="maxPhonemes">The maximum number of phonemes that will be handled.</param>
+ /// <exception cref="ArgumentException"><paramref name="ruleType"/> is <c>RuleType.RULES</c>.</exception>
+ public PhoneticEngine(NameType nameType, RuleType ruleType, bool concat,
+                       int maxPhonemes)
+ {
+     // RULES is rejected as the rule type of an engine.
+     bool isRulesType = ruleType == RuleType.RULES;
+     if (isRulesType)
+     {
+         throw new ArgumentException("ruleType must not be " + RuleType.RULES);
+     }
+
+     this.maxPhonemes = maxPhonemes;
+     this.concat = concat;
+     this.ruleType = ruleType;
+     this.nameType = nameType;
+     this.lang = Lang.GetInstance(nameType);
+ }
+
+ /// <summary>
+ /// Applies the final rules to convert from a language-specific phonetic representation to a
+ /// language-independent representation.
+ /// </summary>
+ /// <param name="phonemeBuilder">The current phonemes.</param>
+ /// <param name="finalRules">The final rules to apply.</param>
+ /// <returns>
+ /// The resulting phonemes. When <paramref name="finalRules"/> is empty, <paramref name="phonemeBuilder"/>
+ /// itself is returned unchanged.
+ /// </returns>
+ /// <exception cref="ArgumentNullException"><paramref name="finalRules"/> is <c>null</c>.</exception>
+ private PhonemeBuilder ApplyFinalRules(PhonemeBuilder phonemeBuilder,
+                                        IDictionary<string, IList<Rule>> finalRules)
+ {
+     if (finalRules == null)
+     {
+         // The single-string constructor of ArgumentNullException takes the parameter NAME,
+         // so the name and the message have to be passed separately.
+         throw new ArgumentNullException("finalRules", "finalRules can not be null");
+     }
+     if (finalRules.Count == 0)
+     {
+         return phonemeBuilder;
+     }
+
+     // Collects the results of all phonemes, ordered and de-duplicated by Phoneme.COMPARER.
+     ISet<Phoneme> phonemes = new SortedSet<Phoneme>(Phoneme.COMPARER);
+
+     foreach (Phoneme phoneme in phonemeBuilder.Phonemes)
+     {
+         PhonemeBuilder subBuilder = PhonemeBuilder.Empty(phoneme.Languages);
+         string phonemeText = phoneme.GetPhonemeText();
+
+         // No increment in the loop header: each rules application reports how far it advanced.
+         for (int i = 0; i < phonemeText.Length;)
+         {
+             RulesApplication rulesApplication =
+                 new RulesApplication(finalRules, phonemeText, subBuilder, i, maxPhonemes).Invoke();
+             bool found = rulesApplication.IsFound;
+             subBuilder = rulesApplication.PhonemeBuilder;
+
+             if (!found)
+             {
+                 // not found, appending as-is
+                 subBuilder.Append(phonemeText.Substring(i, 1));
+             }
+
+             i = rulesApplication.I;
+         }
+
+         phonemes.UnionWith(subBuilder.Phonemes);
+     }
+
+     return new PhonemeBuilder(phonemes.ToList());
+ }
+
+ /// <summary>
+ /// Encodes a string to its phonetic representation, letting the engine's <see cref="Lang"/>
+ /// guess the languages the input may come from.
+ /// </summary>
+ /// <param name="input">The string to encode.</param>
+ /// <returns>The encoding of the input.</returns>
+ public virtual string Encode(string input)
+ {
+     return Encode(input, this.lang.GuessLanguages(input));
+ }
+
+ /// <summary>
+ /// Encodes an input string into an output phonetic representation, given a set of possible origin languages.
+ /// </summary>
+ /// <param name="input">String to phoneticise; a string with dashes or spaces separating each word.</param>
+ /// <param name="languageSet">The set of possible origin languages; used to select the rules that are applied.</param>
+ /// <returns>
+ /// A phonetic representation of the input; a string containing '-'-separated phonetic representations of the input.
+ /// An empty string is returned when, outside concat mode, no word is left after the name prefixes were removed.
+ /// </returns>
+ public virtual string Encode(string input, LanguageSet languageSet)
+ {
+     IDictionary<string, IList<Rule>> rules = Rule.GetInstanceMap(this.nameType, RuleType.RULES, languageSet);
+     // rules common across many (all) languages
+     IDictionary<string, IList<Rule>> finalRules1 = Rule.GetInstanceMap(this.nameType, this.ruleType, "common");
+     // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
+     IDictionary<string, IList<Rule>> finalRules2 = Rule.GetInstanceMap(this.nameType, this.ruleType, languageSet);
+
+     // tidy the input
+     // lower case is a locale-dependent operation, hence the invariant culture
+     input = input.ToLowerInvariant().Replace('-', ' ').Trim();
+
+     if (this.nameType == NameType.GENERIC)
+     {
+         if (input.StartsWith("d'", StringComparison.Ordinal))
+         { // check for d'
+             string remainder = input.Substring(2);
+             string combined = "d" + remainder;
+             return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
+         }
+         foreach (string l in NAME_PREFIXES[this.nameType])
+         {
+             // handle generic prefixes
+             if (input.StartsWith(l + " ", StringComparison.Ordinal))
+             {
+                 // check for any prefix in the words list
+                 string remainder = input.Substring(l.Length + 1); // input without the prefix
+                 string combined = l + remainder; // input with prefix without space
+                 return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
+             }
+         }
+     }
+
+     IList<string> words = WHITESPACE.Split(input).ToList();
+     IList<string> words2 = new List<string>();
+
+     // special-case handling of word prefixes based upon the name type
+     switch (this.nameType)
+     {
+         case NameType.SEPHARDIC:
+             foreach (string aWord in words)
+             {
+                 string[] parts = aWord.Split(new char[] { '\'' }, StringSplitOptions.RemoveEmptyEntries);
+                 // An empty word, or one consisting only of apostrophes, yields no parts at all;
+                 // fall back to the empty string instead of indexing into an empty array.
+                 string lastPart = parts.Length > 0 ? parts[parts.Length - 1] : string.Empty;
+                 words2.Add(lastPart);
+             }
+             words2.RemoveAll(NAME_PREFIXES[this.nameType]);
+             break;
+         case NameType.ASHKENAZI:
+             words2.AddRange(words);
+             words2.RemoveAll(NAME_PREFIXES[this.nameType]);
+             break;
+         case NameType.GENERIC:
+             words2.AddRange(words);
+             break;
+         default:
+             throw new InvalidOperationException("Unreachable case: " + this.nameType);
+     }
+
+     if (this.concat)
+     {
+         // concat mode enabled
+         input = Join(words2, " ");
+     }
+     else if (words2.Count == 1)
+     {
+         // not a multi-word name
+         // NOTE(review): this takes the first element of 'words', not of 'words2', so a removed
+         // prefix or a stripped apostrophe part is not reflected here. Left unchanged because the
+         // commented-out line below suggests this mirrors the code this was ported from - confirm.
+         //input = words.iterator().next();
+         input = words.FirstOrDefault();
+     }
+     else if (words2.Count == 0)
+     {
+         // Every word was a name prefix and has been removed: there is nothing to encode.
+         // Without this guard the "strip the leading '-'" step below would fail on an empty result.
+         return string.Empty;
+     }
+     else
+     {
+         // encode each word in a multi-word name separately (normally used for approx matches)
+         StringBuilder result = new StringBuilder();
+         foreach (string word in words2)
+         {
+             result.Append("-").Append(Encode(word));
+         }
+         // return the result without the leading "-"
+         return result.ToString(1, result.Length - 1);
+     }
+
+     PhonemeBuilder phonemeBuilder = PhonemeBuilder.Empty(languageSet);
+
+     // loop over each char in the input - we will handle the increment manually
+     for (int i = 0; i < input.Length;)
+     {
+         RulesApplication rulesApplication =
+             new RulesApplication(rules, input, phonemeBuilder, i, maxPhonemes).Invoke();
+         i = rulesApplication.I;
+         phonemeBuilder = rulesApplication.PhonemeBuilder;
+     }
+
+     // Apply the general rules
+     phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules1);
+     // Apply the language-specific rules
+     phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules2);
+
+     return phonemeBuilder.MakeString();
+ }
+
+ /// <summary>
+ /// Gets the <see cref="Bm.Lang"/> language guessing rules this engine was created with.
+ /// </summary>
+ public virtual Lang Lang
+ {
+     get
+     {
+         return lang;
+     }
+ }
+
+ /// <summary>
+ /// Gets the <see cref="Bm.NameType"/> this engine was created with.
+ /// </summary>
+ public virtual NameType NameType
+ {
+     get
+     {
+         return nameType;
+     }
+ }
+
+ /// <summary>
+ /// Gets the <see cref="Bm.RuleType"/> this engine was created with.
+ /// </summary>
+ public virtual RuleType RuleType
+ {
+     get
+     {
+         return ruleType;
+     }
+ }
+
+ /// <summary>
+ /// Gets if multiple phonetic encodings are concatenated or if just the first one is kept.
+ /// Returns <c>true</c> if multiple phonetic encodings are returned, <c>false</c> if just the first is.
+ /// </summary>
+ public virtual bool IsConcat
+ {
+     get
+     {
+         return concat;
+     }
+ }
+
+ /// <summary>
+ /// Gets the maximum number of phonemes the engine will calculate for a given input.
+ /// <para/>
+ /// since 1.7
+ /// </summary>
+ public virtual int MaxPhonemes
+ {
+     get
+     {
+         return maxPhonemes;
+     }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ResourceConstants.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ResourceConstants.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ResourceConstants.cs
new file mode 100644
index 0000000..c70d404
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ResourceConstants.cs
@@ -0,0 +1,37 @@
+// commons-codec version compatibility level: 1.9
+using System.Text;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Constants used to process resource files.
+ /// <para/>
+ /// This class only holds constants, so it is declared <c>static</c> and cannot be instantiated.
+ /// <para/>
+ /// since 1.6
+ /// </summary>
+ internal static class ResourceConstants
+ {
+     /// <summary>Marker that starts an end-of-line comment.</summary>
+     public static readonly string CMT = "//";
+
+     /// <summary>Text encoding used when reading resource files.</summary>
+     public static readonly Encoding ENCODING = Encoding.UTF8;
+
+     /// <summary>Marker that ends a multi-line comment.</summary>
+     public static readonly string EXT_CMT_END = "*/";
+
+     /// <summary>Marker that starts a multi-line comment.</summary>
+     public static readonly string EXT_CMT_START = "/*";
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/Rule.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/Rule.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/Rule.cs
new file mode 100644
index 0000000..52f3d9a
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/Rule.cs
@@ -0,0 +1,1069 @@
+// commons-codec version compatibility level: 1.9
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Reflection;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// A phoneme rule.
+ /// </summary>
+ /// <remarks>
+ /// Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
+ /// and a logical flag indicating if all languages must be in play. A rule matches if:
+ /// <list type="bullet">
+ /// <item><description>the pattern matches at the current position</description></item>
+ /// <item><description>the string up until the beginning of the pattern matches the left context</description></item>
+ /// <item><description>the string from the end of the pattern matches the right context</description></item>
+ /// <item><description>logical is ALL and all languages are in scope; or</description></item>
+ /// <item><description>logical is any other value and at least one language is in scope</description></item>
+ /// </list>
+ /// <para/>
+ /// Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
+ /// to explicitly construct their own.
+ /// <para/>
+ /// Rules are immutable and thread-safe.
+ /// <para/>
+ /// <b>Rules resources</b>
+ /// <para/>
+ /// Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
+ /// named following the pattern:
+ /// <c>Lucene.Net.Analysis.Phonetic.Language.Bm.<see cref="NameType"/>_<see cref="RuleType"/>_[language].txt</c>
+ /// <para/>
+ /// The format of these resources is the following:
+ /// <list type="table">
+ /// <item>
+ /// <term>Rules:</term>
+ /// <description>
+ /// whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
+ /// will be interpreted as:
+ /// <list type="number">
+ /// <item><description>pattern</description></item>
+ /// <item><description>left context</description></item>
+ /// <item><description>right context</description></item>
+ /// <item><description>phoneme</description></item>
+ /// </list>
+ /// </description>
+ /// </item>
+ /// <item>
+ /// <term>End-of-line comments:</term>
+ /// <description>Any occurrence of '//' will cause all text following on that line to be discarded as a comment.</description>
+ /// </item>
+ /// <item>
+ /// <term>Multi-line comments:</term>
+ /// <description>Any line starting with '/*' will start multi-line commenting mode. This will skip all content until a line ending in '*' and '/' is found.</description>
+ /// </item>
+ /// <item>
+ /// <term>Blank lines:</term>
+ /// <description>All blank lines will be skipped.</description>
+ /// </item>
+ /// </list>
+ /// <para/>
+ /// since 1.6
+ /// </remarks>
+ public class Rule
+ {
+ // Pre-compiled splitters for the separators handled by this class. They are never
+ // reassigned, so they are declared readonly.
+ private static readonly Regex PIPE = new Regex("[|]", RegexOptions.Compiled);
+ private static readonly Regex WHITESPACE = new Regex("\\s+", RegexOptions.Compiled);
+ private static readonly Regex PLUS = new Regex("[+]", RegexOptions.Compiled);
+
+ /// <summary>
+ /// An <see cref="IRPattern"/> that reports a match for every input, whatever its content.
+ /// </summary>
+ private class AllStringsRMatcher : IRPattern
+ {
+     /// <summary>Always matches.</summary>
+     /// <param name="input">Ignored.</param>
+     /// <returns><c>true</c></returns>
+     public bool IsMatch(string input)
+     {
+         return true;
+     }
+
+     /// <summary>Always matches.</summary>
+     /// <param name="input">Ignored.</param>
+     /// <returns><c>true</c></returns>
+     public bool IsMatch(ICharSequence input)
+     {
+         return true;
+     }
+
+     /// <summary>Always matches.</summary>
+     /// <param name="input">Ignored.</param>
+     /// <returns><c>true</c></returns>
+     public bool IsMatch(StringBuilder input)
+     {
+         return true;
+     }
+ }
+
+ public static readonly IRPattern ALL_STRINGS_RMATCHER = new AllStringsRMatcher();
+
+
+ public static readonly string ALL = "ALL";
+
+ private static readonly string DOUBLE_QUOTE = "\"";
+
+ private static readonly string HASH_INCLUDE = "#include";
+
+ private static readonly IDictionary<NameType, IDictionary<RuleType, IDictionary<string, IDictionary<string, IList<Rule>>>>> RULES =
+ new Dictionary<NameType, IDictionary<RuleType, IDictionary<string, IDictionary<string, IList<Rule>>>>>();
+
+ // Fills RULES: for every name type and rule type, the rules of each language of that name
+ // type are parsed from their resource; every rule type except RULES additionally gets a
+ // "common" entry.
+ static Rule()
+ {
+     foreach (NameType nameType in Enum.GetValues(typeof(NameType)))
+     {
+         IDictionary<RuleType, IDictionary<string, IDictionary<string, IList<Rule>>>> byRuleType =
+             new Dictionary<RuleType, IDictionary<string, IDictionary<string, IList<Rule>>>>();
+
+         foreach (RuleType ruleType in Enum.GetValues(typeof(RuleType)))
+         {
+             IDictionary<string, IDictionary<string, IList<Rule>>> byLanguage =
+                 new Dictionary<string, IDictionary<string, IList<Rule>>>();
+
+             Languages languages = Languages.GetInstance(nameType);
+             foreach (string language in languages.GetLanguages())
+             {
+                 try
+                 {
+                     TextReader scanner = CreateScanner(nameType, ruleType, language);
+                     string resourceName = CreateResourceName(nameType, ruleType, language);
+                     byLanguage[language] = ParseRules(scanner, resourceName);
+                 }
+                 catch (InvalidOperationException e)
+                 {
+                     // Re-thrown with the resource name so the failing file can be identified.
+                     throw new InvalidOperationException("Problem processing " + CreateResourceName(nameType, ruleType, language), e);
+                 }
+             }
+
+             if (ruleType != RuleType.RULES)
+             {
+                 TextReader commonScanner = CreateScanner(nameType, ruleType, "common");
+                 string commonName = CreateResourceName(nameType, ruleType, "common");
+                 byLanguage["common"] = ParseRules(commonScanner, commonName);
+             }
+
+             byRuleType[ruleType] = Collections.UnmodifiableMap(byLanguage);
+         }
+
+         RULES[nameType] = Collections.UnmodifiableMap(byRuleType);
+     }
+ }
+
+ // Returns true when 'input' occurs anywhere in 'chars'.
+ private static bool Contains(ICharSequence chars, char input)
+ {
+     int position = 0;
+     while (position < chars.Length)
+     {
+         if (chars[position] == input)
+         {
+             return true;
+         }
+         position++;
+     }
+     return false;
+ }
+
+ // Returns true when 'input' occurs anywhere in 'chars'.
+ private static bool Contains(string chars, char input)
+ {
+     // IndexOf(char) compares character values, exactly like a manual scan.
+     return chars.IndexOf(input) >= 0;
+ }
+
+ // Returns true when 'input' occurs anywhere in 'chars'.
+ private static bool Contains(StringBuilder chars, char input)
+ {
+     int position = 0;
+     while (position < chars.Length)
+     {
+         if (chars[position] == input)
+         {
+             return true;
+         }
+         position++;
+     }
+     return false;
+ }
+
+ // Builds the resource file name "<name type>_<rule type>_<lang>.txt".
+ private static string CreateResourceName(NameType nameType, RuleType rt, string lang)
+ {
+     string namePart = nameType.GetName();
+     string rulePart = rt.GetName();
+     return namePart + "_" + rulePart + "_" + lang + ".txt";
+ }
+
+ // Opens the embedded rules resource for the given name type, rule type and language.
+ // Throws ArgumentException when no such resource can be found.
+ private static TextReader CreateScanner(NameType nameType, RuleType rt, string lang)
+ {
+     string resourceName = CreateResourceName(nameType, rt, lang);
+     Type anchor = typeof(Languages);
+     Assembly assembly = anchor.GetTypeInfo().Assembly;
+     Stream stream = assembly.FindAndGetManifestResourceStream(anchor, resourceName);
+
+     if (stream != null)
+     {
+         return new StreamReader(stream, ResourceConstants.ENCODING);
+     }
+
+     throw new ArgumentException("Unable to load resource: " + resourceName);
+ }
+
+ // Opens the embedded resource named "<lang>.txt".
+ // Throws ArgumentException when no such resource can be found.
+ private static TextReader CreateScanner(string lang)
+ {
+     string resourceName = lang + ".txt";
+     Type anchor = typeof(Languages);
+     Assembly assembly = anchor.GetTypeInfo().Assembly;
+     Stream stream = assembly.FindAndGetManifestResourceStream(anchor, resourceName);
+
+     if (stream != null)
+     {
+         return new StreamReader(stream, ResourceConstants.ENCODING);
+     }
+
+     throw new ArgumentException("Unable to load resource: " + resourceName);
+ }
+
+ // Returns true when the last characters of 'input' are exactly the characters of 'suffix'.
+ private static bool EndsWith(ICharSequence input, string suffix)
+ {
+     // Index in 'input' at which the suffix would have to start.
+     int start = input.Length - suffix.Length;
+     if (start < 0)
+     {
+         return false;
+     }
+     for (int k = 0; k < suffix.Length; k++)
+     {
+         if (input[start + k] != suffix[k])
+         {
+             return false;
+         }
+     }
+     return true;
+ }
+
+ // Returns true when the last characters of 'input' are exactly the characters of 'suffix'.
+ private static bool EndsWith(string input, string suffix)
+ {
+     // Index in 'input' at which the suffix would have to start.
+     int start = input.Length - suffix.Length;
+     if (start < 0)
+     {
+         return false;
+     }
+     for (int k = 0; k < suffix.Length; k++)
+     {
+         if (input[start + k] != suffix[k])
+         {
+             return false;
+         }
+     }
+     return true;
+ }
+
+ // Returns true when the last characters of 'input' are exactly the characters of 'suffix'.
+ private static bool EndsWith(StringBuilder input, string suffix)
+ {
+     // Index in 'input' at which the suffix would have to start.
+     int start = input.Length - suffix.Length;
+     if (start < 0)
+     {
+         return false;
+     }
+     for (int k = 0; k < suffix.Length; k++)
+     {
+         if (input[start + k] != suffix[k])
+         {
+             return false;
+         }
+     }
+     return true;
+ }
+
+ /// <summary>
+ /// Gets rules for a combination of name type, rule type and languages, flattened into one list.
+ /// </summary>
+ /// <param name="nameType">The <see cref="NameType"/> to consider.</param>
+ /// <param name="rt">The <see cref="RuleType"/> to consider.</param>
+ /// <param name="langs">The set of languages to consider.</param>
+ /// <returns>A list of <see cref="Rule"/>s that apply.</returns>
+ public static IList<Rule> GetInstance(NameType nameType, RuleType rt,
+                                       LanguageSet langs)
+ {
+     List<Rule> flattened = new List<Rule>();
+     // The map groups the rules; here the groups are simply concatenated in map order.
+     foreach (IList<Rule> group in GetInstanceMap(nameType, rt, langs).Values)
+     {
+         flattened.AddRange(group);
+     }
+     return flattened;
+ }
+
+ /// <summary>
+ /// Gets rules for a combination of name type, rule type and a single language.
+ /// </summary>
+ /// <param name="nameType">The <see cref="NameType"/> to consider.</param>
+ /// <param name="rt">The <see cref="RuleType"/> to consider.</param>
+ /// <param name="lang">The language to consider.</param>
+ /// <returns>A list of <see cref="Rule"/>s that apply.</returns>
+ public static IList<Rule> GetInstance(NameType nameType, RuleType rt, string lang)
+ {
+     // Wrap the single language in a one-element set and reuse the set-based overload.
+     HashSet<string> onlyLanguage = new HashSet<string>() { lang };
+     LanguageSet singleton = LanguageSet.From(onlyLanguage);
+     return GetInstance(nameType, rt, singleton);
+ }
+
+ /// <summary>
+ /// Gets rules for a combination of name type, rule type and languages.
+ /// <para/>
+ /// since 1.9
+ /// </summary>
+ /// <param name="nameType">The <see cref="NameType"/> to consider.</param>
+ /// <param name="rt">The <see cref="RuleType"/> to consider.</param>
+ /// <param name="langs">The set of languages to consider.</param>
+ /// <returns>A map containing all <see cref="Rule"/>s that apply, grouped by the first character of the rule pattern.</returns>
+ public static IDictionary<string, IList<Rule>> GetInstanceMap(NameType nameType, RuleType rt,
+                                                               LanguageSet langs)
+ {
+     // A set holding exactly one language uses that language's rules;
+     // anything else uses the rules registered under Languages.ANY.
+     if (langs.IsSingleton)
+     {
+         return GetInstanceMap(nameType, rt, langs.GetAny());
+     }
+     return GetInstanceMap(nameType, rt, Languages.ANY);
+ }
+
+ /// <summary>
+ /// Gets rules for a combination of name type, rule type and a single language.
+ /// <para/>
+ /// since 1.9
+ /// </summary>
+ /// <param name="nameType">The <see cref="NameType"/> to consider.</param>
+ /// <param name="rt">The <see cref="RuleType"/> to consider.</param>
+ /// <param name="lang">The language to consider.</param>
+ /// <returns>A map containing all <see cref="Rule"/>s that apply, grouped by the first character of the rule pattern.</returns>
+ /// <exception cref="ArgumentException">No rules are registered for the given combination.</exception>
+ public static IDictionary<string, IList<Rule>> GetInstanceMap(NameType nameType, RuleType rt,
+                                                               string lang)
+ {
+     IDictionary<RuleType, IDictionary<string, IDictionary<string, IList<Rule>>>> byRuleType;
+     IDictionary<string, IDictionary<string, IList<Rule>>> byLanguage;
+     IDictionary<string, IList<Rule>> rules = null;
+
+     // Walk the three levels of the table; a missing or null entry at any level is a failure.
+     bool missing =
+         !RULES.TryGetValue(nameType, out byRuleType) || byRuleType == null ||
+         !byRuleType.TryGetValue(rt, out byLanguage) || byLanguage == null ||
+         !byLanguage.TryGetValue(lang, out rules) || rules == null;
+
+     if (missing)
+     {
+         throw new ArgumentException(string.Format("No rules found for {0}, {1}, {2}.",
+                                                   nameType.GetName(), rt.GetName(), lang));
+     }
+
+     return rules;
+ }
+
+ /// <summary>
+ /// Parses a single phoneme of the form <c>text</c> or <c>text[lang1+lang2]</c>.
+ /// </summary>
+ /// <param name="ph">The phoneme expression, without surrounding quotes.</param>
+ /// <returns>
+ /// A <see cref="Phoneme"/> restricted to the bracketed languages, or one valid for
+ /// <c>Languages.ANY_LANGUAGE</c> when no bracket is present.
+ /// </returns>
+ /// <exception cref="ArgumentException">The expression contains a '[' but does not end in ']'.</exception>
+ private static Phoneme ParsePhoneme(string ph)
+ {
+     // Char overload: an ordinal search, unlike IndexOf(string), which is culture-sensitive.
+     int open = ph.IndexOf('[');
+     if (open < 0)
+     {
+         return new Phoneme(ph, Languages.ANY_LANGUAGE);
+     }
+
+     if (!ph.EndsWith("]", StringComparison.Ordinal))
+     {
+         throw new ArgumentException("Phoneme expression contains a '[' but does not end in ']'");
+     }
+
+     string before = ph.Substring(0, open);
+     // Text strictly between '[' and the final ']'.
+     string input = ph.Substring(open + 1, (ph.Length - 1) - (open + 1));
+     ISet<string> langs = new HashSet<string>(PLUS.Split(input));
+
+     return new Phoneme(before, LanguageSet.From(langs));
+ }
+
+ /// <summary>
+ /// Parses a phoneme expression: either a single phoneme, or a parenthesised,
+ /// pipe-separated list of alternatives such as <c>(a|b[lang]|)</c>.
+ /// </summary>
+ /// <param name="ph">The phoneme expression, without surrounding quotes.</param>
+ /// <returns>A single <see cref="Phoneme"/> or a <see cref="PhonemeList"/> of the alternatives.</returns>
+ /// <exception cref="ArgumentException">The expression starts with '(' but does not end with ')'.</exception>
+ private static IPhonemeExpr ParsePhonemeExpr(string ph)
+ {
+     if (!ph.StartsWith("(", StringComparison.Ordinal))
+     {
+         return ParsePhoneme(ph);
+     }
+
+     // we have a bracketed list of options
+     if (!ph.EndsWith(")", StringComparison.Ordinal))
+     {
+         throw new ArgumentException("Phoneme starts with '(' so must end with ')'");
+     }
+
+     string body = ph.Substring(1, ph.Length - 2);
+     IList<string> parts = new List<string>(PIPE.Split(body));
+
+     // The Java original relies on String.split dropping trailing empty strings
+     // (when at least one separator was found). .NET keeps them, which would add
+     // the empty alternative twice for bodies such as "x|", so drop them here.
+     int partCount = parts.Count;
+     if (partCount > 1)
+     {
+         while (partCount > 0 && parts[partCount - 1].Length == 0)
+         {
+             partCount--;
+         }
+     }
+
+     IList<Phoneme> phs = new List<Phoneme>();
+     for (int index = 0; index < partCount; index++)
+     {
+         phs.Add(ParsePhoneme(parts[index]));
+     }
+
+     // A leading or trailing '|' denotes an explicit empty alternative.
+     if (body.StartsWith("|", StringComparison.Ordinal) || body.EndsWith("|", StringComparison.Ordinal))
+     {
+         phs.Add(new Phoneme("", Languages.ANY_LANGUAGE));
+     }
+
+     return new PhonemeList(phs);
+ }
+
+ /// <summary>
+ /// A <see cref="Rule"/> that remembers the line number and location it was parsed
+ /// from and reports them in <see cref="ToString()"/>.
+ /// </summary>
+ private class RuleAnonymousHelper : Rule
+ {
+     private readonly int lineNumber;
+     private readonly string sourceLocation;
+
+     public RuleAnonymousHelper(string pat, string lCon, string rCon, IPhonemeExpr ph, int cLine, string location)
+         : base(pat, lCon, rCon, ph)
+     {
+         lineNumber = cLine;
+         sourceLocation = location;
+     }
+
+     /// <summary>
+     /// Returns a string of the form <c>Rule{line=N, loc='LOCATION'}</c>.
+     /// </summary>
+     public override string ToString()
+     {
+         return new StringBuilder()
+             .Append("Rule")
+             .Append("{line=").Append(lineNumber)
+             .Append(", loc='").Append(sourceLocation).Append('\'')
+             .Append('}')
+             .ToString();
+     }
+ }
+
+ /// <summary>
+ /// Reads rule definitions from <paramref name="reader"/> and groups the parsed
+ /// rules by the first character of their pattern.
+ /// </summary>
+ /// <remarks>
+ /// Block comments, line comments and blank lines are skipped. A line starting with
+ /// the include directive is resolved through <c>CreateScanner</c> and parsed recursively.
+ /// Every other line must split on whitespace into exactly four parts:
+ /// pattern, left context, right context and phoneme expression.
+ /// </remarks>
+ /// <param name="reader">Source of the rule text; it is disposed before this method returns or throws.</param>
+ /// <param name="location">Description of where the text comes from; used in error messages and rule descriptions.</param>
+ /// <returns>A map from the first character of a rule pattern to the rules starting with it.</returns>
+ /// <exception cref="ArgumentException">An include statement contains a space, or a rule line does not have four parts.</exception>
+ /// <exception cref="InvalidOperationException">A four-part rule line could not be turned into a rule.</exception>
+ private static IDictionary<string, IList<Rule>> ParseRules(TextReader reader, string location)
+ {
+     IDictionary<string, IList<Rule>> lines = new HashMap<string, IList<Rule>>();
+     int currentLine = 0;
+
+     bool inMultilineComment = false;
+     string rawLine;
+     try
+     {
+         while ((rawLine = reader.ReadLine()) != null)
+         {
+             currentLine++;
+             string line = rawLine;
+
+             // Comment markers are syntax, not linguistic text: compare ordinally.
+             if (inMultilineComment)
+             {
+                 if (line.EndsWith(ResourceConstants.EXT_CMT_END, StringComparison.Ordinal))
+                 {
+                     inMultilineComment = false;
+                 }
+                 continue;
+             }
+
+             if (line.StartsWith(ResourceConstants.EXT_CMT_START, StringComparison.Ordinal))
+             {
+                 inMultilineComment = true;
+                 continue;
+             }
+
+             // discard comments
+             int cmtI = line.IndexOf(ResourceConstants.CMT, StringComparison.Ordinal);
+             if (cmtI >= 0)
+             {
+                 line = line.Substring(0, cmtI);
+             }
+
+             // trim leading-trailing whitespace
+             line = line.Trim();
+
+             if (line.Length == 0)
+             {
+                 continue; // empty lines can be safely skipped
+             }
+
+             if (line.StartsWith(HASH_INCLUDE, StringComparison.Ordinal))
+             {
+                 // include statement
+                 string incl = line.Substring(HASH_INCLUDE.Length).Trim();
+                 if (incl.Contains(" "))
+                 {
+                     throw new ArgumentException("Malformed import statement '" + rawLine + "' in " +
+                         location);
+                 }
+
+                 lines.PutAll(ParseRules(CreateScanner(incl), location + "->" + incl));
+                 continue;
+             }
+
+             // rule
+             string[] parts = WHITESPACE.Split(line);
+             if (parts.Length != 4)
+             {
+                 throw new ArgumentException("Malformed rule statement split into " + parts.Length +
+                     " parts: " + rawLine + " in " + location);
+             }
+
+             try
+             {
+                 string pat = StripQuotes(parts[0]);
+                 string lCon = StripQuotes(parts[1]);
+                 string rCon = StripQuotes(parts[2]);
+                 IPhonemeExpr ph = ParsePhonemeExpr(StripQuotes(parts[3]));
+                 Rule r = new RuleAnonymousHelper(pat, lCon, rCon, ph, currentLine, location);
+
+                 // Rules are bucketed by the first character of their pattern.
+                 string patternKey = r.pattern.Substring(0, 1);
+                 IList<Rule> rules;
+                 if (!lines.TryGetValue(patternKey, out rules) || rules == null)
+                 {
+                     rules = new List<Rule>();
+                     lines[patternKey] = rules;
+                 }
+                 rules.Add(r);
+             }
+             catch (ArgumentException e)
+             {
+                 throw new InvalidOperationException("Problem parsing line '" + currentLine + "' in " +
+                     location, e);
+             }
+         }
+     }
+     finally
+     {
+         reader.Dispose();
+     }
+
+     return lines;
+ }
+
+ /// <summary>
+ /// An <see cref="IRPattern"/> that forwards each <c>IsMatch</c> overload to the
+ /// delegate supplied for that input type.
+ /// </summary>
+ private class RPatternHelper : IRPattern
+ {
+     private readonly Func<StringBuilder, bool> stringBuilderMatcher;
+     private readonly Func<string, bool> stringMatcher;
+     private readonly Func<ICharSequence, bool> charSequenceMatcher;
+
+     public RPatternHelper(Func<StringBuilder, bool> isMatchSB, Func<string, bool> isMatchStr, Func<ICharSequence, bool> isMatchCS)
+     {
+         stringBuilderMatcher = isMatchSB;
+         stringMatcher = isMatchStr;
+         charSequenceMatcher = isMatchCS;
+     }
+
+     public bool IsMatch(StringBuilder input)
+     {
+         return stringBuilderMatcher.Invoke(input);
+     }
+
+     public bool IsMatch(string input)
+     {
+         return stringMatcher.Invoke(input);
+     }
+
+     public bool IsMatch(ICharSequence input)
+     {
+         return charSequenceMatcher.Invoke(input);
+     }
+ }
+
+ /// <summary>
+ /// Attempts to compile the regex into direct string ops, falling back to <see cref="Regex"/> and <see cref="Match"/> in the worst case.
+ /// </summary>
+ /// <remarks>
+ /// Handled without a <see cref="Regex"/>: literal content anchored with '^' and/or '$',
+ /// and a single character class (optionally negated) anchored with '^' and/or '$'.
+ /// Everything else is compiled as a regular expression.
+ /// </remarks>
+ /// <param name="regex">The regular expression to compile.</param>
+ /// <returns>An RPattern that will match this regex.</returns>
+ private static IRPattern GetPattern(string regex)
+ {
+     // '^', '$', '[' and ']' are syntax markers, so all checks are ordinal.
+     bool startsWith = regex.StartsWith("^", StringComparison.Ordinal);
+     bool endsWith = regex.EndsWith("$", StringComparison.Ordinal);
+     int contentStart = startsWith ? 1 : 0;
+     int contentEnd = endsWith ? regex.Length - 1 : regex.Length;
+     string content = regex.Substring(contentStart, contentEnd - contentStart);
+     bool boxes = content.Contains("[");
+
+     if (!boxes)
+     {
+         if (startsWith && endsWith)
+         {
+             // exact match
+             if (content.Length == 0)
+             {
+                 // empty
+                 return new RPatternHelper(
+                     isMatchSB: (input) => input.Length == 0,
+                     isMatchStr: (input) => input.Length == 0,
+                     isMatchCS: (input) => input.Length == 0);
+             }
+
+             // Compare by content. StringBuilder.Equals(object) is reference equality,
+             // so passing the string to it would never report a match.
+             // NOTE(review): ICharSequence is compared through ToString() as well, on the
+             // assumption that its Equals(object) is not guaranteed to compare content - confirm.
+             return new RPatternHelper(
+                 isMatchSB: (input) => input.Length == content.Length && input.ToString().Equals(content),
+                 isMatchStr: (input) => input.Equals(content),
+                 isMatchCS: (input) => input.Length == content.Length && input.ToString().Equals(content));
+         }
+         else if ((startsWith || endsWith) && content.Length == 0)
+         {
+             // matches every string
+             return ALL_STRINGS_RMATCHER;
+         }
+         else if (startsWith)
+         {
+             // matches from start
+             return new RPatternHelper(
+                 isMatchSB: (input) => StartsWith(input, content),
+                 isMatchStr: (input) => StartsWith(input, content),
+                 isMatchCS: (input) => StartsWith(input, content));
+         }
+         else if (endsWith)
+         {
+             // matches from end
+             return new RPatternHelper(
+                 isMatchSB: (input) => EndsWith(input, content),
+                 isMatchStr: (input) => EndsWith(input, content),
+                 isMatchCS: (input) => EndsWith(input, content));
+         }
+     }
+     else
+     {
+         bool startsWithBox = content.StartsWith("[", StringComparison.Ordinal);
+         bool endsWithBox = content.EndsWith("]", StringComparison.Ordinal);
+
+         if (startsWithBox && endsWithBox)
+         {
+             string boxContent = content.Substring(1, content.Length - 2);
+             if (!boxContent.Contains("["))
+             {
+                 // box containing alternatives
+                 bool negate = boxContent.StartsWith("^", StringComparison.Ordinal);
+                 if (negate)
+                 {
+                     boxContent = boxContent.Substring(1);
+                 }
+                 string bContent = boxContent;
+                 bool shouldMatch = !negate;
+
+                 if (startsWith && endsWith)
+                 {
+                     // exact match
+                     return new RPatternHelper(
+                         isMatchSB: (input) => input.Length == 1 && Contains(bContent, input[0]) == shouldMatch,
+                         isMatchStr: (input) => input.Length == 1 && Contains(bContent, input[0]) == shouldMatch,
+                         isMatchCS: (input) => input.Length == 1 && Contains(bContent, input[0]) == shouldMatch);
+                 }
+                 else if (startsWith)
+                 {
+                     // first char
+                     return new RPatternHelper(
+                         isMatchSB: (input) => input.Length > 0 && Contains(bContent, input[0]) == shouldMatch,
+                         isMatchStr: (input) => input.Length > 0 && Contains(bContent, input[0]) == shouldMatch,
+                         isMatchCS: (input) => input.Length > 0 && Contains(bContent, input[0]) == shouldMatch);
+                 }
+                 else if (endsWith)
+                 {
+                     // last char
+                     return new RPatternHelper(
+                         isMatchSB: (input) => input.Length > 0 && Contains(bContent, input[input.Length - 1]) == shouldMatch,
+                         isMatchStr: (input) => input.Length > 0 && Contains(bContent, input[input.Length - 1]) == shouldMatch,
+                         isMatchCS: (input) => input.Length > 0 && Contains(bContent, input[input.Length - 1]) == shouldMatch);
+                 }
+             }
+         }
+     }
+
+     // Worst case: none of the shortcuts applied, so use a real regular expression.
+     Regex pattern = new Regex(regex, RegexOptions.Compiled);
+
+     return new RPatternHelper(
+         isMatchSB: (input) => pattern.Match(input.ToString()).Success,
+         isMatchStr: (input) => pattern.Match(input).Success,
+         isMatchCS: (input) => pattern.Match(input.ToString()).Success);
+ }
+
+ /// <summary>
+ /// Checks, character by character, whether <paramref name="input"/> begins with <paramref name="prefix"/>.
+ /// </summary>
+ private static bool StartsWith(ICharSequence input, string prefix)
+ {
+     int prefixLength = prefix.Length;
+     if (prefixLength > input.Length)
+     {
+         return false;
+     }
+
+     // Advance while the characters agree; a full walk means the prefix matched.
+     int position = 0;
+     while (position < prefixLength && input[position] == prefix[position])
+     {
+         position++;
+     }
+     return position == prefixLength;
+ }
+
+ /// <summary>
+ /// Checks, character by character, whether <paramref name="input"/> begins with <paramref name="prefix"/>.
+ /// </summary>
+ private static bool StartsWith(string input, string prefix)
+ {
+     int prefixLength = prefix.Length;
+     if (prefixLength > input.Length)
+     {
+         return false;
+     }
+
+     // Advance while the characters agree; a full walk means the prefix matched.
+     int position = 0;
+     while (position < prefixLength && input[position] == prefix[position])
+     {
+         position++;
+     }
+     return position == prefixLength;
+ }
+
+ /// <summary>
+ /// Checks, character by character, whether <paramref name="input"/> begins with <paramref name="prefix"/>.
+ /// </summary>
+ private static bool StartsWith(StringBuilder input, string prefix)
+ {
+     int prefixLength = prefix.Length;
+     if (prefixLength > input.Length)
+     {
+         return false;
+     }
+
+     // Advance while the characters agree; a full walk means the prefix matched.
+     int position = 0;
+     while (position < prefixLength && input[position] == prefix[position])
+     {
+         position++;
+     }
+     return position == prefixLength;
+ }
+
+ /// <summary>
+ /// Removes one leading and one trailing <c>DOUBLE_QUOTE</c> from <paramref name="str"/>, when present.
+ /// The trailing check is made on the text that remains after the leading quote is removed.
+ /// </summary>
+ private static string StripQuotes(string str)
+ {
+     string withoutLeading = str.StartsWith(DOUBLE_QUOTE, StringComparison.Ordinal)
+         ? str.Substring(1)
+         : str;
+
+     return withoutLeading.EndsWith(DOUBLE_QUOTE, StringComparison.Ordinal)
+         ? withoutLeading.Substring(0, withoutLeading.Length - 1)
+         : withoutLeading;
+ }
+
+ // Matcher for the text to the left of the pattern (compiled from the left context plus "$").
+ private readonly IRPattern lContext;
+
+ // Literal text the rule applies to.
+ private readonly string pattern;
+
+ // Phoneme expression associated with the rule.
+ private readonly IPhonemeExpr phoneme;
+
+ // Matcher for the text to the right of the pattern (compiled from "^" plus the right context).
+ private readonly IRPattern rContext;
+
+ /// <summary>
+ /// Creates a new rule. The left context is anchored at its end and the right
+ /// context at its start before both are compiled into matchers.
+ /// </summary>
+ /// <param name="pattern">The pattern.</param>
+ /// <param name="lContext">The left context.</param>
+ /// <param name="rContext">The right context.</param>
+ /// <param name="phoneme">The resulting phoneme.</param>
+ public Rule(string pattern, string lContext, string rContext, IPhonemeExpr phoneme)
+ {
+     IRPattern leftMatcher = GetPattern(lContext + "$");
+     IRPattern rightMatcher = GetPattern("^" + rContext);
+
+     this.pattern = pattern;
+     this.lContext = leftMatcher;
+     this.rContext = rightMatcher;
+     this.phoneme = phoneme;
+ }
+
+ /// <summary>
+ /// Gets the left context matcher, built from a regular expression that must match the text to the left of the pattern.
+ /// </summary>
+ public virtual IRPattern LContext
+ {
+     get { return lContext; }
+ }
+
+ /// <summary>
+ /// Gets the pattern: a string literal that must match exactly.
+ /// </summary>
+ public virtual string Pattern
+ {
+     get { return pattern; }
+ }
+
+ /// <summary>
+ /// Gets the phoneme associated with a match of this rule.
+ /// </summary>
+ public virtual IPhonemeExpr Phoneme
+ {
+     get { return phoneme; }
+ }
+
+ /// <summary>
+ /// Gets the right context matcher, built from a regular expression that must match the text to the right of the pattern.
+ /// </summary>
+ public virtual IRPattern RContext
+ {
+     get { return rContext; }
+ }
+
+ /// <summary>
+ /// Decides if the pattern and context match the input starting at a position. It is a match if the
+ /// <see cref="LContext"/> matches <paramref name="input"/> up to <paramref name="i"/>, <see cref="Pattern"/> matches at <paramref name="i"/> and
+ /// <see cref="RContext"/> matches from the end of the match of <see cref="Pattern"/> to the end of <paramref name="input"/>.
+ /// </summary>
+ /// <param name="input">The input <see cref="ICharSequence"/>.</param>
+ /// <param name="i">The int position within the input.</param>
+ /// <returns><c>true</c> if the pattern and left/right context match, <c>false</c> otherwise.</returns>
+ /// <exception cref="ArgumentOutOfRangeException"><paramref name="i"/> is negative.</exception>
+ public virtual bool PatternAndContextMatches(ICharSequence input, int i)
+ {
+     if (i < 0)
+     {
+         // (paramName, message) overload: the single-string constructor would take the message as the parameter name.
+         throw new ArgumentOutOfRangeException("i", "Can not match pattern at negative indexes");
+     }
+
+     int patternLength = this.pattern.Length;
+     int ipl = i + patternLength;
+
+     if (ipl > input.Length)
+     {
+         // not enough room for the pattern to match
+         return false;
+     }
+
+     // evaluate the pattern, left context and right context
+     // fail early if any of the evaluations is not successful
+     // NOTE(review): relies on the ICharSequence implementation comparing equal to a string by content - confirm.
+     if (!input.SubSequence(i, ipl).Equals(this.pattern))
+     {
+         return false;
+     }
+     else if (!this.rContext.IsMatch(input.SubSequence(ipl, input.Length)))
+     {
+         return false;
+     }
+     return this.lContext.IsMatch(input.SubSequence(0, i));
+ }
+
+ /// <summary>
+ /// Decides if the pattern and context match the input starting at a position. It is a match if the
+ /// <see cref="LContext"/> matches <paramref name="input"/> up to <paramref name="i"/>, <see cref="Pattern"/> matches at <paramref name="i"/> and
+ /// <see cref="RContext"/> matches from the end of the match of <see cref="Pattern"/> to the end of <paramref name="input"/>.
+ /// </summary>
+ /// <param name="input">The input <see cref="string"/>.</param>
+ /// <param name="i">The int position within the input.</param>
+ /// <returns><c>true</c> if the pattern and left/right context match, <c>false</c> otherwise.</returns>
+ /// <exception cref="ArgumentOutOfRangeException"><paramref name="i"/> is negative.</exception>
+ // LUCENENET specific
+ public virtual bool PatternAndContextMatches(string input, int i)
+ {
+     if (i < 0)
+     {
+         // (paramName, message) overload: the single-string constructor would take the message as the parameter name.
+         throw new ArgumentOutOfRangeException("i", "Can not match pattern at negative indexes");
+     }
+
+     int patternLength = this.pattern.Length;
+     int ipl = i + patternLength;
+
+     if (ipl > input.Length)
+     {
+         // not enough room for the pattern to match
+         return false;
+     }
+
+     // evaluate the pattern, left context and right context
+     // fail early if any of the evaluations is not successful
+     if (!input.Substring(i, patternLength).Equals(this.pattern))
+     {
+         return false;
+     }
+     else if (!this.rContext.IsMatch(input.Substring(ipl)))
+     {
+         return false;
+     }
+     return this.lContext.IsMatch(input.Substring(0, i));
+ }
+
+ /// <summary>
+ /// Decides if the pattern and context match the input starting at a position. It is a match if the
+ /// <see cref="LContext"/> matches <paramref name="input"/> up to <paramref name="i"/>, <see cref="Pattern"/> matches at <paramref name="i"/> and
+ /// <see cref="RContext"/> matches from the end of the match of <see cref="Pattern"/> to the end of <paramref name="input"/>.
+ /// </summary>
+ /// <param name="input">The input <see cref="StringBuilder"/>.</param>
+ /// <param name="i">The int position within the input.</param>
+ /// <returns><c>true</c> if the pattern and left/right context match, <c>false</c> otherwise.</returns>
+ /// <exception cref="ArgumentOutOfRangeException"><paramref name="i"/> is negative.</exception>
+ // LUCENENET specific
+ public virtual bool PatternAndContextMatches(StringBuilder input, int i)
+ {
+     if (i < 0)
+     {
+         // (paramName, message) overload: the single-string constructor would take the message as the parameter name.
+         throw new ArgumentOutOfRangeException("i", "Can not match pattern at negative indexes");
+     }
+
+     int patternLength = this.pattern.Length;
+     int ipl = i + patternLength;
+
+     if (ipl > input.Length)
+     {
+         // not enough room for the pattern to match
+         return false;
+     }
+
+     // evaluate the pattern, left context and right context
+     // fail early if any of the evaluations is not successful
+     if (!input.ToString(i, patternLength).Equals(this.pattern))
+     {
+         return false;
+     }
+     else if (!this.rContext.IsMatch(input.ToString(ipl, input.Length - ipl)))
+     {
+         return false;
+     }
+     return this.lContext.IsMatch(input.ToString(0, i));
+ }
+
+ }
+
+ /// <summary>
+ /// A piece of phonetic text together with the set of languages it applies to.
+ /// The text is held in a mutable buffer, so <see cref="Append(string)"/> changes this instance.
+ /// </summary>
+ public sealed class Phoneme : IPhonemeExpr
+ {
+     /// <summary>
+     /// Orders phonemes by their text: character by character on the UTF-16 values,
+     /// with a shorter text sorting before a longer text that it prefixes.
+     /// </summary>
+     private class PhonemeComparer : IComparer<Phoneme>
+     {
+         public int Compare(Phoneme o1, Phoneme o2)
+         {
+             StringBuilder left = o1.phonemeText;
+             StringBuilder right = o2.phonemeText;
+
+             for (int index = 0; index < left.Length; index++)
+             {
+                 if (index >= right.Length)
+                 {
+                     // right is a proper prefix of left
+                     return +1;
+                 }
+
+                 int difference = left[index] - right[index];
+                 if (difference != 0)
+                 {
+                     return difference;
+                 }
+             }
+
+             // left is exhausted: it is either a proper prefix of right, or equal to it.
+             return left.Length < right.Length ? -1 : 0;
+         }
+     }
+
+     public static readonly IComparer<Phoneme> COMPARER = new PhonemeComparer();
+     private readonly StringBuilder phonemeText;
+     private readonly LanguageSet languages;
+
+     /// <summary>Creates a phoneme from a string and a language set.</summary>
+     public Phoneme(string phonemeText, LanguageSet languages)
+     {
+         this.languages = languages;
+         this.phonemeText = new StringBuilder(phonemeText);
+     }
+
+     /// <summary>Creates a phoneme from a copy of the given buffer's text and a language set.</summary>
+     public Phoneme(StringBuilder phonemeText, LanguageSet languages)
+     {
+         string text = phonemeText.ToString();
+         this.languages = languages;
+         this.phonemeText = new StringBuilder(text);
+     }
+
+     /// <summary>Creates a phoneme from the text of a character sequence and a language set.</summary>
+     public Phoneme(ICharSequence phonemeText, LanguageSet languages)
+     {
+         string text = phonemeText.ToString();
+         this.languages = languages;
+         this.phonemeText = new StringBuilder(text);
+     }
+
+     /// <summary>
+     /// Creates a phoneme whose text is the left text followed by the right text,
+     /// keeping the languages of <paramref name="phonemeLeft"/>.
+     /// </summary>
+     public Phoneme(Phoneme phonemeLeft, Phoneme phonemeRight)
+         : this(phonemeLeft.phonemeText, phonemeLeft.languages)
+     {
+         phonemeText.Append(phonemeRight.phonemeText);
+     }
+
+     /// <summary>
+     /// Creates a phoneme whose text is the left text followed by the right text,
+     /// with the explicitly supplied <paramref name="languages"/>.
+     /// </summary>
+     public Phoneme(Phoneme phonemeLeft, Phoneme phonemeRight, LanguageSet languages)
+         : this(phonemeLeft.phonemeText, languages)
+     {
+         phonemeText.Append(phonemeRight.phonemeText);
+     }
+
+     /// <summary>Appends <paramref name="str"/> to this phoneme's text and returns this instance.</summary>
+     public Phoneme Append(string str)
+     {
+         phonemeText.Append(str);
+         return this;
+     }
+
+     /// <summary>Gets the languages this phoneme applies to.</summary>
+     public LanguageSet Languages
+     {
+         get { return languages; }
+     }
+
+     /// <summary>Gets a new one-element array containing this phoneme.</summary>
+     public IList<Phoneme> Phonemes
+     {
+         get { return new Phoneme[] { this }; }
+     }
+
+     /// <summary>Returns the current phoneme text as a string.</summary>
+     public string GetPhonemeText()
+     {
+         return phonemeText.ToString();
+     }
+
+     /// <summary>
+     /// Returns a new phoneme with the concatenated text of this and <paramref name="right"/>,
+     /// and this phoneme's languages restricted to those of <paramref name="right"/>.
+     /// </summary>
+     [Obsolete("since 1.9")]
+     public Phoneme Join(Phoneme right)
+     {
+         string joinedText = phonemeText.ToString() + right.phonemeText.ToString();
+         LanguageSet restricted = languages.RestrictTo(right.Languages);
+         return new Phoneme(joinedText, restricted);
+     }
+ }
+
+ /// <summary>
+ /// An expression that can be expanded into a list of <see cref="Phoneme"/>s.
+ /// </summary>
+ public interface IPhonemeExpr
+ {
+ /// <summary>
+ /// Gets the phonemes this expression stands for.
+ /// </summary>
+ IList<Phoneme> Phonemes { get; }
+ }
+
+ /// <summary>
+ /// An <see cref="IPhonemeExpr"/> backed by a list of alternative <see cref="Phoneme"/>s.
+ /// The list passed to the constructor is stored and returned as is, without copying.
+ /// </summary>
+ public sealed class PhonemeList : IPhonemeExpr
+ {
+     private readonly IList<Phoneme> phonemes;
+
+     public PhonemeList(IList<Phoneme> phonemes)
+     {
+         this.phonemes = phonemes;
+     }
+
+     /// <summary>Gets the list supplied at construction.</summary>
+     public IList<Phoneme> Phonemes
+     {
+         get { return phonemes; }
+     }
+ }
+
+ /// <summary>
+ /// A minimal wrapper around the functionality of <see cref="Rule"/> Pattern that we use, to allow for alternate implementations.
+ /// One overload is declared per supported input type.
+ /// </summary>
+ public interface IRPattern
+ {
+ /// <summary>Decides whether this pattern matches the given <see cref="ICharSequence"/>.</summary>
+ bool IsMatch(ICharSequence input);
+ /// <summary>Decides whether this pattern matches the given <see cref="string"/>.</summary>
+ bool IsMatch(string input);
+ /// <summary>Decides whether this pattern matches the given <see cref="StringBuilder"/>.</summary>
+ bool IsMatch(StringBuilder input);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/RuleType.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/RuleType.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/RuleType.cs
new file mode 100644
index 0000000..ff3af97
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/RuleType.cs
@@ -0,0 +1,68 @@
+// commons-codec version compatibility level: 1.9
+using System;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Types of rule.
+ /// <para/>
+ /// The lowercase name of each member is available through <see cref="RuleTypeExtensions.GetName(RuleType)"/>.
+ /// <para/>
+ /// since 1.6
+ /// </summary>
+ public enum RuleType
+ {
+ /// <summary>
+ /// Approximate rules, which will lead to the largest number of phonetic interpretations.
+ /// </summary>
+ APPROX,
+
+ /// <summary>
+ /// Exact rules, which will lead to a minimum number of phonetic interpretations.
+ /// </summary>
+ EXACT,
+
+ /// <summary>
+ /// For internal use only. Please use <see cref="APPROX"/> or <see cref="EXACT"/>.
+ /// </summary>
+ RULES
+ }
+
+ /// <summary>
+ /// Extension methods for <see cref="RuleType"/>.
+ /// </summary>
+ public static class RuleTypeExtensions
+ {
+     /// <summary>
+     /// Gets the lowercase name of a rule type: "approx", "exact" or "rules".
+     /// </summary>
+     /// <param name="ruleType">The <see cref="RuleType"/>.</param>
+     /// <returns>The rule name.</returns>
+     /// <exception cref="ArgumentException"><paramref name="ruleType"/> is not one of the declared members.</exception>
+     public static string GetName(this RuleType ruleType)
+     {
+         if (ruleType == RuleType.APPROX)
+         {
+             return "approx";
+         }
+         if (ruleType == RuleType.EXACT)
+         {
+             return "exact";
+         }
+         if (ruleType == RuleType.RULES)
+         {
+             return "rules";
+         }
+
+         throw new ArgumentException("Invalid ruleType");
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_any.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_any.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_any.txt
new file mode 100644
index 0000000..3f4f4c9
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_any.txt
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// CONSONANTS
+"ph" "" "" "f" // foreign
+"sh" "" "" "S" // foreign
+"kh" "" "" "x" // foreign
+
+"gli" "" "" "(gli|l[italian])"
+"gni" "" "" "(gni|ni[italian+french])"
+"gn" "" "[aeou]" "(n[italian+french]|nj[italian+french]|gn)
+"gh" "" "" "g" // It + translit. from Arabic
+"dh" "" "" "d" // translit. from Arabic
+"bh" "" "" "d" // translit. from Arabic
+"th" "" "" "t" // translit. from Arabic
+"lh" "" "" "l" // Port
+"nh" "" "" "nj" // Port
+
+"ig" "[aeiou]" "" "(ig|tS[spanish])"
+"ix" "[aeiou]" "" "S" // Sp
+"tx" "" "" "tS" // Sp
+"tj" "" "$" "tS" // Sp
+"tj" "" "" "dZ" // Sp
+"tg" "" "" "(tg|dZ[spanish])"
+
+"gi" "" "[aeou]" "dZ" // Italian
+"g" "" "y" "Z" // French
+"gg" "" "[ei]" "(gZ[portuguese+french]|dZ[italian+spanish]|x[spanish])"
+"g" "" "[ei]" "(Z[portuguese+french]|dZ[italian+spanish]|x[spanish])"
+
+"guy" "" "" "gi"
+"gue" "" "$" "(k[french]|ge)"
+"gu" "" "[ei]" "(g|gv") // not It
+"gu" "" "[ao]" "gv" // not It
+
+"ñ" "" "" "(n|nj)"
+"ny" "" "" "nj"
+
+"sc" "" "[ei]" "(s|S[italian])"
+"sç" "" "[aeiou]" "s" // not It
+"ss" "" "" "s"
+"ç" "" "" "s" // not It
+
+"ch" "" "[ei]" "(k[italian]|S[portuguese+french]|tS[spanish]|dZ[spanish])"
+"ch" "" "" "(S|tS[spanish]|dZ[spanish])"
+
+"ci" "" "[aeou]" "(tS[italian]|si)"
+"cc" "" "[eiyéèê]" "(tS[italian]|ks[portuguese+french+spanish])"
+"c" "" "[eiyéèê]" "(tS[italian]|s[portuguese+french+spanish])"
+ //array("c" "" "[aou]" "(k|C[".($portuguese+$spanish)."])" // "C" means that the actual letter could be "ç" (cedille omitted)
+
+"s" "^" "" "s"
+"s" "[aáuiíoóeéêy]" "[aáuiíoóeéêy]" "(s[spanish]|z[portuguese+french+italian])"
+"s" "" "[dglmnrv]" "(z|Z[portuguese])"
+
+"z" "" "$" "(s|ts[italian]|S[portuguese])" // ts It, s/S/Z Port, s in Sp, z Fr
+"z" "" "[bdgv]" "(z|dz[italian]|Z[portuguese])" // dz It, Z/z Port, z Sp & Fr
+"z" "" "[ptckf]" "(s|ts[italian]|S[portuguese])" // ts It, s/S/z Port, z/s Sp
+"z" "" "" "(z|dz[italian]|ts[italian]|s[spanish])" // ts/dz It, z Port & Fr, z/s Sp
+
+"que" "" "$" "(k[french]|ke)"
+"qu" "" "[eiu]" "k"
+"qu" "" "[ao]" "(kv|k)" // k is It
+
+"ex" "" "[aáuiíoóeéêy]" "(ez[portuguese]|eS[portuguese]|eks|egz)"
+"ex" "" "[cs]" "(e[portuguese]|ek)"
+
+"m" "" "[cdglnrst]" "(m|n[portuguese])"
+"m" "" "[bfpv]" "(m|n[portuguese+spanish])"
+"m" "" "$" "(m|n[portuguese])"
+
+"b" "^" "" "(b|V[spanish])"
+"v" "^" "" "(v|B[spanish])"
+
+ // VOWELS
+"eau" "" "" "o" // Fr
+
+"ouh" "" "[aioe]" "(v[french]|uh)"
+"uh" "" "[aioe]" "(v|uh)"
+"ou" "" "[aioe]" "v" // french
+"uo" "" "" "(vo|o)"
+"u" "" "[aie]" "v"
+
+"i" "[aáuoóeéê]" "" "j"
+"i" "" "[aeou]" "j"
+"y" "[aáuiíoóeéê]" "" "j"
+"y" "" "[aeiíou]" "j"
+"e" "" "$" "(e|E[$french])"
+
+"ão" "" "" "(au|an)" // Port
+"ãe" "" "" "(aj|an)" // Port
+"ãi" "" "" "(aj|an)" // Port
+"õe" "" "" "(oj|on)" // Port
+"où" "" "" "u" // Fr
+"ou" "" "" "(ou|u[french])"
+
+"â" "" "" "a" // Port & Fr
+"à" "" "" "a" // Port
+"á" "" "" "a" // Port & Sp
+"ã" "" "" "(a|an)" // Port
+"é" "" "" "e"
+"ê" "" "" "e" // Port & Fr
+"è" "" "" "e" // Sp & Fr & It
+"í" "" "" "i" // Port & Sp
+"î" "" "" "i" // Fr
+"ô" "" "" "o" // Port & Fr
+"ó" "" "" "o" // Port & Sp & It
+"õ" "" "" "(o|on)" // Port
+"ò" "" "" "o" // Sp & It
+"ú" "" "" "u" // Port & Sp
+"ü" "" "" "u" // Port & Sp
+
+ // LATIN ALPHABET
+"a" "" "" "a"
+"b" "" "" "(b|v[spanish])"
+"c" "" "" "k"
+"d" "" "" "d"
+"e" "" "" "e"
+"f" "" "" "f"
+"g" "" "" "g"
+"h" "" "" "h"
+"i" "" "" "i"
+"j" "" "" "(x[spanish]|Z)" // not It
+"k" "" "" "k"
+"l" "" "" "l"
+"m" "" "" "m"
+"n" "" "" "n"
+"o" "" "" "o"
+"p" "" "" "p"
+"q" "" "" "k"
+"r" "" "" "r"
+"s" "" "" "(s|S[portuguese])"
+"t" "" "" "t"
+"u" "" "" "u"
+"v" "" "" "(v|b[spanish])"
+"w" "" "" "v" // foreign
+"x" "" "" "(ks|gz|S[portuguese+spanish])" // S/ks Port & Sp, gz Sp, It only ks
+"y" "" "" "i"
+"z" "" "" "z"
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_common.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_common.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_common.txt
new file mode 100644
index 0000000..e95a756
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_common.txt
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include ash_exact_approx_common
+
+// REGRESSIVE ASSIMILATION OF CONSONANTS
+"n" "" "[bp]" "m"
+
+// PECULIARITY OF "h"
+"h" "" "" ""
+"H" "" "" "(x|)"
+
+// POLISH OGONEK IMPOSSIBLE
+"F" "" "[bdgkpstvzZ]h" "e"
+"F" "" "[bdgkpstvzZ]x" "e"
+"B" "" "[bdgkpstvzZ]h" "a"
+"B" "" "[bdgkpstvzZ]x" "a"
+
+// "e" and "i" ARE TO BE OMITTED BEFORE (SYLLABIC) n & l: Halperin=Halpern; Frankel = Frankl, Finkelstein = Finklstein
+"e" "[bdfgklmnprsStvzZ]" "[ln]$" ""
+"i" "[bdfgklmnprsStvzZ]" "[ln]$" ""
+"E" "[bdfgklmnprsStvzZ]" "[ln]$" ""
+"I" "[bdfgklmnprsStvzZ]" "[ln]$" ""
+"F" "[bdfgklmnprsStvzZ]" "[ln]$" ""
+"Q" "[bdfgklmnprsStvzZ]" "[ln]$" ""
+"Y" "[bdfgklmnprsStvzZ]" "[ln]$" ""
+
+"e" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
+"i" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
+"E" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
+"I" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
+"F" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
+"Q" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
+"Y" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
+
+"lEs" "" "" "(lEs|lz)" // Applebaum < Appelbaum (English + blend English-something forms as Finklestein)
+"lE" "[bdfgkmnprStvzZ]" "" "(lE|l)" // Applebaum < Appelbaum (English + blend English-something forms as Finklestein)
+
+// SIMPLIFICATION: (TRIPHTHONGS & DIPHTHONGS) -> ONE GENERIC DIPHTHONG "D"
+"aue" "" "" "D"
+"oue" "" "" "D"
+
+"AvE" "" "" "(D|AvE)"
+"Ave" "" "" "(D|Ave)"
+"avE" "" "" "(D|avE)"
+"ave" "" "" "(D|ave)"
+
+"OvE" "" "" "(D|OvE)"
+"Ove" "" "" "(D|Ove)"
+"ovE" "" "" "(D|ovE)"
+"ove" "" "" "(D|ove)"
+
+"ea" "" "" "(D|ea)"
+"EA" "" "" "(D|EA)"
+"Ea" "" "" "(D|Ea)"
+"eA" "" "" "(D|eA)"
+
+"aji" "" "" "D"
+"ajI" "" "" "D"
+"aje" "" "" "D"
+"ajE" "" "" "D"
+
+"Aji" "" "" "D"
+"AjI" "" "" "D"
+"Aje" "" "" "D"
+"AjE" "" "" "D"
+
+"oji" "" "" "D"
+"ojI" "" "" "D"
+"oje" "" "" "D"
+"ojE" "" "" "D"
+
+"Oji" "" "" "D"
+"OjI" "" "" "D"
+"Oje" "" "" "D"
+"OjE" "" "" "D"
+
+"eji" "" "" "D"
+"ejI" "" "" "D"
+"eje" "" "" "D"
+"ejE" "" "" "D"
+
+"Eji" "" "" "D"
+"EjI" "" "" "D"
+"Eje" "" "" "D"
+"EjE" "" "" "D"
+
+"uji" "" "" "D"
+"ujI" "" "" "D"
+"uje" "" "" "D"
+"ujE" "" "" "D"
+
+"Uji" "" "" "D"
+"UjI" "" "" "D"
+"Uje" "" "" "D"
+"UjE" "" "" "D"
+
+"iji" "" "" "D"
+"ijI" "" "" "D"
+"ije" "" "" "D"
+"ijE" "" "" "D"
+
+"Iji" "" "" "D"
+"IjI" "" "" "D"
+"Ije" "" "" "D"
+"IjE" "" "" "D"
+
+"aja" "" "" "D"
+"ajA" "" "" "D"
+"ajo" "" "" "D"
+"ajO" "" "" "D"
+"aju" "" "" "D"
+"ajU" "" "" "D"
+
+"Aja" "" "" "D"
+"AjA" "" "" "D"
+"Ajo" "" "" "D"
+"AjO" "" "" "D"
+"Aju" "" "" "D"
+"AjU" "" "" "D"
+
+"oja" "" "" "D"
+"ojA" "" "" "D"
+"ojo" "" "" "D"
+"ojO" "" "" "D"
+"Aju" "" "" "D"
+"AjU" "" "" "D"
+
+"Oja" "" "" "D"
+"OjA" "" "" "D"
+"Ojo" "" "" "D"
+"OjO" "" "" "D"
+"Aju" "" "" "D"
+"AjU" "" "" "D"
+
+"eja" "" "" "D"
+"ejA" "" "" "D"
+"ejo" "" "" "D"
+"ejO" "" "" "D"
+"Aju" "" "" "D"
+"AjU" "" "" "D"
+
+"Eja" "" "" "D"
+"EjA" "" "" "D"
+"Ejo" "" "" "D"
+"EjO" "" "" "D"
+"Aju" "" "" "D"
+"AjU" "" "" "D"
+
+"uja" "" "" "D"
+"ujA" "" "" "D"
+"ujo" "" "" "D"
+"ujO" "" "" "D"
+"Aju" "" "" "D"
+"AjU" "" "" "D"
+
+"Uja" "" "" "D"
+"UjA" "" "" "D"
+"Ujo" "" "" "D"
+"UjO" "" "" "D"
+"Aju" "" "" "D"
+"AjU" "" "" "D"
+
+"ija" "" "" "D"
+"ijA" "" "" "D"
+"ijo" "" "" "D"
+"ijO" "" "" "D"
+"Aju" "" "" "D"
+"AjU" "" "" "D"
+
+"Ija" "" "" "D"
+"IjA" "" "" "D"
+"Ijo" "" "" "D"
+"IjO" "" "" "D"
+"Aju" "" "" "D"
+"AjU" "" "" "D"
+
+"j" "" "" "i"
+
+// lander = lender = länder
+"lYndEr" "" "$" "lYnder"
+"lander" "" "$" "lYnder"
+"lAndEr" "" "$" "lYnder"
+"lAnder" "" "$" "lYnder"
+"landEr" "" "$" "lYnder"
+"lender" "" "$" "lYnder"
+"lEndEr" "" "$" "lYnder"
+"lendEr" "" "$" "lYnder"
+"lEnder" "" "$" "lYnder"
+
+// CONSONANTS {z & Z; s & S} are approximately interchangeable
+"s" "" "[rmnl]" "z"
+"S" "" "[rmnl]" "z"
+"s" "[rmnl]" "" "z"
+"S" "[rmnl]" "" "z"
+
+"dS" "" "$" "S"
+"dZ" "" "$" "S"
+"Z" "" "$" "S"
+"S" "" "$" "(S|s)"
+"z" "" "$" "(S|s)"
+
+"S" "" "" "s"
+"dZ" "" "" "z"
+"Z" "" "" "z"
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_cyrillic.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_cyrillic.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_cyrillic.txt
new file mode 100644
index 0000000..4210173
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_cyrillic.txt
@@ -0,0 +1,18 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include ash_approx_russian
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_english.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_english.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_english.txt
new file mode 100644
index 0000000..84d8174
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_english.txt
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// VOWELS
+"I" "" "[^aEIeiou]e" "(Q|i|D)" // like in "five"
+"I" "" "$" "i"
+"I" "[aEIeiou]" "" "i"
+"I" "" "[^k]$" "i"
+"Ik" "[lr]" "$" "(ik|Qk)"
+"Ik" "" "$" "ik"
+"sIts" "" "$" "(sits|sQts)"
+"Its" "" "$" "its"
+"I" "" "" "(i|Q)"
+
+"lE" "[bdfgkmnprsStvzZ]" "" "(il|li|lY)" // Applebaum < Appelbaum
+
+"au" "" "" "(D|a|u)"
+"ou" "" "" "(D|o|u)"
+"ai" "" "" "(D|a|i)"
+"oi" "" "" "(D|o|i)"
+"ui" "" "" "(D|u|i)"
+
+"E" "D[^aeiEIou]" "" "(i|)" // Weinberg, Shaneberg (shaneberg/shejneberg) --> shejnberg
+"e" "D[^aeiEIou]" "" "(i|)"
+
+"e" "" "" "i"
+"E" "" "[fklmnprsStv]$" "i"
+"E" "" "ts$" "i"
+"E" "[DaoiEuQY]" "" "i"
+"E" "" "[aoQY]" "i"
+"E" "" "" "(Y|i)"
+
+"a" "" "" "(a|o)"
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_french.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_french.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_french.txt
new file mode 100644
index 0000000..fa8ee99
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_french.txt
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+"I" "" "$" "i"
+"I" "[aEIeiou]" "" "i"
+"I" "" "[^k]$" "i"
+"Ik" "[lr]" "$" "(ik|Qk)"
+"Ik" "" "$" "ik"
+"sIts" "" "$" "(sits|sQts)"
+"Its" "" "$" "its"
+"I" "" "" "(i|Q)"
+
+"au" "" "" "(D|a|u)"
+"ou" "" "" "(D|o|u)"
+"ai" "" "" "(D|a|i)"
+"oi" "" "" "(D|o|i)"
+"ui" "" "" "(D|u|i)"
+
+"a" "" "" "(a|o)"
+"e" "" "" "i"
+
+"E" "" "[fklmnprsStv]$" "i"
+"E" "" "ts$" "i"
+"E" "[aoiuQ]" "" "i"
+"E" "" "[aoQ]" "i"
+"E" "" "" "(Y|i)"
\ No newline at end of file