You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/06/27 20:33:51 UTC
[06/15] lucenenet git commit: Added Lucene.Net.Analysis.Phonetic +
tests. Rather than porting over the entire commons-codec library,
only the language features were ported and added to this library.
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/DoubleMetaphoneFilterTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/DoubleMetaphoneFilterTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/DoubleMetaphoneFilterTest.cs
new file mode 100644
index 0000000..07e7f66
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/DoubleMetaphoneFilterTest.cs
@@ -0,0 +1,111 @@
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Phonetic
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Checks the tokens emitted by <see cref="DoubleMetaphoneFilter"/> for fixed inputs,
+ /// for random data, and for an empty term.
+ /// </summary>
+ public class DoubleMetaphoneFilterTest : BaseTokenStreamTestCase
+ {
+     // Splits the text on whitespace and wraps the tokenizer in the filter under test.
+     private TokenStream CreateFilter(string text, int codeLength, bool inject)
+     {
+         TokenStream source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(text));
+         return new DoubleMetaphoneFilter(source, codeLength, inject);
+     }
+
+     // Builds an analyzer that feeds a whitespace MockTokenizer into the filter under test.
+     private Analyzer CreateMockAnalyzer(int codeLength, bool inject)
+     {
+         return Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+         {
+             Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+             TokenStream sink = new DoubleMetaphoneFilter(source, codeLength, inject);
+             return new TokenStreamComponents(source, sink);
+         });
+     }
+
+     [Test]
+     public void TestSize4FalseInject()
+     {
+         AssertTokenStreamContents(CreateFilter("international", 4, false), new[] { "ANTR" });
+     }
+
+     [Test]
+     public void TestSize4TrueInject()
+     {
+         // With inject enabled the original token is expected ahead of its code.
+         AssertTokenStreamContents(CreateFilter("international", 4, true), new[] { "international", "ANTR" });
+     }
+
+     [Test]
+     public void TestAlternateInjectFalse()
+     {
+         // A single input token is expected to yield two codes here.
+         AssertTokenStreamContents(CreateFilter("Kuczewski", 4, false), new[] { "KSSK", "KXFS" });
+     }
+
+     [Test]
+     public void TestSize8FalseInject()
+     {
+         AssertTokenStreamContents(CreateFilter("international", 8, false), new[] { "ANTRNXNL" });
+     }
+
+     [Test]
+     public void TestNonConvertableStringsWithInject()
+     {
+         AssertTokenStreamContents(CreateFilter("12345 #$%@#^%&", 8, true), new[] { "12345", "#$%@#^%&" });
+     }
+
+     [Test]
+     public void TestNonConvertableStringsWithoutInject()
+     {
+         AssertTokenStreamContents(CreateFilter("12345 #$%@#^%&", 8, false), new[] { "12345", "#$%@#^%&" });
+
+         // should have something after the stream
+         AssertTokenStreamContents(
+             CreateFilter("12345 #$%@#^%& hello", 8, false),
+             new[] { "12345", "#$%@#^%&", "HL" });
+     }
+
+     [Test]
+     public void TestRandom()
+     {
+         // One randomly chosen code length is shared by both analyzers below.
+         int codeLength = TestUtil.NextInt(Random(), 1, 8);
+
+         Analyzer injectDisabled = CreateMockAnalyzer(codeLength, false);
+         CheckRandomData(Random(), injectDisabled, 1000 * RANDOM_MULTIPLIER);
+
+         Analyzer injectEnabled = CreateMockAnalyzer(codeLength, true);
+         CheckRandomData(Random(), injectEnabled, 1000 * RANDOM_MULTIPLIER);
+     }
+
+     [Test]
+     public void TestEmptyTerm()
+     {
+         Analyzer keywordAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+         {
+             Tokenizer source = new KeywordTokenizer(reader);
+             // The inject flag is picked at random each time components are created.
+             TokenStream sink = new DoubleMetaphoneFilter(source, 8, Random().nextBoolean());
+             return new TokenStreamComponents(source, sink);
+         });
+
+         CheckOneTerm(keywordAnalyzer, "", "");
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/BeiderMorseEncoderTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/BeiderMorseEncoderTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/BeiderMorseEncoderTest.cs
new file mode 100644
index 0000000..bd3681b
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/BeiderMorseEncoderTest.cs
@@ -0,0 +1,255 @@
+using NUnit.Framework;
+using System;
+using System.Globalization;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests BeiderMorseEncoder.
+ /// </summary>
+ public class BeiderMorseEncoderTest : StringEncoderAbstractTest<BeiderMorseEncoder>
+ {
+     // Characters cycled through by TestSpeedCheck to build its growing input.
+     private static readonly char[] TEST_CHARS = new char[] { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'o', 'u' };
+
+     /// <summary>
+     /// Asserts that encoding <paramref name="value"/> does not yield the empty string;
+     /// the value itself is used as the failure message.
+     /// </summary>
+     private void AssertNotEmpty(BeiderMorseEncoder bmpm, string value)
+     {
+         Assert.False(bmpm.Encode(value).Equals(""), value);
+     }
+
+     /// <summary>
+     /// Creates an encoder configured with <c>NameType.GENERIC</c> and <c>RuleType.APPROX</c>.
+     /// </summary>
+     private BeiderMorseEncoder CreateGenericApproxEncoder()
+     {
+         BeiderMorseEncoder encoder = new BeiderMorseEncoder();
+         encoder.NameType = NameType.GENERIC;
+         encoder.RuleType = RuleType.APPROX;
+         return encoder;
+     }
+
+     /// <summary>
+     /// Supplies the base test class with a default-constructed encoder.
+     /// </summary>
+     protected override BeiderMorseEncoder CreateStringEncoder()
+     {
+         return new BeiderMorseEncoder();
+     }
+
+     /// <summary>
+     /// Tests we do not blow up: encodes every single-character string from
+     /// <c>char.MinValue</c> up to (but not including) <c>char.MaxValue</c>.
+     /// </summary>
+     [Test]
+     public void TestAllChars()
+     {
+         BeiderMorseEncoder bmpm = CreateGenericApproxEncoder();
+         for (char c = char.MinValue; c < char.MaxValue; c++)
+         {
+             bmpm.Encode(c.ToString());
+         }
+     }
+
+     /// <summary>
+     /// Every single ASCII letter, in lower and upper case, must encode to something non-empty.
+     /// </summary>
+     [Test]
+     public void TestAsciiEncodeNotEmpty1Letter()
+     {
+         BeiderMorseEncoder bmpm = CreateGenericApproxEncoder();
+         for (char c = 'a'; c <= 'z'; c++)
+         {
+             string value = c.ToString();
+             string valueU = value.ToUpperInvariant();
+             AssertNotEmpty(bmpm, value);
+             AssertNotEmpty(bmpm, valueU);
+         }
+     }
+
+     /// <summary>
+     /// Every pair of ASCII letters, in lower and upper case, must encode to something non-empty.
+     /// </summary>
+     [Test]
+     public void TestAsciiEncodeNotEmpty2Letters()
+     {
+         BeiderMorseEncoder bmpm = CreateGenericApproxEncoder();
+         for (char c1 = 'a'; c1 <= 'z'; c1++)
+         {
+             for (char c2 = 'a'; c2 <= 'z'; c2++)
+             {
+                 string value = new string(new char[] { c1, c2 });
+                 string valueU = value.ToUpperInvariant();
+                 AssertNotEmpty(bmpm, value);
+                 AssertNotEmpty(bmpm, valueU);
+             }
+         }
+     }
+
+     /// <summary>
+     /// Names containing U+00E1 followed by "cz", "tz" or "c" must encode to something non-empty.
+     /// </summary>
+     [Test]
+     public void TestEncodeAtzNotEmpty()
+     {
+         BeiderMorseEncoder bmpm = CreateGenericApproxEncoder();
+         //String[] names = { "ácz", "átz", "Ignácz", "Ignátz", "Ignác" };
+         string[] names = { "\u00e1cz", "\u00e1tz", "Ign\u00e1cz", "Ign\u00e1tz", "Ign\u00e1c" };
+         foreach (string name in names)
+         {
+             AssertNotEmpty(bmpm, name);
+         }
+     }
+
+     /// <summary>
+     /// Encoding "gna" must complete without throwing.
+     /// </summary>
+     [Test]
+     public void TestEncodeGna()
+     {
+         // See https://issues.apache.org/jira/browse/CODEC-125?focusedCommentId=13071566&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-13071566
+         BeiderMorseEncoder bmpm = CreateGenericApproxEncoder();
+         bmpm.Encode("gna");
+     }
+
+     [Test]//@Test(expected = IllegalArgumentException.class)
+     public void TestInvalidLangIllegalArgumentException()
+     {
+         Assert.Throws<ArgumentException>(() => Rule.GetInstance(NameType.GENERIC, RuleType.APPROX, "noSuchLanguage"));
+     }
+
+     [Test]//@Test(expected = IllegalStateException.class)
+     public void TestInvalidLangIllegalStateException()
+     {
+         Assert.Throws<InvalidOperationException>(() => Lang.LoadFromResource("thisIsAMadeUpResourceName", Languages.GetInstance(NameType.GENERIC)));
+     }
+
+     [Test]//@Test(expected = IllegalArgumentException.class)
+     public void TestInvalidLanguageIllegalArgumentException()
+     {
+         Assert.Throws<ArgumentException>(() => Languages.GetInstance("thereIsNoSuchLanguage"));
+     }
+
+     // NOTE: the Java original declared a 10 second timeout; no timeout is applied here.
+     [Test]//@Test(timeout = 10000L)
+     public void TestLongestEnglishSurname()
+     {
+         BeiderMorseEncoder bmpm = CreateGenericApproxEncoder();
+         bmpm.Encode("MacGhilleseatheanaich");
+     }
+
+     [Test]//@Test(expected = IndexOutOfBoundsException.class)
+     public void TestNegativeIndexForRuleMatchIndexOutOfBoundsException()
+     {
+         Assert.Throws<ArgumentOutOfRangeException>(() =>
+         {
+             Rule r = new Rule("a", "", "", new Phoneme("", Languages.ANY_LANGUAGE));
+             r.PatternAndContextMatches("bob", -1);
+         });
+     }
+
+     /// <summary>
+     /// Encodes a long markup-like phrase with the phoneme limit set to 10 and checks that
+     /// the result is non-empty and splits into at most 10 '|'-separated parts.
+     /// </summary>
+     [Test]
+     public void TestOOM()
+     {
+         string phrase = "200697900'-->�</ bceaeef >aadaabcf\"aedfbff<!--\'-->?>cae"
+             + "cfaaa><?&#<!--</script>&lang&fc;aadeaf?>>&bdquo< cc =\"abff\" /></ afe >"
+             + "<script><!-- f(';< cf aefbeef = \"bfabadcf\" ebbfeedd = fccabeb >";
+
+         BeiderMorseEncoder encoder = new BeiderMorseEncoder();
+         encoder.NameType = NameType.GENERIC;
+         encoder.RuleType = RuleType.EXACT;
+         encoder.SetMaxPhonemes(10);
+
+         string phonemes = encoder.Encode(phrase);
+         Assert.True(phonemes.Length > 0);
+
+         // A plain character split gives the same parts as the former Regex("\\|") split.
+         string[] phonemeArr = phonemes.Split('|');
+         Assert.True(phonemeArr.Length <= 10);
+     }
+
+     [Test]
+     public void TestSetConcat()
+     {
+         BeiderMorseEncoder bmpm = new BeiderMorseEncoder();
+         bmpm.IsConcat = false;
+         Assert.False(bmpm.IsConcat, "Should be able to set concat to false");
+     }
+
+     [Test]
+     public void TestSetNameTypeAsh()
+     {
+         BeiderMorseEncoder bmpm = new BeiderMorseEncoder();
+         bmpm.NameType = NameType.ASHKENAZI;
+         Assert.AreEqual(NameType.ASHKENAZI, bmpm.NameType, "Name type should have been set to ash");
+     }
+
+     [Test]
+     public void TestSetRuleTypeExact()
+     {
+         BeiderMorseEncoder bmpm = new BeiderMorseEncoder();
+         bmpm.RuleType = RuleType.EXACT;
+         Assert.AreEqual(RuleType.EXACT, bmpm.RuleType, "Rule type should have been set to exact");
+     }
+
+     [Test]//@Test(expected = IllegalArgumentException.class)
+     public void TestSetRuleTypeToRulesIllegalArgumentException()
+     {
+         Assert.Throws<ArgumentException>(() =>
+         {
+             BeiderMorseEncoder bmpm = new BeiderMorseEncoder();
+             bmpm.RuleType = RuleType.RULES;
+         });
+     }
+
+     /// <summary>
+     /// (Un)luckily, the worst performing test because of the data in TEST_CHARS.
+     /// Encodes 40 inputs, each one character longer than the last, built by cycling
+     /// through TEST_CHARS. The Java original declared a 20 second timeout; no
+     /// timeout is applied here.
+     /// </summary>
+     [Test]/* timeout = 20000L */
+     public void TestSpeedCheck()
+     {
+         BeiderMorseEncoder bmpm = this.CreateGenericApproxEncoder();
+         StringBuilder stringBuffer = new StringBuilder();
+         // Use the real StringBuilder members (Append/ToString) rather than Java-cased names.
+         stringBuffer.Append(TEST_CHARS[0]);
+         for (int i = 0, j = 1; i < 40; i++, j++)
+         {
+             if (j == TEST_CHARS.Length)
+             {
+                 j = 0; // wrap around to the first test character
+             }
+             bmpm.Encode(stringBuffer.ToString());
+             stringBuffer.Append(TEST_CHARS[j]);
+         }
+     }
+
+     /// <summary>
+     /// Encodes every prefix of a 41-character phrase.
+     /// </summary>
+     [Test]
+     public void TestSpeedCheck2()
+     {
+         BeiderMorseEncoder bmpm = this.CreateGenericApproxEncoder();
+         string phrase = "ItstheendoftheworldasweknowitandIfeelfine";
+
+         for (int i = 1; i <= phrase.Length; i++)
+         {
+             bmpm.Encode(phrase.Substring(0, i));
+         }
+     }
+
+     /// <summary>
+     /// Encodes every prefix of the lower-case alphabet repeated twice.
+     /// </summary>
+     [Test]
+     public void TestSpeedCheck3()
+     {
+         BeiderMorseEncoder bmpm = this.CreateGenericApproxEncoder();
+         string phrase = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz";
+
+         for (int i = 1; i <= phrase.Length; i++)
+         {
+             bmpm.Encode(phrase.Substring(0, i));
+         }
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/CacheSubSequencePerformanceTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/CacheSubSequencePerformanceTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/CacheSubSequencePerformanceTest.cs
new file mode 100644
index 0000000..45e9513
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/CacheSubSequencePerformanceTest.cs
@@ -0,0 +1,138 @@
+using Lucene.Net.Attributes;
+using Lucene.Net.Support;
+using NUnit.Framework;
+using System;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Rough timing comparison of taking every sub-sequence of a short input from a
+ /// string, from a StringBuilder, and from a caching <see cref="ICharSequence"/> wrapper.
+ /// Timings are only written to the console; nothing is asserted.
+ /// </summary>
+ public class CacheSubSequencePerformanceTest
+ {
+     [Test, LongRunningTest]
+     public void Test()
+     {
+         //int times = 10000000;
+         int times = 100000; // LUCENENET: 10 million times would take several minutes to run - decreasing to 100,000
+         Console.WriteLine("Test with String : ");
+         Test("Angelo", times);
+         Console.WriteLine("Test with StringBuilder : ");
+         Test(new StringBuilder("Angelo"), times);
+         // Pass the caching wrapper itself: calling ToString() on it first would bypass
+         // SubSequence, so the cache would never be exercised.
+         Console.WriteLine("Test with cached String : ");
+         Test(CacheSubSequence("Angelo"), times);
+         Console.WriteLine("Test with cached StringBuilder : ");
+         Test(CacheSubSequence(new StringBuilder("Angelo")), times);
+     }
+
+     // Prints the wall-clock time elapsed since beginTicks. DateTime.Ticks counts
+     // 100-nanosecond units, so convert before labelling the figure as milliseconds.
+     private static void PrintElapsedMillis(long beginTicks)
+     {
+         long elapsedMillis = (DateTime.UtcNow.Ticks - beginTicks) / TimeSpan.TicksPerMillisecond;
+         Console.WriteLine(elapsedMillis + " millis");
+     }
+
+     // Runs the string sub-sequence sweep the given number of times and prints the elapsed time.
+     private void Test(string input, int times)
+     {
+         long beginTime = DateTime.UtcNow.Ticks;
+         for (int i = 0; i < times; i++)
+         {
+             Test(input);
+         }
+         PrintElapsedMillis(beginTime);
+     }
+
+     // Runs the StringBuilder sub-sequence sweep the given number of times and prints the elapsed time.
+     private void Test(StringBuilder input, int times)
+     {
+         long beginTime = DateTime.UtcNow.Ticks;
+         for (int i = 0; i < times; i++)
+         {
+             Test(input);
+         }
+         PrintElapsedMillis(beginTime);
+     }
+
+     // Runs the ICharSequence sub-sequence sweep the given number of times and prints the elapsed time.
+     private void Test(ICharSequence input, int times)
+     {
+         long beginTime = DateTime.UtcNow.Ticks;
+         for (int i = 0; i < times; i++)
+         {
+             Test(input);
+         }
+         PrintElapsedMillis(beginTime);
+     }
+
+     // Takes every substring [i, j) of the input, including the empty ones.
+     private void Test(string input)
+     {
+         for (int i = 0; i < input.Length; i++)
+         {
+             for (int j = i; j <= input.Length; j++)
+             {
+                 input.Substring(i, (j - i));
+             }
+         }
+     }
+
+     // Takes every substring [i, j) of the builder's content, including the empty ones.
+     private void Test(StringBuilder input)
+     {
+         for (int i = 0; i < input.Length; i++)
+         {
+             for (int j = i; j <= input.Length; j++)
+             {
+                 input.ToString(i, (j - i));
+             }
+         }
+     }
+
+     // Takes every sub-sequence [i, j) through the ICharSequence API, including the empty ones.
+     private void Test(ICharSequence input)
+     {
+         for (int i = 0; i < input.Length; i++)
+         {
+             for (int j = i; j <= input.Length; j++)
+             {
+                 input.SubSequence(i, j);
+             }
+         }
+     }
+
+     /// <summary>
+     /// Character sequence over a fixed string that remembers each non-empty
+     /// sub-sequence in a table indexed by [start][end - 1].
+     /// </summary>
+     private class CachedCharSequence : ICharSequence
+     {
+         private readonly string[][] cache;
+         private readonly string cached;
+
+         public CachedCharSequence(string[][] cache, string cached)
+         {
+             this.cache = cache;
+             this.cached = cached;
+         }
+
+         public char this[int index]
+         {
+             get
+             {
+                 return cached[index];
+             }
+         }
+
+         public int Length
+         {
+             get
+             {
+                 return cached.Length;
+             }
+         }
+
+         public ICharSequence SubSequence(int start, int end)
+         {
+             // Empty ranges are not cached; end - 1 would not be a valid slot for them.
+             if (start == end)
+             {
+                 return "".ToCharSequence();
+             }
+             string res = cache[start][end - 1];
+             if (res == null)
+             {
+                 // First request for this range: compute it and remember it.
+                 res = cached.Substring(start, end - start);
+                 cache[start][end - 1] = res;
+             }
+             return res.ToCharSequence();
+         }
+
+         // Returns the wrapped text instead of the default type name.
+         public override string ToString()
+         {
+             return cached;
+         }
+     }
+
+     // Wraps the string in a caching sequence backed by a Length x Length table.
+     private ICharSequence CacheSubSequence(string cached)
+     {
+         string[][] cache = Support.RectangularArrays.ReturnRectangularArray<string>(cached.Length, cached.Length);
+         return new CachedCharSequence(cache, cached);
+     }
+
+     // Wraps a snapshot of the builder's content in a caching sequence backed by a Length x Length table.
+     private ICharSequence CacheSubSequence(StringBuilder cached)
+     {
+         string[][] cache = Support.RectangularArrays.ReturnRectangularArray<string>(cached.Length, cached.Length);
+         return new CachedCharSequence(cache, cached.ToString());
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/LanguageGuessingTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/LanguageGuessingTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/LanguageGuessingTest.cs
new file mode 100644
index 0000000..d50c6f7
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/LanguageGuessingTest.cs
@@ -0,0 +1,84 @@
+using Lucene.Net.Support;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests guessLanguages API.
+ /// <para/>
+ /// since 1.6
+ /// </summary>
+ public class LanguageGuessingTest
+ {
+     // Labels stored in the third column of each row in Values.
+     private static readonly string EXACT = "exact";
+     private static readonly string ONE_OF = "one of";
+
+     // One row per case: { name, language expected among the guesses, exactness label }.
+     public static List<object[]> Values = Arrays.AsList(new object[][] {
+         new object[] { "Renault", "french", EXACT },
+         new object[] { "Mickiewicz", "polish", EXACT },
+         new object[] { "Thompson", "english", ONE_OF }, // this also hits german and greeklatin
+         new object[] { "Nu\u00f1ez", "spanish", EXACT }, // Nuñez
+         new object[] { "Carvalho", "portuguese", EXACT },
+         new object[] { "\u010capek", "czech", EXACT }, // Čapek
+         new object[] { "Sjneijder", "dutch", EXACT },
+         new object[] { "Klausewitz", "german", EXACT },
+         new object[] { "K\u00fc\u00e7\u00fck", "turkish", EXACT }, // Küçük
+         new object[] { "Giacometti", "italian", EXACT },
+         new object[] { "Nagy", "hungarian", EXACT },
+         new object[] { "Ceau\u015fescu", "romanian", EXACT }, // Ceauşescu
+         new object[] { "Angelopoulos", "greeklatin", EXACT },
+         new object[] { "\u0391\u03b3\u03b3\u03b5\u03bb\u03cc\u03c0\u03bf\u03c5\u03bb\u03bf\u03c2", "greek", EXACT }, // Αγγελόπουλος
+         new object[] { "\u041f\u0443\u0448\u043a\u0438\u043d", "cyrillic", EXACT }, // Пушкин
+         new object[] { "\u05db\u05d4\u05df", "hebrew", EXACT }, // כהן
+         new object[] { "\u00e1cz", "any", EXACT }, // ácz
+         new object[] { "\u00e1tz", "any", EXACT } // átz
+     });
+
+     // Language guesser for generic names, shared by every test case.
+     private readonly Lang lang = Lang.GetInstance(NameType.GENERIC);
+
+     // NOTE(review): presumably the Java original received name/language/exactness through
+     // a parameterized constructor; here each row of Values is passed straight to the
+     // test method instead.
+
+     /// <summary>
+     /// Guesses the languages for <paramref name="name"/> and requires
+     /// <paramref name="language"/> to be among them. The
+     /// <paramref name="exactness"/> value is accepted but not consulted.
+     /// </summary>
+     [Test]
+     [TestCaseSource("Values")]
+     public void TestLanguageGuessing(string name, string language, string exactness)
+     {
+         LanguageSet predicted = this.lang.GuessLanguages(name);
+
+         bool containsExpected = predicted.Contains(language);
+         string failureMessage =
+             "language predicted for name '" + name + "' is wrong: " + predicted + " should contain '" + language + "'";
+
+         Assert.True(containsExpected, failureMessage);
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEnginePerformanceTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEnginePerformanceTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEnginePerformanceTest.cs
new file mode 100644
index 0000000..7b8b400
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEnginePerformanceTest.cs
@@ -0,0 +1,141 @@
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /**
+ * Tests performance for {@link PhoneticEngine}.
+ * <p>
+ * See <a href="https://issues.apache.org/jira/browse/CODEC-174">[CODEC-174] Improve performance of Beider Morse
+ * encoder</a>.
+ * </p>
+ * <p>
+ * Results for November 7, 2013, project SVN revision 1539678.
+ * </p>
+ * <p>
+ * Environment:
+ * </p>
+ * <ul>
+ * <li>java version "1.7.0_45"</li>
+ * <li>Java(TM) SE Runtime Environment (build 1.7.0_45-b18)</li>
+ * <li>Java HotSpot(TM) 64-Bit Server VM (build 24.45-b08, mixed mode)</li>
+ * <li>OS name: "windows 7", version: "6.1", arch: "amd64", family: "windows")</li>
+ * </ul>
+ * <ol>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 33,039 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 32,297 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 32,857 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': <b>31,561 millis.</b></li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 32,665 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 32,215 millis.</li>
+ * </ol>
+ * <p>
+ * On this file's revision 1539678, with patch <a
+ * href="https://issues.apache.org/jira/secure/attachment/12611963/CODEC-174-change-rules-storage-to-Map.patch"
+ * >CODEC-174-change-rules-storage-to-Map</a>:
+ * </p>
+ * <ol>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 18,196 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 13,858 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 13,644 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': <b>13,591 millis.</b></li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 13,861 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 13,696 millis.</li>
+ * </ol>
+ * <p>
+ * Patch applied, committed revision 1539783.
+ * </p>
+ * <p>
+ * On this file's revision 1539783, with patch <a
+ * href="https://issues.apache.org/jira/secure/attachment/12611962/CODEC-174-delete-subsequence-cache.patch"
+ * >CODEC-174-delete-subsequence-cache.patch</a>:
+ * </p>
+ * <ol>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 13,547 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': <b>13,501 millis.</b></li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 13,528 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 17,110 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 13,910 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 16,969 millis.</li>
+ * </ol>
+ * <p>
+ * Patch not applied.
+ * </p>
+ * <p>
+ * On this file's revision 1539787, with patch <a
+ * href="https://issues.apache.org/jira/secure/attachment/12612178/CODEC-174-reuse-set-in-PhonemeBuilder.patch"
+ * >CODEC-174-reuse-set-in-PhonemeBuilder.patch</a>:
+ * </p>
+ * <ol>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 13,724 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 13,451 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 13,742 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': <b>13,186 millis.</b></li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 13,600 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 16,405 millis.</li>
+ * </ol>
+ * <p>
+ * Patch applied, committed revision 1539788.
+ * </p>
+ * <p>
+ * Before patch https://issues.apache.org/jira/secure/attachment/12613371/CODEC-174-refactor-restrictTo-method-in-SomeLanguages.patch
+ * </p>
+ * <ol>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 13,133 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 13,064 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': <b>12,838 millis.</b></li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 12,970 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 13,122 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 13,293 millis.</li>
+ * </ol>
+ * <p>
+ * After patch https://issues.apache.org/jira/secure/attachment/12613371/CODEC-174-refactor-restrictTo-method-in-SomeLanguages.patch
+ * </p>
+ * <ol>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 11,576 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 11,506 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 11,361 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': <b>11,142 millis.</b></li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 11,430 millis.</li>
+ * <li>Time for encoding 80,000 times the input 'Angelo': 11,297 millis.</li>
+ * </ol>
+ * <p>
+ * Patch applied, committed revision 1541234.
+ * </p>
+ */
+ public class PhoneticEnginePerformanceTest
+ {
+     // Number of times the input is encoded inside the timed loop.
+     private static readonly int LOOP = 80000;
+
+     /// <summary>
+     /// Encodes the same input LOOP times with a GENERIC/APPROX engine and prints the
+     /// elapsed wall-clock time in milliseconds. Nothing is asserted.
+     /// </summary>
+     [Test]
+     public void Test()
+     {
+         PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);
+         string input = "Angelo";
+         long startTicks = DateTime.UtcNow.Ticks;
+         for (int i = 0; i < LOOP; i++)
+         {
+             engine.Encode(input);
+         }
+         // DateTime.Ticks counts 100-nanosecond units; convert so the printed figure
+         // really is milliseconds, as the message says.
+         long totalMillis = (DateTime.UtcNow.Ticks - startTicks) / TimeSpan.TicksPerMillisecond;
+         Console.WriteLine(String.Format("Time for encoding {0} times the input '{1}': {2} millis.", LOOP, input, totalMillis));
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineRegressionTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineRegressionTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineRegressionTest.cs
new file mode 100644
index 0000000..cb9a40d
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineRegressionTest.cs
@@ -0,0 +1,234 @@
+using Lucene.Net.Support;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests <see cref="PhoneticEngine"/> and <see cref="LanguageSet"/> in ways very similar to code found in solr-3.6.0.
+ /// <para/>
+ /// Each test builds up an argument map step by step (nameType, then ruleType, then
+ /// languageSet) and checks the encoding produced after every step, once with
+ /// concat enabled and once with it disabled.
+ /// <para/>
+ /// since 1.7
+ /// </summary>
+ public class PhoneticEngineRegressionTest
+ {
+     // NOTE: all assertions use NUnit's (expected, actual) argument order so that a
+     // failure message reports the expected and the actual value the right way round.
+
+     /// <summary>
+     /// Checks encodings for the GENERIC name type (explicitly set in the first
+     /// section, defaulted by <c>Encode</c> in the remaining ones).
+     /// </summary>
+     [Test]
+     public void TestSolrGENERIC()
+     {
+         IDictionary<String, String> args;
+
+         // concat is true, ruleType is EXACT
+         args = new SortedDictionary<String, String>();
+         args.Put("nameType", "GENERIC");
+         Assert.AreEqual("agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo", Encode(args, true, "Angelo"));
+         args.Put("ruleType", "EXACT");
+         Assert.AreEqual("anZelo|andZelo|angelo|anhelo|anjelo|anxelo", Encode(args, true, "Angelo"));
+         Assert.AreEqual("(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)", Encode(args, true, "D'Angelo"));
+         args.Put("languageSet", "italian,greek,spanish");
+         Assert.AreEqual("andZelo|angelo|anxelo", Encode(args, true, "Angelo"));
+         Assert.AreEqual("", Encode(args, true, "1234"));
+
+         // concat is false, ruleType is EXACT
+         args = new SortedDictionary<String, String>();
+         Assert.AreEqual("agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo", Encode(args, false, "Angelo"));
+         args.Put("ruleType", "EXACT");
+         Assert.AreEqual("anZelo|andZelo|angelo|anhelo|anjelo|anxelo", Encode(args, false, "Angelo"));
+         Assert.AreEqual("(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)", Encode(args, false, "D'Angelo"));
+         args.Put("languageSet", "italian,greek,spanish");
+         Assert.AreEqual("andZelo|angelo|anxelo", Encode(args, false, "Angelo"));
+         Assert.AreEqual("", Encode(args, false, "1234"));
+
+         // concat is true, ruleType is APPROX
+         args = new SortedDictionary<String, String>();
+         Assert.AreEqual("agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo", Encode(args, true, "Angelo"));
+         args.Put("ruleType", "APPROX");
+         Assert.AreEqual("agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo", Encode(args, true, "Angelo"));
+         Assert.AreEqual("(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)", Encode(args, true, "D'Angelo"));
+         args.Put("languageSet", "italian,greek,spanish");
+         Assert.AreEqual("angilo|anxilo|anzilo|ongilo|onxilo|onzilo", Encode(args, true, "Angelo"));
+         Assert.AreEqual("", Encode(args, true, "1234"));
+
+         // concat is false, ruleType is APPROX
+         args = new SortedDictionary<String, String>();
+         Assert.AreEqual("agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo", Encode(args, false, "Angelo"));
+         args.Put("ruleType", "APPROX");
+         Assert.AreEqual("agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo", Encode(args, false, "Angelo"));
+         Assert.AreEqual("(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)", Encode(args, false, "D'Angelo"));
+         args.Put("languageSet", "italian,greek,spanish");
+         Assert.AreEqual("angilo|anxilo|anzilo|ongilo|onxilo|onzilo", Encode(args, false, "Angelo"));
+         Assert.AreEqual("", Encode(args, false, "1234"));
+     }
+
+     /// <summary>
+     /// Checks encodings for the ASHKENAZI name type.
+     /// </summary>
+     [Test]
+     public void TestSolrASHKENAZI()
+     {
+         IDictionary<String, String> args;
+
+         // concat is true, ruleType is EXACT
+         args = new SortedDictionary<String, String>();
+         args.Put("nameType", "ASHKENAZI");
+         Assert.AreEqual("AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO", Encode(args, true, "Angelo"));
+         args.Put("ruleType", "EXACT");
+         Assert.AreEqual("andZelo|angelo|anhelo|anxelo", Encode(args, true, "Angelo"));
+         Assert.AreEqual("dandZelo|dangelo|danhelo|danxelo", Encode(args, true, "D'Angelo"));
+         args.Put("languageSet", "italian,greek,spanish");
+         Assert.AreEqual("angelo|anxelo", Encode(args, true, "Angelo"));
+         Assert.AreEqual("", Encode(args, true, "1234"));
+
+         // concat is false, ruleType is EXACT
+         args = new SortedDictionary<String, String>();
+         args.Put("nameType", "ASHKENAZI");
+         Assert.AreEqual("AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO", Encode(args, false, "Angelo"));
+         args.Put("ruleType", "EXACT");
+         Assert.AreEqual("andZelo|angelo|anhelo|anxelo", Encode(args, false, "Angelo"));
+         Assert.AreEqual("dandZelo|dangelo|danhelo|danxelo", Encode(args, false, "D'Angelo"));
+         args.Put("languageSet", "italian,greek,spanish");
+         Assert.AreEqual("angelo|anxelo", Encode(args, false, "Angelo"));
+         Assert.AreEqual("", Encode(args, false, "1234"));
+
+         // concat is true, ruleType is APPROX
+         args = new SortedDictionary<String, String>();
+         args.Put("nameType", "ASHKENAZI");
+         Assert.AreEqual("AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO", Encode(args, true, "Angelo"));
+         args.Put("ruleType", "APPROX");
+         Assert.AreEqual("AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO", Encode(args, true, "Angelo"));
+         Assert.AreEqual("dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO", Encode(args, true, "D'Angelo"));
+         args.Put("languageSet", "italian,greek,spanish");
+         Assert.AreEqual("AnSelO|AngElO|AngzelO|AnkselO", Encode(args, true, "Angelo"));
+         Assert.AreEqual("", Encode(args, true, "1234"));
+
+         // concat is false, ruleType is APPROX
+         args = new SortedDictionary<String, String>();
+         args.Put("nameType", "ASHKENAZI");
+         Assert.AreEqual("AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO", Encode(args, false, "Angelo"));
+         args.Put("ruleType", "APPROX");
+         Assert.AreEqual("AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO", Encode(args, false, "Angelo"));
+         Assert.AreEqual("dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO", Encode(args, false, "D'Angelo"));
+         args.Put("languageSet", "italian,greek,spanish");
+         Assert.AreEqual("AnSelO|AngElO|AngzelO|AnkselO", Encode(args, false, "Angelo"));
+         Assert.AreEqual("", Encode(args, false, "1234"));
+     }
+
+     /// <summary>
+     /// Checks encodings for the SEPHARDIC name type.
+     /// </summary>
+     [Test]
+     public void TestSolrSEPHARDIC()
+     {
+         IDictionary<String, String> args;
+
+         // concat is true, ruleType is EXACT
+         args = new SortedDictionary<String, String>();
+         args.Put("nameType", "SEPHARDIC");
+         Assert.AreEqual("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", Encode(args, true, "Angelo"));
+         args.Put("ruleType", "EXACT");
+         Assert.AreEqual("anZelo|andZelo|anxelo", Encode(args, true, "Angelo"));
+         Assert.AreEqual("anZelo|andZelo|anxelo", Encode(args, true, "D'Angelo"));
+         args.Put("languageSet", "italian,greek,spanish");
+         Assert.AreEqual("andZelo|anxelo", Encode(args, true, "Angelo"));
+         Assert.AreEqual("", Encode(args, true, "1234"));
+
+         // concat is false, ruleType is EXACT
+         args = new SortedDictionary<String, String>();
+         args.Put("nameType", "SEPHARDIC");
+         Assert.AreEqual("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", Encode(args, false, "Angelo"));
+         args.Put("ruleType", "EXACT");
+         Assert.AreEqual("anZelo|andZelo|anxelo", Encode(args, false, "Angelo"));
+         Assert.AreEqual("danZelo|dandZelo|danxelo", Encode(args, false, "D'Angelo"));
+         args.Put("languageSet", "italian,greek,spanish");
+         Assert.AreEqual("andZelo|anxelo", Encode(args, false, "Angelo"));
+         Assert.AreEqual("", Encode(args, false, "1234"));
+
+         // concat is true, ruleType is APPROX
+         args = new SortedDictionary<String, String>();
+         args.Put("nameType", "SEPHARDIC");
+         Assert.AreEqual("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", Encode(args, true, "Angelo"));
+         args.Put("ruleType", "APPROX");
+         Assert.AreEqual("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", Encode(args, true, "Angelo"));
+         Assert.AreEqual("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", Encode(args, true, "D'Angelo"));
+         args.Put("languageSet", "italian,greek,spanish");
+         Assert.AreEqual("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", Encode(args, true, "Angelo"));
+         Assert.AreEqual("", Encode(args, true, "1234"));
+
+         // concat is false, ruleType is APPROX
+         args = new SortedDictionary<String, String>();
+         args.Put("nameType", "SEPHARDIC");
+         Assert.AreEqual("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", Encode(args, false, "Angelo"));
+         args.Put("ruleType", "APPROX");
+         Assert.AreEqual("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", Encode(args, false, "Angelo"));
+         Assert.AreEqual("danhila|danhilu|danzila|danzilu|nhila|nhilu|nzila|nzilu", Encode(args, false, "D'Angelo"));
+         args.Put("languageSet", "italian,greek,spanish");
+         Assert.AreEqual("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", Encode(args, false, "Angelo"));
+         Assert.AreEqual("", Encode(args, false, "1234"));
+     }
+
+     /// <summary>
+     /// Encodes <paramref name="input"/> with an engine configured from <paramref name="args"/>.
+     /// <para/>
+     /// This code is similar in style to code found in Solr:
+     /// solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java.
+     /// It was made into a unit test to protect Solr from possible future
+     /// regressions in Commons-Codec.
+     /// </summary>
+     /// <param name="args">Optional settings: "nameType" (default GENERIC), "ruleType"
+     /// (default APPROX) and "languageSet" (a comma-separated list; missing or "auto"
+     /// means no explicit language set is passed to the engine).</param>
+     /// <param name="concat">Passed through as the engine's concat setting.</param>
+     /// <param name="input">The text to encode.</param>
+     /// <returns>The encoding returned by the engine.</returns>
+     private static String Encode(IDictionary<String, String> args, bool concat, String input)
+     {
+         // PhoneticEngine = NameType + RuleType + concat
+         // we use common-codec's defaults: GENERIC + APPROX + true
+         String nameTypeArg;
+         args.TryGetValue("nameType", out nameTypeArg);
+         NameType nameType = (nameTypeArg == null) ? NameType.GENERIC : (NameType)Enum.Parse(typeof(NameType), nameTypeArg, true);
+
+         String ruleTypeArg;
+         args.TryGetValue("ruleType", out ruleTypeArg);
+         RuleType ruleType = (ruleTypeArg == null) ? RuleType.APPROX : (RuleType)Enum.Parse(typeof(RuleType), ruleTypeArg, true);
+
+         PhoneticEngine engine = new PhoneticEngine(nameType, ruleType, concat);
+
+         // LanguageSet: defaults to automagic, otherwise a comma-separated list.
+         String languageSetArg;
+         args.TryGetValue("languageSet", out languageSetArg);
+         LanguageSet languageSet = null;
+         if (languageSetArg != null && languageSetArg != "auto")
+         {
+             // The split array can seed the set directly; no list wrapper is needed.
+             languageSet = LanguageSet.From(new HashSet<String>(languageSetArg.Split(new string[] { "," }, StringSplitOptions.RemoveEmptyEntries)));
+         }
+
+         // org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (lines 96-98)
+         // picks the overload the same way: without languages when none were given,
+         // with the explicit language set otherwise.
+         return (languageSet == null)
+             ? engine.Encode(input)
+             : engine.Encode(input, languageSet);
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineTest.cs
new file mode 100644
index 0000000..281fc45
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineTest.cs
@@ -0,0 +1,89 @@
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ public class PhoneticEngineTest
+ {
+     // Default upper bound on the number of phonemes used by most test rows.
+     private static readonly int TEN = 10;
+
+     // One row per test case, in the parameter order of TestEncode:
+     // name, expected phonetic text, name type, rule type, concat, maxPhonemes.
+     public static List<Object[]> Values = new List<object[]>
+     {
+         new Object[] { "Renault", "rinD|rinDlt|rina|rinalt|rino|rinolt|rinu|rinult", NameType.GENERIC, RuleType.APPROX, true, TEN },
+         new Object[] { "Renault", "rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult", NameType.ASHKENAZI, RuleType.APPROX, true, TEN },
+         new Object[] { "Renault", "rYnDlt", NameType.ASHKENAZI, RuleType.APPROX, true, 1 },
+         new Object[] { "Renault", "rinDlt", NameType.SEPHARDIC, RuleType.APPROX, true, TEN },
+         new Object[] { "SntJohn-Smith", "sntjonsmit", NameType.GENERIC, RuleType.EXACT, true, TEN },
+         new Object[] { "d'ortley", "(ortlaj|ortlej)-(dortlaj|dortlej)", NameType.GENERIC, RuleType.EXACT, true, TEN },
+         new Object[] { "van helsing", "(elSink|elsink|helSink|helsink|helzink|xelsink)-(banhelsink|fanhelsink|fanhelzink|vanhelsink|vanhelzink|vanjelsink)", NameType.GENERIC, RuleType.EXACT, false, TEN }
+     };
+
+     // No per-instance fields or constructor are required here: each row of Values
+     // is handed to TestEncode as its arguments via TestCaseSource.
+
+     /// <summary>
+     /// Encodes <paramref name="name"/> with an engine built from the remaining
+     /// arguments, checks the result against <paramref name="phoneticExpected"/>, and
+     /// verifies that no '|'-separated group holds more than
+     /// <paramref name="maxPhonemes"/> entries.
+     /// </summary>
+     [Test]//@Test(timeout = 10000L)
+     [TestCaseSource("Values")]
+     public void TestEncode(String name, String phoneticExpected, NameType nameType,
+         RuleType ruleType, bool concat, int maxPhonemes)
+     {
+         PhoneticEngine engine = new PhoneticEngine(nameType, ruleType, concat, maxPhonemes);
+         String phoneticActual = engine.Encode(name);
+
+         Assert.AreEqual(phoneticExpected, phoneticActual, "phoneme incorrect");
+
+         // With concat the whole result is treated as a single group; otherwise
+         // every '-'-separated word is checked on its own.
+         String[] groups = concat
+             ? new String[] { phoneticActual }
+             : phoneticActual.Split(new string[] { "-" }, StringSplitOptions.RemoveEmptyEntries);
+
+         foreach (String group in groups)
+         {
+             int alternatives = group.Split('|').Length;
+             Assert.True(alternatives <= maxPhonemes);
+         }
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/RuleTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/RuleTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/RuleTest.cs
new file mode 100644
index 0000000..fd2e8a2
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/RuleTest.cs
@@ -0,0 +1,163 @@
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests Rule, exercised through <see cref="Phoneme"/> ordering and phoneme text access.
+ /// <para/>
+ /// since 1.6
+ /// </summary>
+ public class RuleTest
+ {
+     // The custom "value should be negative" matcher class of the Java test
+     // (NegativeIntegerBaseMatcher) is not ported; negativity is asserted directly.
+
+     /// <summary>
+     /// Builds two rows of phonemes, each created with <c>Languages.NO_LANGUAGES</c>,
+     /// from word lists that the ordering test expects to be in ascending order.
+     /// </summary>
+     private Phoneme[][] MakePhonemes()
+     {
+         String[][] wordRows =
+         {
+             new string[] { "rinD", "rinDlt", "rina", "rinalt", "rino", "rinolt", "rinu", "rinult" },
+             new string[] { "dortlaj", "dortlej", "ortlaj", "ortlej", "ortlej-dortlaj" }
+         };
+
+         Phoneme[][] result = new Phoneme[wordRows.Length][];
+         for (int row = 0; row < wordRows.Length; row++)
+         {
+             String[] rowWords = wordRows[row];
+             Phoneme[] rowPhonemes = new Phoneme[rowWords.Length];
+             for (int col = 0; col < rowWords.Length; col++)
+             {
+                 rowPhonemes[col] = new Phoneme(rowWords[col], Languages.NO_LANGUAGES);
+             }
+             result[row] = rowPhonemes;
+         }
+
+         return result;
+     }
+
+     /// <summary>
+     /// Every phoneme must compare as less than each phoneme that follows it in its row.
+     /// </summary>
+     [Test]
+     public void TestPhonemeComparedToLaterIsNegative()
+     {
+         foreach (Phoneme[] row in MakePhonemes())
+         {
+             for (int earlier = 0; earlier < row.Length; earlier++)
+             {
+                 for (int later = earlier + 1; later < row.Length; later++)
+                 {
+                     int comparison = Phoneme.COMPARER.Compare(row[earlier], row[later]);
+
+                     Assert.True(comparison < 0,
+                         "Comparing " + row[earlier].GetPhonemeText() + " to " + row[later].GetPhonemeText() + " should be negative");
+                 }
+             }
+         }
+     }
+
+     /// <summary>
+     /// Every phoneme must compare as equal to itself.
+     /// </summary>
+     [Test]
+     public void TestPhonemeComparedToSelfIsZero()
+     {
+         foreach (Phoneme[] row in MakePhonemes())
+         {
+             foreach (Phoneme phoneme in row)
+             {
+                 int comparison = Phoneme.COMPARER.Compare(phoneme, phoneme);
+                 Assert.AreEqual(0,
+                     comparison,
+                     "Phoneme compared to itself should be zero: " + phoneme.GetPhonemeText());
+             }
+         }
+     }
+
+     /// <summary>
+     /// Checks character access and sub-sequences of the text of single and joined phonemes.
+     /// </summary>
+     [Test]
+     public void TestSubSequenceWorks()
+     {
+         // AppendableCharSequence is private to Rule. We can only make it through a Phoneme.
+
+         Phoneme a = new Phoneme("a", null);
+         Phoneme b = new Phoneme("b", null);
+         Phoneme cd = new Phoneme("cd", null);
+         Phoneme ef = new Phoneme("ef", null);
+         Phoneme ghi = new Phoneme("ghi", null);
+         Phoneme jkl = new Phoneme("jkl", null);
+
+         AssertCharsMatch("a", a);
+         AssertCharsMatch("b", b);
+         AssertCharsMatch("cd", cd);
+         AssertCharsMatch("ef", ef);
+         AssertCharsMatch("ghi", ghi);
+         AssertCharsMatch("jkl", jkl);
+
+         Phoneme a_b = new Phoneme(a, b);
+         AssertCharsMatch("ab", a_b);
+         AssertSubSequence("ab", a_b, 0, 2);
+         AssertSubSequence("a", a_b, 0, 1);
+         AssertSubSequence("b", a_b, 1, 2);
+
+         Phoneme cd_ef = new Phoneme(cd, ef);
+         AssertCharsMatch("cdef", cd_ef);
+         AssertSubSequence("c", cd_ef, 0, 1);
+         AssertSubSequence("d", cd_ef, 1, 2);
+         AssertSubSequence("e", cd_ef, 2, 3);
+         AssertSubSequence("f", cd_ef, 3, 4);
+         AssertSubSequence("cd", cd_ef, 0, 2);
+         AssertSubSequence("de", cd_ef, 1, 3);
+         AssertSubSequence("ef", cd_ef, 2, 4);
+         AssertSubSequence("cde", cd_ef, 0, 3);
+         AssertSubSequence("def", cd_ef, 1, 4);
+         AssertSubSequence("cdef", cd_ef, 0, 4);
+
+         Phoneme a_b_cd = new Phoneme(new Phoneme(a, b), cd);
+         AssertCharsMatch("abcd", a_b_cd);
+         AssertSubSequence("a", a_b_cd, 0, 1);
+         AssertSubSequence("b", a_b_cd, 1, 2);
+         AssertSubSequence("c", a_b_cd, 2, 3);
+         AssertSubSequence("d", a_b_cd, 3, 4);
+         AssertSubSequence("ab", a_b_cd, 0, 2);
+         AssertSubSequence("bc", a_b_cd, 1, 3);
+         AssertSubSequence("cd", a_b_cd, 2, 4);
+         AssertSubSequence("abc", a_b_cd, 0, 3);
+         AssertSubSequence("bcd", a_b_cd, 1, 4);
+         AssertSubSequence("abcd", a_b_cd, 0, 4);
+     }
+
+     // Asserts that the phoneme text holds the characters of 'expected', position by position.
+     private static void AssertCharsMatch(String expected, Phoneme phoneme)
+     {
+         for (int i = 0; i < expected.Length; i++)
+         {
+             Assert.AreEqual(expected[i], phoneme.GetPhonemeText()[i]);
+         }
+     }
+
+     // Asserts that the phoneme text between 'start' (inclusive) and 'end' (exclusive) equals 'expected'.
+     private static void AssertSubSequence(String expected, Phoneme phoneme, int start, int end)
+     {
+         Assert.AreEqual(expected, phoneme.GetPhonemeText().Substring(start, end - start).toString());
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone1Test.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone1Test.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone1Test.cs
new file mode 100644
index 0000000..9112ed4
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone1Test.cs
@@ -0,0 +1,109 @@
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests Caverphone1.
+ /// </summary>
+ public class Caverphone1Test : StringEncoderAbstractTest<Caverphone1>
+ {
+     /// <summary>
+     /// Supplies the encoder instance under test.
+     /// </summary>
+     protected override Caverphone1 CreateStringEncoder()
+     {
+         Caverphone1 encoder = new Caverphone1();
+         return encoder;
+     }
+
+     /// <summary>
+     /// Tests example adapted from version 2.0 http://caversham.otago.ac.nz/files/working/ctp150804.pdf
+     /// <para/>
+     /// AT1111 words: add, aid, at, art, eat, earth, head, hit, hot, hold, hard, heart, it, out, old
+     /// </summary>
+     [Test]
+     public void TestCaverphoneRevisitedCommonCodeAT1111()
+     {
+         String[] words =
+         {
+             "add", "aid", "at", "art", "eat",
+             "earth", "head", "hit", "hot", "hold",
+             "hard", "heart", "it", "out", "old"
+         };
+         CheckEncodingVariations("AT1111", words);
+     }
+
+     /// <summary>
+     /// Checks the encodings of "mb" and "mbmb".
+     /// </summary>
+     [Test]
+     public void TestEndMb()
+     {
+         CheckEncodings(new String[][]
+         {
+             new string[] { "mb", "M11111" },
+             new string[] { "mbmb", "MPM111" }
+         });
+     }
+
+     /// <summary>
+     /// Tests some examples from version 2.0 http://caversham.otago.ac.nz/files/working/ctp150804.pdf
+     /// </summary>
+     [Test]
+     public void TestIsCaverphoneEquals()
+     {
+         Caverphone1 encoder = new Caverphone1();
+
+         bool peterVsStevenson = encoder.IsEncodeEqual("Peter", "Stevenson");
+         Assert.False(peterVsStevenson, "Caverphone encodings should not be equal");
+
+         bool peterVsPeady = encoder.IsEncodeEqual("Peter", "Peady");
+         Assert.True(peterVsPeady, "Caverphone encodings should be equal");
+     }
+
+     /// <summary>
+     /// Tests example from http://caversham.otago.ac.nz/files/working/ctp060902.pdf
+     /// </summary>
+     [Test]
+     public void TestSpecificationV1Examples()
+     {
+         CheckEncodings(new String[][]
+         {
+             new string[] { "David", "TFT111" },
+             new string[] { "Whittle", "WTL111" }
+         });
+     }
+
+     /// <summary>
+     /// Tests examples from http://en.wikipedia.org/wiki/Caverphone
+     /// </summary>
+     [Test]
+     public void TestWikipediaExamples()
+     {
+         CheckEncodings(new String[][]
+         {
+             new string[] { "Lee", "L11111" },
+             new string[] { "Thompson", "TMPSN1" }
+         });
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone2Test .cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone2Test .cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone2Test .cs
new file mode 100644
index 0000000..4ec1daa
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone2Test .cs
@@ -0,0 +1,375 @@
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests Caverphone2.
+ /// </summary>
+ public class Caverphone2Test : StringEncoderAbstractTest<Caverphone2>
+ {
+     /// <summary>
+     /// Supplies the encoder instance under test.
+     /// </summary>
+     protected override Caverphone2 CreateStringEncoder()
+     {
+         Caverphone2 encoder = new Caverphone2();
+         return encoder;
+     }
+
+     /// <summary>
+     /// See http://caversham.otago.ac.nz/files/working/ctp150804.pdf
+     /// <para/>
+     /// AT11111111 words: add, aid, at, art, eat, earth, head, hit, hot, hold, hard, heart, it, out, old
+     /// </summary>
+     [Test]
+     public void TestCaverphoneRevisitedCommonCodeAT11111111()
+     {
+         String[] words =
+         {
+             "add", "aid", "at", "art", "eat",
+             "earth", "head", "hit", "hot", "hold",
+             "hard", "heart", "it", "out", "old"
+         };
+         CheckEncodingVariations("AT11111111", words);
+     }
+
+     /// <summary>
+     /// See http://caversham.otago.ac.nz/files/working/ctp150804.pdf
+     /// </summary>
+     [Test]
+     public void TestCaverphoneRevisitedExamples()
+     {
+         CheckEncodings(new String[][]
+         {
+             new string[] { "Stevenson", "STFNSN1111" },
+             new string[] { "Peter", "PTA1111111" }
+         });
+     }
+
+     /// <summary>
+     /// See http://caversham.otago.ac.nz/files/working/ctp150804.pdf
+     /// </summary>
+     [Test]
+     public void TestCaverphoneRevisitedRandomNameKLN1111111()
+     {
+         String[] names =
+         {
+             "Cailean", "Calan", "Calen", "Callahan", "Callan", "Callean",
+             "Carleen", "Carlen", "Carlene", "Carlin", "Carline", "Carlyn",
+             "Carlynn", "Carlynne", "Charlean", "Charleen", "Charlene", "Charline",
+             "Cherlyn", "Chirlin", "Clein", "Cleon", "Cline", "Cohleen",
+             "Colan", "Coleen", "Colene", "Colin", "Colleen", "Collen",
+             "Collin", "Colline", "Colon", "Cullan", "Cullen", "Cullin",
+             "Gaelan", "Galan", "Galen", "Garlan", "Garlen", "Gaulin",
+             "Gayleen", "Gaylene", "Giliane", "Gillan", "Gillian", "Glen",
+             "Glenn", "Glyn", "Glynn", "Gollin", "Gorlin", "Kalin",
+             "Karlan", "Karleen", "Karlen", "Karlene", "Karlin", "Karlyn",
+             "Kaylyn", "Keelin", "Kellen", "Kellene", "Kellyann", "Kellyn",
+             "Khalin", "Kilan", "Kilian", "Killen", "Killian", "Killion",
+             "Klein", "Kleon", "Kline", "Koerlin", "Kylen", "Kylynn",
+             "Quillan", "Quillon", "Qulllon", "Xylon"
+         };
+         CheckEncodingVariations("KLN1111111", names);
+     }
+
+     /// <summary>
+     /// See http://caversham.otago.ac.nz/files/working/ctp150804.pdf
+     /// </summary>
+     [Test]
+     public void TestCaverphoneRevisitedRandomNameTN11111111()
+     {
+         String[] names =
+         {
+             "Dan", "Dane", "Dann", "Darn", "Daune", "Dawn",
+             "Ddene", "Dean", "Deane", "Deanne", "DeeAnn", "Deeann",
+             "Deeanne", "Deeyn", "Den", "Dene", "Denn", "Deonne",
+             "Diahann", "Dian", "Diane", "Diann", "Dianne", "Diannne",
+             "Dine", "Dion", "Dione", "Dionne", "Doane", "Doehne",
+             "Don", "Donn", "Doone", "Dorn", "Down", "Downe",
+             "Duane", "Dun", "Dunn", "Duyne", "Dyan", "Dyane",
+             "Dyann", "Dyanne", "Dyun", "Tan", "Tann", "Teahan",
+             "Ten", "Tenn", "Terhune", "Thain", "Thaine", "Thane",
+             "Thanh", "Thayne", "Theone", "Thin", "Thorn", "Thorne",
+             "Thun", "Thynne", "Tien", "Tine", "Tjon", "Town",
+             "Towne", "Turne", "Tyne"
+         };
+         CheckEncodingVariations("TN11111111", names);
+     }
+
+     /// <summary>
+     /// See http://caversham.otago.ac.nz/files/working/ctp150804.pdf
+     /// </summary>
+     [Test]
+     public void TestCaverphoneRevisitedRandomNameTTA1111111()
+     {
+         String[] names =
+         {
+             "Darda", "Datha", "Dedie", "Deedee", "Deerdre", "Deidre",
+             "Deirdre", "Detta", "Didi", "Didier", "Dido", "Dierdre",
+             "Dieter", "Dita", "Ditter", "Dodi", "Dodie", "Dody",
+             "Doherty", "Dorthea", "Dorthy", "Doti", "Dotti", "Dottie",
+             "Dotty", "Doty", "Doughty", "Douty", "Dowdell", "Duthie",
+             "Tada", "Taddeo", "Tadeo", "Tadio", "Tati", "Teador",
+             "Tedda", "Tedder", "Teddi", "Teddie", "Teddy", "Tedi",
+             "Tedie", "Teeter", "Teodoor", "Teodor", "Terti", "Theda",
+             "Theodor", "Theodore", "Theta", "Thilda", "Thordia", "Tilda",
+             "Tildi", "Tildie", "Tildy", "Tita", "Tito", "Tjader",
+             "Toddie", "Toddy", "Torto", "Tuddor", "Tudor", "Turtle",
+             "Tuttle", "Tutto"
+         };
+         CheckEncodingVariations("TTA1111111", names);
+     }
+
+     /// <summary>
+     /// See http://caversham.otago.ac.nz/files/working/ctp150804.pdf
+     /// </summary>
+     [Test]
+     public void TestCaverphoneRevisitedRandomWords()
+     {
+         String[] rtaWords = { "rather", "ready", "writer" };
+         CheckEncodingVariations("RTA1111111", rtaWords);
+
+         CheckEncoding("SSA1111111", "social");
+
+         String[] apaWords = { "able", "appear" };
+         CheckEncodingVariations("APA1111111", apaWords);
+     }
+
+     /// <summary>
+     /// Checks the encodings of "mb" and "mbmb".
+     /// </summary>
+     [Test]
+     public void TestEndMb()
+     {
+         CheckEncodings(new String[][]
+         {
+             new string[] { "mb", "M111111111" },
+             new string[] { "mbmb", "MPM1111111" }
+         });
+     }
+
+     /// <summary>
+     /// Caverphone Revisited: checks one unequal and one equal pair of encodings.
+     /// </summary>
+     [Test]
+     public void TestIsCaverphoneEquals()
+     {
+         Caverphone2 encoder = new Caverphone2();
+
+         bool peterVsStevenson = encoder.IsEncodeEqual("Peter", "Stevenson");
+         Assert.False(peterVsStevenson, "Caverphone encodings should not be equal");
+
+         bool peterVsPeady = encoder.IsEncodeEqual("Peter", "Peady");
+         Assert.True(peterVsPeady, "Caverphone encodings should be equal");
+     }
+
+     /// <summary>
+     /// Checks a table of input words against their expected codes.
+     /// </summary>
+     [Test]
+     public void TestSpecificationExamples()
+     {
+         CheckEncodings(new String[][]
+         {
+             new string[] { "Peter", "PTA1111111" },
+             new string[] { "ready", "RTA1111111" },
+             new string[] { "social", "SSA1111111" },
+             new string[] { "able", "APA1111111" },
+             new string[] { "Tedder", "TTA1111111" },
+             new string[] { "Karleen", "KLN1111111" },
+             new string[] { "Dyun", "TN11111111" }
+         });
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/ColognePhoneticTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/ColognePhoneticTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/ColognePhoneticTest.cs
new file mode 100644
index 0000000..46b14ff
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/ColognePhoneticTest.cs
@@ -0,0 +1,171 @@
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>Tests the <see cref="ColognePhonetic"/> class.</summary>
+ public class ColognePhoneticTest : StringEncoderAbstractTest<ColognePhonetic>
+ {
+     protected override ColognePhonetic CreateStringEncoder()
+     {
+         return new ColognePhonetic(); // the encoder type under test
+     }
+
+     /// <summary>"Aabjoe" must encode to "01".</summary>
+     [Test]
+     public void TestAabjoe()
+     {
+         this.CheckEncoding("01", "Aabjoe");
+     }
+
+     /// <summary>"Aaclan" must encode to "0856".</summary>
+     [Test]
+     public void TestAaclan()
+     {
+         this.CheckEncoding("0856", "Aaclan");
+     }
+
+     /// <summary>Tests [CODEC-122]: "Aychlmajr" must encode to "04567".</summary>
+     [Test]
+     public void TestAychlmajrForCodec122()
+     {
+         this.CheckEncoding("04567", "Aychlmajr");
+     }
+
+     /// <summary>Vowels, umlauts and other one- to three-letter inputs, as { input, expected code } rows.</summary>
+     [Test]
+     public void TestEdgeCases()
+     {
+         string[][] data = {
+             new string[] { "a", "0" },
+             new string[] { "e", "0" },
+             new string[] { "i", "0" },
+             new string[] { "o", "0" },
+             new string[] { "u", "0" },
+             new string[] { "\u00E4", "0" }, // a-umlaut
+             new string[] { "\u00F6", "0" }, // o-umlaut
+             new string[] { "\u00FC", "0" }, // u-umlaut
+             new string[] { "aa", "0" },
+             new string[] { "ha", "0" },
+             new string[] { "h", "" },
+             new string[] { "aha", "0" },
+             new string[] { "b", "1" },
+             new string[] { "p", "1" },
+             new string[] { "ph", "3" },
+             new string[] { "f", "3" },
+             new string[] { "v", "3" },
+             new string[] { "w", "3" },
+             new string[] { "g", "4" },
+             new string[] { "k", "4" },
+             new string[] { "q", "4" },
+             new string[] { "x", "48" },
+             new string[] { "ax", "048" },
+             new string[] { "cx", "48" },
+             new string[] { "l", "5" },
+             new string[] { "cl", "45" },
+             new string[] { "acl", "085" },
+             new string[] { "mn", "6" },
+             new string[] { "r", "7" }
+         };
+         this.CheckEncodings(data);
+     }
+
+     /// <summary>Whole-word examples, as { input, expected code } rows.</summary>
+     [Test]
+     public void TestExamples()
+     {
+         string[][] data = {
+             new string[] { "m\u00DCller", "657" }, // mÜller - why upper case U-umlaut?
+             new string[] { "schmidt", "862" },
+             new string[] { "schneider", "8627" },
+             new string[] { "fischer", "387" },
+             new string[] { "weber", "317" },
+             new string[] { "wagner", "3467" },
+             new string[] { "becker", "147" },
+             new string[] { "hoffmann", "0366" },
+             new string[] { "sch\u00C4fer", "837" }, // schÄfer - why upper case A-umlaut ?
+             new string[] { "Breschnew", "17863" },
+             new string[] { "Wikipedia", "3412" },
+             new string[] { "peter", "127" },
+             new string[] { "pharma", "376" },
+             new string[] { "m\u00f6nchengladbach", "664645214" }, // mönchengladbach
+             new string[] { "deutsch", "28" },
+             new string[] { "deutz", "28" },
+             new string[] { "hamburg", "06174" },
+             new string[] { "hannover", "0637" },
+             new string[] { "christstollen", "478256" },
+             new string[] { "Xanthippe", "48621" },
+             new string[] { "Zacharias", "8478" },
+             new string[] { "Holzbau", "0581" },
+             new string[] { "matsch", "68" },
+             new string[] { "matz", "68" },
+             new string[] { "Arbeitsamt", "071862" },
+             new string[] { "Eberhard", "01772" },
+             new string[] { "Eberhardt", "01772" },
+             new string[] { "heithabu", "021" }
+         };
+         this.CheckEncodings(data);
+     }
+
+     /// <summary>Hyphenated inputs, as { input, expected code } rows.</summary>
+     [Test]
+     public void TestHyphen()
+     {
+         string[][] data = {
+             new string[] { "bergisch-gladbach", "174845214" },
+             new string[] { "M\u00fcller-L\u00fcdenscheidt", "65752682" } // Müller-Lüdenscheidt
+         };
+         this.CheckEncodings(data);
+     }
+
+     /// <summary>Every pair below must be reported as equal by <c>IsEncodeEqual</c>; a failure names the offending pair.</summary>
+     [Test]
+     public void TestIsEncodeEquals()
+     {
+         string[][] data = {
+             new string[] { "Muller", "M\u00fcller" }, // Müller ("Meyer" would not do here: it encodes to 67, Müller to 657)
+             new string[] { "Meyer", "Mayr" },
+             new string[] { "house", "house" },
+             new string[] { "House", "house" },
+             new string[] { "Haus", "house" },
+             new string[] { "ganz", "Gans" },
+             new string[] { "ganz", "G\u00e4nse" }, // Gänse
+             new string[] { "Miyagi", "Miyako" } };
+         foreach (string[] element in data)
+         {
+             Assert.True(this.StringEncoder.IsEncodeEqual(element[1], element[0]), element[1] + " != " + element[0]);
+         }
+     }
+
+     /// <summary>Spelling variants that must all encode to "65".</summary>
+     [Test]
+     public void TestVariationsMella()
+     {
+         this.CheckEncodingVariations("65", new string[] { "mella", "milah", "moulla", "mellah", "muehle", "mule" });
+     }
+
+     /// <summary>Spelling variants that must all encode to "67".</summary>
+     [Test]
+     public void TestVariationsMeyer()
+     {
+         this.CheckEncodingVariations("67", new string[] { "Meier", "Maier", "Mair", "Meyer", "Meyr", "Mejer", "Major" });
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/DaitchMokotoffSoundexTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/DaitchMokotoffSoundexTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/DaitchMokotoffSoundexTest.cs
new file mode 100644
index 0000000..84bb5d3
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/DaitchMokotoffSoundexTest.cs
@@ -0,0 +1,176 @@
+// commons-codec version compatibility level: 1.10
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Exercises <see cref="DaitchMokotoffSoundex"/> through its <c>GetSoundex</c> and <c>Encode</c> methods.
+ /// <para/>
+ /// since 1.10
+ /// </summary>
+ public class DaitchMokotoffSoundexTest : StringEncoderAbstractTest<DaitchMokotoffSoundex>
+ {
+     /// <summary>Supplies the encoder under test to the base test class.</summary>
+     protected override DaitchMokotoffSoundex CreateStringEncoder()
+     {
+         return new DaitchMokotoffSoundex();
+     }
+
+     /// <summary>Shorthand for <c>StringEncoder.GetSoundex</c>.</summary>
+     private string GetSoundex(string source)
+     {
+         return this.StringEncoder.GetSoundex(source);
+     }
+
+     /// <summary>Shorthand for <c>StringEncoder.Encode</c>.</summary>
+     private string Encode(string source)
+     {
+         return this.StringEncoder.Encode(source);
+     }
+
+     /// <summary>An accented spelling must produce the same code as its unaccented counterpart.</summary>
+     [Test]
+     public void TestAccentedCharacterFolding()
+     {
+         Assert.That(GetSoundex("Straßburg"), Is.EqualTo("294795"));
+         Assert.That(GetSoundex("Strasburg"), Is.EqualTo("294795"));
+
+         Assert.That(GetSoundex("Éregon"), Is.EqualTo("095600"));
+         Assert.That(GetSoundex("Eregon"), Is.EqualTo("095600"));
+     }
+
+     /// <summary>Pins the codes of two names; the diagrams in the body contrast a wrong coding with the correct one.</summary>
+     [Test]
+     public void TestAdjacentCodes()
+     {
+         // AKSSOL, split as
+         //   A-KS-S-O-L
+         //   0-54-4---8 -> wrong
+         //   0-54-----8 -> correct
+         Assert.That(GetSoundex("AKSSOL"), Is.EqualTo("054800"));
+
+         // GERSCHFELD, split as
+         //   G-E-RS-CH-F-E-L-D
+         //   5--4/94-5/4-7-8-3 -> wrong
+         //   5--4/94-5/--7-8-3 -> correct
+         Assert.That(GetSoundex("GERSCHFELD"), Is.EqualTo("547830|545783|594783|594578"));
+     }
+
+     /// <summary><c>Encode</c> is expected to give one code per name, with no '|'-separated alternatives.</summary>
+     [Test]
+     public void TestEncodeBasic()
+     {
+         // same names as the second group in TestSoundexBasic, but without branching
+         Assert.That(Encode("AUERBACH"), Is.EqualTo("097400"));
+         Assert.That(Encode("OHRBACH"), Is.EqualTo("097400"));
+         Assert.That(Encode("LIPSHITZ"), Is.EqualTo("874400"));
+         Assert.That(Encode("LIPPSZYC"), Is.EqualTo("874400"));
+         Assert.That(Encode("LEWINSKY"), Is.EqualTo("876450"));
+         Assert.That(Encode("LEVINSKI"), Is.EqualTo("876450"));
+         Assert.That(Encode("SZLAMAWICZ"), Is.EqualTo("486740"));
+         Assert.That(Encode("SHLAMOVITZ"), Is.EqualTo("486740"));
+     }
+
+     /// <summary>An apostrophe at any position must not change the code.</summary>
+     [Test]
+     public void TestEncodeIgnoreApostrophes()
+     {
+         string[] spellings = { "OBrien", "'OBrien", "O'Brien", "OB'rien", "OBr'ien", "OBri'en", "OBrie'n", "OBrien'" };
+         this.CheckEncodingVariations("079600", spellings);
+     }
+
+     /// <summary>A hyphen at any position must not change the code. Test data from http://www.myatt.demon.co.uk/sxalg.htm</summary>
+     [Test]
+     public void TestEncodeIgnoreHyphens()
+     {
+         string[] spellings = { "KINGSMITH", "-KINGSMITH", "K-INGSMITH", "KI-NGSMITH", "KIN-GSMITH", "KING-SMITH",
+                                "KINGS-MITH", "KINGSM-ITH", "KINGSMI-TH", "KINGSMIT-H", "KINGSMITH-" };
+         this.CheckEncodingVariations("565463", spellings);
+     }
+
+     /// <summary>Leading and trailing whitespace must not change the code.</summary>
+     [Test]
+     public void TestEncodeIgnoreTrimmable()
+     {
+         Assert.That(Encode(" \t\n\r Washington \t\n\r "), Is.EqualTo("746536"));
+         Assert.That(Encode("Washington"), Is.EqualTo("746536"));
+     }
+
+     /// <summary>Examples from http://www.jewishgen.org/infofiles/soundex.html</summary>
+     [Test]
+     public void TestSoundexBasic()
+     {
+         Assert.That(GetSoundex("GOLDEN"), Is.EqualTo("583600"));
+         Assert.That(GetSoundex("Alpert"), Is.EqualTo("087930"));
+         Assert.That(GetSoundex("Breuer"), Is.EqualTo("791900"));
+         Assert.That(GetSoundex("Haber"), Is.EqualTo("579000"));
+         Assert.That(GetSoundex("Mannheim"), Is.EqualTo("665600"));
+         Assert.That(GetSoundex("Mintz"), Is.EqualTo("664000"));
+         Assert.That(GetSoundex("Topf"), Is.EqualTo("370000"));
+         Assert.That(GetSoundex("Kleinmann"), Is.EqualTo("586660"));
+         Assert.That(GetSoundex("Ben Aron"), Is.EqualTo("769600"));
+
+         Assert.That(GetSoundex("AUERBACH"), Is.EqualTo("097400|097500"));
+         Assert.That(GetSoundex("OHRBACH"), Is.EqualTo("097400|097500"));
+         Assert.That(GetSoundex("LIPSHITZ"), Is.EqualTo("874400"));
+         Assert.That(GetSoundex("LIPPSZYC"), Is.EqualTo("874400|874500"));
+         Assert.That(GetSoundex("LEWINSKY"), Is.EqualTo("876450"));
+         Assert.That(GetSoundex("LEVINSKI"), Is.EqualTo("876450"));
+         Assert.That(GetSoundex("SZLAMAWICZ"), Is.EqualTo("486740"));
+         Assert.That(GetSoundex("SHLAMOVITZ"), Is.EqualTo("486740"));
+     }
+
+     /// <summary>Examples from http://www.avotaynu.com/soundex.htm</summary>
+     [Test]
+     public void TestSoundexBasic2()
+     {
+         Assert.That(GetSoundex("Ceniow"), Is.EqualTo("467000|567000"));
+         Assert.That(GetSoundex("Tsenyuv"), Is.EqualTo("467000"));
+         Assert.That(GetSoundex("Holubica"), Is.EqualTo("587400|587500"));
+         Assert.That(GetSoundex("Golubitsa"), Is.EqualTo("587400"));
+         Assert.That(GetSoundex("Przemysl"), Is.EqualTo("746480|794648"));
+         Assert.That(GetSoundex("Pshemeshil"), Is.EqualTo("746480"));
+         Assert.That(GetSoundex("Rosochowaciec"), Is.EqualTo("944744|944745|944754|944755|945744|945745|945754|945755"));
+         Assert.That(GetSoundex("Rosokhovatsets"), Is.EqualTo("945744"));
+     }
+
+     /// <summary>Examples from http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex</summary>
+     [Test]
+     public void TestSoundexBasic3()
+     {
+         Assert.That(GetSoundex("Peters"), Is.EqualTo("734000|739400"));
+         Assert.That(GetSoundex("Peterson"), Is.EqualTo("734600|739460"));
+         Assert.That(GetSoundex("Moskowitz"), Is.EqualTo("645740"));
+         Assert.That(GetSoundex("Moskovitz"), Is.EqualTo("645740"));
+         Assert.That(GetSoundex("Jackson"), Is.EqualTo("154600|145460|454600|445460"));
+         string doubledName = GetSoundex("Jackson-Jackson");
+         Assert.That(doubledName, Is.EqualTo("154654|154645|154644|145465|145464|454654|454645|454644|445465|445464"));
+     }
+
+     /// <summary>The t-cedilla and t-comma spellings must produce the same codes.</summary>
+     [Test]
+     public void TestSpecialRomanianCharacters()
+     {
+         Assert.That(GetSoundex("ţamas"), Is.EqualTo("364000|464000")); // t-cedilla
+         Assert.That(GetSoundex("țamas"), Is.EqualTo("364000|464000")); // t-comma
+     }
+ }
+}