You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2016/08/23 23:18:30 UTC
[37/50] [abbrv] lucenenet git commit: Ported Analysis.Compound
namespace + tests
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87c1d606/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/TestCompoundWordTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/TestCompoundWordTokenFilter.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/TestCompoundWordTokenFilter.cs
index 7aa8a77..1feb390 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/TestCompoundWordTokenFilter.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/TestCompoundWordTokenFilter.cs
@@ -1,7 +1,16 @@
-\ufeffnamespace org.apache.lucene.analysis.compound
+\ufeffusing Lucene.Net.Analysis.CharFilters;
+using Lucene.Net.Analysis.Compound.Hyphenation;
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Compound
{
-
- /*
+ /*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -18,395 +27,370 @@
* limitations under the License.
*/
+ public class TestCompoundWordTokenFilter : BaseTokenStreamTestCase
+ {
+
+ private static CharArraySet makeDictionary(params string[] dictionary)
+ {
+ return new CharArraySet(TEST_VERSION_CURRENT, Arrays.AsList(dictionary), true);
+ }
- using MappingCharFilter = org.apache.lucene.analysis.charfilter.MappingCharFilter;
- using NormalizeCharMap = org.apache.lucene.analysis.charfilter.NormalizeCharMap;
- using HyphenationTree = org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
- using KeywordTokenizer = org.apache.lucene.analysis.core.KeywordTokenizer;
- using WhitespaceTokenizer = org.apache.lucene.analysis.core.WhitespaceTokenizer;
- using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
- using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- using Attribute = org.apache.lucene.util.Attribute;
- using AttributeImpl = org.apache.lucene.util.AttributeImpl;
- using InputSource = org.xml.sax.InputSource;
-
- public class TestCompoundWordTokenFilter : BaseTokenStreamTestCase
- {
-
- private static CharArraySet makeDictionary(params string[] dictionary)
- {
- return new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(dictionary), true);
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testHyphenationCompoundWordsDA() throws Exception
- public virtual void testHyphenationCompoundWordsDA()
- {
- CharArraySet dict = makeDictionary("l�se", "hest");
-
- InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
- HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(@is);
-
- HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("min veninde som er lidt af en l�sehest"), MockTokenizer.WHITESPACE, false), hyphenator, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
- assertTokenStreamContents(tf, new string[] {"min", "veninde", "som", "er", "lidt", "af", "en", "l�sehest", "l�se", "hest"}, new int[] {1, 1, 1, 1, 1, 1, 1, 1, 0, 0});
- }
+ [Test]
+ public virtual void TestHyphenationCompoundWordsDA()
+ {
+            CharArraySet dict = makeDictionary("læse", "hest");
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testHyphenationCompoundWordsDELongestMatch() throws Exception
- public virtual void testHyphenationCompoundWordsDELongestMatch()
- {
- CharArraySet dict = makeDictionary("basketball", "basket", "ball", "kurv");
-
- InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
- HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(@is);
-
- // the word basket will not be added due to the longest match option
- HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
- assertTokenStreamContents(tf, new string[] {"basketballkurv", "basketball", "ball", "kurv"}, new int[] {1, 0, 0, 0});
-
- }
-
- /// <summary>
- /// With hyphenation-only, you can get a lot of nonsense tokens.
- /// This can be controlled with the min/max subword size.
- /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testHyphenationOnly() throws Exception
- public virtual void testHyphenationOnly()
- {
- InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
- HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(@is);
-
- HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 2, 4);
-
- // min=2, max=4
- assertTokenStreamContents(tf, new string[] {"basketballkurv", "ba", "sket", "bal", "ball", "kurv"});
-
- tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 6);
-
- // min=4, max=6
- assertTokenStreamContents(tf, new string[] {"basketballkurv", "basket", "sket", "ball", "lkurv", "kurv"});
-
- tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 10);
-
- // min=4, max=10
- assertTokenStreamContents(tf, new string[] {"basketballkurv", "basket", "basketbal", "basketball", "sket", "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv"});
-
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testDumbCompoundWordsSE() throws Exception
- public virtual void testDumbCompoundWordsSE()
- {
- CharArraySet dict = makeDictionary("Bil", "D�rr", "Motor", "Tak", "Borr", "Slag", "Hammar", "Pelar", "Glas", "�gon", "Fodral", "Bas", "Fiol", "Makare", "Ges�ll", "Sko", "Vind", "Rute", "Torkare", "Blad");
-
- DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("Bild�rr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glas�gonfodral Basfiolsfodral Basfiolsfodralmakareges�ll Skomakare Vindrutetorkare Vindrutetorkarblad abba"), MockTokenizer.WHITESPACE, false), dict);
-
- assertTokenStreamContents(tf, new string[] {"Bild�rr", "Bil", "d�rr", "Bilmotor", "Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr", "Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr", "Glas�gonfodral", "Glas", "�gon", "fodral", "Basfiolsfodral", "Bas", "fiol", "fodral", "Basfiolsfodralmakareges�ll", "Bas", "fiol", "fodral", "makare", "ges�ll", "Skomakare", "Sko", "makare", "Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad", "Vind", "rute", "blad", "abba"}, new int[] {0, 0, 0, 8, 8, 8, 17, 17, 17, 24, 24, 24, 33, 33, 33, 44, 44, 44, 54, 54, 54, 54, 69, 69, 69, 69, 84, 84, 84, 84, 84, 84, 111, 111, 111, 121, 121, 121, 121, 137, 137, 137, 137, 156}, new int[] {7, 7, 7, 16, 16, 16, 23, 23, 23, 32, 32, 32, 43, 43, 43, 53, 53, 53, 68, 68, 68, 68, 83, 83, 83, 83, 110, 110, 110, 110, 110, 110, 120, 120, 120, 136, 136, 136, 136, 155, 155, 155, 155, 160}, new int[] {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0
, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1});
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testDumbCompoundWordsSELongestMatch() throws Exception
- public virtual void testDumbCompoundWordsSELongestMatch()
- {
- CharArraySet dict = makeDictionary("Bil", "D�rr", "Motor", "Tak", "Borr", "Slag", "Hammar", "Pelar", "Glas", "�gon", "Fodral", "Bas", "Fiols", "Makare", "Ges�ll", "Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral");
-
- DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("Basfiolsfodralmakareges�ll"), MockTokenizer.WHITESPACE, false), dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);
-
- assertTokenStreamContents(tf, new string[] {"Basfiolsfodralmakareges�ll", "Bas", "fiolsfodral", "fodral", "makare", "ges�ll"}, new int[] {0, 0, 0, 0, 0, 0}, new int[] {26, 26, 26, 26, 26, 26}, new int[] {1, 0, 0, 0, 0, 0});
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception
- public virtual void testTokenEndingWithWordComponentOfMinimumLength()
- {
- CharArraySet dict = makeDictionary("ab", "cd", "ef");
-
- DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcdef")
- ), dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
-
- assertTokenStreamContents(tf, new string[] {"abcdef", "ab", "cd", "ef"}, new int[] {0, 0, 0, 0}, new int[] {6, 6, 6, 6}, new int[] {1, 0, 0, 0});
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testWordComponentWithLessThanMinimumLength() throws Exception
- public virtual void testWordComponentWithLessThanMinimumLength()
- {
- CharArraySet dict = makeDictionary("abc", "d", "efg");
-
- DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcdefg")
- ), dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
-
- // since "d" is shorter than the minimum subword size, it should not be added to the token stream
- assertTokenStreamContents(tf, new string[] {"abcdefg", "abc", "efg"}, new int[] {0, 0, 0}, new int[] {7, 7, 7}, new int[] {1, 0, 0});
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testReset() throws Exception
- public virtual void testReset()
- {
- CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz", "Aufgabe", "�berwachung");
-
- Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("Rindfleisch�berwachungsgesetz"));
- DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, wsTokenizer, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
-
- CharTermAttribute termAtt = tf.getAttribute(typeof(CharTermAttribute));
- tf.reset();
- assertTrue(tf.incrementToken());
- assertEquals("Rindfleisch�berwachungsgesetz", termAtt.ToString());
- assertTrue(tf.incrementToken());
- assertEquals("Rind", termAtt.ToString());
- tf.end();
- tf.close();
- wsTokenizer.Reader = new StringReader("Rindfleisch�berwachungsgesetz");
- tf.reset();
- assertTrue(tf.incrementToken());
- assertEquals("Rindfleisch�berwachungsgesetz", termAtt.ToString());
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testRetainMockAttribute() throws Exception
- public virtual void testRetainMockAttribute()
- {
- CharArraySet dict = makeDictionary("abc", "d", "efg");
- Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcdefg"));
- TokenStream stream = new MockRetainAttributeFilter(tokenizer);
- stream = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, stream, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
- MockRetainAttribute retAtt = stream.addAttribute(typeof(MockRetainAttribute));
- stream.reset();
- while (stream.incrementToken())
- {
- assertTrue("Custom attribute value was lost", retAtt.Retain);
- }
-
- }
-
- public interface MockRetainAttribute : Attribute
- {
- bool Retain {set;get;}
- }
-
- public sealed class MockRetainAttributeImpl : AttributeImpl, MockRetainAttribute
- {
- internal bool retain = false;
- public override void clear()
- {
- retain = false;
- }
- public bool Retain
- {
- get
- {
- return retain;
- }
- set
- {
- this.retain = value;
- }
- }
- public override void copyTo(AttributeImpl target)
- {
- MockRetainAttribute t = (MockRetainAttribute) target;
- t.Retain = retain;
- }
- }
-
- private class MockRetainAttributeFilter : TokenFilter
- {
-
- internal MockRetainAttribute retainAtt = addAttribute(typeof(MockRetainAttribute));
-
- internal MockRetainAttributeFilter(TokenStream input) : base(input)
- {
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
- public override bool incrementToken()
- {
- if (input.incrementToken())
- {
- retainAtt.Retain = true;
- return true;
- }
- else
- {
- return false;
- }
- }
- }
-
- // SOLR-2891
- // *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of bounds
- // wrt original text if a previous filter increases the length of the word (in this case � -> ue)
- // so in this case we behave like WDF, and preserve any modified offsets
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testInvalidOffsets() throws Exception
- public virtual void testInvalidOffsets()
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.analysis.util.CharArraySet dict = makeDictionary("fall");
- CharArraySet dict = makeDictionary("fall");
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.analysis.charfilter.NormalizeCharMap.Builder builder = new org.apache.lucene.analysis.charfilter.NormalizeCharMap.Builder();
- NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
- builder.add("�", "ue");
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.analysis.charfilter.NormalizeCharMap normMap = builder.build();
- NormalizeCharMap normMap = builder.build();
-
- Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, dict, normMap);
-
- assertAnalyzesTo(analyzer, "bank�berfall", new string[] {"bankueberfall", "fall"}, new int[] {0, 0}, new int[] {12, 12});
- }
-
- private class AnalyzerAnonymousInnerClassHelper : Analyzer
- {
- private readonly TestCompoundWordTokenFilter outerInstance;
-
- private CharArraySet dict;
- private NormalizeCharMap normMap;
-
- public AnalyzerAnonymousInnerClassHelper(TestCompoundWordTokenFilter outerInstance, CharArraySet dict, NormalizeCharMap normMap)
- {
- this.outerInstance = outerInstance;
- this.dict = dict;
- this.normMap = normMap;
- }
-
-
- protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
- {
- Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- TokenFilter filter = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict);
- return new TokenStreamComponents(tokenizer, filter);
- }
-
- protected internal override Reader initReader(string fieldName, Reader reader)
- {
- return new MappingCharFilter(normMap, reader);
- }
- }
-
- /// <summary>
- /// blast some random strings through the analyzer </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testRandomStrings() throws Exception
- public virtual void testRandomStrings()
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.analysis.util.CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
- CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
- Analyzer a = new AnalyzerAnonymousInnerClassHelper2(this, dict);
- checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
-
- InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.analysis.compound.hyphenation.HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
- HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(@is);
- Analyzer b = new AnalyzerAnonymousInnerClassHelper3(this, hyphenator);
- checkRandomData(random(), b, 1000 * RANDOM_MULTIPLIER);
- }
-
- private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
- {
- private readonly TestCompoundWordTokenFilter outerInstance;
-
- private CharArraySet dict;
-
- public AnalyzerAnonymousInnerClassHelper2(TestCompoundWordTokenFilter outerInstance, CharArraySet dict)
- {
- this.outerInstance = outerInstance;
- this.dict = dict;
- }
-
-
- protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
- {
- Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict));
- }
- }
-
- private class AnalyzerAnonymousInnerClassHelper3 : Analyzer
- {
- private readonly TestCompoundWordTokenFilter outerInstance;
-
- private HyphenationTree hyphenator;
-
- public AnalyzerAnonymousInnerClassHelper3(TestCompoundWordTokenFilter outerInstance, HyphenationTree hyphenator)
- {
- this.outerInstance = outerInstance;
- this.hyphenator = hyphenator;
- }
-
-
- protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
- {
- Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- TokenFilter filter = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, hyphenator);
- return new TokenStreamComponents(tokenizer, filter);
- }
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testEmptyTerm() throws Exception
- public virtual void testEmptyTerm()
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.analysis.util.CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
- CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
- Analyzer a = new AnalyzerAnonymousInnerClassHelper4(this, dict);
- checkOneTerm(a, "", "");
-
- InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.analysis.compound.hyphenation.HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
- HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(@is);
- Analyzer b = new AnalyzerAnonymousInnerClassHelper5(this, hyphenator);
- checkOneTerm(b, "", "");
- }
-
- private class AnalyzerAnonymousInnerClassHelper4 : Analyzer
- {
- private readonly TestCompoundWordTokenFilter outerInstance;
-
- private CharArraySet dict;
-
- public AnalyzerAnonymousInnerClassHelper4(TestCompoundWordTokenFilter outerInstance, CharArraySet dict)
- {
- this.outerInstance = outerInstance;
- this.dict = dict;
- }
-
-
- protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
- {
- Tokenizer tokenizer = new KeywordTokenizer(reader);
- return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict));
- }
- }
-
- private class AnalyzerAnonymousInnerClassHelper5 : Analyzer
- {
- private readonly TestCompoundWordTokenFilter outerInstance;
-
- private HyphenationTree hyphenator;
-
- public AnalyzerAnonymousInnerClassHelper5(TestCompoundWordTokenFilter outerInstance, HyphenationTree hyphenator)
- {
- this.outerInstance = outerInstance;
- this.hyphenator = hyphenator;
- }
-
-
- protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
- {
- Tokenizer tokenizer = new KeywordTokenizer(reader);
- TokenFilter filter = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, hyphenator);
- return new TokenStreamComponents(tokenizer, filter);
- }
- }
- }
+ //InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
+ using (var @is = this.GetType().getResourceAsStream("da_UTF8.xml"))
+ {
+ HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(@is);
+                HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("min veninde som er lidt af en læsehest"), MockTokenizer.WHITESPACE, false), hyphenator, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+                AssertTokenStreamContents(tf, new string[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" }, new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 });
+ }
+ }
+
+ [Test]
+ public virtual void TestHyphenationCompoundWordsDELongestMatch()
+ {
+ CharArraySet dict = makeDictionary("basketball", "basket", "ball", "kurv");
+
+ //InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
+ using (var @is = this.GetType().getResourceAsStream("da_UTF8.xml"))
+ {
+ HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(@is);
+
+ // the word basket will not be added due to the longest match option
+ HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
+ AssertTokenStreamContents(tf, new string[] { "basketballkurv", "basketball", "ball", "kurv" }, new int[] { 1, 0, 0, 0 });
+ }
+ }
+
+ /// <summary>
+ /// With hyphenation-only, you can get a lot of nonsense tokens.
+ /// This can be controlled with the min/max subword size.
+ /// </summary>
+ [Test]
+ public virtual void TestHyphenationOnly()
+ {
+ //InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
+ using (var @is = this.GetType().getResourceAsStream("da_UTF8.xml"))
+ {
+ HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(@is);
+
+ HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 2, 4);
+
+ // min=2, max=4
+ AssertTokenStreamContents(tf, new string[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" });
+
+ tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 6);
+
+ // min=4, max=6
+ AssertTokenStreamContents(tf, new string[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" });
+
+ tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 10);
+
+ // min=4, max=10
+ AssertTokenStreamContents(tf, new string[] { "basketballkurv", "basket", "basketbal", "basketball", "sket", "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" });
+ }
+ }
+
+ [Test]
+ public virtual void TestDumbCompoundWordsSE()
+ {
+            CharArraySet dict = makeDictionary("Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll", "Sko", "Vind", "Rute", "Torkare", "Blad");
+
+            DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"), MockTokenizer.WHITESPACE, false), dict);
+
+            AssertTokenStreamContents(tf, new string[] { "Bildörr", "Bil", "dörr", "Bilmotor", "Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr", "Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr", "Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas", "fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol", "fodral", "makare", "gesäll", "Skomakare", "Sko", "makare", "Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad", "Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 0, 8, 8, 8, 17, 17, 17, 24, 24, 24, 33, 33, 33, 44, 44, 44, 54, 54, 54, 54, 69, 69, 69, 69, 84, 84, 84, 84, 84, 84, 111, 111, 111, 121, 121, 121, 121, 137, 137, 137, 137, 156 }, new int[] { 7, 7, 7, 16, 16, 16, 23, 23, 23, 32, 32, 32, 43, 43, 43, 53, 53, 53, 68, 68, 68, 68, 83, 83, 83, 83, 110, 110, 110, 110, 110, 110, 120, 120, 120, 136, 136, 136, 136, 155, 155, 155, 155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1 });
+ }
+
+ [Test]
+ public virtual void TestDumbCompoundWordsSELongestMatch()
+ {
+            CharArraySet dict = makeDictionary("Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll", "Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral");
+
+            DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("Basfiolsfodralmakaregesäll"), MockTokenizer.WHITESPACE, false), dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);
+
+            AssertTokenStreamContents(tf, new string[] { "Basfiolsfodralmakaregesäll", "Bas", "fiolsfodral", "fodral", "makare", "gesäll" }, new int[] { 0, 0, 0, 0, 0, 0 }, new int[] { 26, 26, 26, 26, 26, 26 }, new int[] { 1, 0, 0, 0, 0, 0 });
+ }
+
+ [Test]
+ public virtual void TestTokenEndingWithWordComponentOfMinimumLength()
+ {
+ CharArraySet dict = makeDictionary("ab", "cd", "ef");
+
+ DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcdef")
+ ), dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+
+ AssertTokenStreamContents(tf, new string[] { "abcdef", "ab", "cd", "ef" }, new int[] { 0, 0, 0, 0 }, new int[] { 6, 6, 6, 6 }, new int[] { 1, 0, 0, 0 });
+ }
+
+ [Test]
+ public virtual void TestWordComponentWithLessThanMinimumLength()
+ {
+ CharArraySet dict = makeDictionary("abc", "d", "efg");
+
+ DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcdefg")
+ ), dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+
+ // since "d" is shorter than the minimum subword size, it should not be added to the token stream
+ AssertTokenStreamContents(tf, new string[] { "abcdefg", "abc", "efg" }, new int[] { 0, 0, 0 }, new int[] { 7, 7, 7 }, new int[] { 1, 0, 0 });
+ }
+
+ [Test]
+ public virtual void TestReset()
+ {
+            CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz", "Aufgabe", "Überwachung");
+
+            Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("Rindfleischüberwachungsgesetz"));
+ DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, wsTokenizer, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+
+ ICharTermAttribute termAtt = tf.GetAttribute<ICharTermAttribute>();
+ tf.Reset();
+ assertTrue(tf.IncrementToken());
+            assertEquals("Rindfleischüberwachungsgesetz", termAtt.ToString());
+ assertTrue(tf.IncrementToken());
+ assertEquals("Rind", termAtt.ToString());
+ tf.End();
+ tf.Dispose();
+            wsTokenizer.Reader = new StringReader("Rindfleischüberwachungsgesetz");
+ tf.Reset();
+ assertTrue(tf.IncrementToken());
+            assertEquals("Rindfleischüberwachungsgesetz", termAtt.ToString());
+ }
+
+ [Test]
+ public virtual void TestRetainMockAttribute()
+ {
+ CharArraySet dict = makeDictionary("abc", "d", "efg");
+ Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcdefg"));
+ TokenStream stream = new MockRetainAttributeFilter(tokenizer);
+ stream = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, stream, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+ IMockRetainAttribute retAtt = stream.AddAttribute<IMockRetainAttribute>();
+ stream.Reset();
+ while (stream.IncrementToken())
+ {
+ assertTrue("Custom attribute value was lost", retAtt.Retain);
+ }
+
+ }
+
+ public interface IMockRetainAttribute : IAttribute
+ {
+ bool Retain { set; get; }
+ }
+
+ public sealed class MockRetainAttribute : Attribute, IMockRetainAttribute
+ {
+ internal bool retain = false;
+ public override void Clear()
+ {
+ retain = false;
+ }
+ public bool Retain
+ {
+ get
+ {
+ return retain;
+ }
+ set
+ {
+ this.retain = value;
+ }
+ }
+ public override void CopyTo(Attribute target)
+ {
+ IMockRetainAttribute t = (IMockRetainAttribute)target;
+ t.Retain = retain;
+ }
+ }
+
+ private class MockRetainAttributeFilter : TokenFilter
+ {
+
+ internal IMockRetainAttribute retainAtt;
+
+ internal MockRetainAttributeFilter(TokenStream input)
+ : base(input)
+ {
+ retainAtt = AddAttribute<IMockRetainAttribute>();
+ }
+
+ public override sealed bool IncrementToken()
+ {
+ if (input.IncrementToken())
+ {
+ retainAtt.Retain = true;
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+ }
+
+ // SOLR-2891
+ // *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of bounds
+ // wrt original text if a previous filter increases the length of the word (in this case ü -> ue)
+ // so in this case we behave like WDF, and preserve any modified offsets
+ [Test]
+ public virtual void TestInvalidOffsets()
+ {
+ CharArraySet dict = makeDictionary("fall");
+ NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
+ builder.Add("ü", "ue");
+ NormalizeCharMap normMap = builder.Build();
+
+ Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, dict, normMap);
+
+ AssertAnalyzesTo(analyzer, "banküberfall", new string[] { "bankueberfall", "fall" }, new int[] { 0, 0 }, new int[] { 12, 12 });
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper : Analyzer
+ {
+ private readonly TestCompoundWordTokenFilter outerInstance;
+
+ private CharArraySet dict;
+ private NormalizeCharMap normMap;
+
+ public AnalyzerAnonymousInnerClassHelper(TestCompoundWordTokenFilter outerInstance, CharArraySet dict, NormalizeCharMap normMap)
+ {
+ this.outerInstance = outerInstance;
+ this.dict = dict;
+ this.normMap = normMap;
+ }
+
+
+ public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenFilter filter = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+
+ public override TextReader InitReader(string fieldName, TextReader reader)
+ {
+ return new MappingCharFilter(normMap, reader);
+ }
+ }
+
+ /// <summary>
+ /// blast some random strings through the analyzer </summary>
+ [Test]
+ public virtual void TestRandomStrings()
+ {
+ CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
+ Analyzer a = new AnalyzerAnonymousInnerClassHelper2(this, dict);
+ CheckRandomData(Random(), a, 1000 * RANDOM_MULTIPLIER);
+
+ //InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
+ using (var @is = this.GetType().getResourceAsStream("da_UTF8.xml"))
+ {
+ HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(@is);
+ Analyzer b = new AnalyzerAnonymousInnerClassHelper3(this, hyphenator);
+ CheckRandomData(Random(), b, 1000 * RANDOM_MULTIPLIER);
+ }
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
+ {
+ private readonly TestCompoundWordTokenFilter outerInstance;
+
+ private CharArraySet dict;
+
+ public AnalyzerAnonymousInnerClassHelper2(TestCompoundWordTokenFilter outerInstance, CharArraySet dict)
+ {
+ this.outerInstance = outerInstance;
+ this.dict = dict;
+ }
+
+
+ public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict));
+ }
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper3 : Analyzer
+ {
+ private readonly TestCompoundWordTokenFilter outerInstance;
+
+ private HyphenationTree hyphenator;
+
+ public AnalyzerAnonymousInnerClassHelper3(TestCompoundWordTokenFilter outerInstance, HyphenationTree hyphenator)
+ {
+ this.outerInstance = outerInstance;
+ this.hyphenator = hyphenator;
+ }
+
+
+ public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenFilter filter = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, hyphenator);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ }
+
+ [Test]
+ public virtual void TestEmptyTerm()
+ {
+ CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
+ Analyzer a = new AnalyzerAnonymousInnerClassHelper4(this, dict);
+ CheckOneTerm(a, "", "");
+
+ //InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
+ using (var @is = this.GetType().getResourceAsStream("da_UTF8.xml"))
+ {
+
+ HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(@is);
+ Analyzer b = new AnalyzerAnonymousInnerClassHelper5(this, hyphenator);
+ CheckOneTerm(b, "", "");
+ }
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper4 : Analyzer
+ {
+ private readonly TestCompoundWordTokenFilter outerInstance;
+
+ private CharArraySet dict;
+
+ public AnalyzerAnonymousInnerClassHelper4(TestCompoundWordTokenFilter outerInstance, CharArraySet dict)
+ {
+ this.outerInstance = outerInstance;
+ this.dict = dict;
+ }
+
+
+ public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer tokenizer = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict));
+ }
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper5 : Analyzer
+ {
+ private readonly TestCompoundWordTokenFilter outerInstance;
+
+ private HyphenationTree hyphenator;
+
+ public AnalyzerAnonymousInnerClassHelper5(TestCompoundWordTokenFilter outerInstance, HyphenationTree hyphenator)
+ {
+ this.outerInstance = outerInstance;
+ this.hyphenator = hyphenator;
+ }
+
+
+ public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer tokenizer = new KeywordTokenizer(reader);
+ TokenFilter filter = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, hyphenator);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87c1d606/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/TestDictionaryCompoundWordTokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/TestDictionaryCompoundWordTokenFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/TestDictionaryCompoundWordTokenFilterFactory.cs
index dd219c0..4d469b1 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/TestDictionaryCompoundWordTokenFilterFactory.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/TestDictionaryCompoundWordTokenFilterFactory.cs
@@ -1,7 +1,10 @@
-\ufeffnamespace org.apache.lucene.analysis.compound
-{
+\ufeffusing Lucene.Net.Analysis.Util;
+using System.IO;
+using NUnit.Framework;
- /*
+namespace Lucene.Net.Analysis.Compound
+{
+ /*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -18,43 +21,37 @@
* limitations under the License.
*/
+ /// <summary>
+ /// Simple tests to ensure the Dictionary compound filter factory is working.
+ /// </summary>
+ public class TestDictionaryCompoundWordTokenFilterFactory : BaseTokenStreamFactoryTestCase
+ {
+ /// <summary>
+ /// Ensure the filter actually decompounds text.
+ /// </summary>
+ [Test]
+ public virtual void TestDecompounding()
+ {
+ TextReader reader = new StringReader("I like to play softball");
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = TokenFilterFactory("DictionaryCompoundWord", "dictionary", "compoundDictionary.txt").Create(stream);
+ AssertTokenStreamContents(stream, new string[] { "I", "like", "to", "play", "softball", "soft", "ball" });
+ }
- using BaseTokenStreamFactoryTestCase = org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
-
- /// <summary>
- /// Simple tests to ensure the Dictionary compound filter factory is working.
- /// </summary>
- public class TestDictionaryCompoundWordTokenFilterFactory : BaseTokenStreamFactoryTestCase
- {
- /// <summary>
- /// Ensure the filter actually decompounds text.
- /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testDecompounding() throws Exception
- public virtual void testDecompounding()
- {
- Reader reader = new StringReader("I like to play softball");
- TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- stream = tokenFilterFactory("DictionaryCompoundWord", "dictionary", "compoundDictionary.txt").create(stream);
- assertTokenStreamContents(stream, new string[] {"I", "like", "to", "play", "softball", "soft", "ball"});
- }
-
- /// <summary>
- /// Test that bogus arguments result in exception </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testBogusArguments() throws Exception
- public virtual void testBogusArguments()
- {
- try
- {
- tokenFilterFactory("DictionaryCompoundWord", "dictionary", "compoundDictionary.txt", "bogusArg", "bogusValue");
- fail();
- }
- catch (System.ArgumentException expected)
- {
- assertTrue(expected.Message.contains("Unknown parameters"));
- }
- }
- }
-
+ /// <summary>
+ /// Test that bogus arguments result in exception </summary>
+ [Test]
+ public virtual void TestBogusArguments()
+ {
+ try
+ {
+ TokenFilterFactory("DictionaryCompoundWord", "dictionary", "compoundDictionary.txt", "bogusArg", "bogusValue");
+ fail();
+ }
+ catch (System.ArgumentException expected)
+ {
+ assertTrue(expected.Message.Contains("Unknown parameters"));
+ }
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87c1d606/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/TestHyphenationCompoundWordTokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/TestHyphenationCompoundWordTokenFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/TestHyphenationCompoundWordTokenFilterFactory.cs
index f195618..79bf1a5 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/TestHyphenationCompoundWordTokenFilterFactory.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/TestHyphenationCompoundWordTokenFilterFactory.cs
@@ -1,7 +1,10 @@
-\ufeffnamespace org.apache.lucene.analysis.compound
-{
+\ufeffusing Lucene.Net.Analysis.Util;
+using NUnit.Framework;
+using System.IO;
- /*
+namespace Lucene.Net.Analysis.Compound
+{
+ /*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -18,60 +21,53 @@
* limitations under the License.
*/
+ /// <summary>
+ /// Simple tests to ensure the Hyphenation compound filter factory is working.
+ /// </summary>
+ public class TestHyphenationCompoundWordTokenFilterFactory : BaseTokenStreamFactoryTestCase
+ {
+ /// <summary>
+ /// Ensure the factory works with hyphenation grammar+dictionary: using default options.
+ /// </summary>
+ [Test]
+ public virtual void TestHyphenationWithDictionary()
+ {
+ TextReader reader = new StringReader("min veninde som er lidt af en læsehest");
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = TokenFilterFactory("HyphenationCompoundWord", "hyphenator", "da_UTF8.xml", "dictionary", "da_compoundDictionary.txt").Create(stream);
- using BaseTokenStreamFactoryTestCase = org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
-
- /// <summary>
- /// Simple tests to ensure the Hyphenation compound filter factory is working.
- /// </summary>
- public class TestHyphenationCompoundWordTokenFilterFactory : BaseTokenStreamFactoryTestCase
- {
- /// <summary>
- /// Ensure the factory works with hyphenation grammar+dictionary: using default options.
- /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testHyphenationWithDictionary() throws Exception
- public virtual void testHyphenationWithDictionary()
- {
- Reader reader = new StringReader("min veninde som er lidt af en læsehest");
- TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- stream = tokenFilterFactory("HyphenationCompoundWord", "hyphenator", "da_UTF8.xml", "dictionary", "da_compoundDictionary.txt").create(stream);
-
- assertTokenStreamContents(stream, new string[] {"min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest"}, new int[] {1, 1, 1, 1, 1, 1, 1, 1, 0, 0});
- }
-
- /// <summary>
- /// Ensure the factory works with no dictionary: using hyphenation grammar only.
- /// Also change the min/max subword sizes from the default. When using no dictionary,
- /// its generally necessary to tweak these, or you get lots of expansions.
- /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testHyphenationOnly() throws Exception
- public virtual void testHyphenationOnly()
- {
- Reader reader = new StringReader("basketballkurv");
- TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- stream = tokenFilterFactory("HyphenationCompoundWord", "hyphenator", "da_UTF8.xml", "minSubwordSize", "2", "maxSubwordSize", "4").create(stream);
+ AssertTokenStreamContents(stream, new string[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" }, new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 });
+ }
- assertTokenStreamContents(stream, new string[] {"basketballkurv", "ba", "sket", "bal", "ball", "kurv"});
- }
+ /// <summary>
+ /// Ensure the factory works with no dictionary: using hyphenation grammar only.
+ /// Also change the min/max subword sizes from the default. When using no dictionary,
+ /// its generally necessary to tweak these, or you get lots of expansions.
+ /// </summary>
+ [Test]
+ public virtual void TestHyphenationOnly()
+ {
+ TextReader reader = new StringReader("basketballkurv");
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = TokenFilterFactory("HyphenationCompoundWord", "hyphenator", "da_UTF8.xml", "minSubwordSize", "2", "maxSubwordSize", "4").Create(stream);
- /// <summary>
- /// Test that bogus arguments result in exception </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testBogusArguments() throws Exception
- public virtual void testBogusArguments()
- {
- try
- {
- tokenFilterFactory("HyphenationCompoundWord", "hyphenator", "da_UTF8.xml", "bogusArg", "bogusValue");
- fail();
- }
- catch (System.ArgumentException expected)
- {
- assertTrue(expected.Message.contains("Unknown parameters"));
- }
- }
- }
+ AssertTokenStreamContents(stream, new string[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" });
+ }
+ /// <summary>
+ /// Test that bogus arguments result in exception </summary>
+ [Test]
+ public virtual void TestBogusArguments()
+ {
+ try
+ {
+ TokenFilterFactory("HyphenationCompoundWord", "hyphenator", "da_UTF8.xml", "bogusArg", "bogusValue");
+ fail();
+ }
+ catch (System.ArgumentException expected)
+ {
+ assertTrue(expected.Message.Contains("Unknown parameters"));
+ }
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87c1d606/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/compoundDictionary.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/compoundDictionary.txt b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/compoundDictionary.txt
new file mode 100644
index 0000000..f4977b5
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/compoundDictionary.txt
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# A set of words for testing the DictionaryCompound factory
+soft
+ball
+team
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87c1d606/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/da_UTF8.xml
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/da_UTF8.xml b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/da_UTF8.xml
new file mode 100644
index 0000000..2c8d203
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/da_UTF8.xml
@@ -0,0 +1,1208 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE hyphenation-info SYSTEM "hyphenation.dtd">
+<!--
+ Copyright 1999-2004 The Apache Software Foundation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+This file contains the hyphenation patterns for danish.
+Adapted from dkhyph.tex, dkcommon.tex and dkspecial.tex
+originally created by Frank Jensen (fj@iesd.auc.dk).
+FOP adaptation by Carlos Villegas (cav@uniscope.co.jp)
+-->
+<hyphenation-info>
+
+<hyphen-char value="-"/>
+<hyphen-min before="2" after="2"/>
+
+<classes>
+aA
+bB
+cC
+dD
+eE
+fF
+gG
+hH
+iI
+jJ
+kK
+lL
+mM
+nN
+oO
+pP
+qQ
+rR
+sS
+tT
+uU
+vV
+wW
+xX
+yY
+zZ
+æÆ
+øØ
+åÅ
+</classes>
+<patterns>
+<!-- dkcommon -->
+.ae3
+.an3k
+.an1s
+.be5la
+.be1t
+.bi4tr
+.der3i
+.diagno5
+.her3
+.hoved3
+.ne4t5
+.om1
+.ove4
+.po1
+.til3
+.yd5r
+ab5le
+3abst
+a3c
+ade5la
+5adg
+a1e
+5afg
+5a4f1l
+af3r
+af4ri
+5afs
+a4gef
+a4gi
+ag5in
+ag5si
+3agti
+a4gy
+a3h
+ais5t
+a3j
+a5ka
+a3ke
+a5kr
+aku5
+a3la
+a1le
+a1li
+al3k
+4alkv
+a1lo
+al5si
+a3lu
+a1ly
+am4pa
+3analy
+an4k5r
+a3nu
+3anv
+a5o
+a5pe
+a3pi
+a5po
+a1ra
+ar5af
+1arb
+a1re
+5arg
+a1ri
+a3ro
+a3sa
+a3sc
+a1si
+a3sk
+a3so
+3a3sp
+a3ste
+a3sti
+a1ta1
+a1te
+a1ti
+a4t5in
+a1to
+ato5v
+a5tr
+a1tu
+a5va
+a1ve
+a5z
+1ba
+ba4ti
+4bd
+1be
+be1k
+be3ro
+be5ru
+be1s4
+be1tr
+1bi
+bi5sk
+b1j
+4b1n
+1bo
+bo4gr
+bo3ra
+bo5re
+1br4
+4bs
+bs5k
+b3so
+b1st
+b5t
+3bu
+bu4s5tr
+b5w
+1by
+by5s
+4c1c
+1ce
+ce5ro
+3ch
+4ch.
+ci4o
+ck3
+5cy
+3da
+4d3af
+d5anta
+da4s
+d1b
+d1d4
+1de
+de5d
+4de4lem
+der5eri
+de4rig
+de5sk
+d1f
+d1g
+d3h
+1di
+di1e
+di5l
+d3j
+d1k
+d1l
+d1m
+4d1n
+3do
+4dop
+d5ov
+d1p
+4drett
+5d4reve
+3drif
+3driv
+d5ros
+d5ru
+ds5an
+ds5in
+d1ski
+d4sm
+d4su
+dsu5l
+ds5vi
+d3ta
+d1te
+dt5o
+d5tr
+dt5u
+1du
+dub5
+d1v
+3dy
+e5ad
+e3af
+e5ag
+e3ak
+e1al
+ea4la
+e3an
+e5ap
+e3at
+e3bl
+ebs3
+e1ci
+ed5ar
+edde4
+eddel5
+e4do
+ed5ra
+ed3re
+ed3rin
+ed4str
+e3e
+3eff
+e3fr
+3eft
+e3gu
+e1h
+e3in
+ei5s
+e3je
+e4j5el
+e1ka
+e3ke
+e3kl
+4e1ko
+e5kr
+ek5sa
+3eksem
+3eksp
+e3ku
+e1kv
+e5ky
+e3lad
+el3ak
+el3ar
+e1las
+e3le
+e4lek
+3elem
+e1li
+5elim
+e3lo
+el5sa
+e5lu
+e3ly
+e4mad
+em4p5le
+em1s
+en5ak
+e4nan
+4enn
+e4no
+en3so
+e5nu
+e5ol
+e3op
+e1or
+e3ov
+epi3
+e1pr
+e3ra
+er3af
+e4rag
+e4rak
+e1re
+e4ref
+er5ege
+5erhv
+e1ri
+e4rib
+er1k
+ero5d
+er5ov
+er3s
+er5tr
+e3rum
+er5un
+e5ry
+e1ta
+e1te
+etek4s
+e1ti
+e3tj
+e1to
+e3tr
+e3tu
+e1ty
+e3um
+e3un
+3eur
+e1va
+e3ve
+e4v3erf
+e1vi
+e5x
+1fa
+fa4ce
+fags3
+f1b
+f1d
+1fe
+fej4
+fejl1
+f1f
+f1g
+f1h
+1fi
+f1k
+3fl
+1fo
+for1en
+fo4ri
+f1p
+f1s4
+4ft
+f3ta
+f1te
+f1ti
+f5to
+f5tvi
+1fu
+f1v
+3fy
+1ga
+g3art
+g1b
+g1d
+1ge
+4g5enden
+ger3in
+ge3s
+g3f
+g1g
+g1h
+1gi
+gi4b
+gi3st
+5gj
+g3k
+g1l
+g1m
+3go
+4g5om
+g5ov
+g3p
+1gr
+gs1a
+gsde4len
+g4se
+gsha4
+g5sla
+gs3or
+gs1p
+g5s4tide
+g4str
+gs1v
+g3ta
+g1te
+g1ti
+g5to
+g3tr
+gt4s
+g3ud
+gun5
+g3v
+1gy
+g5yd
+4ha.
+heds3
+he5s
+4het
+hi4e
+hi4n5
+hi3s
+ho5ko
+ho5ve
+4h3t
+hun4
+hund3
+hvo4
+i1a
+i3b
+i4ble
+i1c
+i3dr
+ids5k
+i1el
+i1en
+i3er
+i3et.
+if3r
+i3gu
+i3h
+i5i
+i5j
+i1ka
+i1ke
+ik1l
+i5ko
+ik3re
+ik5ri
+iks5t
+ik4tu
+i3ku
+ik3v
+i3lag
+il3eg
+il5ej
+il5el
+i3li
+i4l5id
+il3k
+i1lo
+il5u
+i3mu
+ind3t
+5inf
+ings1
+in3s
+in4sv
+inter1
+i3nu
+i3od
+i3og
+i5ok
+i3ol
+ion4
+ions1
+i5o5r
+i3ot
+i5pi
+i3pli
+i5pr
+i3re
+i3ri
+ir5t
+i3sc
+i3si
+i4sm
+is3p
+i1ster
+i3sti
+i5sua
+i1ta
+i1te
+i1ti
+i3to
+i3tr
+it5re.
+i1tu
+i3ty
+i1u
+i1va
+i1ve
+i1vi
+j3ag
+jde4rer
+jds1
+jek4to
+4j5en.
+j5k
+j3le
+j3li
+jlmeld5
+jlmel4di
+j3r
+jre5
+ju3s
+5kap
+k5au
+5kav
+k5b
+kel5s
+ke3sk
+ke5st
+ke4t5a
+k3h
+ki3e
+ki3st
+k1k
+k5lak
+k1le
+3klu
+k4ny
+5kod
+1kon
+ko3ra
+3kort
+ko3v
+1kra
+5kry
+ks3an
+k1si
+ks3k
+ks1p
+k3ste
+k5stu
+ks5v
+k1t
+k4tar
+k4terh
+kti4e
+kt5re
+kt5s
+3kur
+1kus
+3kut
+k4vo
+k4vu
+5lab
+lad3r
+5lagd
+la4g3r
+5lam
+1lat
+l1b
+ldiagnos5
+l3dr
+ld3st
+1le.
+5led
+4lele
+le4mo
+3len
+1ler
+1les
+4leu
+l1f
+lfin4
+lfind5
+l1go1
+l3h
+li4ga
+4l5ins
+4l3int
+li5o
+l3j
+l1ke
+l1ko
+l3ky
+l1l
+l5mu
+lo4du
+l3op
+4l5or
+3lov
+4l3p
+l4ps
+l3r
+4ls
+lses1
+ls5in
+l5sj
+l1ta
+l4taf
+l1te
+l4t5erf
+l3ti
+lt3o
+l3tr
+l3tu
+lu5l
+l3ve
+l3vi
+1ma
+m1b
+m3d
+1me
+4m5ej
+m3f
+m1g
+m3h
+1mi
+mi3k
+m5ing
+mi4o
+mi5sty
+m3k
+m1l
+m1m
+mmen5
+m1n
+3mo
+mo4da
+4mop
+4m5ov
+m1pe
+m3pi
+m3pl
+m1po
+m3pr
+m1r
+mse5s
+ms5in
+m5sk
+ms3p
+m3ste
+ms5v
+m3ta
+m3te
+m3ti
+m3tr
+m1ud
+1mul
+mu1li
+3my
+3na
+4nak
+1nal
+n1b
+n1c
+4nd
+n3dr
+nd5si
+nd5sk
+nd5sp
+1ne
+ne5a
+ne4da
+nemen4
+nement5e
+neo4
+n3erk
+n5erl
+ne5sl
+ne5st
+n1f
+n4go
+4n1h
+1ni
+4nim
+ni5o
+ni3st
+n1ke
+n1ko
+n3kr
+n3ku
+n5kv
+4n1l
+n1m
+n1n
+1no
+n3ord
+n5p
+n3r
+4ns
+n3si
+n1sku
+ns3po
+n1sta
+n5sti
+n1ta
+nta4le
+n1te
+n1ti
+ntiali4
+n3to
+n1tr
+nt4s5t
+nt4su
+n3tu
+n3ty
+4n1v
+3ny
+n3z
+o3a
+o4as
+ob3li
+o1c
+o4din
+od5ri
+od5s
+od5un
+o1e
+of5r
+o4gek
+o4gel
+o4g5o
+og5re
+og5sk
+o5h
+o5in
+oi6s5e
+o1j
+o3ka
+o1ke
+o3ku
+o3la
+o3le
+o1li
+o1lo
+o3lu
+o5ly
+1omr
+on3k
+ook5
+o3or
+o5ov
+o3pi
+op3l
+op3r
+op3s
+3opta
+4or.
+or1an
+3ordn
+ord5s
+o3re.
+o3reg
+o3rek
+o3rer
+o3re3s
+o3ret
+o3ri
+3orient
+or5im
+o4r5in
+or3k
+or5o
+or3sl
+or3st
+o3si
+o3so
+o3t
+o1te
+o5un
+ov4s
+3pa
+pa5gh
+p5anl
+p3d
+4pec
+3pen
+1per
+pe1ra
+pe5s
+pe3u
+p3f
+4p5h
+1pla
+p4lan
+4ple.
+4pler
+4ples
+p3m
+p3n
+5pok
+4po3re
+3pot
+4p5p4
+p4ro
+1proc
+p3sk
+p5so
+ps4p
+p3st
+p1t
+1pu
+pu5b
+p5ule
+p5v
+5py3
+qu4
+4raf
+ra5is
+4rarb
+r1b
+r4d5ar
+r3dr
+rd4s3
+4reks
+1rel
+re5la
+r5enss
+5rese
+re5spo
+4ress
+re3st
+re5s4u
+5rett
+r1f
+r1gu
+r1h
+ri1e
+ri5la
+4rimo
+r4ing
+ringse4
+ringso4r
+4rinp
+4rint
+r3ka
+r1ke
+r1ki
+rk3so
+r3ku
+r1l
+rmo4
+r5mu
+r1n
+ro1b
+ro3p
+r3or
+r3p
+r1r
+rre5s
+rro4n5
+r1sa
+r1si
+r5skr
+r4sk5v
+rs4n
+r3sp
+r5stu
+r5su
+r3sv
+r5tal
+r1te
+r4teli
+r1ti
+r3to
+r4t5or
+rt5rat
+rt3re
+r5tri
+r5tro
+rt3s
+r5ty
+r3ud
+run4da
+5rut
+r3va
+r1ve
+r3vi
+ry4s
+s3af
+1sam
+sa4ma
+s3ap
+s1ar
+1sat
+4s1b
+s1d
+sdy4
+1se
+s4ed
+5s4er
+se4se
+s1f
+4s1g4
+4s3h
+si4bl
+1sig
+s5int
+5sis
+5sit
+5siu
+s5ju
+4sk.
+1skab
+1ske
+s3kl
+sk5s4
+5sky
+s1le
+s1li
+slo3
+5slu
+s5ly
+s1m
+s4my
+4snin
+s4nit
+so5k
+5sol
+5som.
+3somm
+s5oms
+5somt
+3son
+4s1op
+sp4
+3spec
+4sper
+3s4pi
+s1pl
+3sprog.
+s5r4
+s1s4
+4st.
+5s4tam
+1stan
+st5as
+3stat
+1stav
+1ste.
+1sted
+3stel
+5stemo
+1sten
+5step
+3ster.
+3stes
+5stet
+5stj
+3sto
+st5om
+1str
+s1ud
+3sul
+s3un
+3sur
+s3ve
+3s4y
+1sy1s
+5ta.
+1tag
+tands3
+4tanv
+4tb
+tede4l
+teds5
+3teg
+5tekn
+teo1
+5term
+te5ro
+4t1f
+6t3g
+t1h
+tialis5t
+3tid
+ti4en
+ti3st
+4t3k
+4t1l
+tli4s5
+t1m
+t1n
+to5ra
+to1re
+to1ri
+tor4m
+4t3p
+t4ra
+4tres
+tro5v
+1try
+4ts
+t3si
+ts4pa
+ts5pr
+t3st
+ts5ul
+4t1t
+t5uds
+5tur
+t5ve
+1typ
+u1a
+5udl
+ud5r
+ud3s
+3udv
+u1e
+ue4t5
+uge4ri
+ugs3
+u5gu
+u3i
+u5kl
+uk4ta
+uk4tr
+u1la
+u1le
+u5ly
+u5pe
+up5l
+u5q
+u3ra
+u3re
+u4r3eg
+u1rer
+u3ro
+us5a
+u3si
+u5ska
+u5so
+us5v
+u1te
+u1ti
+u1to
+ut5r
+ut5s4
+5u5v
+va5d
+3varm
+1ved
+ve4l5e
+ve4reg
+ve3s
+5vet
+v5h
+vi4l3in
+1vis
+v5j
+v5k
+vl4
+v3le
+v5li
+vls1
+1vo
+4v5om
+v5p
+v5re
+v3st
+v5su
+v5t
+3vu
+y3a
+y5dr
+y3e
+y3ke
+y5ki
+yk3li
+y3ko
+yk4s5
+y3kv
+y5li
+y5lo
+y5mu
+yns5
+y5o
+y1pe
+y3pi
+y3re
+yr3ek
+y3ri
+y3si
+y3ti
+y5t3r
+y5ve
+zi5o
+<!-- dkspecial -->
+.s�3
+.�r5i
+.�v3r
+a3t�
+a5v�
+br�d3
+5b�
+5dr�v
+dst�4
+3d�
+3d�
+e3l�
+e3l�
+e3r�
+er5�n
+e5t�
+e5t�
+e1v�
+e3�
+e5�
+3f�
+3f�
+f�4r5en
+gi�4
+g4s�
+g5s�
+3g�
+3g�1
+3g�
+i5t�
+i3�
+3k�
+3k�
+lingeni�4
+l3v�
+5l�s
+m5t�
+1m�
+3m�
+3m�
+n3k�
+n5t�
+3n�
+4n5�b
+5n�
+o5l�
+or3�
+o5�
+5pr�
+5p�d
+p�3
+r5k�
+r5t�
+r5t�
+r3v�
+r5�l
+4r�n
+5r�r
+3r�d
+r5�r
+s4k�
+3sl�
+s4n�
+5st�
+1st�
+1s�
+4s5�n
+1s�
+s5�k
+s�4r5
+ti4�
+3tr�k.
+t4s�
+t5s�
+t3v�
+u3l�
+3v�rd
+1v�rk
+5v�
+y5v�
+�b3l
+�3c
+�3e
+�g5a
+�4gek
+�4g5r
+�gs5
+�5i
+�5kv
+�lle4
+�n1dr
+�5o
+�1re
+�r4g5r
+�3ri
+�r4ma
+�r4mo
+�r5s
+�5si
+�3so
+�3ste
+�3ve
+�de5
+�3e
+�1je
+�3ke
+�3le
+�ms5
+�n3st
+�n4t3
+�1re
+�3ri
+�rne3
+�r5o
+�1ve
+�1d
+�1e
+�5h
+�3l
+�3re
+�rs5t
+�5sk
+�3t
+</patterns>
+</hyphenation-info>
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87c1d606/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/da_compoundDictionary.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/da_compoundDictionary.txt b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/da_compoundDictionary.txt
new file mode 100644
index 0000000..9a14f40
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Compound/da_compoundDictionary.txt
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# A set of words for testing the HyphenationCompound factory,
+# in conjunction with the danish hyphenation grammar.
+læse
+hest
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87c1d606/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj b/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
index 1b641b7..504ec5f 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
+++ b/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
@@ -84,6 +84,9 @@
<Compile Include="Analysis\CommonGrams\CommonGramsFilterTest.cs" />
<Compile Include="Analysis\CommonGrams\TestCommonGramsFilterFactory.cs" />
<Compile Include="Analysis\CommonGrams\TestCommonGramsQueryFilterFactory.cs" />
+ <Compile Include="Analysis\Compound\TestCompoundWordTokenFilter.cs" />
+ <Compile Include="Analysis\Compound\TestDictionaryCompoundWordTokenFilterFactory.cs" />
+ <Compile Include="Analysis\Compound\TestHyphenationCompoundWordTokenFilterFactory.cs" />
<Compile Include="Analysis\Core\TestAnalyzers.cs" />
<Compile Include="Analysis\Core\TestBugInSomething.cs" />
<Compile Include="Analysis\Core\TestClassicAnalyzer.cs" />
@@ -476,7 +479,11 @@
<EmbeddedResource Include="Analysis\No\nn_light.txt" />
<EmbeddedResource Include="Analysis\No\nn_minimal.txt" />
</ItemGroup>
- <ItemGroup />
+ <ItemGroup>
+ <EmbeddedResource Include="Analysis\Compound\compoundDictionary.txt" />
+ <EmbeddedResource Include="Analysis\Compound\da_UTF8.xml" />
+ <EmbeddedResource Include="Analysis\Compound\da_compoundDictionary.txt" />
+ </ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.