You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/11/08 00:12:35 UTC
[31/34] lucenenet git commit: Raw porting of
Lucene.Net.Analysis.Common
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs
new file mode 100644
index 0000000..58b40a1
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs
@@ -0,0 +1,202 @@
+using System.Diagnostics;
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.compound
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+ using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using AttributeSource = org.apache.lucene.util.AttributeSource;
+ using Version = org.apache.lucene.util.Version;
+
+ /// <summary>
+ /// Base class for decomposition token filters.
+ /// <para>
+ ///
+ /// <a name="version"></a>
+ /// You must specify the required <seealso cref="Version"/> compatibility when creating
+ /// CompoundWordTokenFilterBase:
+ /// <ul>
+ /// <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ /// supplementary characters in strings and char arrays provided as compound word
+ /// dictionaries.
+ /// <li>As of 4.4, <seealso cref="CompoundWordTokenFilterBase"/> doesn't update offsets.
+ /// </ul>
+ /// </para>
+ /// </summary>
+ public abstract class CompoundWordTokenFilterBase : TokenFilter
+ {
+ // NOTE(review): raw machine-assisted Java-to-C# port (see the converter
+ // comments below). Several lines are still Java syntax and will not compile
+ // as C#; each such spot is flagged inline with a NOTE(review).
+
+ /// <summary>
+ /// The default for minimal word length that gets decomposed
+ /// </summary>
+ public const int DEFAULT_MIN_WORD_SIZE = 5;
+
+ /// <summary>
+ /// The default for minimal length of subwords that get propagated to the output of this filter
+ /// </summary>
+ public const int DEFAULT_MIN_SUBWORD_SIZE = 2;
+
+ /// <summary>
+ /// The default for maximal length of subwords that get propagated to the output of this filter
+ /// </summary>
+ public const int DEFAULT_MAX_SUBWORD_SIZE = 15;
+
+ // Lucene compatibility version; CompoundToken (below) consults it to decide
+ // whether subword offsets may be adjusted (pre-4.4) or left untouched (4.4+).
+ protected internal readonly Version matchVersion;
+ // Words to match subwords against; subclasses may pass null (see HyphenationCompoundWordTokenFilter).
+ protected internal readonly CharArraySet dictionary;
+ // Queue of decompounded subword tokens still to be emitted by incrementToken().
+ protected internal readonly LinkedList<CompoundToken> tokens;
+ protected internal readonly int minWordSize;
+ protected internal readonly int minSubwordSize;
+ protected internal readonly int maxSubwordSize;
+ protected internal readonly bool onlyLongestMatch;
+
+ // NOTE(review): unlike Java, C# does not allow calling the instance method
+ // addAttribute(...) in a field initializer — these three initializations
+ // must move into the constructor for the port to compile.
+ protected internal readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+ protected internal readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
+ private readonly PositionIncrementAttribute posIncAtt = addAttribute(typeof(PositionIncrementAttribute));
+
+ // Attribute state captured from the original token in incrementToken();
+ // restored before each queued subword token so other attributes are untouched.
+ private AttributeSource.State current;
+
+ /// <summary>
+ /// Creates a filter with the default minimum word / subword size limits. </summary>
+ protected internal CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, bool onlyLongestMatch) : this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch)
+ {
+ }
+
+ /// <summary>
+ /// Creates a filter with the default size limits that emits all matching subwords
+ /// (onlyLongestMatch = false). </summary>
+ protected internal CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary) : this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false)
+ {
+ }
+
+ /// <summary>
+ /// Creates a filter, validating that none of the size limits is negative. </summary>
+ /// <exception cref="System.ArgumentException"> if minWordSize, minSubwordSize or maxSubwordSize is negative </exception>
+ protected internal CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch) : base(input)
+ {
+ this.matchVersion = matchVersion;
+ // NOTE(review): Java diamond syntax — C# requires the explicit type
+ // argument here: new LinkedList<CompoundToken>().
+ this.tokens = new LinkedList<>();
+ if (minWordSize < 0)
+ {
+ throw new System.ArgumentException("minWordSize cannot be negative");
+ }
+ this.minWordSize = minWordSize;
+ if (minSubwordSize < 0)
+ {
+ throw new System.ArgumentException("minSubwordSize cannot be negative");
+ }
+ this.minSubwordSize = minSubwordSize;
+ if (maxSubwordSize < 0)
+ {
+ throw new System.ArgumentException("maxSubwordSize cannot be negative");
+ }
+ this.maxSubwordSize = maxSubwordSize;
+ this.onlyLongestMatch = onlyLongestMatch;
+ this.dictionary = dictionary;
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
+ /// <summary>
+ /// Emits any queued decompounded tokens first (restored from the captured
+ /// state, at position increment 0), then advances the input. Input tokens
+ /// whose term length is at least minWordSize are handed to
+ /// <seealso cref="decompose()"/>; the original token itself is always returned.
+ /// </summary>
+ public override bool incrementToken()
+ {
+ if (tokens.Count > 0)
+ {
+ Debug.Assert(current != null);
+ // NOTE(review): .NET's LinkedList<T>.RemoveFirst() returns void, unlike
+ // Java's removeFirst() — the port must read tokens.First.Value before
+ // calling RemoveFirst() for this line to compile.
+ CompoundToken token = tokens.RemoveFirst();
+ restoreState(current); // keep all other attributes untouched
+ termAtt.setEmpty().append(token.txt);
+ offsetAtt.setOffset(token.startOffset, token.endOffset);
+ posIncAtt.PositionIncrement = 0;
+ return true;
+ }
+
+ current = null; // not really needed, but for safety
+ if (input.incrementToken())
+ {
+ // Only words longer than minWordSize get processed
+ if (termAtt.length() >= this.minWordSize)
+ {
+ decompose();
+ // only capture the state if we really need it for producing new tokens
+ if (tokens.Count > 0)
+ {
+ current = captureState();
+ }
+ }
+ // return original token:
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+
+ /// <summary>
+ /// Decomposes the current <seealso cref="#termAtt"/> and places <seealso cref="CompoundToken"/> instances in the <seealso cref="#tokens"/> list.
+ /// The original token may not be placed in the list, as it is automatically passed through this filter.
+ /// </summary>
+ protected internal abstract void decompose();
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
+ /// <summary>
+ /// Resets the underlying stream and discards any pending decompounded
+ /// tokens and the captured attribute state. </summary>
+ public override void reset()
+ {
+ base.reset();
+ tokens.Clear();
+ current = null;
+ }
+
+ /// <summary>
+ /// Helper class to hold decompounded token information
+ /// </summary>
+ protected internal class CompoundToken
+ {
+ private readonly CompoundWordTokenFilterBase outerInstance;
+
+ // NOTE(review): CharSequence is a Java type — the .NET port needs
+ // ICharSequence (Lucene.Net.Support) or string here.
+ public readonly CharSequence txt;
+ public readonly int startOffset, endOffset;
+
+ /// <summary>
+ /// Construct the compound token based on a slice of the current <seealso cref="CompoundWordTokenFilterBase#termAtt"/>. </summary>
+ public CompoundToken(CompoundWordTokenFilterBase outerInstance, int offset, int length)
+ {
+ this.outerInstance = outerInstance;
+ this.txt = outerInstance.termAtt.subSequence(offset, offset + length);
+
+ // offsets of the original word
+ int startOff = outerInstance.offsetAtt.startOffset();
+ int endOff = outerInstance.offsetAtt.endOffset();
+
+ // 4.4+ never adjusts offsets (see the class doc); pre-4.4 they are only
+ // adjusted when the offset span matches the term length exactly.
+ if (outerInstance.matchVersion.onOrAfter(Version.LUCENE_44) || endOff - startOff != outerInstance.termAtt.length())
+ {
+ // if length by start + end offsets doesn't match the term text then assume
+ // this is a synonym and don't adjust the offsets.
+ this.startOffset = startOff;
+ this.endOffset = endOff;
+ }
+ else
+ {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int newStart = startOff + offset;
+ int newStart = startOff + offset;
+ this.startOffset = newStart;
+ this.endOffset = newStart + length;
+ }
+ }
+
+ }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilter.cs
new file mode 100644
index 0000000..6b875e0
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilter.cs
@@ -0,0 +1,137 @@
+namespace org.apache.lucene.analysis.compound
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using Version = org.apache.lucene.util.Version;
+
+ /// <summary>
+ /// A <seealso cref="TokenFilter"/> that decomposes compound words found in many Germanic languages.
+ /// <para>
+ /// "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
+ /// "Donaudampfschiff" even when you only enter "schiff".
+ /// It uses a brute-force algorithm to achieve this.
+ /// </para>
+ /// <para>
+ /// You must specify the required <seealso cref="Version"/> compatibility when creating
+ /// CompoundWordTokenFilterBase:
+ /// <ul>
+ /// <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ /// supplementary characters in strings and char arrays provided as compound word
+ /// dictionaries.
+ /// </ul>
+ /// </para>
+ /// </summary>
+ public class DictionaryCompoundWordTokenFilter : CompoundWordTokenFilterBase
+ {
+
+ /// <summary>
+ /// Creates a new <seealso cref="DictionaryCompoundWordTokenFilter"/>
+ /// </summary>
+ /// <param name="matchVersion">
+ /// Lucene version to enable correct Unicode 4.0 behavior in the
+ /// dictionaries if Version > 3.0. See <a
+ /// href="CompoundWordTokenFilterBase.html#version"
+ /// >CompoundWordTokenFilterBase</a> for details. </param>
+ /// <param name="input">
+ /// the <seealso cref="TokenStream"/> to process </param>
+ /// <param name="dictionary">
+ /// the word dictionary to match against. </param>
+ /// <exception cref="System.ArgumentException"> if dictionary is null </exception>
+ public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary) : base(matchVersion, input, dictionary)
+ {
+ // NOTE(review): validated only after the base constructor has already run
+ // and stored the reference (Java-port parity); the base class itself
+ // tolerates a null dictionary, so this is the subclass's own contract.
+ if (dictionary == null)
+ {
+ throw new System.ArgumentException("dictionary cannot be null");
+ }
+ }
+
+ /// <summary>
+ /// Creates a new <seealso cref="DictionaryCompoundWordTokenFilter"/>
+ /// </summary>
+ /// <param name="matchVersion">
+ /// Lucene version to enable correct Unicode 4.0 behavior in the
+ /// dictionaries if Version > 3.0. See <a
+ /// href="CompoundWordTokenFilterBase.html#version"
+ /// >CompoundWordTokenFilterBase</a> for details. </param>
+ /// <param name="input">
+ /// the <seealso cref="TokenStream"/> to process </param>
+ /// <param name="dictionary">
+ /// the word dictionary to match against. </param>
+ /// <param name="minWordSize">
+ /// only words longer than this get processed </param>
+ /// <param name="minSubwordSize">
+ /// only subwords longer than this get to the output stream </param>
+ /// <param name="maxSubwordSize">
+ /// only subwords shorter than this get to the output stream </param>
+ /// <param name="onlyLongestMatch">
+ /// Add only the longest matching subword to the stream </param>
+ /// <exception cref="System.ArgumentException"> if dictionary is null </exception>
+ public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch) : base(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch)
+ {
+ if (dictionary == null)
+ {
+ throw new System.ArgumentException("dictionary cannot be null");
+ }
+ }
+
+ /// <summary>
+ /// Brute-force decomposition: for every start position i in the current term
+ /// and every candidate length j in [minSubwordSize, maxSubwordSize], queues a
+ /// CompoundToken for each slice found in the dictionary. When
+ /// onlyLongestMatch is set, only the longest dictionary match per start
+ /// position is queued.
+ /// </summary>
+ protected internal override void decompose()
+ {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int len = termAtt.length();
+ int len = termAtt.length();
+ for (int i = 0;i <= len - this.minSubwordSize;++i)
+ {
+ CompoundToken longestMatchToken = null;
+ for (int j = this.minSubwordSize;j <= this.maxSubwordSize;++j)
+ {
+ // candidate slice would run past the end of the term — no longer
+ // candidates exist for this start position
+ if (i + j > len)
+ {
+ break;
+ }
+ if (dictionary.contains(termAtt.buffer(), i, j))
+ {
+ if (this.onlyLongestMatch)
+ {
+ // keep only the longest match starting at i
+ if (longestMatchToken != null)
+ {
+ if (longestMatchToken.txt.length() < j)
+ {
+ longestMatchToken = new CompoundToken(this, i,j);
+ }
+ }
+ else
+ {
+ longestMatchToken = new CompoundToken(this, i,j);
+ }
+ }
+ else
+ {
+ tokens.AddLast(new CompoundToken(this, i,j));
+ }
+ }
+ }
+ if (this.onlyLongestMatch && longestMatchToken != null)
+ {
+ tokens.AddLast(longestMatchToken);
+ }
+ }
+ }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilterFactory.cs
new file mode 100644
index 0000000..497d89d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilterFactory.cs
@@ -0,0 +1,81 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.compound
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using ResourceLoader = org.apache.lucene.analysis.util.ResourceLoader;
+ using ResourceLoaderAware = org.apache.lucene.analysis.util.ResourceLoaderAware;
+ using TokenFilterFactory = TokenFilterFactory;
+
+
+ /// <summary>
+ /// Factory for <seealso cref="DictionaryCompoundWordTokenFilter"/>.
+ /// <pre class="prettyprint">
+ /// <fieldType name="text_dictcomp" class="solr.TextField" positionIncrementGap="100">
+ /// <analyzer>
+ /// <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ /// <filter class="solr.DictionaryCompoundWordTokenFilterFactory" dictionary="dictionary.txt"
+ /// minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="true"/>
+ /// </analyzer>
+ /// </fieldType></pre>
+ /// </summary>
+ public class DictionaryCompoundWordTokenFilterFactory : TokenFilterFactory, ResourceLoaderAware
+ {
+ // Word dictionary loaded lazily in inform(); stays null if the resource was empty.
+ private CharArraySet dictionary;
+ // Configuration captured from the args map in the constructor.
+ private readonly string dictFile;
+ private readonly int minWordSize;
+ private readonly int minSubwordSize;
+ private readonly int maxSubwordSize;
+ private readonly bool onlyLongestMatch;
+
+ /// <summary>
+ /// Creates a new DictionaryCompoundWordTokenFilterFactory </summary>
+ /// <exception cref="System.ArgumentException"> if the mandatory "dictionary"
+ /// argument is missing or unconsumed parameters remain </exception>
+ public DictionaryCompoundWordTokenFilterFactory(IDictionary<string, string> args) : base(args)
+ {
+ assureMatchVersion();
+ dictFile = require(args, "dictionary");
+ minWordSize = getInt(args, "minWordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
+ minSubwordSize = getInt(args, "minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
+ maxSubwordSize = getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
+ // NOTE(review): defaults to true here, while the filter's own two-argument
+ // constructor defaults onlyLongestMatch to false — presumably intentional
+ // Java parity; verify against upstream Lucene before changing.
+ onlyLongestMatch = getBoolean(args, "onlyLongestMatch", true);
+ if (args.Count > 0)
+ {
+ throw new System.ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void inform(org.apache.lucene.analysis.util.ResourceLoader loader) throws java.io.IOException
+ /// <summary>
+ /// Loads the configured dictionary file through the given resource loader;
+ /// leaves <c>dictionary</c> null when the resource contains no words. </summary>
+ public virtual void inform(ResourceLoader loader)
+ {
+ dictionary = base.getWordSet(loader, dictFile, false);
+ }
+
+ /// <summary>
+ /// Wraps the input stream in a <seealso cref="DictionaryCompoundWordTokenFilter"/>,
+ /// or returns the input unchanged when no dictionary was loaded. </summary>
+ public override TokenStream create(TokenStream input)
+ {
+ // if the dictionary is null, it means it was empty
+ return dictionary == null ? input : new DictionaryCompoundWordTokenFilter(luceneMatchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+ }
+ }
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilter.cs
new file mode 100644
index 0000000..0b5e99c
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilter.cs
@@ -0,0 +1,255 @@
+namespace org.apache.lucene.analysis.compound
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using Hyphenation = org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
+ using HyphenationTree = org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using Version = org.apache.lucene.util.Version;
+ using InputSource = org.xml.sax.InputSource;
+
+ /// <summary>
+ /// A <seealso cref="TokenFilter"/> that decomposes compound words found in many Germanic languages.
+ /// <para>
+ /// "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
+ /// "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation
+ /// grammar and a word dictionary to achieve this.
+ /// </para>
+ /// <para>
+ /// You must specify the required <seealso cref="Version"/> compatibility when creating
+ /// CompoundWordTokenFilterBase:
+ /// <ul>
+ /// <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ /// supplementary characters in strings and char arrays provided as compound word
+ /// dictionaries.
+ /// </ul>
+ /// </para>
+ /// </summary>
+ public class HyphenationCompoundWordTokenFilter : CompoundWordTokenFilterBase
+ {
+ // Hyphenation pattern tree used to compute candidate subword break points.
+ private HyphenationTree hyphenator;
+
+ /// <summary>
+ /// Creates a new <seealso cref="HyphenationCompoundWordTokenFilter"/> instance.
+ /// </summary>
+ /// <param name="matchVersion">
+ /// Lucene version to enable correct Unicode 4.0 behavior in the
+ /// dictionaries if Version > 3.0. See <a
+ /// href="CompoundWordTokenFilterBase.html#version"
+ /// >CompoundWordTokenFilterBase</a> for details. </param>
+ /// <param name="input">
+ /// the <seealso cref="TokenStream"/> to process </param>
+ /// <param name="hyphenator">
+ /// the hyphenation pattern tree to use for hyphenation </param>
+ /// <param name="dictionary">
+ /// the word dictionary to match against. </param>
+ public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, HyphenationTree hyphenator, CharArraySet dictionary) : this(matchVersion, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false)
+ {
+ }
+
+ /// <summary>
+ /// Creates a new <seealso cref="HyphenationCompoundWordTokenFilter"/> instance.
+ /// </summary>
+ /// <param name="matchVersion">
+ /// Lucene version to enable correct Unicode 4.0 behavior in the
+ /// dictionaries if Version > 3.0. See <a
+ /// href="CompoundWordTokenFilterBase.html#version"
+ /// >CompoundWordTokenFilterBase</a> for details. </param>
+ /// <param name="input">
+ /// the <seealso cref="TokenStream"/> to process </param>
+ /// <param name="hyphenator">
+ /// the hyphenation pattern tree to use for hyphenation </param>
+ /// <param name="dictionary">
+ /// the word dictionary to match against. </param>
+ /// <param name="minWordSize">
+ /// only words longer than this get processed </param>
+ /// <param name="minSubwordSize">
+ /// only subwords longer than this get to the output stream </param>
+ /// <param name="maxSubwordSize">
+ /// only subwords shorter than this get to the output stream </param>
+ /// <param name="onlyLongestMatch">
+ /// Add only the longest matching subword to the stream </param>
+ public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, HyphenationTree hyphenator, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch) : base(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch)
+ {
+
+ this.hyphenator = hyphenator;
+ }
+
+ /// <summary>
+ /// Create a HyphenationCompoundWordTokenFilter with no dictionary.
+ /// <para>
+ /// Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, CharArraySet, int, int, int, boolean)
+ /// HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
+ /// null, minWordSize, minSubwordSize, maxSubwordSize }
+ /// </para>
+ /// </summary>
+ public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, HyphenationTree hyphenator, int minWordSize, int minSubwordSize, int maxSubwordSize) : this(matchVersion, input, hyphenator, null, minWordSize, minSubwordSize, maxSubwordSize, false)
+ {
+ }
+
+ /// <summary>
+ /// Create a HyphenationCompoundWordTokenFilter with no dictionary.
+ /// <para>
+ /// Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, int, int, int)
+ /// HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
+ /// DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE }
+ /// </para>
+ /// </summary>
+ public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, HyphenationTree hyphenator) : this(matchVersion, input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE)
+ {
+ }
+
+ /// <summary>
+ /// Create a hyphenator tree
+ /// </summary>
+ /// <param name="hyphenationFilename"> the filename of the XML grammar to load </param>
+ /// <returns> An object representing the hyphenation patterns </returns>
+ /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public static org.apache.lucene.analysis.compound.hyphenation.HyphenationTree getHyphenationTree(String hyphenationFilename) throws java.io.IOException
+ public static HyphenationTree getHyphenationTree(string hyphenationFilename)
+ {
+ return getHyphenationTree(new InputSource(hyphenationFilename));
+ }
+
+ /// <summary>
+ /// Create a hyphenator tree
+ /// </summary>
+ /// <param name="hyphenationFile"> the file of the XML grammar to load </param>
+ /// <returns> An object representing the hyphenation patterns </returns>
+ /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public static org.apache.lucene.analysis.compound.hyphenation.HyphenationTree getHyphenationTree(java.io.File hyphenationFile) throws java.io.IOException
+ // NOTE(review): File / toURI().toASCIIString() are Java APIs — this overload
+ // needs a System.IO-based equivalent (e.g. FileInfo) to compile in .NET.
+ public static HyphenationTree getHyphenationTree(File hyphenationFile)
+ {
+ return getHyphenationTree(new InputSource(hyphenationFile.toURI().toASCIIString()));
+ }
+
+ /// <summary>
+ /// Create a hyphenator tree
+ /// </summary>
+ /// <param name="hyphenationSource"> the InputSource pointing to the XML grammar </param>
+ /// <returns> An object representing the hyphenation patterns </returns>
+ /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public static org.apache.lucene.analysis.compound.hyphenation.HyphenationTree getHyphenationTree(org.xml.sax.InputSource hyphenationSource) throws java.io.IOException
+ public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
+ {
+ HyphenationTree tree = new HyphenationTree();
+ tree.loadPatterns(hyphenationSource);
+ return tree;
+ }
+
+ /// <summary>
+ /// Decomposes the current term along its hyphenation points: each pair of
+ /// points bounds a candidate subword, constrained to
+ /// [minSubwordSize, maxSubwordSize]. A candidate is queued when no dictionary
+ /// is configured or the dictionary contains it; otherwise a second lookup is
+ /// tried with the last character dropped (to tolerate binding characters such
+ /// as a genitive 's'). With onlyLongestMatch, only the longest accepted
+ /// candidate per start point is queued.
+ /// </summary>
+ protected internal override void decompose()
+ {
+ // get the hyphenation points
+ Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
+ // No hyphen points found -> exit
+ if (hyphens == null)
+ {
+ return;
+ }
+
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int[] hyp = hyphens.getHyphenationPoints();
+ int[] hyp = hyphens.HyphenationPoints;
+
+ for (int i = 0; i < hyp.Length; ++i)
+ {
+ int remaining = hyp.Length - i;
+ int start = hyp[i];
+ CompoundToken longestMatchToken = null;
+ for (int j = 1; j < remaining; j++)
+ {
+ int partLength = hyp[i + j] - start;
+
+ // if the part is longer than maxSubwordSize we
+ // are done with this round
+ if (partLength > this.maxSubwordSize)
+ {
+ break;
+ }
+
+ // we only put subwords to the token stream
+ // that are longer than minPartSize
+ if (partLength < this.minSubwordSize)
+ {
+ // BOGUS/BROKEN/FUNKY/WACKO: somehow we have negative 'parts' according to the
+ // calculation above, and we rely upon minSubwordSize being >=0 to filter them out...
+ continue;
+ }
+
+ // check the dictionary
+ if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength))
+ {
+ if (this.onlyLongestMatch)
+ {
+ if (longestMatchToken != null)
+ {
+ if (longestMatchToken.txt.length() < partLength)
+ {
+ longestMatchToken = new CompoundToken(this, start, partLength);
+ }
+ }
+ else
+ {
+ longestMatchToken = new CompoundToken(this, start, partLength);
+ }
+ }
+ else
+ {
+ tokens.AddLast(new CompoundToken(this, start, partLength));
+ }
+ }
+ else if (dictionary.contains(termAtt.buffer(), start, partLength - 1))
+ {
+ // check the dictionary again with a word that is one character
+ // shorter
+ // to avoid problems with genitive 's characters and other binding
+ // characters
+ if (this.onlyLongestMatch)
+ {
+ if (longestMatchToken != null)
+ {
+ if (longestMatchToken.txt.length() < partLength - 1)
+ {
+ longestMatchToken = new CompoundToken(this, start, partLength - 1);
+ }
+ }
+ else
+ {
+ longestMatchToken = new CompoundToken(this, start, partLength - 1);
+ }
+ }
+ else
+ {
+ tokens.AddLast(new CompoundToken(this, start, partLength - 1));
+ }
+ }
+ }
+ if (this.onlyLongestMatch && longestMatchToken != null)
+ {
+ tokens.AddLast(longestMatchToken);
+ }
+ }
+ }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilterFactory.cs
new file mode 100644
index 0000000..4a51f7b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilterFactory.cs
@@ -0,0 +1,125 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.compound
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using HyphenationTree = org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using ResourceLoader = org.apache.lucene.analysis.util.ResourceLoader;
+ using ResourceLoaderAware = org.apache.lucene.analysis.util.ResourceLoaderAware;
+ using TokenFilterFactory = TokenFilterFactory;
+ using IOUtils = org.apache.lucene.util.IOUtils;
+
+ using InputSource = org.xml.sax.InputSource;
+
+ /// <summary>
+ /// Factory for <seealso cref="HyphenationCompoundWordTokenFilter"/>.
+ /// <para>
+ /// This factory accepts the following parameters:
+ /// <ul>
+ /// <li><code>hyphenator</code> (mandatory): path to the FOP xml hyphenation pattern.
+ /// See <a href="http://offo.sourceforge.net/hyphenation/">http://offo.sourceforge.net/hyphenation/</a>.
+ /// <li><code>encoding</code> (optional): encoding of the xml hyphenation file. defaults to UTF-8.
+ /// <li><code>dictionary</code> (optional): dictionary of words. defaults to no dictionary.
+ /// <li><code>minWordSize</code> (optional): minimal word length that gets decomposed. defaults to 5.
+ /// <li><code>minSubwordSize</code> (optional): minimum length of subwords. defaults to 2.
+ /// <li><code>maxSubwordSize</code> (optional): maximum length of subwords. defaults to 15.
+ /// <li><code>onlyLongestMatch</code> (optional): if true, adds only the longest matching subword
+ /// to the stream. defaults to false.
+ /// </ul>
+ /// </para>
+ /// <para>
+ /// <pre class="prettyprint">
+ /// <fieldType name="text_hyphncomp" class="solr.TextField" positionIncrementGap="100">
+ /// <analyzer>
+ /// <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ /// <filter class="solr.HyphenationCompoundWordTokenFilterFactory" hyphenator="hyphenator.xml" encoding="UTF-8"
+ /// dictionary="dictionary.txt" minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="false"/>
+ /// </analyzer>
+ /// </fieldType></pre>
+ ///
+ /// </para>
+ /// </summary>
+ /// <seealso cref= HyphenationCompoundWordTokenFilter </seealso>
public class HyphenationCompoundWordTokenFilterFactory : TokenFilterFactory, ResourceLoaderAware
{
    private CharArraySet dictionary;         // optional word dictionary, loaded in inform()
    private HyphenationTree hyphenator;      // hyphenation patterns, loaded in inform()
    private readonly string dictFile;        // "dictionary" arg: path to the dictionary resource (may be null)
    private readonly string hypFile;         // "hyphenator" arg: path to the FOP XML pattern file (mandatory)
    private readonly string encoding;        // "encoding" arg: encoding of the pattern file (null = parser decides)
    private readonly int minWordSize;
    private readonly int minSubwordSize;
    private readonly int maxSubwordSize;
    private readonly bool onlyLongestMatch;

    /// <summary>
    /// Creates a new HyphenationCompoundWordTokenFilterFactory.
    /// Consumes all recognized entries from <paramref name="args"/> and rejects
    /// any leftovers so misspelled parameters fail fast.
    /// </summary>
    public HyphenationCompoundWordTokenFilterFactory(IDictionary<string, string> args) : base(args)
    {
        assureMatchVersion();
        dictFile = get(args, "dictionary");
        encoding = get(args, "encoding");
        hypFile = require(args, "hyphenator");
        minWordSize = getInt(args, "minWordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
        minSubwordSize = getInt(args, "minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
        maxSubwordSize = getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
        onlyLongestMatch = getBoolean(args, "onlyLongestMatch", false);
        if (args.Count > 0)
        {
            throw new System.ArgumentException("Unknown parameters: " + args);
        }
    }

    /// <summary>
    /// Loads the optional dictionary and the mandatory hyphenation pattern file
    /// from the given resource loader.
    /// </summary>
    public virtual void inform(ResourceLoader loader)
    {
        // Porting fix: the Java 'InputStream' type does not exist in .NET;
        // System.IO.Stream is its equivalent here.
        System.IO.Stream stream = null;
        try
        {
            if (dictFile != null) // the dictionary can be empty.
            {
                dictionary = getWordSet(loader, dictFile, false);
            }
            // TODO: Broken, because we cannot resolve real system id
            // ResourceLoader should also supply method like ClassLoader to get resource URL
            stream = loader.openResource(hypFile);
            InputSource @is = new InputSource(stream);
            @is.Encoding = encoding; // if it's null let xml parser decide
            @is.SystemId = hypFile;
            hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(@is);
        }
        finally
        {
            // close the pattern stream even if parsing failed
            IOUtils.closeWhileHandlingException(stream);
        }
    }

    /// <summary>
    /// Creates the filter over <paramref name="input"/> using the configuration
    /// gathered in the constructor and <see cref="inform"/>.
    /// </summary>
    public override HyphenationCompoundWordTokenFilter create(TokenStream input)
    {
        return new HyphenationCompoundWordTokenFilter(luceneMatchVersion, input, hyphenator, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/ByteVector.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/ByteVector.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/ByteVector.cs
new file mode 100644
index 0000000..963ad0d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/ByteVector.cs
@@ -0,0 +1,151 @@
+using System;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+namespace org.apache.lucene.analysis.compound.hyphenation
+{
+
+ /// <summary>
+ /// This class implements a simple byte vector with access to the underlying
+ /// array.
+ /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ /// </summary>
public class ByteVector
{
    /// <summary>
    /// Capacity increment size
    /// </summary>
    private const int DEFAULT_BLOCK_SIZE = 2048;

    // Amount by which the backing array grows when it runs out of space.
    private int blockSize;

    /// <summary>
    /// The encapsulated array
    /// </summary>
    private sbyte[] array;

    /// <summary>
    /// Points to next free item
    /// </summary>
    private int n;

    public ByteVector() : this(DEFAULT_BLOCK_SIZE)
    {
    }

    public ByteVector(int capacity)
    {
        // Porting fix: the converter emitted 'capacity_Renamed', which did not
        // match the declared parameter name and failed to compile.
        blockSize = capacity > 0 ? capacity : DEFAULT_BLOCK_SIZE;
        array = new sbyte[blockSize];
        n = 0;
    }

    public ByteVector(sbyte[] a)
    {
        blockSize = DEFAULT_BLOCK_SIZE;
        array = a;
        n = 0;
    }

    public ByteVector(sbyte[] a, int capacity)
    {
        // Porting fix: 'capacity_Renamed' -> 'capacity' (compile error).
        blockSize = capacity > 0 ? capacity : DEFAULT_BLOCK_SIZE;
        array = a;
        n = 0;
    }

    /// <summary>
    /// The underlying storage; exposed so callers can read/write ranges directly.
    /// </summary>
    public virtual sbyte[] Array
    {
        get
        {
            return array;
        }
    }

    /// <summary>
    /// return number of items in array
    /// </summary>
    public virtual int length()
    {
        return n;
    }

    /// <summary>
    /// returns current capacity of array
    /// </summary>
    public virtual int capacity()
    {
        return array.Length;
    }

    public virtual void put(int index, sbyte val)
    {
        array[index] = val;
    }

    public virtual sbyte get(int index)
    {
        return array[index];
    }

    /// <summary>
    /// This is to implement memory allocation in the array. Like malloc().
    /// Returns the start index of the reserved range.
    /// </summary>
    public virtual int alloc(int size)
    {
        int index = n;
        int len = array.Length;
        if (n + size >= len)
        {
            // Bug fix: the original grew by exactly one blockSize increment,
            // which overflows when size > blockSize. Grow until the request fits.
            int newLen = len;
            while (n + size >= newLen)
            {
                newLen += blockSize;
            }
            sbyte[] aux = new sbyte[newLen];
            System.Array.Copy(array, 0, aux, 0, len);
            array = aux;
        }
        n += size;
        return index;
    }

    /// <summary>
    /// Shrinks the backing array to exactly the number of used items.
    /// </summary>
    public virtual void trimToSize()
    {
        if (n < array.Length)
        {
            sbyte[] aux = new sbyte[n];
            System.Array.Copy(array, 0, aux, 0, n);
            array = aux;
        }
    }

}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/CharVector.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/CharVector.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/CharVector.cs
new file mode 100644
index 0000000..6868911
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/CharVector.cs
@@ -0,0 +1,163 @@
+using System;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.compound.hyphenation
+{
+
+ /// <summary>
+ /// This class implements a simple char vector with access to the underlying
+ /// array.
+ ///
+ /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ /// </summary>
public class CharVector : ICloneable
{
    /// <summary>
    /// Capacity increment size
    /// </summary>
    private const int DEFAULT_BLOCK_SIZE = 2048;

    // Amount by which the backing array grows when it runs out of space.
    private int blockSize;

    /// <summary>
    /// The encapsulated array
    /// </summary>
    private char[] array;

    /// <summary>
    /// Points to next free item
    /// </summary>
    private int n;

    public CharVector() : this(DEFAULT_BLOCK_SIZE)
    {
    }

    public CharVector(int capacity)
    {
        // Porting fix: the converter emitted 'capacity_Renamed', which did not
        // match the declared parameter name and failed to compile.
        blockSize = capacity > 0 ? capacity : DEFAULT_BLOCK_SIZE;
        array = new char[blockSize];
        n = 0;
    }

    public CharVector(char[] a)
    {
        blockSize = DEFAULT_BLOCK_SIZE;
        array = a;
        n = a.Length;
    }

    public CharVector(char[] a, int capacity)
    {
        // Porting fix: 'capacity_Renamed' -> 'capacity' (compile error).
        blockSize = capacity > 0 ? capacity : DEFAULT_BLOCK_SIZE;
        array = a;
        n = a.Length;
    }

    /// <summary>
    /// Reset Vector but don't resize or clear elements
    /// </summary>
    public virtual void clear()
    {
        n = 0;
    }

    /// <summary>
    /// Deep copy: the clone gets its own array so mutations don't leak back.
    /// Porting fix: the converter produced 'public override CharVector clone()'
    /// (no base member to override, and Array.Clone() returns object), which did
    /// not compile and did not satisfy ICloneable.
    /// </summary>
    public virtual object Clone()
    {
        CharVector cv = new CharVector((char[])array.Clone(), blockSize);
        cv.n = this.n;
        return cv;
    }

    /// <summary>
    /// The underlying storage; exposed so callers can read/write ranges directly.
    /// </summary>
    public virtual char[] Array
    {
        get
        {
            return array;
        }
    }

    /// <summary>
    /// return number of items in array
    /// </summary>
    public virtual int length()
    {
        return n;
    }

    /// <summary>
    /// returns current capacity of array
    /// </summary>
    public virtual int capacity()
    {
        return array.Length;
    }

    public virtual void put(int index, char val)
    {
        array[index] = val;
    }

    public virtual char get(int index)
    {
        return array[index];
    }

    /// <summary>
    /// Reserves <paramref name="size"/> slots and returns their start index.
    /// </summary>
    public virtual int alloc(int size)
    {
        int index = n;
        int len = array.Length;
        if (n + size >= len)
        {
            // Bug fix: the original grew by exactly one blockSize increment,
            // which overflows when size > blockSize. Grow until the request fits.
            int newLen = len;
            while (n + size >= newLen)
            {
                newLen += blockSize;
            }
            char[] aux = new char[newLen];
            System.Array.Copy(array, 0, aux, 0, len);
            array = aux;
        }
        n += size;
        return index;
    }

    /// <summary>
    /// Shrinks the backing array to exactly the number of used items.
    /// </summary>
    public virtual void trimToSize()
    {
        if (n < array.Length)
        {
            char[] aux = new char[n];
            System.Array.Copy(array, 0, aux, 0, n);
            array = aux;
        }
    }

}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphen.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphen.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphen.cs
new file mode 100644
index 0000000..819d756
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphen.cs
@@ -0,0 +1,76 @@
+using System.Text;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.compound.hyphenation
+{
+
+ /// <summary>
+ /// This class represents a hyphen. A 'full' hyphen is made of 3 parts: the
+ /// pre-break text, post-break text and no-break. If no line-break is generated
+ /// at this position, the no-break text is used, otherwise, pre-break and
+ /// post-break are used. Typically, pre-break is equal to the hyphen character
+ /// and the others are empty. However, this general scheme allows support for
+ /// cases in some languages where words change spelling if they're split across
+ /// lines, like german's 'backen' which hyphenates 'bak-ken'. BTW, this comes
+ /// from TeX.
+ ///
+ /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ /// </summary>
+
public class Hyphen
{
    public string preBreak;

    public string noBreak;

    public string postBreak;

    /// <summary>
    /// Creates a 'full' hyphen with explicit pre-break, no-break and
    /// post-break texts.
    /// </summary>
    internal Hyphen(string pre, string no, string post)
    {
        this.preBreak = pre;
        this.noBreak = no;
        this.postBreak = post;
    }

    /// <summary>
    /// Creates a simple hyphen: only the pre-break text is set.
    /// </summary>
    internal Hyphen(string pre) : this(pre, null, null)
    {
    }

    /// <summary>
    /// Renders "-" for the common simple case (pre-break "-" with no other
    /// parts), otherwise "{preBreak}{postBreak}{noBreak}".
    /// </summary>
    public override string ToString()
    {
        bool isPlainHyphen = noBreak == null && postBreak == null && preBreak != null && preBreak.Equals("-");
        if (isPlainHyphen)
        {
            return "-";
        }
        return new StringBuilder()
            .Append('{').Append(preBreak)
            .Append("}{").Append(postBreak)
            .Append("}{").Append(noBreak)
            .Append('}')
            .ToString();
    }

}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphenation.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphenation.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphenation.cs
new file mode 100644
index 0000000..ccf7387
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphenation.cs
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.compound.hyphenation
+{
+
+ /// <summary>
+ /// This class represents a hyphenated word.
+ ///
+ /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ /// </summary>
public class Hyphenation
{
    // Offsets within the word at which it may be hyphenated.
    private readonly int[] hyphenPoints;

    /// <summary>
    /// Wraps the precomputed hyphenation offsets for one word.
    /// </summary>
    internal Hyphenation(int[] points)
    {
        hyphenPoints = points;
    }

    /// <returns> the number of hyphenation points in the word </returns>
    public virtual int length()
    {
        return hyphenPoints.Length;
    }

    /// <returns> the hyphenation points </returns>
    public virtual int[] HyphenationPoints
    {
        get { return hyphenPoints; }
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
new file mode 100644
index 0000000..9bc4cc0
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
@@ -0,0 +1,533 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.compound.hyphenation
+{
+
+
+ using InputSource = org.xml.sax.InputSource;
+
+ /// <summary>
+ /// This tree structure stores the hyphenation patterns in an efficient way for
+	/// fast lookup. It provides the method to hyphenate a word.
+ ///
+ /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ /// </summary>
public class HyphenationTree : TernaryTree, PatternConsumer
{

    /// <summary>
    /// value space: stores the interletter values
    /// </summary>
    protected internal ByteVector vspace;

    /// <summary>
    /// This map stores hyphenation exceptions
    /// </summary>
    protected internal Dictionary<string, List<object>> stoplist;

    /// <summary>
    /// This map stores the character classes
    /// </summary>
    protected internal TernaryTree classmap;

    /// <summary>
    /// Temporary map to store interletter values on pattern loading.
    /// </summary>
    [NonSerialized]
    private TernaryTree ivalues;

    public HyphenationTree()
    {
        // Porting fix: the Java diamond 'new HashMap<>(23)' has no C# equivalent;
        // the type arguments must be spelled out.
        stoplist = new Dictionary<string, List<object>>(23); // usually a small table
        classmap = new TernaryTree();
        vspace = new ByteVector();
        vspace.alloc(1); // this reserves index 0, which we don't use
    }

    /// <summary>
    /// Packs the values by storing them in 4 bits, two values into a byte Values
    /// range is from 0 to 9. We use zero as terminator, so we'll add 1 to the
    /// value.
    /// </summary>
    /// <param name="values"> a string of digits from '0' to '9' representing the
    ///        interletter values. </param>
    /// <returns> the index into the vspace array where the packed values are stored. </returns>
    protected internal virtual int packValues(string values)
    {
        int i, n = values.Length;
        // one nibble per value plus a zero terminator nibble, rounded up to bytes
        int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
        int offset = vspace.alloc(m);
        sbyte[] va = vspace.Array;
        for (i = 0; i < n; i++)
        {
            int j = i >> 1;
            sbyte v = (sbyte)((values[i] - '0' + 1) & 0x0f);
            if ((i & 1) == 1)
            {
                va[j + offset] = (sbyte)(va[j + offset] | v);
            }
            else
            {
                va[j + offset] = (sbyte)(v << 4); // big endian
            }
        }
        va[m - 1 + offset] = 0; // terminator
        return offset;
    }

    /// <summary>
    /// Inverse of <see cref="packValues"/>: unpacks the nibbles at index
    /// <paramref name="k"/> back into a digit string.
    /// </summary>
    protected internal virtual string unpackValues(int k)
    {
        StringBuilder buf = new StringBuilder();
        sbyte v = vspace.get(k++);
        while (v != 0)
        {
            char c = (char)(((int)((uint)v >> 4)) - 1 + '0');
            buf.Append(c);
            c = (char)(v & 0x0f);
            if (c == 0)
            {
                break; // low nibble is the terminator
            }
            c = (char)(c - 1 + '0');
            buf.Append(c);
            v = vspace.get(k++);
        }
        return buf.ToString();
    }

    /// <summary>
    /// Read hyphenation patterns from an XML file.
    /// </summary>
    /// <param name="f"> the pattern file </param>
    /// <exception cref="IOException"> In case the parsing fails </exception>
    public virtual void loadPatterns(System.IO.FileInfo f)
    {
        // Porting fix: Java's 'File' maps to System.IO.FileInfo; build the
        // system id as an absolute URI like f.toURI().toASCIIString() did.
        InputSource src = new InputSource(new Uri(f.FullName).AbsoluteUri);
        loadPatterns(src);
    }

    /// <summary>
    /// Read hyphenation patterns from an XML file.
    /// </summary>
    /// <param name="source"> the InputSource for the file </param>
    /// <exception cref="IOException"> In case the parsing fails </exception>
    public virtual void loadPatterns(InputSource source)
    {
        PatternParser pp = new PatternParser(this);
        ivalues = new TernaryTree();

        pp.parse(source);

        // patterns/values should be now in the tree
        // let's optimize a bit
        trimToSize();
        vspace.trimToSize();
        classmap.trimToSize();

        // get rid of the auxiliary map
        ivalues = null;
    }

    /// <summary>
    /// Looks up a raw pattern and returns its interletter values as a digit
    /// string, or "" if the pattern is not stored.
    /// </summary>
    public virtual string findPattern(string pat)
    {
        int k = base.find(pat);
        if (k >= 0)
        {
            return unpackValues(k);
        }
        return "";
    }

    /// <summary>
    /// String compare, returns 0 if equal or t is a substring of s
    /// (both arrays are expected to be null-terminated).
    /// </summary>
    protected internal virtual int hstrcmp(char[] s, int si, char[] t, int ti)
    {
        for (; s[si] == t[ti]; si++, ti++)
        {
            if (s[si] == 0)
            {
                return 0;
            }
        }
        if (t[ti] == 0)
        {
            return 0;
        }
        return s[si] - t[ti];
    }

    /// <summary>
    /// Unpacks the interletter values at index <paramref name="k"/> as raw
    /// bytes (0-based, unlike <see cref="unpackValues"/> which yields digits).
    /// </summary>
    protected internal virtual sbyte[] getValues(int k)
    {
        StringBuilder buf = new StringBuilder();
        sbyte v = vspace.get(k++);
        while (v != 0)
        {
            char c = (char)(((int)((uint)v >> 4)) - 1);
            buf.Append(c);
            c = (char)(v & 0x0f);
            if (c == 0)
            {
                break; // low nibble is the terminator
            }
            c = (char)(c - 1);
            buf.Append(c);
            v = vspace.get(k++);
        }
        sbyte[] res = new sbyte[buf.Length];
        for (int i = 0; i < res.Length; i++)
        {
            res[i] = (sbyte) buf[i];
        }
        return res;
    }

    /// <summary>
    /// <para>
    /// Search for all possible partial matches of word starting at index an update
    /// interletter values. In other words, it does something like:
    /// </para>
    /// <code>
    /// for(i=0; i&lt;patterns.length; i++) {
    ///   if ( word.substring(index).startsWith(patterns[i]) )
    ///     update_interletter_values(patterns[i]);
    /// }
    /// </code>
    /// <para>
    /// But it is done in an efficient way since the patterns are stored in a
    /// ternary tree. In fact, this is the whole purpose of having the tree: doing
    /// this search without having to test every single pattern. The number of
    /// patterns for languages such as English range from 4000 to 10000. Thus,
    /// doing thousands of string comparisons for each word to hyphenate would be
    /// really slow without the tree. The tradeoff is memory, but using a ternary
    /// tree instead of a trie, almost halves the memory used by Lout or TeX.
    /// It's also faster than using a hash table
    /// </para>
    /// </summary>
    /// <param name="word"> null terminated word to match </param>
    /// <param name="index"> start index from word </param>
    /// <param name="il"> interletter values array to update </param>
    protected internal virtual void searchPatterns(char[] word, int index, sbyte[] il)
    {
        sbyte[] values;
        int i = index;
        char p, q;
        char sp = word[i];
        p = root;

        while (p > 0 && p < sc.Length)
        {
            if (sc[p] == 0xFFFF)
            {
                if (hstrcmp(word, i, kv.Array, lo[p]) == 0)
                {
                    values = getValues(eq[p]); // data pointer is in eq[]
                    int j = index;
                    for (int k = 0; k < values.Length; k++)
                    {
                        if (j < il.Length && values[k] > il[j])
                        {
                            il[j] = values[k];
                        }
                        j++;
                    }
                }
                return;
            }
            int d = sp - sc[p];
            if (d == 0)
            {
                if (sp == 0)
                {
                    break;
                }
                sp = word[++i];
                p = eq[p];
                q = p;

                // look for a pattern ending at this position by searching for
                // the null char ( splitchar == 0 )
                while (q > 0 && q < sc.Length)
                {
                    if (sc[q] == 0xFFFF) // stop at compressed branch
                    {
                        break;
                    }
                    if (sc[q] == 0)
                    {
                        values = getValues(eq[q]);
                        int j = index;
                        for (int k = 0; k < values.Length; k++)
                        {
                            if (j < il.Length && values[k] > il[j])
                            {
                                il[j] = values[k];
                            }
                            j++;
                        }
                        break;
                    }
                    else
                    {
                        q = lo[q];

                        // actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q];
                        // but Java chars are unsigned
                    }
                }
            }
            else
            {
                p = d < 0 ? lo[p] : hi[p];
            }
        }
    }

    /// <summary>
    /// Hyphenate word and return a Hyphenation object.
    /// </summary>
    /// <param name="word"> the word to be hyphenated </param>
    /// <param name="remainCharCount"> Minimum number of characters allowed before the
    ///        hyphenation point. </param>
    /// <param name="pushCharCount"> Minimum number of characters allowed after the
    ///        hyphenation point. </param>
    /// <returns> a <seealso cref="Hyphenation Hyphenation"/> object representing the
    ///         hyphenated word or null if word is not hyphenated. </returns>
    public virtual Hyphenation hyphenate(string word, int remainCharCount, int pushCharCount)
    {
        char[] w = word.ToCharArray();
        return hyphenate(w, 0, w.Length, remainCharCount, pushCharCount);
    }

    /// <summary>
    /// w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n
    /// may be absent, the first n is at offset, the first l is at offset +
    /// iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied
    /// into word. In the first part of the routine len = w.length, in the second
    /// part of the routine len = word.length. Three indices are used: index(w),
    /// the index in w, index(word), the index in word, letterindex(word), the
    /// index in the letter part of word. The following relations exist: index(w) =
    /// offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) =
    /// index(word) - 1 (see first loop). It follows that: index(w) - index(word) =
    /// offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset +
    /// iIgnoreAtBeginning
    /// </summary>

    /// <summary>
    /// Hyphenate word and return an array of hyphenation points.
    /// </summary>
    /// <param name="w"> char array that contains the word </param>
    /// <param name="offset"> Offset to first character in word </param>
    /// <param name="len"> Length of word </param>
    /// <param name="remainCharCount"> Minimum number of characters allowed before the
    ///        hyphenation point. </param>
    /// <param name="pushCharCount"> Minimum number of characters allowed after the
    ///        hyphenation point. </param>
    /// <returns> a <seealso cref="Hyphenation Hyphenation"/> object representing the
    ///         hyphenated word or null if word is not hyphenated. </returns>
    public virtual Hyphenation hyphenate(char[] w, int offset, int len, int remainCharCount, int pushCharCount)
    {
        int i;
        char[] word = new char[len + 3];

        // normalize word
        char[] c = new char[2];
        int iIgnoreAtBeginning = 0;
        int iLength = len;
        bool bEndOfLetters = false;
        for (i = 1; i <= len; i++)
        {
            c[0] = w[offset + i - 1];
            int nc = classmap.find(c, 0);
            if (nc < 0) // found a non-letter character ...
            {
                if (i == (1 + iIgnoreAtBeginning))
                {
                    // ... before any letter character
                    iIgnoreAtBeginning++;
                }
                else
                {
                    // ... after a letter character
                    bEndOfLetters = true;
                }
                iLength--;
            }
            else
            {
                if (!bEndOfLetters)
                {
                    word[i - iIgnoreAtBeginning] = (char) nc;
                }
                else
                {
                    // letters after a trailing non-letter: word is not hyphenatable
                    return null;
                }
            }
        }
        len = iLength;
        if (len < (remainCharCount + pushCharCount))
        {
            // word is too short to be hyphenated
            return null;
        }
        int[] result = new int[len + 1];
        int k = 0;

        // check exception list first
        string sw = new string(word, 1, len);
        if (stoplist.ContainsKey(sw))
        {
            // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no =
            // null)
            List<object> hw = stoplist[sw];
            int j = 0;
            for (i = 0; i < hw.Count; i++)
            {
                object o = hw[i];
                // j = index(sw) = letterindex(word)?
                // result[k] = corresponding index(w)
                if (o is string)
                {
                    j += ((string) o).Length;
                    if (j >= remainCharCount && j < (len - pushCharCount))
                    {
                        result[k++] = j + iIgnoreAtBeginning;
                    }
                }
            }
        }
        else
        {
            // use algorithm to get hyphenation points
            word[0] = '.'; // word start marker
            word[len + 1] = '.'; // word end marker
            word[len + 2] = (char)0; // null terminated
            sbyte[] il = new sbyte[len + 3]; // initialized to zero
            for (i = 0; i < len + 1; i++)
            {
                searchPatterns(word, i, il);
            }

            // hyphenation points are located where interletter value is odd
            // i is letterindex(word),
            // i + 1 is index(word),
            // result[k] = corresponding index(w)
            for (i = 0; i < len; i++)
            {
                if (((il[i + 1] & 1) == 1) && i >= remainCharCount && i <= (len - pushCharCount))
                {
                    result[k++] = i + iIgnoreAtBeginning;
                }
            }
        }

        if (k > 0)
        {
            // trim result array
            int[] res = new int[k + 2];
            Array.Copy(result, 0, res, 1, k);
            // We add the synthetical hyphenation points
            // at the beginning and end of the word
            res[0] = 0;
            res[k + 1] = len;
            return new Hyphenation(res);
        }
        else
        {
            return null;
        }
    }

    /// <summary>
    /// Add a character class to the tree. It is used by
    /// <seealso cref="PatternParser PatternParser"/> as callback to add character classes.
    /// Character classes define the valid word characters for hyphenation. If a
    /// word contains a character not defined in any of the classes, it is not
    /// hyphenated. It also defines a way to normalize the characters in order to
    /// compare them with the stored patterns. Usually pattern files use only lower
    /// case characters, in this case a class for letter 'a', for example, should
    /// be defined as "aA", the first character being the normalization char.
    /// </summary>
    public virtual void addClass(string chargroup)
    {
        if (chargroup.Length > 0)
        {
            char equivChar = chargroup[0];
            char[] key = new char[2];
            key[1] = (char)0;
            for (int i = 0; i < chargroup.Length; i++)
            {
                key[0] = chargroup[i];
                classmap.insert(key, 0, equivChar);
            }
        }
    }

    /// <summary>
    /// Add an exception to the tree. It is used by
    /// <seealso cref="PatternParser PatternParser"/> class as callback to store the
    /// hyphenation exceptions.
    /// </summary>
    /// <param name="word"> normalized word </param>
    /// <param name="hyphenatedword"> a vector of alternating strings and
    ///        <seealso cref="Hyphen hyphen"/> objects. </param>
    public virtual void addException(string word, List<object> hyphenatedword)
    {
        stoplist[word] = hyphenatedword;
    }

    /// <summary>
    /// Add a pattern to the tree. Mainly, to be used by
    /// <seealso cref="PatternParser PatternParser"/> class as callback to add a pattern to
    /// the tree.
    /// </summary>
    /// <param name="pattern"> the hyphenation pattern </param>
    /// <param name="ivalue"> interletter weight values indicating the desirability and
    ///        priority of hyphenating at a given point within the pattern. It
    ///        should contain only digit characters. (i.e. '0' to '9'). </param>
    public virtual void addPattern(string pattern, string ivalue)
    {
        int k = ivalues.find(ivalue);
        if (k <= 0)
        {
            // pack the value string once and remember its offset for reuse
            k = packValues(ivalue);
            ivalues.insert(ivalue, (char) k);
        }
        insert(pattern, (char) k);
    }

    /// <summary>
    /// Prints tree statistics to the given writer.
    /// Porting fix: Java's PrintStream maps to System.IO.TextWriter in .NET.
    /// </summary>
    public override void printStats(System.IO.TextWriter @out)
    {
        @out.WriteLine("Value space size = " + Convert.ToString(vspace.length()));
        base.printStats(@out);
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
new file mode 100644
index 0000000..5b3fc39
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
@@ -0,0 +1,57 @@
+using System.Collections.Generic;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.compound.hyphenation
+{
+
/// <summary>
/// This interface is used to connect the XML pattern file parser to the
/// hyphenation tree.
///
/// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
/// </summary>
public interface PatternConsumer
{

    /// <summary>
    /// Add a character class. A character class defines characters that are
    /// considered equivalent for the purpose of hyphenation (e.g. "aA"). It
    /// usually means to ignore case.
    /// </summary>
    /// <param name="chargroup"> character group </param>
    void addClass(string chargroup);

    /// <summary>
    /// Add a hyphenation exception. An exception replaces the result obtained by
    /// the algorithm for cases for which this fails or the user wants to provide
    /// his own hyphenation. A hyphenatedword is a vector of alternating String's
    /// and <seealso cref="Hyphen Hyphen"/> instances
    /// </summary>
    /// <param name="word"> the (normalized) word the exception applies to </param>
    /// <param name="hyphenatedword"> alternating strings and Hyphen objects
    /// describing how the word must be hyphenated </param>
    void addException(string word, List<object> hyphenatedword);

    /// <summary>
    /// Add hyphenation patterns.
    /// </summary>
    /// <param name="pattern"> the pattern </param>
    /// <param name="values"> interletter values expressed as a string of digit characters. </param>
    void addPattern(string pattern, string values);

}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
new file mode 100644
index 0000000..50d3eb8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
@@ -0,0 +1,463 @@
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Text;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.compound.hyphenation
+{
+
+ // SAX
+ using XMLReader = org.xml.sax.XMLReader;
+ using InputSource = org.xml.sax.InputSource;
+ using SAXException = org.xml.sax.SAXException;
+ using SAXParseException = org.xml.sax.SAXParseException;
+ using DefaultHandler = org.xml.sax.helpers.DefaultHandler;
+ using Attributes = org.xml.sax.Attributes;
+
+ // Java
+
+ /// <summary>
+ /// A SAX document handler to read and parse hyphenation patterns from a XML
+ /// file.
+ ///
+ /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ /// </summary>
+ public class PatternParser : DefaultHandler
+ {
+
// SAX reader that drives the parse; this instance is registered as its handler.
internal XMLReader parser;

// Element currently being processed: one of the ELEM_* constants, or 0.
internal int currElement;

// Receives the parsed classes, exceptions and patterns; may be null until set.
internal PatternConsumer consumer;

// Accumulates character data for the token currently being read.
internal StringBuilder token;

// Exception entry under construction (alternating strings and Hyphen objects).
internal List<object> exception;

// Character marking hyphenation points in the pattern file ('-' by default).
internal char hyphenChar;

// NOTE(review): never written in the visible portion of this file -- presumably
// filled by SAX error callbacks elsewhere; confirm before removing.
internal string errMsg;

// States for currElement, matching the XML element being processed.
internal const int ELEM_CLASSES = 1;

internal const int ELEM_EXCEPTIONS = 2;

internal const int ELEM_PATTERNS = 3;

internal const int ELEM_HYPHEN = 4;
+
/// <summary>
/// Creates a parser with no consumer attached. Builds the SAX reader and
/// registers this instance as its content, error and entity handler.
/// </summary>
public PatternParser()
{
    token = new StringBuilder();
    parser = createParser();
    parser.ContentHandler = this;
    parser.ErrorHandler = this;
    parser.EntityResolver = this;
    hyphenChar = '-'; // default
}
+
/// <summary>
/// Creates a parser that reports parsed classes, exceptions and patterns to
/// the given consumer.
/// </summary>
/// <param name="consumer"> callback receiving the parse results </param>
public PatternParser(PatternConsumer consumer) : this()
{
    this.consumer = consumer;
}
+
/// <summary>
/// Sets the consumer that receives the parsed classes, exceptions and patterns.
/// </summary>
public virtual PatternConsumer Consumer
{
    set
    {
        this.consumer = value;
    }
}
+
/// <summary>
/// Parses a hyphenation pattern file.
/// </summary>
/// <param name="filename"> the filename (used directly as the SAX system id) </param>
/// <exception cref="IOException"> In case of an exception while parsing </exception>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void parse(String filename) throws java.io.IOException
public virtual void parse(string filename)
{
    // Delegates to the InputSource overload.
    parse(new InputSource(filename));
}
+
/// <summary>
/// Parses a hyphenation pattern file.
/// </summary>
/// <param name="file"> the pattern file </param>
/// <exception cref="IOException"> In case of an exception while parsing </exception>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void parse(java.io.File file) throws java.io.IOException
// NOTE(review): 'File' here is Java's java.io.File; it does not exist in .NET.
// This overload (and file.toURI().toASCIIString()) needs porting to a path
// string or FileInfo -- confirm intended replacement.
public virtual void parse(File file)
{
    // Use the file's URI as the SAX system id.
    InputSource src = new InputSource(file.toURI().toASCIIString());
    parse(src);
}
+
/// <summary>
/// Parses a hyphenation pattern file.
/// </summary>
/// <param name="source"> the InputSource for the file </param>
/// <exception cref="IOException"> In case of an exception while parsing </exception>
public virtual void parse(InputSource source)
{
    try
    {
        parser.parse(source);
    }
    catch (SAXException e)
    {
        // BUGFIX: .NET's IOException has no (Exception) constructor (that was a
        // Java idiom); pass the message explicitly and keep the SAX failure as
        // the InnerException so the root cause is not lost.
        throw new IOException(e.Message, e);
    }
}
+
/// <summary>
/// Creates a SAX parser using JAXP.
/// </summary>
/// <returns> the created SAX parser </returns>
// NOTE(review): SAXParserFactory is a Java API; this raw port needs an
// equivalent built on System.Xml -- confirm intended replacement.
internal static XMLReader createParser()
{
    try
    {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.NamespaceAware = true;
        return factory.newSAXParser().XMLReader;
    }
    catch (Exception e)
    {
        // BUGFIX: keep the original failure as InnerException instead of
        // discarding it; the message text is unchanged.
        throw new Exception("Couldn't create XMLReader: " + e.Message, e);
    }
}
+
/// <summary>
/// Extracts the next whitespace-delimited token from <paramref name="chars"/>.
/// Consumed characters are removed from the buffer in place; a partially read
/// token is carried over in the instance field <c>token</c> between calls.
/// </summary>
/// <param name="chars"> mutable buffer of pending character data </param>
/// <returns> the next complete token, or null when the buffer was exhausted
/// before a whitespace delimiter was seen </returns>
protected internal virtual string readToken(StringBuilder chars)
{
    string word;
    bool space = false;
    int i;
    // Scan over any leading whitespace.
    for (i = 0; i < chars.Length; i++)
    {
        if (char.IsWhiteSpace(chars[i]))
        {
            space = true;
        }
        else
        {
            break;
        }
    }
    if (space)
    {
        // chars.delete(0,i); -- shift the buffer left by i, dropping the whitespace.
        for (int countr = i; countr < chars.Length; countr++)
        {
            chars[countr - i] = chars[countr];
        }
        chars.Length = chars.Length - i;
        // Whitespace terminates any token carried over from a previous call.
        if (token.Length > 0)
        {
            word = token.ToString();
            token.Length = 0;
            return word;
        }
    }
    space = false;
    // Find the end of the current token (index of the next whitespace, if any).
    for (i = 0; i < chars.Length; i++)
    {
        if (char.IsWhiteSpace(chars[i]))
        {
            space = true;
            break;
        }
    }
    // Accumulate the token characters, then remove them from the buffer.
    token.Append(chars.ToString().Substring(0, i));
    // chars.delete(0,i);
    for (int countr = i; countr < chars.Length; countr++)
    {
        chars[countr - i] = chars[countr];
    }
    chars.Length = chars.Length - i;
    if (space)
    {
        word = token.ToString();
        token.Length = 0;
        return word;
    }
    // No delimiter seen: the buffer is now empty and the partial token stays
    // in 'token' for the next call. (This Append of the emptied buffer is a
    // no-op kept from the raw port.)
    token.Append(chars);
    return null;
}
+
/// <summary>
/// Strips the interletter digit values out of a raw pattern, leaving only the
/// letter (non-digit) characters.
/// </summary>
/// <param name="word"> raw pattern mixing letters and digit weights </param>
/// <returns> the pattern's letters with all digits removed </returns>
protected internal static string getPattern(string word)
{
    StringBuilder letters = new StringBuilder(word.Length);
    foreach (char c in word)
    {
        if (!char.IsDigit(c))
        {
            letters.Append(c);
        }
    }
    return letters.ToString();
}
+
/// <summary>
/// Normalizes an exception entry read from the pattern file: every string in
/// the entry is split at <c>hyphenChar</c>, and each hyphen position becomes a
/// <seealso cref="Hyphen"/> object, so the result alternates plain strings and
/// Hyphen instances.
/// </summary>
/// <param name="ex"> the raw exception entry (strings and Hyphen objects) </param>
/// <returns> the normalized entry </returns>
// BUGFIX: the raw port declared the parameter as List<T1> without declaring the
// type parameter on the method, which does not compile; declare <T1> here, as
// the sibling getExceptionWord<T1> already does.
protected internal virtual List<object> normalizeException<T1>(List<T1> ex)
{
    List<object> res = new List<object>();
    for (int i = 0; i < ex.Count; i++)
    {
        object item = ex[i];
        if (item is string)
        {
            string str = (string) item;
            StringBuilder buf = new StringBuilder();
            for (int j = 0; j < str.Length; j++)
            {
                char c = str[j];
                if (c != hyphenChar)
                {
                    buf.Append(c);
                }
                else
                {
                    // Flush the letters collected so far, then record the break.
                    res.Add(buf.ToString());
                    buf.Length = 0;
                    char[] h = new char[1];
                    h[0] = hyphenChar;
                    // we use here hyphenChar which is not necessarily
                    // the one to be printed
                    res.Add(new Hyphen(new string(h), null, null));
                }
            }
            if (buf.Length > 0)
            {
                res.Add(buf.ToString());
            }
        }
        else
        {
            // Already a Hyphen object: keep it as-is.
            res.Add(item);
        }
    }
    return res;
}
+
/// <summary>
/// Reconstructs the plain word of an exception entry: string parts are taken
/// verbatim, and for Hyphen parts only their no-break text (if any) contributes.
/// </summary>
/// <param name="ex"> exception entry of alternating strings and Hyphen objects </param>
/// <returns> the word without hyphenation markers </returns>
protected internal virtual string getExceptionWord<T1>(List<T1> ex)
{
    StringBuilder word = new StringBuilder();
    foreach (object part in ex)
    {
        if (part is string)
        {
            word.Append((string) part);
        }
        else if (((Hyphen) part).noBreak != null)
        {
            word.Append(((Hyphen) part).noBreak);
        }
    }
    return word.ToString();
}
+
/// <summary>
/// Expands a raw pattern into its interletter value string: one digit per
/// letter position, with '0' inserted wherever the pattern gives no explicit
/// weight. A dummy letter is appended as sentinel so a trailing digit counts.
/// </summary>
/// <param name="pat"> raw pattern mixing letters and digit weights </param>
/// <returns> the digit string of interletter values </returns>
protected internal static string getInterletterValues(string pat)
{
    string augmented = pat + "a"; // add dummy letter to serve as sentinel
    StringBuilder values = new StringBuilder();
    int pos = 0;
    while (pos < augmented.Length)
    {
        char c = augmented[pos];
        if (char.IsDigit(c))
        {
            // An explicit weight: emit it and skip the letter it precedes.
            values.Append(c);
            pos += 2;
        }
        else
        {
            // No weight before this letter: default to '0'.
            values.Append('0');
            pos++;
        }
    }
    return values.ToString();
}
+
//
// EntityResolver methods
//

/// <summary>
/// Resolves references to the hyphenation DTD to the copy bundled with this
/// library, so validating a pattern file needs no network access.
/// </summary>
// NOTE(review): String.matches and GetType().getResource are Java APIs with no
// .NET equivalent; this raw port needs Regex.IsMatch and an embedded-resource
// lookup instead -- confirm when porting.
public override InputSource resolveEntity(string publicId, string systemId)
{
    // supply the internal hyphenation.dtd if possible
    if ((systemId != null && systemId.matches("(?i).*\\bhyphenation.dtd\\b.*")) || ("hyphenation-info".Equals(publicId)))
    {
        // System.out.println(this.getClass().getResource("hyphenation.dtd").toExternalForm());
        return new InputSource(this.GetType().getResource("hyphenation.dtd").toExternalForm());
    }
    return null;
}
+
+ //
+ // ContentHandler methods
+ //
+
/// <summary>
/// SAX callback for an opening tag: records which section of the pattern file
/// we are in, captures the configured hyphen character, and starts or extends
/// an exception entry.
/// </summary>
/// <seealso cref= org.xml.sax.ContentHandler#startElement(java.lang.String,
/// java.lang.String, java.lang.String, org.xml.sax.Attributes) </seealso>
public override void startElement(string uri, string local, string raw, Attributes attrs)
{
    if (local.Equals("hyphen-char"))
    {
        string h = attrs.getValue("value");
        if (h != null && h.Length == 1)
        {
            hyphenChar = h[0];
        }
    }
    else if (local.Equals("classes"))
    {
        currElement = ELEM_CLASSES;
    }
    else if (local.Equals("patterns"))
    {
        currElement = ELEM_PATTERNS;
    }
    else if (local.Equals("exceptions"))
    {
        currElement = ELEM_EXCEPTIONS;
        // BUGFIX: "new List<>()" is Java diamond syntax and does not compile in
        // C#; the element type must be written out.
        exception = new List<object>();
    }
    else if (local.Equals("hyphen"))
    {
        // Text collected before the <hyphen> tag belongs to the current entry.
        if (token.Length > 0)
        {
            exception.Add(token.ToString());
        }
        exception.Add(new Hyphen(attrs.getValue("pre"), attrs.getValue("no"), attrs.getValue("post")));
        currElement = ELEM_HYPHEN;
    }
    token.Length = 0;
}
+
/// <summary>
/// SAX callback for a closing tag: flushes any pending token to the consumer
/// according to the section being closed, then updates the parser state.
/// </summary>
/// <seealso cref= org.xml.sax.ContentHandler#endElement(java.lang.String,
/// java.lang.String, java.lang.String) </seealso>
public override void endElement(string uri, string local, string raw)
{
    if (token.Length > 0)
    {
        string word = token.ToString();
        switch (currElement)
        {
            case ELEM_CLASSES:
                consumer.addClass(word);
                break;
            case ELEM_EXCEPTIONS:
                exception.Add(word);
                exception = normalizeException(exception);
                // BUGFIX: the raw port cast to ArrayList and called the
                // Java-only clone(); take a defensive List<object> copy, which
                // also matches PatternConsumer.addException's signature.
                consumer.addException(getExceptionWord(exception), new List<object>(exception));
                break;
            case ELEM_PATTERNS:
                consumer.addPattern(getPattern(word), getInterletterValues(word));
                break;
            case ELEM_HYPHEN:
                // nothing to do
                break;
        }
        if (currElement != ELEM_HYPHEN)
        {
            token.Length = 0;
        }
    }
    if (currElement == ELEM_HYPHEN)
    {
        // A <hyphen> is nested inside <exceptions>; resume collecting the entry.
        currElement = ELEM_EXCEPTIONS;
    }
    else
    {
        currElement = 0;
    }
}
+
/// <summary>
/// SAX callback for character data: splits the incoming text into
/// whitespace-delimited tokens (via <c>readToken</c>) and forwards each
/// complete token to the consumer according to the current section.
/// </summary>
/// <seealso cref= org.xml.sax.ContentHandler#characters(char[], int, int) </seealso>
public override void characters(char[] ch, int start, int length)
{
    StringBuilder chars = new StringBuilder(length);
    chars.Append(ch, start, length);
    // readToken returns one token at a time, or null when only an incomplete
    // trailing token remains (it is carried over to the next callback).
    string word = readToken(chars);
    while (word != null)
    {
        switch (currElement)
        {
            case ELEM_CLASSES:
                consumer.addClass(word);
                break;
            case ELEM_EXCEPTIONS:
                exception.Add(word);
                exception = normalizeException(exception);
                // BUGFIX: replace the Java-only "(ArrayList) exception.clone()"
                // with a List<object> copy matching the consumer's signature.
                consumer.addException(getExceptionWord(exception), new List<object>(exception));
                exception.Clear();
                break;
            case ELEM_PATTERNS:
                consumer.addPattern(getPattern(word), getInterletterValues(word));
                break;
        }
        word = readToken(chars);
    }
}
+
/// <summary>
/// Builds a compact "file:line:column" description of where a SAX parse
/// error occurred, using only the last path segment of the system id.
/// </summary>
/// <param name="ex"> the parse exception carrying the location </param>
/// <returns> the formatted location string </returns>
private string getLocationString(SAXParseException ex)
{
    StringBuilder location = new StringBuilder();

    string systemId = ex.SystemId;
    if (systemId != null)
    {
        int slash = systemId.LastIndexOf('/');
        location.Append(slash == -1 ? systemId : systemId.Substring(slash + 1));
    }
    location.Append(':').Append(ex.LineNumber).Append(':').Append(ex.ColumnNumber);

    return location.ToString();
}
+ }
+
+}
\ No newline at end of file