You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2016/09/01 14:39:40 UTC
[19/52] [abbrv] lucenenet git commit: Ported Analysis.Compound
namespace + tests
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87c1d606/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
index d3fa779..33bc310 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
@@ -1,528 +1,580 @@
-\ufeff/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-using System;
+\ufeffusing System;
using System.Collections.Generic;
+using System.IO;
using System.Text;
+using System.Xml;
namespace Lucene.Net.Analysis.Compound.Hyphenation
{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
/// <summary>
/// This tree structure stores the hyphenation patterns in an efficient way for
/// fast lookup. It provides the method to hyphenate a word.
///
/// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). It has been slightly modified.
/// </summary>
- public class HyphenationTree : TernaryTree, PatternConsumer
- {
-
- /// <summary>
- /// value space: stores the interletter values
- /// </summary>
- protected internal ByteVector vspace;
-
- /// <summary>
- /// This map stores hyphenation exceptions
- /// </summary>
- protected internal Dictionary<string, List<object>> stoplist;
-
- /// <summary>
- /// This map stores the character classes
- /// </summary>
- protected internal TernaryTree classmap;
-
- /// <summary>
- /// Temporary map to store interletter values on pattern loading.
- /// </summary>
- [NonSerialized]
- private TernaryTree ivalues;
-
- public HyphenationTree()
- {
- stoplist = new Dictionary<>(23); // usually a small table
- classmap = new TernaryTree();
- vspace = new ByteVector();
- vspace.alloc(1); // this reserves index 0, which we don't use
- }
-
- /// <summary>
- /// Packs the values by storing them in 4 bits, two values into a byte Values
- /// range is from 0 to 9. We use zero as terminator, so we'll add 1 to the
- /// value.
- /// </summary>
- /// <param name="values"> a string of digits from '0' to '9' representing the
- /// interletter values. </param>
- /// <returns> the index into the vspace array where the packed values are stored. </returns>
- protected internal virtual int packValues(string values)
- {
- int i , n = values.Length;
- int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
- int offset = vspace.alloc(m);
- sbyte[] va = vspace.Array;
- for (i = 0; i < n; i++)
- {
- int j = i >> 1;
- sbyte v = (sbyte)((values[i] - '0' + 1) & 0x0f);
- if ((i & 1) == 1)
- {
- va[j + offset] = (sbyte)(va[j + offset] | v);
- }
- else
- {
- va[j + offset] = (sbyte)(v << 4); // big endian
- }
- }
- va[m - 1 + offset] = 0; // terminator
- return offset;
- }
-
- protected internal virtual string unpackValues(int k)
- {
- StringBuilder buf = new StringBuilder();
- sbyte v = vspace.get(k++);
- while (v != 0)
- {
- char c = (char)(((int)((uint)v >> 4)) - 1 + '0');
- buf.Append(c);
- c = (char)(v & 0x0f);
- if (c == 0)
- {
- break;
- }
- c = (char)(c - 1 + '0');
- buf.Append(c);
- v = vspace.get(k++);
- }
- return buf.ToString();
- }
-
- /// <summary>
- /// Read hyphenation patterns from an XML file.
- /// </summary>
- /// <param name="f"> the filename </param>
- /// <exception cref="IOException"> In case the parsing fails </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void loadPatterns(java.io.File f) throws java.io.IOException
- public virtual void loadPatterns(File f)
- {
- InputSource src = new InputSource(f.toURI().toASCIIString());
- loadPatterns(src);
- }
-
- /// <summary>
- /// Read hyphenation patterns from an XML file.
- /// </summary>
- /// <param name="source"> the InputSource for the file </param>
- /// <exception cref="IOException"> In case the parsing fails </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void loadPatterns(org.xml.sax.InputSource source) throws java.io.IOException
- public virtual void loadPatterns(InputSource source)
- {
- PatternParser pp = new PatternParser(this);
- ivalues = new TernaryTree();
-
- pp.parse(source);
-
- // patterns/values should be now in the tree
- // let's optimize a bit
- trimToSize();
- vspace.trimToSize();
- classmap.trimToSize();
-
- // get rid of the auxiliary map
- ivalues = null;
- }
-
- public virtual string findPattern(string pat)
- {
- int k = base.find(pat);
- if (k >= 0)
- {
- return unpackValues(k);
- }
- return "";
- }
-
- /// <summary>
- /// String compare, returns 0 if equal or t is a substring of s
- /// </summary>
- protected internal virtual int hstrcmp(char[] s, int si, char[] t, int ti)
- {
- for (; s[si] == t[ti]; si++, ti++)
- {
- if (s[si] == 0)
- {
- return 0;
- }
- }
- if (t[ti] == 0)
- {
- return 0;
- }
- return s[si] - t[ti];
- }
-
- protected internal virtual sbyte[] getValues(int k)
- {
- StringBuilder buf = new StringBuilder();
- sbyte v = vspace.get(k++);
- while (v != 0)
- {
- char c = (char)(((int)((uint)v >> 4)) - 1);
- buf.Append(c);
- c = (char)(v & 0x0f);
- if (c == 0)
- {
- break;
- }
- c = (char)(c - 1);
- buf.Append(c);
- v = vspace.get(k++);
- }
- sbyte[] res = new sbyte[buf.Length];
- for (int i = 0; i < res.Length; i++)
- {
- res[i] = (sbyte) buf[i];
- }
- return res;
- }
-
- /// <summary>
- /// <para>
- /// Search for all possible partial matches of word starting at index an update
- /// interletter values. In other words, it does something like:
- /// </para>
- /// <code>
- /// for(i=0; i<patterns.length; i++) {
- /// if ( word.substring(index).startsWidth(patterns[i]) )
- /// update_interletter_values(patterns[i]);
- /// }
- /// </code>
- /// <para>
- /// But it is done in an efficient way since the patterns are stored in a
- /// ternary tree. In fact, this is the whole purpose of having the tree: doing
- /// this search without having to test every single pattern. The number of
- /// patterns for languages such as English range from 4000 to 10000. Thus,
- /// doing thousands of string comparisons for each word to hyphenate would be
- /// really slow without the tree. The tradeoff is memory, but using a ternary
- /// tree instead of a trie, almost halves the the memory used by Lout or TeX.
- /// It's also faster than using a hash table
- /// </para>
- /// </summary>
- /// <param name="word"> null terminated word to match </param>
- /// <param name="index"> start index from word </param>
- /// <param name="il"> interletter values array to update </param>
- protected internal virtual void searchPatterns(char[] word, int index, sbyte[] il)
- {
- sbyte[] values;
- int i = index;
- char p, q;
- char sp = word[i];
- p = root;
-
- while (p > 0 && p < sc.Length)
- {
- if (sc[p] == 0xFFFF)
- {
- if (hstrcmp(word, i, kv.Array, lo[p]) == 0)
- {
- values = getValues(eq[p]); // data pointer is in eq[]
- int j = index;
- for (int k = 0; k < values.Length; k++)
- {
- if (j < il.Length && values[k] > il[j])
- {
- il[j] = values[k];
- }
- j++;
- }
- }
- return;
- }
- int d = sp - sc[p];
- if (d == 0)
- {
- if (sp == 0)
- {
- break;
- }
- sp = word[++i];
- p = eq[p];
- q = p;
-
- // look for a pattern ending at this position by searching for
- // the null char ( splitchar == 0 )
- while (q > 0 && q < sc.Length)
- {
- if (sc[q] == 0xFFFF) // stop at compressed branch
- {
- break;
- }
- if (sc[q] == 0)
- {
- values = getValues(eq[q]);
- int j = index;
- for (int k = 0; k < values.Length; k++)
- {
- if (j < il.Length && values[k] > il[j])
- {
- il[j] = values[k];
- }
- j++;
- }
- break;
- }
- else
- {
- q = lo[q];
-
- /// <summary>
- /// actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but
- /// java chars are unsigned
- /// </summary>
- }
- }
- }
- else
- {
- p = d < 0 ? lo[p] : hi[p];
- }
- }
- }
-
- /// <summary>
- /// Hyphenate word and return a Hyphenation object.
- /// </summary>
- /// <param name="word"> the word to be hyphenated </param>
- /// <param name="remainCharCount"> Minimum number of characters allowed before the
- /// hyphenation point. </param>
- /// <param name="pushCharCount"> Minimum number of characters allowed after the
- /// hyphenation point. </param>
- /// <returns> a <seealso cref="Hyphenation Hyphenation"/> object representing the
- /// hyphenated word or null if word is not hyphenated. </returns>
- public virtual Hyphenation hyphenate(string word, int remainCharCount, int pushCharCount)
- {
- char[] w = word.ToCharArray();
- return hyphenate(w, 0, w.Length, remainCharCount, pushCharCount);
- }
-
- /// <summary>
- /// w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n
- /// may be absent, the first n is at offset, the first l is at offset +
- /// iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied
- /// into word. In the first part of the routine len = w.length, in the second
- /// part of the routine len = word.length. Three indices are used: index(w),
- /// the index in w, index(word), the index in word, letterindex(word), the
- /// index in the letter part of word. The following relations exist: index(w) =
- /// offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) =
- /// index(word) - 1 (see first loop). It follows that: index(w) - index(word) =
- /// offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset +
- /// iIgnoreAtBeginning
- /// </summary>
-
- /// <summary>
- /// Hyphenate word and return an array of hyphenation points.
- /// </summary>
- /// <param name="w"> char array that contains the word </param>
- /// <param name="offset"> Offset to first character in word </param>
- /// <param name="len"> Length of word </param>
- /// <param name="remainCharCount"> Minimum number of characters allowed before the
- /// hyphenation point. </param>
- /// <param name="pushCharCount"> Minimum number of characters allowed after the
- /// hyphenation point. </param>
- /// <returns> a <seealso cref="Hyphenation Hyphenation"/> object representing the
- /// hyphenated word or null if word is not hyphenated. </returns>
- public virtual Hyphenation hyphenate(char[] w, int offset, int len, int remainCharCount, int pushCharCount)
- {
- int i;
- char[] word = new char[len + 3];
-
- // normalize word
- char[] c = new char[2];
- int iIgnoreAtBeginning = 0;
- int iLength = len;
- bool bEndOfLetters = false;
- for (i = 1; i <= len; i++)
- {
- c[0] = w[offset + i - 1];
- int nc = classmap.find(c, 0);
- if (nc < 0) // found a non-letter character ...
- {
- if (i == (1 + iIgnoreAtBeginning))
- {
- // ... before any letter character
- iIgnoreAtBeginning++;
- }
- else
- {
- // ... after a letter character
- bEndOfLetters = true;
- }
- iLength--;
- }
- else
- {
- if (!bEndOfLetters)
- {
- word[i - iIgnoreAtBeginning] = (char) nc;
- }
- else
- {
- return null;
- }
- }
- }
- len = iLength;
- if (len < (remainCharCount + pushCharCount))
- {
- // word is too short to be hyphenated
- return null;
- }
- int[] result = new int[len + 1];
- int k = 0;
-
- // check exception list first
- string sw = new string(word, 1, len);
- if (stoplist.ContainsKey(sw))
- {
- // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no =
- // null)
- List<object> hw = stoplist[sw];
- int j = 0;
- for (i = 0; i < hw.Count; i++)
- {
- object o = hw[i];
- // j = index(sw) = letterindex(word)?
- // result[k] = corresponding index(w)
- if (o is string)
- {
- j += ((string) o).Length;
- if (j >= remainCharCount && j < (len - pushCharCount))
- {
- result[k++] = j + iIgnoreAtBeginning;
- }
- }
- }
- }
- else
- {
- // use algorithm to get hyphenation points
- word[0] = '.'; // word start marker
- word[len + 1] = '.'; // word end marker
- word[len + 2] = (char)0; // null terminated
- sbyte[] il = new sbyte[len + 3]; // initialized to zero
- for (i = 0; i < len + 1; i++)
- {
- searchPatterns(word, i, il);
- }
-
- // hyphenation points are located where interletter value is odd
- // i is letterindex(word),
- // i + 1 is index(word),
- // result[k] = corresponding index(w)
- for (i = 0; i < len; i++)
- {
- if (((il[i + 1] & 1) == 1) && i >= remainCharCount && i <= (len - pushCharCount))
- {
- result[k++] = i + iIgnoreAtBeginning;
- }
- }
- }
-
- if (k > 0)
- {
- // trim result array
- int[] res = new int[k + 2];
- Array.Copy(result, 0, res, 1, k);
- // We add the synthetical hyphenation points
- // at the beginning and end of the word
- res[0] = 0;
- res[k + 1] = len;
- return new Hyphenation(res);
- }
- else
- {
- return null;
- }
- }
-
- /// <summary>
- /// Add a character class to the tree. It is used by
- /// <seealso cref="PatternParser PatternParser"/> as callback to add character classes.
- /// Character classes define the valid word characters for hyphenation. If a
- /// word contains a character not defined in any of the classes, it is not
- /// hyphenated. It also defines a way to normalize the characters in order to
- /// compare them with the stored patterns. Usually pattern files use only lower
- /// case characters, in this case a class for letter 'a', for example, should
- /// be defined as "aA", the first character being the normalization char.
- /// </summary>
- public virtual void addClass(string chargroup)
- {
- if (chargroup.Length > 0)
- {
- char equivChar = chargroup[0];
- char[] key = new char[2];
- key[1] = (char)0;
- for (int i = 0; i < chargroup.Length; i++)
- {
- key[0] = chargroup[i];
- classmap.insert(key, 0, equivChar);
- }
- }
- }
-
- /// <summary>
- /// Add an exception to the tree. It is used by
- /// <seealso cref="PatternParser PatternParser"/> class as callback to store the
- /// hyphenation exceptions.
- /// </summary>
- /// <param name="word"> normalized word </param>
- /// <param name="hyphenatedword"> a vector of alternating strings and
- /// <seealso cref="Hyphen hyphen"/> objects. </param>
- public virtual void addException(string word, List<object> hyphenatedword)
- {
- stoplist[word] = hyphenatedword;
- }
-
- /// <summary>
- /// Add a pattern to the tree. Mainly, to be used by
- /// <seealso cref="PatternParser PatternParser"/> class as callback to add a pattern to
- /// the tree.
- /// </summary>
- /// <param name="pattern"> the hyphenation pattern </param>
- /// <param name="ivalue"> interletter weight values indicating the desirability and
- /// priority of hyphenating at a given point within the pattern. It
- /// should contain only digit characters. (i.e. '0' to '9'). </param>
- public virtual void addPattern(string pattern, string ivalue)
- {
- int k = ivalues.find(ivalue);
- if (k <= 0)
- {
- k = packValues(ivalue);
- ivalues.insert(ivalue, (char) k);
- }
- insert(pattern, (char) k);
- }
-
- public override void printStats(PrintStream @out)
- {
- @out.println("Value space size = " + Convert.ToString(vspace.length()));
- base.printStats(@out);
-
- }
- }
-
+ public class HyphenationTree : TernaryTree, IPatternConsumer
+ {
+
+ /// <summary>
+ /// value space: stores the interletter values
+ /// </summary>
+ protected internal ByteVector vspace;
+
+ /// <summary>
+ /// This map stores hyphenation exceptions
+ /// </summary>
+ protected internal Dictionary<string, List<object>> stoplist;
+
+ /// <summary>
+ /// This map stores the character classes
+ /// </summary>
+ protected internal TernaryTree classmap;
+
+ /// <summary>
+ /// Temporary map to store interletter values on pattern loading.
+ /// </summary>
+ [NonSerialized]
+ private TernaryTree ivalues;
+
+ /// <summary>
+ /// Creates an empty tree with no character classes, no exceptions and an
+ /// otherwise empty value space.
+ /// </summary>
+ public HyphenationTree()
+ {
+ // Slot 0 of the value space is reserved up front and deliberately left unused.
+ vspace = new ByteVector();
+ vspace.Alloc(1);
+ classmap = new TernaryTree();
+ // The exception table is usually small, hence the modest initial capacity.
+ stoplist = new Dictionary<string, List<object>>(23);
+ }
+
+ /// <summary>
+ /// Stores a string of interletter digits in the value space, two digits per
+ /// byte (4 bits each, the first digit of a pair in the high nibble). Every
+ /// digit is stored as digit + 1 so that zero can serve as the terminator.
+ /// </summary>
+ /// <param name="values"> a string of digits from '0' to '9' representing the
+ /// interletter values. </param>
+ /// <returns> the index into the vspace array where the packed values are stored. </returns>
+ protected internal virtual int PackValues(string values)
+ {
+ int count = values.Length;
+ // One byte per pair of digits, one more for a trailing unpaired digit,
+ // and one for the zero terminator.
+ int size = (count >> 1) + 1 + (count & 1);
+ int offset = vspace.Alloc(size);
+ sbyte[] store = vspace.Array;
+ for (int pos = 0; pos < count; pos++)
+ {
+ int slot = offset + (pos >> 1);
+ sbyte nibble = (sbyte)((values[pos] - '0' + 1) & 0x0f);
+ if ((pos & 1) == 0)
+ {
+ // first digit of the pair: high nibble (big endian)
+ store[slot] = (sbyte)(nibble << 4);
+ }
+ else
+ {
+ // second digit of the pair: merged into the low nibble
+ store[slot] = (sbyte)(store[slot] | nibble);
+ }
+ }
+ store[offset + size - 1] = 0; // terminator
+ return offset;
+ }
+
+ /// <summary>
+ /// Reads the packed value sequence that starts at <paramref name="k"/> in the
+ /// value space and returns it as a string of digits. Each nibble holds
+ /// digit + 1; a zero byte, or a zero low nibble, ends the sequence.
+ /// </summary>
+ /// <param name="k"> index into the value space where the packed sequence starts </param>
+ /// <returns> the interletter values as a string of digit characters </returns>
+ protected internal virtual string UnpackValues(int k)
+ {
+ StringBuilder buf = new StringBuilder();
+ sbyte v = vspace[k++];
+ while (v != 0)
+ {
+ // Mask after shifting: v is a signed byte, so when the high nibble is 8
+ // or more (digits '7' to '9') the sign bit is set and an unmasked shift
+ // would carry sign-extension bits into the result.
+ int high = (v >> 4) & 0x0f;
+ buf.Append((char)(high - 1 + '0'));
+ int low = v & 0x0f;
+ if (low == 0)
+ {
+ // empty low nibble: the sequence had an odd number of digits
+ break;
+ }
+ buf.Append((char)(low - 1 + '0'));
+ v = vspace[k++];
+ }
+ return buf.ToString();
+ }
+
+ /// <summary>
+ /// Read hyphenation patterns from an XML file, reading it as UTF-8.
+ /// </summary>
+ /// <param name="filename"> the path of the XML pattern file </param>
+ /// <exception cref="IOException"> In case the parsing fails </exception>
+ public virtual void LoadPatterns(string filename)
+ {
+ LoadPatterns(filename, Encoding.UTF8);
+ }
+
+ /// <summary>
+ /// Read hyphenation patterns from an XML file using the given encoding.
+ /// </summary>
+ /// <param name="filename"> the path of the XML pattern file </param>
+ /// <param name="encoding"> the character encoding used to read the file </param>
+ /// <exception cref="IOException"> In case the parsing fails </exception>
+ public virtual void LoadPatterns(string filename, Encoding encoding)
+ {
+ // This method opens the file, so it is responsible for closing it again;
+ // the using block releases the handle even when parsing throws.
+ using (var src = new FileStream(filename, FileMode.Open, FileAccess.Read))
+ {
+ LoadPatterns(src, encoding);
+ }
+ }
+
+ /// <summary>
+ /// Read hyphenation patterns from an XML file, reading it as UTF-8.
+ /// </summary>
+ /// <param name="f"> the XML pattern file </param>
+ /// <exception cref="IOException"> In case the parsing fails </exception>
+ public virtual void LoadPatterns(FileInfo f)
+ {
+ LoadPatterns(f, Encoding.UTF8);
+ }
+
+ /// <summary>
+ /// Read hyphenation patterns from an XML file using the given encoding.
+ /// </summary>
+ /// <param name="f"> the XML pattern file </param>
+ /// <param name="encoding"> the character encoding used to read the file </param>
+ /// <exception cref="IOException"> In case the parsing fails </exception>
+ public virtual void LoadPatterns(FileInfo f, Encoding encoding)
+ {
+ // This method opens the file, so it is responsible for closing it again;
+ // the using block releases the handle even when parsing throws.
+ using (var src = new FileStream(f.FullName, FileMode.Open, FileAccess.Read))
+ {
+ LoadPatterns(src, encoding);
+ }
+ }
+
+ /// <summary>
+ /// Read hyphenation patterns from a stream of XML, reading it as UTF-8.
+ /// </summary>
+ /// <param name="source"> the stream containing the XML pattern data </param>
+ /// <exception cref="IOException"> In case the parsing fails </exception>
+ public virtual void LoadPatterns(Stream source)
+ {
+ LoadPatterns(source, Encoding.UTF8);
+ }
+
+ /// <summary>
+ /// Read hyphenation patterns from a stream of XML using the given encoding.
+ /// The XML reader is created with DTD processing enabled and with
+ /// <c>PatternParser.DtdResolver</c> as its resolver.
+ /// </summary>
+ /// <param name="source"> the stream containing the XML pattern data </param>
+ /// <param name="encoding"> the character encoding used to decode the stream </param>
+ /// <exception cref="IOException"> In case the parsing fails </exception>
+ public virtual void LoadPatterns(Stream source, Encoding encoding)
+ {
+ // LUCENENET TODO: Create overloads that allow XmlReaderSettings to be passed in.
+ var text = new StreamReader(source, encoding);
+ var settings = new XmlReaderSettings();
+ settings.DtdProcessing = DtdProcessing.Parse;
+ settings.XmlResolver = new PatternParser.DtdResolver();
+ using (XmlReader reader = XmlReader.Create(text, settings))
+ {
+ LoadPatterns(reader);
+ }
+ }
+
+ /// <summary>
+ /// Read hyphenation patterns from an XML reader. A <c>PatternParser</c> bound
+ /// to this tree parses the input; afterwards the tree, the value space and
+ /// the class map are trimmed and the temporary interletter-value map is dropped.
+ /// </summary>
+ /// <param name="source"> the XML reader positioned on the pattern data </param>
+ public virtual void LoadPatterns(XmlReader source)
+ {
+ PatternParser pp = new PatternParser(this);
+ // fresh lookup of already-packed value strings, used by AddPattern while parsing
+ ivalues = new TernaryTree();
+
+ pp.Parse(source);
+
+ // patterns/values should be now in the tree
+ // let's optimize a bit
+ TrimToSize();
+ vspace.TrimToSize();
+ classmap.TrimToSize();
+
+ // get rid of the auxiliary map
+ ivalues = null;
+ }
+
+ /// <summary>
+ /// Looks up a pattern in the tree and returns its interletter values.
+ /// </summary>
+ /// <param name="pat"> the pattern to look up </param>
+ /// <returns> the unpacked interletter values as a digit string, or the empty
+ /// string when the lookup yields a negative index </returns>
+ public virtual string FindPattern(string pat)
+ {
+ int valueIndex = base.Find(pat);
+ return valueIndex >= 0 ? UnpackValues(valueIndex) : "";
+ }
+
+ /// <summary>
+ /// Compares two zero-terminated character sequences. Returns 0 if they are
+ /// equal or if t (from ti) is a prefix of s (from si); otherwise returns the
+ /// difference between the first pair of characters that differ.
+ /// </summary>
+ /// <param name="s"> first character array </param>
+ /// <param name="si"> start index in s </param>
+ /// <param name="t"> second character array </param>
+ /// <param name="ti"> start index in t </param>
+ protected internal virtual int HStrCmp(char[] s, int si, char[] t, int ti)
+ {
+ while (s[si] == t[ti])
+ {
+ if (s[si] == 0)
+ {
+ // both sequences ended together
+ return 0;
+ }
+ si++;
+ ti++;
+ }
+ // First mismatch: if t has run out, it was a prefix of s.
+ return t[ti] == 0 ? 0 : s[si] - t[ti];
+ }
+
+ /// <summary>
+ /// Reads the packed value sequence that starts at <paramref name="k"/> in the
+ /// value space and returns the interletter values as numbers. Each nibble
+ /// holds value + 1; a zero byte, or a zero low nibble, ends the sequence.
+ /// </summary>
+ /// <param name="k"> index into the value space where the packed sequence starts </param>
+ /// <returns> the interletter values, one array element per value </returns>
+ protected internal virtual sbyte[] GetValues(int k)
+ {
+ List<sbyte> unpacked = new List<sbyte>();
+ sbyte v = vspace[k++];
+ while (v != 0)
+ {
+ // Mask after shifting: v is a signed byte, so when the high nibble is 8
+ // or more (values 7 to 9) the sign bit is set; without the mask the
+ // sign-extension bits turn the decoded value negative.
+ int high = (v >> 4) & 0x0f;
+ unpacked.Add((sbyte)(high - 1));
+ int low = v & 0x0f;
+ if (low == 0)
+ {
+ // empty low nibble: the sequence had an odd number of values
+ break;
+ }
+ unpacked.Add((sbyte)(low - 1));
+ v = vspace[k++];
+ }
+ return unpacked.ToArray();
+ }
+
+ /// <summary>
+ /// <para>
+ /// Search for all possible partial matches of word starting at index and update
+ /// the interletter values. In other words, it does something like:
+ /// </para>
+ /// <code>
+ /// for (i = 0; i &lt; patterns.length; i++) {
+ ///     if (word.substring(index).startsWith(patterns[i]))
+ ///         update_interletter_values(patterns[i]);
+ /// }
+ /// </code>
+ /// <para>
+ /// But it is done in an efficient way since the patterns are stored in a
+ /// ternary tree. In fact, this is the whole purpose of having the tree: doing
+ /// this search without having to test every single pattern. The number of
+ /// patterns for languages such as English range from 4000 to 10000. Thus,
+ /// doing thousands of string comparisons for each word to hyphenate would be
+ /// really slow without the tree. The tradeoff is memory, but using a ternary
+ /// tree instead of a trie almost halves the memory used by Lout or TeX.
+ /// It's also faster than using a hash table.
+ /// </para>
+ /// </summary>
+ /// <param name="word"> null terminated word to match </param>
+ /// <param name="index"> start index from word </param>
+ /// <param name="il"> interletter values array to update </param>
+ protected internal virtual void SearchPatterns(char[] word, int index, sbyte[] il)
+ {
+ sbyte[] values;
+ int i = index;
+ char p, q;
+ char sp = word[i];
+ p = root;
+
+ while (p > 0 && p < sc.Length)
+ {
+ // NOTE(review): 0xFFFF in sc[] looks like the marker of a compressed branch
+ // whose remaining key characters live in kv starting at lo[p] - confirm
+ // against TernaryTree.
+ if (sc[p] == 0xFFFF)
+ {
+ if (HStrCmp(word, i, kv.Array, lo[p]) == 0)
+ {
+ values = GetValues(eq[p]); // data pointer is in eq[]
+ int j = index;
+ // keep, per position, the larger of the current and the pattern's value
+ for (int k = 0; k < values.Length; k++)
+ {
+ if (j < il.Length && values[k] > il[j])
+ {
+ il[j] = values[k];
+ }
+ j++;
+ }
+ }
+ return;
+ }
+ int d = sp - sc[p];
+ if (d == 0)
+ {
+ if (sp == 0)
+ {
+ // reached the terminator of word: nothing left to match
+ break;
+ }
+ sp = word[++i];
+ p = eq[p];
+ q = p;
+
+ // look for a pattern ending at this position by searching for
+ // the null char ( splitchar == 0 )
+ while (q > 0 && q < sc.Length)
+ {
+ if (sc[q] == 0xFFFF) // stop at compressed branch
+ {
+ break;
+ }
+ if (sc[q] == 0)
+ {
+ values = GetValues(eq[q]);
+ int j = index;
+ // keep, per position, the larger of the current and the pattern's value
+ for (int k = 0; k < values.Length; k++)
+ {
+ if (j < il.Length && values[k] > il[j])
+ {
+ il[j] = values[k];
+ }
+ j++;
+ }
+ break;
+ }
+ else
+ {
+ q = lo[q];
+
+ // actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but
+ // java chars are unsigned
+ }
+ }
+ }
+ else
+ {
+ p = d < 0 ? lo[p] : hi[p];
+ }
+ }
+ }
+
+ /// <summary>
+ /// Hyphenate a word given as a string. The whole string is treated as the
+ /// word; the work is delegated to the char-array overload.
+ /// </summary>
+ /// <param name="word"> the word to be hyphenated </param>
+ /// <param name="remainCharCount"> Minimum number of characters allowed before the
+ /// hyphenation point. </param>
+ /// <param name="pushCharCount"> Minimum number of characters allowed after the
+ /// hyphenation point. </param>
+ /// <returns> a <see cref="Hyphenation"/> object representing the
+ /// hyphenated word or null if word is not hyphenated. </returns>
+ public virtual Hyphenation Hyphenate(string word, int remainCharCount, int pushCharCount)
+ {
+ char[] chars = word.ToCharArray();
+ int length = chars.Length;
+ return Hyphenate(chars, 0, length, remainCharCount, pushCharCount);
+ }
+
+ /// <summary>
+ /// Hyphenate word and return an array of hyphenation points.
+ /// </summary>
+ /// <remarks>
+ /// w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n
+ /// may be absent, the first n is at offset, the first l is at offset +
+ /// iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied
+ /// into word. In the first part of the routine len = w.length, in the second
+ /// part of the routine len = word.length. Three indices are used: index(w),
+ /// the index in w, index(word), the index in word, letterindex(word), the
+ /// index in the letter part of word. The following relations exist:
+ /// index(w) = offset + i - 1;
+ /// index(word) = i - iIgnoreAtBeginning;
+ /// letterindex(word) = index(word) - 1 (see first loop).
+ /// It follows that:
+ /// index(w) - index(word) = offset - 1 + iIgnoreAtBeginning;
+ /// index(w) = letterindex(word) + offset + iIgnoreAtBeginning.
+ /// </remarks>
+ /// <param name="w"> char array that contains the word </param>
+ /// <param name="offset"> Offset to first character in word </param>
+ /// <param name="len"> Length of word </param>
+ /// <param name="remainCharCount"> Minimum number of characters allowed before the
+ /// hyphenation point. </param>
+ /// <param name="pushCharCount"> Minimum number of characters allowed after the
+ /// hyphenation point. </param>
+ /// <returns> a <see cref="Hyphenation"/> object representing the
+ /// hyphenated word or null if word is not hyphenated. </returns>
+ public virtual Hyphenation Hyphenate(char[] w, int offset, int len, int remainCharCount, int pushCharCount)
+ {
+ int i;
+ char[] word = new char[len + 3];
+
+ // normalize word
+ char[] c = new char[2];
+ int iIgnoreAtBeginning = 0;
+ int iLength = len;
+ bool bEndOfLetters = false;
+ for (i = 1; i <= len; i++)
+ {
+ c[0] = w[offset + i - 1];
+ int nc = classmap.Find(c, 0);
+ if (nc < 0) // found a non-letter character ...
+ {
+ if (i == (1 + iIgnoreAtBeginning))
+ {
+ // ... before any letter character
+ iIgnoreAtBeginning++;
+ }
+ else
+ {
+ // ... after a letter character
+ bEndOfLetters = true;
+ }
+ iLength--;
+ }
+ else
+ {
+ if (!bEndOfLetters)
+ {
+ word[i - iIgnoreAtBeginning] = (char)nc;
+ }
+ else
+ {
+ // a letter after trailing non-letters: not a single word, give up
+ return null;
+ }
+ }
+ }
+ len = iLength;
+ if (len < (remainCharCount + pushCharCount))
+ {
+ // word is too short to be hyphenated
+ return null;
+ }
+ int[] result = new int[len + 1];
+ int k = 0;
+
+ // check exception list first (single lookup instead of ContainsKey + indexer)
+ string sw = new string(word, 1, len);
+ List<object> hw;
+ if (stoplist.TryGetValue(sw, out hw))
+ {
+ // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no =
+ // null)
+ int j = 0;
+ for (i = 0; i < hw.Count; i++)
+ {
+ object o = hw[i];
+ // j = index(sw) = letterindex(word)?
+ // result[k] = corresponding index(w)
+ if (o is string)
+ {
+ j += ((string)o).Length;
+ if (j >= remainCharCount && j < (len - pushCharCount))
+ {
+ result[k++] = j + iIgnoreAtBeginning;
+ }
+ }
+ }
+ }
+ else
+ {
+ // use algorithm to get hyphenation points
+ word[0] = '.'; // word start marker
+ word[len + 1] = '.'; // word end marker
+ word[len + 2] = (char)0; // null terminated
+ sbyte[] il = new sbyte[len + 3]; // initialized to zero
+ for (i = 0; i < len + 1; i++)
+ {
+ SearchPatterns(word, i, il);
+ }
+
+ // hyphenation points are located where interletter value is odd
+ // i is letterindex(word),
+ // i + 1 is index(word),
+ // result[k] = corresponding index(w)
+ for (i = 0; i < len; i++)
+ {
+ if (((il[i + 1] & 1) == 1) && i >= remainCharCount && i <= (len - pushCharCount))
+ {
+ result[k++] = i + iIgnoreAtBeginning;
+ }
+ }
+ }
+
+ if (k > 0)
+ {
+ // trim result array
+ int[] res = new int[k + 2];
+ Array.Copy(result, 0, res, 1, k);
+ // We add the synthetical hyphenation points
+ // at the beginning and end of the word
+ res[0] = 0;
+ res[k + 1] = len;
+ return new Hyphenation(res);
+ }
+ else
+ {
+ return null;
+ }
+ }
+
+ /// <summary>
+ /// Add a character class to the tree. It is used by
+ /// <see cref="PatternParser"/> as callback to add character classes.
+ /// Character classes define the valid word characters for hyphenation. If a
+ /// word contains a character not defined in any of the classes, it is not
+ /// hyphenated. It also defines a way to normalize the characters in order to
+ /// compare them with the stored patterns. Usually pattern files use only lower
+ /// case characters, in this case a class for letter 'a', for example, should
+ /// be defined as "aA", the first character being the normalization char.
+ /// </summary>
+ /// <param name="chargroup"> the characters of one class; the first one is the
+ /// character every member is mapped to. An empty string is ignored. </param>
+ public virtual void AddClass(string chargroup)
+ {
+ if (chargroup.Length == 0)
+ {
+ return;
+ }
+
+ char normalized = chargroup[0];
+ // Two-character lookup key: slot 0 holds the member being registered,
+ // slot 1 stays 0.
+ char[] key = new char[2];
+ key[1] = (char)0;
+ foreach (char member in chargroup)
+ {
+ key[0] = member;
+ classmap.Insert(key, 0, normalized);
+ }
+ }
+
+ /// <summary>
+ /// Add an exception to the tree. It is used by the
+ /// <see cref="PatternParser"/> class as callback to store the
+ /// hyphenation exceptions. An existing entry for the same word is replaced.
+ /// </summary>
+ /// <param name="word"> normalized word </param>
+ /// <param name="hyphenatedword"> a vector of alternating strings and
+ /// <see cref="Hyphen"/> objects. </param>
+ public virtual void AddException(string word, List<object> hyphenatedword)
+ {
+ stoplist[word] = hyphenatedword;
+ }
+
+ /// <summary>
+ /// Add a pattern to the tree. Mainly, to be used by the
+ /// <see cref="PatternParser"/> class as callback to add a pattern to
+ /// the tree. Identical value strings are packed once and shared through the
+ /// temporary interletter-value map.
+ /// </summary>
+ /// <param name="pattern"> the hyphenation pattern </param>
+ /// <param name="ivalue"> interletter weight values indicating the desirability and
+ /// priority of hyphenating at a given point within the pattern. It
+ /// should contain only digit characters. (i.e. '0' to '9'). </param>
+ public virtual void AddPattern(string pattern, string ivalue)
+ {
+ int valueIndex = ivalues.Find(ivalue);
+ bool alreadyPacked = valueIndex > 0;
+ if (!alreadyPacked)
+ {
+ // first time this value string is seen: pack it and remember where it went
+ valueIndex = PackValues(ivalue);
+ ivalues.Insert(ivalue, (char)valueIndex);
+ }
+ Insert(pattern, (char)valueIndex);
+ }
+
+ // public override void printStats(PrintStream @out)
+ // {
+ //@out.println("Value space size = " + Convert.ToString(vspace.length()));
+ //base.printStats(@out);
+
+ // }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87c1d606/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
index 762b832..069badd 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
@@ -1,31 +1,31 @@
-\ufeff/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-using System.Collections.Generic;
+\ufeffusing System.Collections.Generic;
namespace Lucene.Net.Analysis.Compound.Hyphenation
{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
- /// <summary>
- /// This interface is used to connect the XML pattern file parser to the
- /// hyphenation tree.
- ///
- /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
- /// </summary>
- public interface PatternConsumer
+ /// <summary>
+ /// This interface is used to connect the XML pattern file parser to the
+ /// hyphenation tree.
+ ///
+ /// This interface has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/) and has been slightly modified.
+ /// </summary>
+ public interface IPatternConsumer
{
/// <summary>
@@ -34,7 +34,7 @@ namespace Lucene.Net.Analysis.Compound.Hyphenation
/// usually means to ignore case.
/// </summary>
/// <param name="chargroup"> character group </param>
- void addClass(string chargroup);
+ void AddClass(string chargroup);
/// <summary>
/// Add a hyphenation exception. An exception replaces the result obtained by
@@ -42,15 +42,13 @@ namespace Lucene.Net.Analysis.Compound.Hyphenation
/// his own hyphenation. A hyphenatedword is a vector of alternating String's
/// and <seealso cref="Hyphen"/> instances
/// </summary>
- void addException(string word, List<object> hyphenatedword);
+ void AddException(string word, List<object> hyphenatedword);
/// <summary>
/// Add hyphenation patterns.
/// </summary>
/// <param name="pattern"> the pattern </param>
/// <param name="values"> interletter values expressed as a string of digit characters. </param>
- void addPattern(string pattern, string values);
-
+ void AddPattern(string pattern, string values);
}
-
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87c1d606/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
index 1d012c4..e94e8cf 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
@@ -1,457 +1,484 @@
-\ufeff/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-using System;
-using System.Collections;
+\ufeffusing System;
using System.Collections.Generic;
using System.IO;
+using System.Linq;
using System.Text;
+using System.Xml;
namespace Lucene.Net.Analysis.Compound.Hyphenation
{
-
- // SAX
-
- // Java
-
- /// <summary>
- /// A SAX document handler to read and parse hyphenation patterns from a XML
- /// file.
- ///
- /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
- /// </summary>
- public class PatternParser : DefaultHandler
- {
-
- internal XMLReader parser;
-
- internal int currElement;
-
- internal PatternConsumer consumer;
-
- internal StringBuilder token;
-
- internal List<object> exception;
-
- internal char hyphenChar;
-
- internal string errMsg;
-
- internal const int ELEM_CLASSES = 1;
-
- internal const int ELEM_EXCEPTIONS = 2;
-
- internal const int ELEM_PATTERNS = 3;
-
- internal const int ELEM_HYPHEN = 4;
-
- public PatternParser()
- {
- token = new StringBuilder();
- parser = createParser();
- parser.ContentHandler = this;
- parser.ErrorHandler = this;
- parser.EntityResolver = this;
- hyphenChar = '-'; // default
-
- }
-
- public PatternParser(PatternConsumer consumer) : this()
- {
- this.consumer = consumer;
- }
-
- public virtual PatternConsumer Consumer
- {
- set
- {
- this.consumer = value;
- }
- }
-
- /// <summary>
- /// Parses a hyphenation pattern file.
- /// </summary>
- /// <param name="filename"> the filename </param>
- /// <exception cref="IOException"> In case of an exception while parsing </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void parse(String filename) throws java.io.IOException
- public virtual void parse(string filename)
- {
- parse(new InputSource(filename));
- }
-
- /// <summary>
- /// Parses a hyphenation pattern file.
- /// </summary>
- /// <param name="file"> the pattern file </param>
- /// <exception cref="IOException"> In case of an exception while parsing </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void parse(java.io.File file) throws java.io.IOException
- public virtual void parse(File file)
- {
- InputSource src = new InputSource(file.toURI().toASCIIString());
- parse(src);
- }
-
- /// <summary>
- /// Parses a hyphenation pattern file.
- /// </summary>
- /// <param name="source"> the InputSource for the file </param>
- /// <exception cref="IOException"> In case of an exception while parsing </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void parse(org.xml.sax.InputSource source) throws java.io.IOException
- public virtual void parse(InputSource source)
- {
- try
- {
- parser.parse(source);
- }
- catch (SAXException e)
- {
- throw new IOException(e);
- }
- }
-
- /// <summary>
- /// Creates a SAX parser using JAXP
- /// </summary>
- /// <returns> the created SAX parser </returns>
- internal static XMLReader createParser()
- {
- try
- {
- SAXParserFactory factory = SAXParserFactory.newInstance();
- factory.NamespaceAware = true;
- return factory.newSAXParser().XMLReader;
- }
- catch (Exception e)
- {
- throw new Exception("Couldn't create XMLReader: " + e.Message);
- }
- }
-
- protected internal virtual string readToken(StringBuilder chars)
- {
- string word;
- bool space = false;
- int i;
- for (i = 0; i < chars.Length; i++)
- {
- if (char.IsWhiteSpace(chars[i]))
- {
- space = true;
- }
- else
- {
- break;
- }
- }
- if (space)
- {
- // chars.delete(0,i);
- for (int countr = i; countr < chars.Length; countr++)
- {
- chars[countr - i] = chars[countr];
- }
- chars.Length = chars.Length - i;
- if (token.Length > 0)
- {
- word = token.ToString();
- token.Length = 0;
- return word;
- }
- }
- space = false;
- for (i = 0; i < chars.Length; i++)
- {
- if (char.IsWhiteSpace(chars[i]))
- {
- space = true;
- break;
- }
- }
- token.Append(chars.ToString().Substring(0, i));
- // chars.delete(0,i);
- for (int countr = i; countr < chars.Length; countr++)
- {
- chars[countr - i] = chars[countr];
- }
- chars.Length = chars.Length - i;
- if (space)
- {
- word = token.ToString();
- token.Length = 0;
- return word;
- }
- token.Append(chars);
- return null;
- }
-
- protected internal static string getPattern(string word)
- {
- StringBuilder pat = new StringBuilder();
- int len = word.Length;
- for (int i = 0; i < len; i++)
- {
- if (!char.IsDigit(word[i]))
- {
- pat.Append(word[i]);
- }
- }
- return pat.ToString();
- }
-
- protected internal virtual List<object> normalizeException(List<T1> ex)
- {
- List<object> res = new List<object>();
- for (int i = 0; i < ex.Count; i++)
- {
- object item = ex[i];
- if (item is string)
- {
- string str = (string) item;
- StringBuilder buf = new StringBuilder();
- for (int j = 0; j < str.Length; j++)
- {
- char c = str[j];
- if (c != hyphenChar)
- {
- buf.Append(c);
- }
- else
- {
- res.Add(buf.ToString());
- buf.Length = 0;
- char[] h = new char[1];
- h[0] = hyphenChar;
- // we use here hyphenChar which is not necessarily
- // the one to be printed
- res.Add(new Hyphen(new string(h), null, null));
- }
- }
- if (buf.Length > 0)
- {
- res.Add(buf.ToString());
- }
- }
- else
- {
- res.Add(item);
- }
- }
- return res;
- }
-
- protected internal virtual string getExceptionWord<T1>(List<T1> ex)
- {
- StringBuilder res = new StringBuilder();
- for (int i = 0; i < ex.Count; i++)
- {
- object item = ex[i];
- if (item is string)
- {
- res.Append((string) item);
- }
- else
- {
- if (((Hyphen) item).noBreak != null)
- {
- res.Append(((Hyphen) item).noBreak);
- }
- }
- }
- return res.ToString();
- }
-
- protected internal static string getInterletterValues(string pat)
- {
- StringBuilder il = new StringBuilder();
- string word = pat + "a"; // add dummy letter to serve as sentinel
- int len = word.Length;
- for (int i = 0; i < len; i++)
- {
- char c = word[i];
- if (char.IsDigit(c))
- {
- il.Append(c);
- i++;
- }
- else
- {
- il.Append('0');
- }
- }
- return il.ToString();
- }
-
- //
- // EntityResolver methods
- //
- public override InputSource resolveEntity(string publicId, string systemId)
- {
- // supply the internal hyphenation.dtd if possible
- if ((systemId != null && systemId.matches("(?i).*\\bhyphenation.dtd\\b.*")) || ("hyphenation-info".Equals(publicId)))
- {
- // System.out.println(this.getClass().getResource("hyphenation.dtd").toExternalForm());
- return new InputSource(this.GetType().getResource("hyphenation.dtd").toExternalForm());
- }
- return null;
- }
-
- //
- // ContentHandler methods
- //
-
- /// <seealso cref= org.xml.sax.ContentHandler#startElement(java.lang.String,
- /// java.lang.String, java.lang.String, org.xml.sax.Attributes) </seealso>
- public override void startElement(string uri, string local, string raw, Attributes attrs)
- {
- if (local.Equals("hyphen-char"))
- {
- string h = attrs.getValue("value");
- if (h != null && h.Length == 1)
- {
- hyphenChar = h[0];
- }
- }
- else if (local.Equals("classes"))
- {
- currElement = ELEM_CLASSES;
- }
- else if (local.Equals("patterns"))
- {
- currElement = ELEM_PATTERNS;
- }
- else if (local.Equals("exceptions"))
- {
- currElement = ELEM_EXCEPTIONS;
- exception = new List<>();
- }
- else if (local.Equals("hyphen"))
- {
- if (token.Length > 0)
- {
- exception.Add(token.ToString());
- }
- exception.Add(new Hyphen(attrs.getValue("pre"), attrs.getValue("no"), attrs.getValue("post")));
- currElement = ELEM_HYPHEN;
- }
- token.Length = 0;
- }
-
- /// <seealso cref= org.xml.sax.ContentHandler#endElement(java.lang.String,
- /// java.lang.String, java.lang.String) </seealso>
-//JAVA TO C# CONVERTER TODO TASK: Most Java annotations will not have direct .NET equivalent attributes:
-//ORIGINAL LINE: @Override @SuppressWarnings("unchecked") public void endElement(String uri, String local, String raw)
- public override void endElement(string uri, string local, string raw)
- {
-
- if (token.Length > 0)
- {
- string word = token.ToString();
- switch (currElement)
- {
- case ELEM_CLASSES:
- consumer.addClass(word);
- break;
- case ELEM_EXCEPTIONS:
- exception.Add(word);
- exception = normalizeException(exception);
- consumer.addException(getExceptionWord(exception), (ArrayList) exception.clone());
- break;
- case ELEM_PATTERNS:
- consumer.addPattern(getPattern(word), getInterletterValues(word));
- break;
- case ELEM_HYPHEN:
- // nothing to do
- break;
- }
- if (currElement != ELEM_HYPHEN)
- {
- token.Length = 0;
- }
- }
- if (currElement == ELEM_HYPHEN)
- {
- currElement = ELEM_EXCEPTIONS;
- }
- else
- {
- currElement = 0;
- }
-
- }
-
- /// <seealso cref= org.xml.sax.ContentHandler#characters(char[], int, int) </seealso>
-//JAVA TO C# CONVERTER TODO TASK: Most Java annotations will not have direct .NET equivalent attributes:
-//ORIGINAL LINE: @SuppressWarnings("unchecked") @Override public void characters(char ch[] , int start, int length)
- public override void characters(char[] ch, int start, int length)
- {
- StringBuilder chars = new StringBuilder(length);
- chars.Append(ch, start, length);
- string word = readToken(chars);
- while (word != null)
- {
- // System.out.println("\"" + word + "\"");
- switch (currElement)
- {
- case ELEM_CLASSES:
- consumer.addClass(word);
- break;
- case ELEM_EXCEPTIONS:
- exception.Add(word);
- exception = normalizeException(exception);
- consumer.addException(getExceptionWord(exception), (ArrayList) exception.clone());
- exception.Clear();
- break;
- case ELEM_PATTERNS:
- consumer.addPattern(getPattern(word), getInterletterValues(word));
- break;
- }
- word = readToken(chars);
- }
-
- }
-
- /// <summary>
- /// Returns a string of the location.
- /// </summary>
- private string getLocationString(SAXParseException ex)
- {
- StringBuilder str = new StringBuilder();
-
- string systemId = ex.SystemId;
- if (systemId != null)
- {
- int index = systemId.LastIndexOf('/');
- if (index != -1)
- {
- systemId = systemId.Substring(index + 1);
- }
- str.Append(systemId);
- }
- str.Append(':');
- str.Append(ex.LineNumber);
- str.Append(':');
- str.Append(ex.ColumnNumber);
-
- return str.ToString();
-
- } // getLocationString(SAXParseException):String
- }
-
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// An XmlReader-based document handler to read and parse hyphenation patterns from an XML
+ /// file.
+ ///
+ /// LUCENENET: This class has been refactored from its Java counterpart to use XmlReader rather
+ /// than a SAX parser.
+ /// </summary>
+ public class PatternParser
+ {
+ // Section currently being parsed: one of the ELEM_* constants below, or 0 when outside any known section.
+ internal int currElement;
+
+ // Receiver of the parsed classes, exceptions and patterns.
+ internal IPatternConsumer consumer;
+
+ // Accumulates the characters of the word currently being read; may span several text callbacks.
+ internal StringBuilder token;
+
+ // Parts of the exception word being assembled: strings and Hyphen objects.
+ internal List<object> exception;
+
+ // Character treated as a hyphenation point inside exception words ('-' unless overridden by a hyphen-char element).
+ internal char hyphenChar;
+
+ // NOTE(review): not referenced anywhere in the visible part of this class - confirm whether it is still needed.
+ internal string errMsg;
+
+ // Values for currElement.
+ internal const int ELEM_CLASSES = 1;
+
+ internal const int ELEM_EXCEPTIONS = 2;
+
+ internal const int ELEM_PATTERNS = 3;
+
+ internal const int ELEM_HYPHEN = 4;
+
+ /// <summary>
+ /// Creates a parser with an empty token buffer and '-' as the hyphen character.
+ /// No consumer is assigned by this constructor.
+ /// </summary>
+ public PatternParser()
+ {
+ token = new StringBuilder();
+ hyphenChar = '-'; // default
+ }
+
+ /// <summary>
+ /// Creates a parser that reports parsed classes, exceptions and patterns to the given consumer.
+ /// </summary>
+ /// <param name="consumer"> the consumer to receive the parse callbacks </param>
+ public PatternParser(IPatternConsumer consumer) : this()
+ {
+ this.consumer = consumer;
+ }
+
+ /// <summary>
+ /// Sets the consumer that receives the parsed classes, exceptions and patterns.
+ /// The property is write-only.
+ /// </summary>
+ public virtual IPatternConsumer Consumer
+ {
+ set
+ {
+ this.consumer = value;
+ }
+ }
+
+ /// <summary>
+ /// Parses a hyphenation pattern file. DTD processing is enabled and DTD references
+ /// are resolved through <see cref="DtdResolver"/>.
+ /// </summary>
+ /// <param name="filename"> the filename; it is handed to <see cref="XmlReader.Create(string, XmlReaderSettings)"/> as the input URI </param>
+ /// <remarks>
+ /// NOTE(review): the Java original declared IOException for parse failures; XmlReader presumably
+ /// reports malformed XML through XmlException instead - confirm which exceptions callers should expect.
+ /// </remarks>
+ public virtual void Parse(string filename)
+ {
+ // LUCENENET TODO: Create overloads that allow XmlReaderSettings to be passed in.
+ using (var src = XmlReader.Create(filename, new XmlReaderSettings
+ {
+ DtdProcessing = DtdProcessing.Parse,
+ XmlResolver = new DtdResolver()
+ }))
+ {
+ Parse(src);
+ }
+ }
+
+ /// <summary>
+ /// Parses a hyphenation pattern file, reading it as UTF-8.
+ /// </summary>
+ /// <param name="file"> the pattern file </param>
+ public virtual void Parse(FileInfo file)
+ {
+ // Delegates to the encoding-aware overload with UTF-8.
+ Parse(file, Encoding.UTF8);
+ }
+
+ /// <summary>
+ /// Parses a hyphenation pattern file using the given text encoding. DTD processing is
+ /// enabled and DTD references are resolved through <see cref="DtdResolver"/>.
+ /// </summary>
+ /// <param name="file"> the pattern file </param>
+ /// <param name="encoding"> the character encoding used to read the file </param>
+ public virtual void Parse(FileInfo file, Encoding encoding)
+ {
+     // XmlReaderSettings.CloseInput defaults to false, so disposing the XmlReader does not
+     // close the underlying StreamReader. Dispose it explicitly to release the file handle.
+     using (var reader = new StreamReader(file.FullName, encoding))
+     using (var src = XmlReader.Create(reader, new XmlReaderSettings
+     {
+         DtdProcessing = DtdProcessing.Parse,
+         XmlResolver = new DtdResolver()
+     }))
+     {
+         Parse(src);
+     }
+ }
+
+ /// <summary>
+ /// Parses hyphenation patterns from an XML stream. DTD processing is enabled and DTD
+ /// references are resolved through <see cref="DtdResolver"/>.
+ /// </summary>
+ /// <param name="xmlStream"> the stream containing the pattern XML </param>
+ /// <remarks>
+ /// NOTE(review): CloseInput is not set on the reader settings, so the stream is presumably
+ /// left open for the caller to dispose - confirm.
+ /// </remarks>
+ public virtual void Parse(Stream xmlStream)
+ {
+ using (var src = XmlReader.Create(xmlStream, new XmlReaderSettings
+ {
+ DtdProcessing = DtdProcessing.Parse,
+ XmlResolver = new DtdResolver()
+ }))
+ {
+ Parse(src);
+ }
+ }
+
+ /// <summary>
+ /// Parses hyphenation patterns from an already created <see cref="XmlReader"/>.
+ /// The reader is not disposed by this method.
+ /// </summary>
+ /// <param name="source"> the reader supplying the pattern XML </param>
+ public virtual void Parse(XmlReader source)
+ {
+ // Skip to the first content node (normally the root element).
+ source.MoveToContent();
+ // Read() advances before the first ParseNode call, so the node reached by MoveToContent
+ // is itself never dispatched. NOTE(review): harmless as long as the root element is not
+ // one of the handled sections - confirm.
+ while (source.Read())
+ {
+ ParseNode(source);
+ }
+ }
+
+ /// <summary>
+ /// Dispatches the node the reader is currently positioned on to the SAX-style handlers
+ /// <see cref="StartElement"/>, <see cref="Characters"/> and <see cref="EndElement"/>.
+ /// Node types other than element, text and end-element are ignored.
+ /// </summary>
+ /// <param name="node"> reader positioned on the node to process </param>
+ private void ParseNode(XmlReader node)
+ {
+     string uri, name, raw;
+     switch (node.NodeType)
+     {
+         case XmlNodeType.Element:
+
+             // Element start
+             uri = node.NamespaceURI;
+             // The handlers compare against unprefixed element names, so pass the local name.
+             name = node.LocalName;
+             // Must be captured BEFORE the attributes are read: reading them moves the reader
+             // onto an attribute node, where IsEmptyElement always reports false. Without this,
+             // empty elements that carry attributes (e.g. <hyphen pre="-"/>) never get their
+             // EndElement callback.
+             bool isEmptyElement = node.IsEmptyElement;
+             var attributes = GetAttributes(node);
+             // Put the cursor back on the element itself after visiting its attributes.
+             node.MoveToElement();
+             raw = string.Empty; // node.ReadOuterXml(); - not used, but was messing with the node pointer
+
+             this.StartElement(uri, name, raw, attributes);
+             if (isEmptyElement)
+             {
+                 // An empty element produces no separate EndElement node, so close it here.
+                 this.EndElement(uri, name, raw);
+             }
+             break;
+
+         case XmlNodeType.Text:
+
+             string text = node.Value;
+             this.Characters(text.ToCharArray(), 0, text.Length);
+             break;
+
+         case XmlNodeType.EndElement:
+             uri = node.NamespaceURI;
+             name = node.LocalName;
+             raw = string.Empty; // node.ReadOuterXml(); - not used, but was messing with the node pointer
+
+             // Element end
+             this.EndElement(uri, name, raw);
+             break;
+     }
+ }
+
+ /// <summary>
+ /// Collects the attributes of the element the reader is positioned on into a dictionary
+ /// keyed by attribute name. The reader is returned to the element node afterwards, so
+ /// element-level properties such as <see cref="XmlReader.IsEmptyElement"/> remain valid
+ /// for the caller.
+ /// </summary>
+ /// <param name="node"> reader positioned on an element </param>
+ /// <returns> attribute names mapped to their values; empty when the element has no attributes </returns>
+ private IDictionary<string, string> GetAttributes(XmlReader node)
+ {
+     var result = new Dictionary<string, string>();
+     if (node.HasAttributes)
+     {
+         for (int i = 0; i < node.AttributeCount; i++)
+         {
+             node.MoveToAttribute(i);
+             result.Add(node.Name, node.Value);
+         }
+
+         // MoveToAttribute leaves the cursor on the last attribute; restore it to the owning element.
+         node.MoveToElement();
+     }
+
+     return result;
+ }
+
+ /// <summary>
+ /// Extracts the next whitespace-delimited word from <paramref name="chars"/>, consuming the
+ /// characters it examines. A word that is not yet terminated by whitespace is kept in the
+ /// <c>token</c> field so that it can be completed by a later call.
+ /// </summary>
+ /// <param name="chars"> buffer of unread characters; consumed characters are removed from its front </param>
+ /// <returns> the next complete word, or <c>null</c> when the buffer was used up without
+ /// reaching the end of a word (the partial word stays in <c>token</c>) </returns>
+ protected internal virtual string ReadToken(StringBuilder chars)
+ {
+ string word;
+ bool space = false;
+ int i;
+ // Count the leading whitespace characters.
+ for (i = 0; i < chars.Length; i++)
+ {
+ if (char.IsWhiteSpace(chars[i]))
+ {
+ space = true;
+ }
+ else
+ {
+ break;
+ }
+ }
+ if (space)
+ {
+ // chars.delete(0,i);
+ // Drop the leading whitespace by shifting the rest to the front and truncating.
+ for (int countr = i; countr < chars.Length; countr++)
+ {
+ chars[countr - i] = chars[countr];
+ }
+ chars.Length = chars.Length - i;
+ // Leading whitespace terminates a word left pending in token by an earlier call.
+ if (token.Length > 0)
+ {
+ word = token.ToString();
+ token.Length = 0;
+ return word;
+ }
+ }
+ space = false;
+ // Find the end of the next word: the first whitespace, or the end of the buffer.
+ for (i = 0; i < chars.Length; i++)
+ {
+ if (char.IsWhiteSpace(chars[i]))
+ {
+ space = true;
+ break;
+ }
+ }
+ token.Append(chars.ToString(0, i));
+ // chars.delete(0,i);
+ // Remove the characters just copied into token.
+ for (int countr = i; countr < chars.Length; countr++)
+ {
+ chars[countr - i] = chars[countr];
+ }
+ chars.Length = chars.Length - i;
+ if (space)
+ {
+ // The word ended inside this buffer: hand it out and reset the accumulator.
+ word = token.ToString();
+ token.Length = 0;
+ return word;
+ }
+ // No whitespace was found, so i equalled chars.Length and chars is empty at this point;
+ // this append adds nothing. The partial word remains in token.
+ token.Append(chars);
+ return null;
+ }
+
+ /// <summary>
+ /// Strips every digit from a raw pattern word, leaving only the characters of the pattern itself.
+ /// </summary>
+ /// <param name="word"> raw pattern word, letters interleaved with digit weights </param>
+ /// <returns> <paramref name="word"/> with all digit characters removed </returns>
+ protected internal static string GetPattern(string word)
+ {
+     StringBuilder letters = new StringBuilder();
+     foreach (char c in word)
+     {
+         if (char.IsDigit(c))
+         {
+             continue;
+         }
+         letters.Append(c);
+     }
+     return letters.ToString();
+ }
+
+ /// <summary>
+ /// Expands an exception word into its normalized form: every string item is split at each
+ /// occurrence of the hyphen character, and a <see cref="Hyphen"/> object is inserted at each
+ /// split point. Items that are not strings are copied through unchanged.
+ /// </summary>
+ /// <param name="ex"> the raw exception items (strings and hyphen objects) </param>
+ /// <returns> a new list holding the normalized items </returns>
+ protected internal virtual List<object> NormalizeException<T1>(List<T1> ex)
+ {
+     var normalized = new List<object>();
+     foreach (T1 element in ex)
+     {
+         object item = element;
+         string text = item as string;
+         if (text == null)
+         {
+             normalized.Add(item);
+             continue;
+         }
+
+         var segment = new StringBuilder();
+         foreach (char c in text)
+         {
+             if (c == hyphenChar)
+             {
+                 // Close the current segment (even when empty) and mark the break.
+                 normalized.Add(segment.ToString());
+                 segment.Length = 0;
+                 // we use here hyphenChar which is not necessarily
+                 // the one to be printed
+                 normalized.Add(new Hyphen(hyphenChar.ToString(), null, null));
+             }
+             else
+             {
+                 segment.Append(c);
+             }
+         }
+         if (segment.Length > 0)
+         {
+             normalized.Add(segment.ToString());
+         }
+     }
+     return normalized;
+ }
+
+ /// <summary>
+ /// Builds the plain word for an exception: string items are concatenated as they are, and
+ /// each <see cref="Hyphen"/> item contributes its <c>noBreak</c> text when that is not null.
+ /// </summary>
+ /// <param name="ex"> the exception items (strings and hyphen objects) </param>
+ /// <returns> the concatenated word </returns>
+ protected internal virtual string GetExceptionWord<T1>(List<T1> ex)
+ {
+     var wordBuilder = new StringBuilder();
+     foreach (T1 element in ex)
+     {
+         object item = element;
+         string text = item as string;
+         if (text != null)
+         {
+             wordBuilder.Append(text);
+             continue;
+         }
+
+         Hyphen hyphen = (Hyphen)item;
+         if (hyphen.noBreak != null)
+         {
+             wordBuilder.Append(hyphen.noBreak);
+         }
+     }
+     return wordBuilder.ToString();
+ }
+
+ /// <summary>
+ /// Derives the interletter value string of a raw pattern word. A digit contributes itself
+ /// and the character right after it is skipped; any other character contributes '0'.
+ /// A sentinel letter is appended first so that a position after the last character is covered.
+ /// </summary>
+ /// <param name="pat"> raw pattern word, letters interleaved with digit weights </param>
+ /// <returns> the interletter values as a string of digit characters </returns>
+ protected internal static string GetInterletterValues(string pat)
+ {
+     string padded = pat + "a"; // add dummy letter to serve as sentinel
+     StringBuilder values = new StringBuilder();
+     int pos = 0;
+     while (pos < padded.Length)
+     {
+         char current = padded[pos];
+         if (char.IsDigit(current))
+         {
+             values.Append(current);
+             pos += 2; // also step over the character that follows the digit
+         }
+         else
+         {
+             values.Append('0');
+             pos++;
+         }
+     }
+     return values.ToString();
+ }
+
+ /// <summary>
+ /// LUCENENET specific resolver that serves "hyphenation.dtd" from an embedded assembly
+ /// resource instead of the file system. Every other entity is resolved by the base class.
+ /// </summary>
+ internal class DtdResolver : XmlUrlResolver
+ {
+     public override object GetEntity(Uri absoluteUri, string role, Type ofObjectToReturn)
+     {
+         string lastSegment = absoluteUri.Segments.LastOrDefault();
+         if (!string.Equals("hyphenation.dtd", lastSegment))
+         {
+             // Not the hyphenation DTD: use the default resolution.
+             return base.GetEntity(absoluteUri, role, ofObjectToReturn);
+         }
+
+         // The resource name is this type's namespace followed by the DTD file name.
+         Type self = GetType();
+         string resourceName = self.Namespace + "." + "hyphenation.dtd";
+         return self.Assembly.GetManifestResourceStream(resourceName);
+     }
+ }
+
+ //
+ // ContentHandler methods
+ //
+
+ /// <summary>
+ /// Handles the start of an element (counterpart of SAX <c>ContentHandler.startElement</c>).
+ /// Records which section is being parsed, picks up a custom hyphen character, and for a
+ /// <c>hyphen</c> element appends a <see cref="Hyphen"/> to the exception being assembled.
+ /// The token buffer is cleared in every case.
+ /// </summary>
+ /// <param name="uri"> namespace URI of the element (not used) </param>
+ /// <param name="local"> element name that is matched against the known section names </param>
+ /// <param name="raw"> raw (qualified) element name (not used) </param>
+ /// <param name="attrs"> the element's attributes, keyed by attribute name </param>
+ public void StartElement(string uri, string local, string raw, IDictionary<string, string> attrs)
+ {
+     if (local.Equals("hyphen-char"))
+     {
+         string h;
+         attrs.TryGetValue("value", out h);
+         // Only a single-character value overrides the default hyphen character.
+         if (h != null && h.Length == 1)
+         {
+             hyphenChar = h[0];
+         }
+     }
+     else if (local.Equals("classes"))
+     {
+         currElement = ELEM_CLASSES;
+     }
+     else if (local.Equals("patterns"))
+     {
+         currElement = ELEM_PATTERNS;
+     }
+     else if (local.Equals("exceptions"))
+     {
+         currElement = ELEM_EXCEPTIONS;
+         exception = new List<object>();
+     }
+     else if (local.Equals("hyphen"))
+     {
+         if (token.Length > 0)
+         {
+             // Text read before the hyphen belongs to the exception word.
+             exception.Add(token.ToString());
+         }
+         // The attributes may be absent. The dictionary indexer would throw
+         // KeyNotFoundException for a missing key, so look them up with TryGetValue,
+         // which leaves the value null - matching SAX Attributes.getValue.
+         string pre, no, post;
+         attrs.TryGetValue("pre", out pre);
+         attrs.TryGetValue("no", out no);
+         attrs.TryGetValue("post", out post);
+         exception.Add(new Hyphen(pre, no, post));
+         currElement = ELEM_HYPHEN;
+     }
+     token.Length = 0;
+ }
+
+ /// <summary>
+ /// Handles the end of an element (counterpart of SAX <c>ContentHandler.endElement</c>).
+ /// A word still pending in the token buffer is reported to the consumer according to the
+ /// current section, after which the section state is updated: the end of a <c>hyphen</c>
+ /// element returns to the exceptions section, any other end leaves the current section.
+ /// </summary>
+ /// <param name="uri"> namespace URI of the element (not used) </param>
+ /// <param name="local"> element name (not used) </param>
+ /// <param name="raw"> raw (qualified) element name (not used) </param>
+ public void EndElement(string uri, string local, string raw)
+ {
+     if (token.Length > 0)
+     {
+         string word = token.ToString();
+         if (currElement == ELEM_CLASSES)
+         {
+             consumer.AddClass(word);
+         }
+         else if (currElement == ELEM_EXCEPTIONS)
+         {
+             exception.Add(word);
+             exception = NormalizeException(exception);
+             consumer.AddException(GetExceptionWord(exception), new List<object>(exception));
+         }
+         else if (currElement == ELEM_PATTERNS)
+         {
+             consumer.AddPattern(GetPattern(word), GetInterletterValues(word));
+         }
+         // ELEM_HYPHEN: nothing to report.
+
+         // The token is kept only while closing a hyphen element.
+         if (currElement != ELEM_HYPHEN)
+         {
+             token.Length = 0;
+         }
+     }
+
+     currElement = (currElement == ELEM_HYPHEN) ? ELEM_EXCEPTIONS : 0;
+ }
+
+ /// <summary>
+ /// Handles character data (counterpart of SAX <c>ContentHandler.characters</c>). The text
+ /// is split into words, and each complete word is reported to the consumer according to
+ /// the section currently being parsed.
+ /// </summary>
+ /// <param name="ch"> buffer holding the characters </param>
+ /// <param name="start"> index of the first character to use </param>
+ /// <param name="length"> number of characters to use </param>
+ public void Characters(char[] ch, int start, int length)
+ {
+     StringBuilder pending = new StringBuilder(length).Append(ch, start, length);
+     for (string word = ReadToken(pending); !string.IsNullOrEmpty(word); word = ReadToken(pending))
+     {
+         if (currElement == ELEM_CLASSES)
+         {
+             consumer.AddClass(word);
+         }
+         else if (currElement == ELEM_EXCEPTIONS)
+         {
+             exception.Add(word);
+             exception = NormalizeException(exception);
+             consumer.AddException(GetExceptionWord(exception), new List<object>(exception));
+             // Start collecting the next exception word from scratch.
+             exception.Clear();
+         }
+         else if (currElement == ELEM_PATTERNS)
+         {
+             consumer.AddPattern(GetPattern(word), GetInterletterValues(word));
+         }
+     }
+ }
+ }
}
\ No newline at end of file