You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2016/12/14 20:00:47 UTC
[1/4] lucenenet git commit: Renamed hyphenation to Hyphenation to fix
build and run on case sensitive file systems
Repository: lucenenet
Updated Branches:
refs/heads/master e75dbf614 -> 7214c8a3c
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/TernaryTree.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/TernaryTree.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/TernaryTree.cs
deleted file mode 100644
index 88cfd01..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/TernaryTree.cs
+++ /dev/null
@@ -1,816 +0,0 @@
-\ufeffusing System;
-using System.Collections;
-using System.Collections.Generic;
-using System.IO;
-using System.Text;
-
-namespace Lucene.Net.Analysis.Compound.Hyphenation
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// <h2>Ternary Search Tree.</h2>
- ///
- /// <para>
- /// A ternary search tree is a hybrid between a binary tree and a digital search
- /// tree (trie). Keys are limited to strings. A data value of type char is stored
- /// in each leaf node. It can be used as an index (or pointer) to the data.
- /// Branches that only contain one key are compressed to one node by storing a
- /// pointer to the trailer substring of the key. This class is intended to serve
- /// as base class or helper class to implement Dictionary collections or the
- /// like. Ternary trees have some nice properties as the following: the tree can
- /// be traversed in sorted order, partial matches (wildcard) can be implemented,
- /// retrieval of all keys within a given distance from the target, etc. The
- /// storage requirements are higher than a binary tree but a lot less than a
- /// trie. Performance is comparable with a hash table, sometimes it outperforms a
- /// hash function (most of the time can determine a miss faster than a hash).
- /// </para>
- ///
- /// <para>
- /// The main purpose of this java port is to serve as a base for implementing
- /// TeX's hyphenation algorithm (see The TeXBook, appendix H). Each language
- /// requires from 5000 to 15000 hyphenation patterns which will be keys in this
- /// tree. The strings patterns are usually small (from 2 to 5 characters), but
- /// each char in the tree is stored in a node. Thus memory usage is the main
- /// concern. We will sacrifice 'elegance' to keep memory requirements to the
- /// minimum. Using java's char type as pointer (yes, I know pointer it is a
- /// forbidden word in java) we can keep the size of the node to be just 8 bytes
- /// (3 pointers and the data char). This gives room for about 65000 nodes. In my
- /// tests the english patterns took 7694 nodes and the german patterns 10055
- /// nodes, so I think we are safe.
- /// </para>
- ///
- /// <para>
- /// All said, this is a map with strings as keys and char as value. Pretty
- /// limited!. It can be extended to a general map by using the string
- /// representation of an object and using the char value as an index to an array
- /// that contains the object values.
- /// </para>
- ///
- /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
- /// </summary>
-
- public class TernaryTree : ICloneable
- {
- /// <summary>
- /// We use 4 arrays to represent a node. I guess I should have created a proper
- /// node class, but somehow Knuth's pascal code made me forget we now have a
- /// portable language with virtual memory management and automatic garbage
- /// collection! And now is kind of late, furthermore, if it ain't broken, don't
- /// fix it.
- /// </summary>
-
- /// <summary>
- /// Pointer to low branch and to rest of the key when it is stored directly in
- /// this node, we don't have unions in java!
- /// </summary>
- protected internal char[] lo;
-
- /// <summary>
- /// Pointer to high branch.
- /// </summary>
- protected internal char[] hi;
-
- /// <summary>
- /// Pointer to equal branch and to data when this node is a string terminator.
- /// </summary>
- protected internal char[] eq;
-
- /// <summary>
- /// <P>
- /// The character stored in this node: splitchar. Two special values are
- /// reserved:
- /// </P>
- /// <ul>
- /// <li>0x0000 as string terminator</li>
- /// <li>0xFFFF to indicate that the branch starting at this node is compressed</li>
- /// </ul>
- /// <para>
- /// This shouldn't be a problem if we give the usual semantics to strings since
- /// 0xFFFF is guaranteed not to be an Unicode character.
- /// </para>
- /// </summary>
- protected internal char[] sc;
-
- /// <summary>
- /// This vector holds the trailing of the keys when the branch is compressed.
- /// </summary>
- protected internal CharVector kv;
-
- protected internal char root;
-
- protected internal char freenode;
-
- protected internal int length; // number of items in tree
-
- protected internal const int BLOCK_SIZE = 2048; // allocation size for arrays
-
- internal TernaryTree()
- {
- Init();
- }
-
- protected internal virtual void Init()
- {
- root = (char)0;
- freenode = (char)1;
- length = 0;
- lo = new char[BLOCK_SIZE];
- hi = new char[BLOCK_SIZE];
- eq = new char[BLOCK_SIZE];
- sc = new char[BLOCK_SIZE];
- kv = new CharVector();
- }
-
- /// <summary>
- /// Branches are initially compressed, needing one node per key plus the size
- /// of the string key. They are decompressed as needed when another key with
- /// same prefix is inserted. This saves a lot of space, specially for long
- /// keys.
- /// </summary>
- public virtual void Insert(string key, char val)
- {
- // make sure we have enough room in the arrays
- int len = key.Length + 1; // maximum number of nodes that may be generated
- if (freenode + len > eq.Length)
- {
- RedimNodeArrays(eq.Length + BLOCK_SIZE);
- }
- char[] strkey = new char[len--];
- key.CopyTo(0, strkey, 0, len - 0);
- strkey[len] = (char)0;
- root = Insert(root, strkey, 0, val);
- }
-
- public virtual void Insert(char[] key, int start, char val)
- {
- int len = StrLen(key) + 1;
- if (freenode + len > eq.Length)
- {
- RedimNodeArrays(eq.Length + BLOCK_SIZE);
- }
- root = Insert(root, key, start, val);
- }
-
- /// <summary>
- /// The actual insertion function, recursive version.
- /// </summary>
- private char Insert(char p, char[] key, int start, char val)
- {
- int len = StrLen(key, start);
- if (p == 0)
- {
- // this means there is no branch, this node will start a new branch.
- // Instead of doing that, we store the key somewhere else and create
- // only one node with a pointer to the key
- p = freenode++;
- eq[p] = val; // holds data
- length++;
- hi[p] = (char)0;
- if (len > 0)
- {
- sc[p] = (char)0xFFFF; // indicates branch is compressed
- lo[p] = (char)kv.Alloc(len + 1); // use 'lo' to hold pointer to key
- StrCpy(kv.Array, lo[p], key, start);
- }
- else
- {
- sc[p] = (char)0;
- lo[p] = (char)0;
- }
- return p;
- }
-
- if (sc[p] == 0xFFFF)
- {
- // branch is compressed: need to decompress
- // this will generate garbage in the external key array
- // but we can do some garbage collection later
- char pp = freenode++;
- lo[pp] = lo[p]; // previous pointer to key
- eq[pp] = eq[p]; // previous pointer to data
- lo[p] = (char)0;
- if (len > 0)
- {
- sc[p] = kv[lo[pp]];
- eq[p] = pp;
- lo[pp]++;
- if (kv[lo[pp]] == 0)
- {
- // key completly decompressed leaving garbage in key array
- lo[pp] = (char)0;
- sc[pp] = (char)0;
- hi[pp] = (char)0;
- }
- else
- {
- // we only got first char of key, rest is still there
- sc[pp] = (char)0xFFFF;
- }
- }
- else
- {
- // In this case we can save a node by swapping the new node
- // with the compressed node
- sc[pp] = (char)0xFFFF;
- hi[p] = pp;
- sc[p] = (char)0;
- eq[p] = val;
- length++;
- return p;
- }
- }
- char s = key[start];
- if (s < sc[p])
- {
- lo[p] = Insert(lo[p], key, start, val);
- }
- else if (s == sc[p])
- {
- if (s != 0)
- {
- eq[p] = Insert(eq[p], key, start + 1, val);
- }
- else
- {
- // key already in tree, overwrite data
- eq[p] = val;
- }
- }
- else
- {
- hi[p] = Insert(hi[p], key, start, val);
- }
- return p;
- }
-
- /// <summary>
- /// Compares 2 null terminated char arrays
- /// </summary>
- public static int StrCmp(char[] a, int startA, char[] b, int startB)
- {
- for (; a[startA] == b[startB]; startA++, startB++)
- {
- if (a[startA] == 0)
- {
- return 0;
- }
- }
- return a[startA] - b[startB];
- }
-
- /// <summary>
- /// Compares a string with null terminated char array
- /// </summary>
- public static int StrCmp(string str, char[] a, int start)
- {
- int i, d, len = str.Length;
- for (i = 0; i < len; i++)
- {
- d = (int)str[i] - a[start + i];
- if (d != 0)
- {
- return d;
- }
- if (a[start + i] == 0)
- {
- return d;
- }
- }
- if (a[start + i] != 0)
- {
- return -a[start + i];
- }
- return 0;
-
- }
-
- public static void StrCpy(char[] dst, int di, char[] src, int si)
- {
- while (src[si] != 0)
- {
- dst[di++] = src[si++];
- }
- dst[di] = (char)0;
- }
-
- public static int StrLen(char[] a, int start)
- {
- int len = 0;
- for (int i = start; i < a.Length && a[i] != 0; i++)
- {
- len++;
- }
- return len;
- }
-
- public static int StrLen(char[] a)
- {
- return StrLen(a, 0);
- }
-
- public virtual int Find(string key)
- {
- int len = key.Length;
- char[] strkey = new char[len + 1];
- key.CopyTo(0, strkey, 0, len - 0);
- strkey[len] = (char)0;
-
- return Find(strkey, 0);
- }
-
- public virtual int Find(char[] key, int start)
- {
- int d;
- char p = root;
- int i = start;
- char c;
-
- while (p != 0)
- {
- if (sc[p] == 0xFFFF)
- {
- if (StrCmp(key, i, kv.Array, lo[p]) == 0)
- {
- return eq[p];
- }
- else
- {
- return -1;
- }
- }
- c = key[i];
- d = c - sc[p];
- if (d == 0)
- {
- if (c == 0)
- {
- return eq[p];
- }
- i++;
- p = eq[p];
- }
- else if (d < 0)
- {
- p = lo[p];
- }
- else
- {
- p = hi[p];
- }
- }
- return -1;
- }
-
- public virtual bool Knows(string key)
- {
- return (Find(key) >= 0);
- }
-
- // redimension the arrays
- private void RedimNodeArrays(int newsize)
- {
- int len = newsize < lo.Length ? newsize : lo.Length;
- char[] na = new char[newsize];
- Array.Copy(lo, 0, na, 0, len);
- lo = na;
- na = new char[newsize];
- Array.Copy(hi, 0, na, 0, len);
- hi = na;
- na = new char[newsize];
- Array.Copy(eq, 0, na, 0, len);
- eq = na;
- na = new char[newsize];
- Array.Copy(sc, 0, na, 0, len);
- sc = na;
- }
-
- public virtual int Length
- {
- get { return length; }
- }
-
- public object Clone()
- {
- TernaryTree t = new TernaryTree();
- t.lo = (char[])this.lo.Clone();
- t.hi = (char[])this.hi.Clone();
- t.eq = (char[])this.eq.Clone();
- t.sc = (char[])this.sc.Clone();
- t.kv = (CharVector)this.kv.Clone();
- t.root = this.root;
- t.freenode = this.freenode;
- t.length = this.length;
-
- return t;
- }
-
- /// <summary>
- /// Recursively insert the median first and then the median of the lower and
- /// upper halves, and so on in order to get a balanced tree. The array of keys
- /// is assumed to be sorted in ascending order.
- /// </summary>
- protected internal virtual void InsertBalanced(string[] k, char[] v, int offset, int n)
- {
- int m;
- if (n < 1)
- {
- return;
- }
- m = n >> 1;
-
- Insert(k[m + offset], v[m + offset]);
- InsertBalanced(k, v, offset, m);
-
- InsertBalanced(k, v, offset + m + 1, n - m - 1);
- }
-
- /// <summary>
- /// Balance the tree for best search performance
- /// </summary>
- public virtual void Balance()
- {
- // System.out.print("Before root splitchar = ");
- // System.out.println(sc[root]);
-
- int i = 0, n = length;
- string[] k = new string[n];
- char[] v = new char[n];
- Iterator iter = new Iterator(this);
- while (iter.MoveNext())
- {
- v[i] = iter.Value;
- k[i++] = iter.Current;
- }
- Init();
- InsertBalanced(k, v, 0, n);
-
- // With uniform letter distribution sc[root] should be around 'm'
- // System.out.print("After root splitchar = ");
- // System.out.println(sc[root]);
- }
-
- /// <summary>
- /// Each node stores a character (splitchar) which is part of some key(s). In a
- /// compressed branch (one that only contain a single string key) the trailer
- /// of the key which is not already in nodes is stored externally in the kv
- /// array. As items are inserted, key substrings decrease. Some substrings may
- /// completely disappear when the whole branch is totally decompressed. The
- /// tree is traversed to find the key substrings actually used. In addition,
- /// duplicate substrings are removed using a map (implemented with a
- /// TernaryTree!).
- ///
- /// </summary>
- public virtual void TrimToSize()
- {
- // first balance the tree for best performance
- Balance();
-
- // redimension the node arrays
- RedimNodeArrays(freenode);
-
- // ok, compact kv array
- CharVector kx = new CharVector();
- kx.Alloc(1);
- TernaryTree map = new TernaryTree();
- Compact(kx, map, root);
- kv = kx;
- kv.TrimToSize();
- }
-
- private void Compact(CharVector kx, TernaryTree map, char p)
- {
- int k;
- if (p == 0)
- {
- return;
- }
- if (sc[p] == 0xFFFF)
- {
- k = map.Find(kv.Array, lo[p]);
- if (k < 0)
- {
- k = kx.Alloc(StrLen(kv.Array, lo[p]) + 1);
- StrCpy(kx.Array, k, kv.Array, lo[p]);
- map.Insert(kx.Array, k, (char)k);
- }
- lo[p] = (char)k;
- }
- else
- {
- Compact(kx, map, lo[p]);
- if (sc[p] != 0)
- {
- Compact(kx, map, eq[p]);
- }
- Compact(kx, map, hi[p]);
- }
- }
-
- public virtual IEnumerator<string> Keys()
- {
- return new Iterator(this);
- }
-
- /// <summary>
- /// Enumerator for TernaryTree
- ///
- /// LUCENENET NOTE: This differs a bit from its Java counterpart to adhere to
- /// .NET IEnumerator semantics. In Java, when the <see cref="Iterator"/> is
- /// instantiated, it is already positioned at the first element. However,
- /// to act like a .NET IEnumerator, the initial state is undefined and considered
- /// to be before the first element until <see cref="MoveNext"/> is called, and
- /// if a move took place it will return <c>true</c>;
- /// </summary>
- public class Iterator : IEnumerator<string>
- {
- private readonly TernaryTree outerInstance;
-
-
- /// <summary>
- /// current node index
- /// </summary>
- private int cur;
-
- /// <summary>
- /// current key
- /// </summary>
- private string curkey;
-
- internal class Item : ICloneable
- {
- internal char parent;
- internal char child;
-
- public Item()
- {
- parent = (char)0;
- child = (char)0;
- }
-
- public Item(char p, char c)
- {
- parent = p;
- child = c;
- }
-
- public object Clone()
- {
- return new Item(parent, child);
- }
-
- }
-
- /// <summary>
- /// Node stack
- /// </summary>
- internal Stack<Item> ns;
-
- /// <summary>
- /// key stack implemented with a StringBuilder
- /// </summary>
- internal StringBuilder ks;
-
- private bool isInitialized = false;
-
- public Iterator(TernaryTree outerInstance)
- {
- this.outerInstance = outerInstance;
- cur = -1;
- ns = new Stack<Item>();
- ks = new StringBuilder();
- isInitialized = false;
- }
-
- public virtual void Rewind()
- {
- ns.Clear();
- ks.Length = 0;
- cur = outerInstance.root;
- Run();
- }
-
- public virtual char Value
- {
- get
- {
- if (cur >= 0)
- {
- return outerInstance.eq[cur];
- }
- return (char)0;
- }
- }
-
- /// <summary>
- /// traverse upwards
- /// </summary>
- internal virtual int Up()
- {
- Item i = new Item();
- int res = 0;
-
- if (ns.Count == 0)
- {
- return -1;
- }
-
- if (cur != 0 && outerInstance.sc[cur] == 0)
- {
- return outerInstance.lo[cur];
- }
-
- bool climb = true;
-
- while (climb)
- {
- i = ns.Pop();
- i.child++;
- switch ((int)i.child)
- {
- case 1:
- if (outerInstance.sc[i.parent] != 0)
- {
- res = outerInstance.eq[i.parent];
- ns.Push((Item)i.Clone());
- ks.Append(outerInstance.sc[i.parent]);
- }
- else
- {
- i.child++;
- ns.Push((Item)i.Clone());
- res = outerInstance.hi[i.parent];
- }
- climb = false;
- break;
-
- case 2:
- res = outerInstance.hi[i.parent];
- ns.Push((Item)i.Clone());
- if (ks.Length > 0)
- {
- ks.Length = ks.Length - 1; // pop
- }
- climb = false;
- break;
-
- default:
- if (ns.Count == 0)
- {
- return -1;
- }
- climb = true;
- break;
- }
- }
- return res;
- }
-
- /// <summary>
- /// traverse the tree to find next key
- /// </summary>
- internal virtual int Run()
- {
- if (cur == -1)
- {
- return -1;
- }
-
- bool leaf = false;
- while (true)
- {
- // first go down on low branch until leaf or compressed branch
- while (cur != 0)
- {
- if (outerInstance.sc[cur] == 0xFFFF)
- {
- leaf = true;
- break;
- }
- ns.Push(new Item((char)cur, '\u0000'));
- if (outerInstance.sc[cur] == 0)
- {
- leaf = true;
- break;
- }
- cur = outerInstance.lo[cur];
- }
- if (leaf)
- {
- break;
- }
- // nothing found, go up one node and try again
- cur = Up();
- if (cur == -1)
- {
- return -1;
- }
- }
- // The current node should be a data node and
- // the key should be in the key stack (at least partially)
- StringBuilder buf = new StringBuilder(ks.ToString());
- if (outerInstance.sc[cur] == 0xFFFF)
- {
- int p = outerInstance.lo[cur];
- while (outerInstance.kv[p] != 0)
- {
- buf.Append(outerInstance.kv[p++]);
- }
- }
- curkey = buf.ToString();
- return 0;
- }
-
- #region Added for better .NET support
- public string Current
- {
- get
- {
- return curkey;
- }
- }
-
- object IEnumerator.Current
- {
- get
- {
- return Current;
- }
- }
-
- public void Dispose()
- {
- // nothing to do
- }
-
- public bool MoveNext()
- {
- if (!isInitialized)
- {
- Rewind();
- isInitialized = true;
- return cur != -1;
- }
- if (cur == -1)
- {
- return false;
- }
- cur = Up();
- Run();
- return cur != -1;
- }
-
- public void Reset()
- {
- throw new NotSupportedException();
- }
-
- #endregion
- }
-
- public virtual void PrintStats(TextWriter @out)
- {
- @out.WriteLine("Number of keys = " + Convert.ToString(length));
- @out.WriteLine("Node count = " + Convert.ToString(freenode));
- // System.out.println("Array length = " + Integer.toString(eq.length));
- @out.WriteLine("Key Array length = " + Convert.ToString(kv.Length()));
-
- /*
- * for(int i=0; i<kv.length(); i++) if ( kv.get(i) != 0 )
- * System.out.print(kv.get(i)); else System.out.println("");
- * System.out.println("Keys:"); for(Enumeration enum = keys();
- * enum.hasMoreElements(); ) System.out.println(enum.nextElement());
- */
- }
- /*
- public static void main(String[] args) {
- TernaryTree tt = new TernaryTree();
- tt.insert("Carlos", 'C');
- tt.insert("Car", 'r');
- tt.insert("palos", 'l');
- tt.insert("pa", 'p');
- tt.trimToSize();
- System.out.println((char) tt.find("Car"));
- System.out.println((char) tt.find("Carlos"));
- System.out.println((char) tt.find("alto"));
- tt.printStats(System.out);
- }
- */
-
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/hyphenation.dtd
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/hyphenation.dtd b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/hyphenation.dtd
deleted file mode 100644
index 083c2bd..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/hyphenation.dtd
+++ /dev/null
@@ -1,68 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<!--
- Copyright 1999-2004 The Apache Software Foundation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->
-
-<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,
- classes, exceptions?, patterns)>
-
-<!-- Hyphen character to be used in the exception list as shortcut for
- <hyphen pre-break="-"/>. Defaults to '-'
--->
-<!ELEMENT hyphen-char EMPTY>
-<!ATTLIST hyphen-char value CDATA #REQUIRED>
-
-<!-- Default minimun length in characters of hyphenated word fragments
- before and after the line break. For some languages this is not
- only for aesthetic purposes, wrong hyphens may be generated if this
- is not accounted for.
--->
-<!ELEMENT hyphen-min EMPTY>
-<!ATTLIST hyphen-min before CDATA #REQUIRED>
-<!ATTLIST hyphen-min after CDATA #REQUIRED>
-
-<!-- Character equivalent classes: space separated list of character groups, all
- characters in a group are to be treated equivalent as far as
- the hyphenation algorithm is concerned. The first character in a group
- is the group's equivalent character. Patterns should only contain
- first characters. It also defines word characters, i.e. a word that
- contains characters not present in any of the classes is not hyphenated.
--->
-<!ELEMENT classes (#PCDATA)>
-
-<!-- Hyphenation exceptions: space separated list of hyphenated words.
- A hyphen is indicated by the hyphen tag, but you can use the
- hyphen-char defined previously as shortcut. This is in cases
- when the algorithm procedure finds wrong hyphens or you want
- to provide your own hyphenation for some words.
--->
-<!ELEMENT exceptions (#PCDATA|hyphen)* >
-
-<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'
- characters as described before, between any two word characters a digit
- in the range 0 to 9 may be specified. The absence of a digit is equivalent
- to zero. The '.' character is reserved to indicate begining or ending
- of words. -->
-<!ELEMENT patterns (#PCDATA)>
-
-<!-- A "full hyphen" equivalent to TeX's \discretionary
- with pre-break, post-break and no-break attributes.
- To be used in the exceptions list, the hyphen character is not
- automatically added -->
-<!ELEMENT hyphen EMPTY>
-<!ATTLIST hyphen pre CDATA #IMPLIED>
-<!ATTLIST hyphen no CDATA #IMPLIED>
-<!ATTLIST hyphen post CDATA #IMPLIED>
[2/4] lucenenet git commit: Renamed hyphenation to Hyphenation to fix
build and run on case sensitive file systems
Posted by sy...@apache.org.
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/hyphenation.dtd
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/hyphenation.dtd b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/hyphenation.dtd
new file mode 100644
index 0000000..083c2bd
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/hyphenation.dtd
@@ -0,0 +1,68 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+ Copyright 1999-2004 The Apache Software Foundation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->
+
+<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,
+ classes, exceptions?, patterns)>
+
+<!-- Hyphen character to be used in the exception list as shortcut for
+ <hyphen pre-break="-"/>. Defaults to '-'
+-->
+<!ELEMENT hyphen-char EMPTY>
+<!ATTLIST hyphen-char value CDATA #REQUIRED>
+
+<!-- Default minimun length in characters of hyphenated word fragments
+ before and after the line break. For some languages this is not
+ only for aesthetic purposes, wrong hyphens may be generated if this
+ is not accounted for.
+-->
+<!ELEMENT hyphen-min EMPTY>
+<!ATTLIST hyphen-min before CDATA #REQUIRED>
+<!ATTLIST hyphen-min after CDATA #REQUIRED>
+
+<!-- Character equivalent classes: space separated list of character groups, all
+ characters in a group are to be treated equivalent as far as
+ the hyphenation algorithm is concerned. The first character in a group
+ is the group's equivalent character. Patterns should only contain
+ first characters. It also defines word characters, i.e. a word that
+ contains characters not present in any of the classes is not hyphenated.
+-->
+<!ELEMENT classes (#PCDATA)>
+
+<!-- Hyphenation exceptions: space separated list of hyphenated words.
+ A hyphen is indicated by the hyphen tag, but you can use the
+ hyphen-char defined previously as shortcut. This is in cases
+ when the algorithm procedure finds wrong hyphens or you want
+ to provide your own hyphenation for some words.
+-->
+<!ELEMENT exceptions (#PCDATA|hyphen)* >
+
+<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'
+ characters as described before, between any two word characters a digit
+ in the range 0 to 9 may be specified. The absence of a digit is equivalent
+ to zero. The '.' character is reserved to indicate begining or ending
+ of words. -->
+<!ELEMENT patterns (#PCDATA)>
+
+<!-- A "full hyphen" equivalent to TeX's \discretionary
+ with pre-break, post-break and no-break attributes.
+ To be used in the exceptions list, the hyphen character is not
+ automatically added -->
+<!ELEMENT hyphen EMPTY>
+<!ATTLIST hyphen pre CDATA #IMPLIED>
+<!ATTLIST hyphen no CDATA #IMPLIED>
+<!ATTLIST hyphen post CDATA #IMPLIED>
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/ByteVector.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/ByteVector.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/ByteVector.cs
deleted file mode 100644
index 6442d11..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/ByteVector.cs
+++ /dev/null
@@ -1,156 +0,0 @@
-\ufeffnamespace Lucene.Net.Analysis.Compound.Hyphenation
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// This class implements a simple byte vector with access to the underlying
- /// array.
- /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
- /// </summary>
- public class ByteVector
- {
-
- /// <summary>
- /// Capacity increment size
- /// </summary>
- private const int DEFAULT_BLOCK_SIZE = 2048;
-
- private int blockSize;
-
- /// <summary>
- /// The encapsulated array
- /// </summary>
- private sbyte[] array;
-
- /// <summary>
- /// Points to next free item
- /// </summary>
- private int n;
-
- public ByteVector() : this(DEFAULT_BLOCK_SIZE)
- {
- }
-
- public ByteVector(int capacity)
- {
- if (capacity > 0)
- {
- blockSize = capacity;
- }
- else
- {
- blockSize = DEFAULT_BLOCK_SIZE;
- }
- array = new sbyte[blockSize];
- n = 0;
- }
-
- public ByteVector(sbyte[] a)
- {
- blockSize = DEFAULT_BLOCK_SIZE;
- array = a;
- n = 0;
- }
-
- public ByteVector(sbyte[] a, int capacity)
- {
- if (capacity > 0)
- {
- blockSize = capacity;
- }
- else
- {
- blockSize = DEFAULT_BLOCK_SIZE;
- }
- array = a;
- n = 0;
- }
-
- public virtual sbyte[] Array
- {
- get
- {
- return array;
- }
- }
-
- /// <summary>
- /// LUCENENET indexer for .NET
- /// </summary>
- /// <param name="index"></param>
- /// <returns></returns>
- public virtual sbyte this[int index]
- {
- get { return array[index]; }
- set { array[index] = value; }
- }
-
- /// <summary>
- /// return number of items in array
- /// </summary>
- public virtual int Length
- {
- get { return n; }
- }
-
- /// <summary>
- /// returns current capacity of array
- /// </summary>
- public virtual int Capacity
- {
- get { return array.Length; }
- }
-
- //public virtual void Put(int index, sbyte val)
- //{
- // array[index] = val;
- //}
-
- //public virtual sbyte Get(int index)
- //{
- // return array[index];
- //}
-
- /// <summary>
- /// This is to implement memory allocation in the array. Like malloc().
- /// </summary>
- public virtual int Alloc(int size)
- {
- int index = n;
- int len = array.Length;
- if (n + size >= len)
- {
- sbyte[] aux = new sbyte[len + blockSize];
- System.Array.Copy(array, 0, aux, 0, len);
- array = aux;
- }
- n += size;
- return index;
- }
-
- public virtual void TrimToSize()
- {
- if (n < array.Length)
- {
- sbyte[] aux = new sbyte[n];
- System.Array.Copy(array, 0, aux, 0, n);
- array = aux;
- }
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/CharVector.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/CharVector.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/CharVector.cs
deleted file mode 100644
index 26fcea5..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/CharVector.cs
+++ /dev/null
@@ -1,171 +0,0 @@
-\ufeffusing System;
-
-namespace Lucene.Net.Analysis.Compound.Hyphenation
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// This class implements a simple char vector with access to the underlying
- /// array.
- ///
- /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
- /// </summary>
- public class CharVector : ICloneable
- {
-
- /// <summary>
- /// Capacity increment size
- /// </summary>
- private const int DEFAULT_BLOCK_SIZE = 2048;
-
- private int blockSize;
-
- /// <summary>
- /// The encapsulated array
- /// </summary>
- private char[] array;
-
- /// <summary>
- /// Points to next free item
- /// </summary>
- private int n;
-
- public CharVector() : this(DEFAULT_BLOCK_SIZE)
- {
- }
-
- public CharVector(int capacity)
- {
- if (capacity > 0)
- {
- blockSize = capacity;
- }
- else
- {
- blockSize = DEFAULT_BLOCK_SIZE;
- }
- array = new char[blockSize];
- n = 0;
- }
-
- public CharVector(char[] a)
- {
- blockSize = DEFAULT_BLOCK_SIZE;
- array = a;
- n = a.Length;
- }
-
- public CharVector(char[] a, int capacity)
- {
- if (capacity > 0)
- {
- blockSize = capacity;
- }
- else
- {
- blockSize = DEFAULT_BLOCK_SIZE;
- }
- array = a;
- n = a.Length;
- }
-
- /// <summary>
- /// Reset Vector but don't resize or clear elements
- /// </summary>
- public virtual void Clear()
- {
- n = 0;
- }
-
- public virtual object Clone()
- {
- CharVector cv = new CharVector(array, blockSize);
- cv.n = this.n;
- return cv;
- }
-
- public virtual char[] Array
- {
- get
- {
- return array;
- }
- }
-
- /// <summary>
- /// LUCENENET indexer for .NET
- /// </summary>
- /// <param name="index"></param>
- /// <returns></returns>
- public virtual char this[int index]
- {
- get { return array[index]; }
- set { array[index] = value; }
- }
-
- /// <summary>
- /// return number of items in array
- /// </summary>
- public virtual int Length()
- {
- return n;
- }
-
- /// <summary>
- /// returns current capacity of array
- /// </summary>
- public virtual int Capacity
- {
- get { return array.Length; }
- }
-
- //public virtual void Put(int index, char val)
- //{
- // array[index] = val;
- //}
-
- //public virtual char get(int index)
- //{
- // return array[index];
- //}
-
- public virtual int Alloc(int size)
- {
- int index = n;
- int len = array.Length;
- if (n + size >= len)
- {
- char[] aux = new char[len + blockSize];
- System.Array.Copy(array, 0, aux, 0, len);
- array = aux;
- }
- n += size;
- return index;
- }
-
- public virtual void TrimToSize()
- {
- if (n < array.Length)
- {
- char[] aux = new char[n];
- System.Array.Copy(array, 0, aux, 0, n);
- array = aux;
- }
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphen.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphen.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphen.cs
deleted file mode 100644
index 91009b1..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphen.cs
+++ /dev/null
@@ -1,72 +0,0 @@
-\ufeffusing System.Text;
-
-namespace Lucene.Net.Analysis.Compound.Hyphenation
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// This class represents a hyphen. A 'full' hyphen is made of 3 parts: the
- /// pre-break text, post-break text and no-break. If no line-break is generated
- /// at this position, the no-break text is used, otherwise, pre-break and
- /// post-break are used. Typically, pre-break is equal to the hyphen character
- /// and the others are empty. However, this general scheme allows support for
- /// cases in some languages where words change spelling if they're split across
- /// lines, like german's 'backen' which hyphenates 'bak-ken'. BTW, this comes
- /// from TeX.
- ///
- /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
- /// </summary>
- public class Hyphen
- {
- public string preBreak;
-
- public string noBreak;
-
- public string postBreak;
-
- internal Hyphen(string pre, string no, string post)
- {
- preBreak = pre;
- noBreak = no;
- postBreak = post;
- }
-
- internal Hyphen(string pre)
- {
- preBreak = pre;
- noBreak = null;
- postBreak = null;
- }
-
- public override string ToString()
- {
- if (noBreak == null && postBreak == null && preBreak != null && preBreak.Equals("-"))
- {
- return "-";
- }
- StringBuilder res = new StringBuilder("{");
- res.Append(preBreak);
- res.Append("}{");
- res.Append(postBreak);
- res.Append("}{");
- res.Append(noBreak);
- res.Append('}');
- return res.ToString();
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphenation.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphenation.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphenation.cs
deleted file mode 100644
index fdbac29..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphenation.cs
+++ /dev/null
@@ -1,53 +0,0 @@
-\ufeffnamespace Lucene.Net.Analysis.Compound.Hyphenation
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// This class represents a hyphenated word.
- ///
- /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
- /// </summary>
- public class Hyphenation
- {
-
- private readonly int[] hyphenPoints;
-
- /// <summary>
- /// rawWord as made of alternating strings and <seealso cref="Hyphen"/> instances
- /// </summary>
- internal Hyphenation(int[] points)
- {
- hyphenPoints = points;
- }
-
- /// <returns> the number of hyphenation points in the word </returns>
- public virtual int Length
- {
- get { return hyphenPoints.Length; }
- }
-
- /// <returns> the hyphenation points </returns>
- public virtual int[] HyphenationPoints
- {
- get
- {
- return hyphenPoints;
- }
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
deleted file mode 100644
index 287f6f3..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
+++ /dev/null
@@ -1,581 +0,0 @@
-\ufeffusing Lucene.Net.Support;
-using System;
-using System.Collections.Generic;
-using System.IO;
-using System.Text;
-using System.Xml;
-
-namespace Lucene.Net.Analysis.Compound.Hyphenation
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// This tree structure stores the hyphenation patterns in an efficient way for
- /// fast lookup. It provides the provides the method to hyphenate a word.
- ///
- /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
- /// </summary>
- public class HyphenationTree : TernaryTree, IPatternConsumer
- {
-
- /// <summary>
- /// value space: stores the interletter values
- /// </summary>
- protected internal ByteVector vspace;
-
- /// <summary>
- /// This map stores hyphenation exceptions
- /// </summary>
- protected internal IDictionary<string, IList<object>> stoplist;
-
- /// <summary>
- /// This map stores the character classes
- /// </summary>
- protected internal TernaryTree classmap;
-
- /// <summary>
- /// Temporary map to store interletter values on pattern loading.
- /// </summary>
- [NonSerialized]
- private TernaryTree ivalues;
-
- public HyphenationTree()
- {
- stoplist = new HashMap<string, IList<object>>(23); // usually a small table
- classmap = new TernaryTree();
- vspace = new ByteVector();
- vspace.Alloc(1); // this reserves index 0, which we don't use
- }
-
- /// <summary>
- /// Packs the values by storing them in 4 bits, two values into a byte Values
- /// range is from 0 to 9. We use zero as terminator, so we'll add 1 to the
- /// value.
- /// </summary>
- /// <param name="values"> a string of digits from '0' to '9' representing the
- /// interletter values. </param>
- /// <returns> the index into the vspace array where the packed values are stored. </returns>
- protected internal virtual int PackValues(string values)
- {
- int i, n = values.Length;
- int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
- int offset = vspace.Alloc(m);
- sbyte[] va = vspace.Array;
- for (i = 0; i < n; i++)
- {
- int j = i >> 1;
- sbyte v = (sbyte)((values[i] - '0' + 1) & 0x0f);
- if ((i & 1) == 1)
- {
- va[j + offset] = (sbyte)(va[j + offset] | v);
- }
- else
- {
- va[j + offset] = (sbyte)(v << 4); // big endian
- }
- }
- va[m - 1 + offset] = 0; // terminator
- return offset;
- }
-
- protected internal virtual string UnpackValues(int k)
- {
- StringBuilder buf = new StringBuilder();
- sbyte v = vspace[k++];
- while (v != 0)
- {
- char c = (char)(((int)((uint)v >> 4)) - 1 + '0');
- buf.Append(c);
- c = (char)(v & 0x0f);
- if (c == 0)
- {
- break;
- }
- c = (char)(c - 1 + '0');
- buf.Append(c);
- v = vspace[k++];
- }
- return buf.ToString();
- }
-
- /// <summary>
- /// Read hyphenation patterns from an XML file.
- /// </summary>
- /// <param name="f"> the filename </param>
- /// <exception cref="IOException"> In case the parsing fails </exception>
- public virtual void LoadPatterns(string filename)
- {
- LoadPatterns(filename, Encoding.UTF8);
- }
-
- /// <summary>
- /// Read hyphenation patterns from an XML file.
- /// </summary>
- /// <param name="f"> the filename </param>
- /// <exception cref="IOException"> In case the parsing fails </exception>
- public virtual void LoadPatterns(string filename, Encoding encoding)
- {
- var src = new FileStream(filename, FileMode.Open, FileAccess.Read);
- LoadPatterns(src, encoding);
- }
-
- /// <summary>
- /// Read hyphenation patterns from an XML file.
- /// </summary>
- /// <param name="f"> the filename </param>
- /// <exception cref="IOException"> In case the parsing fails </exception>
- public virtual void LoadPatterns(FileInfo f)
- {
- LoadPatterns(f, Encoding.UTF8);
- }
-
- /// <summary>
- /// Read hyphenation patterns from an XML file.
- /// </summary>
- /// <param name="f"> the filename </param>
- /// <exception cref="IOException"> In case the parsing fails </exception>
- public virtual void LoadPatterns(FileInfo f, Encoding encoding)
- {
- var src = new FileStream(f.FullName, FileMode.Open, FileAccess.Read);
- LoadPatterns(src, encoding);
- }
-
- /// <summary>
- /// Read hyphenation patterns from an XML file.
- /// </summary>
- /// <param name="source"> the InputSource for the file </param>
- /// <exception cref="IOException"> In case the parsing fails </exception>
- public virtual void LoadPatterns(Stream source)
- {
- LoadPatterns(source, Encoding.UTF8);
- }
-
- /// <summary>
- /// Read hyphenation patterns from an XML file.
- /// </summary>
- /// <param name="source"> the InputSource for the file </param>
- /// <exception cref="IOException"> In case the parsing fails </exception>
- public virtual void LoadPatterns(Stream source, Encoding encoding)
- {
- // LUCENENET TODO: Create overloads that allow XmlReaderSettings to be passed in.
- using (var reader = XmlReader.Create(new StreamReader(source, encoding), new XmlReaderSettings
- {
- DtdProcessing = DtdProcessing.Parse,
- XmlResolver = new PatternParser.DtdResolver()
- }))
- {
- LoadPatterns(reader);
- }
- }
-
- public virtual void LoadPatterns(XmlReader source)
- {
- PatternParser pp = new PatternParser(this);
- ivalues = new TernaryTree();
-
- pp.Parse(source);
-
- // patterns/values should be now in the tree
- // let's optimize a bit
- TrimToSize();
- vspace.TrimToSize();
- classmap.TrimToSize();
-
- // get rid of the auxiliary map
- ivalues = null;
- }
-
- public virtual string FindPattern(string pat)
- {
- int k = base.Find(pat);
- if (k >= 0)
- {
- return UnpackValues(k);
- }
- return "";
- }
-
- /// <summary>
- /// String compare, returns 0 if equal or t is a substring of s
- /// </summary>
- protected internal virtual int HStrCmp(char[] s, int si, char[] t, int ti)
- {
- for (; s[si] == t[ti]; si++, ti++)
- {
- if (s[si] == 0)
- {
- return 0;
- }
- }
- if (t[ti] == 0)
- {
- return 0;
- }
- return s[si] - t[ti];
- }
-
- protected internal virtual sbyte[] GetValues(int k)
- {
- StringBuilder buf = new StringBuilder();
- sbyte v = vspace[k++];
- while (v != 0)
- {
- char c = (char)((((int)((uint)v >> 4))) - 1);
- buf.Append(c);
- c = (char)(v & 0x0f);
- if (c == 0)
- {
- break;
- }
- c = (char)(c - 1);
- buf.Append(c);
- v = vspace[k++];
- }
- sbyte[] res = new sbyte[buf.Length];
- for (int i = 0; i < res.Length; i++)
- {
- res[i] = (sbyte)buf[i];
- }
- return res;
- }
-
- /// <summary>
- /// <para>
- /// Search for all possible partial matches of word starting at index an update
- /// interletter values. In other words, it does something like:
- /// </para>
- /// <code>
- /// for(i=0; i<patterns.length; i++) {
- /// if ( word.substring(index).startsWidth(patterns[i]) )
- /// update_interletter_values(patterns[i]);
- /// }
- /// </code>
- /// <para>
- /// But it is done in an efficient way since the patterns are stored in a
- /// ternary tree. In fact, this is the whole purpose of having the tree: doing
- /// this search without having to test every single pattern. The number of
- /// patterns for languages such as English range from 4000 to 10000. Thus,
- /// doing thousands of string comparisons for each word to hyphenate would be
- /// really slow without the tree. The tradeoff is memory, but using a ternary
- /// tree instead of a trie, almost halves the the memory used by Lout or TeX.
- /// It's also faster than using a hash table
- /// </para>
- /// </summary>
- /// <param name="word"> null terminated word to match </param>
- /// <param name="index"> start index from word </param>
- /// <param name="il"> interletter values array to update </param>
- protected internal virtual void SearchPatterns(char[] word, int index, sbyte[] il)
- {
- sbyte[] values;
- int i = index;
- char p, q;
- char sp = word[i];
- p = root;
-
- while (p > 0 && p < sc.Length)
- {
- if (sc[p] == 0xFFFF)
- {
- if (HStrCmp(word, i, kv.Array, lo[p]) == 0)
- {
- values = GetValues(eq[p]); // data pointer is in eq[]
- int j = index;
- for (int k = 0; k < values.Length; k++)
- {
- if (j < il.Length && values[k] > il[j])
- {
- il[j] = values[k];
- }
- j++;
- }
- }
- return;
- }
- int d = sp - sc[p];
- if (d == 0)
- {
- if (sp == 0)
- {
- break;
- }
- sp = word[++i];
- p = eq[p];
- q = p;
-
- // look for a pattern ending at this position by searching for
- // the null char ( splitchar == 0 )
- while (q > 0 && q < sc.Length)
- {
- if (sc[q] == 0xFFFF) // stop at compressed branch
- {
- break;
- }
- if (sc[q] == 0)
- {
- values = GetValues(eq[q]);
- int j = index;
- for (int k = 0; k < values.Length; k++)
- {
- if (j < il.Length && values[k] > il[j])
- {
- il[j] = values[k];
- }
- j++;
- }
- break;
- }
- else
- {
- q = lo[q];
-
- /// <summary>
- /// actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but
- /// java chars are unsigned
- /// </summary>
- }
- }
- }
- else
- {
- p = d < 0 ? lo[p] : hi[p];
- }
- }
- }
-
- /// <summary>
- /// Hyphenate word and return a Hyphenation object.
- /// </summary>
- /// <param name="word"> the word to be hyphenated </param>
- /// <param name="remainCharCount"> Minimum number of characters allowed before the
- /// hyphenation point. </param>
- /// <param name="pushCharCount"> Minimum number of characters allowed after the
- /// hyphenation point. </param>
- /// <returns> a <seealso cref="Hyphenation Hyphenation"/> object representing the
- /// hyphenated word or null if word is not hyphenated. </returns>
- public virtual Hyphenation Hyphenate(string word, int remainCharCount, int pushCharCount)
- {
- char[] w = word.ToCharArray();
- return Hyphenate(w, 0, w.Length, remainCharCount, pushCharCount);
- }
-
- /// <summary>
- /// w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n
- /// may be absent, the first n is at offset, the first l is at offset +
- /// iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied
- /// into word. In the first part of the routine len = w.length, in the second
- /// part of the routine len = word.length. Three indices are used: index(w),
- /// the index in w, index(word), the index in word, letterindex(word), the
- /// index in the letter part of word. The following relations exist: index(w) =
- /// offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) =
- /// index(word) - 1 (see first loop). It follows that: index(w) - index(word) =
- /// offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset +
- /// iIgnoreAtBeginning
- /// </summary>
-
- /// <summary>
- /// Hyphenate word and return an array of hyphenation points.
- /// </summary>
- /// <param name="w"> char array that contains the word </param>
- /// <param name="offset"> Offset to first character in word </param>
- /// <param name="len"> Length of word </param>
- /// <param name="remainCharCount"> Minimum number of characters allowed before the
- /// hyphenation point. </param>
- /// <param name="pushCharCount"> Minimum number of characters allowed after the
- /// hyphenation point. </param>
- /// <returns> a <seealso cref="Hyphenation Hyphenation"/> object representing the
- /// hyphenated word or null if word is not hyphenated. </returns>
- public virtual Hyphenation Hyphenate(char[] w, int offset, int len, int remainCharCount, int pushCharCount)
- {
- int i;
- char[] word = new char[len + 3];
-
- // normalize word
- char[] c = new char[2];
- int iIgnoreAtBeginning = 0;
- int iLength = len;
- bool bEndOfLetters = false;
- for (i = 1; i <= len; i++)
- {
- c[0] = w[offset + i - 1];
- int nc = classmap.Find(c, 0);
- if (nc < 0) // found a non-letter character ...
- {
- if (i == (1 + iIgnoreAtBeginning))
- {
- // ... before any letter character
- iIgnoreAtBeginning++;
- }
- else
- {
- // ... after a letter character
- bEndOfLetters = true;
- }
- iLength--;
- }
- else
- {
- if (!bEndOfLetters)
- {
- word[i - iIgnoreAtBeginning] = (char)nc;
- }
- else
- {
- return null;
- }
- }
- }
- len = iLength;
- if (len < (remainCharCount + pushCharCount))
- {
- // word is too short to be hyphenated
- return null;
- }
- int[] result = new int[len + 1];
- int k = 0;
-
- // check exception list first
- string sw = new string(word, 1, len);
- if (stoplist.ContainsKey(sw))
- {
- // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no =
- // null)
- IList<object> hw = stoplist[sw];
- int j = 0;
- for (i = 0; i < hw.Count; i++)
- {
- object o = hw[i];
- // j = index(sw) = letterindex(word)?
- // result[k] = corresponding index(w)
- if (o is string)
- {
- j += ((string)o).Length;
- if (j >= remainCharCount && j < (len - pushCharCount))
- {
- result[k++] = j + iIgnoreAtBeginning;
- }
- }
- }
- }
- else
- {
- // use algorithm to get hyphenation points
- word[0] = '.'; // word start marker
- word[len + 1] = '.'; // word end marker
- word[len + 2] = (char)0; // null terminated
- sbyte[] il = new sbyte[len + 3]; // initialized to zero
- for (i = 0; i < len + 1; i++)
- {
- SearchPatterns(word, i, il);
- }
-
- // hyphenation points are located where interletter value is odd
- // i is letterindex(word),
- // i + 1 is index(word),
- // result[k] = corresponding index(w)
- for (i = 0; i < len; i++)
- {
- if (((il[i + 1] & 1) == 1) && i >= remainCharCount && i <= (len - pushCharCount))
- {
- result[k++] = i + iIgnoreAtBeginning;
- }
- }
- }
-
- if (k > 0)
- {
- // trim result array
- int[] res = new int[k + 2];
- Array.Copy(result, 0, res, 1, k);
- // We add the synthetical hyphenation points
- // at the beginning and end of the word
- res[0] = 0;
- res[k + 1] = len;
- return new Hyphenation(res);
- }
- else
- {
- return null;
- }
- }
-
- /// <summary>
- /// Add a character class to the tree. It is used by
- /// <seealso cref="PatternParser PatternParser"/> as callback to add character classes.
- /// Character classes define the valid word characters for hyphenation. If a
- /// word contains a character not defined in any of the classes, it is not
- /// hyphenated. It also defines a way to normalize the characters in order to
- /// compare them with the stored patterns. Usually pattern files use only lower
- /// case characters, in this case a class for letter 'a', for example, should
- /// be defined as "aA", the first character being the normalization char.
- /// </summary>
- public virtual void AddClass(string chargroup)
- {
- if (chargroup.Length > 0)
- {
- char equivChar = chargroup[0];
- char[] key = new char[2];
- key[1] = (char)0;
- for (int i = 0; i < chargroup.Length; i++)
- {
- key[0] = chargroup[i];
- classmap.Insert(key, 0, equivChar);
- }
- }
- }
-
- /// <summary>
- /// Add an exception to the tree. It is used by
- /// <seealso cref="PatternParser PatternParser"/> class as callback to store the
- /// hyphenation exceptions.
- /// </summary>
- /// <param name="word"> normalized word </param>
- /// <param name="hyphenatedword"> a vector of alternating strings and
- /// <seealso cref="Hyphen hyphen"/> objects. </param>
- public virtual void AddException(string word, List<object> hyphenatedword)
- {
- stoplist[word] = hyphenatedword;
- }
-
- /// <summary>
- /// Add a pattern to the tree. Mainly, to be used by
- /// <seealso cref="PatternParser PatternParser"/> class as callback to add a pattern to
- /// the tree.
- /// </summary>
- /// <param name="pattern"> the hyphenation pattern </param>
- /// <param name="ivalue"> interletter weight values indicating the desirability and
- /// priority of hyphenating at a given point within the pattern. It
- /// should contain only digit characters. (i.e. '0' to '9'). </param>
- public virtual void AddPattern(string pattern, string ivalue)
- {
- int k = ivalues.Find(ivalue);
- if (k <= 0)
- {
- k = PackValues(ivalue);
- ivalues.Insert(ivalue, (char)k);
- }
- Insert(pattern, (char)k);
- }
-
- // public override void printStats(PrintStream @out)
- // {
- //@out.println("Value space size = " + Convert.ToString(vspace.length()));
- //base.printStats(@out);
-
- // }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
deleted file mode 100644
index 069badd..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
+++ /dev/null
@@ -1,54 +0,0 @@
-\ufeffusing System.Collections.Generic;
-
-namespace Lucene.Net.Analysis.Compound.Hyphenation
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// This interface is used to connect the XML pattern file parser to the
- /// hyphenation tree.
- ///
- /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
- /// </summary>
- public interface IPatternConsumer
- {
-
- /// <summary>
- /// Add a character class. A character class defines characters that are
- /// considered equivalent for the purpose of hyphenation (e.g. "aA"). It
- /// usually means to ignore case.
- /// </summary>
- /// <param name="chargroup"> character group </param>
- void AddClass(string chargroup);
-
- /// <summary>
- /// Add a hyphenation exception. An exception replaces the result obtained by
- /// the algorithm for cases for which this fails or the user wants to provide
- /// his own hyphenation. A hyphenatedword is a vector of alternating String's
- /// and <seealso cref="Hyphen"/> instances
- /// </summary>
- void AddException(string word, List<object> hyphenatedword);
-
- /// <summary>
- /// Add hyphenation patterns.
- /// </summary>
- /// <param name="pattern"> the pattern </param>
- /// <param name="values"> interletter values expressed as a string of digit characters. </param>
- void AddPattern(string pattern, string values);
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
deleted file mode 100644
index 8c00d19..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
+++ /dev/null
@@ -1,483 +0,0 @@
-\ufeffusing System;
-using System.Collections.Generic;
-using System.IO;
-using System.Linq;
-using System.Text;
-using System.Xml;
-
-namespace Lucene.Net.Analysis.Compound.Hyphenation
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// A XMLReader document handler to read and parse hyphenation patterns from a XML
- /// file.
- ///
- /// LUCENENET: This class has been refactored from its Java counterpart to use XmlReader rather
- /// than a SAX parser.
- /// </summary>
- public class PatternParser
- {
- internal int currElement;
-
- internal IPatternConsumer consumer;
-
- internal StringBuilder token;
-
- internal List<object> exception;
-
- internal char hyphenChar;
-
- internal string errMsg;
-
- internal const int ELEM_CLASSES = 1;
-
- internal const int ELEM_EXCEPTIONS = 2;
-
- internal const int ELEM_PATTERNS = 3;
-
- internal const int ELEM_HYPHEN = 4;
-
- public PatternParser()
- {
- token = new StringBuilder();
- hyphenChar = '-'; // default
- }
-
- public PatternParser(IPatternConsumer consumer) : this()
- {
- this.consumer = consumer;
- }
-
- public virtual IPatternConsumer Consumer
- {
- set
- {
- this.consumer = value;
- }
- }
-
- /// <summary>
- /// Parses a hyphenation pattern file.
- /// </summary>
- /// <param name="filename"> the filename </param>
- /// <exception cref="IOException"> In case of an exception while parsing </exception>
- public virtual void Parse(string filename)
- {
- // LUCENENET TODO: Create overloads that allow XmlReaderSettings to be passed in.
- using (var src = XmlReader.Create(filename, new XmlReaderSettings
- {
- DtdProcessing = DtdProcessing.Parse,
- XmlResolver = new DtdResolver()
- }))
- {
- Parse(src);
- }
- }
-
- /// <summary>
- /// Parses a hyphenation pattern file.
- /// </summary>
- /// <param name="file"> the pattern file </param>
- public virtual void Parse(FileInfo file)
- {
- Parse(file, Encoding.UTF8);
- }
-
- /// <summary>
- /// Parses a hyphenation pattern file.
- /// </summary>
- /// <param name="file"> the pattern file </param>
- public virtual void Parse(FileInfo file, Encoding encoding)
- {
- using (var src = XmlReader.Create(new StreamReader(file.FullName, encoding), new XmlReaderSettings
- {
- DtdProcessing = DtdProcessing.Parse,
- XmlResolver = new DtdResolver()
- }))
- {
-
- Parse(src);
- }
- }
-
- /// <summary>
- /// Parses a hyphenation pattern file.
- /// </summary>
- /// <param name="file"> the pattern file </param>
- public virtual void Parse(Stream xmlStream)
- {
- using (var src = XmlReader.Create(xmlStream, new XmlReaderSettings
- {
- DtdProcessing = DtdProcessing.Parse,
- XmlResolver = new DtdResolver()
- }))
- {
- Parse(src);
- }
- }
-
- /// <summary>
- /// Parses a hyphenation pattern file.
- /// </summary>
- /// <param name="source"> the InputSource for the file </param>
- /// <exception cref="IOException"> In case of an exception while parsing </exception>
- public virtual void Parse(XmlReader source)
- {
- source.MoveToContent();
- while (source.Read())
- {
- ParseNode(source);
- }
- }
-
- private void ParseNode(XmlReader node)
- {
- string uri, name, raw;
- switch (node.NodeType)
- {
- case XmlNodeType.Element:
-
- // Element start
- uri = node.NamespaceURI;
- name = node.Name;
- bool isEmptyElement = node.IsEmptyElement;
- var attributes = GetAttributes(node);
- raw = string.Empty; // node.ReadOuterXml(); - not used, but was messing with the node pointer
-
- this.StartElement(uri, name, raw, attributes);
- if (isEmptyElement)
- {
- this.EndElement(uri, name, raw);
- }
- break;
-
- case XmlNodeType.Text:
-
- this.Characters(node.Value.ToCharArray(), 0, node.Value.Length);
- break;
-
- case XmlNodeType.EndElement:
- uri = node.NamespaceURI;
- name = node.Name;
- raw = string.Empty; // node.ReadOuterXml(); - not used, but was messing with the node pointer
-
- // Element end
- this.EndElement(uri, name, raw);
- break;
- }
- }
-
- private IDictionary<string, string> GetAttributes(XmlReader node)
- {
- var result = new Dictionary<string, string>();
- if (node.HasAttributes)
- {
- for (int i = 0; i < node.AttributeCount; i++)
- {
- node.MoveToAttribute(i);
- result.Add(node.Name, node.Value);
- }
- }
-
- return result;
- }
-
- protected internal virtual string ReadToken(StringBuilder chars)
- {
- string word;
- bool space = false;
- int i;
- for (i = 0; i < chars.Length; i++)
- {
- if (char.IsWhiteSpace(chars[i]))
- {
- space = true;
- }
- else
- {
- break;
- }
- }
- if (space)
- {
- // chars.delete(0,i);
- for (int countr = i; countr < chars.Length; countr++)
- {
- chars[countr - i] = chars[countr];
- }
- chars.Length = chars.Length - i;
- if (token.Length > 0)
- {
- word = token.ToString();
- token.Length = 0;
- return word;
- }
- }
- space = false;
- for (i = 0; i < chars.Length; i++)
- {
- if (char.IsWhiteSpace(chars[i]))
- {
- space = true;
- break;
- }
- }
- token.Append(chars.ToString(0, i - 0));
- // chars.delete(0,i);
- for (int countr = i; countr < chars.Length; countr++)
- {
- chars[countr - i] = chars[countr];
- }
- chars.Length = chars.Length - i;
- if (space)
- {
- word = token.ToString();
- token.Length = 0;
- return word;
- }
- token.Append(chars.ToString());
- return null;
- }
-
- protected internal static string GetPattern(string word)
- {
- StringBuilder pat = new StringBuilder();
- int len = word.Length;
- for (int i = 0; i < len; i++)
- {
- if (!char.IsDigit(word[i]))
- {
- pat.Append(word[i]);
- }
- }
- return pat.ToString();
- }
-
- protected internal virtual List<object> NormalizeException<T1>(List<T1> ex)
- {
- List<object> res = new List<object>();
- for (int i = 0; i < ex.Count; i++)
- {
- object item = ex[i];
- if (item is string)
- {
- string str = (string)item;
- StringBuilder buf = new StringBuilder();
- for (int j = 0; j < str.Length; j++)
- {
- char c = str[j];
- if (c != hyphenChar)
- {
- buf.Append(c);
- }
- else
- {
- res.Add(buf.ToString());
- buf.Length = 0;
- char[] h = new char[1];
- h[0] = hyphenChar;
- // we use here hyphenChar which is not necessarily
- // the one to be printed
- res.Add(new Hyphen(new string(h), null, null));
- }
- }
- if (buf.Length > 0)
- {
- res.Add(buf.ToString());
- }
- }
- else
- {
- res.Add(item);
- }
- }
- return res;
- }
-
- protected internal virtual string GetExceptionWord<T1>(List<T1> ex)
- {
- StringBuilder res = new StringBuilder();
- for (int i = 0; i < ex.Count; i++)
- {
- object item = ex[i];
- if (item is string)
- {
- res.Append((string)item);
- }
- else
- {
- if (((Hyphen)item).noBreak != null)
- {
- res.Append(((Hyphen)item).noBreak);
- }
- }
- }
- return res.ToString();
- }
-
- protected internal static string GetInterletterValues(string pat)
- {
- StringBuilder il = new StringBuilder();
- string word = pat + "a"; // add dummy letter to serve as sentinel
- int len = word.Length;
- for (int i = 0; i < len; i++)
- {
- char c = word[i];
- if (char.IsDigit(c))
- {
- il.Append(c);
- i++;
- }
- else
- {
- il.Append('0');
- }
- }
- return il.ToString();
- }
-
- /// <summary>
- /// LUCENENET specific helper class to force the DTD file to be read from the embedded resource
- /// rather than from the file system.
- /// </summary>
- internal class DtdResolver : XmlUrlResolver
- {
- public override object GetEntity(Uri absoluteUri, string role, Type ofObjectToReturn)
- {
- string dtdFilename = "hyphenation.dtd";
- if (dtdFilename.Equals(absoluteUri.Segments.LastOrDefault(), StringComparison.OrdinalIgnoreCase))
- {
- var qualifedDtdFilename = string.Concat(GetType().Namespace, ".", dtdFilename);
- return GetType().Assembly.GetManifestResourceStream(qualifedDtdFilename);
- }
-
- return base.GetEntity(absoluteUri, role, ofObjectToReturn);
- }
- }
-
- //
- // ContentHandler methods
- //
-
- /// <seealso cref= org.xml.sax.ContentHandler#startElement(java.lang.String,
- /// java.lang.String, java.lang.String, org.xml.sax.Attributes) </seealso>
- public void StartElement(string uri, string local, string raw, IDictionary<string, string> attrs)
- {
- if (local.Equals("hyphen-char"))
- {
- string h = attrs.ContainsKey("value") ? attrs["value"] : null;
- if (h != null && h.Length == 1)
- {
- hyphenChar = h[0];
- }
- }
- else if (local.Equals("classes"))
- {
- currElement = ELEM_CLASSES;
- }
- else if (local.Equals("patterns"))
- {
- currElement = ELEM_PATTERNS;
- }
- else if (local.Equals("exceptions"))
- {
- currElement = ELEM_EXCEPTIONS;
- exception = new List<object>();
- }
- else if (local.Equals("hyphen"))
- {
- if (token.Length > 0)
- {
- exception.Add(token.ToString());
- }
- exception.Add(new Hyphen(attrs["pre"], attrs["no"], attrs["post"]));
- currElement = ELEM_HYPHEN;
- }
- token.Length = 0;
- }
-
- /// <seealso cref= org.xml.sax.ContentHandler#endElement(java.lang.String,
- /// java.lang.String, java.lang.String) </seealso>
- public void EndElement(string uri, string local, string raw)
- {
- if (token.Length > 0)
- {
- string word = token.ToString();
- switch (currElement)
- {
- case ELEM_CLASSES:
- consumer.AddClass(word);
- break;
- case ELEM_EXCEPTIONS:
- exception.Add(word);
- exception = NormalizeException(exception);
- consumer.AddException(GetExceptionWord(exception), new List<object>(exception));
- break;
- case ELEM_PATTERNS:
- consumer.AddPattern(GetPattern(word), GetInterletterValues(word));
- break;
- case ELEM_HYPHEN:
- // nothing to do
- break;
- }
- if (currElement != ELEM_HYPHEN)
- {
- token.Length = 0;
- }
- }
- if (currElement == ELEM_HYPHEN)
- {
- currElement = ELEM_EXCEPTIONS;
- }
- else
- {
- currElement = 0;
- }
- }
-
- /// <seealso cref= org.xml.sax.ContentHandler#characters(char[], int, int) </seealso>
- public void Characters(char[] ch, int start, int length)
- {
- StringBuilder chars = new StringBuilder(length);
- chars.Append(ch, start, length);
- string word = ReadToken(chars);
- while (word != null)
- {
- // System.out.println("\"" + word + "\"");
- switch (currElement)
- {
- case ELEM_CLASSES:
- consumer.AddClass(word);
- break;
- case ELEM_EXCEPTIONS:
- exception.Add(word);
- exception = NormalizeException(exception);
- consumer.AddException(GetExceptionWord(exception), new List<object>(exception));
- exception.Clear();
- break;
- case ELEM_PATTERNS:
- consumer.AddPattern(GetPattern(word), GetInterletterValues(word));
- break;
- }
- word = ReadToken(chars);
- }
-
- }
- }
-}
\ No newline at end of file
[4/4] lucenenet git commit: Merge remote-tracking branch
'devmk/master'
Posted by sy...@apache.org.
Merge remote-tracking branch 'devmk/master'
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/7214c8a3
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/7214c8a3
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/7214c8a3
Branch: refs/heads/master
Commit: 7214c8a3c991889f0cb024b0b8045304b1b3d6df
Parents: e75dbf6 7ecb752
Author: Itamar Syn-Hershko <it...@code972.com>
Authored: Wed Dec 14 22:00:34 2016 +0200
Committer: Itamar Syn-Hershko <it...@code972.com>
Committed: Wed Dec 14 22:00:34 2016 +0200
----------------------------------------------------------------------
.../Analysis/Compound/Hyphenation/ByteVector.cs | 156 ++++
.../Analysis/Compound/Hyphenation/CharVector.cs | 171 ++++
.../Analysis/Compound/Hyphenation/Hyphen.cs | 72 ++
.../Compound/Hyphenation/Hyphenation.cs | 53 ++
.../Compound/Hyphenation/HyphenationTree.cs | 581 +++++++++++++
.../Compound/Hyphenation/PatternConsumer.cs | 54 ++
.../Compound/Hyphenation/PatternParser.cs | 483 +++++++++++
.../Compound/Hyphenation/TernaryTree.cs | 816 +++++++++++++++++++
.../Compound/Hyphenation/hyphenation.dtd | 68 ++
.../Analysis/Compound/hyphenation/ByteVector.cs | 156 ----
.../Analysis/Compound/hyphenation/CharVector.cs | 171 ----
.../Analysis/Compound/hyphenation/Hyphen.cs | 72 --
.../Compound/hyphenation/Hyphenation.cs | 53 --
.../Compound/hyphenation/HyphenationTree.cs | 581 -------------
.../Compound/hyphenation/PatternConsumer.cs | 54 --
.../Compound/hyphenation/PatternParser.cs | 483 -----------
.../Compound/hyphenation/TernaryTree.cs | 816 -------------------
.../Compound/hyphenation/hyphenation.dtd | 68 --
18 files changed, 2454 insertions(+), 2454 deletions(-)
----------------------------------------------------------------------
[3/4] lucenenet git commit: Renamed hyphenation to Hyphenation to fix
build and run on case sensitive file systems
Posted by sy...@apache.org.
Renamed hyphenation to Hyphenation to fix build and run on case sensitive file systems
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/7ecb7529
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/7ecb7529
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/7ecb7529
Branch: refs/heads/master
Commit: 7ecb7529066c1d95dce83b175dac3322fc4068ab
Parents: 96d38ef
Author: Yaroslav <sl...@gmail.com>
Authored: Mon Nov 28 21:23:25 2016 +0200
Committer: Yaroslav <sl...@gmail.com>
Committed: Mon Nov 28 21:23:25 2016 +0200
----------------------------------------------------------------------
.../Analysis/Compound/Hyphenation/ByteVector.cs | 156 ++++
.../Analysis/Compound/Hyphenation/CharVector.cs | 171 ++++
.../Analysis/Compound/Hyphenation/Hyphen.cs | 72 ++
.../Compound/Hyphenation/Hyphenation.cs | 53 ++
.../Compound/Hyphenation/HyphenationTree.cs | 581 +++++++++++++
.../Compound/Hyphenation/PatternConsumer.cs | 54 ++
.../Compound/Hyphenation/PatternParser.cs | 483 +++++++++++
.../Compound/Hyphenation/TernaryTree.cs | 816 +++++++++++++++++++
.../Compound/Hyphenation/hyphenation.dtd | 68 ++
.../Analysis/Compound/hyphenation/ByteVector.cs | 156 ----
.../Analysis/Compound/hyphenation/CharVector.cs | 171 ----
.../Analysis/Compound/hyphenation/Hyphen.cs | 72 --
.../Compound/hyphenation/Hyphenation.cs | 53 --
.../Compound/hyphenation/HyphenationTree.cs | 581 -------------
.../Compound/hyphenation/PatternConsumer.cs | 54 --
.../Compound/hyphenation/PatternParser.cs | 483 -----------
.../Compound/hyphenation/TernaryTree.cs | 816 -------------------
.../Compound/hyphenation/hyphenation.dtd | 68 --
18 files changed, 2454 insertions(+), 2454 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/ByteVector.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/ByteVector.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/ByteVector.cs
new file mode 100644
index 0000000..6442d11
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/ByteVector.cs
@@ -0,0 +1,156 @@
+\ufeffnamespace Lucene.Net.Analysis.Compound.Hyphenation
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// This class implements a simple byte vector with access to the underlying
+ /// array.
+ /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ /// </summary>
+ public class ByteVector
+ {
+
+ /// <summary>
+ /// Capacity increment size
+ /// </summary>
+ private const int DEFAULT_BLOCK_SIZE = 2048;
+
+ private int blockSize;
+
+ /// <summary>
+ /// The encapsulated array
+ /// </summary>
+ private sbyte[] array;
+
+ /// <summary>
+ /// Points to next free item
+ /// </summary>
+ private int n;
+
+ public ByteVector() : this(DEFAULT_BLOCK_SIZE)
+ {
+ }
+
+ public ByteVector(int capacity)
+ {
+ if (capacity > 0)
+ {
+ blockSize = capacity;
+ }
+ else
+ {
+ blockSize = DEFAULT_BLOCK_SIZE;
+ }
+ array = new sbyte[blockSize];
+ n = 0;
+ }
+
+ public ByteVector(sbyte[] a)
+ {
+ blockSize = DEFAULT_BLOCK_SIZE;
+ array = a;
+ n = 0;
+ }
+
+ public ByteVector(sbyte[] a, int capacity)
+ {
+ if (capacity > 0)
+ {
+ blockSize = capacity;
+ }
+ else
+ {
+ blockSize = DEFAULT_BLOCK_SIZE;
+ }
+ array = a;
+ n = 0;
+ }
+
+ public virtual sbyte[] Array
+ {
+ get
+ {
+ return array;
+ }
+ }
+
+ /// <summary>
+ /// LUCENENET indexer for .NET
+ /// </summary>
+ /// <param name="index"></param>
+ /// <returns></returns>
+ public virtual sbyte this[int index]
+ {
+ get { return array[index]; }
+ set { array[index] = value; }
+ }
+
+ /// <summary>
+ /// return number of items in array
+ /// </summary>
+ public virtual int Length
+ {
+ get { return n; }
+ }
+
+ /// <summary>
+ /// returns current capacity of array
+ /// </summary>
+ public virtual int Capacity
+ {
+ get { return array.Length; }
+ }
+
+ //public virtual void Put(int index, sbyte val)
+ //{
+ // array[index] = val;
+ //}
+
+ //public virtual sbyte Get(int index)
+ //{
+ // return array[index];
+ //}
+
+ /// <summary>
+ /// This is to implement memory allocation in the array. Like malloc().
+ /// </summary>
+ public virtual int Alloc(int size)
+ {
+ int index = n;
+ int len = array.Length;
+ if (n + size >= len)
+ {
+ sbyte[] aux = new sbyte[len + blockSize];
+ System.Array.Copy(array, 0, aux, 0, len);
+ array = aux;
+ }
+ n += size;
+ return index;
+ }
+
+ public virtual void TrimToSize()
+ {
+ if (n < array.Length)
+ {
+ sbyte[] aux = new sbyte[n];
+ System.Array.Copy(array, 0, aux, 0, n);
+ array = aux;
+ }
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/CharVector.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/CharVector.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/CharVector.cs
new file mode 100644
index 0000000..26fcea5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/CharVector.cs
@@ -0,0 +1,171 @@
+\ufeffusing System;
+
+namespace Lucene.Net.Analysis.Compound.Hyphenation
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// This class implements a simple char vector with access to the underlying
+ /// array.
+ ///
+ /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ /// </summary>
+ public class CharVector : ICloneable
+ {
+
+ /// <summary>
+ /// Capacity increment size
+ /// </summary>
+ private const int DEFAULT_BLOCK_SIZE = 2048;
+
+ private int blockSize;
+
+ /// <summary>
+ /// The encapsulated array
+ /// </summary>
+ private char[] array;
+
+ /// <summary>
+ /// Points to next free item
+ /// </summary>
+ private int n;
+
+ public CharVector() : this(DEFAULT_BLOCK_SIZE)
+ {
+ }
+
+ public CharVector(int capacity)
+ {
+ if (capacity > 0)
+ {
+ blockSize = capacity;
+ }
+ else
+ {
+ blockSize = DEFAULT_BLOCK_SIZE;
+ }
+ array = new char[blockSize];
+ n = 0;
+ }
+
+ public CharVector(char[] a)
+ {
+ blockSize = DEFAULT_BLOCK_SIZE;
+ array = a;
+ n = a.Length;
+ }
+
+ public CharVector(char[] a, int capacity)
+ {
+ if (capacity > 0)
+ {
+ blockSize = capacity;
+ }
+ else
+ {
+ blockSize = DEFAULT_BLOCK_SIZE;
+ }
+ array = a;
+ n = a.Length;
+ }
+
+ /// <summary>
+ /// Reset Vector but don't resize or clear elements
+ /// </summary>
+ public virtual void Clear()
+ {
+ n = 0;
+ }
+
+ public virtual object Clone()
+ {
+ CharVector cv = new CharVector(array, blockSize);
+ cv.n = this.n;
+ return cv;
+ }
+
+ public virtual char[] Array
+ {
+ get
+ {
+ return array;
+ }
+ }
+
+ /// <summary>
+ /// LUCENENET indexer for .NET
+ /// </summary>
+ /// <param name="index"></param>
+ /// <returns></returns>
+ public virtual char this[int index]
+ {
+ get { return array[index]; }
+ set { array[index] = value; }
+ }
+
+ /// <summary>
+ /// return number of items in array
+ /// </summary>
+ public virtual int Length()
+ {
+ return n;
+ }
+
+ /// <summary>
+ /// returns current capacity of array
+ /// </summary>
+ public virtual int Capacity
+ {
+ get { return array.Length; }
+ }
+
+ //public virtual void Put(int index, char val)
+ //{
+ // array[index] = val;
+ //}
+
+ //public virtual char get(int index)
+ //{
+ // return array[index];
+ //}
+
+ public virtual int Alloc(int size)
+ {
+ int index = n;
+ int len = array.Length;
+ if (n + size >= len)
+ {
+ char[] aux = new char[len + blockSize];
+ System.Array.Copy(array, 0, aux, 0, len);
+ array = aux;
+ }
+ n += size;
+ return index;
+ }
+
+ public virtual void TrimToSize()
+ {
+ if (n < array.Length)
+ {
+ char[] aux = new char[n];
+ System.Array.Copy(array, 0, aux, 0, n);
+ array = aux;
+ }
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphen.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphen.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphen.cs
new file mode 100644
index 0000000..91009b1
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphen.cs
@@ -0,0 +1,72 @@
+\ufeffusing System.Text;
+
+namespace Lucene.Net.Analysis.Compound.Hyphenation
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// This class represents a hyphen. A 'full' hyphen is made of 3 parts: the
+ /// pre-break text, post-break text and no-break. If no line-break is generated
+ /// at this position, the no-break text is used, otherwise, pre-break and
+ /// post-break are used. Typically, pre-break is equal to the hyphen character
+ /// and the others are empty. However, this general scheme allows support for
+ /// cases in some languages where words change spelling if they're split across
+ /// lines, like german's 'backen' which hyphenates 'bak-ken'. BTW, this comes
+ /// from TeX.
+ ///
+ /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ /// </summary>
+ public class Hyphen
+ {
+ public string preBreak;
+
+ public string noBreak;
+
+ public string postBreak;
+
+ internal Hyphen(string pre, string no, string post)
+ {
+ preBreak = pre;
+ noBreak = no;
+ postBreak = post;
+ }
+
+ internal Hyphen(string pre)
+ {
+ preBreak = pre;
+ noBreak = null;
+ postBreak = null;
+ }
+
+ public override string ToString()
+ {
+ if (noBreak == null && postBreak == null && preBreak != null && preBreak.Equals("-"))
+ {
+ return "-";
+ }
+ StringBuilder res = new StringBuilder("{");
+ res.Append(preBreak);
+ res.Append("}{");
+ res.Append(postBreak);
+ res.Append("}{");
+ res.Append(noBreak);
+ res.Append('}');
+ return res.ToString();
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphenation.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphenation.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphenation.cs
new file mode 100644
index 0000000..fdbac29
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphenation.cs
@@ -0,0 +1,53 @@
+\ufeffnamespace Lucene.Net.Analysis.Compound.Hyphenation
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// This class represents a hyphenated word.
+ ///
+ /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ /// </summary>
+ public class Hyphenation
+ {
+
+ private readonly int[] hyphenPoints;
+
+ /// <summary>
+ /// rawWord as made of alternating strings and <seealso cref="Hyphen"/> instances
+ /// </summary>
+ internal Hyphenation(int[] points)
+ {
+ hyphenPoints = points;
+ }
+
+ /// <returns> the number of hyphenation points in the word </returns>
+ public virtual int Length
+ {
+ get { return hyphenPoints.Length; }
+ }
+
+ /// <returns> the hyphenation points </returns>
+ public virtual int[] HyphenationPoints
+ {
+ get
+ {
+ return hyphenPoints;
+ }
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/HyphenationTree.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/HyphenationTree.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/HyphenationTree.cs
new file mode 100644
index 0000000..287f6f3
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/HyphenationTree.cs
@@ -0,0 +1,581 @@
+\ufeffusing Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+using System.Xml;
+
+namespace Lucene.Net.Analysis.Compound.Hyphenation
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// This tree structure stores the hyphenation patterns in an efficient way for
+ /// fast lookup. It provides the provides the method to hyphenate a word.
+ ///
+ /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ /// </summary>
+ public class HyphenationTree : TernaryTree, IPatternConsumer
+ {
+
+ /// <summary>
+ /// value space: stores the interletter values
+ /// </summary>
+ protected internal ByteVector vspace;
+
+ /// <summary>
+ /// This map stores hyphenation exceptions
+ /// </summary>
+ protected internal IDictionary<string, IList<object>> stoplist;
+
+ /// <summary>
+ /// This map stores the character classes
+ /// </summary>
+ protected internal TernaryTree classmap;
+
+ /// <summary>
+ /// Temporary map to store interletter values on pattern loading.
+ /// </summary>
+ [NonSerialized]
+ private TernaryTree ivalues;
+
+ public HyphenationTree()
+ {
+ stoplist = new HashMap<string, IList<object>>(23); // usually a small table
+ classmap = new TernaryTree();
+ vspace = new ByteVector();
+ vspace.Alloc(1); // this reserves index 0, which we don't use
+ }
+
+ /// <summary>
+ /// Packs the values by storing them in 4 bits, two values into a byte Values
+ /// range is from 0 to 9. We use zero as terminator, so we'll add 1 to the
+ /// value.
+ /// </summary>
+ /// <param name="values"> a string of digits from '0' to '9' representing the
+ /// interletter values. </param>
+ /// <returns> the index into the vspace array where the packed values are stored. </returns>
+ protected internal virtual int PackValues(string values)
+ {
+ int i, n = values.Length;
+ int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
+ int offset = vspace.Alloc(m);
+ sbyte[] va = vspace.Array;
+ for (i = 0; i < n; i++)
+ {
+ int j = i >> 1;
+ sbyte v = (sbyte)((values[i] - '0' + 1) & 0x0f);
+ if ((i & 1) == 1)
+ {
+ va[j + offset] = (sbyte)(va[j + offset] | v);
+ }
+ else
+ {
+ va[j + offset] = (sbyte)(v << 4); // big endian
+ }
+ }
+ va[m - 1 + offset] = 0; // terminator
+ return offset;
+ }
+
+ protected internal virtual string UnpackValues(int k)
+ {
+ StringBuilder buf = new StringBuilder();
+ sbyte v = vspace[k++];
+ while (v != 0)
+ {
+ char c = (char)(((int)((uint)v >> 4)) - 1 + '0');
+ buf.Append(c);
+ c = (char)(v & 0x0f);
+ if (c == 0)
+ {
+ break;
+ }
+ c = (char)(c - 1 + '0');
+ buf.Append(c);
+ v = vspace[k++];
+ }
+ return buf.ToString();
+ }
+
+ /// <summary>
+ /// Read hyphenation patterns from an XML file.
+ /// </summary>
+ /// <param name="f"> the filename </param>
+ /// <exception cref="IOException"> In case the parsing fails </exception>
+ public virtual void LoadPatterns(string filename)
+ {
+ LoadPatterns(filename, Encoding.UTF8);
+ }
+
+ /// <summary>
+ /// Read hyphenation patterns from an XML file.
+ /// </summary>
+ /// <param name="f"> the filename </param>
+ /// <exception cref="IOException"> In case the parsing fails </exception>
+ public virtual void LoadPatterns(string filename, Encoding encoding)
+ {
+ var src = new FileStream(filename, FileMode.Open, FileAccess.Read);
+ LoadPatterns(src, encoding);
+ }
+
+ /// <summary>
+ /// Read hyphenation patterns from an XML file.
+ /// </summary>
+ /// <param name="f"> the filename </param>
+ /// <exception cref="IOException"> In case the parsing fails </exception>
+ public virtual void LoadPatterns(FileInfo f)
+ {
+ LoadPatterns(f, Encoding.UTF8);
+ }
+
+ /// <summary>
+ /// Read hyphenation patterns from an XML file.
+ /// </summary>
+ /// <param name="f"> the filename </param>
+ /// <exception cref="IOException"> In case the parsing fails </exception>
+ public virtual void LoadPatterns(FileInfo f, Encoding encoding)
+ {
+ var src = new FileStream(f.FullName, FileMode.Open, FileAccess.Read);
+ LoadPatterns(src, encoding);
+ }
+
+ /// <summary>
+ /// Read hyphenation patterns from an XML file.
+ /// </summary>
+ /// <param name="source"> the InputSource for the file </param>
+ /// <exception cref="IOException"> In case the parsing fails </exception>
+ public virtual void LoadPatterns(Stream source)
+ {
+ LoadPatterns(source, Encoding.UTF8);
+ }
+
+ /// <summary>
+ /// Read hyphenation patterns from an XML file.
+ /// </summary>
+ /// <param name="source"> the InputSource for the file </param>
+ /// <exception cref="IOException"> In case the parsing fails </exception>
+ public virtual void LoadPatterns(Stream source, Encoding encoding)
+ {
+ // LUCENENET TODO: Create overloads that allow XmlReaderSettings to be passed in.
+ using (var reader = XmlReader.Create(new StreamReader(source, encoding), new XmlReaderSettings
+ {
+ DtdProcessing = DtdProcessing.Parse,
+ XmlResolver = new PatternParser.DtdResolver()
+ }))
+ {
+ LoadPatterns(reader);
+ }
+ }
+
+ public virtual void LoadPatterns(XmlReader source)
+ {
+ PatternParser pp = new PatternParser(this);
+ ivalues = new TernaryTree();
+
+ pp.Parse(source);
+
+ // patterns/values should be now in the tree
+ // let's optimize a bit
+ TrimToSize();
+ vspace.TrimToSize();
+ classmap.TrimToSize();
+
+ // get rid of the auxiliary map
+ ivalues = null;
+ }
+
+ public virtual string FindPattern(string pat)
+ {
+ int k = base.Find(pat);
+ if (k >= 0)
+ {
+ return UnpackValues(k);
+ }
+ return "";
+ }
+
+ /// <summary>
+ /// String compare, returns 0 if equal or t is a substring of s
+ /// </summary>
+ protected internal virtual int HStrCmp(char[] s, int si, char[] t, int ti)
+ {
+ for (; s[si] == t[ti]; si++, ti++)
+ {
+ if (s[si] == 0)
+ {
+ return 0;
+ }
+ }
+ if (t[ti] == 0)
+ {
+ return 0;
+ }
+ return s[si] - t[ti];
+ }
+
+ protected internal virtual sbyte[] GetValues(int k)
+ {
+ StringBuilder buf = new StringBuilder();
+ sbyte v = vspace[k++];
+ while (v != 0)
+ {
+ char c = (char)((((int)((uint)v >> 4))) - 1);
+ buf.Append(c);
+ c = (char)(v & 0x0f);
+ if (c == 0)
+ {
+ break;
+ }
+ c = (char)(c - 1);
+ buf.Append(c);
+ v = vspace[k++];
+ }
+ sbyte[] res = new sbyte[buf.Length];
+ for (int i = 0; i < res.Length; i++)
+ {
+ res[i] = (sbyte)buf[i];
+ }
+ return res;
+ }
+
+ /// <summary>
+ /// <para>
+ /// Search for all possible partial matches of word starting at index an update
+ /// interletter values. In other words, it does something like:
+ /// </para>
+ /// <code>
+ /// for(i=0; i<patterns.length; i++) {
+ /// if ( word.substring(index).startsWidth(patterns[i]) )
+ /// update_interletter_values(patterns[i]);
+ /// }
+ /// </code>
+ /// <para>
+ /// But it is done in an efficient way since the patterns are stored in a
+ /// ternary tree. In fact, this is the whole purpose of having the tree: doing
+ /// this search without having to test every single pattern. The number of
+ /// patterns for languages such as English range from 4000 to 10000. Thus,
+ /// doing thousands of string comparisons for each word to hyphenate would be
+ /// really slow without the tree. The tradeoff is memory, but using a ternary
+ /// tree instead of a trie, almost halves the the memory used by Lout or TeX.
+ /// It's also faster than using a hash table
+ /// </para>
+ /// </summary>
+ /// <param name="word"> null terminated word to match </param>
+ /// <param name="index"> start index from word </param>
+ /// <param name="il"> interletter values array to update </param>
+ protected internal virtual void SearchPatterns(char[] word, int index, sbyte[] il)
+ {
+ sbyte[] values;
+ int i = index;
+ char p, q;
+ char sp = word[i];
+ p = root;
+
+ while (p > 0 && p < sc.Length)
+ {
+ if (sc[p] == 0xFFFF)
+ {
+ if (HStrCmp(word, i, kv.Array, lo[p]) == 0)
+ {
+ values = GetValues(eq[p]); // data pointer is in eq[]
+ int j = index;
+ for (int k = 0; k < values.Length; k++)
+ {
+ if (j < il.Length && values[k] > il[j])
+ {
+ il[j] = values[k];
+ }
+ j++;
+ }
+ }
+ return;
+ }
+ int d = sp - sc[p];
+ if (d == 0)
+ {
+ if (sp == 0)
+ {
+ break;
+ }
+ sp = word[++i];
+ p = eq[p];
+ q = p;
+
+ // look for a pattern ending at this position by searching for
+ // the null char ( splitchar == 0 )
+ while (q > 0 && q < sc.Length)
+ {
+ if (sc[q] == 0xFFFF) // stop at compressed branch
+ {
+ break;
+ }
+ if (sc[q] == 0)
+ {
+ values = GetValues(eq[q]);
+ int j = index;
+ for (int k = 0; k < values.Length; k++)
+ {
+ if (j < il.Length && values[k] > il[j])
+ {
+ il[j] = values[k];
+ }
+ j++;
+ }
+ break;
+ }
+ else
+ {
+ q = lo[q];
+
+ /// <summary>
+ /// actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but
+ /// java chars are unsigned
+ /// </summary>
+ }
+ }
+ }
+ else
+ {
+ p = d < 0 ? lo[p] : hi[p];
+ }
+ }
+ }
+
+ /// <summary>
+ /// Hyphenate word and return a Hyphenation object.
+ /// </summary>
+ /// <param name="word"> the word to be hyphenated </param>
+ /// <param name="remainCharCount"> Minimum number of characters allowed before the
+ /// hyphenation point. </param>
+ /// <param name="pushCharCount"> Minimum number of characters allowed after the
+ /// hyphenation point. </param>
+ /// <returns> a <seealso cref="Hyphenation Hyphenation"/> object representing the
+ /// hyphenated word or null if word is not hyphenated. </returns>
+ public virtual Hyphenation Hyphenate(string word, int remainCharCount, int pushCharCount)
+ {
+ char[] w = word.ToCharArray();
+ return Hyphenate(w, 0, w.Length, remainCharCount, pushCharCount);
+ }
+
+ /// <summary>
+ /// w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n
+ /// may be absent, the first n is at offset, the first l is at offset +
+ /// iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied
+ /// into word. In the first part of the routine len = w.length, in the second
+ /// part of the routine len = word.length. Three indices are used: index(w),
+ /// the index in w, index(word), the index in word, letterindex(word), the
+ /// index in the letter part of word. The following relations exist: index(w) =
+ /// offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) =
+ /// index(word) - 1 (see first loop). It follows that: index(w) - index(word) =
+ /// offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset +
+ /// iIgnoreAtBeginning
+ /// </summary>
+
+ /// <summary>
+ /// Hyphenate word and return an array of hyphenation points.
+ /// </summary>
+ /// <param name="w"> char array that contains the word </param>
+ /// <param name="offset"> Offset to first character in word </param>
+ /// <param name="len"> Length of word </param>
+ /// <param name="remainCharCount"> Minimum number of characters allowed before the
+ /// hyphenation point. </param>
+ /// <param name="pushCharCount"> Minimum number of characters allowed after the
+ /// hyphenation point. </param>
+ /// <returns> a <seealso cref="Hyphenation Hyphenation"/> object representing the
+ /// hyphenated word or null if word is not hyphenated. </returns>
+ public virtual Hyphenation Hyphenate(char[] w, int offset, int len, int remainCharCount, int pushCharCount)
+ {
+ int i;
+ char[] word = new char[len + 3];
+
+ // normalize word
+ char[] c = new char[2];
+ int iIgnoreAtBeginning = 0;
+ int iLength = len;
+ bool bEndOfLetters = false;
+ for (i = 1; i <= len; i++)
+ {
+ c[0] = w[offset + i - 1];
+ int nc = classmap.Find(c, 0);
+ if (nc < 0) // found a non-letter character ...
+ {
+ if (i == (1 + iIgnoreAtBeginning))
+ {
+ // ... before any letter character
+ iIgnoreAtBeginning++;
+ }
+ else
+ {
+ // ... after a letter character
+ bEndOfLetters = true;
+ }
+ iLength--;
+ }
+ else
+ {
+ if (!bEndOfLetters)
+ {
+ word[i - iIgnoreAtBeginning] = (char)nc;
+ }
+ else
+ {
+ return null;
+ }
+ }
+ }
+ len = iLength;
+ if (len < (remainCharCount + pushCharCount))
+ {
+ // word is too short to be hyphenated
+ return null;
+ }
+ int[] result = new int[len + 1];
+ int k = 0;
+
+ // check exception list first
+ string sw = new string(word, 1, len);
+ if (stoplist.ContainsKey(sw))
+ {
+ // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no =
+ // null)
+ IList<object> hw = stoplist[sw];
+ int j = 0;
+ for (i = 0; i < hw.Count; i++)
+ {
+ object o = hw[i];
+ // j = index(sw) = letterindex(word)?
+ // result[k] = corresponding index(w)
+ if (o is string)
+ {
+ j += ((string)o).Length;
+ if (j >= remainCharCount && j < (len - pushCharCount))
+ {
+ result[k++] = j + iIgnoreAtBeginning;
+ }
+ }
+ }
+ }
+ else
+ {
+ // use algorithm to get hyphenation points
+ word[0] = '.'; // word start marker
+ word[len + 1] = '.'; // word end marker
+ word[len + 2] = (char)0; // null terminated
+ sbyte[] il = new sbyte[len + 3]; // initialized to zero
+ for (i = 0; i < len + 1; i++)
+ {
+ SearchPatterns(word, i, il);
+ }
+
+ // hyphenation points are located where interletter value is odd
+ // i is letterindex(word),
+ // i + 1 is index(word),
+ // result[k] = corresponding index(w)
+ for (i = 0; i < len; i++)
+ {
+ if (((il[i + 1] & 1) == 1) && i >= remainCharCount && i <= (len - pushCharCount))
+ {
+ result[k++] = i + iIgnoreAtBeginning;
+ }
+ }
+ }
+
+ if (k > 0)
+ {
+ // trim result array
+ int[] res = new int[k + 2];
+ Array.Copy(result, 0, res, 1, k);
+ // We add the synthetical hyphenation points
+ // at the beginning and end of the word
+ res[0] = 0;
+ res[k + 1] = len;
+ return new Hyphenation(res);
+ }
+ else
+ {
+ return null;
+ }
+ }
+
+ /// <summary>
+ /// Add a character class to the tree. It is used by
+ /// <seealso cref="PatternParser PatternParser"/> as callback to add character classes.
+ /// Character classes define the valid word characters for hyphenation. If a
+ /// word contains a character not defined in any of the classes, it is not
+ /// hyphenated. It also defines a way to normalize the characters in order to
+ /// compare them with the stored patterns. Usually pattern files use only lower
+ /// case characters, in this case a class for letter 'a', for example, should
+ /// be defined as "aA", the first character being the normalization char.
+ /// </summary>
+ public virtual void AddClass(string chargroup)
+ {
+ if (chargroup.Length > 0)
+ {
+ char equivChar = chargroup[0];
+ char[] key = new char[2];
+ key[1] = (char)0;
+ for (int i = 0; i < chargroup.Length; i++)
+ {
+ key[0] = chargroup[i];
+ classmap.Insert(key, 0, equivChar);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Add an exception to the tree. It is used by
+ /// <seealso cref="PatternParser PatternParser"/> class as callback to store the
+ /// hyphenation exceptions.
+ /// </summary>
+ /// <param name="word"> normalized word </param>
+ /// <param name="hyphenatedword"> a vector of alternating strings and
+ /// <seealso cref="Hyphen hyphen"/> objects. </param>
+ public virtual void AddException(string word, List<object> hyphenatedword)
+ {
+ stoplist[word] = hyphenatedword;
+ }
+
+ /// <summary>
+ /// Add a pattern to the tree. Mainly, to be used by
+ /// <seealso cref="PatternParser PatternParser"/> class as callback to add a pattern to
+ /// the tree.
+ /// </summary>
+ /// <param name="pattern"> the hyphenation pattern </param>
+ /// <param name="ivalue"> interletter weight values indicating the desirability and
+ /// priority of hyphenating at a given point within the pattern. It
+ /// should contain only digit characters. (i.e. '0' to '9'). </param>
+ public virtual void AddPattern(string pattern, string ivalue)
+ {
+ int k = ivalues.Find(ivalue);
+ if (k <= 0)
+ {
+ k = PackValues(ivalue);
+ ivalues.Insert(ivalue, (char)k);
+ }
+ Insert(pattern, (char)k);
+ }
+
+ // public override void printStats(PrintStream @out)
+ // {
+ //@out.println("Value space size = " + Convert.ToString(vspace.length()));
+ //base.printStats(@out);
+
+ // }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternConsumer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternConsumer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternConsumer.cs
new file mode 100644
index 0000000..069badd
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternConsumer.cs
@@ -0,0 +1,54 @@
+\ufeffusing System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Compound.Hyphenation
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// This interface is used to connect the XML pattern file parser to the
+ /// hyphenation tree.
+ ///
+ /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ /// </summary>
+ public interface IPatternConsumer
+ {
+
+ /// <summary>
+ /// Add a character class. A character class defines characters that are
+ /// considered equivalent for the purpose of hyphenation (e.g. "aA"). It
+ /// usually means to ignore case.
+ /// </summary>
+ /// <param name="chargroup"> character group </param>
+ void AddClass(string chargroup);
+
+ /// <summary>
+ /// Add a hyphenation exception. An exception replaces the result obtained by
+ /// the algorithm for cases for which this fails or the user wants to provide
+ /// his own hyphenation. A hyphenatedword is a vector of alternating String's
+ /// and <seealso cref="Hyphen"/> instances
+ /// </summary>
+ void AddException(string word, List<object> hyphenatedword);
+
+ /// <summary>
+ /// Add hyphenation patterns.
+ /// </summary>
+ /// <param name="pattern"> the pattern </param>
+ /// <param name="values"> interletter values expressed as a string of digit characters. </param>
+ void AddPattern(string pattern, string values);
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternParser.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternParser.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternParser.cs
new file mode 100644
index 0000000..8c00d19
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternParser.cs
@@ -0,0 +1,483 @@
+\ufeffusing System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Xml;
+
+namespace Lucene.Net.Analysis.Compound.Hyphenation
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// A XMLReader document handler to read and parse hyphenation patterns from a XML
+ /// file.
+ ///
+ /// LUCENENET: This class has been refactored from its Java counterpart to use XmlReader rather
+ /// than a SAX parser.
+ /// </summary>
+ public class PatternParser
+ {
+ internal int currElement;
+
+ internal IPatternConsumer consumer;
+
+ internal StringBuilder token;
+
+ internal List<object> exception;
+
+ internal char hyphenChar;
+
+ internal string errMsg;
+
+ internal const int ELEM_CLASSES = 1;
+
+ internal const int ELEM_EXCEPTIONS = 2;
+
+ internal const int ELEM_PATTERNS = 3;
+
+ internal const int ELEM_HYPHEN = 4;
+
+ public PatternParser()
+ {
+ token = new StringBuilder();
+ hyphenChar = '-'; // default
+ }
+
+ public PatternParser(IPatternConsumer consumer) : this()
+ {
+ this.consumer = consumer;
+ }
+
+ public virtual IPatternConsumer Consumer
+ {
+ set
+ {
+ this.consumer = value;
+ }
+ }
+
+ /// <summary>
+ /// Parses a hyphenation pattern file.
+ /// </summary>
+ /// <param name="filename"> the filename </param>
+ /// <exception cref="IOException"> In case of an exception while parsing </exception>
+ public virtual void Parse(string filename)
+ {
+ // LUCENENET TODO: Create overloads that allow XmlReaderSettings to be passed in.
+ using (var src = XmlReader.Create(filename, new XmlReaderSettings
+ {
+ DtdProcessing = DtdProcessing.Parse,
+ XmlResolver = new DtdResolver()
+ }))
+ {
+ Parse(src);
+ }
+ }
+
+ /// <summary>
+ /// Parses a hyphenation pattern file.
+ /// </summary>
+ /// <param name="file"> the pattern file </param>
+ public virtual void Parse(FileInfo file)
+ {
+ Parse(file, Encoding.UTF8);
+ }
+
+ /// <summary>
+ /// Parses a hyphenation pattern file.
+ /// </summary>
+ /// <param name="file"> the pattern file </param>
+ public virtual void Parse(FileInfo file, Encoding encoding)
+ {
+ using (var src = XmlReader.Create(new StreamReader(file.FullName, encoding), new XmlReaderSettings
+ {
+ DtdProcessing = DtdProcessing.Parse,
+ XmlResolver = new DtdResolver()
+ }))
+ {
+
+ Parse(src);
+ }
+ }
+
+ /// <summary>
+ /// Parses a hyphenation pattern file.
+ /// </summary>
+ /// <param name="file"> the pattern file </param>
+ public virtual void Parse(Stream xmlStream)
+ {
+ using (var src = XmlReader.Create(xmlStream, new XmlReaderSettings
+ {
+ DtdProcessing = DtdProcessing.Parse,
+ XmlResolver = new DtdResolver()
+ }))
+ {
+ Parse(src);
+ }
+ }
+
+ /// <summary>
+ /// Parses a hyphenation pattern file.
+ /// </summary>
+ /// <param name="source"> the InputSource for the file </param>
+ /// <exception cref="IOException"> In case of an exception while parsing </exception>
+ public virtual void Parse(XmlReader source)
+ {
+ source.MoveToContent();
+ while (source.Read())
+ {
+ ParseNode(source);
+ }
+ }
+
+ private void ParseNode(XmlReader node)
+ {
+ string uri, name, raw;
+ switch (node.NodeType)
+ {
+ case XmlNodeType.Element:
+
+ // Element start
+ uri = node.NamespaceURI;
+ name = node.Name;
+ bool isEmptyElement = node.IsEmptyElement;
+ var attributes = GetAttributes(node);
+ raw = string.Empty; // node.ReadOuterXml(); - not used, but was messing with the node pointer
+
+ this.StartElement(uri, name, raw, attributes);
+ if (isEmptyElement)
+ {
+ this.EndElement(uri, name, raw);
+ }
+ break;
+
+ case XmlNodeType.Text:
+
+ this.Characters(node.Value.ToCharArray(), 0, node.Value.Length);
+ break;
+
+ case XmlNodeType.EndElement:
+ uri = node.NamespaceURI;
+ name = node.Name;
+ raw = string.Empty; // node.ReadOuterXml(); - not used, but was messing with the node pointer
+
+ // Element end
+ this.EndElement(uri, name, raw);
+ break;
+ }
+ }
+
+ private IDictionary<string, string> GetAttributes(XmlReader node)
+ {
+ var result = new Dictionary<string, string>();
+ if (node.HasAttributes)
+ {
+ for (int i = 0; i < node.AttributeCount; i++)
+ {
+ node.MoveToAttribute(i);
+ result.Add(node.Name, node.Value);
+ }
+ }
+
+ return result;
+ }
+
+ protected internal virtual string ReadToken(StringBuilder chars)
+ {
+ string word;
+ bool space = false;
+ int i;
+ for (i = 0; i < chars.Length; i++)
+ {
+ if (char.IsWhiteSpace(chars[i]))
+ {
+ space = true;
+ }
+ else
+ {
+ break;
+ }
+ }
+ if (space)
+ {
+ // chars.delete(0,i);
+ for (int countr = i; countr < chars.Length; countr++)
+ {
+ chars[countr - i] = chars[countr];
+ }
+ chars.Length = chars.Length - i;
+ if (token.Length > 0)
+ {
+ word = token.ToString();
+ token.Length = 0;
+ return word;
+ }
+ }
+ space = false;
+ for (i = 0; i < chars.Length; i++)
+ {
+ if (char.IsWhiteSpace(chars[i]))
+ {
+ space = true;
+ break;
+ }
+ }
+ token.Append(chars.ToString(0, i - 0));
+ // chars.delete(0,i);
+ for (int countr = i; countr < chars.Length; countr++)
+ {
+ chars[countr - i] = chars[countr];
+ }
+ chars.Length = chars.Length - i;
+ if (space)
+ {
+ word = token.ToString();
+ token.Length = 0;
+ return word;
+ }
+ token.Append(chars.ToString());
+ return null;
+ }
+
+ protected internal static string GetPattern(string word)
+ {
+ StringBuilder pat = new StringBuilder();
+ int len = word.Length;
+ for (int i = 0; i < len; i++)
+ {
+ if (!char.IsDigit(word[i]))
+ {
+ pat.Append(word[i]);
+ }
+ }
+ return pat.ToString();
+ }
+
+ protected internal virtual List<object> NormalizeException<T1>(List<T1> ex)
+ {
+ List<object> res = new List<object>();
+ for (int i = 0; i < ex.Count; i++)
+ {
+ object item = ex[i];
+ if (item is string)
+ {
+ string str = (string)item;
+ StringBuilder buf = new StringBuilder();
+ for (int j = 0; j < str.Length; j++)
+ {
+ char c = str[j];
+ if (c != hyphenChar)
+ {
+ buf.Append(c);
+ }
+ else
+ {
+ res.Add(buf.ToString());
+ buf.Length = 0;
+ char[] h = new char[1];
+ h[0] = hyphenChar;
+ // we use here hyphenChar which is not necessarily
+ // the one to be printed
+ res.Add(new Hyphen(new string(h), null, null));
+ }
+ }
+ if (buf.Length > 0)
+ {
+ res.Add(buf.ToString());
+ }
+ }
+ else
+ {
+ res.Add(item);
+ }
+ }
+ return res;
+ }
+
+ protected internal virtual string GetExceptionWord<T1>(List<T1> ex)
+ {
+ StringBuilder res = new StringBuilder();
+ for (int i = 0; i < ex.Count; i++)
+ {
+ object item = ex[i];
+ if (item is string)
+ {
+ res.Append((string)item);
+ }
+ else
+ {
+ if (((Hyphen)item).noBreak != null)
+ {
+ res.Append(((Hyphen)item).noBreak);
+ }
+ }
+ }
+ return res.ToString();
+ }
+
+ protected internal static string GetInterletterValues(string pat)
+ {
+ StringBuilder il = new StringBuilder();
+ string word = pat + "a"; // add dummy letter to serve as sentinel
+ int len = word.Length;
+ for (int i = 0; i < len; i++)
+ {
+ char c = word[i];
+ if (char.IsDigit(c))
+ {
+ il.Append(c);
+ i++;
+ }
+ else
+ {
+ il.Append('0');
+ }
+ }
+ return il.ToString();
+ }
+
+ /// <summary>
+ /// LUCENENET specific helper class to force the DTD file to be read from the embedded resource
+ /// rather than from the file system.
+ /// </summary>
+ internal class DtdResolver : XmlUrlResolver
+ {
+ public override object GetEntity(Uri absoluteUri, string role, Type ofObjectToReturn)
+ {
+ string dtdFilename = "hyphenation.dtd";
+ if (dtdFilename.Equals(absoluteUri.Segments.LastOrDefault(), StringComparison.OrdinalIgnoreCase))
+ {
+ var qualifedDtdFilename = string.Concat(GetType().Namespace, ".", dtdFilename);
+ return GetType().Assembly.GetManifestResourceStream(qualifedDtdFilename);
+ }
+
+ return base.GetEntity(absoluteUri, role, ofObjectToReturn);
+ }
+ }
+
+ //
+ // ContentHandler methods
+ //
+
+ /// <seealso cref= org.xml.sax.ContentHandler#startElement(java.lang.String,
+ /// java.lang.String, java.lang.String, org.xml.sax.Attributes) </seealso>
+ public void StartElement(string uri, string local, string raw, IDictionary<string, string> attrs)
+ {
+ if (local.Equals("hyphen-char"))
+ {
+ string h = attrs.ContainsKey("value") ? attrs["value"] : null;
+ if (h != null && h.Length == 1)
+ {
+ hyphenChar = h[0];
+ }
+ }
+ else if (local.Equals("classes"))
+ {
+ currElement = ELEM_CLASSES;
+ }
+ else if (local.Equals("patterns"))
+ {
+ currElement = ELEM_PATTERNS;
+ }
+ else if (local.Equals("exceptions"))
+ {
+ currElement = ELEM_EXCEPTIONS;
+ exception = new List<object>();
+ }
+ else if (local.Equals("hyphen"))
+ {
+ if (token.Length > 0)
+ {
+ exception.Add(token.ToString());
+ }
+ exception.Add(new Hyphen(attrs["pre"], attrs["no"], attrs["post"]));
+ currElement = ELEM_HYPHEN;
+ }
+ token.Length = 0;
+ }
+
+ /// <seealso cref= org.xml.sax.ContentHandler#endElement(java.lang.String,
+ /// java.lang.String, java.lang.String) </seealso>
+ public void EndElement(string uri, string local, string raw)
+ {
+ if (token.Length > 0)
+ {
+ string word = token.ToString();
+ switch (currElement)
+ {
+ case ELEM_CLASSES:
+ consumer.AddClass(word);
+ break;
+ case ELEM_EXCEPTIONS:
+ exception.Add(word);
+ exception = NormalizeException(exception);
+ consumer.AddException(GetExceptionWord(exception), new List<object>(exception));
+ break;
+ case ELEM_PATTERNS:
+ consumer.AddPattern(GetPattern(word), GetInterletterValues(word));
+ break;
+ case ELEM_HYPHEN:
+ // nothing to do
+ break;
+ }
+ if (currElement != ELEM_HYPHEN)
+ {
+ token.Length = 0;
+ }
+ }
+ if (currElement == ELEM_HYPHEN)
+ {
+ currElement = ELEM_EXCEPTIONS;
+ }
+ else
+ {
+ currElement = 0;
+ }
+ }
+
+ /// <seealso cref= org.xml.sax.ContentHandler#characters(char[], int, int) </seealso>
+ public void Characters(char[] ch, int start, int length)
+ {
+ StringBuilder chars = new StringBuilder(length);
+ chars.Append(ch, start, length);
+ string word = ReadToken(chars);
+ while (word != null)
+ {
+ // System.out.println("\"" + word + "\"");
+ switch (currElement)
+ {
+ case ELEM_CLASSES:
+ consumer.AddClass(word);
+ break;
+ case ELEM_EXCEPTIONS:
+ exception.Add(word);
+ exception = NormalizeException(exception);
+ consumer.AddException(GetExceptionWord(exception), new List<object>(exception));
+ exception.Clear();
+ break;
+ case ELEM_PATTERNS:
+ consumer.AddPattern(GetPattern(word), GetInterletterValues(word));
+ break;
+ }
+ word = ReadToken(chars);
+ }
+
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/TernaryTree.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/TernaryTree.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/TernaryTree.cs
new file mode 100644
index 0000000..88cfd01
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/TernaryTree.cs
@@ -0,0 +1,816 @@
+\ufeffusing System;
+using System.Collections;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Compound.Hyphenation
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// <h2>Ternary Search Tree.</h2>
+ ///
+ /// <para>
+ /// A ternary search tree is a hybrid between a binary tree and a digital search
+ /// tree (trie). Keys are limited to strings. A data value of type char is stored
+ /// in each leaf node. It can be used as an index (or pointer) to the data.
+ /// Branches that only contain one key are compressed to one node by storing a
+ /// pointer to the trailer substring of the key. This class is intended to serve
+ /// as base class or helper class to implement Dictionary collections or the
+ /// like. Ternary trees have some nice properties as the following: the tree can
+ /// be traversed in sorted order, partial matches (wildcard) can be implemented,
+ /// retrieval of all keys within a given distance from the target, etc. The
+ /// storage requirements are higher than a binary tree but a lot less than a
+ /// trie. Performance is comparable with a hash table, sometimes it outperforms a
+ /// hash function (most of the time can determine a miss faster than a hash).
+ /// </para>
+ ///
+ /// <para>
+ /// The main purpose of this java port is to serve as a base for implementing
+ /// TeX's hyphenation algorithm (see The TeXBook, appendix H). Each language
+ /// requires from 5000 to 15000 hyphenation patterns which will be keys in this
+ /// tree. The strings patterns are usually small (from 2 to 5 characters), but
+ /// each char in the tree is stored in a node. Thus memory usage is the main
+ /// concern. We will sacrifice 'elegance' to keep memory requirements to the
+ /// minimum. Using java's char type as pointer (yes, I know pointer it is a
+ /// forbidden word in java) we can keep the size of the node to be just 8 bytes
+ /// (3 pointers and the data char). This gives room for about 65000 nodes. In my
+ /// tests the english patterns took 7694 nodes and the german patterns 10055
+ /// nodes, so I think we are safe.
+ /// </para>
+ ///
+ /// <para>
+ /// All said, this is a map with strings as keys and char as value. Pretty
+ /// limited!. It can be extended to a general map by using the string
+ /// representation of an object and using the char value as an index to an array
+ /// that contains the object values.
+ /// </para>
+ ///
+ /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ /// </summary>
+
+ public class TernaryTree : ICloneable
+ {
+ /// <summary>
+ /// We use 4 arrays to represent a node. I guess I should have created a proper
+ /// node class, but somehow Knuth's pascal code made me forget we now have a
+ /// portable language with virtual memory management and automatic garbage
+ /// collection! And now is kind of late, furthermore, if it ain't broken, don't
+ /// fix it.
+ /// </summary>
+
+ /// <summary>
+ /// Pointer to low branch and to rest of the key when it is stored directly in
+ /// this node, we don't have unions in java!
+ /// </summary>
+ protected internal char[] lo;
+
+ /// <summary>
+ /// Pointer to high branch.
+ /// </summary>
+ protected internal char[] hi;
+
+ /// <summary>
+ /// Pointer to equal branch and to data when this node is a string terminator.
+ /// </summary>
+ protected internal char[] eq;
+
+ /// <summary>
+ /// <P>
+ /// The character stored in this node: splitchar. Two special values are
+ /// reserved:
+ /// </P>
+ /// <ul>
+ /// <li>0x0000 as string terminator</li>
+ /// <li>0xFFFF to indicate that the branch starting at this node is compressed</li>
+ /// </ul>
+ /// <para>
+ /// This shouldn't be a problem if we give the usual semantics to strings since
+ /// 0xFFFF is guaranteed not to be an Unicode character.
+ /// </para>
+ /// </summary>
+ protected internal char[] sc;
+
+ /// <summary>
+ /// This vector holds the trailing of the keys when the branch is compressed.
+ /// </summary>
+ protected internal CharVector kv;
+
+ protected internal char root;
+
+ protected internal char freenode;
+
+ protected internal int length; // number of items in tree
+
+ protected internal const int BLOCK_SIZE = 2048; // allocation size for arrays
+
+ internal TernaryTree()
+ {
+ Init();
+ }
+
+ protected internal virtual void Init()
+ {
+ root = (char)0;
+ freenode = (char)1;
+ length = 0;
+ lo = new char[BLOCK_SIZE];
+ hi = new char[BLOCK_SIZE];
+ eq = new char[BLOCK_SIZE];
+ sc = new char[BLOCK_SIZE];
+ kv = new CharVector();
+ }
+
+ /// <summary>
+ /// Branches are initially compressed, needing one node per key plus the size
+ /// of the string key. They are decompressed as needed when another key with
+ /// same prefix is inserted. This saves a lot of space, specially for long
+ /// keys.
+ /// </summary>
+ public virtual void Insert(string key, char val)
+ {
+ // make sure we have enough room in the arrays
+ int len = key.Length + 1; // maximum number of nodes that may be generated
+ if (freenode + len > eq.Length)
+ {
+ RedimNodeArrays(eq.Length + BLOCK_SIZE);
+ }
+ char[] strkey = new char[len--];
+ key.CopyTo(0, strkey, 0, len - 0);
+ strkey[len] = (char)0;
+ root = Insert(root, strkey, 0, val);
+ }
+
+ public virtual void Insert(char[] key, int start, char val)
+ {
+ int len = StrLen(key) + 1;
+ if (freenode + len > eq.Length)
+ {
+ RedimNodeArrays(eq.Length + BLOCK_SIZE);
+ }
+ root = Insert(root, key, start, val);
+ }
+
+ /// <summary>
+ /// The actual insertion function, recursive version.
+ /// </summary>
+ private char Insert(char p, char[] key, int start, char val)
+ {
+ int len = StrLen(key, start);
+ if (p == 0)
+ {
+ // this means there is no branch, this node will start a new branch.
+ // Instead of doing that, we store the key somewhere else and create
+ // only one node with a pointer to the key
+ p = freenode++;
+ eq[p] = val; // holds data
+ length++;
+ hi[p] = (char)0;
+ if (len > 0)
+ {
+ sc[p] = (char)0xFFFF; // indicates branch is compressed
+ lo[p] = (char)kv.Alloc(len + 1); // use 'lo' to hold pointer to key
+ StrCpy(kv.Array, lo[p], key, start);
+ }
+ else
+ {
+ sc[p] = (char)0;
+ lo[p] = (char)0;
+ }
+ return p;
+ }
+
+ if (sc[p] == 0xFFFF)
+ {
+ // branch is compressed: need to decompress
+ // this will generate garbage in the external key array
+ // but we can do some garbage collection later
+ char pp = freenode++;
+ lo[pp] = lo[p]; // previous pointer to key
+ eq[pp] = eq[p]; // previous pointer to data
+ lo[p] = (char)0;
+ if (len > 0)
+ {
+ sc[p] = kv[lo[pp]];
+ eq[p] = pp;
+ lo[pp]++;
+ if (kv[lo[pp]] == 0)
+ {
+ // key completly decompressed leaving garbage in key array
+ lo[pp] = (char)0;
+ sc[pp] = (char)0;
+ hi[pp] = (char)0;
+ }
+ else
+ {
+ // we only got first char of key, rest is still there
+ sc[pp] = (char)0xFFFF;
+ }
+ }
+ else
+ {
+ // In this case we can save a node by swapping the new node
+ // with the compressed node
+ sc[pp] = (char)0xFFFF;
+ hi[p] = pp;
+ sc[p] = (char)0;
+ eq[p] = val;
+ length++;
+ return p;
+ }
+ }
+ char s = key[start];
+ if (s < sc[p])
+ {
+ lo[p] = Insert(lo[p], key, start, val);
+ }
+ else if (s == sc[p])
+ {
+ if (s != 0)
+ {
+ eq[p] = Insert(eq[p], key, start + 1, val);
+ }
+ else
+ {
+ // key already in tree, overwrite data
+ eq[p] = val;
+ }
+ }
+ else
+ {
+ hi[p] = Insert(hi[p], key, start, val);
+ }
+ return p;
+ }
+
+ /// <summary>
+ /// Compares 2 null terminated char arrays
+ /// </summary>
+ public static int StrCmp(char[] a, int startA, char[] b, int startB)
+ {
+ for (; a[startA] == b[startB]; startA++, startB++)
+ {
+ if (a[startA] == 0)
+ {
+ return 0;
+ }
+ }
+ return a[startA] - b[startB];
+ }
+
+ /// <summary>
+ /// Compares a string with null terminated char array
+ /// </summary>
+ public static int StrCmp(string str, char[] a, int start)
+ {
+ int i, d, len = str.Length;
+ for (i = 0; i < len; i++)
+ {
+ d = (int)str[i] - a[start + i];
+ if (d != 0)
+ {
+ return d;
+ }
+ if (a[start + i] == 0)
+ {
+ return d;
+ }
+ }
+ if (a[start + i] != 0)
+ {
+ return -a[start + i];
+ }
+ return 0;
+
+ }
+
+ public static void StrCpy(char[] dst, int di, char[] src, int si)
+ {
+ while (src[si] != 0)
+ {
+ dst[di++] = src[si++];
+ }
+ dst[di] = (char)0;
+ }
+
+ public static int StrLen(char[] a, int start)
+ {
+ int len = 0;
+ for (int i = start; i < a.Length && a[i] != 0; i++)
+ {
+ len++;
+ }
+ return len;
+ }
+
+ public static int StrLen(char[] a)
+ {
+ return StrLen(a, 0);
+ }
+
+ public virtual int Find(string key)
+ {
+ int len = key.Length;
+ char[] strkey = new char[len + 1];
+ key.CopyTo(0, strkey, 0, len - 0);
+ strkey[len] = (char)0;
+
+ return Find(strkey, 0);
+ }
+
+ public virtual int Find(char[] key, int start)
+ {
+ int d;
+ char p = root;
+ int i = start;
+ char c;
+
+ while (p != 0)
+ {
+ if (sc[p] == 0xFFFF)
+ {
+ if (StrCmp(key, i, kv.Array, lo[p]) == 0)
+ {
+ return eq[p];
+ }
+ else
+ {
+ return -1;
+ }
+ }
+ c = key[i];
+ d = c - sc[p];
+ if (d == 0)
+ {
+ if (c == 0)
+ {
+ return eq[p];
+ }
+ i++;
+ p = eq[p];
+ }
+ else if (d < 0)
+ {
+ p = lo[p];
+ }
+ else
+ {
+ p = hi[p];
+ }
+ }
+ return -1;
+ }
+
+ public virtual bool Knows(string key)
+ {
+ return (Find(key) >= 0);
+ }
+
+ // redimension the arrays
+ private void RedimNodeArrays(int newsize)
+ {
+ int len = newsize < lo.Length ? newsize : lo.Length;
+ char[] na = new char[newsize];
+ Array.Copy(lo, 0, na, 0, len);
+ lo = na;
+ na = new char[newsize];
+ Array.Copy(hi, 0, na, 0, len);
+ hi = na;
+ na = new char[newsize];
+ Array.Copy(eq, 0, na, 0, len);
+ eq = na;
+ na = new char[newsize];
+ Array.Copy(sc, 0, na, 0, len);
+ sc = na;
+ }
+
+ public virtual int Length
+ {
+ get { return length; }
+ }
+
+ public object Clone()
+ {
+ TernaryTree t = new TernaryTree();
+ t.lo = (char[])this.lo.Clone();
+ t.hi = (char[])this.hi.Clone();
+ t.eq = (char[])this.eq.Clone();
+ t.sc = (char[])this.sc.Clone();
+ t.kv = (CharVector)this.kv.Clone();
+ t.root = this.root;
+ t.freenode = this.freenode;
+ t.length = this.length;
+
+ return t;
+ }
+
+ /// <summary>
+ /// Recursively insert the median first and then the median of the lower and
+ /// upper halves, and so on in order to get a balanced tree. The array of keys
+ /// is assumed to be sorted in ascending order.
+ /// </summary>
+ protected internal virtual void InsertBalanced(string[] k, char[] v, int offset, int n)
+ {
+ int m;
+ if (n < 1)
+ {
+ return;
+ }
+ m = n >> 1;
+
+ Insert(k[m + offset], v[m + offset]);
+ InsertBalanced(k, v, offset, m);
+
+ InsertBalanced(k, v, offset + m + 1, n - m - 1);
+ }
+
+ /// <summary>
+ /// Balance the tree for best search performance
+ /// </summary>
+ public virtual void Balance()
+ {
+ // System.out.print("Before root splitchar = ");
+ // System.out.println(sc[root]);
+
+ int i = 0, n = length;
+ string[] k = new string[n];
+ char[] v = new char[n];
+ Iterator iter = new Iterator(this);
+ while (iter.MoveNext())
+ {
+ v[i] = iter.Value;
+ k[i++] = iter.Current;
+ }
+ Init();
+ InsertBalanced(k, v, 0, n);
+
+ // With uniform letter distribution sc[root] should be around 'm'
+ // System.out.print("After root splitchar = ");
+ // System.out.println(sc[root]);
+ }
+
+ /// <summary>
+ /// Each node stores a character (splitchar) which is part of some key(s). In a
+ /// compressed branch (one that only contain a single string key) the trailer
+ /// of the key which is not already in nodes is stored externally in the kv
+ /// array. As items are inserted, key substrings decrease. Some substrings may
+ /// completely disappear when the whole branch is totally decompressed. The
+ /// tree is traversed to find the key substrings actually used. In addition,
+ /// duplicate substrings are removed using a map (implemented with a
+ /// TernaryTree!).
+ ///
+ /// </summary>
+ public virtual void TrimToSize()
+ {
+ // first balance the tree for best performance
+ Balance();
+
+ // redimension the node arrays
+ RedimNodeArrays(freenode);
+
+ // ok, compact kv array
+ CharVector kx = new CharVector();
+ kx.Alloc(1);
+ TernaryTree map = new TernaryTree();
+ Compact(kx, map, root);
+ kv = kx;
+ kv.TrimToSize();
+ }
+
+ private void Compact(CharVector kx, TernaryTree map, char p)
+ {
+ int k;
+ if (p == 0)
+ {
+ return;
+ }
+ if (sc[p] == 0xFFFF)
+ {
+ k = map.Find(kv.Array, lo[p]);
+ if (k < 0)
+ {
+ k = kx.Alloc(StrLen(kv.Array, lo[p]) + 1);
+ StrCpy(kx.Array, k, kv.Array, lo[p]);
+ map.Insert(kx.Array, k, (char)k);
+ }
+ lo[p] = (char)k;
+ }
+ else
+ {
+ Compact(kx, map, lo[p]);
+ if (sc[p] != 0)
+ {
+ Compact(kx, map, eq[p]);
+ }
+ Compact(kx, map, hi[p]);
+ }
+ }
+
+ public virtual IEnumerator<string> Keys()
+ {
+ return new Iterator(this);
+ }
+
+ /// <summary>
+ /// Enumerator for TernaryTree
+ ///
+ /// LUCENENET NOTE: This differs a bit from its Java counterpart to adhere to
+ /// .NET IEnumerator semantics. In Java, when the <see cref="Iterator"/> is
+ /// instantiated, it is already positioned at the first element. However,
+ /// to act like a .NET IEnumerator, the initial state is undefined and considered
+ /// to be before the first element until <see cref="MoveNext"/> is called, and
+ /// if a move took place it will return <c>true</c>;
+ /// </summary>
+ public class Iterator : IEnumerator<string>
+ {
+ private readonly TernaryTree outerInstance;
+
+
+ /// <summary>
+ /// current node index
+ /// </summary>
+ private int cur;
+
+ /// <summary>
+ /// current key
+ /// </summary>
+ private string curkey;
+
+ internal class Item : ICloneable
+ {
+ internal char parent;
+ internal char child;
+
+ public Item()
+ {
+ parent = (char)0;
+ child = (char)0;
+ }
+
+ public Item(char p, char c)
+ {
+ parent = p;
+ child = c;
+ }
+
+ public object Clone()
+ {
+ return new Item(parent, child);
+ }
+
+ }
+
+ /// <summary>
+ /// Node stack
+ /// </summary>
+ internal Stack<Item> ns;
+
+ /// <summary>
+ /// key stack implemented with a StringBuilder
+ /// </summary>
+ internal StringBuilder ks;
+
+ private bool isInitialized = false;
+
+ public Iterator(TernaryTree outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ cur = -1;
+ ns = new Stack<Item>();
+ ks = new StringBuilder();
+ isInitialized = false;
+ }
+
+ public virtual void Rewind()
+ {
+ ns.Clear();
+ ks.Length = 0;
+ cur = outerInstance.root;
+ Run();
+ }
+
+ public virtual char Value
+ {
+ get
+ {
+ if (cur >= 0)
+ {
+ return outerInstance.eq[cur];
+ }
+ return (char)0;
+ }
+ }
+
+ /// <summary>
+ /// traverse upwards
+ /// </summary>
+ internal virtual int Up()
+ {
+ Item i = new Item();
+ int res = 0;
+
+ if (ns.Count == 0)
+ {
+ return -1;
+ }
+
+ if (cur != 0 && outerInstance.sc[cur] == 0)
+ {
+ return outerInstance.lo[cur];
+ }
+
+ bool climb = true;
+
+ while (climb)
+ {
+ i = ns.Pop();
+ i.child++;
+ switch ((int)i.child)
+ {
+ case 1:
+ if (outerInstance.sc[i.parent] != 0)
+ {
+ res = outerInstance.eq[i.parent];
+ ns.Push((Item)i.Clone());
+ ks.Append(outerInstance.sc[i.parent]);
+ }
+ else
+ {
+ i.child++;
+ ns.Push((Item)i.Clone());
+ res = outerInstance.hi[i.parent];
+ }
+ climb = false;
+ break;
+
+ case 2:
+ res = outerInstance.hi[i.parent];
+ ns.Push((Item)i.Clone());
+ if (ks.Length > 0)
+ {
+ ks.Length = ks.Length - 1; // pop
+ }
+ climb = false;
+ break;
+
+ default:
+ if (ns.Count == 0)
+ {
+ return -1;
+ }
+ climb = true;
+ break;
+ }
+ }
+ return res;
+ }
+
+ /// <summary>
+ /// traverse the tree to find next key
+ /// </summary>
+ internal virtual int Run()
+ {
+ if (cur == -1)
+ {
+ return -1;
+ }
+
+ bool leaf = false;
+ while (true)
+ {
+ // first go down on low branch until leaf or compressed branch
+ while (cur != 0)
+ {
+ if (outerInstance.sc[cur] == 0xFFFF)
+ {
+ leaf = true;
+ break;
+ }
+ ns.Push(new Item((char)cur, '\u0000'));
+ if (outerInstance.sc[cur] == 0)
+ {
+ leaf = true;
+ break;
+ }
+ cur = outerInstance.lo[cur];
+ }
+ if (leaf)
+ {
+ break;
+ }
+ // nothing found, go up one node and try again
+ cur = Up();
+ if (cur == -1)
+ {
+ return -1;
+ }
+ }
+ // The current node should be a data node and
+ // the key should be in the key stack (at least partially)
+ StringBuilder buf = new StringBuilder(ks.ToString());
+ if (outerInstance.sc[cur] == 0xFFFF)
+ {
+ int p = outerInstance.lo[cur];
+ while (outerInstance.kv[p] != 0)
+ {
+ buf.Append(outerInstance.kv[p++]);
+ }
+ }
+ curkey = buf.ToString();
+ return 0;
+ }
+
+ #region Added for better .NET support
+ public string Current
+ {
+ get
+ {
+ return curkey;
+ }
+ }
+
+ object IEnumerator.Current
+ {
+ get
+ {
+ return Current;
+ }
+ }
+
+ public void Dispose()
+ {
+ // nothing to do
+ }
+
+ public bool MoveNext()
+ {
+ if (!isInitialized)
+ {
+ Rewind();
+ isInitialized = true;
+ return cur != -1;
+ }
+ if (cur == -1)
+ {
+ return false;
+ }
+ cur = Up();
+ Run();
+ return cur != -1;
+ }
+
+ public void Reset()
+ {
+ throw new NotSupportedException();
+ }
+
+ #endregion
+ }
+
+ public virtual void PrintStats(TextWriter @out)
+ {
+ @out.WriteLine("Number of keys = " + Convert.ToString(length));
+ @out.WriteLine("Node count = " + Convert.ToString(freenode));
+ // System.out.println("Array length = " + Integer.toString(eq.length));
+ @out.WriteLine("Key Array length = " + Convert.ToString(kv.Length()));
+
+ /*
+ * for(int i=0; i<kv.length(); i++) if ( kv.get(i) != 0 )
+ * System.out.print(kv.get(i)); else System.out.println("");
+ * System.out.println("Keys:"); for(Enumeration enum = keys();
+ * enum.hasMoreElements(); ) System.out.println(enum.nextElement());
+ */
+ }
+ /*
+ public static void main(String[] args) {
+ TernaryTree tt = new TernaryTree();
+ tt.insert("Carlos", 'C');
+ tt.insert("Car", 'r');
+ tt.insert("palos", 'l');
+ tt.insert("pa", 'p');
+ tt.trimToSize();
+ System.out.println((char) tt.find("Car"));
+ System.out.println((char) tt.find("Carlos"));
+ System.out.println((char) tt.find("alto"));
+ tt.printStats(System.out);
+ }
+ */
+
+ }
+}
\ No newline at end of file