You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2016/10/23 13:02:35 UTC
[49/50] [abbrv] lucenenet git commit: Ported Analysis.Stempel + tests
(closes #190)
Ported Analysis.Stempel + tests (closes #190)
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/29525086
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/29525086
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/29525086
Branch: refs/heads/master
Commit: 2952508699645b571a2b960afaedc725252e168c
Parents: 4dbc359
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Sun Oct 2 21:37:26 2016 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Sun Oct 23 19:44:24 2016 +0700
----------------------------------------------------------------------
Lucene.Net.sln | 28 ++
.../Egothor.Stemmer/Cell.cs | 105 +++++
.../Egothor.Stemmer/Compile.cs | 222 +++++++++
.../Egothor.Stemmer/Diff.cs | 332 +++++++++++++
.../Egothor.Stemmer/DiffIt.cs | 144 ++++++
.../Egothor.Stemmer/Gener.cs | 139 ++++++
.../Egothor.Stemmer/Lift.cs | 165 +++++++
.../Egothor.Stemmer/MultiTrie.cs | 213 +++++++++
.../Egothor.Stemmer/MultiTrie2.cs | 421 +++++++++++++++++
.../Egothor.Stemmer/Optimizer.cs | 227 +++++++++
.../Egothor.Stemmer/Optimizer2.cs | 92 ++++
.../Egothor.Stemmer/Reduce.cs | 143 ++++++
.../Egothor.Stemmer/Row.cs | 342 ++++++++++++++
.../Egothor.Stemmer/Trie.cs | 472 +++++++++++++++++++
.../Lucene.Net.Analysis.Stempel.csproj | 87 ++++
.../Pl/PolishAnalyzer.cs | 164 +++++++
.../Pl/stemmer_20000.tbl | Bin 0 -> 2225192 bytes
.../Pl/stopwords.txt | 186 ++++++++
.../Properties/AssemblyInfo.cs | 39 ++
.../RectangularArrays.cs | 52 ++
.../Stempel/StempelFilter.cs | 91 ++++
.../Stempel/StempelPolishStemFilterFactory.cs | 48 ++
.../Stempel/StempelStemmer.cs | 105 +++++
src/Lucene.Net.Core/Lucene.Net.csproj | 4 +
src/Lucene.Net.Core/Support/DataInputStream.cs | 323 +++++++++++++
src/Lucene.Net.Core/Support/DataOutputStream.cs | 256 ++++++++++
src/Lucene.Net.Core/Support/IDataInput.cs | 24 +
src/Lucene.Net.Core/Support/IDataOutput.cs | 23 +
.../Egothor.Stemmer/TestCompile.cs | 211 +++++++++
.../Egothor.Stemmer/TestStemmer.cs | 191 ++++++++
.../Egothor.Stemmer/testRules.txt | 4 +
.../Lucene.Net.Tests.Analysis.Stempel.csproj | 89 ++++
.../Pl/TestPolishAnalyzer.cs | 102 ++++
.../Properties/AssemblyInfo.cs | 36 ++
.../TestStempelPolishStemFilterFactory.cs | 56 +++
.../packages.config | 4 +
36 files changed, 5140 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/Lucene.Net.sln
----------------------------------------------------------------------
diff --git a/Lucene.Net.sln b/Lucene.Net.sln
index 0322498..c87a7be 100644
--- a/Lucene.Net.sln
+++ b/Lucene.Net.sln
@@ -64,6 +64,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Memory", "src\Lu
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.Memory", "src\Lucene.Net.Tests.Memory\Lucene.Net.Tests.Memory.csproj", "{7F9378BF-C88D-46FF-9AE8-5E7D8C0225D3}"
EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Analysis.Stempel", "src\Lucene.Net.Analysis.Stempel\Lucene.Net.Analysis.Stempel.csproj", "{A76DAD88-E3A5-40F9-9114-FACD77BD8265}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.Analysis.Stempel", "src\Lucene.Net.Tests.Analysis.Stempel\Lucene.Net.Tests.Analysis.Stempel.csproj", "{940A6AB1-F00A-40E2-BC1A-2898EFA8C48F}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -355,6 +359,30 @@ Global
{7F9378BF-C88D-46FF-9AE8-5E7D8C0225D3}.Release|Mixed Platforms.Build.0 = Release|Any CPU
{7F9378BF-C88D-46FF-9AE8-5E7D8C0225D3}.Release|x86.ActiveCfg = Release|Any CPU
{7F9378BF-C88D-46FF-9AE8-5E7D8C0225D3}.Release|x86.Build.0 = Release|Any CPU
+ {A76DAD88-E3A5-40F9-9114-FACD77BD8265}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {A76DAD88-E3A5-40F9-9114-FACD77BD8265}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {A76DAD88-E3A5-40F9-9114-FACD77BD8265}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU
+ {A76DAD88-E3A5-40F9-9114-FACD77BD8265}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU
+ {A76DAD88-E3A5-40F9-9114-FACD77BD8265}.Debug|x86.ActiveCfg = Debug|Any CPU
+ {A76DAD88-E3A5-40F9-9114-FACD77BD8265}.Debug|x86.Build.0 = Debug|Any CPU
+ {A76DAD88-E3A5-40F9-9114-FACD77BD8265}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {A76DAD88-E3A5-40F9-9114-FACD77BD8265}.Release|Any CPU.Build.0 = Release|Any CPU
+ {A76DAD88-E3A5-40F9-9114-FACD77BD8265}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU
+ {A76DAD88-E3A5-40F9-9114-FACD77BD8265}.Release|Mixed Platforms.Build.0 = Release|Any CPU
+ {A76DAD88-E3A5-40F9-9114-FACD77BD8265}.Release|x86.ActiveCfg = Release|Any CPU
+ {A76DAD88-E3A5-40F9-9114-FACD77BD8265}.Release|x86.Build.0 = Release|Any CPU
+ {940A6AB1-F00A-40E2-BC1A-2898EFA8C48F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {940A6AB1-F00A-40E2-BC1A-2898EFA8C48F}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {940A6AB1-F00A-40E2-BC1A-2898EFA8C48F}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU
+ {940A6AB1-F00A-40E2-BC1A-2898EFA8C48F}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU
+ {940A6AB1-F00A-40E2-BC1A-2898EFA8C48F}.Debug|x86.ActiveCfg = Debug|Any CPU
+ {940A6AB1-F00A-40E2-BC1A-2898EFA8C48F}.Debug|x86.Build.0 = Debug|Any CPU
+ {940A6AB1-F00A-40E2-BC1A-2898EFA8C48F}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {940A6AB1-F00A-40E2-BC1A-2898EFA8C48F}.Release|Any CPU.Build.0 = Release|Any CPU
+ {940A6AB1-F00A-40E2-BC1A-2898EFA8C48F}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU
+ {940A6AB1-F00A-40E2-BC1A-2898EFA8C48F}.Release|Mixed Platforms.Build.0 = Release|Any CPU
+ {940A6AB1-F00A-40E2-BC1A-2898EFA8C48F}.Release|x86.ActiveCfg = Release|Any CPU
+ {940A6AB1-F00A-40E2-BC1A-2898EFA8C48F}.Release|x86.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Cell.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Cell.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Cell.cs
new file mode 100644
index 0000000..b1fa11c
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Cell.cs
@@ -0,0 +1,105 @@
+\ufeff/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+namespace Egothor.Stemmer
+{
+    /// <summary>
+    /// A <see cref="Cell"/> is a portion of a <see cref="Trie"/>. It bundles four
+    /// integers: a reference to the next row, a command, a command count and a
+    /// skip length.
+    /// </summary>
+    public class Cell
+    {
+        /// <summary>
+        /// Id of the next row in this way; starts out as -1.
+        /// </summary>
+        internal int @ref = -1;
+
+        /// <summary>
+        /// Command of the cell; starts out as -1.
+        /// </summary>
+        internal int cmd = -1;
+
+        /// <summary>
+        /// How many commands were in the subtrie before Pack().
+        /// </summary>
+        internal int cnt = 0;
+
+        /// <summary>
+        /// How many chars would be discarded from the input key in this way.
+        /// </summary>
+        internal int skip = 0;
+
+        /// <summary>
+        /// Creates a <see cref="Cell"/> whose fields hold their initial values.
+        /// </summary>
+        internal Cell()
+        {
+        }
+
+        /// <summary>
+        /// Creates a <see cref="Cell"/> that copies every field of another <see cref="Cell"/>.
+        /// </summary>
+        /// <param name="a">the <see cref="Cell"/> to copy from</param>
+        internal Cell(Cell a)
+        {
+            this.skip = a.skip;
+            this.cnt = a.cnt;
+            this.cmd = a.cmd;
+            this.@ref = a.@ref;
+        }
+
+        /// <summary>
+        /// Formats the four fields of this <see cref="Cell"/> as
+        /// <c>ref(..)cmd(..)cnt(..)skp(..)</c>.
+        /// </summary>
+        /// <returns>a string representation of this <see cref="Cell"/></returns>
+        public override string ToString()
+        {
+            return string.Concat(
+                "ref(", @ref,
+                ")cmd(", cmd,
+                ")cnt(", cnt,
+                ")skp(", skip,
+                ")");
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs
new file mode 100644
index 0000000..20cb46a
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs
@@ -0,0 +1,222 @@
+\ufeffusing Lucene.Net.Support;
+using System;
+using System.IO;
+using System.Text;
+
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+namespace Egothor.Stemmer
+{
+    /// <summary>
+    /// The Compile class is used to compile a stemmer table.
+    /// </summary>
+    public class Compile
+    {
+        // State shared by Main and AllocTrie; derived from the first command line argument.
+        private static bool backward;
+        private static bool multi;
+        private static Trie trie;
+
+        /// <summary>
+        /// no instantiation
+        /// </summary>
+        private Compile() { }
+
+        /// <summary>
+        /// Entry point to the Compile application.
+        /// <para>
+        /// This program takes any number of arguments: the first is the name of the
+        /// desired stemming algorithm to use (a list is available in the package
+        /// description), all of the rest should be the path or paths to a file or
+        /// files containing a stemmer table to compile.
+        /// </para>
+        /// <para>
+        /// The first argument is read left to right as: an optional <c>-</c>
+        /// (sets the backward flag), an optional <c>0</c> (additionally store every
+        /// stem itself with the patch <c>-a</c>), an optional <c>M</c> (use a
+        /// <see cref="MultiTrie2"/> instead of a <see cref="Trie"/>), followed by any
+        /// sequence of the reduction letters <c>G</c>, <c>L</c>, <c>E</c>, <c>2</c>
+        /// and <c>1</c>, applied in the order given. Unknown letters are ignored and
+        /// all letters are matched case-sensitively.
+        /// </para>
+        /// <para>
+        /// For every input file <c>f</c> the compiled table is written to <c>f.out</c>,
+        /// replacing any previous content of that file.
+        /// </para>
+        /// </summary>
+        /// <param name="args">the command line arguments</param>
+        public static void Main(string[] args)
+        {
+            if (args == null || args.Length < 1)
+            {
+                return;
+            }
+
+            // NOTE: earlier code called args[0].ToUpperInvariant() here and threw the
+            // result away, which has no effect. The no-op is dropped rather than "fixed"
+            // so that the accepted options and the header written to the table stay
+            // exactly as they were.
+            string options = args[0];
+            int qq = 0;
+
+            backward = HasFlagAt(options, qq, '-');
+            if (backward)
+            {
+                qq++;
+            }
+
+            bool storeorig = HasFlagAt(options, qq, '0');
+            if (storeorig)
+            {
+                qq++;
+            }
+
+            multi = HasFlagAt(options, qq, 'M');
+            if (multi)
+            {
+                qq++;
+            }
+
+            // Everything after the flags is the list of reductions to run, in order.
+            // qq never exceeds options.Length, so this yields an empty array at worst.
+            char[] optimizer = options.Substring(qq).ToCharArray();
+
+            for (int i = 1; i < args.Length; i++)
+            {
+                AllocTrie();
+
+                Console.WriteLine(args[i]);
+                ReadTable(args[i], storeorig);
+                ApplyReductions(optimizer);
+
+                // FileMode.Create truncates an existing file. OpenOrCreate would keep the
+                // tail of a previously written, longer table behind the new data.
+                using (DataOutputStream os = new DataOutputStream(
+                    new FileStream(args[i] + ".out", FileMode.Create, FileAccess.Write)))
+                {
+                    os.WriteUTF(args[0]);
+                    trie.Store(os);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Tells whether <paramref name="options"/> has the character
+        /// <paramref name="flag"/> at position <paramref name="index"/>. A position
+        /// past the end of the string yields <c>false</c> instead of throwing.
+        /// </summary>
+        private static bool HasFlagAt(string options, int index, char flag)
+        {
+            return index < options.Length && options[index] == flag;
+        }
+
+        /// <summary>
+        /// Reads the stemmer table at <paramref name="path"/> (always decoded as UTF-8;
+        /// the Java version allowed overriding the charset through the
+        /// <c>egothor.stemmer.charset</c> system property) and adds its entries to the
+        /// current trie. Each line is lower-cased and tokenized: the first token is the
+        /// stem, every following token that differs from the stem is added together
+        /// with the patch command that turns it into the stem.
+        /// </summary>
+        /// <param name="path">path of the table file to read</param>
+        /// <param name="storeorig">when <c>true</c>, the stem itself is also added with the patch <c>-a</c></param>
+        private static void ReadTable(string path, bool storeorig)
+        {
+            Diff diff = new Diff();
+
+            using (TextReader @in = new StreamReader(
+                new FileStream(path, FileMode.Open, FileAccess.Read), Encoding.UTF8))
+            {
+                for (string line = @in.ReadLine(); line != null; line = @in.ReadLine())
+                {
+                    try
+                    {
+                        line = line.ToLowerInvariant();
+                        StringTokenizer st = new StringTokenizer(line);
+                        string stem = st.NextToken();
+                        if (storeorig)
+                        {
+                            trie.Add(stem, "-a");
+                        }
+                        while (st.HasMoreTokens())
+                        {
+                            string token = st.NextToken();
+                            if (token.Equals(stem) == false)
+                            {
+                                trie.Add(token, diff.Exec(token, stem));
+                            }
+                        }
+                    }
+                    catch (InvalidOperationException /*x*/)
+                    {
+                        // no base token (stem) on a line
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Runs the requested reductions over the current trie, one per character of
+        /// <paramref name="optimizer"/>, and prints the trie info after each of them.
+        /// Characters other than <c>G</c>, <c>L</c>, <c>E</c>, <c>2</c> and <c>1</c>
+        /// are skipped.
+        /// </summary>
+        /// <param name="optimizer">the reduction letters, in the order they are applied</param>
+        private static void ApplyReductions(char[] optimizer)
+        {
+            Optimizer o = new Optimizer();
+            Optimizer2 o2 = new Optimizer2();
+            Lift l = new Lift(true);
+            Lift e = new Lift(false);
+            Gener g = new Gener();
+
+            for (int j = 0; j < optimizer.Length; j++)
+            {
+                string prefix;
+                switch (optimizer[j])
+                {
+                    case 'G':
+                        trie = trie.Reduce(g);
+                        prefix = "G: ";
+                        break;
+                    case 'L':
+                        trie = trie.Reduce(l);
+                        prefix = "L: ";
+                        break;
+                    case 'E':
+                        trie = trie.Reduce(e);
+                        prefix = "E: ";
+                        break;
+                    case '2':
+                        trie = trie.Reduce(o2);
+                        prefix = "2: ";
+                        break;
+                    case '1':
+                        trie = trie.Reduce(o);
+                        prefix = "1: ";
+                        break;
+                    default:
+                        continue;
+                }
+                trie.PrintInfo(System.Console.Out, prefix + " ");
+            }
+        }
+
+        /// <summary>
+        /// Replaces the current trie with an empty one: a <see cref="MultiTrie2"/> when
+        /// the multi flag is set, a plain <see cref="Trie"/> otherwise. The negated
+        /// backward flag is passed to the constructor in both cases.
+        /// </summary>
+        internal static void AllocTrie()
+        {
+            if (multi)
+            {
+                trie = new MultiTrie2(!backward);
+            }
+            else
+            {
+                trie = new Trie(!backward);
+            }
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Diff.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Diff.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Diff.cs
new file mode 100644
index 0000000..e5e372e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Diff.cs
@@ -0,0 +1,332 @@
+\ufeffusing System;
+using System.Text;
+
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+
+namespace Egothor.Stemmer
+{
+ /// <summary>
+ /// The Diff object generates a patch string (<c>Exec</c>) and can apply one (<c>Apply</c>).
+ /// <para>
+ /// A patch string is actually a command to a stemmer telling it how to reduce a
+ /// word to its root. For example, to reduce the word teacher to its root teach
+ /// the patch string Db would be generated. This command tells the stemmer to
+ /// delete the last 2 characters from the word teacher to reach the stem (the
+ /// patch commands are applied starting from the last character of the word).
+ /// </para>
+ /// </summary>
+ public class Diff
+ {
+ int sizex = 0; // allocated size of the first dimension of net/way
+ int sizey = 0; // allocated size of the second dimension of net/way
+ int[][] net; // accumulated cost of each (x, y) cell, filled by Exec
+ int[][] way; // move chosen for each (x, y) cell, followed back when the patch is emitted
+
+ int INSERT; // cost added for an insert step
+ int DELETE; // cost added for a delete step
+ int REPLACE; // cost added for a replace step
+ int NOOP; // cost added for a diagonal step over two equal characters
+
+ /// <summary>
+ /// Constructor for the Diff object using a cost of 1 for insert, delete and replace and 0 for no-op.
+ /// </summary>
+ public Diff()
+ : this(1, 1, 1, 0)
+ {
+ }
+
+ /// <summary>
+ /// Constructor for the Diff object with explicit step costs.
+ /// </summary>
+ /// <param name="ins">cost of inserting a character</param>
+ /// <param name="del">cost of deleting a character</param>
+ /// <param name="rep">cost of replacing a character</param>
+ /// <param name="noop">cost of keeping a character that is equal in both strings</param>
+ public Diff(int ins, int del, int rep, int noop)
+ {
+ INSERT = ins;
+ DELETE = del;
+ REPLACE = rep;
+ NOOP = noop;
+ }
+
+ /// <summary>
+ /// Apply the given patch string <paramref name="diff"/> to the given string
+ /// <paramref name="dest"/>, in place. The patch is read as (command, parameter) character pairs, starting at the last character of <paramref name="dest"/> and moving left. Nothing happens when <paramref name="diff"/> is <c>null</c> or <paramref name="dest"/> is empty; an index error ends the patching silently and keeps the changes made so far.
+ /// </summary>
+ /// <param name="dest">Destination string; modified in place</param>
+ /// <param name="diff">Patch string; a trailing unpaired character is ignored</param>
+ public static void Apply(StringBuilder dest, string diff)
+ {
+ try
+ {
+
+ if (diff == null)
+ {
+ return;
+ }
+
+ int pos = dest.Length - 1;
+ if (pos < 0)
+ {
+ return;
+ }
+ // the early return above covers an empty destination (orig == "")
+ for (int i = 0; i < diff.Length / 2; i++)
+ {
+ char cmd = diff[2 * i];
+ char param = diff[2 * i + 1];
+ int par_num = (param - 'a' + 1); // numeric reading of the parameter: 'a' is 1, 'b' is 2, ...
+ switch (cmd)
+ {
+ case '-': // skip: together with the pos-- below, moves left by par_num characters
+ pos = pos - par_num + 1;
+ break;
+ case 'R': // replace the character at pos with the parameter character
+ dest[pos] = param;
+ break;
+ case 'D': // delete the par_num characters that end at pos
+ int o = pos;
+ pos -= par_num - 1;
+ /*
+ * delete par_num chars from index pos
+ */
+ // String s = orig.toString();
+ // s = s.substring( 0, pos ) + s.substring( o + 1 );
+ // orig = new StringBuffer( s );
+ dest.Remove(pos, (o + 1) - pos);
+ break;
+ case 'I': // insert the parameter character right after pos
+ dest.Insert(pos += 1, param);
+ break;
+ }
+ pos--;
+ }
+ }
+ catch (IndexOutOfRangeException /*x*/)
+ {
+ // the patch does not fit dest: stop silently, dest keeps the changes made so far
+ }
+ catch (ArgumentOutOfRangeException /*x*/)
+ {
+ // the patch does not fit dest: stop silently, dest keeps the changes made so far
+ }
+ }
+
+ /// <summary>
+ /// Construct a patch string that transforms a to b. The commands are emitted from the end of the strings towards their start.
+ /// </summary>
+ /// <param name="a">1st string (the one the patch is applied to)</param>
+ /// <param name="b">2nd string (the result of applying the patch)</param>
+ /// <returns>the patch string, or <c>null</c> if <paramref name="a"/> or <paramref name="b"/> is <c>null</c></returns>
+ public string Exec(string a, string b)
+ {
+ if (a == null || b == null)
+ {
+ return null;
+ }
+
+ int x;
+ int y;
+ int maxx;
+ int maxy;
+ int[] go = new int[4]; // candidate costs for the current cell, indexed by the move constants below
+ const int X = 1; // step along a only: delete
+ const int Y = 2; // step along b only: insert
+ const int R = 3; // step along both: replace
+ const int D = 0; // step along both: no change
+
+ /*
+ * (re)allocate the tables, with 8 cells of slack, when they are too small => speeds up repeated calls
+ */
+ maxx = a.Length + 1;
+ maxy = b.Length + 1;
+ if ((maxx >= sizex) || (maxy >= sizey))
+ {
+ sizex = maxx + 8;
+ sizey = maxy + 8;
+ net = RectangularArrays.ReturnRectangularIntArray(sizex, sizey);
+ way = RectangularArrays.ReturnRectangularIntArray(sizex, sizey);
+ }
+
+ /*
+ * clear the part of the cost table used by this call (way is not cleared)
+ */
+ for (x = 0; x < maxx; x++)
+ {
+ for (y = 0; y < maxy; y++)
+ {
+ net[x][y] = 0;
+ }
+ }
+
+ /*
+ * set the borders: first column is x with move X, first row is y with move Y
+ */
+ for (x = 1; x < maxx; x++)
+ {
+ net[x][0] = x;
+ way[x][0] = X;
+ }
+ for (y = 1; y < maxy; y++)
+ {
+ net[0][y] = y;
+ way[0][y] = Y;
+ }
+
+ for (x = 1; x < maxx; x++)
+ {
+ for (y = 1; y < maxy; y++)
+ {
+ go[X] = net[x - 1][y] + DELETE;
+ // a step along x adds the DELETE cost
+ go[Y] = net[x][y - 1] + INSERT;
+ // a step along y adds the INSERT cost
+ go[R] = net[x - 1][y - 1] + REPLACE;
+ go[D] = net[x - 1][y - 1]
+ + ((a[x - 1] == b[y - 1]) ? NOOP : 100);
+ // the diagonal adds NOOP when the characters are equal, otherwise a penalty of 100
+ ushort min = (ushort)D;
+ if (go[min] >= go[X]) // a tie between D and X goes to X
+ {
+ min = (ushort)X;
+ }
+ if (go[min] > go[Y]) // Y must be strictly cheaper
+ {
+ min = (ushort)Y;
+ }
+ if (go[min] > go[R]) // R must be strictly cheaper
+ {
+ min = (ushort)R;
+ }
+ way[x][y] = min;
+ net[x][y] = (ushort)go[min];
+ }
+ }
+
+ // walk back from the last cell to the origin, emitting the patch commands on the way
+ StringBuilder result = new StringBuilder();
+ char @base = (char)('a' - 1); // run lengths are stored as @base + n, so 'a' stands for 1
+ char deletes = @base; // length of the pending run of deletes
+ char equals = @base; // length of the pending run of unchanged characters
+ for (x = maxx - 1, y = maxy - 1; x + y != 0;)
+ {
+ switch (way[x][y])
+ {
+ case X:
+ if (equals != @base)
+ {
+ result.Append("-" + (equals));
+ equals = @base;
+ }
+ deletes++;
+ x--;
+ break;
+ // end of the delete case
+ case Y:
+ if (deletes != @base)
+ {
+ result.Append("D" + (deletes));
+ deletes = @base;
+ }
+ if (equals != @base)
+ {
+ result.Append("-" + (equals));
+ equals = @base;
+ }
+ result.Append('I');
+ result.Append(b[--y]);
+ break;
+ // end of the insert case
+ case R:
+ if (deletes != @base)
+ {
+ result.Append("D" + (deletes));
+ deletes = @base;
+ }
+ if (equals != @base)
+ {
+ result.Append("-" + (equals));
+ equals = @base;
+ }
+ result.Append('R');
+ result.Append(b[--y]);
+ x--;
+ break;
+ // end of the replace case
+ case D:
+ if (deletes != @base)
+ {
+ result.Append("D" + (deletes));
+ deletes = @base;
+ }
+ equals++;
+ x--;
+ y--;
+ break;
+ // end of the no change case
+ }
+ }
+ if (deletes != @base) // flush the last run of deletes; a pending run of equals is not emitted
+ {
+ result.Append("D" + (deletes));
+ deletes = @base;
+ }
+
+ return result.ToString();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs
new file mode 100644
index 0000000..01621bf
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs
@@ -0,0 +1,144 @@
+\ufeffusing Lucene.Net.Support;
+using System;
+using System.IO;
+using System.Text;
+
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+
+namespace Egothor.Stemmer
+{
+    /// <summary>
+    /// The DiffIt class is a means to generate patch commands from an already prepared
+    /// stemmer table.
+    /// </summary>
+    public class DiffIt
+    {
+        /// <summary>
+        /// no instantiation
+        /// </summary>
+        private DiffIt() { }
+
+        /// <summary>
+        /// Reads the single decimal digit at position <paramref name="i"/> of
+        /// <paramref name="s"/>.
+        /// </summary>
+        /// <param name="i">index of the character to read</param>
+        /// <param name="s">the string holding the digits</param>
+        /// <returns>
+        /// the value of the digit, or 1 when <paramref name="s"/> is <c>null</c>, the
+        /// index lies outside the string, or the character is not one of 0-9
+        /// </returns>
+        internal static int Get(int i, string s)
+        {
+            // The Java original fell back to 1 on any failure, including a string that
+            // is too short; check the bounds so that case does not throw here either.
+            if (s == null || i < 0 || i >= s.Length)
+            {
+                return 1;
+            }
+
+            char c = s[i];
+            if (c < '0' || c > '9')
+            {
+                return 1;
+            }
+
+            return c - '0';
+        }
+
+        /// <summary>
+        /// Entry point to the DiffIt application.
+        /// <para>
+        /// The first argument is a string whose first four characters are read as the
+        /// insert, delete, replace and no-op costs handed to <see cref="Diff"/> (see
+        /// <see cref="Get"/> for the fallback to 1). Every following argument is the
+        /// path to a UTF-8 file containing a stemmer table. The program reads each
+        /// file and writes the patch commands for the stems to the console.
+        /// </para>
+        /// <para>
+        /// Nothing is done when no arguments are given.
+        /// </para>
+        /// </summary>
+        /// <param name="args">the cost string followed by the paths of the stemmer table files</param>
+        public static void Main(string[] args)
+        {
+            if (args == null || args.Length < 1)
+            {
+                return;
+            }
+
+            int ins = Get(0, args[0]);
+            int del = Get(1, args[0]);
+            int rep = Get(2, args[0]);
+            int nop = Get(3, args[0]);
+
+            for (int i = 1; i < args.Length; i++)
+            {
+                Diff diff = new Diff(ins, del, rep, nop);
+
+                // The Java version allowed overriding the charset through the
+                // "egothor.stemmer.charset" system property; here it is always UTF-8.
+                // The using block makes sure the file is closed before the next one is opened.
+                using (TextReader @in = new StreamReader(
+                    new FileStream(args[i], FileMode.Open, FileAccess.Read), Encoding.UTF8))
+                {
+                    for (string line = @in.ReadLine(); line != null; line = @in.ReadLine())
+                    {
+                        try
+                        {
+                            line = line.ToLowerInvariant();
+                            StringTokenizer st = new StringTokenizer(line);
+                            string stem = st.NextToken();
+                            Console.WriteLine(stem + " -a");
+                            while (st.HasMoreTokens())
+                            {
+                                string token = st.NextToken();
+                                if (token.Equals(stem) == false)
+                                {
+                                    Console.WriteLine(stem + " " + diff.Exec(token, stem));
+                                }
+                            }
+                        }
+                        catch (InvalidOperationException /*x*/)
+                        {
+                            // no base token (stem) on a line
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Gener.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Gener.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Gener.cs
new file mode 100644
index 0000000..bacfc68
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Gener.cs
@@ -0,0 +1,139 @@
+\ufeffusing Lucene.Net.Support;
+using System.Collections.Generic;
+
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+
+namespace Egothor.Stemmer
+{
+    /// <summary>
+    /// The Gener object helps in the discarding of nodes which break the reduction
+    /// effort and defend the structure against large reductions.
+    /// </summary>
+    public class Gener : Reduce
+    {
+        /// <summary>
+        /// Constructor for the Gener object.
+        /// </summary>
+        public Gener() { }
+
+        /// <summary>
+        /// Return a Trie with infrequent values occurring in the given Trie removed.
+        /// </summary>
+        /// <param name="orig">the Trie to optimize</param>
+        /// <returns>a new optimized Trie</returns>
+        public override Trie Optimize(Trie orig)
+        {
+            IList<string> cmds = orig.cmds;
+            IList<Row> orows = orig.rows;
+            int[] remap = new int[orows.Count];
+
+            // First pass: remap is used as a liveness table (1 = alive, 0 = eaten).
+            // Rows are visited from last to first; Eat consults the entries of rows
+            // that were already marked as eaten.
+            Arrays.Fill(remap, 1);
+            for (int j = orows.Count - 1; j >= 0; j--)
+            {
+                if (Eat(orows[j], remap))
+                {
+                    remap[j] = 0;
+                }
+            }
+
+            // Second pass: reset the table and let RemoveGaps fill it while it
+            // builds the compacted row list starting from the root.
+            Arrays.Fill(remap, -1);
+            IList<Row> rows = RemoveGaps(orig.root, orows, new List<Row>(), remap);
+
+            return new Trie(orig.forward, remap[orig.root], cmds, rows);
+        }
+
+        /// <summary>
+        /// Prune the given Row and report whether anything is left in it.
+        /// <para>
+        /// References to rows whose <paramref name="remap"/> entry is 0 are cleared, and
+        /// commands whose count is below one tenth of the row's total count are removed.
+        /// </para>
+        /// </summary>
+        /// <param name="in">the Row to test</param>
+        /// <param name="remap">liveness table indexed by row number; 0 marks an eaten row</param>
+        /// <returns>
+        /// <c>true</c> if no Cell of the Row holds a command or a reference any more
+        /// (the Row can be discarded); otherwise, <c>false</c>
+        /// </returns>
+        public bool Eat(Row @in, int[] remap)
+        {
+            int sum = 0;
+            foreach (Cell c in @in.cells.Values)
+            {
+                sum += c.cnt;
+                // Drop references that point at rows already eaten.
+                if (c.@ref >= 0 && remap[c.@ref] == 0)
+                {
+                    c.@ref = -1;
+                }
+            }
+
+            int frame = sum / 10;
+            bool live = false;
+            foreach (Cell c in @in.cells.Values)
+            {
+                // Commands seen less often than the threshold are discarded.
+                if (c.cnt < frame && c.cmd >= 0)
+                {
+                    c.cnt = 0;
+                    c.cmd = -1;
+                }
+                if (c.cmd >= 0 || c.@ref >= 0)
+                {
+                    live = true;
+                }
+            }
+            return !live;
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Lift.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Lift.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Lift.cs
new file mode 100644
index 0000000..43a4602
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Lift.cs
@@ -0,0 +1,165 @@
+\ufeffusing Lucene.Net.Support;
+using System.Collections.Generic;
+
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+
+namespace Egothor.Stemmer
+{
+    /// <summary>
+    /// The Lift class is a data structure that is a variation of a Patricia trie.
+    /// <para>
+    /// Lift's <i>raison d'etre</i> is to implement reduction of the trie via the
+    /// Lift-Up method, which makes the data structure less liable to overstemming.
+    /// </para>
+    /// </summary>
+    public class Lift : Reduce
+    {
+        private readonly bool changeSkip;
+
+        /// <summary>
+        /// Constructor for the Lift object.
+        /// </summary>
+        /// <param name="changeSkip">
+        /// when set to <c>true</c>, comparison of two Cells takes
+        /// a skip command into account
+        /// </param>
+        public Lift(bool changeSkip)
+        {
+            this.changeSkip = changeSkip;
+        }
+
+        /// <summary>
+        /// Optimize (eliminate rows with no content) the given Trie and return the
+        /// reduced Trie.
+        /// </summary>
+        /// <param name="orig">the Trie to optimize</param>
+        /// <returns>the reduced Trie</returns>
+        public override Trie Optimize(Trie orig)
+        {
+            IList<string> cmds = orig.cmds;
+            IList<Row> orows = orig.rows;
+            int[] remap = new int[orows.Count];
+
+            // Rows are processed from last to first.
+            for (int j = orows.Count - 1; j >= 0; j--)
+            {
+                LiftUp(orows[j], orows);
+            }
+
+            Arrays.Fill(remap, -1);
+            IList<Row> rows = RemoveGaps(orig.root, orows, new List<Row>(), remap);
+
+            return new Trie(orig.forward, remap[orig.root], cmds, rows);
+        }
+
+        /// <summary>
+        /// Reduce the trie using Lift-Up reduction.
+        /// <para>
+        /// The Lift-Up reduction propagates all leaf-values (patch commands), where
+        /// possible, to higher levels which are closer to the root of the trie.
+        /// </para>
+        /// </summary>
+        /// <param name="in">the Row to consider when optimizing</param>
+        /// <param name="nodes">the rows that the Cells of <paramref name="in"/> refer to by index</param>
+        public void LiftUp(Row @in, IList<Row> nodes)
+        {
+            foreach (Cell c in @in.cells.Values)
+            {
+                if (c.@ref < 0)
+                {
+                    continue; // nothing below this cell
+                }
+
+                Row to = nodes[c.@ref];
+                int sum = to.UniformCmd(changeSkip);
+                if (sum < 0)
+                {
+                    continue; // the referenced row has no uniform command
+                }
+
+                // Skip value the cell takes over when the command is lifted.
+                int liftedSkip = changeSkip ? to.uniformSkip + 1 : 0;
+
+                if (sum == c.cmd)
+                {
+                    // Same command on both levels: merge, unless skips must match and do not.
+                    if (changeSkip && c.skip != liftedSkip)
+                    {
+                        continue;
+                    }
+                    c.skip = liftedSkip;
+                    c.cnt += to.uniformCnt;
+                    c.@ref = -1;
+                }
+                else if (c.cmd < 0)
+                {
+                    // The cell has no command of its own: adopt the uniform one.
+                    c.cnt = to.uniformCnt;
+                    c.cmd = sum;
+                    c.@ref = -1;
+                    c.skip = liftedSkip;
+                }
+            }
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/MultiTrie.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/MultiTrie.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/MultiTrie.cs
new file mode 100644
index 0000000..7bdad8f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/MultiTrie.cs
@@ -0,0 +1,213 @@
+\ufeffusing Lucene.Net.Support;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+
+namespace Egothor.Stemmer
+{
+    /// <summary>
+    /// The <see cref="MultiTrie"/> is a <see cref="Trie"/> of <see cref="Trie"/>s. It stores words and their associated patch
+    /// commands. The <see cref="MultiTrie"/> handles patch commands individually (each command by
+    /// itself).
+    /// </summary>
+    public class MultiTrie : Trie
+    {
+        // Marker stored in the trie one level past the last command piece of a key.
+        internal static char EOM = '*';
+        internal static string EOM_NODE = "" + EOM;
+
+        protected List<Trie> tries = new List<Trie>();
+
+        // Number of characters of a patch command stored per level.
+        int BY = 1;
+
+        /// <summary>
+        /// Constructor for the <see cref="MultiTrie"/> object.
+        /// </summary>
+        /// <param name="is">the input stream</param>
+        /// <exception cref="IOException">if an I/O error occurs</exception>
+        public MultiTrie(IDataInput @is)
+            : base(false)
+        {
+            // Read order mirrors Store: direction flag, piece size, trie count, tries.
+            forward = @is.ReadBoolean();
+            BY = @is.ReadInt();
+            int count = @is.ReadInt();
+            for (int i = 0; i < count; i++)
+            {
+                tries.Add(new Trie(@is));
+            }
+        }
+
+        /// <summary>
+        /// Constructor for the <see cref="MultiTrie"/> object
+        /// </summary>
+        /// <param name="forward">set to <c>true</c> if the elements should be read left to right</param>
+        public MultiTrie(bool forward)
+            : base(forward)
+        {
+        }
+
+        /// <summary>
+        /// Return the element that is stored in a cell associated with the given key.
+        /// </summary>
+        /// <param name="key">the key to the cell holding the desired element</param>
+        /// <returns>the element</returns>
+        public override string GetFully(string key)
+        {
+            StringBuilder result = new StringBuilder(tries.Count * 2);
+            foreach (Trie trie in tries)
+            {
+                string r = trie.GetFully(key);
+                if (IsEnd(r))
+                {
+                    break;
+                }
+                result.Append(r);
+            }
+            return result.ToString();
+        }
+
+        /// <summary>
+        /// Return the element that is stored as last on a path belonging to the given
+        /// key.
+        /// </summary>
+        /// <param name="key">the key associated with the desired element</param>
+        /// <returns>the element that is stored as last on a path</returns>
+        public override string GetLastOnPath(string key)
+        {
+            StringBuilder result = new StringBuilder(tries.Count * 2);
+            foreach (Trie trie in tries)
+            {
+                string r = trie.GetLastOnPath(key);
+                if (IsEnd(r))
+                {
+                    break;
+                }
+                result.Append(r);
+            }
+            return result.ToString();
+        }
+
+        /// <summary>
+        /// Write this data structure to the given output stream.
+        /// </summary>
+        /// <param name="os">the output stream</param>
+        /// <exception cref="IOException">if an I/O error occurs</exception>
+        public override void Store(IDataOutput os)
+        {
+            os.WriteBoolean(forward);
+            os.WriteInt(BY);
+            os.WriteInt(tries.Count);
+            foreach (Trie trie in tries)
+            {
+                trie.Store(os);
+            }
+        }
+
+        /// <summary>
+        /// Add an element to this structure consisting of the given key and patch
+        /// command.
+        /// <para>
+        /// This method will return without executing if the <paramref name="cmd"/>
+        /// parameter's length is 0.
+        /// </para>
+        /// </summary>
+        /// <param name="key">the key</param>
+        /// <param name="cmd">the patch command</param>
+        public override void Add(string key, string cmd)
+        {
+            if (cmd.Length == 0)
+            {
+                return;
+            }
+
+            int levels = cmd.Length / BY;
+            // One extra level is needed to hold the end marker.
+            while (levels >= tries.Count)
+            {
+                tries.Add(new Trie(forward));
+            }
+            for (int i = 0; i < levels; i++)
+            {
+                tries[i].Add(key, cmd.Substring(BY * i, BY));
+            }
+            tries[levels].Add(key, EOM_NODE);
+        }
+
+        /// <summary>
+        /// Reduce every contained <see cref="Trie"/> with the given <see cref="Egothor.Stemmer.Reduce"/>
+        /// and return a new <see cref="MultiTrie"/> holding the reduced tries.
+        /// </summary>
+        /// <param name="by">the reduction to apply to each contained <see cref="Trie"/></param>
+        /// <returns>the newly reduced Trie</returns>
+        public override Trie Reduce(Reduce by)
+        {
+            List<Trie> h = new List<Trie>(tries.Count);
+            foreach (Trie trie in tries)
+            {
+                h.Add(trie.Reduce(by));
+            }
+
+            MultiTrie m = new MultiTrie(forward);
+            m.tries = h;
+            // Keep the piece size of this instance so Store/Add on the reduced
+            // trie stay consistent with the data it contains.
+            m.BY = BY;
+            return m;
+        }
+
+        /// <summary>
+        /// Print the given prefix and the position(s) in the Trie where it appears.
+        /// </summary>
+        /// <param name="out">the writer that receives the output</param>
+        /// <param name="prefix">the desired prefix</param>
+        public override void PrintInfo(TextWriter @out, string prefix)
+        {
+            int c = 0;
+            foreach (Trie trie in tries)
+            {
+                trie.PrintInfo(@out, prefix + "[" + (++c) + "] ");
+            }
+        }
+
+        /// <summary>
+        /// Tell whether a lookup result ends the command: either nothing was found
+        /// or the result is the single end marker character.
+        /// </summary>
+        private static bool IsEnd(string r)
+        {
+            return r == null || (r.Length == 1 && r[0] == EOM);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/MultiTrie2.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/MultiTrie2.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/MultiTrie2.cs
new file mode 100644
index 0000000..9db6b92
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/MultiTrie2.cs
@@ -0,0 +1,421 @@
+\ufeffusing Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+
+namespace Egothor.Stemmer
+{
+    /// <summary>
+    /// The <see cref="MultiTrie2"/> is a <see cref="Trie"/> of <see cref="Trie"/>s.
+    /// <para>
+    /// It stores words and their associated patch commands. The <see cref="MultiTrie2"/> handles
+    /// patch commands broken into their constituent parts, as a <see cref="MultiTrie"/> does, but
+    /// the commands are delimited by the skip command.
+    /// </para>
+    /// </summary>
+    public class MultiTrie2 : MultiTrie
+    {
+        /// <summary>
+        /// Constructor for the <see cref="MultiTrie2"/> object.
+        /// </summary>
+        /// <param name="is">the input stream</param>
+        /// <exception cref="System.IO.IOException">if an I/O error occurs</exception>
+        public MultiTrie2(IDataInput @is)
+            : base(@is)
+        {
+        }
+
+        /// <summary>
+        /// Constructor for the <see cref="MultiTrie2"/> object
+        /// </summary>
+        /// <param name="forward">set to <c>true</c> if the elements should be read left to right</param>
+        public MultiTrie2(bool forward)
+            : base(forward)
+        {
+        }
+
+        /// <summary>
+        /// Return the element that is stored in a cell associated with the given key.
+        /// </summary>
+        /// <param name="key">the key to the cell holding the desired element</param>
+        /// <returns>the element</returns>
+        public override string GetFully(string key)
+        {
+            return Collect(key, true);
+        }
+
+        /// <summary>
+        /// Return the element that is stored as last on a path belonging to the given
+        /// key.
+        /// </summary>
+        /// <param name="key">the key associated with the desired element</param>
+        /// <returns>the element that is stored as last on a path</returns>
+        public override string GetLastOnPath(string key)
+        {
+            return Collect(key, false);
+        }
+
+        /// <summary>
+        /// Write this data structure to the given output stream.
+        /// </summary>
+        /// <param name="os">the output stream</param>
+        /// <exception cref="System.IO.IOException">if an I/O error occurs</exception>
+        public override void Store(IDataOutput os)
+        {
+            base.Store(os);
+        }
+
+        /// <summary>
+        /// Add an element to this structure consisting of the given key and patch
+        /// command.
+        /// <para>
+        /// This method will return without executing if the <paramref name="cmd"/>
+        /// parameter's length is 0.
+        /// </para>
+        /// </summary>
+        /// <param name="key">the key</param>
+        /// <param name="cmd">the patch command</param>
+        public override void Add(string key, string cmd)
+        {
+            if (cmd.Length == 0)
+            {
+                return;
+            }
+            // System.err.println( cmd );
+            string[] p = Decompose(cmd);
+            int levels = p.Length;
+            // System.err.println("levels "+key+" cmd "+cmd+"|"+levels);
+            // One extra level is needed to hold the end marker.
+            while (levels >= tries.Count)
+            {
+                tries.Add(new Trie(forward));
+            }
+
+            string lastkey = key;
+            for (int i = 0; i < levels; i++)
+            {
+                // Once the key has been consumed completely, keep using the last
+                // non-empty remainder.
+                if (key.Length > 0)
+                {
+                    lastkey = key;
+                }
+                tries[i].Add(lastkey, p[i]);
+                // System.err.println("-"+key+" "+p[i]+"|"+key.length());
+                /*
+                 * key=key.substring(lengthPP(p[i]));
+                 */
+                if (p[i].Length > 0 && p[i][0] == '-')
+                {
+                    // LUCENENET: A failed skip should never happen, but since we don't
+                    // have a catch block here who knows what might happen if
+                    // we don't throw.
+                    if (i > 0 && !TrySkip(key, LengthPP(p[i - 1]), out key))
+                    {
+                        throw new ArgumentOutOfRangeException();
+                    }
+                    if (!TrySkip(key, LengthPP(p[i]), out key))
+                    {
+                        throw new ArgumentOutOfRangeException();
+                    }
+                }
+                // System.err.println("--->"+key);
+            }
+            tries[levels].Add(key.Length > 0 ? key : lastkey, EOM_NODE);
+        }
+
+        /// <summary>
+        /// Break the given patch command into its constituent pieces. The pieces are
+        /// delimited by NOOP commands.
+        /// </summary>
+        /// <param name="cmd">the patch command</param>
+        /// <returns>an array containing the pieces of the command</returns>
+        public virtual string[] Decompose(string cmd)
+        {
+            List<string> parts = new List<string>();
+
+            for (int i = 0; 0 <= i && i < cmd.Length;)
+            {
+                int next = DashEven(cmd, i);
+                if (i == next)
+                {
+                    // A '-' command and its argument form a piece of their own.
+                    parts.Add(cmd.Substring(i, 2));
+                    i = next + 2;
+                }
+                else
+                {
+                    // Everything up to the next '-' command (or to the end) is one piece.
+                    parts.Add(next < 0 ? cmd.Substring(i) : cmd.Substring(i, next - i));
+                    i = next;
+                }
+            }
+            return parts.ToArray();
+        }
+
+        /// <summary>
+        /// Reduce every contained <see cref="Trie"/> with the given <see cref="Egothor.Stemmer.Reduce"/>
+        /// and return a new <see cref="MultiTrie2"/> holding the reduced tries.
+        /// </summary>
+        /// <param name="by">the reduction to apply to each contained <see cref="Trie"/></param>
+        /// <returns>the newly reduced Trie</returns>
+        public override Trie Reduce(Reduce by)
+        {
+            List<Trie> h = new List<Trie>(tries.Count);
+            foreach (Trie trie in tries)
+            {
+                h.Add(trie.Reduce(by));
+            }
+
+            MultiTrie2 m = new MultiTrie2(forward);
+            m.tries = h;
+            return m;
+        }
+
+        /// <summary>
+        /// Shared implementation of <see cref="GetFully(string)"/> and
+        /// <see cref="GetLastOnPath(string)"/>: walks the tries level by level and
+        /// concatenates the command pieces found for the (progressively shortened) key.
+        /// </summary>
+        /// <param name="key">the key to look up</param>
+        /// <param name="fully">
+        /// <c>true</c> to query each level with <c>GetFully</c>, <c>false</c> to query it
+        /// with <c>GetLastOnPath</c>
+        /// </param>
+        /// <returns>the pieces collected before the walk stopped</returns>
+        private string Collect(string key, bool fully)
+        {
+            StringBuilder result = new StringBuilder(tries.Count * 2);
+            try
+            {
+                string lastkey = key;
+                string[] p = new string[tries.Count];
+                char lastch = ' ';
+                for (int i = 0; i < tries.Count; i++)
+                {
+                    string r = fully ? tries[i].GetFully(lastkey) : tries[i].GetLastOnPath(lastkey);
+                    if (r == null || (r.Length == 1 && r[0] == EOM))
+                    {
+                        return result.ToString();
+                    }
+                    // System.err.println("LP:"+key+" last:"+lastch+" new:"+r);
+                    if (CannotFollow(lastch, r[0]))
+                    {
+                        return result.ToString();
+                    }
+                    lastch = r[r.Length - 2];
+                    // key=key.substring(lengthPP(r));
+                    p[i] = r;
+                    if (r[0] == '-')
+                    {
+                        if (i > 0 && !TrySkip(key, LengthPP(p[i - 1]), out key))
+                        {
+                            break;
+                        }
+                        // The second skip uses the current piece, as Add does.
+                        if (!TrySkip(key, LengthPP(p[i]), out key))
+                        {
+                            break;
+                        }
+                    }
+                    // key = skip(key, lengthPP(r));
+                    result.Append(r);
+                    if (key.Length != 0)
+                    {
+                        lastkey = key;
+                    }
+                }
+            }
+            catch (ArgumentOutOfRangeException /*x*/)
+            {
+                // best effort: return what was collected so far
+            }
+            catch (IndexOutOfRangeException /*x*/)
+            {
+                // string/array indexers report bad indexes with this type;
+                // treat it the same way as the case above
+            }
+            return result.ToString();
+        }
+
+        /// <summary>
+        /// A '-' or 'D' command must not be followed by the same command again.
+        /// </summary>
+        private bool CannotFollow(char after, char goes)
+        {
+            return (after == '-' || after == 'D') && after == goes;
+        }
+
+        /// <summary>
+        /// Remove <paramref name="count"/> characters from the start (forward tries) or
+        /// from the end (backward tries) of <paramref name="in"/>.
+        /// </summary>
+        /// <returns>
+        /// <c>false</c> (with an empty <paramref name="result"/>) when the string is
+        /// shorter than <paramref name="count"/>; otherwise <c>true</c>
+        /// </returns>
+        private bool TrySkip(string @in, int count, out string result)
+        {
+            // LUCENENET: Rather than relying on this to throw an exception by passing a negative
+            // length to Substring like they did in Java, we check that the value
+            // is negative and return false to the caller so it can safely break out
+            // of the loop.
+            int skipLength = @in.Length - count;
+            if (skipLength < 0)
+            {
+                result = string.Empty;
+                return false;
+            }
+            result = forward
+                ? @in.Substring(count, skipLength)
+                : @in.Substring(0, skipLength);
+            return true;
+        }
+
+        /// <summary>
+        /// Starting at <paramref name="from"/> and stepping two characters at a time,
+        /// return the index of the first '-' found, or -1 if there is none.
+        /// </summary>
+        private int DashEven(string @in, int from)
+        {
+            for (int i = from; i < @in.Length; i += 2)
+            {
+                if (@in[i] == '-')
+                {
+                    return i;
+                }
+            }
+            return -1;
+        }
+
+        /// <summary>
+        /// Sum up a length over the two-character commands of <paramref name="cmd"/>:
+        /// '-' and 'D' add their argument (<c>'a'</c> counts as 1), 'R' adds 1 and
+        /// 'I' adds nothing.
+        /// </summary>
+        private int LengthPP(string cmd)
+        {
+            int len = 0;
+            for (int i = 0; i < cmd.Length; i += 2)
+            {
+                char op = cmd[i];
+                if (op == '-' || op == 'D')
+                {
+                    len += cmd[i + 1] - 'a' + 1;
+                }
+                else if (op == 'R')
+                {
+                    len++;
+                }
+            }
+            return len;
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Optimizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Optimizer.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Optimizer.cs
new file mode 100644
index 0000000..e299452
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Optimizer.cs
@@ -0,0 +1,227 @@
+\ufeffusing Lucene.Net.Support;
+using System.Collections.Generic;
+
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+
+namespace Egothor.Stemmer
+{
+ /// <summary>
+ /// The <see cref="Optimizer"/> class is a <see cref="Reduce"/> that shrinks a
+ /// <see cref="Trie"/> by merging its rows.
+ /// <para>
+ /// Rows are visited from the last to the first. Each row is merged into the first
+ /// already collected row it can be merged with (see <see cref="Merge(Row, Row)"/>),
+ /// or kept as a row of its own when no such row exists.
+ /// </para>
+ /// </summary>
+ public class Optimizer : Reduce
+ {
+     /// <summary>
+     /// Constructor for the <see cref="Optimizer"/> object.
+     /// </summary>
+     public Optimizer() { }
+
+     /// <summary>
+     /// Optimize the given Trie by merging its rows and return the resulting Trie.
+     /// </summary>
+     /// <param name="orig">the <see cref="Trie"/> to consolidate</param>
+     /// <returns>the newly consolidated Trie</returns>
+     public override Trie Optimize(Trie orig)
+     {
+         IList<string> cmds = orig.cmds;
+         IList<Row> rows = new List<Row>();
+         IList<Row> orows = orig.rows;
+         // remap[j] is the index in 'rows' that original row j ended up in.
+         int[] remap = new int[orows.Count];
+
+         for (int j = orows.Count - 1; j >= 0; j--)
+         {
+             Row now = new Remap(orows[j], remap);
+
+             // Look for the first collected row that 'now' can be merged into.
+             int target = -1;
+             for (int i = 0; i < rows.Count; i++)
+             {
+                 Row q = Merge(now, rows[i]);
+                 if (q != null)
+                 {
+                     rows[i] = q;
+                     target = i;
+                     break;
+                 }
+             }
+
+             // Nothing compatible: the row is appended as a new row.
+             if (target < 0)
+             {
+                 target = rows.Count;
+                 rows.Add(now);
+             }
+             remap[j] = target;
+         }
+
+         // The array is reset and handed to RemoveGaps; afterwards remap[root]
+         // is used as the root index of the resulting Trie.
+         int root = remap[orig.root];
+         Arrays.Fill(remap, -1);
+         rows = RemoveGaps(root, rows, new List<Row>(), remap);
+
+         return new Trie(orig.forward, remap[root], cmds, rows);
+     }
+
+     /// <summary>
+     /// Merge the given rows and return the resulting <see cref="Row"/>.
+     /// </summary>
+     /// <param name="master">the master <see cref="Row"/></param>
+     /// <param name="existing">the existing <see cref="Row"/></param>
+     /// <returns>the resulting <see cref="Row"/>, or <c>null</c> if the operation cannot be realized</returns>
+     public Row Merge(Row master, Row existing)
+     {
+         Row n = new Row();
+
+         // Every cell of the master row is either copied (no counterpart in the
+         // existing row) or merged with its counterpart. foreach is used so the
+         // enumerator is disposed, and each dictionary is looked up only once.
+         foreach (var entry in master.cells)
+         {
+             char ch = entry.Key;
+             // XXX also must handle Cnt and Skip !!
+             Cell a = entry.Value;
+             Cell b;
+             existing.cells.TryGetValue(ch, out b);
+
+             Cell s = (b == null) ? new Cell(a) : Merge(a, b);
+             if (s == null)
+             {
+                 // A single conflicting cell makes the whole row merge impossible.
+                 return null;
+             }
+             n.cells[ch] = s;
+         }
+
+         // Cells that only the existing row has are taken over as they are.
+         foreach (char ch in existing.cells.Keys)
+         {
+             if (master.At(ch) != null)
+             {
+                 continue;
+             }
+             n.cells[ch] = existing.At(ch);
+         }
+         return n;
+     }
+
+     /// <summary>
+     /// Merge the given <see cref="Cell"/>s and return the resulting <see cref="Cell"/>.
+     /// <para>
+     /// The cells can be merged when their <c>skip</c> values are equal and neither
+     /// <c>cmd</c> nor <c>ref</c> is set (non-negative) to different values in both cells.
+     /// </para>
+     /// </summary>
+     /// <param name="m">the master <see cref="Cell"/></param>
+     /// <param name="e">the existing <see cref="Cell"/></param>
+     /// <returns>the resulting <see cref="Cell"/>, or <c>null</c> if the operation cannot be realized</returns>
+     public virtual Cell Merge(Cell m, Cell e)
+     {
+         if (m.skip != e.skip)
+         {
+             return null;
+         }
+
+         // A value that is set (>= 0) in both cells must be the same in both.
+         if (m.cmd >= 0 && e.cmd >= 0 && m.cmd != e.cmd)
+         {
+             return null;
+         }
+         if (m.@ref >= 0 && e.@ref >= 0 && m.@ref != e.@ref)
+         {
+             return null;
+         }
+
+         // The master value wins when it is set; otherwise the existing value
+         // (set or not) is taken over.
+         Cell n = new Cell();
+         n.cmd = (m.cmd >= 0) ? m.cmd : e.cmd;
+         n.@ref = (m.@ref >= 0) ? m.@ref : e.@ref;
+         n.cnt = m.cnt + e.cnt;
+         n.skip = m.skip;
+         return n;
+     }
+ }
+}