You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/07/06 13:07:09 UTC
[12/14] lucenenet git commit: Lucene.Net.Analysis.Stempel: Modified
Egothor.Stemmer Compile and DiffIt programs to accept file encoding on the
command line and cleaned up implementation
Lucene.Net.Analysis.Stempel: Modified Egothor.Stemmer Compile and DiffIt programs to accept file encoding on the command line and cleaned up implementation
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/933d8351
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/933d8351
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/933d8351
Branch: refs/heads/master
Commit: 933d8351154e2a40f5ad226d8a96172c1401d1cd
Parents: 775df65
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Thu Jul 6 18:49:47 2017 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Thu Jul 6 18:49:47 2017 +0700
----------------------------------------------------------------------
.../Egothor.Stemmer/Compile.cs | 33 ++++++--
.../Egothor.Stemmer/DiffIt.cs | 86 +++++++++++---------
2 files changed, 72 insertions(+), 47 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/933d8351/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs
index a6d8315..9bbfa71 100644
--- a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs
+++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs
@@ -1,6 +1,7 @@
using Lucene.Net.Support;
using Lucene.Net.Support.IO;
using System;
+using System.Collections.Generic;
using System.IO;
using System.Text;
@@ -90,8 +91,9 @@ namespace Egothor.Stemmer
return;
}
- args[0].ToUpperInvariant();
+ args[0] = args[0].ToUpperInvariant();
+ // Reads the first char of the first arg
backward = args[0][0] == '-';
int qq = (backward) ? 1 : 0;
bool storeorig = false;
@@ -109,6 +111,7 @@ namespace Egothor.Stemmer
}
string charset = null;
+ var stemmerTables = new List<string>();
try
{
charset = System.Environment.GetEnvironmentVariable("egothor.stemmer.charset");
@@ -124,15 +127,28 @@ namespace Egothor.Stemmer
}
}
+ // LUCENENET specific
+ // command line argument overrides environment variable or default, if supplied
+ for (int i = 1; i < args.Length; i++)
+ {
+ if ("-e".Equals(args[i]) || "--encoding".Equals(args[i]))
+ {
+ charset = args[i];
+ }
+ else
+ {
+ stemmerTables.Add(args[i]);
+ }
+ }
+
char[] optimizer = new char[args[0].Length - qq];
for (int i = 0; i < optimizer.Length; i++)
{
optimizer[i] = args[0][qq + i];
}
- for (int i = 1; i < args.Length; i++)
+ foreach (var stemmerTable in stemmerTables)
{
- TextReader @in;
// System.out.println("[" + args[i] + "]");
Diff diff = new Diff();
//int stems = 0; // not used
@@ -141,11 +157,12 @@ namespace Egothor.Stemmer
AllocTrie();
- Console.WriteLine(args[i]);
- using (@in = new StreamReader(
- new FileStream(args[i], FileMode.Open, FileAccess.Read), Encoding.GetEncoding(charset)))
+ Console.WriteLine(stemmerTable);
+ using (TextReader input = new StreamReader(
+ new FileStream(stemmerTable, FileMode.Open, FileAccess.Read), Encoding.GetEncoding(charset)))
{
- for (string line = @in.ReadLine(); line != null; line = @in.ReadLine())
+ string line;
+ while ((line = input.ReadLine()) != null)
{
try
{
@@ -212,7 +229,7 @@ namespace Egothor.Stemmer
}
using (DataOutputStream os = new DataOutputStream(
- new FileStream(args[i] + ".out", FileMode.OpenOrCreate, FileAccess.Write)))
+ new FileStream(stemmerTable + ".out", FileMode.OpenOrCreate, FileAccess.Write)))
{
os.WriteUTF(args[0]);
trie.Store(os);
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/933d8351/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs
index 5a1c9bc..4d29472 100644
--- a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs
+++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs
@@ -1,5 +1,6 @@
using Lucene.Net.Support;
using System;
+using System.Collections.Generic;
using System.IO;
using System.Text;
@@ -80,14 +81,6 @@ namespace Egothor.Stemmer
}
return result;
- //try
- //{
- // return int.parseInt(s.substring(i, i + 1));
- //}
- //catch (Exception /*x*/)
- //{
- // return 1;
- //}
}
/// <summary>
@@ -101,56 +94,71 @@ namespace Egothor.Stemmer
/// <param name="args">the path to a file containing a stemmer table</param>
public static void Main(string[] args)
{
-
-
int ins = Get(0, args[0]);
int del = Get(1, args[0]);
int rep = Get(2, args[0]);
int nop = Get(3, args[0]);
- for (int i = 1; i < args.Length; i++)
+ string charset = null;
+ var stemmerTables = new List<string>();
+ try
{
- TextReader @in;
- // System.out.println("[" + args[i] + "]");
- Diff diff = new Diff(ins, del, rep, nop);
-
- string charset = null;
- try
+ charset = System.Environment.GetEnvironmentVariable("egothor.stemmer.charset");
+ }
+ catch
+ {
+ }
+ finally
+ {
+ if (string.IsNullOrEmpty(charset))
{
- charset = System.Environment.GetEnvironmentVariable("egothor.stemmer.charset");
+ charset = "UTF-8";
}
- catch
+ }
+
+ // LUCENENET specific
+ // command line argument overrides environment variable or default, if supplied
+ for (int i = 1; i < args.Length; i++)
+ {
+ if ("-e".Equals(args[i]) || "--encoding".Equals(args[i]))
{
+ charset = args[i];
}
- finally
+ else
{
- if (string.IsNullOrEmpty(charset))
- {
- charset = "UTF-8";
- }
+ stemmerTables.Add(args[i]);
}
+ }
+
+ foreach (var stemmerTable in stemmerTables)
+ {
+ // System.out.println("[" + args[i] + "]");
+ Diff diff = new Diff(ins, del, rep, nop);
- @in = new StreamReader(new FileStream(args[i], FileMode.Open, FileAccess.Read), Encoding.GetEncoding(charset));
- for (string line = @in.ReadLine(); line != null; line = @in.ReadLine())
+ using (TextReader input = new StreamReader(new FileStream(stemmerTable, FileMode.Open, FileAccess.Read), Encoding.GetEncoding(charset)))
{
- try
+ string line;
+ while ((line = input.ReadLine()) != null)
{
- line = line.ToLowerInvariant();
- StringTokenizer st = new StringTokenizer(line);
- string stem = st.NextToken();
- Console.WriteLine(stem + " -a");
- while (st.HasMoreTokens())
+ try
{
- String token = st.NextToken();
- if (token.Equals(stem) == false)
+ line = line.ToLowerInvariant();
+ StringTokenizer st = new StringTokenizer(line);
+ string stem = st.NextToken();
+ Console.WriteLine(stem + " -a");
+ while (st.HasMoreTokens())
{
- Console.WriteLine(stem + " " + diff.Exec(token, stem));
+ string token = st.NextToken();
+ if (token.Equals(stem) == false)
+ {
+ Console.WriteLine(stem + " " + diff.Exec(token, stem));
+ }
}
}
- }
- catch (InvalidOperationException /*x*/)
- {
- // no base token (stem) on a line
+ catch (InvalidOperationException /*x*/)
+ {
+ // no base token (stem) on a line
+ }
}
}
}