You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/07/06 13:07:09 UTC

[12/14] lucenenet git commit: Lucene.Net.Analysis.Stempel: Modified Egothor.Stemmer Compile and DiffIt programs to accept file encoding on the command line and cleaned up implementation

Lucene.Net.Analysis.Stempel: Modified Egothor.Stemmer Compile and DiffIt programs to accept file encoding on the command line and cleaned up implementation


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/933d8351
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/933d8351
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/933d8351

Branch: refs/heads/master
Commit: 933d8351154e2a40f5ad226d8a96172c1401d1cd
Parents: 775df65
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Thu Jul 6 18:49:47 2017 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Thu Jul 6 18:49:47 2017 +0700

----------------------------------------------------------------------
 .../Egothor.Stemmer/Compile.cs                  | 33 ++++++--
 .../Egothor.Stemmer/DiffIt.cs                   | 86 +++++++++++---------
 2 files changed, 72 insertions(+), 47 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/933d8351/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs
index a6d8315..9bbfa71 100644
--- a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs
+++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs
@@ -1,6 +1,7 @@
 using Lucene.Net.Support;
 using Lucene.Net.Support.IO;
 using System;
+using System.Collections.Generic;
 using System.IO;
 using System.Text;
 
@@ -90,8 +91,9 @@ namespace Egothor.Stemmer
                 return;
             }
 
-            args[0].ToUpperInvariant();
+            args[0] = args[0].ToUpperInvariant();
 
+            // Reads the first char of the first arg
             backward = args[0][0] == '-';
             int qq = (backward) ? 1 : 0;
             bool storeorig = false;
@@ -109,6 +111,7 @@ namespace Egothor.Stemmer
             }
 
             string charset = null;
+            var stemmerTables = new List<string>();
             try
             {
                 charset = System.Environment.GetEnvironmentVariable("egothor.stemmer.charset");
@@ -124,15 +127,28 @@ namespace Egothor.Stemmer
                 }
             }
 
+            // LUCENENET specific
+            // command line argument overrides environment variable or default, if supplied
+            for (int i = 1; i < args.Length; i++)
+            {
+                if ("-e".Equals(args[i]) || "--encoding".Equals(args[i]))
+                {
+                    charset = args[i];
+                }
+                else
+                {
+                    stemmerTables.Add(args[i]);
+                }
+            }
+
             char[] optimizer = new char[args[0].Length - qq];
             for (int i = 0; i < optimizer.Length; i++)
             {
                 optimizer[i] = args[0][qq + i];
             }
 
-            for (int i = 1; i < args.Length; i++)
+            foreach (var stemmerTable in stemmerTables)
             {
-                TextReader @in;
                 // System.out.println("[" + args[i] + "]");
                 Diff diff = new Diff();
                 //int stems = 0; // not used
@@ -141,11 +157,12 @@ namespace Egothor.Stemmer
 
                 AllocTrie();
 
-                Console.WriteLine(args[i]);
-                using (@in = new StreamReader(
-                    new FileStream(args[i], FileMode.Open, FileAccess.Read), Encoding.GetEncoding(charset)))
+                Console.WriteLine(stemmerTable);
+                using (TextReader input = new StreamReader(
+                    new FileStream(stemmerTable, FileMode.Open, FileAccess.Read), Encoding.GetEncoding(charset)))
                 {
-                    for (string line = @in.ReadLine(); line != null; line = @in.ReadLine())
+                    string line;
+                    while ((line = input.ReadLine()) != null)
                     {
                         try
                         {
@@ -212,7 +229,7 @@ namespace Egothor.Stemmer
                 }
 
                 using (DataOutputStream os = new DataOutputStream(
-                    new FileStream(args[i] + ".out", FileMode.OpenOrCreate, FileAccess.Write)))
+                    new FileStream(stemmerTable + ".out", FileMode.OpenOrCreate, FileAccess.Write)))
                 {
                     os.WriteUTF(args[0]);
                     trie.Store(os);

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/933d8351/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs
index 5a1c9bc..4d29472 100644
--- a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs
+++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs
@@ -1,5 +1,6 @@
 using Lucene.Net.Support;
 using System;
+using System.Collections.Generic;
 using System.IO;
 using System.Text;
 
@@ -80,14 +81,6 @@ namespace Egothor.Stemmer
             }
 
             return result;
-            //try
-            //{
-            //    return int.parseInt(s.substring(i, i + 1));
-            //}
-            //catch (Exception /*x*/)
-            //{
-            //    return 1;
-            //}
         }
 
         /// <summary>
@@ -101,56 +94,71 @@ namespace Egothor.Stemmer
         /// <param name="args">the path to a file containing a stemmer table</param>
         public static void Main(string[] args)
         {
-
-
             int ins = Get(0, args[0]);
             int del = Get(1, args[0]);
             int rep = Get(2, args[0]);
             int nop = Get(3, args[0]);
 
-            for (int i = 1; i < args.Length; i++)
+            string charset = null;
+            var stemmerTables = new List<string>();
+            try
             {
-                TextReader @in;
-                // System.out.println("[" + args[i] + "]");
-                Diff diff = new Diff(ins, del, rep, nop);
-
-                string charset = null;
-                try
+                charset = System.Environment.GetEnvironmentVariable("egothor.stemmer.charset");
+            }
+            catch
+            {
+            }
+            finally
+            {
+                if (string.IsNullOrEmpty(charset))
                 {
-                    charset = System.Environment.GetEnvironmentVariable("egothor.stemmer.charset");
+                    charset = "UTF-8";
                 }
-                catch
+            }
+
+            // LUCENENET specific
+            // command line argument overrides environment variable or default, if supplied
+            for (int i = 1; i < args.Length; i++)
+            {
+                if ("-e".Equals(args[i]) || "--encoding".Equals(args[i]))
                 {
+                    charset = args[i];
                 }
-                finally
+                else
                 {
-                    if (string.IsNullOrEmpty(charset))
-                    {
-                        charset = "UTF-8";
-                    }
+                    stemmerTables.Add(args[i]);
                 }
+            }
+
+            foreach (var stemmerTable in stemmerTables)
+            {
+                // System.out.println("[" + args[i] + "]");
+                Diff diff = new Diff(ins, del, rep, nop);
 
-                @in = new StreamReader(new FileStream(args[i], FileMode.Open, FileAccess.Read), Encoding.GetEncoding(charset));
-                for (string line = @in.ReadLine(); line != null; line = @in.ReadLine())
+                using (TextReader input = new StreamReader(new FileStream(stemmerTable, FileMode.Open, FileAccess.Read), Encoding.GetEncoding(charset)))
                 {
-                    try
+                    string line;
+                    while ((line = input.ReadLine()) != null)
                     {
-                        line = line.ToLowerInvariant();
-                        StringTokenizer st = new StringTokenizer(line);
-                        string stem = st.NextToken();
-                        Console.WriteLine(stem + " -a");
-                        while (st.HasMoreTokens())
+                        try
                         {
-                            String token = st.NextToken();
-                            if (token.Equals(stem) == false)
+                            line = line.ToLowerInvariant();
+                            StringTokenizer st = new StringTokenizer(line);
+                            string stem = st.NextToken();
+                            Console.WriteLine(stem + " -a");
+                            while (st.HasMoreTokens())
                             {
-                                Console.WriteLine(stem + " " + diff.Exec(token, stem));
+                                string token = st.NextToken();
+                                if (token.Equals(stem) == false)
+                                {
+                                    Console.WriteLine(stem + " " + diff.Exec(token, stem));
+                                }
                             }
                         }
-                    }
-                    catch (InvalidOperationException /*x*/)
-                    {
-                        // no base token (stem) on a line
+                        catch (InvalidOperationException /*x*/)
+                        {
+                            // no base token (stem) on a line
+                        }
                     }
                 }
             }