You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2013/04/03 19:40:13 UTC
[30/51] [partial] Mass convert mixed tabs to spaces
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/62f018ab/src/contrib/SpellChecker/Spell/SuggestWord.cs
----------------------------------------------------------------------
diff --git a/src/contrib/SpellChecker/Spell/SuggestWord.cs b/src/contrib/SpellChecker/Spell/SuggestWord.cs
index c8bec15..54840b2 100644
--- a/src/contrib/SpellChecker/Spell/SuggestWord.cs
+++ b/src/contrib/SpellChecker/Spell/SuggestWord.cs
@@ -19,7 +19,7 @@ using System;
namespace SpellChecker.Net.Search.Spell
{
-
+
/// <summary> SuggestWord Class, used in suggestSimilar method in SpellChecker class.
///
/// </summary>
@@ -29,13 +29,13 @@ namespace SpellChecker.Net.Search.Spell
{
/// <summary> the score of the word</summary>
public float score;
-
+
/// <summary> The freq of the word</summary>
public int freq;
-
+
/// <summary> the suggested word</summary>
public System.String termString;
-
+
public int CompareTo(SuggestWord a)
{
//first criteria: the edit distance
@@ -47,18 +47,18 @@ namespace SpellChecker.Net.Search.Spell
{
return - 1;
}
-
+
//second criteria (if first criteria is equal): the popularity
if (freq > a.freq)
{
return 1;
}
-
+
if (freq < a.freq)
{
return - 1;
}
-
+
return 0;
}
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/62f018ab/src/contrib/SpellChecker/Spell/SuggestWordQueue.cs
----------------------------------------------------------------------
diff --git a/src/contrib/SpellChecker/Spell/SuggestWordQueue.cs b/src/contrib/SpellChecker/Spell/SuggestWordQueue.cs
index de4dc09..7ae17ec 100644
--- a/src/contrib/SpellChecker/Spell/SuggestWordQueue.cs
+++ b/src/contrib/SpellChecker/Spell/SuggestWordQueue.cs
@@ -22,12 +22,12 @@ namespace SpellChecker.Net.Search.Spell
sealed class SuggestWordQueue : PriorityQueue
{
-
+
internal SuggestWordQueue(int size)
{
Initialize(size);
}
-
+
override public bool LessThan(SuggestWord a, SuggestWord b)
{
var val = a.CompareTo(b);
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/62f018ab/src/contrib/SpellChecker/Spell/TRStringDistance.cs
----------------------------------------------------------------------
diff --git a/src/contrib/SpellChecker/Spell/TRStringDistance.cs b/src/contrib/SpellChecker/Spell/TRStringDistance.cs
index f797f59..79b2314 100644
--- a/src/contrib/SpellChecker/Spell/TRStringDistance.cs
+++ b/src/contrib/SpellChecker/Spell/TRStringDistance.cs
@@ -18,16 +18,16 @@
namespace SpellChecker.Net.Search.Spell
{
-
+
/// <summary> Edit distance class</summary>
public class TRStringDistance
{
-
+
internal char[] sa;
internal int n;
internal int[][][] cache = new int[30][][];
-
-
+
+
/// <summary> Optimized to run a bit faster than the static getDistance().
/// In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
/// </summary>
@@ -36,8 +36,8 @@ namespace SpellChecker.Net.Search.Spell
sa = target.ToCharArray();
n = sa.Length;
}
-
-
+
+
//***************************
// Compute Levenshtein distance
//***************************
@@ -56,7 +56,7 @@ namespace SpellChecker.Net.Search.Spell
{
return n;
}
-
+
if (m >= cache.Length)
{
d = Form(n, m);
@@ -68,31 +68,31 @@ namespace SpellChecker.Net.Search.Spell
else
{
d = cache[m] = Form(n, m);
-
+
// Step 3
}
for (int i = 1; i <= n; i++)
{
char s_i = sa[i - 1];
-
+
// Step 4
-
+
for (int j = 1; j <= m; j++)
{
char t_j = ta[j - 1];
-
+
// Step 5
int cost = s_i == t_j ? 0 : 1;
d[i][j] = Min3(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
}
}
-
+
// Step 7
return d[n][m];
}
-
-
+
+
/// <summary> </summary>
private static int[][] Form(int n, int m)
{
@@ -102,7 +102,7 @@ namespace SpellChecker.Net.Search.Spell
d[i] = new int[m + 1];
}
// Step 2
-
+
for (int i = 0; i <= n; i++)
{
d[i][0] = i;
@@ -113,8 +113,8 @@ namespace SpellChecker.Net.Search.Spell
}
return d;
}
-
-
+
+
//**************************
// Get minimum of three values
//**************************
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/62f018ab/src/contrib/WordNet/SynExpand/SynExpand.cs
----------------------------------------------------------------------
diff --git a/src/contrib/WordNet/SynExpand/SynExpand.cs b/src/contrib/WordNet/SynExpand/SynExpand.cs
index a830f6f..79498c0 100644
--- a/src/contrib/WordNet/SynExpand/SynExpand.cs
+++ b/src/contrib/WordNet/SynExpand/SynExpand.cs
@@ -87,48 +87,48 @@ namespace WorldNet.Net
Analyzer a,
String field,
float boost)
- {
- already = new List<String>(); // avoid dups
- var top = new List<String>(); // needs to be separately listed..
- if (field == null)
- field = "contents";
-
+ {
+ already = new List<String>(); // avoid dups
+ var top = new List<String>(); // needs to be separately listed..
+ if (field == null)
+ field = "contents";
+
if (a == null)
- a = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT);
-
- // [1] Parse query into separate words so that when we expand we can avoid dups
- var ts = a.TokenStream(field, new StringReader(query));
+ a = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT);
+
+ // [1] Parse query into separate words so that when we expand we can avoid dups
+ var ts = a.TokenStream(field, new StringReader(query));
var termAtt = ts.AddAttribute<TermAttribute>();
-
+
while (ts.IncrementToken())
- {
- var word = termAtt.Term;
-
+ {
+ var word = termAtt.Term;
+
if (!already.Contains(word))
- {
- already.Add(word);
- top.Add(word);
- }
- }
-
- tmp = new BooleanQuery();
-
- // [2] form query
- System.Collections.IEnumerator it = top.GetEnumerator();
- while (it.MoveNext())
- {
- // [2a] add to level words in
- var word = (String) it.Current;
- var tq = new TermQuery(new Term(field, word));
- tmp.Add(tq, Occur.SHOULD);
-
- var c = new CollectorImpl(field, boost);
+ {
+ already.Add(word);
+ top.Add(word);
+ }
+ }
+
+ tmp = new BooleanQuery();
+
+ // [2] form query
+ System.Collections.IEnumerator it = top.GetEnumerator();
+ while (it.MoveNext())
+ {
+ // [2a] add top-level words in
+ var word = (String) it.Current;
+ var tq = new TermQuery(new Term(field, word));
+ tmp.Add(tq, Occur.SHOULD);
+
+ var c = new CollectorImpl(field, boost);
syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word)), c);
- }
-
- return tmp;
- }
-
+ }
+
+ return tmp;
+ }
+
/// <summary>
/// From project WordNet.Net.Syns2Index
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/62f018ab/src/contrib/WordNet/SynLookup/SynLookup.cs
----------------------------------------------------------------------
diff --git a/src/contrib/WordNet/SynLookup/SynLookup.cs b/src/contrib/WordNet/SynLookup/SynLookup.cs
index 62c436d..024dcc9 100644
--- a/src/contrib/WordNet/SynLookup/SynLookup.cs
+++ b/src/contrib/WordNet/SynLookup/SynLookup.cs
@@ -27,100 +27,100 @@ using Lucene.Net.Store;
namespace WorldNet.Net
{
- /// <summary> Test program to look up synonyms.</summary>
- public class SynLookup
- {
- static List<String> already;
- private static BooleanQuery tmp;
-
- [STAThread]
- public static void Main(System.String[] args)
- {
- if (args.Length != 2)
- {
- System.Console.Out.WriteLine(typeof(SynLookup) + " <index path> <word>");
- return;
- }
-
- using (var directory = FSDirectory.Open(new DirectoryInfo(args[0])))
- {
- using (var searcher = new IndexSearcher(directory, true))
- {
-
- String word = args[1];
- Query query = new TermQuery(new Term(Syns2Index.F_WORD, word));
- var countingCollector = new CountingCollector();
- searcher.Search(query, countingCollector);
-
- if (countingCollector.numHits == 0)
- {
- Console.Out.WriteLine("No synonyms found for " + word);
- }
- else
- {
- Console.Out.WriteLine("Synonyms found for \"" + word + "\":");
- }
-
- var hits = searcher.Search(query, countingCollector.numHits).ScoreDocs;
-
- foreach (var v in
- hits.Select(t => searcher.Doc(t.Doc)).Select(doc => doc.GetValues(Syns2Index.F_SYN)).SelectMany(values => values))
- {
- Console.Out.WriteLine(v);
- }
-
- }
- }
- }
-
- /// <summary>
- /// Perform synonym expansion on a query.
- /// </summary>
- /// <param name="query">query</param>
- /// <param name="syns">syns</param>
- /// <param name="a">a</param>
- /// <param name="field">field</param>
- /// <param name="boost">boost</param>
- public static Query Expand(String query,
- Searcher syns,
- Analyzer a,
- String field,
- float boost)
- {
- already = new List<String>(); // avoid dups
- var top = new List<String>(); // needs to be separately listed..
-
- var ts = a.TokenStream(field, new StringReader(query));
- var termAtt = ts.AddAttribute<TermAttribute>();
-
- while (ts.IncrementToken())
- {
- var word = termAtt.Term;
-
- if (!already.Contains(word))
- {
- already.Add(word);
- top.Add(word);
- }
- }
-
- tmp = new BooleanQuery();
-
- // [2] form query
- System.Collections.IEnumerator it = top.GetEnumerator();
- while (it.MoveNext())
- {
- // [2a] add to level words in
- var word = (String)it.Current;
- var tq = new TermQuery(new Term(field, word));
- tmp.Add(tq, Occur.SHOULD);
-
- var c = new CollectorImpl(field, boost);
- syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word)), c);
- }
-
- return tmp;
- }
+ /// <summary> Test program to look up synonyms.</summary>
+ public class SynLookup
+ {
+ static List<String> already;
+ private static BooleanQuery tmp;
+
+ [STAThread]
+ public static void Main(System.String[] args)
+ {
+ if (args.Length != 2)
+ {
+ System.Console.Out.WriteLine(typeof(SynLookup) + " <index path> <word>");
+ return;
+ }
+
+ using (var directory = FSDirectory.Open(new DirectoryInfo(args[0])))
+ {
+ using (var searcher = new IndexSearcher(directory, true))
+ {
+
+ String word = args[1];
+ Query query = new TermQuery(new Term(Syns2Index.F_WORD, word));
+ var countingCollector = new CountingCollector();
+ searcher.Search(query, countingCollector);
+
+ if (countingCollector.numHits == 0)
+ {
+ Console.Out.WriteLine("No synonyms found for " + word);
+ }
+ else
+ {
+ Console.Out.WriteLine("Synonyms found for \"" + word + "\":");
+ }
+
+ var hits = searcher.Search(query, countingCollector.numHits).ScoreDocs;
+
+ foreach (var v in
+ hits.Select(t => searcher.Doc(t.Doc)).Select(doc => doc.GetValues(Syns2Index.F_SYN)).SelectMany(values => values))
+ {
+ Console.Out.WriteLine(v);
+ }
+
+ }
+ }
+ }
+
+ /// <summary>
+ /// Perform synonym expansion on a query.
+ /// </summary>
+ /// <param name="query">query</param>
+ /// <param name="syns">syns</param>
+ /// <param name="a">a</param>
+ /// <param name="field">field</param>
+ /// <param name="boost">boost</param>
+ public static Query Expand(String query,
+ Searcher syns,
+ Analyzer a,
+ String field,
+ float boost)
+ {
+ already = new List<String>(); // avoid dups
+ var top = new List<String>(); // needs to be separately listed..
+
+ var ts = a.TokenStream(field, new StringReader(query));
+ var termAtt = ts.AddAttribute<TermAttribute>();
+
+ while (ts.IncrementToken())
+ {
+ var word = termAtt.Term;
+
+ if (!already.Contains(word))
+ {
+ already.Add(word);
+ top.Add(word);
+ }
+ }
+
+ tmp = new BooleanQuery();
+
+ // [2] form query
+ System.Collections.IEnumerator it = top.GetEnumerator();
+ while (it.MoveNext())
+ {
+ // [2a] add top-level words in
+ var word = (String)it.Current;
+ var tq = new TermQuery(new Term(field, word));
+ tmp.Add(tq, Occur.SHOULD);
+
+ var c = new CollectorImpl(field, boost);
+ syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word)), c);
+ }
+
+ return tmp;
+ }
internal sealed class CountingCollector : Collector
{
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/62f018ab/src/contrib/WordNet/Syns2Index/Syns2Index.cs
----------------------------------------------------------------------
diff --git a/src/contrib/WordNet/Syns2Index/Syns2Index.cs b/src/contrib/WordNet/Syns2Index/Syns2Index.cs
index ac5bea6..da96a8a 100644
--- a/src/contrib/WordNet/Syns2Index/Syns2Index.cs
+++ b/src/contrib/WordNet/Syns2Index/Syns2Index.cs
@@ -29,264 +29,264 @@ using IndexWriter = Lucene.Net.Index.IndexWriter;
namespace WorldNet.Net
{
-
- /// <summary> Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a>
- /// into a Lucene index suitable for looking up synonyms and performing query expansion (<see cref="SynExpand.Expand"/>).
- ///
- /// This has been tested with WordNet 2.0.
- ///
- /// The index has fields named "word" (<see cref="F_WORD"/>)
- /// and "syn" (<see cref="F_SYN"/>).
- /// <p>
- /// The source word (such as 'big') can be looked up in the
- /// "word" field, and if present there will be fields named "syn"
- /// for every synonym. What's tricky here is that there could be <b>multiple</b>
- /// fields with the same name, in the general case for words that have multiple synonyms.
- /// That's not a problem with Lucene, you just use <see cref="Document.GetValues"/>
- /// </p>
- /// <p>
- /// While the WordNet file distinguishes groups of synonyms with
- /// related meanings we don't do that here.
- /// </p>
- /// This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB.
- /// </summary>
- ///
- /// <seealso cref="http://www.cogsci.princeton.edu/~wn/"></seealso>
- /// <seealso cref="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html"></seealso>
- /// <seealso cref="http://www.hostmon.com/rfc/advanced.jsp"> </seealso>
- public class Syns2Index
- {
- /// <summary> </summary>
- private static readonly System.IO.StreamWriter o;
-
- /// <summary> </summary>
- private static readonly System.IO.StreamWriter err;
-
- /// <summary> </summary>
- public const System.String F_SYN = "syn";
-
- /// <summary> </summary>
- public const System.String F_WORD = "word";
-
- /// <summary> </summary>
- private static readonly Analyzer ana = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT);
-
- /// <summary>
- /// Takes arg of prolog file name and index directory.
- /// </summary>
- [STAThread]
- public static void Main(System.String[] args)
- {
- // get command line arguments
- String prologFilename = null; // name of file "wn_s.pl"
- String indexDir = null;
- if (args.Length == 2)
- {
- prologFilename = args[0];
- indexDir = args[1];
- }
- else
- {
- Usage();
- Environment.Exit(1);
- }
-
- // ensure that the prolog file is readable
- if (!(new FileInfo(prologFilename)).Exists)
- {
- err.WriteLine("Error: cannot read Prolog file: " + prologFilename);
- Environment.Exit(1);
- }
- // exit if the target index directory already exists
- if (Directory.Exists((new FileInfo(indexDir)).FullName))
- {
- err.WriteLine("Error: index directory already exists: " + indexDir);
- err.WriteLine("Please specify a name of a non-existent directory");
- Environment.Exit(1);
- }
-
- o.WriteLine("Opening Prolog file " + prologFilename);
- var fis = new FileStream(prologFilename, FileMode.Open, FileAccess.Read);
- var br = new StreamReader(new StreamReader(fis, System.Text.Encoding.Default).BaseStream, new StreamReader(fis, System.Text.Encoding.Default).CurrentEncoding);
- String line;
-
- // maps a word to all the "groups" it's in
- System.Collections.IDictionary word2Nums = new System.Collections.SortedList();
- // maps a group to all the words in it
- System.Collections.IDictionary num2Words = new System.Collections.SortedList();
- // number of rejected words
- var ndecent = 0;
-
- // status output
- var mod = 1;
- var row = 1;
- // parse prolog file
- o.WriteLine("[1/2] Parsing " + prologFilename);
- while ((line = br.ReadLine()) != null)
- {
- // occasional progress
- if ((++row) % mod == 0) // periodically print out line we read in
- {
- mod *= 2;
- o.WriteLine("\t" + row + " " + line + " " + word2Nums.Count + " " + num2Words.Count + " ndecent=" + ndecent);
- }
-
- // syntax check
- if (!line.StartsWith("s("))
- {
- err.WriteLine("OUCH: " + line);
- Environment.Exit(1);
- }
-
- // parse line
- line = line.Substring(2);
- var comma = line.IndexOf(',');
- var num = line.Substring(0, comma);
- var q1 = line.IndexOf('\'');
- line = line.Substring(q1 + 1);
- var q2 = line.IndexOf('\'');
- var word = line.Substring(0, q2).ToLower().Replace("''", "'");
-
- // make sure is a normal word
- if (!IsDecent(word))
- {
- ndecent++;
- continue; // don't store words w/ spaces
- }
-
- // 1/2: word2Nums map
- // append to entry or add new one
- var lis = (System.Collections.IList) word2Nums[word];
- if (lis == null)
- {
- lis = new List<String> {num};
- word2Nums[word] = lis;
- }
- else
- lis.Add(num);
-
- // 2/2: num2Words map
- lis = (System.Collections.IList) num2Words[num];
- if (lis == null)
- {
- lis = new List<String> { word };
- num2Words[num] = lis;
- }
- else
- lis.Add(word);
- }
-
- // close the streams
- fis.Close();
- br.Close();
-
- // create the index
- o.WriteLine("[2/2] Building index to store synonyms, " + " map sizes are " + word2Nums.Count + " and " + num2Words.Count);
- Index(indexDir, word2Nums, num2Words);
- }
-
- /// <summary>
- /// Checks to see if a word contains only alphabetic characters by
- /// checking it one character at a time.
- /// </summary>
- /// <param name="s">string to check </param>
- /// <returns> <c>true</c> if the string is decent</returns>
- private static bool IsDecent(String s)
- {
- var len = s.Length;
- for (var i = 0; i < len; i++)
- {
- if (!Char.IsLetter(s[i]))
- {
- return false;
- }
- }
- return true;
- }
-
- /// <summary>
- /// Forms a Lucene index based on the 2 maps.
- /// </summary>
- /// <param name="indexDir">the direcotry where the index should be created</param>
- /// <param name="word2Nums">word2Nums</param>
- /// <param name="num2Words">num2Words</param>
- private static void Index(String indexDir, System.Collections.IDictionary word2Nums, System.Collections.IDictionary num2Words)
- {
- var row = 0;
- var mod = 1;
-
- using (var dir = FSDirectory.Open(new DirectoryInfo(indexDir)))
- {
- var writer = new IndexWriter(dir, ana, true, IndexWriter.MaxFieldLength.LIMITED);
- writer.UseCompoundFile = true; // why?
+
+ /// <summary> Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a>
+ /// into a Lucene index suitable for looking up synonyms and performing query expansion (<see cref="SynExpand.Expand"/>).
+ ///
+ /// This has been tested with WordNet 2.0.
+ ///
+ /// The index has fields named "word" (<see cref="F_WORD"/>)
+ /// and "syn" (<see cref="F_SYN"/>).
+ /// <p>
+ /// The source word (such as 'big') can be looked up in the
+ /// "word" field, and if present there will be fields named "syn"
+ /// for every synonym. What's tricky here is that there could be <b>multiple</b>
+ /// fields with the same name, in the general case for words that have multiple synonyms.
+ /// That's not a problem with Lucene, you just use <see cref="Document.GetValues"/>
+ /// </p>
+ /// <p>
+ /// While the WordNet file distinguishes groups of synonyms with
+ /// related meanings we don't do that here.
+ /// </p>
+ /// This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB.
+ /// </summary>
+ ///
+ /// <seealso cref="http://www.cogsci.princeton.edu/~wn/"></seealso>
+ /// <seealso cref="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html"></seealso>
+ /// <seealso cref="http://www.hostmon.com/rfc/advanced.jsp"> </seealso>
+ public class Syns2Index
+ {
+ /// <summary> </summary>
+ private static readonly System.IO.StreamWriter o;
+
+ /// <summary> </summary>
+ private static readonly System.IO.StreamWriter err;
+
+ /// <summary> </summary>
+ public const System.String F_SYN = "syn";
+
+ /// <summary> </summary>
+ public const System.String F_WORD = "word";
+
+ /// <summary> </summary>
+ private static readonly Analyzer ana = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT);
+
+ /// <summary>
+ /// Takes arg of prolog file name and index directory.
+ /// </summary>
+ [STAThread]
+ public static void Main(System.String[] args)
+ {
+ // get command line arguments
+ String prologFilename = null; // name of file "wn_s.pl"
+ String indexDir = null;
+ if (args.Length == 2)
+ {
+ prologFilename = args[0];
+ indexDir = args[1];
+ }
+ else
+ {
+ Usage();
+ Environment.Exit(1);
+ }
+
+ // ensure that the prolog file is readable
+ if (!(new FileInfo(prologFilename)).Exists)
+ {
+ err.WriteLine("Error: cannot read Prolog file: " + prologFilename);
+ Environment.Exit(1);
+ }
+ // exit if the target index directory already exists
+ if (Directory.Exists((new FileInfo(indexDir)).FullName))
+ {
+ err.WriteLine("Error: index directory already exists: " + indexDir);
+ err.WriteLine("Please specify a name of a non-existent directory");
+ Environment.Exit(1);
+ }
+
+ o.WriteLine("Opening Prolog file " + prologFilename);
+ var fis = new FileStream(prologFilename, FileMode.Open, FileAccess.Read);
+ var br = new StreamReader(new StreamReader(fis, System.Text.Encoding.Default).BaseStream, new StreamReader(fis, System.Text.Encoding.Default).CurrentEncoding);
+ String line;
+
+ // maps a word to all the "groups" it's in
+ System.Collections.IDictionary word2Nums = new System.Collections.SortedList();
+ // maps a group to all the words in it
+ System.Collections.IDictionary num2Words = new System.Collections.SortedList();
+ // number of rejected words
+ var ndecent = 0;
+
+ // status output
+ var mod = 1;
+ var row = 1;
+ // parse prolog file
+ o.WriteLine("[1/2] Parsing " + prologFilename);
+ while ((line = br.ReadLine()) != null)
+ {
+ // occasional progress
+ if ((++row) % mod == 0) // periodically print out line we read in
+ {
+ mod *= 2;
+ o.WriteLine("\t" + row + " " + line + " " + word2Nums.Count + " " + num2Words.Count + " ndecent=" + ndecent);
+ }
+
+ // syntax check
+ if (!line.StartsWith("s("))
+ {
+ err.WriteLine("OUCH: " + line);
+ Environment.Exit(1);
+ }
+
+ // parse line
+ line = line.Substring(2);
+ var comma = line.IndexOf(',');
+ var num = line.Substring(0, comma);
+ var q1 = line.IndexOf('\'');
+ line = line.Substring(q1 + 1);
+ var q2 = line.IndexOf('\'');
+ var word = line.Substring(0, q2).ToLower().Replace("''", "'");
+
+ // make sure is a normal word
+ if (!IsDecent(word))
+ {
+ ndecent++;
+ continue; // don't store words w/ spaces
+ }
+
+ // 1/2: word2Nums map
+ // append to entry or add new one
+ var lis = (System.Collections.IList) word2Nums[word];
+ if (lis == null)
+ {
+ lis = new List<String> {num};
+ word2Nums[word] = lis;
+ }
+ else
+ lis.Add(num);
+
+ // 2/2: num2Words map
+ lis = (System.Collections.IList) num2Words[num];
+ if (lis == null)
+ {
+ lis = new List<String> { word };
+ num2Words[num] = lis;
+ }
+ else
+ lis.Add(word);
+ }
+
+ // close the streams
+ fis.Close();
+ br.Close();
+
+ // create the index
+ o.WriteLine("[2/2] Building index to store synonyms, " + " map sizes are " + word2Nums.Count + " and " + num2Words.Count);
+ Index(indexDir, word2Nums, num2Words);
+ }
+
+ /// <summary>
+ /// Checks to see if a word contains only alphabetic characters by
+ /// checking it one character at a time.
+ /// </summary>
+ /// <param name="s">string to check </param>
+ /// <returns> <c>true</c> if the string is decent</returns>
+ private static bool IsDecent(String s)
+ {
+ var len = s.Length;
+ for (var i = 0; i < len; i++)
+ {
+ if (!Char.IsLetter(s[i]))
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /// <summary>
+ /// Forms a Lucene index based on the 2 maps.
+ /// </summary>
+ /// <param name="indexDir">the directory where the index should be created</param>
+ /// <param name="word2Nums">word2Nums</param>
+ /// <param name="num2Words">num2Words</param>
+ private static void Index(String indexDir, System.Collections.IDictionary word2Nums, System.Collections.IDictionary num2Words)
+ {
+ var row = 0;
+ var mod = 1;
+
+ using (var dir = FSDirectory.Open(new DirectoryInfo(indexDir)))
+ {
+ var writer = new IndexWriter(dir, ana, true, IndexWriter.MaxFieldLength.LIMITED);
+ writer.UseCompoundFile = true; // why?
- var i1 = word2Nums.Keys.GetEnumerator();
- while (i1.MoveNext())
- {
- var g = (String)i1.Current;
- var doc = new Document();
+ var i1 = word2Nums.Keys.GetEnumerator();
+ while (i1.MoveNext())
+ {
+ var g = (String)i1.Current;
+ var doc = new Document();
- var n = Index(word2Nums, num2Words, g, doc);
- if (n > 0)
- {
- doc.Add(new Field(F_WORD, g, Field.Store.YES, Field.Index.NOT_ANALYZED));
- if ((++row % mod) == 0)
- {
- o.WriteLine("\trow=" + row + "/" + word2Nums.Count + " doc= " + doc);
- mod *= 2;
- }
- writer.AddDocument(doc);
- }
- }
- o.WriteLine("Optimizing..");
- writer.Optimize();
- writer.Close();
- }
-
- }
+ var n = Index(word2Nums, num2Words, g, doc);
+ if (n > 0)
+ {
+ doc.Add(new Field(F_WORD, g, Field.Store.YES, Field.Index.NOT_ANALYZED));
+ if ((++row % mod) == 0)
+ {
+ o.WriteLine("\trow=" + row + "/" + word2Nums.Count + " doc= " + doc);
+ mod *= 2;
+ }
+ writer.AddDocument(doc);
+ }
+ }
+ o.WriteLine("Optimizing..");
+ writer.Optimize();
+ writer.Close();
+ }
+
+ }
- /// <summary>
- /// Given the 2 maps fills a document for 1 word.
- /// </summary>
- private static int Index(System.Collections.IDictionary word2Nums, System.Collections.IDictionary num2Words, System.String g, Document doc)
- {
- var keys = (System.Collections.IList) word2Nums[g]; // get list of key#'s
- var i2 = keys.GetEnumerator();
-
- var already = new System.Collections.SortedList(); // keep them sorted
-
- // pass 1: fill up 'already' with all words
- while (i2.MoveNext()) // for each key#
- {
- foreach (var item in
- ((System.Collections.IList) num2Words[i2.Current]).Cast<object>().Where(item => already.Contains(item) == false))
- {
- already.Add(item, item);
- }
- }
+ /// <summary>
+ /// Given the 2 maps fills a document for 1 word.
+ /// </summary>
+ private static int Index(System.Collections.IDictionary word2Nums, System.Collections.IDictionary num2Words, System.String g, Document doc)
+ {
+ var keys = (System.Collections.IList) word2Nums[g]; // get list of key#'s
+ var i2 = keys.GetEnumerator();
+
+ var already = new System.Collections.SortedList(); // keep them sorted
+
+ // pass 1: fill up 'already' with all words
+ while (i2.MoveNext()) // for each key#
+ {
+ foreach (var item in
+ ((System.Collections.IList) num2Words[i2.Current]).Cast<object>().Where(item => already.Contains(item) == false))
+ {
+ already.Add(item, item);
+ }
+ }
- var num = 0;
- already.Remove(g); // of course a word is it's own syn
- var it = already.GetEnumerator();
- while (it.MoveNext())
- {
- var cur = (String) it.Key;
- // don't store things like 'pit bull' -> 'american pit bull'
- if (!IsDecent(cur))
- {
- continue;
- }
- num++;
- doc.Add(new Field(F_SYN, cur, Field.Store.YES, Field.Index.NO));
- }
- return num;
- }
-
- /// <summary> </summary>
- private static void Usage()
- {
- o.WriteLine("\n\n" + typeof(Syns2Index) + " <prolog file> <index dir>\n\n");
- }
+ var num = 0;
+ already.Remove(g); // of course a word is its own syn
+ var it = already.GetEnumerator();
+ while (it.MoveNext())
+ {
+ var cur = (String) it.Key;
+ // don't store things like 'pit bull' -> 'american pit bull'
+ if (!IsDecent(cur))
+ {
+ continue;
+ }
+ num++;
+ doc.Add(new Field(F_SYN, cur, Field.Store.YES, Field.Index.NO));
+ }
+ return num;
+ }
+
+ /// <summary> </summary>
+ private static void Usage()
+ {
+ o.WriteLine("\n\n" + typeof(Syns2Index) + " <prolog file> <index dir>\n\n");
+ }
- }
+ }
}
\ No newline at end of file