You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by si...@apache.org on 2012/08/27 14:04:23 UTC
svn commit: r1377648 - in /incubator/lucene.net/trunk/src/demo:
DeleteFiles/DeleteFiles.cs Demo.Common/FileDocument.cs
Demo.Common/HTMLDocument.cs IndexFiles/IndexFiles.cs IndexHtml/IndexHtml.cs
SearchFiles/SearchFiles.cs
Author: sisve
Date: Mon Aug 27 12:04:23 2012
New Revision: 1377648
URL: http://svn.apache.org/viewvc?rev=1377648&view=rev
Log:
Changes to demo-files (IDisposable, documentation, etc.)
Modified:
incubator/lucene.net/trunk/src/demo/DeleteFiles/DeleteFiles.cs
incubator/lucene.net/trunk/src/demo/Demo.Common/FileDocument.cs
incubator/lucene.net/trunk/src/demo/Demo.Common/HTMLDocument.cs
incubator/lucene.net/trunk/src/demo/IndexFiles/IndexFiles.cs
incubator/lucene.net/trunk/src/demo/IndexHtml/IndexHtml.cs
incubator/lucene.net/trunk/src/demo/SearchFiles/SearchFiles.cs
Modified: incubator/lucene.net/trunk/src/demo/DeleteFiles/DeleteFiles.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/demo/DeleteFiles/DeleteFiles.cs?rev=1377648&r1=1377647&r2=1377648&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/demo/DeleteFiles/DeleteFiles.cs (original)
+++ incubator/lucene.net/trunk/src/demo/DeleteFiles/DeleteFiles.cs Mon Aug 27 12:04:23 2012
@@ -16,57 +16,51 @@
*/
using System;
-
-using IndexReader = Lucene.Net.Index.IndexReader;
-using Term = Lucene.Net.Index.Term;
-using Directory = Lucene.Net.Store.Directory;
+using Lucene.Net.Index;
using FSDirectory = Lucene.Net.Store.FSDirectory;
namespace Lucene.Net.Demo
{
-
-
/// <summary>Deletes documents from an index that contain a term. </summary>
- public class DeleteFiles
+ public static class DeleteFiles
{
- private DeleteFiles()
- {
- } // singleton
-
/// <summary>Deletes documents from an index that contain a term. </summary>
[STAThread]
- public static void Main(System.String[] args)
+ public static void Main(System.String[] args)
{
- System.String usage = typeof(DeleteFiles) + " <unique_term>";
+ var usage = typeof(DeleteFiles) + " <unique_term>";
if (args.Length == 0)
{
- System.Console.Error.WriteLine("Usage: " + usage);
- System.Environment.Exit(1);
+ Console.Error.WriteLine("Usage: " + usage);
+ Environment.Exit(1);
}
+
try
{
- Directory directory = FSDirectory.Open("index");
- IndexReader reader = IndexReader.Open(directory, false); // we don't want read-only because we are about to delete
-
- Term term = new Term("path", args[0]);
- int deleted = reader.DeleteDocuments(term);
-
- System.Console.Out.WriteLine("deleted " + deleted + " documents containing " + term);
-
- // one can also delete documents by their internal id:
- /*
- for (int i = 0; i < reader.maxDoc(); i++) {
- System.out.println("Deleting document with id " + i);
- reader.delete(i);
- }*/
-
- reader.Close();
- directory.Close();
+ // We don't want a read-only reader because we are about to delete.
+ using (var directory = FSDirectory.Open("index"))
+ using (var reader = IndexReader.Open(directory, false))
+ {
+ var term = new Term("path", args[0]);
+ var deleted = reader.DeleteDocuments(term);
+
+ Console.Out.WriteLine("deleted " + deleted + " documents containing " + term);
+
+ // one can also delete documents by their internal id:
+ /*
+ for (int i = 0; i < reader.MaxDoc; i++) {
+ Console.Out.WriteLine("Deleting document with id " + i);
+ reader.DeleteDocument(i);
+ }
+ */
+
+ reader.Commit();
+ }
}
- catch (System.Exception e)
+ catch (Exception e)
{
- System.Console.Out.WriteLine(" caught a " + e.GetType() + "\n with message: " + e.Message);
+ Console.Out.WriteLine(" caught a " + e.GetType() + "\n with message: " + e.Message);
}
}
}
Modified: incubator/lucene.net/trunk/src/demo/Demo.Common/FileDocument.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/demo/Demo.Common/FileDocument.cs?rev=1377648&r1=1377647&r2=1377648&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/demo/Demo.Common/FileDocument.cs (original)
+++ incubator/lucene.net/trunk/src/demo/Demo.Common/FileDocument.cs Mon Aug 27 12:04:23 2012
@@ -16,17 +16,15 @@
*/
using System;
-
-using DateTools = Lucene.Net.Documents.DateTools;
-using Document = Lucene.Net.Documents.Document;
-using Field = Lucene.Net.Documents.Field;
+using System.IO;
+using Lucene.Net.Documents;
namespace Lucene.Net.Demo
{
/// <summary>A utility for making Lucene Documents from a File. </summary>
- public class FileDocument
+ public static class FileDocument
{
/// <summary>Makes a document for a File.
/// <p>
@@ -40,7 +38,7 @@ namespace Lucene.Net.Demo
/// <li><code>contents</code>--containing the full contents of the file, as a
/// Reader field;
/// </summary>
- public static Document Document(System.IO.DirectoryInfo f)
+ public static Document Document(FileInfo f)
{
// make a new, empty document
@@ -59,14 +57,10 @@ namespace Lucene.Net.Demo
// so that the text of the file is tokenized and indexed, but not stored.
// Note that FileReader expects the file to be in the system's default encoding.
// If that's not the case searching for special characters will fail.
- doc.Add(new Field("contents", new System.IO.StreamReader(f.FullName, System.Text.Encoding.Default)));
+ doc.Add(new Field("contents", new StreamReader(f.FullName, System.Text.Encoding.Default)));
// return the document
return doc;
}
-
- private FileDocument()
- {
- }
}
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/src/demo/Demo.Common/HTMLDocument.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/demo/Demo.Common/HTMLDocument.cs?rev=1377648&r1=1377647&r2=1377648&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/demo/Demo.Common/HTMLDocument.cs (original)
+++ incubator/lucene.net/trunk/src/demo/Demo.Common/HTMLDocument.cs Mon Aug 27 12:04:23 2012
@@ -16,20 +16,21 @@
*/
using System;
+using System.IO;
+using Lucene.Net.Documents;
using HTMLParser = Lucene.Net.Demo.Html.HTMLParser;
-using Lucene.Net.Documents;
namespace Lucene.Net.Demo
{
/// <summary>A utility for making Lucene Documents for HTML documents. </summary>
- public class HTMLDocument
+ public static class HTMLDocument
{
- internal static char dirSep = System.IO.Path.DirectorySeparatorChar.ToString()[0];
+ internal static char dirSep = Path.DirectorySeparatorChar;
- public static System.String Uid(System.IO.DirectoryInfo f)
+ public static String Uid(FileInfo f)
{
// Append path and date into a string in such a way that lexicographic
// sorting gives the same results as a walk of the file hierarchy. Thus
@@ -38,13 +39,13 @@ namespace Lucene.Net.Demo
return f.FullName.Replace(dirSep, '\u0000') + "\u0000" + DateTools.TimeToString(f.LastWriteTime.Millisecond, DateTools.Resolution.SECOND);
}
- public static System.String Uid2url(System.String uid)
+ public static String Uid2url(String uid)
{
- System.String url = uid.Replace('\u0000', '/'); // replace nulls with slashes
+ var url = uid.Replace('\u0000', '/'); // replace nulls with slashes
return url.Substring(0, (url.LastIndexOf('/')) - (0)); // remove date from end
}
- public static Document Document(System.IO.DirectoryInfo f)
+ public static Document Document(FileInfo f)
{
// make a new, empty document
Document doc = new Document();
@@ -62,27 +63,25 @@ namespace Lucene.Net.Demo
// This field is not stored with document, it is indexed, but it is not
// tokenized prior to indexing.
doc.Add(new Field("uid", Uid(f), Field.Store.NO, Field.Index.NOT_ANALYZED));
-
- System.IO.FileStream fis = new System.IO.FileStream(f.FullName, System.IO.FileMode.Open, System.IO.FileAccess.Read);
- HTMLParser parser = new HTMLParser(fis);
-
- // Add the tag-stripped contents as a Reader-valued Text field so it will
- // get tokenized and indexed.
- doc.Add(new Field("contents", parser.GetReader()));
-
- // Add the summary as a field that is stored and returned with
- // hit documents for display.
- doc.Add(new Field("summary", parser.GetSummary(), Field.Store.YES, Field.Index.NO));
-
- // Add the title as a field that it can be searched and that is stored.
- doc.Add(new Field("title", parser.GetTitle(), Field.Store.YES, Field.Index.ANALYZED));
-
- // return the document
- return doc;
- }
-
- private HTMLDocument()
- {
- }
+
+ using (var fileStream = f.OpenRead())
+ {
+ var parser = new HTMLParser(fileStream);
+
+ // Add the tag-stripped contents as a Reader-valued Text field so it will
+ // get tokenized and indexed.
+ doc.Add(new Field("contents", parser.GetReader()));
+
+ // Add the summary as a field that is stored and returned with
+ // hit documents for display.
+ doc.Add(new Field("summary", parser.GetSummary(), Field.Store.YES, Field.Index.NO));
+
+ // Add the title as a field so that it can be searched and is stored.
+ doc.Add(new Field("title", parser.GetTitle(), Field.Store.YES, Field.Index.ANALYZED));
+
+ // return the document
+ return doc;
+ }
+ }
}
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/src/demo/IndexFiles/IndexFiles.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/demo/IndexFiles/IndexFiles.cs?rev=1377648&r1=1377647&r2=1377648&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/demo/IndexFiles/IndexFiles.cs (original)
+++ incubator/lucene.net/trunk/src/demo/IndexFiles/IndexFiles.cs Mon Aug 27 12:04:23 2012
@@ -16,9 +16,10 @@
*/
using System;
+using System.IO;
+using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Index;
-using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
-using IndexWriter = Lucene.Net.Index.IndexWriter;
using FSDirectory = Lucene.Net.Store.FSDirectory;
using Version = Lucene.Net.Util.Version;
@@ -26,100 +27,85 @@ namespace Lucene.Net.Demo
{
/// <summary>Index all text files under a directory. </summary>
- public class IndexFiles
+ public static class IndexFiles
{
-
- private IndexFiles()
- {
- }
-
- internal static readonly System.IO.DirectoryInfo INDEX_DIR = new System.IO.DirectoryInfo("index");
+ internal static readonly DirectoryInfo INDEX_DIR = new DirectoryInfo("index");
/// <summary>Index all text files under a directory. </summary>
[STAThread]
- public static void Main(System.String[] args)
+ public static void Main(String[] args)
{
- System.String usage = typeof(IndexFiles) + " <root_directory>";
+ var usage = typeof(IndexFiles) + " <root_directory>";
if (args.Length == 0)
{
- System.Console.Error.WriteLine("Usage: " + usage);
- System.Environment.Exit(1);
+ Console.Error.WriteLine("Usage: " + usage);
+ Environment.Exit(1);
}
-
- bool tmpBool;
- if (System.IO.File.Exists(INDEX_DIR.FullName))
- tmpBool = true;
- else
- tmpBool = System.IO.Directory.Exists(INDEX_DIR.FullName);
- if (tmpBool)
- {
- System.Console.Out.WriteLine("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first");
- System.Environment.Exit(1);
- }
-
- var docDir = new System.IO.DirectoryInfo(args[0]);
- bool tmpBool2;
- if (System.IO.File.Exists(docDir.FullName))
- tmpBool2 = true;
- else
- tmpBool2 = System.IO.Directory.Exists(docDir.FullName);
- if (!tmpBool2) // || !docDir.canRead()) // {{Aroush}} what is canRead() in C#?
+
+ if (File.Exists(INDEX_DIR.FullName) || Directory.Exists(INDEX_DIR.FullName))
{
- System.Console.Out.WriteLine("Document directory '" + docDir.FullName + "' does not exist or is not readable, please check the path");
- System.Environment.Exit(1);
+ Console.Out.WriteLine("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first");
+ Environment.Exit(1);
+ }
+
+ var docDir = new DirectoryInfo(args[0]);
+ var docDirExists = File.Exists(docDir.FullName) || Directory.Exists(docDir.FullName);
+ if (!docDirExists) // || !docDir.canRead()) // {{Aroush}} what is canRead() in C#?
+ {
+ Console.Out.WriteLine("Document directory '" + docDir.FullName + "' does not exist or is not readable, please check the path");
+ Environment.Exit(1);
}
- System.DateTime start = System.DateTime.Now;
+ var start = DateTime.Now;
try
{
- IndexWriter writer = new IndexWriter(FSDirectory.Open(INDEX_DIR), new StandardAnalyzer(Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED);
- System.Console.Out.WriteLine("Indexing to directory '" + INDEX_DIR + "'...");
- IndexDocs(writer, docDir);
- System.Console.Out.WriteLine("Optimizing...");
- writer.Optimize();
- writer.Close();
-
- System.DateTime end = System.DateTime.Now;
- System.Console.Out.WriteLine(end.Millisecond - start.Millisecond + " total milliseconds");
+ using (var writer = new IndexWriter(FSDirectory.Open(INDEX_DIR), new StandardAnalyzer(Version.LUCENE_30), true, IndexWriter.MaxFieldLength.LIMITED))
+ {
+ Console.Out.WriteLine("Indexing to directory '" + INDEX_DIR + "'...");
+ IndexDirectory(writer, docDir);
+ Console.Out.WriteLine("Optimizing...");
+ writer.Optimize();
+ writer.Commit();
+ }
+ var end = DateTime.Now;
+ Console.Out.WriteLine(end.Millisecond - start.Millisecond + " total milliseconds");
}
- catch (System.IO.IOException e)
+ catch (IOException e)
{
- System.Console.Out.WriteLine(" caught a " + e.GetType() + "\n with message: " + e.Message);
+ Console.Out.WriteLine(" caught a " + e.GetType() + "\n with message: " + e.Message);
}
}
- internal static void IndexDocs(IndexWriter writer, System.IO.DirectoryInfo file)
+ internal static void IndexDirectory(IndexWriter writer, DirectoryInfo directory)
+ {
+ foreach(var subDirectory in directory.GetDirectories())
+ IndexDirectory(writer, subDirectory);
+
+ foreach (var file in directory.GetFiles())
+ IndexDocs(writer, file);
+ }
+
+ internal static void IndexDocs(IndexWriter writer, FileInfo file)
{
- // do not try to index files that cannot be read
- // if (file.canRead()) // {{Aroush}} what is canRead() in C#?
+ Console.Out.WriteLine("adding " + file);
+
+ try
+ {
+ writer.AddDocument(FileDocument.Document(file));
+ }
+ catch (FileNotFoundException)
{
- if (System.IO.Directory.Exists(file.FullName))
- {
- System.String[] files = System.IO.Directory.GetFileSystemEntries(file.FullName);
- // an IO error could occur
- if (files != null)
- {
- for (int i = 0; i < files.Length; i++)
- {
- IndexDocs(writer, new System.IO.DirectoryInfo(files[i]));
- }
- }
- }
- else
- {
- System.Console.Out.WriteLine("adding " + file);
- try
- {
- writer.AddDocument(FileDocument.Document(file));
- }
- // at least on windows, some temporary files raise this exception with an "access denied" message
- // checking if the file can be read doesn't help
- catch (System.IO.FileNotFoundException fnfe)
- {
- ;
- }
- }
+ // At least on Windows, some temporary files raise this exception with an
+ // "access denied" message; checking if the file can be read doesn't help.
}
+ catch (UnauthorizedAccessException)
+ {
+ // Handle any access-denied errors that occur while reading the file.
+ }
+ catch (IOException)
+ {
+ // Generic handler for any io-related exceptions that occur.
+ }
}
}
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/src/demo/IndexHtml/IndexHtml.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/demo/IndexHtml/IndexHtml.cs?rev=1377648&r1=1377647&r2=1377648&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/demo/IndexHtml/IndexHtml.cs (original)
+++ incubator/lucene.net/trunk/src/demo/IndexHtml/IndexHtml.cs Mon Aug 27 12:04:23 2012
@@ -16,13 +16,11 @@
*/
using System;
+using System.Diagnostics;
+using System.IO;
+using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Index;
-using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
-using Document = Lucene.Net.Documents.Document;
-using IndexReader = Lucene.Net.Index.IndexReader;
-using IndexWriter = Lucene.Net.Index.IndexWriter;
-using Term = Lucene.Net.Index.Term;
-using TermEnum = Lucene.Net.Index.TermEnum;
using FSDirectory = Lucene.Net.Store.FSDirectory;
using Version = Lucene.Net.Util.Version;
@@ -30,32 +28,24 @@ namespace Lucene.Net.Demo
{
/// <summary>Indexer for HTML files. </summary>
- public class IndexHTML
+ public static class IndexHTML
{
- private IndexHTML()
- {
- }
-
- private static bool deleting = false; // true during deletion pass
- private static IndexReader reader; // existing index
- private static IndexWriter writer; // new index being built
- private static TermEnum uidIter; // document id iterator
-
+
/// <summary>Indexer for HTML files.</summary>
[STAThread]
- public static void Main(System.String[] argv)
+ public static void Main(System.String[] argv)
{
try
{
- var index = new System.IO.DirectoryInfo("index");
+ var index = new DirectoryInfo("index");
bool create = false;
- System.IO.DirectoryInfo root = null;
+ DirectoryInfo root = null;
- System.String usage = "IndexHTML [-create] [-index <index>] <root_directory>";
+ var usage = "IndexHTML [-create] [-index <index>] <root_directory>";
if (argv.Length == 0)
{
- System.Console.Error.WriteLine("Usage: " + usage);
+ Console.Error.WriteLine("Usage: " + usage);
return ;
}
@@ -64,7 +54,7 @@ namespace Lucene.Net.Demo
if (argv[i].Equals("-index"))
{
// parse -index option
- index = new System.IO.DirectoryInfo(argv[++i]);
+ index = new DirectoryInfo(argv[++i]);
}
else if (argv[i].Equals("-create"))
{
@@ -73,43 +63,49 @@ namespace Lucene.Net.Demo
}
else if (i != argv.Length - 1)
{
- System.Console.Error.WriteLine("Usage: " + usage);
+ Console.Error.WriteLine("Usage: " + usage);
return ;
}
else
- root = new System.IO.DirectoryInfo(argv[i]);
+ root = new DirectoryInfo(argv[i]);
}
if (root == null)
{
- System.Console.Error.WriteLine("Specify directory to index");
- System.Console.Error.WriteLine("Usage: " + usage);
+ Console.Error.WriteLine("Specify directory to index");
+ Console.Error.WriteLine("Usage: " + usage);
return ;
}
- System.DateTime start = System.DateTime.Now;
-
- if (!create)
- {
- // delete stale docs
- deleting = true;
- IndexDocs(root, index, create);
- }
- writer = new IndexWriter(FSDirectory.Open(index), new StandardAnalyzer(Version.LUCENE_CURRENT), create, new IndexWriter.MaxFieldLength(1000000));
- IndexDocs(root, index, create); // add new docs
-
- System.Console.Out.WriteLine("Optimizing index...");
- writer.Optimize();
- writer.Close();
-
- System.DateTime end = System.DateTime.Now;
+ var start = DateTime.Now;
+
+ using (var writer = new IndexWriter(FSDirectory.Open(index), new StandardAnalyzer(Version.LUCENE_30), create, new IndexWriter.MaxFieldLength(1000000)))
+ {
+ if (!create)
+ {
+ // We're not creating a new index, iterate our index and remove
+ // any stale documents.
+ IndexDocs(writer, root, index, Operation.RemoveStale);
+ }
+
+ var operation = create
+ ? Operation.CompleteReindex
+ : Operation.IncrementalReindex;
+ IndexDocs(writer, root, index, operation); // add new docs
+
+ Console.Out.WriteLine("Optimizing index...");
+ writer.Optimize();
+ writer.Commit();
+ }
+
+ var end = DateTime.Now;
- System.Console.Out.Write(end.Millisecond - start.Millisecond);
- System.Console.Out.WriteLine(" total milliseconds");
+ Console.Out.Write(end.Millisecond - start.Millisecond);
+ Console.Out.WriteLine(" total milliseconds");
}
- catch (System.Exception e)
+ catch (Exception e)
{
- System.Console.Error.WriteLine(e.StackTrace);
+ Console.Error.WriteLine(e.StackTrace);
}
}
@@ -119,86 +115,127 @@ namespace Lucene.Net.Demo
/* documents, to be indexed.
*/
- private static void IndexDocs(System.IO.DirectoryInfo file, System.IO.DirectoryInfo index, bool create)
+ private static void IndexDocs(IndexWriter writer, DirectoryInfo file, DirectoryInfo index, Operation operation)
{
- if (!create)
- {
- // incrementally update
-
- reader = IndexReader.Open(FSDirectory.Open(index), false); // open existing index
- uidIter = reader.Terms(new Term("uid", "")); // init uid iterator
-
- IndexDocs(file);
-
- if (deleting)
- {
- // delete rest of stale docs
- while (uidIter.Term != null && (System.Object) uidIter.Term.Field == (System.Object) "uid")
- {
- System.Console.Out.WriteLine("deleting " + HTMLDocument.Uid2url(uidIter.Term.Text));
- reader.DeleteDocuments(uidIter.Term);
- uidIter.Next();
- }
- deleting = false;
- }
-
- uidIter.Close(); // close uid iterator
- reader.Close(); // close existing index
- }
- // don't have exisiting
- else
- IndexDocs(file);
+ if (operation == Operation.CompleteReindex)
+ {
+ // Perform a full reindexing.
+ IndexDirectory(writer, null, file, operation);
+ }
+ else
+ {
+ // Perform an incremental reindexing.
+
+ using (var reader = IndexReader.Open(FSDirectory.Open(index), true)) // open existing index
+ using (var uidIter = reader.Terms(new Term("uid", ""))) // init uid iterator
+ {
+ IndexDirectory(writer, uidIter, file, operation);
+
+ if (operation == Operation.RemoveStale) {
+ // Delete remaining, presumed stale, documents. This works since
+ // the above call to IndexDirectory should have positioned the uidIter
+ // after any uids matching existing documents. Any remaining uid
+ // is left over from a document that has been deleted since it was
+ // indexed.
+ while (uidIter.Term != null && uidIter.Term.Field == "uid") {
+ Console.Out.WriteLine("deleting " + HTMLDocument.Uid2url(uidIter.Term.Text));
+ writer.DeleteDocuments(uidIter.Term);
+ uidIter.Next();
+ }
+ }
+ }
+ }
}
- private static void IndexDocs(System.IO.DirectoryInfo file)
+ private static void IndexDirectory(IndexWriter writer, TermEnum uidIter, DirectoryInfo dir, Operation operation) {
+ var entries = Directory.GetFileSystemEntries(dir.FullName);
+
+ // Sort the entries. This is important, the uidIter TermEnum is
+ // iterated in a forward-only fashion, requiring all files to be
+ // passed in ascending order.
+ Array.Sort(entries);
+
+ foreach (var entry in entries) {
+ var path = Path.Combine(dir.FullName, entry);
+ if (Directory.Exists(path)) {
+ IndexDirectory(writer, uidIter, new DirectoryInfo(path), operation);
+ } else if (File.Exists(path)) {
+ IndexFile(writer, uidIter, new FileInfo(path), operation);
+ }
+ }
+ }
+
+ private static void IndexFile(IndexWriter writer, TermEnum uidIter, FileInfo file, Operation operation)
{
- if (System.IO.Directory.Exists(file.FullName))
+ if (file.FullName.EndsWith(".html") || file.FullName.EndsWith(".htm") || file.FullName.EndsWith(".txt"))
{
- // if a directory
- System.String[] files = System.IO.Directory.GetFileSystemEntries(file.FullName); // list its files
- System.Array.Sort(files); // sort the files
- for (int i = 0; i < files.Length; i++)
- // recursively index them
- IndexDocs(new System.IO.DirectoryInfo(System.IO.Path.Combine(file.FullName, files[i])));
- }
- else if (file.FullName.EndsWith(".html") || file.FullName.EndsWith(".htm") || file.FullName.EndsWith(".txt"))
- {
- // index .txt files
+ // We've found a file we should index.
- if (uidIter != null)
+ if (operation == Operation.IncrementalReindex ||
+ operation == Operation.RemoveStale)
{
- System.String uid = HTMLDocument.Uid(file); // construct uid for doc
+ // We should only get here with an open uidIter.
+ Debug.Assert(uidIter != null, "Expected uidIter != null for operation " + operation);
+
+ var uid = HTMLDocument.Uid(file); // construct uid for doc
- while (uidIter.Term != null && (System.Object) uidIter.Term.Field == (System.Object) "uid" && String.CompareOrdinal(uidIter.Term.Text, uid) < 0)
+ while (uidIter.Term != null && uidIter.Term.Field == "uid" && String.CompareOrdinal(uidIter.Term.Text, uid) < 0)
{
- if (deleting)
+ if (operation == Operation.RemoveStale)
{
- // delete stale docs
- System.Console.Out.WriteLine("deleting " + HTMLDocument.Uid2url(uidIter.Term.Text));
- reader.DeleteDocuments(uidIter.Term);
+ Console.Out.WriteLine("deleting " + HTMLDocument.Uid2url(uidIter.Term.Text));
+ writer.DeleteDocuments(uidIter.Term);
}
uidIter.Next();
}
- if (uidIter.Term != null && (System.Object) uidIter.Term.Field == (System.Object) "uid" && String.CompareOrdinal(uidIter.Term.Text, uid) == 0)
+
+ // The uidIter TermEnum should now be pointing at either
+ // 1) a null term, meaning there are no more uids to check.
+ // 2) a term matching the current file.
+ // 3) a term not matching us.
+ if (uidIter.Term != null && uidIter.Term.Field == "uid" && String.CompareOrdinal(uidIter.Term.Text, uid) == 0)
{
- uidIter.Next(); // keep matching docs
+ // uidIter points to the current document, we should move one
+ // step ahead to keep state consistent, and carry on.
+ uidIter.Next();
}
- else if (!deleting)
+ else if (operation == Operation.IncrementalReindex)
{
- // add new docs
- Document doc = HTMLDocument.Document(file);
- System.Console.Out.WriteLine("adding " + doc.Get("path"));
+ // uidIter does not point to the current document, and we're
+ // currently indexing documents.
+ var doc = HTMLDocument.Document(file);
+ Console.Out.WriteLine("adding " + doc.Get("path"));
writer.AddDocument(doc);
}
}
else
{
- // creating a new index
- Document doc = HTMLDocument.Document(file);
- System.Console.Out.WriteLine("adding " + doc.Get("path"));
- writer.AddDocument(doc); // add docs unconditionally
+ // We're doing a complete reindexing. We aren't using uidIter,
+ // but for completeness we assert that it's null (as expected).
+ Debug.Assert(uidIter == null, "Expected uidIter == null for operation == " + operation);
+
+ var doc = HTMLDocument.Document(file);
+ Console.Out.WriteLine("adding " + doc.Get("path"));
+ writer.AddDocument(doc);
}
}
}
+
+ private enum Operation {
+ /// <summary>
+ /// Indicates an incremental indexing.
+ /// </summary>
+ IncrementalReindex,
+
+ /// <summary>
+ /// Indicates that stale entries in the index should be removed.
+ /// </summary>
+ RemoveStale,
+
+ /// <summary>
+ /// Indicates a complete reindexing.
+ /// </summary>
+ CompleteReindex
+ }
}
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/src/demo/SearchFiles/SearchFiles.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/demo/SearchFiles/SearchFiles.cs?rev=1377648&r1=1377647&r2=1377648&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/demo/SearchFiles/SearchFiles.cs (original)
+++ incubator/lucene.net/trunk/src/demo/SearchFiles/SearchFiles.cs Mon Aug 27 12:04:23 2012
@@ -16,38 +16,32 @@
*/
using System;
-
-using Analyzer = Lucene.Net.Analysis.Analyzer;
-using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
-using Document = Lucene.Net.Documents.Document;
-using FilterIndexReader = Lucene.Net.Index.FilterIndexReader;
-using IndexReader = Lucene.Net.Index.IndexReader;
-using QueryParser = Lucene.Net.QueryParsers.QueryParser;
+using System.IO;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Documents;
+using Lucene.Net.QueryParsers;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
using FSDirectory = Lucene.Net.Store.FSDirectory;
using Version = Lucene.Net.Util.Version;
-using Collector = Lucene.Net.Search.Collector;
-using IndexSearcher = Lucene.Net.Search.IndexSearcher;
-using Query = Lucene.Net.Search.Query;
-using ScoreDoc = Lucene.Net.Search.ScoreDoc;
-using Scorer = Lucene.Net.Search.Scorer;
-using Searcher = Lucene.Net.Search.Searcher;
-using TopScoreDocCollector = Lucene.Net.Search.TopScoreDocCollector;
namespace Lucene.Net.Demo
{
/// <summary>Simple command-line based search demo. </summary>
- public class SearchFiles
+ public static class SearchFiles
{
- private class AnonymousClassCollector:Collector
+ private class AnonymousClassCollector : Collector
{
private Scorer scorer;
private int docBase;
// simply print docId and score of every matching document
- public override void Collect(int doc)
+ public override void Collect(int doc)
{
- System.Console.Out.WriteLine("doc=" + doc + docBase + " score=" + scorer.Score());
+ Console.Out.WriteLine("doc=" + doc + docBase + " score=" + scorer.Score());
}
public override bool AcceptsDocsOutOfOrder
@@ -55,7 +49,7 @@ namespace Lucene.Net.Demo
get { return true; }
}
- public override void SetNextReader(IndexReader reader, int docBase)
+ public override void SetNextReader(IndexReader reader, int docBase)
{
this.docBase = docBase;
}
@@ -66,49 +60,46 @@ namespace Lucene.Net.Demo
}
}
- /// <summary>Use the norms from one field for all fields. Norms are read into memory,
+ /// <summary>
+ /// Use the norms from one field for all fields. Norms are read into memory,
/// using a byte of memory per document per searched field. This can cause
/// search of large collections with a large number of fields to run out of
/// memory. If all of the fields contain only a single token, then the norms
/// are all identical, then single norm vector may be shared.
/// </summary>
- private class OneNormsReader:FilterIndexReader
+ private class OneNormsReader : FilterIndexReader
{
- private System.String field;
+ private readonly String field;
- public OneNormsReader(IndexReader in_Renamed, System.String field):base(in_Renamed)
+ public OneNormsReader(IndexReader in_Renamed, String field):base(in_Renamed)
{
this.field = field;
}
- public override byte[] Norms(System.String field)
+ public override byte[] Norms(String field)
{
return in_Renamed.Norms(this.field);
}
}
-
- private SearchFiles()
- {
- }
-
+
/// <summary>Simple command-line based search demo. </summary>
[STAThread]
- public static void Main(System.String[] args)
+ public static void Main(String[] args)
{
- System.String usage = "Usage:\t" + typeof(SearchFiles) + "[-index dir] [-field f] [-repeat n] [-queries file] [-raw] [-norms field] [-paging hitsPerPage]";
+ String usage = "Usage:\t" + typeof(SearchFiles) + "[-index dir] [-field f] [-repeat n] [-queries file] [-raw] [-norms field] [-paging hitsPerPage]";
usage += "\n\tSpecify 'false' for hitsPerPage to use streaming instead of paging search.";
if (args.Length > 0 && ("-h".Equals(args[0]) || "-help".Equals(args[0])))
{
- System.Console.Out.WriteLine(usage);
- System.Environment.Exit(0);
+ Console.Out.WriteLine(usage);
+ Environment.Exit(0);
}
- System.String index = "index";
- System.String field = "contents";
- System.String queries = null;
+ String index = "index";
+ String field = "contents";
+ String queries = null;
int repeat = 0;
bool raw = false;
- System.String normsField = null;
+ String normsField = null;
bool paging = true;
int hitsPerPage = 10;
@@ -131,7 +122,7 @@ namespace Lucene.Net.Demo
}
else if ("-repeat".Equals(args[i]))
{
- repeat = System.Int32.Parse(args[i + 1]);
+ repeat = Int32.Parse(args[i + 1]);
i++;
}
else if ("-raw".Equals(args[i]))
@@ -151,7 +142,7 @@ namespace Lucene.Net.Demo
}
else
{
- hitsPerPage = System.Int32.Parse(args[i + 1]);
+ hitsPerPage = Int32.Parse(args[i + 1]);
if (hitsPerPage == 0)
{
paging = false;
@@ -160,69 +151,82 @@ namespace Lucene.Net.Demo
i++;
}
}
+
+ IndexReader indexReader = null;
+ try
+ {
+ // only searching, so read-only=true
+ indexReader = IndexReader.Open(FSDirectory.Open(new System.IO.DirectoryInfo(index)), true); // only searching, so read-only=true
+
+ if (normsField != null)
+ indexReader = new OneNormsReader(indexReader, normsField);
- IndexReader reader = IndexReader.Open(FSDirectory.Open(new System.IO.DirectoryInfo(index)), true); // only searching, so read-only=true
-
- if (normsField != null)
- reader = new OneNormsReader(reader, normsField);
-
- Searcher searcher = new IndexSearcher(reader);
- Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
+ Searcher searcher = new IndexSearcher(indexReader);
+ Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
- System.IO.StreamReader in_Renamed = null;
- if (queries != null)
- {
- in_Renamed = new System.IO.StreamReader(new System.IO.StreamReader(queries, System.Text.Encoding.Default).BaseStream, new System.IO.StreamReader(queries, System.Text.Encoding.Default).CurrentEncoding);
- }
- else
- {
- in_Renamed = new System.IO.StreamReader(new System.IO.StreamReader(System.Console.OpenStandardInput(), System.Text.Encoding.GetEncoding("UTF-8")).BaseStream, new System.IO.StreamReader(System.Console.OpenStandardInput(), System.Text.Encoding.GetEncoding("UTF-8")).CurrentEncoding);
- }
- QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field, analyzer);
- while (true)
- {
- if (queries == null)
- // prompt the user
- System.Console.Out.WriteLine("Enter query: ");
-
- System.String line = in_Renamed.ReadLine();
-
- if (line == null || line.Length == - 1)
- break;
-
- line = line.Trim();
- if (line.Length == 0)
- break;
-
- Query query = parser.Parse(line);
- System.Console.Out.WriteLine("Searching for: " + query.ToString(field));
-
-
- if (repeat > 0)
- {
- // repeat & time as benchmark
- System.DateTime start = System.DateTime.Now;
- for (int i = 0; i < repeat; i++)
- {
- searcher.Search(query, null, 100);
- }
- System.DateTime end = System.DateTime.Now;
- System.Console.Out.WriteLine("Time: " + (end.Millisecond - start.Millisecond) + "ms");
- }
-
- if (paging)
- {
- DoPagingSearch(in_Renamed, searcher, query, hitsPerPage, raw, queries == null);
- }
- else
- {
- DoStreamingSearch(searcher, query);
- }
- }
- reader.Close();
+ StreamReader queryReader;
+ if (queries != null)
+ {
+ queryReader = new StreamReader(new StreamReader(queries, Encoding.Default).BaseStream, new StreamReader(queries, Encoding.Default).CurrentEncoding);
+ }
+ else
+ {
+ queryReader = new StreamReader(new StreamReader(Console.OpenStandardInput(), Encoding.UTF8).BaseStream, new StreamReader(Console.OpenStandardInput(), Encoding.UTF8).CurrentEncoding);
+ }
+
+ var parser = new QueryParser(Version.LUCENE_30, field, analyzer);
+ while (true)
+ {
+ if (queries == null)
+ // prompt the user
+ Console.Out.WriteLine("Enter query: ");
+
+ String line = queryReader.ReadLine();
+
+ if (line == null || line.Length == - 1)
+ break;
+
+ line = line.Trim();
+ if (line.Length == 0)
+ break;
+
+ Query query = parser.Parse(line);
+ Console.Out.WriteLine("Searching for: " + query.ToString(field));
+
+ if (repeat > 0)
+ {
+ // repeat & time as benchmark
+ DateTime start = DateTime.Now;
+ for (int i = 0; i < repeat; i++)
+ {
+ searcher.Search(query, null, 100);
+ }
+ DateTime end = DateTime.Now;
+ Console.Out.WriteLine("Time: " + (end.Millisecond - start.Millisecond) + "ms");
+ }
+
+ if (paging)
+ {
+ DoPagingSearch(queryReader, searcher, query, hitsPerPage, raw, queries == null);
+ }
+ else
+ {
+ DoStreamingSearch(searcher, query);
+ }
+ }
+ queryReader.Close();
+ }
+ finally
+ {
+ if (indexReader != null)
+ {
+ indexReader.Dispose();
+ }
+ }
}
- /// <summary> This method uses a custom HitCollector implementation which simply prints out
+ /// <summary>
+ /// This method uses a custom HitCollector implementation which simply prints out
/// the docId and score of every matching document.
///
/// This simulates the streaming search use case, where all hits are supposed to
@@ -231,7 +235,6 @@ namespace Lucene.Net.Demo
public static void DoStreamingSearch(Searcher searcher, Query query)
{
Collector streamingHitCollector = new AnonymousClassCollector();
-
searcher.Search(query, streamingHitCollector);
}
@@ -244,28 +247,28 @@ namespace Lucene.Net.Demo
/// is executed another time and all hits are collected.
///
/// </summary>
- public static void DoPagingSearch(System.IO.StreamReader in_Renamed, Searcher searcher, Query query, int hitsPerPage, bool raw, bool interactive)
+ public static void DoPagingSearch(StreamReader input, Searcher searcher, Query query, int hitsPerPage, bool raw, bool interactive)
{
// Collect enough docs to show 5 pages
- TopScoreDocCollector collector = TopScoreDocCollector.Create(5 * hitsPerPage, false);
+ var collector = TopScoreDocCollector.Create(5 * hitsPerPage, false);
searcher.Search(query, collector);
- ScoreDoc[] hits = collector.TopDocs().ScoreDocs;
+ var hits = collector.TopDocs().ScoreDocs;
int numTotalHits = collector.TotalHits;
- System.Console.Out.WriteLine(numTotalHits + " total matching documents");
+ Console.Out.WriteLine(numTotalHits + " total matching documents");
int start = 0;
- int end = System.Math.Min(numTotalHits, hitsPerPage);
+ int end = Math.Min(numTotalHits, hitsPerPage);
while (true)
{
if (end > hits.Length)
{
- System.Console.Out.WriteLine("Only results 1 - " + hits.Length + " of " + numTotalHits + " total matching documents collected.");
- System.Console.Out.WriteLine("Collect more (y/n) ?");
- System.String line = in_Renamed.ReadLine();
- if (line.Length == 0 || line[0] == 'n')
+ Console.Out.WriteLine("Only results 1 - " + hits.Length + " of " + numTotalHits + " total matching documents collected.");
+ Console.Out.WriteLine("Collect more (y/n) ?");
+ String line = input.ReadLine();
+ if (String.IsNullOrEmpty(line) || line[0] == 'n')
{
break;
}
@@ -275,31 +278,31 @@ namespace Lucene.Net.Demo
hits = collector.TopDocs().ScoreDocs;
}
- end = System.Math.Min(hits.Length, start + hitsPerPage);
+ end = Math.Min(hits.Length, start + hitsPerPage);
for (int i = start; i < end; i++)
{
if (raw)
{
// output raw format
- System.Console.Out.WriteLine("doc=" + hits[i].Doc + " score=" + hits[i].Score);
+ Console.Out.WriteLine("doc=" + hits[i].Doc + " score=" + hits[i].Score);
continue;
}
Document doc = searcher.Doc(hits[i].Doc);
- System.String path = doc.Get("path");
+ String path = doc.Get("path");
if (path != null)
{
- System.Console.Out.WriteLine((i + 1) + ". " + path);
- System.String title = doc.Get("title");
+ Console.Out.WriteLine((i + 1) + ". " + path);
+ String title = doc.Get("title");
if (title != null)
{
- System.Console.Out.WriteLine(" Title: " + doc.Get("title"));
+ Console.Out.WriteLine(" Title: " + doc.Get("title"));
}
}
else
{
- System.Console.Out.WriteLine((i + 1) + ". " + "No path for this document");
+ Console.Out.WriteLine((i + 1) + ". " + "No path for this document");
}
}
@@ -313,26 +316,26 @@ namespace Lucene.Net.Demo
bool quit = false;
while (true)
{
- System.Console.Out.Write("Press ");
+ Console.Out.Write("Press ");
if (start - hitsPerPage >= 0)
{
- System.Console.Out.Write("(p)revious page, ");
+ Console.Out.Write("(p)revious page, ");
}
if (start + hitsPerPage < numTotalHits)
{
- System.Console.Out.Write("(n)ext page, ");
+ Console.Out.Write("(n)ext page, ");
}
- System.Console.Out.WriteLine("(q)uit or enter number to jump to a page.");
+ Console.Out.WriteLine("(q)uit or enter number to jump to a page.");
- System.String line = in_Renamed.ReadLine();
- if (line.Length == 0 || line[0] == 'q')
+ String line = input.ReadLine();
+ if (String.IsNullOrEmpty(line) || line[0] == 'q')
{
quit = true;
break;
}
if (line[0] == 'p')
{
- start = System.Math.Max(0, start - hitsPerPage);
+ start = Math.Max(0, start - hitsPerPage);
break;
}
else if (line[0] == 'n')
@@ -345,7 +348,7 @@ namespace Lucene.Net.Demo
}
else
{
- int page = System.Int32.Parse(line);
+ int page = Int32.Parse(line);
if ((page - 1) * hitsPerPage < numTotalHits)
{
start = (page - 1) * hitsPerPage;
@@ -353,13 +356,13 @@ namespace Lucene.Net.Demo
}
else
{
- System.Console.Out.WriteLine("No such page");
+ Console.Out.WriteLine("No such page");
}
}
}
if (quit)
break;
- end = System.Math.Min(numTotalHits, start + hitsPerPage);
+ end = Math.Min(numTotalHits, start + hitsPerPage);
}
}
}