You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by eh...@apache.org on 2004/01/19 15:58:33 UTC
cvs commit: jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant ConfigurableDocumentHandler.java DocumentHandler.java DocumentHandlerException.java FileExtensionDocumentHandler.java HtmlDocument.java IndexTask.java TextDocument.java
ehatcher 2004/01/19 06:58:33
Modified: contributions/ant build.xml
contributions/ant/src/main/org/apache/lucene/ant
DocumentHandler.java DocumentHandlerException.java
FileExtensionDocumentHandler.java HtmlDocument.java
IndexTask.java TextDocument.java
Added: contributions/ant/src/main/org/apache/lucene/ant
ConfigurableDocumentHandler.java
Log:
upgrade index task to some ancient enhancements i had locally
Revision Changes Path
1.4 +23 -1 jakarta-lucene-sandbox/contributions/ant/build.xml
Index: build.xml
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/ant/build.xml,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- build.xml 5 Jan 2004 15:54:53 -0000 1.3
+++ build.xml 19 Jan 2004 14:58:33 -0000 1.4
@@ -1,5 +1,4 @@
<?xml version="1.0"?>
-
<project name="lucene-ant" default="default">
<description>
@@ -18,5 +17,28 @@
<property name="src.dir" location="src/main"/>
+ <!-- alias classpath for cleaner example in index target -->
+ <path id="index.classpath">
+ <path refid="test.classpath"/>
+ </path>
+
<import file="../common.xml"/>
+
+ <property name="index.dir" location="${test.output.dir}/index"/>
+ <property name="files.dir" location="${test.src.dir}"/>
+
+ <target name="index" depends="compile">
+ <taskdef name="index"
+ classname="org.apache.lucene.ant.IndexTask"
+ classpathref="index.classpath"
+ />
+<!-- <typedef file="src/main/org/apache/lucene/ant/antlib.xml"
+ uri="lucene:/org/apache/lucene/ant"
+ classpathref="index.classpath"/> -->
+
+ <index index="${index.dir}">
+ <fileset dir="${files.dir}"/>
+ </index>
+ </target>
+
</project>
1.2 +4 -2 jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/DocumentHandler.java
Index: DocumentHandler.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/DocumentHandler.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- DocumentHandler.java 11 Jul 2002 01:12:30 -0000 1.1
+++ DocumentHandler.java 19 Jan 2004 14:58:33 -0000 1.2
@@ -1,8 +1,9 @@
package org.apache.lucene.ant;
-import java.io.File;
import org.apache.lucene.document.Document;
+import java.io.File;
+
/**
* Allows a class to act as a Lucene document handler
*
@@ -10,6 +11,7 @@
*@created October 27, 2001
*/
public interface DocumentHandler {
+
/**
* Gets the document attribute of the DocumentHandler object
*
@@ -18,6 +20,6 @@
*@throws DocumentHandlerException
*/
public Document getDocument(File file)
- throws DocumentHandlerException;
+ throws DocumentHandlerException;
}
1.2 +19 -9 jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/DocumentHandlerException.java
Index: DocumentHandlerException.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/DocumentHandlerException.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- DocumentHandlerException.java 11 Jul 2002 01:12:30 -0000 1.1
+++ DocumentHandlerException.java 19 Jan 2004 14:58:33 -0000 1.2
@@ -5,32 +5,42 @@
/**
*/
-public class DocumentHandlerException extends Exception
-{
+public class DocumentHandlerException extends Exception {
private Throwable cause;
-
+
+ /**
+ * Default constructor.
+ */
public DocumentHandlerException() {
super();
}
-
+
+ /**
+ * Constructs with message.
+ */
public DocumentHandlerException(String message) {
super(message);
}
-
+
+ /**
+ * Constructs with chained exception.
+ */
public DocumentHandlerException(Throwable cause) {
super(cause.toString());
this.cause = cause;
}
-
+
+ /**
+ * Retrieves nested exception.
+ */
public Throwable getException() {
return cause;
}
- // Override stack trace methods to show original cause:
public void printStackTrace() {
printStackTrace(System.err);
}
-
+
public void printStackTrace(PrintStream ps) {
synchronized (ps) {
super.printStackTrace(ps);
@@ -40,7 +50,7 @@
}
}
}
-
+
public void printStackTrace(PrintWriter pw) {
synchronized (pw) {
super.printStackTrace(pw);
1.2 +9 -9 jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/FileExtensionDocumentHandler.java
Index: FileExtensionDocumentHandler.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/FileExtensionDocumentHandler.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- FileExtensionDocumentHandler.java 11 Jul 2002 01:12:30 -0000 1.1
+++ FileExtensionDocumentHandler.java 19 Jan 2004 14:58:33 -0000 1.2
@@ -1,19 +1,20 @@
package org.apache.lucene.ant;
-import java.io.File;
import org.apache.lucene.document.Document;
+import java.io.File;
+
/**
- * Decides which class used to create the Lucene Document
- * object based on its file extension.
+ * A DocumentHandler implementation that delegates document creation
+ * based on a file's extension. Currently only .html and .txt
+ * files are handled; other extensions are ignored.
*
*@author Erik Hatcher
*@created October 28, 2001
- *@todo Add dynamic file extension/classname mappings for
- * extensibility
+ *@todo Implement dynamic document type lookup
*/
public class FileExtensionDocumentHandler
- implements DocumentHandler {
+ implements DocumentHandler {
/**
* Gets the document attribute of the
* FileExtensionDocumentHandler object
@@ -25,7 +26,7 @@
* Exception
*/
public Document getDocument(File file)
- throws DocumentHandlerException {
+ throws DocumentHandlerException {
Document doc = null;
String name = file.getName();
@@ -38,8 +39,7 @@
if (name.endsWith(".html")) {
doc = HtmlDocument.Document(file);
}
- }
- catch (java.io.IOException e) {
+ } catch (java.io.IOException e) {
throw new DocumentHandlerException(e);
}
1.2 +13 -18 jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/HtmlDocument.java
Index: HtmlDocument.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/HtmlDocument.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- HtmlDocument.java 11 Jul 2002 01:12:30 -0000 1.1
+++ HtmlDocument.java 19 Jan 2004 14:58:33 -0000 1.2
@@ -1,5 +1,12 @@
package org.apache.lucene.ant;
+import org.apache.lucene.document.Field;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
+import org.w3c.tidy.Tidy;
+
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
@@ -8,18 +15,6 @@
import java.io.InputStream;
import java.io.StringWriter;
-// Imports commented out since there is a name clash and fully
-// qualified class names will be used in the code. Imports are
-// left for ease of maintenance.
-import org.apache.lucene.document.Field;
-//import org.apache.lucene.document.Document;
-//import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-import org.w3c.dom.Text;
-import org.w3c.tidy.Tidy;
-
/**
* The <code>HtmlDocument</code> class creates a Lucene {@link
* org.apache.lucene.document.Document} from an HTML document. <P>
@@ -51,8 +46,8 @@
Tidy tidy = new Tidy();
tidy.setQuiet(true);
tidy.setShowWarnings(false);
- org.w3c.dom.Document root =
- tidy.parseDOM(new FileInputStream(file), null);
+ org.w3c.dom.Document root =
+ tidy.parseDOM(new FileInputStream(file), null);
rawDoc = root.getDocumentElement();
}
@@ -84,7 +79,7 @@
*@exception IOException
*/
public static org.apache.lucene.document.Document
- getDocument(InputStream is) throws IOException {
+ getDocument(InputStream is) throws IOException {
HtmlDocument htmlDoc = new HtmlDocument(is);
org.apache.lucene.document.Document luceneDoc =
new org.apache.lucene.document.Document();
@@ -109,7 +104,7 @@
*@exception IOException
*/
public static org.apache.lucene.document.Document
- Document(File file) throws IOException {
+ Document(File file) throws IOException {
HtmlDocument htmlDoc = new HtmlDocument(file);
org.apache.lucene.document.Document luceneDoc =
new org.apache.lucene.document.Document();
@@ -119,7 +114,7 @@
String contents = null;
BufferedReader br =
- new BufferedReader(new FileReader(file));
+ new BufferedReader(new FileReader(file));
StringWriter sw = new StringWriter();
String line = br.readLine();
while (line != null) {
@@ -153,7 +148,7 @@
// System.out.println("Body = " + doc.getBody());
HtmlDocument doc =
- new HtmlDocument(new FileInputStream(new File(args[0])));
+ new HtmlDocument(new FileInputStream(new File(args[0])));
System.out.println("Title = " + doc.getTitle());
System.out.println("Body = " + doc.getBody());
}
1.3 +331 -231 jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/IndexTask.java
Index: IndexTask.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/IndexTask.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- IndexTask.java 5 Jan 2004 15:45:55 -0000 1.2
+++ IndexTask.java 19 Jan 2004 14:58:33 -0000 1.3
@@ -1,12 +1,11 @@
package org.apache.lucene.ant;
-import java.io.File;
-import java.io.IOException;
-import java.util.Date;
-import java.util.Vector;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.de.GermanAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -16,271 +15,372 @@
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
-
import org.apache.tools.ant.BuildException;
import org.apache.tools.ant.DirectoryScanner;
+import org.apache.tools.ant.DynamicConfigurator;
import org.apache.tools.ant.Project;
import org.apache.tools.ant.Task;
import org.apache.tools.ant.types.FileSet;
+import org.apache.tools.ant.types.EnumeratedAttribute;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Date;
+import java.util.Properties;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.ArrayList;
/**
- * Builds a Lucene index from a fileset.
+ * Ant task to index files with Lucene
*
- * @author Erik Hatcher
+ *@author Erik Hatcher
*/
public class IndexTask extends Task {
- /**
- * file list
- */
- private Vector filesets = new Vector();
+ /**
+ * file list
+ */
+ private ArrayList filesets = new ArrayList();
+
+ /**
+ * overwrite index?
+ */
+ private boolean overwrite = false;
+
+ /**
+ * index path
+ */
+ private File indexDir;
+
+ /**
+ * document handler classname
+ */
+ private String handlerClassName =
+ FileExtensionDocumentHandler.class.getName();
+
+ /**
+ * document handler instance
+ */
+ private DocumentHandler handler;
+
+
+ /**
+ * analyzer classname
+ */
+ private String analyzerClassName =
+ StandardAnalyzer.class.getName();
+
+ /**
+ * analyzer instance
+ */
+ private Analyzer analyzer;
+
+ /**
+ * Lucene merge factor
+ */
+ private int mergeFactor = 20;
+
+ private HandlerConfig handlerConfig;
+
+
+ /**
+ * Creates new instance
+ */
+ public IndexTask() {
+ }
+
+
+ /**
+ * Specifies the directory where the index will be stored
+ */
+ public void setIndex(File indexDir) {
+ this.indexDir = indexDir;
+ }
+
+
+ /**
+ * Sets the mergeFactor attribute of the IndexTask object
+ *
+ *@param mergeFactor The new mergeFactor value
+ */
+ public void setMergeFactor(int mergeFactor) {
+ this.mergeFactor = mergeFactor;
+ }
+
+
+ /**
+ * Sets the overwrite attribute of the IndexTask object
+ *
+ *@param overwrite The new overwrite value
+ */
+ public void setOverwrite(boolean overwrite) {
+ this.overwrite = overwrite;
+ }
+
+
+ /**
+ * Sets the documentHandler attribute of the IndexTask object
+ *
+ *@param classname The new documentHandler value
+ */
+ public void setDocumentHandler(String classname) {
+ handlerClassName = classname;
+ }
+
+ /**
+ * Sets the analyzer based on the builtin Lucene analyzer types.
+ *
+ * @todo Enforce analyzer and analyzerClassName to be mutually exclusive
+ */
+ public void setAnalyzer(AnalyzerType type) {
+ analyzerClassName = type.getClassname();
+ }
+
+ public void setAnalyzerClassName(String classname) {
+ analyzerClassName = classname;
+ }
+
+ /**
+ * Adds a set of files (nested fileset attribute).
+ *
+ *@param set FileSet to be added
+ */
+ public void addFileset(FileSet set) {
+ filesets.add(set);
+ }
+
+ /**
+ * Sets custom properties for a configurable document handler.
+ */
+ public void addConfig(HandlerConfig config) throws BuildException {
+ if (handlerConfig != null) {
+ throw new BuildException("Only one config element allowed");
+ }
- /**
- * overwrite index?
- */
- private boolean overwrite = false;
+ handlerConfig = config;
+ }
- /**
- * index path
- */
- private File indexPath;
- /**
- * document handler classname
- */
- private String handlerClassName =
- "org.apache.lucene.ant.FileExtensionDocumentHandler";
-
- /**
- * document handler instance
- */
- private DocumentHandler handler;
-
- /**
- * Lucene merge factor
- */
- private int mergeFactor = 20;
+ /**
+ * Begins the indexing
+ *
+ *@exception BuildException If an error occurs indexing the
+ * fileset
+ */
+ public void execute() throws BuildException {
+
+ // construct handler and analyzer dynamically
+ try {
+ Class clazz = Class.forName(handlerClassName);
+ handler = (DocumentHandler) clazz.newInstance();
+
+ clazz = Class.forName(analyzerClassName);
+ analyzer = (Analyzer) clazz.newInstance();
+ } catch (ClassNotFoundException cnfe) {
+ throw new BuildException(cnfe);
+ } catch (InstantiationException ie) {
+ throw new BuildException(ie);
+ } catch (IllegalAccessException iae) {
+ throw new BuildException(iae);
+ }
+ log("Document handler = " + handler.getClass(), Project.MSG_VERBOSE);
+ log("Analyzer = " + analyzer.getClass(), Project.MSG_VERBOSE);
- /**
- * Specifies the directory where the index will be stored
- *
- * @param indexPath The new index value
- */
- public void setIndex(File indexPath) {
- this.indexPath = indexPath;
+ if (handler instanceof ConfigurableDocumentHandler) {
+ ((ConfigurableDocumentHandler) handler).configure(handlerConfig.getProperties());
}
- /**
- * Sets the mergeFactor attribute of the IndexTask object
- *
- *@param mergeFactor The new mergeFactor value
- */
- public void setMergeFactor(int mergeFactor) {
- this.mergeFactor = mergeFactor;
+ try {
+ indexDocs();
+ } catch (IOException e) {
+ throw new BuildException(e);
}
+ }
- /**
- * If true, index will be overwritten.
- *
- * @param overwrite The new overwrite value
- */
- public void setOverwrite(boolean overwrite) {
- this.overwrite = overwrite;
+ /**
+ * Index the fileset.
+ *
+ *@exception IOException if Lucene I/O exception
+ *@todo refactor!!!!!
+ */
+ private void indexDocs() throws IOException {
+ Date start = new Date();
+
+ boolean create = overwrite;
+ // If the index directory doesn't exist,
+ // create it and force create mode
+ if (indexDir.mkdirs() && !overwrite) {
+ create = true;
}
-
- /**
- * Classname of document handler.
- *
- * @param classname The new documentHandler value
- */
- public void setDocumentHandler(String classname) {
- handlerClassName = classname;
+ Searcher searcher = null;
+ boolean checkLastModified = false;
+ if (!create) {
+ try {
+ searcher = new IndexSearcher(indexDir.getAbsolutePath());
+ checkLastModified = true;
+ } catch (IOException ioe) {
+ log("IOException: " + ioe.getMessage());
+ // Empty - ignore, which indicates to index all
+ // documents
+ }
}
+ log("checkLastModified = " + checkLastModified, Project.MSG_VERBOSE);
- /**
- * Adds a set of files.
- *
- * @param set FileSet to be added
- */
- public void addFileset(FileSet set) {
- filesets.addElement(set);
- }
+ IndexWriter writer =
+ new IndexWriter(indexDir, analyzer, create);
+ int totalFiles = 0;
+ int totalIndexed = 0;
+ int totalIgnored = 0;
+ try {
+ writer.mergeFactor = mergeFactor;
+
+ for (int i = 0; i < filesets.size(); i++) {
+ FileSet fs = (FileSet) filesets.get(i);
+ if (fs != null) {
+ DirectoryScanner ds =
+ fs.getDirectoryScanner(getProject());
+ String[] dsfiles = ds.getIncludedFiles();
+ File baseDir = ds.getBasedir();
+
+ for (int j = 0; j < dsfiles.length; j++) {
+ File file = new File(baseDir, dsfiles[j]);
+ totalFiles++;
+
+ if (!file.exists() || !file.canRead()) {
+ throw new BuildException("File \"" +
+ file.getAbsolutePath()
+ + "\" does not exist or is not readable.");
+ }
+ boolean indexIt = true;
- /**
- * Begins the indexing
- *
- * @exception BuildException If an error occurs indexing the
- * fileset
- * @todo add classpath handling so handler does not
- * have to be in system classpath
- */
- public void execute() throws BuildException {
- try {
- Class clazz = Class.forName(handlerClassName);
- handler = (DocumentHandler) clazz.newInstance();
- }
- catch (ClassNotFoundException cnfe) {
- throw new BuildException(cnfe);
- }
- catch (InstantiationException ie) {
- throw new BuildException(ie);
- }
- catch (IllegalAccessException iae) {
- throw new BuildException(iae);
- }
+ if (checkLastModified) {
+ Hits hits = null;
+ Term pathTerm =
+ new Term("path", file.getPath());
+ TermQuery query =
+ new TermQuery(pathTerm);
+ hits = searcher.search(query);
+
+ // if document is found, compare the
+ // indexed last modified time with the
+ // current file
+ // - don't index if up to date
+ if (hits.length() > 0) {
+ Document doc = hits.doc(0);
+ String indexModified =
+ doc.get("modified").trim();
+ if (indexModified != null) {
+ if (DateField.stringToTime(indexModified)
+ == file.lastModified()) {
+ indexIt = false;
+ }
+ }
+ }
+ }
- try {
- indexDocs();
- }
- catch (IOException e) {
- throw new BuildException(e);
+ if (indexIt) {
+ try {
+ log("Indexing " + file.getPath(),
+ Project.MSG_VERBOSE);
+ Document doc =
+ handler.getDocument(file);
+
+ if (doc == null) {
+ totalIgnored++;
+ } else {
+ // Add the path of the file as a field named "path". Use a Keyword field, so
+ // that the index stores the path, and so that the path is searchable
+ doc.add(Field.Keyword("path", file.getPath()));
+
+ // Add the last modified date of the file as a field named "modified". Use a
+ // Keyword field, so that it's searchable, but so that no attempt is made
+ // to tokenize the field into words.
+ doc.add(Field.Keyword("modified",
+ DateField.timeToString(file.lastModified())));
+
+ writer.addDocument(doc);
+ totalIndexed++;
+ }
+ } catch (DocumentHandlerException e) {
+ throw new BuildException(e);
+ }
+ }
+ }
+ // for j
}
+ // if (fs != null)
+ }
+ // for i
+
+ writer.optimize();
+ }
+ //try
+ finally {
+ // always make sure everything gets closed,
+ // no matter how we exit.
+ writer.close();
+ if (searcher != null) {
+ searcher.close();
+ }
}
+ Date end = new Date();
- /**
- * index the fileset
- *
- * @exception IOException Description of Exception
- * @todo refactor - definitely lots of room for improvement here
- */
- private void indexDocs() throws IOException {
- Date start = new Date();
+ log(totalIndexed + " out of " + totalFiles + " indexed (" +
+ totalIgnored + " ignored) in " + (end.getTime() - start.getTime()) +
+ " milliseconds");
+ }
- boolean create = overwrite;
- // If the index directory doesn't exist,
- // create it and force create mode
- if (indexPath.mkdirs() && !overwrite) {
- create = true;
- }
+ public static class HandlerConfig implements DynamicConfigurator {
+ Properties props = new Properties();
- Searcher searcher = null;
- Analyzer analyzer = new StopAnalyzer();
- boolean checkLastModified = false;
- if (!create) {
- try {
- searcher = new IndexSearcher(indexPath.getAbsolutePath());
- checkLastModified = true;
- }
- catch (IOException ioe) {
- log("IOException: " + ioe.getMessage());
- // Empty - ignore, which indicates to index all
- // documents
- }
- }
+ public void setDynamicAttribute(String attributeName, String value) throws BuildException {
+ props.setProperty(attributeName, value);
+ }
- log("checkLastModified = " + checkLastModified);
+ public Object createDynamicElement(String elementName) throws BuildException {
+ throw new BuildException("Sub elements not supported");
+ }
- IndexWriter writer =
- new IndexWriter(indexPath, analyzer, create);
- int totalFiles = 0;
- int totalIndexed = 0;
- int totalIgnored = 0;
- try {
- writer.mergeFactor = mergeFactor;
-
- for (int i = 0; i < filesets.size(); i++) {
- FileSet fs = (FileSet) filesets.elementAt(i);
- if (fs != null) {
- DirectoryScanner ds =
- fs.getDirectoryScanner(getProject());
- String[] dsfiles = ds.getIncludedFiles();
- File baseDir = ds.getBasedir();
-
- for (int j = 0; j < dsfiles.length; j++) {
- File file = new File(baseDir, dsfiles[j]);
- totalFiles++;
-
- if (!file.exists() || !file.canRead()) {
- throw new BuildException("File \"" +
- file.getAbsolutePath()
- + "\" does not exist or is not readable.");
- }
-
- boolean indexIt = true;
-
- if (checkLastModified) {
- Hits hits = null;
- Term pathTerm =
- new Term("path", file.getPath());
- TermQuery query =
- new TermQuery(pathTerm);
- hits = searcher.search(query);
-
- // if document is found, compare the
- // indexed last modified time with the
- // current file
- // - don't index if up to date
- if (hits.length() > 0) {
- Document doc = hits.doc(0);
- String indexModified =
- doc.get("modified");
- if (indexModified != null) {
- if (DateField.stringToTime(indexModified)
- == file.lastModified()) {
- indexIt = false;
- }
- }
- }
- }
-
- if (indexIt) {
- try {
- log("Indexing " + file.getPath(),
- Project.MSG_VERBOSE);
- Document doc =
- handler.getDocument(file);
-
- if (doc == null) {
- totalIgnored++;
- }
- else {
- // Add the path of the file as a field named "path". Use a Text field, so
- // that the index stores the path, and so that the path is searchable
- doc.add(Field.Keyword("path", file.getPath()));
-
- // Add the last modified date of the file a field named "modified". Use a
- // Keyword field, so that it's searchable, but so that no attempt is made
- // to tokenize the field into words.
- doc.add(Field.Keyword("modified",
- DateField.timeToString(file.lastModified())));
-
- writer.addDocument(doc);
- totalIndexed++;
- }
- }
- catch (DocumentHandlerException e) {
- throw new BuildException(e);
- }
- }
- }
- // for j
- }
- // if (fs != null)
- }
- // for i
+ public Properties getProperties() {
+ return props;
+ }
+ }
- writer.optimize();
- }
- //try
- finally {
- // always make sure everything gets closed,
- // no matter how we exit.
- writer.close();
- if (searcher != null) {
- searcher.close();
- }
- }
+ /**
+ * @todo - the RussianAnalyzer requires a constructor argument
+ * so it's being removed from here until a mechanism
+ * is developed to pass ctor info somehow
+ */
+ public static class AnalyzerType extends EnumeratedAttribute {
+ private static Map analyzerLookup = new HashMap();
+
+ static {
+ analyzerLookup.put("simple", SimpleAnalyzer.class.getName());
+ analyzerLookup.put("standard", StandardAnalyzer.class.getName());
+ analyzerLookup.put("stop", StopAnalyzer.class.getName());
+ analyzerLookup.put("whitespace", WhitespaceAnalyzer.class.getName());
+ analyzerLookup.put("german", GermanAnalyzer.class.getName());
+// analyzerLookup.put("russian", RussianAnalyzer.class.getName());
+ }
- Date end = new Date();
+ /**
+ * @see EnumeratedAttribute#getValues
+ */
+ public String[] getValues() {
+ Set keys = analyzerLookup.keySet();
+ return (String[]) keys.toArray(new String[0]);
+ }
- log(totalIndexed + " out of " + totalFiles + " indexed (" +
- totalIgnored + " ignored) in " + (end.getTime() - start.getTime()) +
- " milliseconds");
+ public String getClassname() {
+ return (String) analyzerLookup.get(getValue());
}
+ }
}
1.2 +6 -5 jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/TextDocument.java
Index: TextDocument.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/TextDocument.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- TextDocument.java 11 Jul 2002 01:12:30 -0000 1.1
+++ TextDocument.java 19 Jan 2004 14:58:33 -0000 1.2
@@ -1,14 +1,14 @@
package org.apache.lucene.ant;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringWriter;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-
/**
* A utility for making Lucene Documents from a File.
*
@@ -62,9 +62,10 @@
// make a new, empty document
Document doc = new Document();
+ doc.add(Field.Text("title", f.getName()));
doc.add(Field.Text("contents", textDoc.getContents()));
- doc.add(Field.UnIndexed("rawcontents",
- textDoc.getContents()));
+ doc.add(Field.UnIndexed("rawcontents",
+ textDoc.getContents()));
// return the document
return doc;
1.1 jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/ConfigurableDocumentHandler.java
Index: ConfigurableDocumentHandler.java
===================================================================
package org.apache.lucene.ant;
import java.util.Properties;
/**
 * A {@link DocumentHandler} that can additionally be handed a set of
 * configuration properties.
 *
 * <p>{@code IndexTask.execute()} checks the handler it instantiates
 * against this interface and, on a match, passes it the properties
 * held by the task's nested config element before indexing starts.
 */
public interface ConfigurableDocumentHandler extends DocumentHandler {

    /**
     * Supplies the configuration properties to this handler.
     *
     * @param props the attribute name/value pairs collected for the handler
     */
    public void configure(Properties props);
}
---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org