You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lenya.apache.org by so...@apache.org on 2007/09/11 23:39:44 UTC
svn commit: r574702 [6/6] - in /lenya/branches/revolution/1.3.x: ./ lib/
src/java/org/apache/lenya/ac/ src/java/org/apache/lenya/ac/file/
src/java/org/apache/lenya/ac/impl/ src/java/org/apache/lenya/cms/ac/
src/java/org/apache/lenya/cms/ac/cocoon/ src/...
Modified: lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/SearchFiles.java
URL: http://svn.apache.org/viewvc/lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/SearchFiles.java?rev=574702&r1=574701&r2=574702&view=diff
==============================================================================
--- lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/SearchFiles.java (original)
+++ lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/SearchFiles.java Tue Sep 11 14:39:37 2007
@@ -14,15 +14,12 @@
* limitations under the License.
*
*/
-
/* $Id$ */
-
package org.apache.lenya.lucene;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStreamReader;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
@@ -36,98 +33,78 @@
* Command Line Interface
*/
class SearchFiles {
-
/**
* main method
- *
- * @param args Directory of the index
+ *
+ * @param args
+ * Directory of the index
*/
public static void main(String[] args) {
if (args.length == 0) {
System.err.println("Usage: org.apache.lenya.lucene.SearchFiles \"directory_where_index_is_located\" <word>");
return;
}
-
File index_directory = new File(args[0]);
-
if (!index_directory.exists()) {
- System.err.println("Exception: No such directory: " +
- index_directory.getAbsolutePath());
-
+ System.err.println("Exception: No such directory: " + index_directory.getAbsolutePath());
return;
}
-
-
try {
if (args.length > 1) {
Hits hits = new SearchFiles().search(args[1], index_directory);
return;
}
-
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
-
while (true) {
System.out.print("Search: ");
-
String line = in.readLine();
-
if (line.length() == -1) {
break;
}
-
- Hits hits = new SearchFiles().search(line, index_directory);
-
- System.out.print("\nAnother Search (y/n) ? ");
- line = in.readLine();
-
- if ((line.length() == 0) || (line.charAt(0) == 'n')) {
- break;
- }
+ Hits hits = new SearchFiles().search(line, index_directory);
+ System.out.print("\nAnother Search (y/n) ? ");
+ line = in.readLine();
+ if ((line.length() == 0) || (line.charAt(0) == 'n')) {
+ break;
+ }
}
-
} catch (Exception e) {
System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
}
}
-
/**
- *
+ *
*/
public Hits search(String line, File index_directory) throws Exception {
Searcher searcher = new IndexSearcher(index_directory.getAbsolutePath());
Analyzer analyzer = new StandardAnalyzer();
-
- Query query = QueryParser.parse(line, "contents", analyzer);
+ // UPGRADE Lucene 1.3 -> 2.2
+ // Query query = QueryParser.parse(line, "contents", analyzer);
+ QueryParser qp = new QueryParser("contents", analyzer);
+ Query query = qp.parse(line);
System.out.println("Searching for: " + query.toString("contents"));
-
- Hits hits = searcher.search(query);
- System.out.println("Total matching documents: " + hits.length());
-
- final int HITS_PER_PAGE = 10;
-
- for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
- int end = Math.min(hits.length(), start + HITS_PER_PAGE);
-
- for (int i = start; i < end; i++) {
- Document doc = hits.doc(i);
- String path = doc.get("path");
-
- if (path != null) {
- System.out.println(i + ". " + path);
- } else {
- String url = doc.get("url");
-
- if (url != null) {
- System.out.println(i + ". " + url);
- System.out.println(" - " + doc.get("title"));
- } else {
- System.out.println(i + ". " + "No path nor URL for this document");
- }
- }
+ Hits hits = searcher.search(query);
+ System.out.println("Total matching documents: " + hits.length());
+ final int HITS_PER_PAGE = 10;
+ for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
+ int end = Math.min(hits.length(), start + HITS_PER_PAGE);
+ for (int i = start; i < end; i++) {
+ Document doc = hits.doc(i);
+ String path = doc.get("path");
+ if (path != null) {
+ System.out.println(i + ". " + path);
+ } else {
+ String url = doc.get("url");
+ if (url != null) {
+ System.out.println(i + ". " + url);
+ System.out.println(" - " + doc.get("title"));
+ } else {
+ System.out.println(i + ". " + "No path nor URL for this document");
}
-
}
- searcher.close();
+ }
+ }
+ searcher.close();
return hits;
}
}
Modified: lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/html/HtmlDocument.java
URL: http://svn.apache.org/viewvc/lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/html/HtmlDocument.java?rev=574702&r1=574701&r2=574702&view=diff
==============================================================================
--- lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/html/HtmlDocument.java (original)
+++ lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/html/HtmlDocument.java Tue Sep 11 14:39:37 2007
@@ -14,14 +14,11 @@
* limitations under the License.
*
*/
-
/* $Id$ */
-
package org.apache.lenya.lucene.html;
-
// Imports commented out since there is a name clash and fully
-// qualified class names will be used in the code. Imports are
+// qualified class names will be used in the code. Imports are
// left for ease of maintenance.
import java.io.BufferedReader;
import java.io.File;
@@ -30,7 +27,6 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
-
import org.apache.lucene.document.Field;
import org.w3c.dom.Attr;
import org.w3c.dom.Element;
@@ -39,228 +35,194 @@
import org.w3c.dom.Text;
import org.w3c.tidy.Tidy;
-
/**
- * The <code>HtmlDocument</code> class creates a Lucene {@link org.apache.lucene.document.Document}
- * from an HTML document.
- *
+ * The <code>HtmlDocument</code> class creates a Lucene
+ * {@link org.apache.lucene.document.Document} from an HTML document.
+ *
* <P>
- * It does this by using JTidy package. It can take input input from {@link java.io.File} or {@link
- * java.io.InputStream}.
+ * It does this by using the JTidy package. It can take input from
+ * {@link java.io.File} or {@link java.io.InputStream}.
* </p>
*/
public class HtmlDocument {
private Element rawDoc;
private String luceneTagName = null;
private String luceneClassValue = null;
-
/**
* Constructs an <code>HtmlDocument</code> from a {@link java.io.File}.
- *
- * @param file the <code>File</code> containing the HTML to parse
- * @exception IOException if an I/O exception occurs
+ *
+ * @param file
+ * the <code>File</code> containing the HTML to parse
+ * @exception IOException
+ * if an I/O exception occurs
*/
public HtmlDocument(File file) throws IOException {
Tidy tidy = new Tidy();
tidy.setQuiet(true);
tidy.setShowWarnings(false);
-
org.w3c.dom.Document root = tidy.parseDOM(new FileInputStream(file), null);
rawDoc = root.getDocumentElement();
}
-
/**
- * Constructs an <code>HtmlDocument</code> from an {@link java.io.InputStream}.
- *
- * @param is the <code>InputStream</code> containing the HTML
- * @exception IOException if I/O exception occurs
+ * Constructs an <code>HtmlDocument</code> from an
+ * {@link java.io.InputStream}.
+ *
+ * @param is
+ * the <code>InputStream</code> containing the HTML
+ * @exception IOException
+ * if I/O exception occurs
*/
public HtmlDocument(InputStream is) throws IOException {
Tidy tidy = new Tidy();
tidy.setQuiet(true);
tidy.setShowWarnings(false);
-
org.w3c.dom.Document root = tidy.parseDOM(is, null);
rawDoc = root.getDocumentElement();
}
-
/**
- * Creates a Lucene <code>Document</code> from an {@link java.io.InputStream}.
- *
+ * Creates a Lucene <code>Document</code> from an
+ * {@link java.io.InputStream}.
+ *
* @param is
* @return org.apache.lucene.document.Document
* @exception IOException
*/
- public static org.apache.lucene.document.Document getDocument(InputStream is)
- throws IOException {
+ public static org.apache.lucene.document.Document getDocument(InputStream is) throws IOException {
HtmlDocument htmlDoc = new HtmlDocument(is);
org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
-
- luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
- luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
-
+ // luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
+ luceneDoc.add(new Field("title", htmlDoc.getTitle(), Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
+ // luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
+ luceneDoc.add(new Field("contents", htmlDoc.getBody(), Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
return luceneDoc;
}
-
/**
* Creates a Lucene <code>Document</code> from a {@link java.io.File}.
- *
+ *
* @param file
* @return org.apache.lucene.document.Document
* @exception IOException
*/
- public static org.apache.lucene.document.Document Document(File file)
- throws IOException {
+ public static org.apache.lucene.document.Document Document(File file) throws IOException {
HtmlDocument htmlDoc = new HtmlDocument(file);
org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
-
- luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
- luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
-
+ // luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
+ luceneDoc.add(new Field("title", htmlDoc.getTitle(), Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
+ // luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
+ luceneDoc.add(new Field("contents", htmlDoc.getBody(), Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
String contents = null;
BufferedReader br = new BufferedReader(new FileReader(file));
StringWriter sw = new StringWriter();
String line = br.readLine();
-
while (line != null) {
sw.write(line);
line = br.readLine();
}
-
br.close();
contents = sw.toString();
sw.close();
-
- luceneDoc.add(Field.UnIndexed("rawcontents", contents));
-
+ // luceneDoc.add(Field.UnIndexed("rawcontents", contents));
+ luceneDoc.add(new Field("rawcontents", contents, Field.Store.YES, Field.Index.NO));
return luceneDoc;
}
-
/**
* Gets the title attribute of the <code>HtmlDocument</code> object.
- *
+ *
* @return the title value
*/
public String getTitle() {
if (rawDoc == null) {
return null;
}
-
String title = "";
-
NodeList nl = rawDoc.getElementsByTagName("title");
-
if (nl.getLength() > 0) {
Element titleElement = ((Element) nl.item(0));
Text text = (Text) titleElement.getFirstChild();
-
if (text != null) {
title = text.getData();
}
}
-
return title;
}
-
/**
* Gets the body text attribute of the <code>HtmlDocument</code> object.
- *
+ *
* @return the body text value
*/
public String getBody() {
if (rawDoc == null) {
return null;
}
-
- // NOTE: JTidy will insert a meta tag: <meta name="generator" content="HTML Tidy, see www.w3.org" />
- // This means that getLength is always greater than 0
+ // NOTE: JTidy will insert a meta tag: <meta name="generator"
+ // content="HTML Tidy, see www.w3.org" />
+ // This means that getLength is always greater than 0
NodeList metaNL = rawDoc.getElementsByTagName("meta");
-
for (int i = 0; i < metaNL.getLength(); i++) {
Element metaElement = (Element) metaNL.item(i);
Attr nameAttr = metaElement.getAttributeNode("name");
Attr valueAttr = metaElement.getAttributeNode("value");
-
if ((nameAttr != null) && (valueAttr != null)) {
if (nameAttr.getValue().equals("lucene-tag-name")) {
luceneTagName = valueAttr.getValue();
}
-
if (nameAttr.getValue().equals("lucene-class-value")) {
luceneClassValue = valueAttr.getValue();
}
}
}
-
boolean indexByLucene = true;
-
if ((luceneTagName != null) && (luceneClassValue != null)) {
indexByLucene = false;
}
-
System.out.println("HtmlDocument.getBody(): Index By Lucene (Default): " + indexByLucene);
-
String body = "";
NodeList nl = rawDoc.getElementsByTagName("body");
-
if (nl.getLength() > 0) {
body = getBodyText(nl.item(0), indexByLucene);
}
-
return body;
}
-
/**
* Gets the bodyText attribute of the <code>HtmlDocument</code> object.
- *
- * @param node a DOM Node
- * @param indexByLucene DOCUMENT ME!
+ *
+ * @param node
+ * a DOM Node
+ * @param indexByLucene
+ * DOCUMENT ME!
* @return The bodyText value
*/
private String getBodyText(Node node, boolean indexByLucene) {
NodeList nl = node.getChildNodes();
StringBuffer buffer = new StringBuffer();
-
for (int i = 0; i < nl.getLength(); i++) {
boolean index = indexByLucene;
Node child = nl.item(i);
-
switch (child.getNodeType()) {
case Node.ELEMENT_NODE:
-
if ((luceneTagName != null) && (luceneClassValue != null)) {
if (child.getNodeName().equals(luceneTagName)) {
Attr attribute = ((Element) child).getAttributeNode("class");
-
if (attribute != null) {
if (attribute.getValue().equals(luceneClassValue)) {
- System.out.println("HtmlDocument.getBodyText(): <" + luceneTagName +
- " class=\"" + luceneClassValue + "\"> found!");
+ System.out.println("HtmlDocument.getBodyText(): <" + luceneTagName + " class=\"" + luceneClassValue + "\"> found!");
index = true;
}
-
}
}
}
-
buffer.append(getBodyText(child, index));
-
if (index) {
buffer.append(" ");
}
-
break;
-
case Node.TEXT_NODE:
-
if (indexByLucene) {
buffer.append(((Text) child).getData());
}
-
break;
}
}
-
return buffer.toString();
}
}
Modified: lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/index/AbstractDocumentCreator.java
URL: http://svn.apache.org/viewvc/lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/index/AbstractDocumentCreator.java?rev=574702&r1=574701&r2=574702&view=diff
==============================================================================
--- lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/index/AbstractDocumentCreator.java (original)
+++ lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/index/AbstractDocumentCreator.java Tue Sep 11 14:39:37 2007
@@ -14,80 +14,86 @@
* limitations under the License.
*
*/
-
/* $Id$ */
-
package org.apache.lenya.lucene.index;
import java.io.File;
-
-import org.apache.lucene.document.DateField;
+import org.apache.log4j.Category;
+import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-import org.apache.log4j.Category;
-
public class AbstractDocumentCreator implements DocumentCreator {
Category log = Category.getInstance(AbstractDocumentCreator.class);
-
/** Creates a new instance of AbstractDocumentCreator */
public AbstractDocumentCreator() {
}
-
/**
* DOCUMENT ME!
- *
- * @param file DOCUMENT ME!
- * @param htdocsDumpDir DOCUMENT ME!
- *
+ *
+ * @param file
+ * DOCUMENT ME!
+ * @param htdocsDumpDir
+ * DOCUMENT ME!
+ *
* @return DOCUMENT ME!
- *
- * @throws Exception DOCUMENT ME!
+ *
+ * @throws Exception
+ * DOCUMENT ME!
*/
- public Document getDocument(File file, File htdocsDumpDir)
- throws Exception {
+ public Document getDocument(File file, File htdocsDumpDir) throws Exception {
// make a new, empty document
Document doc = new Document();
-
- // Add the url as a field named "url". Use an UnIndexed field, so
+ // Add the url as a field named "url". Use an UnIndexed field, so
// that the url is just stored with the document, but is not searchable.
- String requestURI = file.getPath().replace(File.separatorChar, '/').substring(htdocsDumpDir.getPath()
- .length());
+ String requestURI = file.getPath().replace(File.separatorChar, '/').substring(htdocsDumpDir.getPath().length());
if (requestURI.substring(requestURI.length() - 8).equals(".pdf.txt")) {
- requestURI = requestURI.substring(0, requestURI.length() - 4); // Remove .txt extension from PDF text file
+ requestURI = requestURI.substring(0, requestURI.length() - 4); // Remove
+ // .txt
+ // extension
+ // from
+ // PDF
+ // text
+ // file
}
-
- doc.add(Field.UnIndexed("url", requestURI));
-
+ // doc.add(Field.UnIndexed("url", requestURI));
+ doc.add(new Field("url", requestURI, Field.Store.YES, Field.Index.NO));
// Add the mime-type as a field named "mime-type"
if (requestURI.substring(requestURI.length() - 5).equals(".html")) {
- doc.add(Field.UnIndexed("mime-type", "text/html"));
+ // doc.add(Field.UnIndexed("mime-type", "text/html"));
+ doc.add(new Field("mime-type", "text/html", Field.Store.YES, Field.Index.NO));
} else if (requestURI.substring(requestURI.length() - 4).equals(".txt")) {
- doc.add(Field.UnIndexed("mime-type", "text/plain"));
+ // doc.add(Field.UnIndexed("mime-type", "text/plain"));
+ doc.add(new Field("mime-type", "text/plain", Field.Store.YES, Field.Index.NO));
} else if (requestURI.substring(requestURI.length() - 4).equals(".pdf")) {
- doc.add(Field.UnIndexed("mime-type", "application/pdf"));
+ // doc.add(Field.UnIndexed("mime-type", "application/pdf"));
+ doc.add(new Field("mime-type", "application/pdf", Field.Store.YES, Field.Index.NO));
} else {
// Don't add any mime-type field
- //doc.add(Field.UnIndexed("mime-type", "null"));
+ // doc.add(Field.UnIndexed("mime-type", "null"));
}
-
- // Add the last modified date of the file a field named "modified". Use a
- // Keyword field, so that it's searchable, but so that no attempt is made
+ // Add the last modified date of the file as a field named "modified". Use
+ // a
+ // Keyword field, so that it's searchable, but so that no attempt is
+ // made
// to tokenize the field into words.
- doc.add(Field.Keyword("modified", DateField.timeToString(file.lastModified())));
-
+ // doc.add(Field.Keyword("modified",
+ // DateField.timeToString(file.lastModified())));
+ doc.add(new Field("modified", DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND), Field.Store.YES, Field.Index.UN_TOKENIZED, Field.TermVector.YES));
// Add the id as a field, so that index can be incrementally maintained.
- String id = IndexIterator.createID(file, htdocsDumpDir);
+ String id = IndexIterator.createID(file, htdocsDumpDir);
log.debug(id);
- doc.add(Field.Keyword("id", id));
-
- // Add the uid as a field, so that index can be incrementally maintained.
+ // doc.add(Field.Keyword("id", id));
+ doc.add(new Field("id", id, Field.Store.YES, Field.Index.UN_TOKENIZED, Field.TermVector.YES));
+ // Add the uid as a field, so that index can be incrementally
+ // maintained.
// This field is not stored with document, it is indexed, but it is not
// tokenized prior to indexing.
- String uid = IndexIterator.createUID(file, htdocsDumpDir);
+ String uid = IndexIterator.createUID(file, htdocsDumpDir);
log.debug(uid);
- doc.add(new Field("uid", uid, false, true, false));
-
+ doc.add(new Field("uid", uid,
+ // false, true, false));
+ Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
return doc;
}
}
Modified: lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/index/AbstractIndexer.java
URL: http://svn.apache.org/viewvc/lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/index/AbstractIndexer.java?rev=574702&r1=574701&r2=574702&view=diff
==============================================================================
--- lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/index/AbstractIndexer.java (original)
+++ lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/index/AbstractIndexer.java Tue Sep 11 14:39:37 2007
@@ -14,16 +14,13 @@
* limitations under the License.
*
*/
-
/* $Id$ */
-
package org.apache.lenya.lucene.index;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.Arrays;
-
import org.apache.log4j.Category;
import org.apache.lenya.lucene.IndexConfiguration;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -31,34 +28,29 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
-
import org.w3c.dom.Element;
/**
- * Abstract base class for indexers.
- * The factory method {@link #getDocumentCreator(String[])} is used to create a
- * DocumentCreator from the command-line arguments.
+ * Abstract base class for indexers. The factory method
+ * {@link #getDocumentCreator(String[])} is used to create a DocumentCreator
+ * from the command-line arguments.
*/
public abstract class AbstractIndexer implements Indexer {
- private static Category log = Category.getInstance(AbstractIndexer.class);
-
+ private static Category log = Category.getInstance(AbstractIndexer.class);
private DocumentCreator documentCreator;
private Element indexer;
private String configFileName;
-
/**
* Creates a new instance of AbstractIndexer
*/
public AbstractIndexer() {
}
-
/**
* Returns the DocumentCreator of this indexer.
*/
protected DocumentCreator getDocumentCreator() {
return documentCreator;
}
-
/**
* Initializes this indexer with command-line parameters.
*/
@@ -67,100 +59,97 @@
this.indexer = indexer;
this.configFileName = configFileName;
}
-
/**
* DOCUMENT ME!
- *
- * @param element DOCUMENT ME!
- *
+ *
+ * @param element
+ * DOCUMENT ME!
+ *
* @return DOCUMENT ME!
- *
- * @throws Exception DOCUMENT ME!
+ *
+ * @throws Exception
+ * DOCUMENT ME!
*/
public abstract DocumentCreator createDocumentCreator(Element indexer, String configFileName) throws Exception;
-
/**
- * Updates the index incrementally.
- * Walk directory hierarchy in uid order, while keeping uid iterator from
- * existing index in sync. Mismatches indicate one of:
+ * Updates the index incrementally. Walk directory hierarchy in uid order,
+ * while keeping uid iterator from existing index in sync. Mismatches
+ * indicate one of:
* <ol>
- * <li>old documents to be deleted</li>
- * <li>unchanged documents, to be left alone, or</li>
- * <li>new documents, to be indexed.</li>
+ * <li>old documents to be deleted</li>
+ * <li>unchanged documents, to be left alone, or</li>
+ * <li>new documents, to be indexed.</li>
* </ol>
*/
public void updateIndex(File dumpDirectory, File index) throws Exception {
deleteStaleDocuments(dumpDirectory, index);
doIndex(dumpDirectory, index, false);
}
-
/**
* Updates the index re one document
- *
+ *
* <ol>
- * <li>old documents to be deleted</li>
- * <li>unchanged documents, to be left alone, or</li>
- * <li>new documents, to be indexed.</li>
+ * <li>old documents to be deleted</li>
+ * <li>unchanged documents, to be left alone, or</li>
+ * <li>new documents, to be indexed.</li>
* </ol>
*/
public void indexDocument(File file) throws Exception {
IndexConfiguration config = new IndexConfiguration(configFileName);
log.debug("File: " + file);
-
File dumpDir = new File(config.resolvePath(config.getHTDocsDumpDir()));
log.debug("Dump dir: " + dumpDir);
-
File indexDir = new File(config.resolvePath(config.getIndexDir()));
log.debug("Index dir: " + indexDir);
-
-
- String id = IndexIterator.createID(file, dumpDir);
-
- boolean createNewIndex = false;
+ String id = IndexIterator.createID(file, dumpDir);
+ boolean createNewIndex = false;
if (!IndexReader.indexExists(indexDir)) {
log.warn("Index does not exist yet: " + indexDir);
createNewIndex = true;
} else {
- // Delete from index
+ // Delete from index
IndexReader reader = IndexReader.open(indexDir.getAbsolutePath());
- Term term = new Term("id", id);
+ Term term = new Term("id", id);
log.debug(term.toString());
- int numberOfDeletedDocuments = reader.delete(term);
+ int numberOfDeletedDocuments = reader.deleteDocuments(term);
if (numberOfDeletedDocuments == 1) {
log.info("Document has been deleted: " + term);
} else {
log.warn("No such document found in this index: " + term);
}
- //log.debug("Number of deleted documents: " + numberOfDeletedDocuments);
- //log.debug("Current number of documents in this index: " + reader.numDocs());
+ // log.debug("Number of deleted documents: " +
+ // numberOfDeletedDocuments);
+ // log.debug("Current number of documents in this index: " +
+ // reader.numDocs());
reader.close();
}
-
- // Append to index
+ // Append to index
Document doc = getDocumentCreator().getDocument(new File(dumpDir, id), dumpDir);
IndexWriter writer = new IndexWriter(indexDir, new StandardAnalyzer(), createNewIndex);
- writer.maxFieldLength = 1000000;
+ writer.setMaxFieldLength(1000000);
writer.addDocument(doc);
- //log.debug("Document has been added: " + doc);
+ // log.debug("Document has been added: " + doc);
log.info("Document has been added: " + id);
writer.optimize();
writer.close();
}
-
/**
* Creates a new index.
*/
- public void createIndex(File dumpDirectory, File index)
- throws Exception {
+ public void createIndex(File dumpDirectory, File index) throws Exception {
doIndex(dumpDirectory, index, true);
}
-
/**
* Index files
- *
- * @param dumpDirectory Directory where the files to be indexed are located
- * @param index Directory where the index shall be located
- * @param create <strong>true</strong> means the index will be created from scratch, <strong>false</strong> means it will be indexed incrementally
+ *
+ * @param dumpDirectory
+ * Directory where the files to be indexed are located
+ * @param index
+ * Directory where the index shall be located
+ * @param create
+ * <strong>true</strong> means the index will be created from
+ * scratch, <strong>false</strong> means it will be indexed
+ * incrementally
*/
public void doIndex(File dumpDirectory, File index, boolean create) {
if (!index.isDirectory()) {
@@ -169,56 +158,46 @@
}
try {
IndexWriter writer = new IndexWriter(index.getAbsolutePath(), new StandardAnalyzer(), create);
- writer.maxFieldLength = 1000000;
-
+ writer.setMaxFieldLength(1000000);
IndexInformation info = new IndexInformation(index.getAbsolutePath(), dumpDirectory, getFilter(indexer, configFileName), create);
-
IndexHandler handler;
-
if (create) {
handler = new CreateIndexHandler(dumpDirectory, info, writer);
} else {
handler = new UpdateIndexHandler(dumpDirectory, info, writer);
}
-
IndexIterator iterator = new IndexIterator(index.getAbsolutePath(), getFilter(indexer, configFileName));
iterator.addHandler(handler);
iterator.iterate(dumpDirectory);
-
writer.optimize();
writer.close();
} catch (IOException e) {
log.error(e);
}
}
-
/**
* Delete the stale documents.
*/
- protected void deleteStaleDocuments(File dumpDirectory, File index)
- throws Exception {
+ protected void deleteStaleDocuments(File dumpDirectory, File index) throws Exception {
log.debug("Deleting stale documents");
-
IndexIterator iterator = new IndexIterator(index.getAbsolutePath(), getFilter(indexer, configFileName));
iterator.addHandler(new DeleteHandler());
iterator.iterate(dumpDirectory);
log.debug("Deleting stale documents finished");
}
-
/**
- * Returns the filter used to receive the indexable files. Might be overwritten by inherited class.
+ * Returns the filter used to receive the indexable files. Might be
+ * overwritten by inherited class.
*/
public FileFilter getFilter(Element indexer, String configFileName) {
String[] indexableExtensions = { "html", "htm", "txt" };
return new AbstractIndexer.DefaultIndexFilter(indexableExtensions);
}
-
/**
* FileFilter used to obtain the files to index.
*/
public class DefaultIndexFilter implements FileFilter {
protected String[] indexableExtensions;
-
/**
* Default indexable extensions: html, htm, txt
*/
@@ -226,24 +205,24 @@
String[] iE = { "html", "htm", "txt" };
indexableExtensions = iE;
}
-
/**
- *
+ *
*/
public DefaultIndexFilter(String[] indexableExtensions) {
this.indexableExtensions = indexableExtensions;
}
-
- /** Tests whether or not the specified abstract pathname should be
+ /**
+ * Tests whether or not the specified abstract pathname should be
* included in a pathname list.
- *
- * @param pathname The abstract pathname to be tested
- * @return <code>true</code> if and only if <code>pathname</code> should be included
- *
+ *
+ * @param pathname
+ * The abstract pathname to be tested
+ * @return <code>true</code> if and only if <code>pathname</code>
+ * should be included
+ *
*/
public boolean accept(File file) {
boolean accept;
-
if (file.isDirectory()) {
accept = true;
} else {
@@ -251,105 +230,95 @@
String extension = fileName.substring(fileName.lastIndexOf(".") + 1);
accept = Arrays.asList(indexableExtensions).contains(extension);
}
-
return accept;
}
}
-
/**
- * Deletes all stale documents up to the document representing the next file.
- * The following documents are deleted:
+ * Deletes all stale documents up to the document representing the next
+ * file. The following documents are deleted:
* <ul>
- * <li>representing files that where removed</li>
- * <li>representing the same file but are older than the current file</li>
+ * <li>representing files that were removed</li>
+ * <li>representing the same file but are older than the current file</li>
* </ul>
*/
public class DeleteHandler extends AbstractIndexIteratorHandler {
- /** Handles a stale document.
- *
+ /**
+ * Handles a stale document.
+ *
*/
public void handleStaleDocument(IndexReader reader, Term term) {
- log.debug("deleting " +
- IndexIterator.uid2url(term.text()));
-
+ log.debug("deleting " + IndexIterator.uid2url(term.text()));
try {
- int deletedDocuments = reader.delete(term);
- log.debug("deleted " + deletedDocuments +
- " documents.");
+ int deletedDocuments = reader.deleteDocuments(term);
+ log.debug("deleted " + deletedDocuments + " documents.");
} catch (IOException e) {
log.error(e);
}
}
}
-
/**
* DOCUMENT ME!
*/
public class IndexHandler extends AbstractIndexIteratorHandler {
/**
* Creates a new IndexHandler object.
- *
- * @param dumpDirectory DOCUMENT ME!
- * @param info DOCUMENT ME!
- * @param writer DOCUMENT ME!
+ *
+ * @param dumpDirectory
+ * DOCUMENT ME!
+ * @param info
+ * DOCUMENT ME!
+ * @param writer
+ * DOCUMENT ME!
*/
public IndexHandler(File dumpDirectory, IndexInformation info, IndexWriter writer) {
this.info = info;
this.dumpDirectory = dumpDirectory;
this.writer = writer;
}
-
private IndexInformation info;
-
protected IndexInformation getInformation() {
return info;
}
-
private File dumpDirectory;
-
protected File getDumpDirectory() {
return dumpDirectory;
}
-
private IndexWriter writer;
-
protected IndexWriter getWriter() {
return writer;
}
-
/**
- * Add document to index
- */
+ * Add document to index
+ */
protected void addFile(File file) {
log.debug("adding document: " + file.getAbsolutePath());
-
try {
Document doc = getDocumentCreator().getDocument(file, dumpDirectory);
writer.addDocument(doc);
} catch (Exception e) {
log.error(e);
}
-
info.increase();
log.info(info.printProgress());
}
}
-
/**
* DOCUMENT ME!
*/
public class CreateIndexHandler extends IndexHandler {
/**
* Creates a new CreateIndexHandler object.
- *
- * @param dumpDirectory DOCUMENT ME!
- * @param info DOCUMENT ME!
- * @param writer DOCUMENT ME!
+ *
+ * @param dumpDirectory
+ * DOCUMENT ME!
+ * @param info
+ * DOCUMENT ME!
+ * @param writer
+ * DOCUMENT ME!
*/
public CreateIndexHandler(File dumpDirectory, IndexInformation info, IndexWriter writer) {
super(dumpDirectory, info, writer);
}
-
/**
* Handles a file. Used when creating a new index.
*/
@@ -357,22 +326,23 @@
addFile(file);
}
}
-
/**
* DOCUMENT ME!
*/
public class UpdateIndexHandler extends IndexHandler {
/**
* Creates a new UpdateIndexHandler object.
- *
- * @param dumpDirectory DOCUMENT ME!
- * @param info DOCUMENT ME!
- * @param writer DOCUMENT ME!
+ *
+ * @param dumpDirectory
+ * DOCUMENT ME!
+ * @param info
+ * DOCUMENT ME!
+ * @param writer
+ * DOCUMENT ME!
*/
public UpdateIndexHandler(File dumpDirectory, IndexInformation info, IndexWriter writer) {
super(dumpDirectory, info, writer);
}
-
/**
* Handles a new document. Used when updating the index.
*/
Modified: lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/index/DefaultDocumentCreator.java
URL: http://svn.apache.org/viewvc/lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/index/DefaultDocumentCreator.java?rev=574702&r1=574701&r2=574702&view=diff
==============================================================================
--- lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/index/DefaultDocumentCreator.java (original)
+++ lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/lucene/index/DefaultDocumentCreator.java Tue Sep 11 14:39:37 2007
@@ -14,46 +14,44 @@
* limitations under the License.
*
*/
-
/* $Id$ */
-
package org.apache.lenya.lucene.index;
import java.io.File;
-
import org.apache.lenya.lucene.parser.HTMLParser;
import org.apache.lenya.lucene.parser.HTMLParserFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
public class DefaultDocumentCreator extends AbstractDocumentCreator {
-
- /**
+ /**
* Creates a new instance of DefaultDocumentCreator
*/
public DefaultDocumentCreator() {
}
-
/**
* DOCUMENT ME!
- *
- * @param file DOCUMENT ME!
- * @param htdocsDumpDir DOCUMENT ME!
- *
+ *
+ * @param file
+ * DOCUMENT ME!
+ * @param htdocsDumpDir
+ * DOCUMENT ME!
+ *
* @return DOCUMENT ME!
- *
- * @throws Exception DOCUMENT ME!
+ *
+ * @throws Exception
+ * DOCUMENT ME!
*/
public Document getDocument(File file, File htdocsDumpDir) throws Exception {
Document document = super.getDocument(file, htdocsDumpDir);
-
HTMLParser parser = HTMLParserFactory.newInstance(file);
parser.parse(file);
-
- document.add(Field.Text("title", parser.getTitle()));
- document.add(Field.Text("keywords", parser.getKeywords()));
- document.add(Field.Text("contents", parser.getReader()));
-
+ // document.add(Field.Text("title", parser.getTitle()));
+ document.add(new Field("title", parser.getTitle(), Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
+ // document.add(Field.Text("keywords", parser.getKeywords()));
+ document.add(new Field("keywords", parser.getKeywords(), Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
+ // document.add(Field.Text("contents", parser.getReader()));
+ document.add(new Field("contents", parser.getReader()));
return document;
}
}
Modified: lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/util/CacheMap.java
URL: http://svn.apache.org/viewvc/lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/util/CacheMap.java?rev=574702&r1=574701&r2=574702&view=diff
==============================================================================
--- lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/util/CacheMap.java (original)
+++ lenya/branches/revolution/1.3.x/src/java/org/apache/lenya/util/CacheMap.java Tue Sep 11 14:39:37 2007
@@ -38,7 +38,7 @@
* @param capacity The maximum number of entries.
*/
public CacheMap(int capacity) {
- assert capacity > -1;
+// assert capacity > -1;
this.capacity = capacity;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@lenya.apache.org
For additional commands, e-mail: commits-help@lenya.apache.org