You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2007/05/09 21:36:55 UTC
svn commit: r536629 - in /lucene/nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/indexer/Indexer.java
src/java/org/apache/nutch/indexer/IndexingFilter.java
src/java/org/apache/nutch/indexer/IndexingFilters.java
Author: ab
Date: Wed May 9 12:36:54 2007
New Revision: 536629
URL: http://svn.apache.org/viewvc?view=rev&rev=536629
Log:
NUTCH-393 - Indexer should handle null documents returned by filters.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=536629&r1=536628&r2=536629
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed May 9 12:36:54 2007
@@ -7,6 +7,9 @@
2. NUTCH-443 - Allow parsers to return multiple Parse objects.
(Dogacan Guney et al, via ab)
+ 3. NUTCH-393 - Indexer should handle null documents returned by filters.
+ (Eelco Lempsink via ab)
+
Release 0.9 - 2007-04-02
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=536629&r1=536628&r2=536629
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Wed May 9 12:36:54 2007
@@ -218,6 +218,9 @@
return;
}
+ // skip documents discarded by indexing filters
+ if (doc == null) return;
+
float boost = 1.0f;
// run scoring filters
try {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?view=diff&rev=536629&r1=536628&r2=536629
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java Wed May 9 12:36:54 2007
@@ -41,14 +41,15 @@
/**
* Adds fields or otherwise modifies the document that will be indexed for a
- * parse.
+ * parse. Unwanted documents can be removed from indexing by returning a null value.
*
* @param doc document instance for collecting fields
* @param parse parse data instance
* @param url page url
* @param datum crawl datum for the page
* @param inlinks page inlinks
- * @return modified (or a new) document instance
+ * @return modified (or a new) document instance, or null (meaning the document
+ * should be discarded)
* @throws IndexingException
*/
Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?view=diff&rev=536629&r1=536628&r2=536629
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Wed May 9 12:36:54 2007
@@ -108,6 +108,8 @@
Inlinks inlinks) throws IndexingException {
for (int i = 0; i < this.indexingFilters.length; i++) {
doc = this.indexingFilters[i].filter(doc, parse, url, datum, inlinks);
+ // break the loop if an indexing filter discards the doc
+ if (doc == null) return null;
}
return doc;