You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain text copy.
Posted to commits@nutch.apache.org by ab...@apache.org on 2007/05/09 21:36:55 UTC

svn commit: r536629 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/Indexer.java src/java/org/apache/nutch/indexer/IndexingFilter.java src/java/org/apache/nutch/indexer/IndexingFilters.java

Author: ab
Date: Wed May  9 12:36:54 2007
New Revision: 536629

URL: http://svn.apache.org/viewvc?view=rev&rev=536629
Log:
NUTCH-393 - Indexer should handle null documents returned by filters.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=536629&r1=536628&r2=536629
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed May  9 12:36:54 2007
@@ -7,6 +7,9 @@
  2. NUTCH-443 - Allow parsers to return multiple Parse objects.
     (Dogacan Guney et al, via ab)
 
+ 3. NUTCH-393 - Indexer should handle null documents returned by filters.
+    (Eelco Lempsink via ab)
+
 
 
 Release 0.9 - 2007-04-02

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=536629&r1=536628&r2=536629
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Wed May  9 12:36:54 2007
@@ -218,6 +218,9 @@
       return;
     }
 
+    // skip documents discarded by indexing filters
+    if (doc == null) return;
+    
     float boost = 1.0f;
     // run scoring filters
     try {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?view=diff&rev=536629&r1=536628&r2=536629
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java Wed May  9 12:36:54 2007
@@ -41,14 +41,15 @@
 
   /**
    * Adds fields or otherwise modifies the document that will be indexed for a
-   * parse.
+   * parse. Unwanted documents can be removed from indexing by returning a null value.
    * 
    * @param doc document instance for collecting fields
    * @param parse parse data instance
    * @param url page url
    * @param datum crawl datum for the page
    * @param inlinks page inlinks
-   * @return modified (or a new) document instance
+   * @return modified (or a new) document instance, or null (meaning the document
+   * should be discarded)
    * @throws IndexingException
    */
   Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?view=diff&rev=536629&r1=536628&r2=536629
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Wed May  9 12:36:54 2007
@@ -108,6 +108,8 @@
       Inlinks inlinks) throws IndexingException {
     for (int i = 0; i < this.indexingFilters.length; i++) {
       doc = this.indexingFilters[i].filter(doc, parse, url, datum, inlinks);
+      // stop filtering and return null as soon as an indexing filter discards the doc
+      if (doc == null) return null;
     }
 
     return doc;