You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@nutch.apache.org by Paul Tomblin <pt...@xcski.com> on 2009/10/14 16:37:30 UTC

Re: Recrawling Nutch

Nutch doesn't do a good job of storing or testing the Last-Modified
time of pages it has crawled.  I made the following changes, which seem
to help a lot:

snowbird:~/src/nutch/trunk> svn diff
Index: src/java/org/apache/nutch/fetcher/Fetcher.java
===================================================================
--- src/java/org/apache/nutch/fetcher/Fetcher.java	(revision 817382)
+++ src/java/org/apache/nutch/fetcher/Fetcher.java	(working copy)
@@ -21,6 +21,7 @@
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.net.UnknownHostException;
+import java.text.ParseException;
 import java.util.*;
 import java.util.Map.Entry;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -42,6 +43,7 @@
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.net.*;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.protocol.*;
 import org.apache.nutch.parse.*;
 import org.apache.nutch.scoring.ScoringFilters;
@@ -742,6 +744,23 @@

       datum.setStatus(status);
       datum.setFetchTime(System.currentTimeMillis());
+      LOG.debug("metadata = " + (content != null ?
content.getMetadata() : "content-null"));
+      LOG.debug("modified? = " + ((content != null &&
content.getMetadata() != null) ?
content.getMetadata().get("Last-Modified") : "content-null"));
+      if (content != null && content.getMetadata() != null &&
content.getMetadata().get("Last-Modified") != null)
+      {
+          String lastModifiedStr = content.getMetadata().get("Last-Modified");
+
+          try
+          {
+              long lastModifiedDate = HttpDateFormat.toLong(lastModifiedStr);
+              LOG.debug("last modified = " + lastModifiedStr + " = "
+ lastModifiedDate);
+              datum.setModifiedTime(lastModifiedDate);
+          }
+          catch (ParseException e)
+          {
+              LOG.error("unable to parse " + lastModifiedStr, e);
+          }
+      }
       if (pstatus != null)
datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

       ParseResult parseResult = null;
Index: src/java/org/apache/nutch/indexer/IndexerMapReduce.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexerMapReduce.java	(revision 817382)
+++ src/java/org/apache/nutch/indexer/IndexerMapReduce.java	(working copy)
@@ -84,8 +84,10 @@
         if (CrawlDatum.hasDbStatus(datum))
           dbDatum = datum;
         else if (CrawlDatum.hasFetchStatus(datum)) {
-          // don't index unmodified (empty) pages
-          if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED)
+          /*
+           * Where did this person get the idea that unmodified pages
are empty?
+           // don't index unmodified (empty) pages
+          if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) */
             fetchDatum = datum;
         } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
                    CrawlDatum.STATUS_SIGNATURE == datum.getStatus()) {
@@ -108,7 +110,7 @@
     }

     if (!parseData.getStatus().isSuccess() ||
-        fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
+        (fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS &&
fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED)) {
       return;
     }

Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
===================================================================
--- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java	(revision
817382)
+++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java	(working
copy)
@@ -124,11 +124,14 @@
         reqStr.append("\r\n");
       }

-      reqStr.append("\r\n");
       if (datum.getModifiedTime() > 0) {
-        reqStr.append("If-Modified-Since: " +
HttpDateFormat.toString(datum.getModifiedTime()));
+		String httpDate =
+		  HttpDateFormat.toString(datum.getModifiedTime());
+		Http.LOG.debug("modified time: " + httpDate);
+        reqStr.append("If-Modified-Since: " + httpDate);
         reqStr.append("\r\n");
       }
+      reqStr.append("\r\n");

       byte[] reqBytes= reqStr.toString().getBytes();



On Wed, Oct 14, 2009 at 9:40 AM, sprabhu_PN
<sh...@pinakilabs.com> wrote:
>
> "We are looking at picking up updates in a recrawl - How do I get the
> fetcher to read the recently built segment, get to the url and decide
> whether to get the content based on whether the url has been updated since?
> "
>
> Shreekanth Prabhu
> --
> View this message in context: http://www.nabble.com/Recrawling--Nutch-tp25891294p25891294.html
> Sent from the Nutch - User mailing list archive at Nabble.com.
>
>



-- 
http://www.linkedin.com/in/paultomblin