You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/07/26 14:07:46 UTC

svn commit: r1365960 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/parse/ParseUtil.java src/java/org/apache/nutch/parse/ParserJob.java

Author: ferdy
Date: Thu Jul 26 12:07:46 2012
New Revision: 1365960

URL: http://svn.apache.org/viewvc?rev=1365960&view=rev
Log:
NUTCH-1438 ParserJob support for option -reparse

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
    nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1365960&r1=1365959&r2=1365960&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Jul 26 12:07:46 2012
@@ -1,6 +1,8 @@
 Nutch Change Log
 
 Release 2.1 - Current Development
+* NUTCH-1438 ParserJob support for option -reparse (ferdy)
+
 * NUTCH-1437 HostInjectorJob to accept lines with or without protocol (ferdy)
 
 * NUTCH-1435 Host jobs throw NullPointerException with MySQL (ferdy via lewismc)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1365960&r1=1365959&r2=1365960&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Thu Jul 26 12:07:46 2012
@@ -269,7 +269,10 @@ public class ParseUtil extends Configure
 
           page.putToOutlinks(new Utf8(toUrl), new Utf8(outlinks[i].getAnchor()));
         }
-        Mark.PARSE_MARK.putMark(page, Mark.FETCH_MARK.checkMark(page));
+        Utf8 fetchMark = Mark.FETCH_MARK.checkMark(page);
+        if (fetchMark != null) {
+          Mark.PARSE_MARK.putMark(page, fetchMark);
+        }
       }
     }
     return redirectedPage;

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java?rev=1365960&r1=1365959&r2=1365960&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java Thu Jul 26 12:07:46 2012
@@ -55,6 +55,8 @@ public class ParserJob extends NutchTool
   private static final String FORCE_KEY = "parse.job.force";
   
   public static final String SKIP_TRUNCATED = "parser.skip.truncated";
+  
+  private static final Utf8 REPARSE = new Utf8("-reparse");
 
   private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
 
@@ -100,19 +102,23 @@ public class ParserJob extends NutchTool
         throws IOException, InterruptedException {
       Utf8 mark = Mark.FETCH_MARK.checkMark(page);
       String unreverseKey = TableUtil.unreverseUrl(key);
-      if (!NutchJob.shouldProcess(mark, batchId)) {
-        LOG.info("Skipping " + unreverseKey + "; different batch id");
-        return;
-      }
-      if (shouldResume && Mark.PARSE_MARK.checkMark(page) != null) {
-        if (force) {
-          LOG.info("Forced parsing " + unreverseKey + "; already parsed");
-        } else {
-          LOG.info("Skipping " + unreverseKey + "; already parsed");
+      if (batchId.equals(REPARSE)) {
+        LOG.debug("Reparsing " + unreverseKey);
+      } else {
+        if (!NutchJob.shouldProcess(mark, batchId)) {
+          LOG.info("Skipping " + unreverseKey + "; different batch id");
           return;
         }
-      } else {
-        LOG.info("Parsing " + unreverseKey);
+        if (shouldResume && Mark.PARSE_MARK.checkMark(page) != null) {
+          if (force) {
+            LOG.info("Forced parsing " + unreverseKey + "; already parsed");
+          } else {
+            LOG.info("Skipping " + unreverseKey + "; already parsed");
+            return;
+          }
+        } else {
+          LOG.info("Parsing " + unreverseKey);
+        }
       }
 
       if (skipTruncated && isTruncated(unreverseKey, page)) {
@@ -294,7 +300,7 @@ public class ParserJob extends NutchTool
       }
     }
     if (batchId == null) {
-      System.err.println("BatchId not set (or -all not specified)!");
+      System.err.println("BatchId not set (or -all/-reparse not specified)!");
       return -1;
     }
     return parse(batchId, shouldResume, force);