You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2010/07/19 13:20:05 UTC

svn commit: r965430 - /nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParserJob.java

Author: ab
Date: Mon Jul 19 11:20:05 2010
New Revision: 965430

URL: http://svn.apache.org/viewvc?rev=965430&view=rev
Log:
Improve cmd-line args handling. Add option to force a re-parse.

Modified:
    nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParserJob.java

Modified: nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParserJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParserJob.java?rev=965430&r1=965429&r2=965430&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParserJob.java Mon Jul 19 11:20:05 2010
@@ -29,6 +29,9 @@ public class ParserJob extends GoraMappe
     implements Tool {
 
   public static final Log LOG = LogFactory.getLog(ParserJob.class);
+  
+  private static final String RESUME_KEY = "parse.job.resume";
+  private static final String FORCE_KEY = "parse.job.force";
 
   private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
 
@@ -46,7 +49,9 @@ public class ParserJob extends GoraMappe
 
   private ParseUtil parseUtil;
 
-  private boolean shouldContinue;
+  private boolean shouldResume;
+  
+  private boolean force;
 
   private Utf8 crawlId;
 
@@ -54,7 +59,8 @@ public class ParserJob extends GoraMappe
   public void setup(Context context) throws IOException {
     Configuration conf = context.getConfiguration();
     parseUtil = new ParseUtil(conf);
-    shouldContinue = conf.getBoolean("job.continue", false);
+    shouldResume = conf.getBoolean(RESUME_KEY, false);
+    force = conf.getBoolean(FORCE_KEY, false);    
     crawlId = new Utf8(conf.get(GeneratorJob.CRAWL_ID, Nutch.ALL_CRAWL_ID_STR));
   }
 
@@ -68,11 +74,17 @@ public class ParserJob extends GoraMappe
       }
       return;
     }
-    if (shouldContinue && Mark.PARSE_MARK.checkMark(page) != null) {
-      if (LOG.isDebugEnabled()) {
-        LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; already parsed");
+    if (shouldResume && Mark.PARSE_MARK.checkMark(page) != null) {
+      if (force) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Forced parsing " + TableUtil.unreverseUrl(key) + "; already parsed");
+        }        
+      } else {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; already parsed");
+        }
+        return;
       }
-      return;
     }
 
     URLWebPage redirectedPage = parseUtil.process(key, page);
@@ -123,17 +135,21 @@ public class ParserJob extends GoraMappe
     this.conf = conf;
   }
 
-  public int parse(String crawlId, boolean shouldContinue) throws Exception {
+  public int parse(String crawlId, boolean shouldResume, boolean force) throws Exception {
     LOG.info("ParserJob: starting");
 
-    getConf().set(GeneratorJob.CRAWL_ID, crawlId);
-    getConf().setBoolean("job.continue", shouldContinue);
+    if (crawlId != null) {
+      getConf().set(GeneratorJob.CRAWL_ID, crawlId);
+    }
+    getConf().setBoolean(RESUME_KEY, shouldResume);
+    getConf().setBoolean(FORCE_KEY, force);
 
-    LOG.info("ParserJob: continuing: " + getConf().getBoolean("job.continue", false));
-    if (crawlId.equals(Nutch.ALL_CRAWL_ID_STR)) {
+    LOG.info("ParserJob: resuming:\t" + getConf().getBoolean(RESUME_KEY, false));
+    LOG.info("ParserJob: forced reparse:\t" + getConf().getBoolean(FORCE_KEY, false));
+    if (crawlId == null || crawlId.equals(Nutch.ALL_CRAWL_ID_STR)) {
       LOG.info("ParserJob: parsing all");
     } else {
-      LOG.info("ParserJob: crawlId: " + crawlId);
+      LOG.info("ParserJob: crawlId:\t" + crawlId);
     }
 
     final Job job = new NutchJob(getConf(), "parse");
@@ -153,27 +169,38 @@ public class ParserJob extends GoraMappe
   }
 
   public int run(String[] args) throws Exception {
-    boolean shouldContinue = false;
-    String crawlId;
-
-    String usage = "Usage: ParserJob (<crawl id> | -all) [-continue]";
+    boolean shouldResume = false;
+    boolean force = false;
+    String crawlId = null;
 
     if (args.length < 1) {
-      System.err.println(usage);
-      return 1;
+      System.err.println("Usage: ParserJob (<crawlId> | -all) [-resume] [-force]");
+      System.err.println("\tcrawlId\tsymbolic crawl ID created by Generator");
+      System.err.println("\t-all\tconsider pages from all crawl jobs");
+      System.err.println("-resume\tresume a previous incomplete job");
+      System.err.println("-force\tforce re-parsing even if a page is already parsed");
+      return -1;
     }
-
-    crawlId = args[0];
-    if (crawlId.equals("-continue")) {
-      System.err.println(usage);
-      return 1;
+    for (String s : args) {
+      if ("-resume".equals(s)) {
+        shouldResume = true;
+      } else if ("-force".equals(s)) {
+        force = true;
+      } else if ("-all".equals(s)) {
+        crawlId = s;
+      } else {
+        if (crawlId != null) {
+          System.err.println("CrawlId already set to '" + crawlId + "'!");
+          return -1;
+        }
+        crawlId = s;
+      }
     }
-
-    if (args.length >= 1 && "-continue".equals(args[0])) {
-      shouldContinue = true;
+    if (crawlId == null) {
+      System.err.println("CrawlId not set (or -all not specified)!");
+      return -1;
     }
-
-    return parse(crawlId, shouldContinue);
+    return parse(crawlId, shouldResume, force);
   }
 
   public static void main(String[] args) throws Exception {