You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/07/26 14:07:46 UTC
svn commit: r1365960 - in /nutch/branches/2.x: CHANGES.txt
src/java/org/apache/nutch/parse/ParseUtil.java
src/java/org/apache/nutch/parse/ParserJob.java
Author: ferdy
Date: Thu Jul 26 12:07:46 2012
New Revision: 1365960
URL: http://svn.apache.org/viewvc?rev=1365960&view=rev
Log:
NUTCH-1438 ParserJob support for option -reparse
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1365960&r1=1365959&r2=1365960&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Jul 26 12:07:46 2012
@@ -1,6 +1,8 @@
Nutch Change Log
Release 2.1 - Current Development
+* NUTCH-1438 ParserJob support for option -reparse (ferdy)
+
* NUTCH-1437 HostInjectorJob to accept lines with or without protocol (ferdy)
* NUTCH-1435 Host jobs throw NullPointerException with MySQL (ferdy via lewismc)
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1365960&r1=1365959&r2=1365960&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Thu Jul 26 12:07:46 2012
@@ -269,7 +269,10 @@ public class ParseUtil extends Configure
page.putToOutlinks(new Utf8(toUrl), new Utf8(outlinks[i].getAnchor()));
}
- Mark.PARSE_MARK.putMark(page, Mark.FETCH_MARK.checkMark(page));
+ Utf8 fetchMark = Mark.FETCH_MARK.checkMark(page);
+ if (fetchMark != null) {
+ Mark.PARSE_MARK.putMark(page, fetchMark);
+ }
}
}
return redirectedPage;
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java?rev=1365960&r1=1365959&r2=1365960&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java Thu Jul 26 12:07:46 2012
@@ -55,6 +55,8 @@ public class ParserJob extends NutchTool
private static final String FORCE_KEY = "parse.job.force";
public static final String SKIP_TRUNCATED = "parser.skip.truncated";
+
+ private static final Utf8 REPARSE = new Utf8("-reparse");
private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
@@ -100,19 +102,23 @@ public class ParserJob extends NutchTool
throws IOException, InterruptedException {
Utf8 mark = Mark.FETCH_MARK.checkMark(page);
String unreverseKey = TableUtil.unreverseUrl(key);
- if (!NutchJob.shouldProcess(mark, batchId)) {
- LOG.info("Skipping " + unreverseKey + "; different batch id");
- return;
- }
- if (shouldResume && Mark.PARSE_MARK.checkMark(page) != null) {
- if (force) {
- LOG.info("Forced parsing " + unreverseKey + "; already parsed");
- } else {
- LOG.info("Skipping " + unreverseKey + "; already parsed");
+ if (batchId.equals(REPARSE)) {
+ LOG.debug("Reparsing " + unreverseKey);
+ } else {
+ if (!NutchJob.shouldProcess(mark, batchId)) {
+ LOG.info("Skipping " + unreverseKey + "; different batch id");
return;
}
- } else {
- LOG.info("Parsing " + unreverseKey);
+ if (shouldResume && Mark.PARSE_MARK.checkMark(page) != null) {
+ if (force) {
+ LOG.info("Forced parsing " + unreverseKey + "; already parsed");
+ } else {
+ LOG.info("Skipping " + unreverseKey + "; already parsed");
+ return;
+ }
+ } else {
+ LOG.info("Parsing " + unreverseKey);
+ }
}
if (skipTruncated && isTruncated(unreverseKey, page)) {
@@ -294,7 +300,7 @@ public class ParserJob extends NutchTool
}
}
if (batchId == null) {
- System.err.println("BatchId not set (or -all not specified)!");
+ System.err.println("BatchId not set (or -all/-reparse not specified)!");
return -1;
}
return parse(batchId, shouldResume, force);