You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2010/01/05 11:15:10 UTC
svn commit: r895972 - in /lucene/nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/fetcher/Fetcher.java
src/java/org/apache/nutch/parse/ParseSegment.java
src/java/org/apache/nutch/protocol/ProtocolStatus.java
Author: jnioche
Date: Tue Jan 5 10:14:49 2010
New Revision: 895972
URL: http://svn.apache.org/viewvc?rev=895972&view=rev
Log:
NUTCH-658: Add counters for the number of documents fetched in Reporter
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=895972&r1=895971&r2=895972&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Jan 5 10:14:49 2010
@@ -2,6 +2,8 @@
Unreleased Changes
+* NUTCH-658 Use counters to report fetching and parsing status (jnioche)
+
* NUTCH-777 Upgrading to jetty6 broke unit tests (mattmann)
* NUTCH-767 Update Tika to v0.5 for the MimeType detection (Julien Nioche via ab)
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=895972&r1=895971&r2=895972&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Jan 5 10:14:49 2010
@@ -607,6 +607,7 @@
LOG.debug("Denied by robots.txt: " + fit.url);
}
output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
+ reporter.incrCounter("FetcherStatus", "robots_denied", 1);
continue;
}
if (rules.getCrawlDelay() > 0) {
@@ -615,6 +616,7 @@
fetchQueues.finishFetchItem(fit, true);
LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
+ reporter.incrCounter("FetcherStatus", "robots_denied_maxcrawldelay", 1);
continue;
} else {
FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
@@ -630,6 +632,8 @@
String urlString = fit.url.toString();
+ reporter.incrCounter("FetcherStatus", status.getName(), 1);
+
switch(status.getCode()) {
case ProtocolStatus.WOULDBLOCK:
@@ -664,6 +668,7 @@
} else {
// stop redirecting
redirecting = false;
+ reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
}
}
}
@@ -701,6 +706,7 @@
} else {
// stop redirecting
redirecting = false;
+ reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
}
} else {
// stop redirecting
@@ -926,6 +932,7 @@
if (parseResult != null && !parseResult.isEmpty()) {
Parse p = parseResult.get(content.getUrl());
if (p != null) {
+ reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1);
return p.getData().getStatus();
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=895972&r1=895971&r2=895972&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Tue Jan 5 10:14:49 2010
@@ -93,6 +93,8 @@
Parse parse = entry.getValue();
ParseStatus parseStatus = parse.getData().getStatus();
+ reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[parseStatus.getMajorCode()], 1);
+
if (!parseStatus.isSuccess()) {
LOG.warn("Error parsing: " + key + ": " + parseStatus);
parse = parseStatus.getEmptyParse(getConf());
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=895972&r1=895971&r2=895972&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Tue Jan 5 10:14:49 2010
@@ -191,6 +191,10 @@
public int getCode() {
return code;
}
+
+ public String getName() {
+ return codeToName.get(this.code);
+ }
public void setCode(int code) {
this.code = code;