You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2008/03/17 13:33:58 UTC
svn commit: r637858 - in /lucene/nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/fetcher/Fetcher.java
src/java/org/apache/nutch/fetcher/Fetcher2.java
src/java/org/apache/nutch/parse/ParseOutputFormat.java
Author: ab
Date: Mon Mar 17 05:33:56 2008
New Revision: 637858
URL: http://svn.apache.org/viewvc?rev=637858&view=rev
Log:
NUTCH-615 Redirected URLs fetched without setting fetchInterval. Guard against
reprUrl being null.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=637858&r1=637857&r2=637858&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar 17 05:33:56 2008
@@ -229,6 +229,9 @@
83. NUTCH-126 - Fetching https does not work with a proxy (Fritz Elfert via ab)
+84. NUTCH-615 - Redirected URL-s fetched without setting fetchInterval.
+ Guard against reprUrl being null. (Emmanuel Joke, ab)
+
Release 0.9 - 2007-04-02
1. Changed log4j confiquration to log to stdout on commandline
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=637858&r1=637857&r2=637858&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Mon Mar 17 05:33:56 2008
@@ -282,8 +282,10 @@
return url;
} else {
CrawlDatum newDatum = new CrawlDatum();
- newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
- new Text(reprUrl));
+ if (reprUrl != null) {
+ newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+ new Text(reprUrl));
+ }
output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
if (LOG.isDebugEnabled()) {
LOG.debug(" - " + redirType + " redirect to " +
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=637858&r1=637857&r2=637858&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Mon Mar 17 05:33:56 2008
@@ -549,9 +549,12 @@
refreshTime < Fetcher.PERM_REFRESH_TIME,
Fetcher.CONTENT_REDIR);
if (redirUrl != null) {
- CrawlDatum newDatum = new CrawlDatum();
- newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
- new Text(reprUrl));
+ CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
+ fit.datum.getFetchInterval(), fit.datum.getScore());
+ if (reprUrl != null) {
+ newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+ new Text(reprUrl));
+ }
fit = FetchItem.create(redirUrl, newDatum, byIP);
if (fit != null) {
FetchItemQueue fiq =
@@ -582,14 +585,22 @@
handleRedirect(fit.url, fit.datum,
urlString, newUrl, temp,
Fetcher.PROTOCOL_REDIR);
- CrawlDatum newDatum = new CrawlDatum();
- newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
- new Text(reprUrl));
- fit = FetchItem.create(redirUrl, newDatum, byIP);
- if (fit != null) {
- FetchItemQueue fiq =
- fetchQueues.getFetchItemQueue(fit.queueID);
- fiq.addInProgressFetchItem(fit);
+ if (redirUrl != null) {
+ CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
+ fit.datum.getFetchInterval(), fit.datum.getScore());
+ if (reprUrl != null) {
+ newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+ new Text(reprUrl));
+ }
+ fit = FetchItem.create(redirUrl, newDatum, byIP);
+ if (fit != null) {
+ FetchItemQueue fiq =
+ fetchQueues.getFetchItemQueue(fit.queueID);
+ fiq.addInProgressFetchItem(fit);
+ } else {
+ // stop redirecting
+ redirecting = false;
+ }
} else {
// stop redirecting
redirecting = false;
@@ -622,7 +633,7 @@
if (LOG.isWarnEnabled()) {
LOG.warn("Unknown ProtocolStatus: " + status.getCode());
}
- output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
+ output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
}
if (redirecting && redirectCount >= maxRedirect) {
@@ -674,8 +685,10 @@
return url;
} else {
CrawlDatum newDatum = new CrawlDatum();
- newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
- new Text(reprUrl));
+ if (reprUrl != null) {
+ newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+ new Text(reprUrl));
+ }
output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
if (LOG.isDebugEnabled()) {
LOG.debug(" - " + redirType + " redirect to " +
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=637858&r1=637857&r2=637858&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Mon Mar 17 05:33:56 2008
@@ -152,7 +152,7 @@
refreshTime < Fetcher.PERM_REFRESH_TIME);
CrawlDatum newDatum = new CrawlDatum();
newDatum.setStatus(CrawlDatum.STATUS_LINKED);
- if (!reprUrl.equals(newUrl)) {
+ if (reprUrl != null && !reprUrl.equals(newUrl)) {
newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
new Text(reprUrl));
}