You are viewing a plain-text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/01/13 13:17:04 UTC
svn commit: r1724409 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Author: markus
Date: Wed Jan 13 12:17:03 2016
New Revision: 1724409
URL: http://svn.apache.org/viewvc?rev=1724409&view=rev
Log:
NUTCH-2195 IndexingFiltersChecker to optionally follow N redirects
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1724409&r1=1724408&r2=1724409&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan 13 12:17:03 2016
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-2195 IndexingFilterChecker to optionally follow N redirects (markus)
+
* NUTCH-2190 Protocol normalizer (markus)
* NUTCH-1838 Host and domain based regex and automaton filtering (markus)
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1724409&r1=1724408&r2=1724409&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Jan 13 12:17:03 2016
@@ -70,8 +70,9 @@ public class IndexingFiltersChecker exte
String contentType = null;
String url = null;
boolean dumpText = false;
+ boolean followRedirects = false;
- String usage = "Usage: IndexingFiltersChecker [-dumpText] [-md key=value] <url>";
+ String usage = "Usage: IndexingFiltersChecker [-followRedirects] [-dumpText] [-md key=value] <url>";
if (args.length == 0) {
System.err.println(usage);
@@ -82,7 +83,9 @@ public class IndexingFiltersChecker exte
HashMap<String, String> metadata = new HashMap<String, String>();
for (int i = 0; i < args.length; i++) {
- if (args[i].equals("-dumpText")) {
+ if (args[i].equals("-followRedirects")) {
+ followRedirects = true;
+ } else if (args[i].equals("-dumpText")) {
dumpText = true;
} else if (args[i].equals("-md")) {
String k = null, v = null;
@@ -116,11 +119,22 @@ public class IndexingFiltersChecker exte
}
IndexingFilters indexers = new IndexingFilters(getConf());
+
+ int maxRedirects = 3;
- ProtocolFactory factory = new ProtocolFactory(getConf());
- Protocol protocol = factory.getProtocol(url);
+ ProtocolOutput output = getProtocolOutput(url, datum);
Text turl = new Text(url);
- ProtocolOutput output = protocol.getProtocolOutput(turl, datum);
+
+ // Following redirects and not reached maxRedirects?
+ while (!output.getStatus().isSuccess() && followRedirects && output.getStatus().isRedirect() && maxRedirects != 0) {
+ String[] stuff = output.getStatus().getArgs();
+ url = stuff[0];
+ turl.set(url);
+
+ // try again
+ output = getProtocolOutput(url, datum);
+ maxRedirects--;
+ }
if (!output.getStatus().isSuccess()) {
System.out.println("Fetch failed with protocol status: "
@@ -224,6 +238,14 @@ public class IndexingFiltersChecker exte
return 0;
}
+
+ protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
+ ProtocolFactory factory = new ProtocolFactory(getConf());
+ Protocol protocol = factory.getProtocol(url);
+ Text turl = new Text(url);
+ ProtocolOutput output = protocol.getProtocolOutput(turl, datum);
+ return output;
+ }
public static void main(String[] args) throws Exception {
final int res = ToolRunner.run(NutchConfiguration.create(),