You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/01/13 13:17:04 UTC

svn commit: r1724409 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

Author: markus
Date: Wed Jan 13 12:17:03 2016
New Revision: 1724409

URL: http://svn.apache.org/viewvc?rev=1724409&view=rev
Log:
NUTCH-2195 IndexingFilterChecker to optionally follow N redirects

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1724409&r1=1724408&r2=1724409&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan 13 12:17:03 2016
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-2195 IndexingFilterChecker to optionally follow N redirects (markus)
+
 * NUTCH-2190 Protocol normalizer (markus)
 
 * NUTCH-1838 Host and domain based regex and automaton filtering (markus)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1724409&r1=1724408&r2=1724409&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Jan 13 12:17:03 2016
@@ -70,8 +70,9 @@ public class IndexingFiltersChecker exte
     String contentType = null;
     String url = null;
     boolean dumpText = false;
+    boolean followRedirects = false;
 
-    String usage = "Usage: IndexingFiltersChecker [-dumpText] [-md key=value] <url>";
+    String usage = "Usage: IndexingFiltersChecker [-followRedirects] [-dumpText] [-md key=value] <url>";
 
     if (args.length == 0) {
       System.err.println(usage);
@@ -82,7 +83,9 @@ public class IndexingFiltersChecker exte
     HashMap<String, String> metadata = new HashMap<String, String>();
 
     for (int i = 0; i < args.length; i++) {
-      if (args[i].equals("-dumpText")) {
+      if (args[i].equals("-followRedirects")) {
+        followRedirects = true;
+      } else if (args[i].equals("-dumpText")) {
         dumpText = true;
       } else if (args[i].equals("-md")) {
         String k = null, v = null;
@@ -116,11 +119,22 @@ public class IndexingFiltersChecker exte
     }
 
     IndexingFilters indexers = new IndexingFilters(getConf());
+    
+    int maxRedirects = 3;
 
-    ProtocolFactory factory = new ProtocolFactory(getConf());
-    Protocol protocol = factory.getProtocol(url);
+    ProtocolOutput output = getProtocolOutput(url, datum);
     Text turl = new Text(url);
-    ProtocolOutput output = protocol.getProtocolOutput(turl, datum);
+    
+    // Following redirects and not reached maxRedirects?
+    while (!output.getStatus().isSuccess() && followRedirects && output.getStatus().isRedirect() && maxRedirects != 0) {
+      String[] stuff = output.getStatus().getArgs();
+      url = stuff[0];
+      turl.set(url);
+      
+      // try again
+      output = getProtocolOutput(url, datum);
+      maxRedirects--;
+    }
 
     if (!output.getStatus().isSuccess()) {
       System.out.println("Fetch failed with protocol status: "
@@ -224,6 +238,14 @@ public class IndexingFiltersChecker exte
 
     return 0;
   }
+  
+  protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
+    ProtocolFactory factory = new ProtocolFactory(getConf());
+    Protocol protocol = factory.getProtocol(url);
+    Text turl = new Text(url);
+    ProtocolOutput output = protocol.getProtocolOutput(turl, datum);
+    return output;
+  }
 
   public static void main(String[] args) throws Exception {
     final int res = ToolRunner.run(NutchConfiguration.create(),