You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/01/13 14:10:19 UTC

svn commit: r1724418 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

Author: markus
Date: Wed Jan 13 13:10:19 2016
New Revision: 1724418

URL: http://svn.apache.org/viewvc?rev=1724418&view=rev
Log:
NUTCH-2196 IndexingFiltersChecker to optionally normalize URLs

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1724418&r1=1724417&r2=1724418&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan 13 13:10:19 2016
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-2196 IndexingFilterChecker to optionally normalize (markus)
+
 * NUTCH-2195 IndexingFilterChecker to optionally follow N redirects (markus)
 
 * NUTCH-2190 Protocol normalizer (markus)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1724418&r1=1724417&r2=1724418&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Jan 13 13:10:19 2016
@@ -32,6 +32,7 @@ import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.parse.ParseSegment;
@@ -43,7 +44,6 @@ import org.apache.nutch.protocol.Protoco
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.StringUtil;
-import org.apache.nutch.util.URLUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -69,10 +69,11 @@ public class IndexingFiltersChecker exte
   public int run(String[] args) throws Exception {
     String contentType = null;
     String url = null;
+    URLNormalizers normalizers = null;
     boolean dumpText = false;
     boolean followRedirects = false;
 
-    String usage = "Usage: IndexingFiltersChecker [-followRedirects] [-dumpText] [-md key=value] <url>";
+    String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] <url>";
 
     if (args.length == 0) {
       System.err.println(usage);
@@ -83,7 +84,9 @@ public class IndexingFiltersChecker exte
     HashMap<String, String> metadata = new HashMap<String, String>();
 
     for (int i = 0; i < args.length; i++) {
-      if (args[i].equals("-followRedirects")) {
+      if (args[i].equals("-normalize")) {
+        normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
+      } else if (args[i].equals("-followRedirects")) {
         followRedirects = true;
       } else if (args[i].equals("-dumpText")) {
         dumpText = true;
@@ -101,9 +104,13 @@ public class IndexingFiltersChecker exte
         System.err.println(usage);
         System.exit(-1);
       } else {
-        url = URLUtil.toASCII(args[i]);
+        url =args[i];
       }
     }
+    
+    if (normalizers != null) {
+      url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
+    }
 
     LOG.info("fetching: " + url);
 
@@ -129,6 +136,11 @@ public class IndexingFiltersChecker exte
     while (!output.getStatus().isSuccess() && followRedirects && output.getStatus().isRedirect() && maxRedirects != 0) {
       String[] stuff = output.getStatus().getArgs();
       url = stuff[0];
+      
+      if (normalizers != null) {
+        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
+      }
+    
       turl.set(url);
       
       // try again