You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2019/09/01 10:05:51 UTC

[nutch] branch master updated: NUTCH-2598 URLNormalizerChecker fails on invalid URLs in input - output empty string for invalid URLs (MalformedURLException thrown) or if normalizer(s) return null

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 36c2ce6  NUTCH-2598 URLNormalizerChecker fails on invalid URLs in input - output empty string for invalid URLs (MalformedURLException thrown) or if normalizer(s) return null
     new 9dd11cd  Merge pull request #435 from sebastian-nagel/NUTCH-2598-normalizerchecker-fails-on-invalid-url
36c2ce6 is described below

commit 36c2ce6925a7fc4e4c3f5b6052ee96d491d2e5eb
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Tue Jan 22 17:04:16 2019 +0100

    NUTCH-2598 URLNormalizerChecker fails on invalid URLs in input
    - output empty string for invalid URLs (MalformedURLException thrown)
      or if normalizer(s) return null
---
 src/java/org/apache/nutch/net/URLNormalizerChecker.java | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/java/org/apache/nutch/net/URLNormalizerChecker.java b/src/java/org/apache/nutch/net/URLNormalizerChecker.java
index 2805f85..ee25f2f 100644
--- a/src/java/org/apache/nutch/net/URLNormalizerChecker.java
+++ b/src/java/org/apache/nutch/net/URLNormalizerChecker.java
@@ -16,6 +16,8 @@
  */
 package org.apache.nutch.net;
 
+import java.net.MalformedURLException;
+
 import org.apache.hadoop.util.ToolRunner;
 
 import org.apache.nutch.util.AbstractChecker;
@@ -35,7 +37,8 @@ public class URLNormalizerChecker extends AbstractChecker {
         + "\n             \t(if not given all configured URL normalizers are applied)"
         + "\n  -scope     \tone of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink"
         + "\n  -stdin     \ttool reads a list of URLs from stdin, one URL per line"
-        + "\n  -listen <port>\trun tool as Telnet server listening on <port>\n";
+        + "\n  -listen <port>\trun tool as Telnet server listening on <port>"
+        + "\n\nAn empty line is added to the output if a URL fails to normalize (MalformedURLException or null returned).\n";
 
     // Print help when no args given
     if (args.length < 1) {
@@ -71,7 +74,16 @@ public class URLNormalizerChecker extends AbstractChecker {
   }
 
   protected int process(String line, StringBuilder output) throws Exception {
-    output.append(normalizers.normalize(line, scope));
+    try {
+      String norm = normalizers.normalize(line, scope);
+      if (norm == null) {
+        output.append("");
+      } else {
+        output.append(norm);
+      }
+    } catch (MalformedURLException e) {
+      output.append("");
+    }
     return 0;
   }