You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2019/09/01 10:05:51 UTC
[nutch] branch master updated: NUTCH-2598 URLNormalizerChecker
fails on invalid URLs in input - output empty string for invalid URLs
(MalformedURLException thrown) or if normalizer(s) return null
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 36c2ce6 NUTCH-2598 URLNormalizerChecker fails on invalid URLs in input - output empty string for invalid URLs (MalformedURLException thrown) or if normalizer(s) return null
new 9dd11cd Merge pull request #435 from sebastian-nagel/NUTCH-2598-normalizerchecker-fails-on-invalid-url
36c2ce6 is described below
commit 36c2ce6925a7fc4e4c3f5b6052ee96d491d2e5eb
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Tue Jan 22 17:04:16 2019 +0100
NUTCH-2598 URLNormalizerChecker fails on invalid URLs in input
- output empty string for invalid URLs (MalformedURLException thrown)
or if normalizer(s) return null
---
src/java/org/apache/nutch/net/URLNormalizerChecker.java | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/src/java/org/apache/nutch/net/URLNormalizerChecker.java b/src/java/org/apache/nutch/net/URLNormalizerChecker.java
index 2805f85..ee25f2f 100644
--- a/src/java/org/apache/nutch/net/URLNormalizerChecker.java
+++ b/src/java/org/apache/nutch/net/URLNormalizerChecker.java
@@ -16,6 +16,8 @@
*/
package org.apache.nutch.net;
+import java.net.MalformedURLException;
+
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.AbstractChecker;
@@ -35,7 +37,8 @@ public class URLNormalizerChecker extends AbstractChecker {
+ "\n \t(if not given all configured URL normalizers are applied)"
+ "\n -scope \tone of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink"
+ "\n -stdin \ttool reads a list of URLs from stdin, one URL per line"
- + "\n -listen <port>\trun tool as Telnet server listening on <port>\n";
+ + "\n -listen <port>\trun tool as Telnet server listening on <port>"
+ + "\n\nAn empty line is added to the output if a URL fails to normalize (MalformedURLException or null returned).\n";
// Print help when no args given
if (args.length < 1) {
@@ -71,7 +74,16 @@ public class URLNormalizerChecker extends AbstractChecker {
}
protected int process(String line, StringBuilder output) throws Exception {
- output.append(normalizers.normalize(line, scope));
+ try {
+ String norm = normalizers.normalize(line, scope);
+ if (norm == null) {
+ output.append("");
+ } else {
+ output.append(norm);
+ }
+ } catch (MalformedURLException e) {
+ output.append("");
+ }
return 0;
}