You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/04/18 18:32:42 UTC
svn commit: r1674536 - in /nutch/trunk: CHANGES.txt ivy/ivy.xml
src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
Author: mattmann
Date: Sat Apr 18 16:32:42 2015
New Revision: 1674536
URL: http://svn.apache.org/r1674536
Log:
Fix for NUTCH-1989: Handling invalid URLs in CommonCrawlDataDumper, contributed by Giuseppe Totaro.
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1674536&r1=1674535&r2=1674536&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr 18 16:32:42 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro via mattmann)
+
* NUTCH-1988 Make nested output directory dump optional (Michael Joyce via mattmann)
* NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of RobotRules parsing (mattmann, snagel)
Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1674536&r1=1674535&r2=1674536&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Sat Apr 18 16:32:42 2015
@@ -43,6 +43,8 @@
<dependency org="commons-lang" name="commons-lang" rev="2.6"
conf="*->default" />
+ <dependency org="commons-validator" name="commons-validator" rev="1.4.1"
+ conf="*->default" />
<dependency org="commons-collections" name="commons-collections"
rev="3.1" conf="*->default" />
<dependency org="commons-httpclient" name="commons-httpclient"
Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1674536&r1=1674535&r2=1674536&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Sat Apr 18 16:32:42 2015
@@ -49,6 +49,7 @@ import org.apache.commons.compress.compr
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.validator.routines.UrlValidator;
//Hadoop
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@@ -384,6 +385,12 @@ public class CommonCrawlDataDumper {
reader.getCurrentValue(content);
Metadata metadata = content.getMetadata();
String url = key.toString();
+
+ UrlValidator urlValidator = new UrlValidator();
+ if (!urlValidator.isValid(url)) {
+ LOG.warn("Not valid URL detected: " + url);
+ }
+
String baseName = FilenameUtils.getBaseName(url);
String extension = FilenameUtils.getExtension(url);