You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/04/18 18:32:42 UTC

svn commit: r1674536 - in /nutch/trunk: CHANGES.txt ivy/ivy.xml src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java

Author: mattmann
Date: Sat Apr 18 16:32:42 2015
New Revision: 1674536

URL: http://svn.apache.org/r1674536
Log:
Fix for NUTCH-1989: Handling invalid URLs in CommonCrawlDataDumper, contributed by Giuseppe Totaro.

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1674536&r1=1674535&r2=1674536&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr 18 16:32:42 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro via mattmann)
+
 * NUTCH-1988 Make nested output directory dump optional (Michael Joyce via mattmann)
 
 * NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of RobotRules parsing (mattmann, snagel)

Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1674536&r1=1674535&r2=1674536&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Sat Apr 18 16:32:42 2015
@@ -43,6 +43,8 @@
 		
 		<dependency org="commons-lang" name="commons-lang" rev="2.6"
 			conf="*->default" />
+		<dependency org="commons-validator" name="commons-validator" rev="1.4.1"
+			conf="*->default" />
 		<dependency org="commons-collections" name="commons-collections"
 			rev="3.1" conf="*->default" />
 		<dependency org="commons-httpclient" name="commons-httpclient"

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1674536&r1=1674535&r2=1674536&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Sat Apr 18 16:32:42 2015
@@ -49,6 +49,7 @@ import org.apache.commons.compress.compr
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.FilenameUtils;
 
+import org.apache.commons.validator.routines.UrlValidator;
 //Hadoop
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
@@ -384,6 +385,12 @@ public class CommonCrawlDataDumper {
 					reader.getCurrentValue(content);
 					Metadata metadata = content.getMetadata();
 					String url = key.toString();
+					
+					UrlValidator urlValidator = new UrlValidator();
+					if (!urlValidator.isValid(url)) {
+						LOG.warn("Not valid URL detected: " + url);
+					}
+					
 					String baseName = FilenameUtils.getBaseName(url);
 					String extension = FilenameUtils.getExtension(url);