You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jo...@apache.org on 2015/12/16 23:10:18 UTC

svn commit: r1720466 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/tools/FileDumper.java

Author: joyce
Date: Wed Dec 16 22:10:18 2015
New Revision: 1720466

URL: http://svn.apache.org/viewvc?rev=1720466&view=rev
Log:
NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1720466&r1=1720465&r2=1720466&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Dec 16 22:10:18 2015
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency
+
 * NUTCH-2183 Improvement to SegmentChecker for skipping non-segments present in segments directory (lewismc)
 
 * NUTCH-2180 FileDumper skips Corrupt Segments (Harshavardhan Manjunatha via lewismc)

Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1720466&r1=1720465&r2=1720466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Wed Dec 16 22:10:18 2015
@@ -37,6 +37,7 @@ import org.apache.commons.cli.Options;
 //Commons imports
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.codec.digest.DigestUtils;
 
 //Hadoop
 import org.apache.hadoop.conf.Configuration;
@@ -244,24 +245,7 @@ public class FileDumper {
                     String[] reversedURL = TableUtil.reverseUrl(url).split(":");
                     reversedURL[0] = reversedURL[0].replace('.', '/');
 
-                    // URLs with content at a folder level and nested below that
-                    // run into problems when dumping. For example:
-                    //
-                    // www.foo.com/bar/
-                    // www.foo.com/bar/about.html
-                    //
-                    // One of these will fail to dump depending on processing order.
-                    // To address this, we will use a placeholder when dumping a URL
-                    // such as the one ending in '/bar/'
-                    String lastDir = reversedURL[reversedURL.length - 1];
-                    if (! lastDir.contains(".")) {
-                      if (lastDir.charAt(lastDir.length() - 1) != '/') {
-                        reversedURL[reversedURL.length - 1] += '/';
-                      }
-                      reversedURL[reversedURL.length - 1] += "_file";
-                    }
-
-                    String reversedURLPath = org.apache.commons.lang3.StringUtils.join(reversedURL, "/");
+                    String reversedURLPath = reversedURL[0] + "/" + DigestUtils.sha256Hex(url).toUpperCase();
                     outputFullPath = String.format("%s/%s", fullDir, reversedURLPath);
                     
                     // We'll drop the trailing file name and create the nested structure if it doesn't already exist.