You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jo...@apache.org on 2015/12/16 23:10:18 UTC
svn commit: r1720466 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/tools/FileDumper.java
Author: joyce
Date: Wed Dec 16 22:10:18 2015
New Revision: 1720466
URL: http://svn.apache.org/viewvc?rev=1720466&view=rev
Log:
NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1720466&r1=1720465&r2=1720466&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Dec 16 22:10:18 2015
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency
+
* NUTCH-2183 Improvement to SegmentChecker for skipping non-segments present in segments directory (lewismc)
* NUTCH-2180 FileDumper skips Corrupt Segments (Harshavardhan Manjunatha via lewismc)
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1720466&r1=1720465&r2=1720466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Wed Dec 16 22:10:18 2015
@@ -37,6 +37,7 @@ import org.apache.commons.cli.Options;
//Commons imports
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.codec.digest.DigestUtils;
//Hadoop
import org.apache.hadoop.conf.Configuration;
@@ -244,24 +245,7 @@ public class FileDumper {
String[] reversedURL = TableUtil.reverseUrl(url).split(":");
reversedURL[0] = reversedURL[0].replace('.', '/');
- // URLs with content at a folder level and nested below that
- // run into problems when dumping. For example:
- //
- // www.foo.com/bar/
- // www.foo.com/bar/about.html
- //
- // One of these will fail to dump depending on processing order.
- // To address this, we will use a placeholder when dumping a URL
- // such as the one ending in '/bar/'
- String lastDir = reversedURL[reversedURL.length - 1];
- if (! lastDir.contains(".")) {
- if (lastDir.charAt(lastDir.length() - 1) != '/') {
- reversedURL[reversedURL.length - 1] += '/';
- }
- reversedURL[reversedURL.length - 1] += "_file";
- }
-
- String reversedURLPath = org.apache.commons.lang3.StringUtils.join(reversedURL, "/");
+ String reversedURLPath = reversedURL[0] + "/" + DigestUtils.sha256Hex(url).toUpperCase();
outputFullPath = String.format("%s/%s", fullDir, reversedURLPath);
// We'll drop the trailing file name and create the nested structure if it doesn't already exist.