You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/03/15 05:24:45 UTC

svn commit: r1666777 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/tools/FileDumper.java src/java/org/apache/nutch/util/DumpFileUtil.java src/test/org/apache/nutch/util/DumpFileUtilTest.java

Author: mattmann
Date: Sun Mar 15 04:24:44 2015
New Revision: 1666777

URL: http://svn.apache.org/r1666777
Log:
Fix for NUTCH-1957: FileDumper output file name collisions. Contributed by Renxia Wang. This closes #12.

Added:
    nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java
    nutch/trunk/src/test/org/apache/nutch/util/DumpFileUtilTest.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1666777&r1=1666776&r2=1666777&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sun Mar 15 04:24:44 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1957 FileDumper output file name collisions (Renxia Wang via mattmann)
+
 * NUTCH-1955 ByteWritable missing in NutchWritable (markus)
 
 * NUTCH-1956 Members to be public in URLCrawlDatum (markus)

Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1666777&r1=1666776&r2=1666777&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Sun Mar 15 04:24:44 2015
@@ -28,6 +28,7 @@ import java.util.HashMap;
 import java.util.Map;
 import java.security.MessageDigest;
 
+import com.google.common.base.Strings;
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.CommandLineParser;
 import org.apache.commons.cli.GnuParser;
@@ -47,6 +48,7 @@ import org.apache.hadoop.io.SequenceFile
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.DumpFileUtil;
 import org.apache.nutch.util.NutchConfiguration;
 
 //Tika imports
@@ -203,40 +205,24 @@ public class FileDumper {
           }
 
           if (filter) {
-            String outputFullPath = outputDir + "/" + filename;
-            File outputFile = new File(outputFullPath);
-            if (!outputFile.exists()) {
-              LOG.info("Writing: [" + outputFullPath + "]");
-              try {
-                FileOutputStream output = new FileOutputStream(outputFile);
-                IOUtils.write(content.getContent(), output);
-                fileCount++;
-                  
-              } catch (Exception e) {
-                // if the file name is too long, we get the first 32 chars of the original name and append its MD5
-                // after the first 32 chars as the new file name
-                MessageDigest md = MessageDigest.getInstance("MD5");
-                md.update(outputFullPath.getBytes());
-                byte[] digest = md.digest();
-                StringBuffer sb = new StringBuffer();
-                for (byte b : digest) {
-                  sb.append(String.format("%02x", b & 0xff));
-                }
-                outputFullPath = outputFullPath.substring(0, 32) + "_" + sb.toString();
+            String md5Ofurl = DumpFileUtil.getUrlMD5(url);
+            String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl);
 
-                File newOutPutFile = new File(outputFullPath);
-                FileOutputStream output = new FileOutputStream(newOutPutFile);
+            if (!Strings.isNullOrEmpty(fullDir)) {
+              String outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
+              File outputFile = new File(outputFullPath);
+
+              if (!outputFile.exists()) {
+                LOG.info("Writing: [" + outputFullPath + "]");
+                FileOutputStream output = new FileOutputStream(outputFile);
                 IOUtils.write(content.getContent(), output);
                 fileCount++;
-                LOG.info("File name is too long. Truncated and MD5 appended.");
-                
-                //e.printStackTrace();
+              } else {
+                LOG.info("Skipping writing: [" + outputFullPath
+                        + "]: file already exists");
               }
-              
-            } else {
-              LOG.info("Skipping writing: [" + outputFullPath
-                  + "]: file already exists");
             }
+
           }
         }
         reader.close();

Added: nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java?rev=1666777&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java Sun Mar 15 04:24:44 2015
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.io.MD5Hash;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+
+public class DumpFileUtil {
+    private static final Logger LOG = LoggerFactory.getLogger(DumpFileUtil.class
+            .getName());
+
+    private final static String DIR_PATTERN = "%s/%s/%s";
+    private final static String FILENAME_PATTERN = "%s_%s.%s";
+    private final static Integer MAX_LENGTH_OF_FILENAME = 32;
+
+    public static String getUrlMD5(String url) {
+        byte[] digest = MD5Hash.digest(url).getDigest();
+
+        StringBuffer sb = new StringBuffer();
+        for (byte b : digest) {
+            sb.append(String.format("%02x", b & 0xff));
+        }
+
+        return sb.toString();
+    }
+
+    public static String createTwoLevelsDirectory(String basePath, String md5) {
+        String firstLevelDirName = new StringBuilder().append(md5.charAt(0)).append(md5.charAt(8)).toString();
+        String secondLevelDirName = new StringBuilder().append(md5.charAt(16)).append(md5.charAt(24)).toString();
+
+        String fullDirPath = String.format(DIR_PATTERN, basePath, firstLevelDirName, secondLevelDirName);
+
+        try {
+            FileUtils.forceMkdir(new File(fullDirPath));
+        } catch (IOException e) {
+            LOG.error("Failed to create dir: {}", fullDirPath);
+            fullDirPath = null;
+        }
+
+        return fullDirPath;
+    }
+
+    public static String createFileName(String md5, String fileBaseName, String fileExtension) {
+        if (fileBaseName.length() > MAX_LENGTH_OF_FILENAME) {
+            LOG.info("File name is too long. Truncated to {} characters.", MAX_LENGTH_OF_FILENAME);
+            return String.format(FILENAME_PATTERN, md5, StringUtils.substring(fileBaseName, 0, MAX_LENGTH_OF_FILENAME), fileExtension);
+        } else {
+            return String.format(FILENAME_PATTERN, md5, fileBaseName, fileExtension);
+        }
+    }
+}

Added: nutch/trunk/src/test/org/apache/nutch/util/DumpFileUtilTest.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/DumpFileUtilTest.java?rev=1666777&view=auto
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/DumpFileUtilTest.java (added)
+++ nutch/trunk/src/test/org/apache/nutch/util/DumpFileUtilTest.java Sun Mar 15 04:24:44 2015
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+public class DumpFileUtilTest {
+
+    @Test
+    public void testGetUrlMD5() throws Exception {
+        String testUrl = "http://apache.org";
+
+        String result = DumpFileUtil.getUrlMD5(testUrl);
+
+        assertEquals("991e599262e04ea2ec76b6c5aed499a7", result);
+    }
+
+    @Test
+    public void testCreateTwoLevelsDirectory() throws Exception {
+        String testUrl = "http://apache.org";
+        String basePath = "/tmp";
+        String fullDir = DumpFileUtil.createTwoLevelsDirectory(basePath, DumpFileUtil.getUrlMD5(testUrl));
+
+        assertEquals("/tmp/96/ea", fullDir);
+
+        String basePath2 = "/this/path/is/not/existed/just/for/testing";
+        String fullDir2 = DumpFileUtil.createTwoLevelsDirectory(basePath2, DumpFileUtil.getUrlMD5(testUrl));
+
+        assertNull(fullDir2);
+    }
+
+    @Test
+    public void testCreateFileName() throws Exception {
+        String testUrl = "http://apache.org";
+        String baseName = "test";
+        String extension = "html";
+        String fullDir = DumpFileUtil.createFileName(DumpFileUtil.getUrlMD5(testUrl), baseName, extension);
+
+        assertEquals("991e599262e04ea2ec76b6c5aed499a7_test.html", fullDir);
+
+        String tooLongBaseName = "testtesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttest";
+        String fullDir2 = DumpFileUtil.createFileName(DumpFileUtil.getUrlMD5(testUrl), tooLongBaseName, extension);
+
+        assertEquals("991e599262e04ea2ec76b6c5aed499a7_testtesttesttesttesttesttesttest.html", fullDir2);
+    }
+}