You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/03/15 05:24:45 UTC
svn commit: r1666777 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/tools/FileDumper.java
src/java/org/apache/nutch/util/DumpFileUtil.java
src/test/org/apache/nutch/util/DumpFileUtilTest.java
Author: mattmann
Date: Sun Mar 15 04:24:44 2015
New Revision: 1666777
URL: http://svn.apache.org/r1666777
Log:
Fix for NUTCH-1957: FileDumper output file name collisions. Contributed by Renxia Wang. This closes #12.
Added:
nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java
nutch/trunk/src/test/org/apache/nutch/util/DumpFileUtilTest.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1666777&r1=1666776&r2=1666777&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sun Mar 15 04:24:44 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1957 FileDumper output file name collisions (Renxia Wang via mattmann)
+
* NUTCH-1955 ByteWritable missing in NutchWritable (markus)
* NUTCH-1956 Members to be public in URLCrawlDatum (markus)
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1666777&r1=1666776&r2=1666777&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Sun Mar 15 04:24:44 2015
@@ -28,6 +28,7 @@ import java.util.HashMap;
import java.util.Map;
import java.security.MessageDigest;
+import com.google.common.base.Strings;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
@@ -47,6 +48,7 @@ import org.apache.hadoop.io.SequenceFile
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.DumpFileUtil;
import org.apache.nutch.util.NutchConfiguration;
//Tika imports
@@ -203,40 +205,24 @@ public class FileDumper {
}
if (filter) {
- String outputFullPath = outputDir + "/" + filename;
- File outputFile = new File(outputFullPath);
- if (!outputFile.exists()) {
- LOG.info("Writing: [" + outputFullPath + "]");
- try {
- FileOutputStream output = new FileOutputStream(outputFile);
- IOUtils.write(content.getContent(), output);
- fileCount++;
-
- } catch (Exception e) {
- // if the file name is too long, we get the first 32 chars of the original name and append its MD5
- // after the first 32 chars as the new file name
- MessageDigest md = MessageDigest.getInstance("MD5");
- md.update(outputFullPath.getBytes());
- byte[] digest = md.digest();
- StringBuffer sb = new StringBuffer();
- for (byte b : digest) {
- sb.append(String.format("%02x", b & 0xff));
- }
- outputFullPath = outputFullPath.substring(0, 32) + "_" + sb.toString();
+ String md5Ofurl = DumpFileUtil.getUrlMD5(url);
+ String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl);
- File newOutPutFile = new File(outputFullPath);
- FileOutputStream output = new FileOutputStream(newOutPutFile);
+ if (!Strings.isNullOrEmpty(fullDir)) {
+ String outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
+ File outputFile = new File(outputFullPath);
+
+ if (!outputFile.exists()) {
+ LOG.info("Writing: [" + outputFullPath + "]");
+ FileOutputStream output = new FileOutputStream(outputFile);
IOUtils.write(content.getContent(), output);
fileCount++;
- LOG.info("File name is too long. Truncated and MD5 appended.");
-
- //e.printStackTrace();
+ } else {
+ LOG.info("Skipping writing: [" + outputFullPath
+ + "]: file already exists");
}
-
- } else {
- LOG.info("Skipping writing: [" + outputFullPath
- + "]: file already exists");
}
+
}
}
reader.close();
Added: nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java?rev=1666777&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java Sun Mar 15 04:24:44 2015
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.io.MD5Hash;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * Static helpers for naming and placing dumped files: a hex MD5 of the source
+ * URL, a two-level directory derived from that MD5, and a file name that
+ * combines the MD5 with a (possibly truncated) base name and an extension.
+ */
+public class DumpFileUtil {
+  private static final Logger LOG = LoggerFactory.getLogger(DumpFileUtil.class
+      .getName());
+
+  /** basePath / first-level directory / second-level directory. */
+  private static final String DIR_PATTERN = "%s/%s/%s";
+  /** md5 _ baseName . extension */
+  private static final String FILENAME_PATTERN = "%s_%s.%s";
+  /** Longest base name kept verbatim; longer names are cut to this length. */
+  private static final int MAX_LENGTH_OF_FILENAME = 32;
+
+  /**
+   * Computes the MD5 digest of a URL and renders it as hexadecimal text.
+   *
+   * @param url the URL to hash
+   * @return the digest with two lowercase hex digits per byte
+   */
+  public static String getUrlMD5(String url) {
+    byte[] digest = MD5Hash.digest(url).getDigest();
+
+    // Local, single-threaded builder: no need for a synchronized StringBuffer.
+    StringBuilder sb = new StringBuilder(digest.length * 2);
+    for (byte b : digest) {
+      sb.append(String.format("%02x", b & 0xff));
+    }
+
+    return sb.toString();
+  }
+
+  /**
+   * Builds the path {@code basePath/XX/YY} and creates that directory, where
+   * {@code XX} is made of the MD5 characters at indices 0 and 8 and
+   * {@code YY} of the characters at indices 16 and 24.
+   *
+   * @param basePath directory under which the two levels are created
+   * @param md5 hex MD5 string; must be at least 25 characters long
+   * @return the full directory path, or {@code null} if the directory could
+   *         not be created
+   */
+  public static String createTwoLevelsDirectory(String basePath, String md5) {
+    String firstLevelDirName = new StringBuilder().append(md5.charAt(0)).append(md5.charAt(8)).toString();
+    String secondLevelDirName = new StringBuilder().append(md5.charAt(16)).append(md5.charAt(24)).toString();
+
+    String fullDirPath = String.format(DIR_PATTERN, basePath, firstLevelDirName, secondLevelDirName);
+
+    try {
+      FileUtils.forceMkdir(new File(fullDirPath));
+    } catch (IOException e) {
+      // Pass the exception as the trailing argument so the cause and stack
+      // trace are logged instead of being silently dropped.
+      LOG.error("Failed to create dir: {}", fullDirPath, e);
+      return null;
+    }
+
+    return fullDirPath;
+  }
+
+  /**
+   * Builds a file name of the form {@code md5_baseName.extension}. A base name
+   * longer than 32 characters is truncated to its first 32 characters.
+   *
+   * @param md5 hex MD5 string used as the file name prefix
+   * @param fileBaseName base name of the file; must not be {@code null}
+   * @param fileExtension extension appended after the dot
+   * @return the assembled file name
+   */
+  public static String createFileName(String md5, String fileBaseName, String fileExtension) {
+    String baseName = fileBaseName;
+    if (fileBaseName.length() > MAX_LENGTH_OF_FILENAME) {
+      LOG.info("File name is too long. Truncated to {} characters.", MAX_LENGTH_OF_FILENAME);
+      baseName = StringUtils.substring(fileBaseName, 0, MAX_LENGTH_OF_FILENAME);
+    }
+
+    return String.format(FILENAME_PATTERN, md5, baseName, fileExtension);
+  }
+}
Added: nutch/trunk/src/test/org/apache/nutch/util/DumpFileUtilTest.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/DumpFileUtilTest.java?rev=1666777&view=auto
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/DumpFileUtilTest.java (added)
+++ nutch/trunk/src/test/org/apache/nutch/util/DumpFileUtilTest.java Sun Mar 15 04:24:44 2015
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+/**
+ * Unit tests for the static helpers in {@link DumpFileUtil}.
+ */
+public class DumpFileUtilTest {
+
+  /** The MD5 of a known URL is rendered as the expected hex string. */
+  @Test
+  public void testGetUrlMD5() throws Exception {
+    String url = "http://apache.org";
+
+    String md5 = DumpFileUtil.getUrlMD5(url);
+
+    assertEquals("991e599262e04ea2ec76b6c5aed499a7", md5);
+  }
+
+  /**
+   * The directory path is built from the MD5 characters at indices 0, 8 and
+   * 16, 24, and a failed directory creation is reported as null.
+   */
+  @Test
+  public void testCreateTwoLevelsDirectory() throws Exception {
+    String md5 = DumpFileUtil.getUrlMD5("http://apache.org");
+
+    // Base path under which the two levels are expected to be creatable.
+    String createdDir = DumpFileUtil.createTwoLevelsDirectory("/tmp", md5);
+    assertEquals("/tmp/96/ea", createdDir);
+
+    // Base path under which directory creation is expected to fail.
+    String missingBase = "/this/path/is/not/existed/just/for/testing";
+    String failedDir = DumpFileUtil.createTwoLevelsDirectory(missingBase, md5);
+    assertNull(failedDir);
+  }
+
+  /**
+   * A short base name is kept as is; a base name longer than 32 characters is
+   * cut to its first 32 characters.
+   */
+  @Test
+  public void testCreateFileName() throws Exception {
+    String md5 = DumpFileUtil.getUrlMD5("http://apache.org");
+    String extension = "html";
+
+    String shortName = DumpFileUtil.createFileName(md5, "test", extension);
+    assertEquals("991e599262e04ea2ec76b6c5aed499a7_test.html", shortName);
+
+    String tooLongBaseName = "testtesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttest";
+    String truncatedName = DumpFileUtil.createFileName(md5, tooLongBaseName, extension);
+    assertEquals("991e599262e04ea2ec76b6c5aed499a7_testtesttesttesttesttesttesttest.html", truncatedName);
+  }
+}