You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/10/29 22:30:50 UTC
svn commit: r1711366 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/tools/FileDumper.java
Author: lewismc
Date: Thu Oct 29 21:30:50 2015
New Revision: 1711366
URL: http://svn.apache.org/viewvc?rev=1711366&view=rev
Log:
NUTCH-1988 Make nested output directory dump optional
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1711366&r1=1711365&r2=1711366&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Oct 29 21:30:50 2015
@@ -3,6 +3,8 @@ Nutch Change Log
Nutch Current Development 1.11 25/10/2015 (dd/mm/yyyy)
Release Report: http://s.apache.org/nutch11
+* NUTCH-1988 Make nested output directory dump optional... again (Michael Joyce via lewismc)
+
* NUTCH-1800 Documentation for Nutch 1.X and 2.X REST APIs (lewismc)
* NUTCH-2149 REST endpoint to read Nutch sequence files (Sujen Shah)
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1711366&r1=1711365&r2=1711366&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Thu Oct 29 21:30:50 2015
@@ -126,12 +126,16 @@ public class FileDumper {
* @param mimeTypes
* an array of mime types we have to dump, all others will be
* filtered out.
+ * @param flatDir
+ * a boolean flag specifying whether the output directory should contain
+ * only files instead of using nested directories to prevent naming
+ * conflicts.
* @param mimeTypeStats
- * a flag indicating whether mimetype stats should be displayed
- * instead of dumping files.
+ * a flag indicating whether mimetype stats should be displayed
+ * instead of dumping files.
* @throws Exception
*/
- public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean mimeTypeStats)
+ public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean flatDir, boolean mimeTypeStats)
throws Exception {
if (mimeTypes == null)
LOG.info("Accepting all mimetypes.");
@@ -150,7 +154,7 @@ public class FileDumper {
}
});
if (segmentDirs == null) {
- System.err.println("No segment directories found in ["
+ LOG.error("No segment directories found in ["
+ segmentRootDir.getAbsolutePath() + "]");
return;
}
@@ -211,38 +215,42 @@ public class FileDumper {
if (filter) {
if (!mimeTypeStats) {
String md5Ofurl = DumpFileUtil.getUrlMD5(url);
- String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl);
-
+
+ String fullDir = outputDir.getAbsolutePath();
+ if (!flatDir) {
+ fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir, md5Ofurl);
+ }
+
if (!Strings.isNullOrEmpty(fullDir)) {
String outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
File outputFile = new File(outputFullPath);
-
+
if (!outputFile.exists()) {
LOG.info("Writing: [" + outputFullPath + "]");
- // Modified to prevent FileNotFoundException (Invalid Argument)
- FileOutputStream output = null;
- try {
- output = new FileOutputStream(outputFile);
- IOUtils.write(content.getContent(), output);
+ // Modified to prevent FileNotFoundException (Invalid Argument)
+ FileOutputStream output = null;
+ try {
+ output = new FileOutputStream(outputFile);
+ IOUtils.write(content.getContent(), output);
}
catch (Exception e) {
- LOG.warn("Write Error: [" + outputFullPath + "]");
- e.printStackTrace();
+ LOG.warn("Write Error: [" + outputFullPath + "]");
+ e.printStackTrace();
}
finally {
- if (output != null) {
- output.flush();
- try {
- output.close();
- } catch (Exception ignore) {
- }
- }
+ if (output != null) {
+ output.flush();
+ try {
+ output.close();
+ } catch (Exception ignore) {
+ }
+ }
}
fileCount++;
} else {
LOG.info("Skipping writing: [" + outputFullPath
- + "]: file already exists");
+ + "]: file already exists");
}
}
}
@@ -282,27 +290,33 @@ public class FileDumper {
// argument options
@SuppressWarnings("static-access")
Option outputOpt = OptionBuilder
- .withArgName("outputDir")
- .hasArg()
- .withDescription(
- "output directory (which will be created) to host the raw data")
- .create("outputDir");
+ .withArgName("outputDir")
+ .hasArg()
+ .withDescription(
+ "output directory (which will be created) to host the raw data")
+ .create("outputDir");
@SuppressWarnings("static-access")
Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
- .withDescription("the segment(s) to use").create("segment");
+ .withDescription("the segment(s) to use").create("segment");
@SuppressWarnings("static-access")
Option mimeOpt = OptionBuilder
- .withArgName("mimetype")
- .hasArgs()
- .withDescription(
- "an optional list of mimetypes to dump, excluding all others. Defaults to all.")
- .create("mimetype");
+ .withArgName("mimetype")
+ .hasArgs()
+ .withDescription(
+ "an optional list of mimetypes to dump, excluding all others. Defaults to all.")
+ .create("mimetype");
@SuppressWarnings("static-access")
Option mimeStat = OptionBuilder
- .withArgName("mimeStats")
- .withDescription(
- "only display mimetype stats for the segment(s) instead of dumping file.")
- .create("mimeStats");
+ .withArgName("mimeStats")
+ .withDescription(
+ "only display mimetype stats for the segment(s) instead of dumping file.")
+ .create("mimeStats");
+ @SuppressWarnings("static-access")
+ Option dirStructureOpt = OptionBuilder
+ .withArgName("flatdir")
+ .withDescription(
+ "optionally specify that the output directory should only contain files.")
+ .create("flatdir");
// create the options
Options options = new Options();
@@ -311,6 +325,7 @@ public class FileDumper {
options.addOption(segOpt);
options.addOption(mimeOpt);
options.addOption(mimeStat);
+ options.addOption(dirStructureOpt);
CommandLineParser parser = new GnuParser();
try {
@@ -325,22 +340,23 @@ public class FileDumper {
File outputDir = new File(line.getOptionValue("outputDir"));
File segmentRootDir = new File(line.getOptionValue("segment"));
String[] mimeTypes = line.getOptionValues("mimetype");
+ boolean flatDir = line.hasOption("flatdir");
boolean shouldDisplayStats = false;
if (line.hasOption("mimeStats"))
shouldDisplayStats = true;
if (!outputDir.exists()) {
LOG.warn("Output directory: [" + outputDir.getAbsolutePath()
- + "]: does not exist, creating it.");
- if (!shouldDisplayStats) {
+ + "]: does not exist, creating it.");
+ if (!shouldDisplayStats) {
if (!outputDir.mkdirs())
throw new Exception("Unable to create: ["
- + outputDir.getAbsolutePath() + "]");
+ + outputDir.getAbsolutePath() + "]");
}
}
FileDumper dumper = new FileDumper();
- dumper.dump(outputDir, segmentRootDir, mimeTypes, shouldDisplayStats);
+ dumper.dump(outputDir, segmentRootDir, mimeTypes, flatDir, shouldDisplayStats);
} catch (Exception e) {
LOG.error("FileDumper: " + StringUtils.stringifyException(e));
e.printStackTrace();