Posted to commits@nutch.apache.org by ma...@apache.org on 2014/09/21 07:47:40 UTC

svn commit: r1626517 - in /nutch/trunk: CHANGES.txt src/bin/nutch src/java/org/apache/nutch/tools/FileDumper.java

Author: mattmann
Date: Sun Sep 21 05:47:39 2014
New Revision: 1626517

URL: http://svn.apache.org/r1626517
Log:
Fix for NUTCH-1526 Create SegmentContentDumperTool for easily extracting out file contents from SegmentDirs.

Added:
    nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/bin/nutch

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1626517&r1=1626516&r2=1626517&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sun Sep 21 05:47:39 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1526 Create SegmentContentDumperTool for easily extracting out file contents from SegmentDirs (mattmann, lewismc, Julien Le Dem)
+
 * NUTCH-1840 the describe function in SolrIndexWriter is not correct (kaveh minooie via jnioche)
 
 * NUTCH-1837 Upgrade to Tika 1.6 (jnioche)

Modified: nutch/trunk/src/bin/nutch
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1626517&r1=1626516&r2=1626517&view=diff
==============================================================================
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Sun Sep 21 05:47:39 2014
@@ -71,6 +71,7 @@ if [ $# = 0 ]; then
   echo "  mergelinkdb       merge linkdb-s, with optional filtering"
   echo "  index             run the plugin-based indexer on parsed segments and linkdb"
   echo "  dedup             deduplicate entries in the crawldb and give them a special status"
+  echo "  dump              exports cralwed data from segments into files"
   echo "  solrindex         run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead"
   echo "  solrdedup         remove duplicates from solr - DEPRECATED use the dedup command instead"
   echo "  solrclean         remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead"
@@ -230,6 +231,8 @@ elif [ "$COMMAND" = "invertlinks" ] ; th
   CLASS=org.apache.nutch.crawl.LinkDb
 elif [ "$COMMAND" = "mergelinkdb" ] ; then
   CLASS=org.apache.nutch.crawl.LinkDbMerger
+elif [ "$COMMAND" = "dump" ] ; then
+  CLASS=org.apache.nutch.tools.FileDumper
 elif [ "$COMMAND" = "solrindex" ] ; then
   CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
   shift
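
Note on usage: with the two hunks above applied, the new command is invoked through the
regular wrapper script as "bin/nutch dump <output directory> <segments dir>"; both
arguments are passed straight through to org.apache.nutch.tools.FileDumper (added below),
which expects the dump output directory first and the segments root directory second.
For example (the paths here are illustrative only, not part of this commit):

  bin/nutch dump /tmp/nutch-dump crawl/segments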

Added: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1626517&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Sun Sep 21 05:47:39 2014
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+//JDK imports
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileOutputStream;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+//Commons imports
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.FilenameUtils;
+
+//Hadoop
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+
+//Nutch imports
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+
+//Tika imports
+import org.apache.tika.Tika;
+
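+/**
+ * Dumps the raw content of each fetched URL stored in a set of Nutch segment
+ * directories out to individual files on the local filesystem (NUTCH-1526).
+ */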
+public class FileDumper {
+
+    private static final Logger LOG = Logger.getLogger(FileDumper.class.getName());
+
+
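+    /**
+     * Dumps the fetched content stored under each readable segment directory
+     * below segmentRootDir into outputDir, writing one file per URL (named from
+     * the URL's base name and extension, defaulting to ".html" when the URL has
+     * none) and skipping output files that already exist. A per-MIME-type count
+     * is logged when the dump completes.
+     *
+     * @param outputDir      directory the dumped files are written to
+     * @param segmentRootDir directory containing the segment directories to read
+     * @throws Exception if reading a segment or writing a file fails
+     */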
+    public void dump(File outputDir, File segmentRootDir) throws Exception {
+        Map<String, Integer> typeCounts = new HashMap<String, Integer>();
+        Configuration conf = NutchConfiguration.create();
+        FileSystem fs = FileSystem.get(conf);
+        int fileCount = 0;
+        File[] segmentDirs = segmentRootDir.listFiles(new FileFilter() {
+            @Override
+            public boolean accept(File file) {
+                return file.canRead() && file.isDirectory();
+            }
+        });
+
+        if (segmentDirs == null || segmentDirs.length == 0) {
+            LOG.log(Level.SEVERE, "No readable segment directories found under: ["
+                    + segmentRootDir.getAbsolutePath() + "]");
+            return;
+        }
+
+        for (File segment : segmentDirs) {
+            LOG.log(Level.INFO,
+                    "Processing segment: [" + segment.getAbsolutePath() + "]");
+            SequenceFile.Reader reader = null;
+            try {
+                String segmentPath = segment.getAbsolutePath()
+                    + "/" + Content.DIR_NAME + "/part-00000/data";
+                Path file = new Path(segmentPath);
+                if (!new File(file.toString()).exists()) {
+                    LOG.log(Level.WARNING, "Skipping segment: [" + segmentPath
+                            + "]: no data directory present");
+                    continue;
+                }
+                reader = new SequenceFile.Reader(fs, file, conf);
+
+                Writable key = (Writable) reader.getKeyClass().newInstance();
+                Content content = null;
+
+                while (reader.next(key)) {
+                    content = new Content();
+                    reader.getCurrentValue(content);
+                    String url = key.toString();
+                    String baseName = FilenameUtils.getBaseName(url);
+                    String extension = FilenameUtils.getExtension(url);
+                    // URLs with no file extension are assumed to be HTML pages.
+                    if (extension == null || extension.equals("")) {
+                        extension = "html";
+                    }
+
+                    String filename = baseName + "." + extension;
+
+                    try {
+                        // Record the detected MIME type for the summary logged at the end.
+                        String mimeType = new Tika().detect(content.getContent());
+                        collectStats(typeCounts, mimeType);
+                    } catch (Exception e) {
+                        LOG.log(Level.WARNING,
+                                "Unable to detect type for: [" + url + "]", e);
+                    }
+
+                    String outputFullPath = outputDir + "/" + filename;
+                    File outputFile = new File(outputFullPath);
+                    if (!outputFile.exists()) {
+                        LOG.log(Level.INFO, "Writing: [" + outputFullPath + "]");
+                        FileOutputStream output = new FileOutputStream(outputFile);
+                        try {
+                            IOUtils.write(content.getContent(), output);
+                        } finally {
+                            IOUtils.closeQuietly(output);
+                        }
+                        fileCount++;
+                    } else {
+                        LOG.log(Level.INFO, "Skipping writing: ["
+                                + outputFullPath + "]: file already exists");
+                    }
+                    content = null;
+                }
+            } finally {
+                if (reader != null) {
+                    try {
+                        reader.close();
+                    } catch (Exception ignore) {
+                    }
+                }
+            }
+        }
+        fs.close();
+
+        LOG.log(Level.INFO, "Processed: [" + fileCount + "] files.");
+        LOG.log(Level.INFO, "File Types: " + displayFileTypes(typeCounts));
+
+    }
+
+    public static void main(String[] args) throws Exception {
+        String usage = "Usage: FileDumper <output directory> <segments dir>\n";
+        if (args.length != 2) {
+            System.err.println(usage);
+            System.exit(1);
+        }
+
+        String outputDir = args[0];
+        String segmentRootDir = args[1];
+        File outputDirFile = new File(outputDir);
+        File segmentRootDirFile = new File(segmentRootDir);
+
+        if (!outputDirFile.exists()) {
+            LOG.log(Level.WARNING, "Output directory: [" + outputDir
+                    + "]: does not exist, creating it.");
+            if (!outputDirFile.mkdirs()) {
+                throw new Exception("Unable to create: [" + outputDir + "]");
+            }
+        }
+
+        FileDumper dumper = new FileDumper();
+        dumper.dump(outputDirFile, segmentRootDirFile);
+    }
+
+    private void collectStats(Map<String, Integer> typeCounts, String mimeType) {
+        typeCounts.put(mimeType,
+                typeCounts.containsKey(mimeType) ? typeCounts.get(mimeType) + 1 : 1);
+    }
+
+    private String displayFileTypes(Map<String, Integer> typeCounts) {
+        StringBuilder builder = new StringBuilder();
+        builder.append("{\n");
+        for (String mimeType : typeCounts.keySet()) {
+            builder.append("{\"mimeType\":\"");
+            builder.append(mimeType);
+            builder.append("\",\"count\":");
+            builder.append(typeCounts.get(mimeType));
+            builder.append("}\n");
+        }
+        builder.append("}\n");
+        return builder.toString();
+    }
+
+}
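
For completeness, the dump() method added above can also be called programmatically
rather than through bin/nutch or FileDumper.main(). A minimal sketch follows; the
DumpSegments class name and both paths are purely illustrative, and it assumes Nutch's
conf/ directory and dependencies are on the classpath so NutchConfiguration.create()
resolves:

  import java.io.File;
  import org.apache.nutch.tools.FileDumper;

  public class DumpSegments {
    public static void main(String[] args) throws Exception {
      File outputDir = new File("/tmp/nutch-dump");      // illustrative output location
      File segmentRootDir = new File("crawl/segments");  // illustrative segments root
      // Mirror FileDumper.main(): make sure the output directory exists first.
      if (!outputDir.exists() && !outputDir.mkdirs()) {
        throw new Exception("Unable to create: [" + outputDir + "]");
      }
      new FileDumper().dump(outputDir, segmentRootDir);  // one file written per fetched URL
    }
  }

This mirrors what FileDumper.main() does after argument parsing, so the command-line and
programmatic entry points behave the same way.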