Posted to commits@nutch.apache.org by ma...@apache.org on 2016/04/18 00:31:19 UTC

[1/2] nutch git commit: NUTCH-2250 : CommonCrawlDumper : Invalid format and skipped parts

Repository: nutch
Updated Branches:
  refs/heads/master b62f43fda -> d6bcefd92


NUTCH-2250 : CommonCrawlDumper : Invalid format and skipped parts

+ Reads all parts of segments
+ FIX: previously only one document was written to the dump file
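
Note: the core of the fix is to stop assuming a single content/part-00000/data file per segment and instead walk the segment root recursively, collecting every part-NNNNN/data file. Below is a minimal, self-contained sketch of that listing pattern; the class name SegmentPartLister and the literal "content" directory name are illustrative (the patch uses Content.DIR_NAME), so treat it as an approximation of the patched code rather than the patch itself.

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class SegmentPartLister {

  /** Collects every <segment>/content/part-NNNNN/data file under segmentRoot. */
  public static List<Path> listContentParts(FileSystem fs, Path segmentRoot)
      throws Exception {
    // Hadoop Path.toString() uses '/' on all platforms, so matching on '/' is safe
    String partPattern = ".*/content/part-[0-9]{5}/data";
    List<Path> parts = new ArrayList<>();
    // listFiles(..., true) recurses into every segment directory under the root
    RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRoot, true);
    while (files.hasNext()) {
      LocatedFileStatus status = files.next();
      if (status.isFile() && status.getPath().toString().matches(partPattern)) {
        parts.add(status.getPath());
      }
    }
    return parts;
  }

  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    for (Path p : listContentParts(fs, new Path(args[0]))) {
      System.out.println(p);
    }
  }
}

Because every matching data file is returned, the dumper no longer skips segments whose content lives in part-00001 and beyond.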

Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/47cc4e27
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/47cc4e27
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/47cc4e27

Branch: refs/heads/master
Commit: 47cc4e27a41a78753ed431685010da1987bca269
Parents: b62f43f
Author: Thamme Gowda <tg...@gmail.com>
Authored: Thu Apr 14 02:38:30 2016 -0700
Committer: Thamme Gowda <tg...@gmail.com>
Committed: Thu Apr 14 02:38:30 2016 -0700

----------------------------------------------------------------------
 .../nutch/tools/CommonCrawlDataDumper.java      | 82 ++++++++------------
 .../apache/nutch/tools/CommonCrawlFormat.java   |  3 +-
 2 files changed, 33 insertions(+), 52 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/47cc4e27/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
index 87d1db8..d00df0a 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
@@ -23,7 +23,6 @@ import java.io.BufferedOutputStream;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.File;
-import java.io.FileFilter;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.net.MalformedURLException;
@@ -33,6 +32,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
 import org.apache.commons.cli.CommandLine;
@@ -54,7 +54,9 @@ import org.apache.commons.io.FilenameUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.util.StringUtils;
@@ -238,66 +240,42 @@ public class CommonCrawlDataDumper extends Configured implements Tool {
     Map<String, Integer> filteredCounts = new HashMap<String, Integer>();
 
     Configuration nutchConfig = NutchConfiguration.create();
-    FileSystem fs = FileSystem.get(nutchConfig);
-    File[] segmentDirs = segmentRootDir.listFiles(new FileFilter() {
-      @Override
-      public boolean accept(File file) {
-        return file.canRead() && file.isDirectory();
+    final FileSystem fs = FileSystem.get(nutchConfig);
+    Path segmentRootPath = new Path(segmentRootDir.toString());
+
+    //get all paths
+    List<Path> parts = new ArrayList<>();
+    RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true);
+    String partPattern = ".*" + File.separator + Content.DIR_NAME + File.separator + "part-[0-9]{5}" + File.separator + "data";
+    while (files.hasNext()) {
+      LocatedFileStatus next = files.next();
+      if (next.isFile()) {
+        Path path = next.getPath();
+        if (path.toString().matches(partPattern)){
+          parts.add(path);
+        }
       }
-    });
-
-    if (new File(
-        segmentRootDir.getAbsolutePath() + File.separator + Content.DIR_NAME
-            + "/part-00000/data").exists()) {
-      segmentDirs = new File[] { segmentRootDir };
     }
 
-    if (segmentDirs == null) {
-      LOG.error(
-          "No segment directories found in [" + segmentRootDir.getAbsolutePath()
-              + "]");
+    if (parts == null || parts.size() == 0) {
+      LOG.error( "No segment directories found in [ {}] ", segmentRootDir.getAbsolutePath());
       System.exit(1);
     }
-
+    LOG.info("Found {} segment parts", parts.size());
     if (gzip && !warc) {
-      fileList = new ArrayList<String>();
+      fileList = new ArrayList<>();
       constructNewStream(outputDir);
     }
 
-    CommonCrawlFormat format = CommonCrawlFormatFactory
-        .getCommonCrawlFormat("JACKSON", nutchConfig, config);
-
-    if (warc) {
-      format = CommonCrawlFormatFactory
-          .getCommonCrawlFormat("WARC", nutchConfig, config);
-    }
-
-    for (File segment : segmentDirs) {
-      LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
+    for (Path segmentPart : parts) {
+      LOG.info("Processing segment Part : [ {} ]", segmentPart);
       try {
-        String segmentContentPath =
-            segment.getAbsolutePath() + File.separator + Content.DIR_NAME
-                + "/part-00000/data";
-        Path file = new Path(segmentContentPath);
-
-        if (!new File(file.toString()).exists()) {
-          LOG.warn("Skipping segment: [" + segmentContentPath
-              + "]: no data directory present");
-          continue;
-        }
         SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig,
-            SequenceFile.Reader.file(file));
-
-        if (!new File(file.toString()).exists()) {
-          LOG.warn("Skipping segment: [" + segmentContentPath
-              + "]: no data directory present");
-          continue;
-        }
+            SequenceFile.Reader.file(segmentPart));
 
         Writable key = (Writable) reader.getKeyClass().newInstance();
 
         Content content = null;
-
         while (reader.next(key)) {
           content = new Content();
           reader.getCurrentValue(content);
@@ -368,7 +346,11 @@ public class CommonCrawlDataDumper extends Configured implements Tool {
             String mimeType = new Tika().detect(content.getContent());
             // Maps file to JSON-based structure
 
-            jsonData = format.getJsonData(url, content, metadata);
+            //TODO: Make this Jackson Format implementation reusable
+            try (CommonCrawlFormat format = CommonCrawlFormatFactory
+                .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
+              jsonData = format.getJsonData(url, content, metadata);
+            }
 
             collectStats(typeCounts, mimeType);
             // collects statistics for the given mimetypes
@@ -415,16 +397,14 @@ public class CommonCrawlDataDumper extends Configured implements Tool {
             }
           }
         }
-
         reader.close();
+      } catch (Exception e){
+        LOG.warn("SKIPPED: {} Because : {}", segmentPart, e.getMessage());
       } finally {
         fs.close();
       }
     }
 
-    // close the format if needed
-    format.close();
-
     if (gzip && !warc) {
       closeStream();
     }
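
For reference, a hedged sketch of how each collected part file is read and serialized after this change: a SequenceFile.Reader iterates every key/value record (one per fetched document), and a fresh CommonCrawlFormat is opened per record inside try-with-resources so it is always closed. The wrapper method, the use of key.toString() as the URL, and the String return type of getJsonData are assumptions for illustration, not the exact patched code.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.tools.CommonCrawlConfig;
import org.apache.nutch.tools.CommonCrawlFormat;
import org.apache.nutch.tools.CommonCrawlFormatFactory;
import org.apache.nutch.util.NutchConfiguration;

public class PartDumpSketch {

  /** Reads one segment part file and prints one JSON document per record. */
  public static void dumpPart(Path part, CommonCrawlConfig config) throws Exception {
    Configuration conf = NutchConfiguration.create();
    try (SequenceFile.Reader reader =
             new SequenceFile.Reader(conf, SequenceFile.Reader.file(part))) {
      Writable key = (Writable) reader.getKeyClass().newInstance();
      Content content = new Content();
      while (reader.next(key)) {              // one record per fetched document
        reader.getCurrentValue(content);
        // A new format instance per record; try-with-resources guarantees close()
        try (CommonCrawlFormat format = CommonCrawlFormatFactory
            .getCommonCrawlFormat("JACKSON", conf, config)) {
          // return type assumed to be String here
          String jsonData = format.getJsonData(key.toString(), content,
              content.getMetadata());
          System.out.println(jsonData);       // the real dumper writes to its output files
        }
      }
    }
  }
}

Creating the format inside the loop is presumably what the TODO in the patch refers to: the Jackson implementation is not yet reusable across records, so a new instance is built and closed for each document.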

http://git-wip-us.apache.org/repos/asf/nutch/blob/47cc4e27/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormat.java b/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
index ec19027..87baeb5 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
@@ -21,6 +21,7 @@ import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.protocol.Content;
 
+import java.io.Closeable;
 import java.io.IOException;
 
 /**
@@ -30,7 +31,7 @@ import java.io.IOException;
  * @author gtotaro
  *
  */
-public interface CommonCrawlFormat {
+public interface CommonCrawlFormat extends Closeable {
 
   /**
    *
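
The interface change above is small but it is what makes the per-record try-with-resources in the dumper possible: once CommonCrawlFormat extends java.io.Closeable, every implementation must provide close(), and callers can rely on compiler-managed cleanup. A toy, hypothetical example of the same pattern (names here are illustrative, not Nutch APIs):

import java.io.Closeable;
import java.io.IOException;

// Hypothetical interface mirroring the change: extending Closeable forces
// implementations to define close() and enables try-with-resources at call sites.
interface RecordFormat extends Closeable {
  String render(String url) throws IOException;
}

public class CloseableFormatDemo {
  public static void main(String[] args) throws IOException {
    try (RecordFormat format = new RecordFormat() {
      @Override
      public String render(String url) {
        return "{\"url\":\"" + url + "\"}";
      }
      @Override
      public void close() {
        // flush buffers, close underlying streams, etc.
      }
    }) {
      System.out.println(format.render("http://nutch.apache.org/"));
    }
  }
}

This is also why the explicit format.close() call at the end of the dump method could be removed in the first patch above.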


[2/2] nutch git commit: Record changes for NUTCH-2250.

Posted by ma...@apache.org.
Record changes for NUTCH-2250.


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/d6bcefd9
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/d6bcefd9
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/d6bcefd9

Branch: refs/heads/master
Commit: d6bcefd926dffb02d8f6ebfebad8adeea31e8f50
Parents: 47cc4e2
Author: Chris Mattmann <ma...@apache.org>
Authored: Sun Apr 17 15:30:46 2016 -0700
Committer: Chris Mattmann <ma...@apache.org>
Committed: Sun Apr 17 15:30:46 2016 -0700

----------------------------------------------------------------------
 CHANGES.txt | 2 ++
 1 file changed, 2 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/d6bcefd9/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 5e061a4..a3bde42 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,6 +10,8 @@ in the release announcement and keep it on top in this CHANGES.txt for the Nutch
 
 Nutch Change Log
 
+* NUTCH-2250 CommonCrawlDumper : Invalid format and skipped parts (Thamme Gowda N.,lewismc via mattmann)
+
 * NUTCH-2245 Developed the NGram Model on the existing Unigram Cosine Similarity Model (bhavyasanghavi via sujen)
 
 * NUTCH-2241 Unstable Selenium plugin in Nutch. Fixed bugs and enhanced configuration (karanjeets via mattmann)