You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/12/10 04:05:24 UTC

svn commit: r1719006 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingJob.java src/java/org/apache/nutch/segment/SegmentChecker.java

Author: lewismc
Date: Thu Dec 10 03:05:24 2015
New Revision: 1719006

URL: http://svn.apache.org/viewvc?rev=1719006&view=rev
Log:
NUTCH-2183 Improve SegmentChecker to skip non-segment paths present in the segments directory

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
    nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1719006&r1=1719005&r2=1719006&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Dec 10 03:05:24 2015
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-2183 Improvement to SegmentChecker for skipping non-segments present in segments directory (lewismc)
+
 * NUTCH-2180 FileDumper skips Corrupt Segments (Harshavardhan Manjunatha via lewismc)
 
 * NUTCH-2042 parse-html increase chunk size used to detect charset (snagel)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java?rev=1719006&r1=1719005&r2=1719006&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java Thu Dec 10 03:05:24 2015
@@ -186,11 +186,13 @@ public class IndexingJob extends NutchTo
     boolean base64 = false;
 
     for (int i = 1; i < args.length; i++) {
+      FileSystem fs = null;
+      Path dir = null;
       if (args[i].equals("-linkdb")) {
         linkDb = new Path(args[++i]);
       } else if (args[i].equals("-dir")) {
-        Path dir = new Path(args[++i]);
-        FileSystem fs = dir.getFileSystem(getConf());
+        dir = new Path(args[++i]);
+        fs = dir.getFileSystem(getConf());
         FileStatus[] fstats = fs.listStatus(dir,
             HadoopFSUtil.getPassDirectoriesFilter(fs));
         Path[] files = HadoopFSUtil.getPaths(fstats);
@@ -214,7 +216,11 @@ public class IndexingJob extends NutchTo
       } else if (args[i].equals("-params")) {
         params = args[++i];
       } else {
-        segments.add(new Path(args[i]));
+        dir = new Path(args[i]);
+        fs = dir.getFileSystem(getConf());
+        if (SegmentChecker.isIndexable(dir,fs)) {
+          segments.add(dir);
+        }
       }
     }
 

Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java?rev=1719006&r1=1719005&r2=1719006&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java Thu Dec 10 03:05:24 2015
@@ -65,6 +65,11 @@ public class SegmentChecker {
   public static boolean checkSegmentDir(Path segmentPath, FileSystem fs)
       throws IOException {
 
+    if (segmentPath.getName().length() != 14) {
+      LOG.warn("The input path at {} is not a segment... skipping", segmentPath.getName());
+      return false;
+    }
+    
     FileStatus[] fstats_segment = fs.listStatus(segmentPath,
         HadoopFSUtil.getPassDirectoriesFilter(fs));
     Path[] segment_files = HadoopFSUtil.getPaths(fstats_segment);