You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/12/10 04:05:24 UTC
svn commit: r1719006 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/indexer/IndexingJob.java
src/java/org/apache/nutch/segment/SegmentChecker.java
Author: lewismc
Date: Thu Dec 10 03:05:24 2015
New Revision: 1719006
URL: http://svn.apache.org/viewvc?rev=1719006&view=rev
Log:
NUTCH-2183 Improvement to SegmentChecker for skipping non-segments present in segments directory
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1719006&r1=1719005&r2=1719006&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Dec 10 03:05:24 2015
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-2183 Improvement to SegmentChecker for skipping non-segments present in segments directory (lewismc)
+
* NUTCH-2180 FileDumper skips Corrupt Segments (Harshavardhan Manjunatha via lewismc)
* NUTCH-2042 parse-html increase chunk size used to detect charset (snagel)
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java?rev=1719006&r1=1719005&r2=1719006&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java Thu Dec 10 03:05:24 2015
@@ -186,11 +186,13 @@ public class IndexingJob extends NutchTo
boolean base64 = false;
for (int i = 1; i < args.length; i++) {
+ FileSystem fs = null;
+ Path dir = null;
if (args[i].equals("-linkdb")) {
linkDb = new Path(args[++i]);
} else if (args[i].equals("-dir")) {
- Path dir = new Path(args[++i]);
- FileSystem fs = dir.getFileSystem(getConf());
+ dir = new Path(args[++i]);
+ fs = dir.getFileSystem(getConf());
FileStatus[] fstats = fs.listStatus(dir,
HadoopFSUtil.getPassDirectoriesFilter(fs));
Path[] files = HadoopFSUtil.getPaths(fstats);
@@ -214,7 +216,11 @@ public class IndexingJob extends NutchTo
} else if (args[i].equals("-params")) {
params = args[++i];
} else {
- segments.add(new Path(args[i]));
+ dir = new Path(args[i]);
+ fs = dir.getFileSystem(getConf());
+ if (SegmentChecker.isIndexable(dir,fs)) {
+ segments.add(dir);
+ }
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java?rev=1719006&r1=1719005&r2=1719006&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java Thu Dec 10 03:05:24 2015
@@ -65,6 +65,11 @@ public class SegmentChecker {
public static boolean checkSegmentDir(Path segmentPath, FileSystem fs)
throws IOException {
+ if (segmentPath.getName().length() != 14) {
+ LOG.warn("The input path at {} is not a segment... skipping", segmentPath.getName());
+ return false;
+ }
+
FileStatus[] fstats_segment = fs.listStatus(segmentPath,
HadoopFSUtil.getPassDirectoriesFilter(fs));
Path[] segment_files = HadoopFSUtil.getPaths(fstats_segment);