You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/02/13 13:28:13 UTC
svn commit: r1659533 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/crawl/LinkDbReader.java
Author: markus
Date: Fri Feb 13 12:28:13 2015
New Revision: 1659533
URL: http://svn.apache.org/r1659533
Log:
NUTCH-1724 LinkDBReader to support regex output filtering
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1659533&r1=1659532&r2=1659533&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Feb 13 12:28:13 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1724 LinkDBReader to support regex output filtering (markus)
+
* NUTCH-1939 Fetcher fails to follow redirects (Leo Ye via snagel)
* NUTCH-1913 LinkDB to implement db.ignore.external.links (markus, snagel)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=1659533&r1=1659532&r2=1659533&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Fri Feb 13 12:28:13 2015
@@ -19,6 +19,9 @@ package org.apache.nutch.crawl;
import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -43,7 +46,7 @@ import java.io.Closeable;
public class LinkDbReader extends Configured implements Tool, Closeable {
public static final Logger LOG = LoggerFactory.getLogger(LinkDbReader.class);
- private static final Partitioner<WritableComparable<?>, Writable> PARTITIONER = new HashPartitioner<WritableComparable<?>, Writable>();
+ private static final Partitioner<WritableComparable, Writable> PARTITIONER = new HashPartitioner<WritableComparable, Writable>();
private FileSystem fs;
private Path directory;
@@ -90,8 +93,33 @@ public class LinkDbReader extends Config
}
}
}
+
+ public static class LinkDBDumpMapper implements Mapper<Text, Inlinks, Text, Inlinks> {
+ Pattern pattern = null;
+ Matcher matcher = null;
+
+ public void configure(JobConf job) {
+ if (job.get("linkdb.regex", null) != null) {
+ pattern = Pattern.compile(job.get("linkdb.regex"));
+ }
+ }
+
+ public void close() {}
+ public void map(Text key, Inlinks value, OutputCollector<Text, Inlinks> output, Reporter reporter)
+ throws IOException {
+
+ if (pattern != null) {
+ matcher = pattern.matcher(key.toString());
+ if (!matcher.matches()) {
+ return;
+ }
+ }
+
+ output.collect(key, value);
+ }
+ }
- public void processDumpJob(String linkdb, String output) throws IOException {
+ public void processDumpJob(String linkdb, String output, String regex) throws IOException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
if (LOG.isInfoEnabled()) {
@@ -102,6 +130,11 @@ public class LinkDbReader extends Config
JobConf job = new NutchJob(getConf());
job.setJobName("read " + linkdb);
+
+ if (regex != null) {
+ job.set("linkdb.regex", regex);
+ job.setMapperClass(LinkDBDumpMapper.class);
+ }
FileInputFormat.addInputPath(job, new Path(linkdb, LinkDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
@@ -127,16 +160,24 @@ public class LinkDbReader extends Config
public int run(String[] args) throws Exception {
if (args.length < 2) {
System.err
- .println("Usage: LinkDbReader <linkdb> (-dump <out_dir> | -url <url>)");
+ .println("Usage: LinkDbReader <linkdb> (-dump <out_dir> [-regex <regex>]) | -url <url>");
System.err
.println("\t-dump <out_dir>\tdump whole link db to a text file in <out_dir>");
System.err
+ .println("\t\t-regex <regex>\trestrict to url's matching expression");
+ System.err
.println("\t-url <url>\tprint information about <url> to System.out");
return -1;
}
try {
if (args[1].equals("-dump")) {
- processDumpJob(args[0], args[2]);
+ String regex = null;
+ for (int i = 2; i < args.length; i++) {
+ if (args[i].equals("-regex")) {
+ regex = args[++i];
+ }
+ }
+ processDumpJob(args[0], args[2], regex);
return 0;
} else if (args[1].equals("-url")) {
init(new Path(args[0]));