You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/01/08 12:14:33 UTC
svn commit: r1723690 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/crawl/DeduplicationJob.java
Author: markus
Date: Fri Jan 8 11:14:33 2016
New Revision: 1723690
URL: http://svn.apache.org/viewvc?rev=1723690&view=rev
Log:
NUTCH-2178 DeduplicationJob to optionally group on host or domain
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1723690&r1=1723689&r2=1723690&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jan 8 11:14:33 2016
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-2178 DeduplicationJob to optionally group on host or domain (markus)
+
* NUTCH-1449 Optionally delete documents skipped by IndexingFilters (markus)
* NUTCH-2189 Domain filter must deactivate if no rules are present (markus)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1723690&r1=1723689&r2=1723690&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Fri Jan 8 11:14:33 2016
@@ -49,6 +49,7 @@ import org.apache.nutch.util.NutchConfig
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -67,12 +68,16 @@ public class DeduplicationJob extends Nu
.getLogger(DeduplicationJob.class);
private final static Text urlKey = new Text("_URLTEMPKEY_");
+ private final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
public static class DBFilter implements
Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> {
+
+ private String groupMode;
@Override
public void configure(JobConf arg0) {
+ groupMode = arg0.get(DEDUPLICATION_GROUP_MODE);
}
@Override
@@ -90,10 +95,31 @@ public class DeduplicationJob extends Nu
byte[] signature = value.getSignature();
if (signature == null)
return;
- BytesWritable sig = new BytesWritable(signature);
+ String url = key.toString();
+ BytesWritable sig = null;
+ byte[] data;
+ switch (groupMode) {
+ case "none":
+ sig = new BytesWritable(signature);
+ break;
+ case "host":
+ byte[] host = URLUtil.getHost(url).getBytes();
+ data = new byte[signature.length + host.length];
+ System.arraycopy(signature, 0, data, 0, signature.length);
+ System.arraycopy(host, 0, data, signature.length, host.length);
+ sig = new BytesWritable(data);
+ break;
+ case "domain":
+ byte[] domain = URLUtil.getDomainName(url).getBytes();
+ data = new byte[signature.length + domain.length];
+ System.arraycopy(signature, 0, data, 0, signature.length);
+ System.arraycopy(domain, 0, data, signature.length, domain.length);
+ sig = new BytesWritable(data);
+ break;
+ }
// add the URL as a temporary MD
value.getMetaData().put(urlKey, key);
- // reduce on the signature
+ // reduce on the signature, optionally grouped on host or domain, or not grouped at all
output.collect(sig, value);
}
}
@@ -216,11 +242,17 @@ public class DeduplicationJob extends Nu
public int run(String[] args) throws IOException {
if (args.length < 1) {
- System.err.println("Usage: DeduplicationJob <crawldb>");
+ System.err.println("Usage: DeduplicationJob <crawldb> [-group <none|host|domain>]");
return 1;
}
+ String group = "none";
String crawldb = args[0];
+
+ for (int i = 1; i < args.length; i++) {
+ if (args[i].equals("-group"))
+ group = args[++i];
+ }
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
@@ -233,6 +265,7 @@ public class DeduplicationJob extends Nu
JobConf job = new NutchJob(getConf());
job.setJobName("Deduplication on " + crawldb);
+ job.set(DEDUPLICATION_GROUP_MODE, group);
FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);