Posted to commits@nutch.apache.org by ma...@apache.org on 2016/02/22 15:41:37 UTC
svn commit: r1731651 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/DeduplicationJob.java
Author: markus
Date: Mon Feb 22 14:41:37 2016
New Revision: 1731651
URL: http://svn.apache.org/viewvc?rev=1731651&view=rev
Log:
NUTCH-2219 Criteria order to be configurable in DeduplicationJob
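The change makes the order in which the deduplication criteria are applied configurable. As a hedged illustration of the intended invocation (assuming the standard bin/nutch dedup entry point; only the -compareOrder flag is new in this commit):

    bin/nutch dedup <crawldb> -compareOrder urlLength,score,fetchTime

All three criteria (score, fetchTime, urlLength) must appear in the list; only their order may change, and the default remains score,fetchTime,urlLength.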
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731651&r1=1731650&r2=1731651&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Feb 22 14:41:37 2016
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-2219 Criteria order to be configurable in DeduplicationJob (Ron van der Vegt via markus)
+
* NUTCH-2218 Update CrawlComplete util to use Commons CLI (Joyce)
* NUTCH-2223 Upgrade xercesImpl to 2.11.0 to fix hang on issue in tika mimetype detection (Tien Nguyen Manh via markus)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1731651&r1=1731650&r2=1731651&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Mon Feb 22 14:41:37 2016
@@ -22,6 +22,7 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Random;
+import java.util.Arrays;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -69,6 +70,7 @@ public class DeduplicationJob extends Nu
private final static Text urlKey = new Text("_URLTEMPKEY_");
private final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
+ private final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";
public static class DBFilter implements
Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> {
@@ -128,6 +130,13 @@ public class DeduplicationJob extends Nu
public static class DedupReducer implements
Reducer<BytesWritable, CrawlDatum, Text, CrawlDatum> {
+ private String[] compareOrder;
+
+ @Override
+ public void configure(JobConf arg0) {
+ compareOrder = arg0.get(DEDUPLICATION_COMPARE_ORDER).split(",");
+ }
+
private void writeOutAsDuplicate(CrawlDatum datum,
OutputCollector<Text, CrawlDatum> output, Reporter reporter)
throws IOException {
@@ -144,6 +153,7 @@ public class DeduplicationJob extends Nu
throws IOException {
CrawlDatum existingDoc = null;
+ outerloop:
while (values.hasNext()) {
if (existingDoc == null) {
existingDoc = new CrawlDatum();
@@ -151,48 +161,56 @@ public class DeduplicationJob extends Nu
continue;
}
CrawlDatum newDoc = values.next();
- // compare based on score
- if (existingDoc.getScore() < newDoc.getScore()) {
- writeOutAsDuplicate(existingDoc, output, reporter);
- existingDoc = new CrawlDatum();
- existingDoc.set(newDoc);
- continue;
- } else if (existingDoc.getScore() > newDoc.getScore()) {
- // mark new one as duplicate
- writeOutAsDuplicate(newDoc, output, reporter);
- continue;
- }
- // same score? delete the one which is oldest
- if (existingDoc.getFetchTime() > newDoc.getFetchTime()) {
- // mark new one as duplicate
- writeOutAsDuplicate(newDoc, output, reporter);
- continue;
- } else if (existingDoc.getFetchTime() < newDoc.getFetchTime()) {
- // mark existing one as duplicate
- writeOutAsDuplicate(existingDoc, output, reporter);
- existingDoc = new CrawlDatum();
- existingDoc.set(newDoc);
- continue;
- }
- // same time? keep the one which has the shortest URL
- String urlExisting = existingDoc.getMetaData().get(urlKey).toString();
- String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
- if (urlExisting.length() < urlnewDoc.length()) {
- // mark new one as duplicate
- writeOutAsDuplicate(newDoc, output, reporter);
- continue;
- } else if (urlExisting.length() > urlnewDoc.length()) {
- // mark existing one as duplicate
- writeOutAsDuplicate(existingDoc, output, reporter);
- existingDoc = new CrawlDatum();
- existingDoc.set(newDoc);
- continue;
+
+ for (int i = 0; i < compareOrder.length; i++) {
+ switch (compareOrder[i]) {
+ case "score":
+ // compare based on score
+ if (existingDoc.getScore() < newDoc.getScore()) {
+ writeOutAsDuplicate(existingDoc, output, reporter);
+ existingDoc = new CrawlDatum();
+ existingDoc.set(newDoc);
+ continue outerloop;
+ } else if (existingDoc.getScore() > newDoc.getScore()) {
+ // mark new one as duplicate
+ writeOutAsDuplicate(newDoc, output, reporter);
+ continue outerloop;
+ }
+ break;
+ case "fetchTime":
+ // same score? delete the one which is oldest
+ if (existingDoc.getFetchTime() > newDoc.getFetchTime()) {
+ // mark new one as duplicate
+ writeOutAsDuplicate(newDoc, output, reporter);
+ continue outerloop;
+ } else if (existingDoc.getFetchTime() < newDoc.getFetchTime()) {
+ // mark existing one as duplicate
+ writeOutAsDuplicate(existingDoc, output, reporter);
+ existingDoc = new CrawlDatum();
+ existingDoc.set(newDoc);
+ continue outerloop;
+ }
+ break;
+ case "urlLength":
+ // same time? keep the one which has the shortest URL
+ String urlExisting = existingDoc.getMetaData().get(urlKey).toString();
+ String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
+ if (urlExisting.length() < urlnewDoc.length()) {
+ // mark new one as duplicate
+ writeOutAsDuplicate(newDoc, output, reporter);
+ continue outerloop;
+ } else if (urlExisting.length() > urlnewDoc.length()) {
+ // mark existing one as duplicate
+ writeOutAsDuplicate(existingDoc, output, reporter);
+ existingDoc = new CrawlDatum();
+ existingDoc.set(newDoc);
+ continue outerloop;
+ }
+ break;
+ }
}
- }
- }
- @Override
- public void configure(JobConf arg0) {
+ }
}
@Override
@@ -242,16 +260,27 @@ public class DeduplicationJob extends Nu
public int run(String[] args) throws IOException {
if (args.length < 1) {
- System.err.println("Usage: DeduplicationJob <crawldb> [-group <none|host|domain>]");
+ System.err.println("Usage: DeduplicationJob <crawldb> [-group <none|host|domain>] [-compareOrder <score>,<fetchTime>,<urlLength>]");
return 1;
}
String group = "none";
String crawldb = args[0];
-
+ String compareOrder = "score,fetchTime,urlLength";
+
for (int i = 1; i < args.length; i++) {
- if (args[i].equals("-group"))
+ if (args[i].equals("-group"))
group = args[++i];
+ if (args[i].equals("-compareOrder")) {
+ compareOrder = args[++i];
+
+ if (compareOrder.indexOf("score") == -1 ||
+ compareOrder.indexOf("fetchTime") == -1 ||
+ compareOrder.indexOf("urlLength") == -1) {
+ System.err.println("DeduplicationJob: compareOrder must contain score, fetchTime and urlLength.");
+ return 1;
+ }
+ }
}
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
@@ -266,6 +295,7 @@ public class DeduplicationJob extends Nu
job.setJobName("Deduplication on " + crawldb);
job.set(DEDUPLICATION_GROUP_MODE, group);
+ job.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);
FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
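For readers skimming the diff, the core of the change is the reducer loop that now walks the configured criteria in order instead of applying a fixed score/fetchTime/urlLength sequence. Below is a condensed, self-contained sketch of that decision logic; the Doc class and pickKeeper() method are hypothetical stand-ins for CrawlDatum and the DedupReducer body, not code from the patch.

    // Hypothetical, condensed illustration of the compare order introduced by
    // NUTCH-2219; Doc and pickKeeper() stand in for CrawlDatum and the
    // DedupReducer loop and are not part of the Nutch code base.
    public class CompareOrderSketch {

      static class Doc {
        float score;
        long fetchTime;
        String url;

        Doc(float score, long fetchTime, String url) {
          this.score = score;
          this.fetchTime = fetchTime;
          this.url = url;
        }
      }

      // Returns the document to keep; the other one would be written out as a duplicate.
      static Doc pickKeeper(Doc existing, Doc candidate, String[] compareOrder) {
        for (String criterion : compareOrder) {
          switch (criterion) {
          case "score":
            // higher score wins
            if (existing.score != candidate.score)
              return existing.score > candidate.score ? existing : candidate;
            break;
          case "fetchTime":
            // more recent fetch wins
            if (existing.fetchTime != candidate.fetchTime)
              return existing.fetchTime > candidate.fetchTime ? existing : candidate;
            break;
          case "urlLength":
            // shorter URL wins
            if (existing.url.length() != candidate.url.length())
              return existing.url.length() < candidate.url.length() ? existing : candidate;
            break;
          }
        }
        // all criteria tie: keep the existing document, as the reducer does
        return existing;
      }

      public static void main(String[] args) {
        Doc a = new Doc(1.0f, 1000L, "http://example.org/page");
        Doc b = new Doc(1.0f, 2000L, "http://example.org/page?session=1");
        String[] order = "fetchTime,score,urlLength".split(",");
        // b has the same score but a newer fetch time, so it is kept
        System.out.println(pickKeeper(a, b, order) == b); // prints true
      }
    }

With the default order, the shorter URL would only be consulted after score and fetch time tie; putting urlLength first in -compareOrder reverses that preference.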