You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/05/07 20:39:49 UTC
[2/6] nutch git commit: Handling duplicate inlinks
Handling duplicate inlinks
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/a0880491
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/a0880491
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/a0880491
Branch: refs/heads/master
Commit: a08804913c500b331c9296996ce1ba3d4929d3a8
Parents: 6ff1ecb
Author: Thamme Gowda <tg...@gmail.com>
Authored: Mon Apr 18 19:20:38 2016 -0700
Committer: Thamme Gowda <tg...@gmail.com>
Committed: Mon Apr 18 19:20:38 2016 -0700
----------------------------------------------------------------------
.../org/apache/nutch/tools/CommonCrawlDataDumper.java | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/a0880491/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
index e26e088..83da679 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
@@ -33,8 +33,10 @@ import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
@@ -355,13 +357,13 @@ public class CommonCrawlDataDumper extends Configured implements Tool {
String mimeType = new Tika().detect(content.getContent());
// Maps file to JSON-based structure
- List<String> inUrls = null;
+ Set<String> inUrls = null; // there may be duplicates, so use a Set
if (linkDbReader != null) {
int max = 5000; //just in case there are too many urls!
Inlinks inlinks = linkDbReader.getInlinks((Text) key);
if (inlinks != null) {
Iterator<Inlink> iterator = inlinks.iterator();
- inUrls = new ArrayList<>();
+ inUrls = new LinkedHashSet<>();
while (max >= 0 && iterator.hasNext()){
inUrls.add(iterator.next().getFromUrl());
max--;
@@ -371,7 +373,9 @@ public class CommonCrawlDataDumper extends Configured implements Tool {
//TODO: Make this Jackson Format implementation reusable
try (CommonCrawlFormat format = CommonCrawlFormatFactory
.getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
- format.setInLinks(inUrls);
+ if (inUrls != null) {
+ format.setInLinks(new ArrayList<>(inUrls));
+ }
jsonData = format.getJsonData(url, content, metadata);
}