Posted to commits@nutch.apache.org by ma...@apache.org on 2016/05/07 20:39:49 UTC

[2/6] nutch git commit: Handling duplicate inlinks

Handling duplicate inlinks

Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/a0880491
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/a0880491
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/a0880491

Branch: refs/heads/master
Commit: a08804913c500b331c9296996ce1ba3d4929d3a8
Parents: 6ff1ecb
Author: Thamme Gowda <tg...@gmail.com>
Authored: Mon Apr 18 19:20:38 2016 -0700
Committer: Thamme Gowda <tg...@gmail.com>
Committed: Mon Apr 18 19:20:38 2016 -0700

----------------------------------------------------------------------
 .../org/apache/nutch/tools/CommonCrawlDataDumper.java     | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/a0880491/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
index e26e088..83da679 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
@@ -33,8 +33,10 @@ import java.util.Arrays;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.CommandLineParser;
@@ -355,13 +357,13 @@ public class CommonCrawlDataDumper extends Configured implements Tool {
             String mimeType = new Tika().detect(content.getContent());
             // Maps file to JSON-based structure
 
-          List<String> inUrls = null;
+          Set<String> inUrls = null; // there may be duplicates, so use a Set
           if (linkDbReader != null) {
             int max = 5000;     //just in case there are too many urls!
             Inlinks inlinks = linkDbReader.getInlinks((Text) key);
             if (inlinks != null) {
               Iterator<Inlink> iterator = inlinks.iterator();
-              inUrls = new ArrayList<>();
+              inUrls = new LinkedHashSet<>();
               while (max >= 0 && iterator.hasNext()){
                 inUrls.add(iterator.next().getFromUrl());
                 max--;
@@ -371,7 +373,9 @@ public class CommonCrawlDataDumper extends Configured implements Tool {
           //TODO: Make this Jackson Format implementation reusable
           try (CommonCrawlFormat format = CommonCrawlFormatFactory
                   .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
-            format.setInLinks(inUrls);
+            if (inUrls != null) {
+              format.setInLinks(new ArrayList<>(inUrls));
+            }
             jsonData = format.getJsonData(url, content, metadata);
           }
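
----------------------------------------------------------------------

For context, here is a minimal standalone sketch of the technique the
patch applies (the class and method names below are hypothetical, not
part of Nutch): a LinkedHashSet silently drops duplicate inlink URLs
while keeping their first-seen order, and the set is copied into an
ArrayList at the end because the downstream setter still expects a
List.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

public class InlinkDedupeSketch {

  // Mirrors the patched loop: cap the number of iterations and let the
  // Set swallow duplicate URLs while preserving first-seen order.
  static List<String> collectUnique(Iterator<String> fromUrls, int max) {
    Set<String> inUrls = new LinkedHashSet<>();
    while (max >= 0 && fromUrls.hasNext()) {
      inUrls.add(fromUrls.next()); // adding a duplicate URL is a no-op
      max--;
    }
    // Copy into a List for APIs such as setInLinks(List<String>).
    return new ArrayList<>(inUrls);
  }

  public static void main(String[] args) {
    List<String> raw = Arrays.asList("http://a.example/",
        "http://b.example/", "http://a.example/", "http://c.example/");
    // Prints: [http://a.example/, http://b.example/, http://c.example/]
    System.out.println(collectUnique(raw.iterator(), 5000));
  }
}

LinkedHashSet rather than HashSet keeps the URLs in the order the
LinkDbReader returned them, so the dump output stays deterministic
while duplicates are still removed.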