You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2012/09/18 22:20:42 UTC
svn commit: r1387341 - in /nutch/trunk: CHANGES.txt src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java

Author: lewismc
Date: Tue Sep 18 20:20:41 2012
New Revision: 1387341

URL: http://svn.apache.org/viewvc?rev=1387341&view=rev
Log:
NUTCH-1441 AnchorIndexingFilter should use plain HashSet

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1387341&r1=1387340&r2=1387341&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Sep 18 20:20:41 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1441 AnchorIndexingFilter should use plain HashSet (ferdy via lewismc)
+
 * NUTCH-1470 Ensure test files are included for runtime testing (lewismc)
 
 * NUTCH-1434 Indexer to delete robots noindex (markus)

Modified: nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1387341&r1=1387340&r2=1387341&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Tue Sep 18 20:20:41 2012
@@ -16,10 +16,8 @@
  */
 package org.apache.nutch.indexer.anchor;
 
-import java.util.WeakHashMap;
+import java.util.HashSet;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -28,6 +26,8 @@ import org.apache.nutch.indexer.Indexing
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.NutchDocument;
 import org.apache.nutch.parse.Parse;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Indexing filter that indexes all inbound anchor text for a document. 
@@ -56,19 +56,19 @@ public class AnchorIndexingFilter
     String[] anchors = (inlinks != null ? inlinks.getAnchors()
       : new String[0]);
 
-    // https://issues.apache.org/jira/browse/NUTCH-1037
-    WeakHashMap<String,Integer> map = new WeakHashMap<String,Integer>();
+    HashSet<String> set = null;
 
     for (int i = 0; i < anchors.length; i++) {
       if (deduplicate) {
+        if (set == null) set = new HashSet<String>();
         String lcAnchor = anchors[i].toLowerCase();
 
         // Check if already processed the current anchor
-        if (!map.containsKey(lcAnchor)) {
+        if (!set.contains(lcAnchor)) {
           doc.add("anchor", anchors[i]);
 
           // Add to set
-          map.put(lcAnchor, 1);
+          set.add(lcAnchor);
         }
       } else {
         doc.add("anchor", anchors[i]);