You are viewing a plain text version of this content. The canonical link to the original message was not preserved in this plain text copy.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/07/30 14:20:52 UTC

svn commit: r1367064 - in /nutch/branches/2.x: CHANGES.txt src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java

Author: ferdy
Date: Mon Jul 30 12:20:51 2012
New Revision: 1367064

URL: http://svn.apache.org/viewvc?rev=1367064&view=rev
Log:
NUTCH-1441 AnchorIndexingFilter should use plain HashSet

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1367064&r1=1367063&r2=1367064&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Jul 30 12:20:51 2012
@@ -1,6 +1,7 @@
 Nutch Change Log
 
 Release 2.1 - Current Development
+* NUTCH-1441 AnchorIndexingFilter should use plain HashSet (ferdy)
 
 * NUTCH-1417 Remove o.a.n.metadata.Office (lewismc)
 

Modified: nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1367064&r1=1367063&r2=1367064&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Mon Jul 30 12:20:51 2012
@@ -19,17 +19,16 @@ package org.apache.nutch.indexer.anchor;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Map.Entry;
-import java.util.WeakHashMap;
 
 import org.apache.avro.util.Utf8;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.NutchDocument;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.TableUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Indexing filter that indexes all inbound anchor text for a document.
@@ -63,28 +62,27 @@ public class AnchorIndexingFilter implem
   @Override
   public NutchDocument filter(NutchDocument doc, String url, WebPage page)
       throws IndexingException {
-
-    // https://issues.apache.org/jira/browse/NUTCH-1037
-    WeakHashMap<String,Integer> map = new WeakHashMap<String,Integer>();
-
+    HashSet<String> set = null;
+    
     for (Entry<Utf8, Utf8> e : page.getInlinks().entrySet()) {
       String anchor = TableUtil.toString(e.getValue());
 
       if (deduplicate) {
+        if (set == null) set = new HashSet<String>();
         String lcAnchor = anchor.toLowerCase();
 
         // Check if already processed the current anchor
-        if (!map.containsKey(lcAnchor)) {
+        if (!set.contains(lcAnchor)) {
           doc.add("anchor", anchor);
 
-          // Add to map
-          map.put(lcAnchor, 1);
+          // Add to set
+          set.add(lcAnchor);
         }
       } else {
         doc.add("anchor", anchor);
       }
     }
-
+    
     return doc;
   }