You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/07/30 14:20:52 UTC
svn commit: r1367064 - in /nutch/branches/2.x: CHANGES.txt
src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
Author: ferdy
Date: Mon Jul 30 12:20:51 2012
New Revision: 1367064
URL: http://svn.apache.org/viewvc?rev=1367064&view=rev
Log:
NUTCH-1441 AnchorIndexingFilter should use plain HashSet
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1367064&r1=1367063&r2=1367064&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Jul 30 12:20:51 2012
@@ -1,6 +1,7 @@
Nutch Change Log
Release 2.1 - Current Development
+* NUTCH-1441 AnchorIndexingFilter should use plain HashSet (ferdy)
* NUTCH-1417 Remove o.a.n.metadata.Office (lewismc)
Modified: nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1367064&r1=1367063&r2=1367064&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Mon Jul 30 12:20:51 2012
@@ -19,17 +19,16 @@ package org.apache.nutch.indexer.anchor;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map.Entry;
-import java.util.WeakHashMap;
import org.apache.avro.util.Utf8;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.TableUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* Indexing filter that indexes all inbound anchor text for a document.
@@ -63,28 +62,27 @@ public class AnchorIndexingFilter implem
@Override
public NutchDocument filter(NutchDocument doc, String url, WebPage page)
throws IndexingException {
-
- // https://issues.apache.org/jira/browse/NUTCH-1037
- WeakHashMap<String,Integer> map = new WeakHashMap<String,Integer>();
-
+ HashSet<String> set = null;
+
for (Entry<Utf8, Utf8> e : page.getInlinks().entrySet()) {
String anchor = TableUtil.toString(e.getValue());
if (deduplicate) {
+ if (set == null) set = new HashSet<String>();
String lcAnchor = anchor.toLowerCase();
// Check if already processed the current anchor
- if (!map.containsKey(lcAnchor)) {
+ if (!set.contains(lcAnchor)) {
doc.add("anchor", anchor);
- // Add to map
- map.put(lcAnchor, 1);
+ // Add to set
+ set.add(lcAnchor);
}
} else {
doc.add("anchor", anchor);
}
}
-
+
return doc;
}