You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/03/21 00:21:00 UTC
svn commit: r387341 - in /lucene/nutch/trunk/src/java/org/apache/nutch/crawl: Inlinks.java LinkDb.java LinkDbReader.java
Author: ab
Date: Mon Mar 20 15:20:56 2006
New Revision: 387341
URL: http://svn.apache.org/viewcvs?rev=387341&view=rev
Log:
Don't allow Inlink duplicates (NUTCH-235).
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java?rev=387341&r1=387340&r2=387341&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java Mon Mar 20 15:20:56 2006
@@ -24,22 +24,23 @@
/** A list of {@link Inlink}s. */
public class Inlinks implements Writable {
- private ArrayList inlinks = new ArrayList(1);
+ private HashSet inlinks = new HashSet(1);
public void add(Inlink inlink) { inlinks.add(inlink); }
public void add(Inlinks inlinks) { this.inlinks.addAll(inlinks.inlinks); }
+ public Iterator iterator() {
+ return this.inlinks.iterator();
+ }
+
public int size() { return inlinks.size(); }
- public Inlink get(int i) { return (Inlink)inlinks.get(i); }
-
public void clear() { inlinks.clear(); }
public void readFields(DataInput in) throws IOException {
int length = in.readInt();
inlinks.clear();
- inlinks.ensureCapacity(length);
for (int i = 0; i < length; i++) {
add(Inlink.read(in));
}
@@ -47,17 +48,19 @@
public void write(DataOutput out) throws IOException {
out.writeInt(inlinks.size());
- for (int i = 0; i < inlinks.size(); i++) {
- ((Writable)inlinks.get(i)).write(out);
+ Iterator it = inlinks.iterator();
+ while (it.hasNext()) {
+ ((Writable)it.next()).write(out);
}
}
public String toString() {
StringBuffer buffer = new StringBuffer();
buffer.append("Inlinks:\n");
- for (int i = 0; i < inlinks.size(); i++) {
+ Iterator it = inlinks.iterator();
+ while (it.hasNext()) {
buffer.append(" ");
- buffer.append(inlinks.get(i));
+ buffer.append(it.next());
buffer.append("\n");
}
return buffer.toString();
@@ -68,8 +71,9 @@
public String[] getAnchors() throws IOException {
HashMap domainToAnchors = new HashMap();
ArrayList results = new ArrayList();
- for (int i = 0; i < inlinks.size(); i++) {
- Inlink inlink = (Inlink)inlinks.get(i);
+ Iterator it = inlinks.iterator();
+ while (it.hasNext()) {
+ Inlink inlink = (Inlink)it.next();
String anchor = inlink.getAnchor();
if (anchor.length() == 0) // skip empty anchors
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=387341&r1=387340&r2=387341&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Mon Mar 20 15:20:56 2006
@@ -117,8 +117,10 @@
}
int end = Math.min(maxInlinks - result.size(), inlinks.size());
- for (int i = 0; i < end; i++) {
- result.add(inlinks.get(i));
+ Iterator it = inlinks.iterator();
+ int i = 0;
+ while(it.hasNext() && i++ < end) {
+ result.add((Inlink)it.next());
}
}
output.collect(key, result);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=387341&r1=387340&r2=387341&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Mon Mar 20 15:20:56 2006
@@ -29,6 +29,7 @@
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import java.util.Iterator;
import java.util.logging.Logger;
/** . */
@@ -112,8 +113,9 @@
if (links == null) {
System.out.println(" - no link information.");
} else {
- for (int i = 0; i < links.size(); i++) {
- System.out.println(links.get(i).toString());
+ Iterator it = links.iterator();
+ while (it.hasNext()) {
+ System.out.println(it.next().toString());
}
}
} else {