Posted to commits@nutch.apache.org by ab...@apache.org on 2007/01/11 23:00:52 UTC
svn commit: r495397 - in /lucene/nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/indexer/DeleteDuplicates.java
src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
Author: ab
Date: Thu Jan 11 14:00:51 2007
New Revision: 495397
URL: http://svn.apache.org/viewvc?view=rev&rev=495397
Log:
Fix NUTCH-420 - DeleteDuplicates depended on the order of IndexDoc
processing.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=495397&r1=495396&r2=495397
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Jan 11 14:00:51 2007
@@ -128,6 +128,9 @@
41. Upgrade to Hadoop 0.10.1. (ab)
+42. NUTCH-420 - Fix a bug in DeleteDuplicates where results depended on the
+ order in which IndexDoc-s are processed. (Dogacan Guney via ab)
+
Release 0.8 - 2006-07-25
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?view=diff&rev=495397&r1=495396&r2=495397
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Thu Jan 11 14:00:51 2007
@@ -311,22 +311,25 @@
highest = value;
continue;
}
- if (byScore) {
- if (value.score > highest.score) {
- highest.keep = false;
- LOG.debug("-discard " + highest + ", keep " + value);
- output.collect(highest.url, highest); // delete highest
- highest = value;
- }
+ IndexDoc toDelete = null, toKeep = null;
+ boolean metric = byScore ? (value.score > highest.score) :
+ (value.urlLen < highest.urlLen);
+ if (metric) {
+ toDelete = highest;
+ toKeep = value;
} else {
- if (value.urlLen < highest.urlLen) {
- highest.keep = false;
- LOG.debug("-discard " + highest + ", keep " + value);
- output.collect(highest.url, highest); // delete highest
- highest = value;
- }
+ toDelete = value;
+ toKeep = highest;
}
- }
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("-discard " + toDelete + ", keep " + toKeep);
+ }
+
+ toDelete.keep = false;
+ output.collect(toDelete.url, toDelete);
+ highest = toKeep;
+ }
LOG.debug("-keep " + highest);
// no need to add this - in phase 2 we only process docs to delete them
// highest.keep = true;
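The root cause is visible in the removed lines above: the old loop only ever collected "highest" for deletion, and only when the incoming value won the comparison. A value that lost the comparison was dropped without being marked for deletion, so any duplicate encountered after the best document survived phase 2. The rewritten loop always designates exactly one of the pair as toDelete and collects it, which makes the outcome independent of arrival order. Below is a minimal, self-contained sketch of the corrected selection logic; the Doc class and keepHighestScore method are illustrative stand-ins, not Nutch API, and only the score branch is shown.

import java.util.Arrays;
import java.util.List;

public class DedupSketch {

  // Illustrative stand-in for Nutch's IndexDoc: just the fields the
  // score comparison needs.
  static class Doc {
    final String url;
    final float score;
    Doc(String url, float score) { this.url = url; this.score = score; }
    public String toString() { return url + " (score=" + score + ")"; }
  }

  // Mirrors the fixed reducer body: every pairwise comparison names one
  // winner and one loser, and the loser is always reported for deletion,
  // so the survivor is the same for any arrival order.
  static Doc keepHighestScore(List<Doc> duplicates) {
    Doc highest = null;
    for (Doc value : duplicates) {
      if (highest == null) {
        highest = value;
        continue;
      }
      Doc toDelete, toKeep;
      if (value.score > highest.score) {
        toDelete = highest;
        toKeep = value;
      } else {
        toDelete = value;
        toKeep = highest;
      }
      // Stands in for output.collect(toDelete.url, toDelete).
      System.out.println("-discard " + toDelete + ", keep " + toKeep);
      highest = toKeep;
    }
    return highest;
  }

  public static void main(String[] args) {
    Doc low = new Doc("http://www.example.com/1", 1.0f);
    Doc high = new Doc("http://www.example.com/2", 2.0f);
    // Same survivor regardless of processing order.
    System.out.println(keepHighestScore(Arrays.asList(low, high)));
    System.out.println(keepHighestScore(Arrays.asList(high, low)));
  }
}

Both orderings print a -discard line for http://www.example.com/1 and keep http://www.example.com/2; under the old branch-only logic the second ordering would print nothing, and both documents would remain in the index.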
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java?view=diff&rev=495397&r1=495396&r2=495397
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java Thu Jan 11 14:00:51 2007
@@ -41,6 +41,7 @@
Path root;
Path index1;
Path index2;
+ Path index3;
public void setUp() throws Exception {
conf = NutchConfiguration.create();
@@ -48,11 +49,12 @@
fs = FileSystem.get(conf);
root = new Path("build/test/dedup2-test-" + new Random().nextInt());
// create test indexes
- index1 = createIndex("index1", true, 1.0f, 10L);
- index2 = createIndex("index2", false, 2.0f, 20L);
+ index1 = createIndex("index1", true, 1.0f, 10L, false);
+ index2 = createIndex("index2", false, 2.0f, 20L, true);
+ index3 = createIndex("index3", true, 1.0f, 10L, true);
}
- private Path createIndex(String name, boolean hashDup, float inc, long time) throws Exception {
+ private Path createIndex(String name, boolean hashDup, float inc, long time, boolean incFirst) throws Exception {
Path idx = new Path(root, name);
Path sub = new Path(idx, "part-0000");
Directory dir = FSDirectory.getDirectory(sub.toString(), true);
@@ -60,18 +62,18 @@
Document doc = makeDoc(name,
MD5Hash.digest("1").toString(),
"http://www.example.com/1",
- 1.0f, time);
+ 1.0f + (incFirst ? inc : 0.0f), time);
writer.addDocument(doc);
if (hashDup) {
doc = makeDoc(name,
MD5Hash.digest("1").toString(),
"http://www.example.com/2",
- 1.0f + inc, time + 1);
+ 1.0f + (!incFirst ? inc : 0.0f), time + 1);
} else {
doc = makeDoc(name,
MD5Hash.digest("2").toString(),
"http://www.example.com/1",
- 1.0f + inc, time + 1);
+ 1.0f + (!incFirst ? inc : 0.0f), time + 1);
}
writer.addDocument(doc);
writer.close();
@@ -93,10 +95,10 @@
fs.delete(root);
}
- public void testHashDuplicates() throws Exception {
+ private void hashDuplicatesHelper(Path index, String url) throws Exception {
DeleteDuplicates dedup = new DeleteDuplicates(conf);
- dedup.dedup(new Path[]{index1});
- FsDirectory dir = new FsDirectory(fs, new Path(index1, "part-0000"), false, conf);
+ dedup.dedup(new Path[]{index});
+ FsDirectory dir = new FsDirectory(fs, new Path(index, "part-0000"), false, conf);
IndexReader reader = IndexReader.open(dir);
assertEquals("only one doc left", reader.numDocs(), 1);
for (int i = 0; i < reader.maxDoc(); i++) {
@@ -106,10 +108,15 @@
}
Document doc = reader.document(i);
// make sure we got the right one
- assertEquals("check url", "http://www.example.com/2", doc.get("url"));
+ assertEquals("check url", url, doc.get("url"));
System.out.println(doc);
}
reader.close();
+ }
+
+ public void testHashDuplicates() throws Exception {
+ hashDuplicatesHelper(index1, "http://www.example.com/2");
+ hashDuplicatesHelper(index3, "http://www.example.com/1");
}
public void testUrlDuplicates() throws Exception {
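A note on the fixture change: index1 gives the higher score to the second document, which the old code happened to handle, while the new index3 gives it to the first document, the exact ordering NUTCH-420 describes. Under the old logic the index3 run would leave both documents in the index and fail the "only one doc left" assertion. The new incFirst parameter is what flips which of the pair gets the score bump, as in this condensed view of the setup above (shapes copied from the diff, with comments added for illustration):

// index1: duplicate hashes, higher score on the SECOND doc -> keep .../2
index1 = createIndex("index1", true, 1.0f, 10L, false);
// index3: duplicate hashes, higher score on the FIRST doc -> keep .../1
index3 = createIndex("index3", true, 1.0f, 10L, true);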