You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by do...@apache.org on 2007/07/26 10:44:35 UTC
svn commit: r559754 - in /lucene/nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/indexer/DeleteDuplicates.java
src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
Author: dogacan
Date: Thu Jul 26 01:44:33 2007
New Revision: 559754
URL: http://svn.apache.org/viewvc?view=rev&rev=559754
Log:
NUTCH-525 - DeleteDuplicates generates ArrayIndexOutOfBoundsException when trying to rerun dedup on a segment. Contributed by Vishal Shah.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=559754&r1=559753&r2=559754
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Jul 26 01:44:33 2007
@@ -99,6 +99,9 @@
33. NUTCH-516 - Next fetch time is not set when it is a
CrawlDatum.STATUS_FETCH_GONE. (Emmanuel Joke via dogacan)
+34. NUTCH-525 - DeleteDuplicates generates ArrayIndexOutOfBoundsException
+ when trying to rerun dedup on a segment. (Vishal Shah via dogacan)
+
Release 0.9 - 2007-04-02
1. Changed log4j configuration to log to stdout on commandline
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?view=diff&rev=559754&r1=559753&r2=559754
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Thu Jul 26 01:44:33 2007
@@ -182,7 +182,7 @@
return false;
// skip deleted documents
- while (indexReader.isDeleted(doc) && doc < maxDoc) doc++;
+ while (doc < maxDoc && indexReader.isDeleted(doc)) doc++;
if (doc >= maxDoc)
return false;
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java?view=diff&rev=559754&r1=559753&r2=559754
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java Thu Jul 26 01:44:33 2007
@@ -42,6 +42,8 @@
Path index1;
Path index2;
Path index3;
+ Path index4;
+ Path index5;
public void setUp() throws Exception {
conf = NutchConfiguration.create();
@@ -52,6 +54,8 @@
index1 = createIndex("index1", true, 1.0f, 10L, false);
index2 = createIndex("index2", false, 2.0f, 20L, true);
index3 = createIndex("index3", true, 1.0f, 10L, true);
+ index4 = createSingleDocIndex("index4", 1.0f, 10L);
+ index5 = createSingleDocIndex("index5", 1.0f, 20L);
}
private Path createIndex(String name, boolean hashDup, float inc, long time, boolean incFirst) throws Exception {
@@ -80,6 +84,20 @@
return idx;
}
+ private Path createSingleDocIndex(String name, float inc, long time) throws Exception {
+ Path idx = new Path(root, name);
+ Path sub = new Path(idx, "part-0000");
+ Directory dir = FSDirectory.getDirectory(sub.toString());
+ IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true);
+ Document doc = makeDoc(name,
+ MD5Hash.digest("1").toString(),
+ "http://www.example.com/1",
+ 1.0f + inc, time + 1);
+ writer.addDocument(doc);
+ writer.close();
+ return idx;
+ }
+
private Document makeDoc(String segment, String digest, String url, float boost, long time) {
Document doc = new Document();
doc.add(new Field("segment", segment, Field.Store.YES, Field.Index.NO));
@@ -171,6 +189,12 @@
System.out.println(doc);
}
reader.close();
+ }
+
+ public void testRededuplicate() throws Exception {
+ DeleteDuplicates dedup = new DeleteDuplicates(conf);
+ dedup.dedup(new Path[]{index4, index5});
+ dedup.dedup(new Path[]{index4, index5});
}
}