You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by do...@apache.org on 2007/07/26 10:44:35 UTC

svn commit: r559754 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/DeleteDuplicates.java src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java

Author: dogacan
Date: Thu Jul 26 01:44:33 2007
New Revision: 559754

URL: http://svn.apache.org/viewvc?view=rev&rev=559754
Log:
NUTCH-525 - DeleteDuplicates generates ArrayIndexOutOfBoundsException when trying to rerun dedup on a segment. Contributed by Vishal Shah.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
    lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=559754&r1=559753&r2=559754
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Jul 26 01:44:33 2007
@@ -99,6 +99,9 @@
 33. NUTCH-516 - Next fetch time is not set when it is a 
     CrawlDatum.STATUS_FETCH_GONE. (Emmanuel Joke via dogacan)
 
+34. NUTCH-525 - DeleteDuplicates generates ArrayIndexOutOfBoundsException 
+    when trying to rerun dedup on a segment. (Vishal Shah via dogacan)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?view=diff&rev=559754&r1=559753&r2=559754
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Thu Jul 26 01:44:33 2007
@@ -182,7 +182,7 @@
           return false;
 
         // skip deleted documents
-        while (indexReader.isDeleted(doc) && doc < maxDoc) doc++;
+        while (doc < maxDoc && indexReader.isDeleted(doc)) doc++;
         if (doc >= maxDoc)
           return false;
 

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java?view=diff&rev=559754&r1=559753&r2=559754
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java Thu Jul 26 01:44:33 2007
@@ -42,6 +42,8 @@
   Path index1;
   Path index2;
   Path index3;
+  Path index4;
+  Path index5;
   
   public void setUp() throws Exception {
     conf = NutchConfiguration.create();
@@ -52,6 +54,8 @@
     index1 = createIndex("index1", true, 1.0f, 10L, false);
     index2 = createIndex("index2", false, 2.0f, 20L, true);
     index3 = createIndex("index3", true, 1.0f, 10L, true);
+    index4 = createSingleDocIndex("index4", 1.0f, 10L);
+    index5 = createSingleDocIndex("index5", 1.0f, 20L);
   }
   
   private Path createIndex(String name, boolean hashDup, float inc, long time, boolean incFirst) throws Exception {
@@ -80,6 +84,20 @@
     return idx;
   }
   
+  private Path createSingleDocIndex(String name, float inc, long time) throws Exception {
+    Path idx = new Path(root, name);
+    Path sub = new Path(idx, "part-0000");
+    Directory dir = FSDirectory.getDirectory(sub.toString());
+    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true);
+    Document doc = makeDoc(name,
+        MD5Hash.digest("1").toString(),
+        "http://www.example.com/1",
+       1.0f + inc, time + 1);
+    writer.addDocument(doc);
+    writer.close();
+    return idx;
+  }
+  
   private Document makeDoc(String segment, String digest, String url, float boost, long time) {
     Document doc = new Document();
     doc.add(new Field("segment", segment, Field.Store.YES, Field.Index.NO));
@@ -171,6 +189,12 @@
       System.out.println(doc);
     }
     reader.close();
+  }
+  
+  public void testRededuplicate() throws Exception {
+    DeleteDuplicates dedup = new DeleteDuplicates(conf);
+    dedup.dedup(new Path[]{index4, index5});
+    dedup.dedup(new Path[]{index4, index5});
   }
   
 }