You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2013/10/03 15:02:30 UTC

svn commit: r1528837 - in /lucene/dev/trunk/lucene/core/src: java/org/apache/lucene/index/ test/org/apache/lucene/index/

Author: shaie
Date: Thu Oct  3 13:02:29 2013
New Revision: 1528837

URL: http://svn.apache.org/r1528837
Log:
LUCENE-5189: fix updates-order and docsWithField bugs

Modified:
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/BufferedDeletes.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/FrozenBufferedDeletes.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/ReadersAndLiveDocs.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestNumericDocValuesUpdates.java

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/BufferedDeletes.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/BufferedDeletes.java?rev=1528837&r1=1528836&r2=1528837&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/BufferedDeletes.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/BufferedDeletes.java Thu Oct  3 13:02:29 2013
@@ -66,29 +66,32 @@ class BufferedDeletes { // TODO (DVU_REN
 
   /* Rough logic: NumericUpdate calculates its actual size,
    * including the update Term and DV field (String). The 
-   * per-term map holds a reference to the update Term, and
+   * per-field map holds a reference to the updated field, and
    * therefore we only account for the object reference and 
    * map space itself. This is incremented when we first see
-   * an update Term.
-   * LinkedHashMap has an array[Entry] w/ varying load factor 
-   * (say 2*POINTER). Entry is an object w/ Term key, Map val, 
-   * int hash, Entry next, Entry before, Entry after (OBJ_HEADER + 5*POINTER + INT).
-   * Term (key) is counted only as POINTER.
-   * Map (val) is counted as OBJ_HEADER, array[Entry] ref + header, 4*INT, 1*FLOAT,
+   * an updated field.
+   * 
+   * HashMap has an array[Entry] w/ varying load
+   * factor (say 2*POINTER). Entry is an object w/ String key, 
+   * LinkedHashMap val, int hash, Entry next (OBJ_HEADER + 3*POINTER + INT).
+   * 
+   * LinkedHashMap (val) is counted as OBJ_HEADER, array[Entry] ref + header, 4*INT, 1*FLOAT,
    * Set (entrySet) (2*OBJ_HEADER + ARRAY_HEADER + 2*POINTER + 4*INT + FLOAT)
    */
-  final static int BYTES_PER_NUMERIC_UPDATE_TERM_ENTRY = 
-      9*RamUsageEstimator.NUM_BYTES_OBJECT_REF + 3*RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 
+  final static int BYTES_PER_NUMERIC_FIELD_ENTRY =
+      7*RamUsageEstimator.NUM_BYTES_OBJECT_REF + 3*RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 
       RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + 5*RamUsageEstimator.NUM_BYTES_INT + RamUsageEstimator.NUM_BYTES_FLOAT;
-  
-  /* Rough logic: Incremented when we see another field for an already updated
-   * Term.
-   * HashMap has an array[Entry] w/ varying load
-   * factor (say 2*POINTER). Entry is an object w/ String key, 
-   * NumericUpdate val, int hash, Entry next (OBJ_HEADER + 3*POINTER + INT).
-   * NumericUpdate returns its own size, and therefore isn't accounted for here.
+      
+  /* Rough logic: Incremented when we see another Term for an already updated
+   * field.
+   * LinkedHashMap has an array[Entry] w/ varying load factor 
+   * (say 2*POINTER). Entry is an object w/ Term key, NumericUpdate val, 
+   * int hash, Entry next, Entry before, Entry after (OBJ_HEADER + 5*POINTER + INT).
+   * 
+   * Term (key) is counted only as POINTER.
+   * NumericUpdate (val) counts its own size and isn't accounted for here.
    */
-  final static int BYTES_PER_NUMERIC_UPDATE_ENTRY = 5*RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + RamUsageEstimator.NUM_BYTES_INT;
+  final static int BYTES_PER_NUMERIC_UPDATE_ENTRY = 7*RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + RamUsageEstimator.NUM_BYTES_INT;
   
   final AtomicInteger numTermDeletes = new AtomicInteger();
   final AtomicInteger numNumericUpdates = new AtomicInteger();
@@ -96,12 +99,14 @@ class BufferedDeletes { // TODO (DVU_REN
   final Map<Query,Integer> queries = new HashMap<Query,Integer>();
   final List<Integer> docIDs = new ArrayList<Integer>();
 
-  // Map<updateTerm,Map<dvField,NumericUpdate>>
-  // LinkedHashMap because we need to preserve the order of the updates. That
-  // is, if two terms update the same document and same DV field, whoever came
-  // in last should win. LHM guarantees we iterate on the map in insertion
-  // order.
-  final Map<Term,Map<String,NumericUpdate>> numericUpdates = new LinkedHashMap<Term,Map<String,NumericUpdate>>();
+  // Map<dvField,Map<updateTerm,NumericUpdate>>
+  // For each field we keep an ordered list of NumericUpdates, key'd by the
+  // update Term. LinkedHashMap guarantees we will later traverse the map in
+  // insertion order (so that if two terms affect the same document, the last
+  // one that came in wins), and helps us detect faster if the same Term is
+  // used to update the same field multiple times (so we later traverse it
+  // only once).
+  final Map<String,LinkedHashMap<Term,NumericUpdate>> numericUpdates = new HashMap<String,LinkedHashMap<Term,NumericUpdate>>();
 
   public static final Integer MAX_INT = Integer.valueOf(Integer.MAX_VALUE);
 
@@ -180,13 +185,13 @@ class BufferedDeletes { // TODO (DVU_REN
   }
  
   public void addNumericUpdate(NumericUpdate update, int docIDUpto) {
-    Map<String,NumericUpdate> termUpdates = numericUpdates.get(update.term);
-    if (termUpdates == null) {
-      termUpdates = new HashMap<String,NumericUpdate>();
-      numericUpdates.put(update.term, termUpdates);
-      bytesUsed.addAndGet(BYTES_PER_NUMERIC_UPDATE_TERM_ENTRY);
+    LinkedHashMap<Term,NumericUpdate> fieldUpdates = numericUpdates.get(update.field);
+    if (fieldUpdates == null) {
+      fieldUpdates = new LinkedHashMap<Term,NumericUpdate>();
+      numericUpdates.put(update.field, fieldUpdates);
+      bytesUsed.addAndGet(BYTES_PER_NUMERIC_FIELD_ENTRY);
     }
-    final NumericUpdate current = termUpdates.get(update.field);
+    final NumericUpdate current = fieldUpdates.get(update.term);
     if (current != null && docIDUpto < current.docIDUpto) {
       // Only record the new number if it's greater than or equal to the current
       // one. This is important because if multiple threads are replacing the
@@ -196,7 +201,12 @@ class BufferedDeletes { // TODO (DVU_REN
     }
 
     update.docIDUpto = docIDUpto;
-    termUpdates.put(update.field, update);
+    // since it's a LinkedHashMap, we must first remove the Term entry so that
+    // it's added last (we're interested in insertion-order).
+    if (current != null) {
+      fieldUpdates.remove(update.term);
+    }
+    fieldUpdates.put(update.term, update);
     numNumericUpdates.incrementAndGet();
     if (current == null) {
       bytesUsed.addAndGet(BYTES_PER_NUMERIC_UPDATE_ENTRY + update.sizeInBytes());

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/FrozenBufferedDeletes.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/FrozenBufferedDeletes.java?rev=1528837&r1=1528836&r2=1528837&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/FrozenBufferedDeletes.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/FrozenBufferedDeletes.java Thu Oct  3 13:02:29 2013
@@ -19,6 +19,7 @@ package org.apache.lucene.index;
 
 import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 
@@ -78,9 +79,13 @@ class FrozenBufferedDeletes { // TODO (D
       upto++;
     }
 
+    // TODO if a Term affects multiple fields, we could keep the updates key'd by Term
+    // so that it maps to all fields it affects, sorted by their docUpto, and traverse
+    // that Term only once, applying the update to all fields that still need to be
+    // updated. 
     List<NumericUpdate> allUpdates = new ArrayList<NumericUpdate>();
     int numericUpdatesSize = 0;
-    for (Map<String,NumericUpdate> fieldUpdates : deletes.numericUpdates.values()) {
+    for (LinkedHashMap<Term,NumericUpdate> fieldUpdates : deletes.numericUpdates.values()) {
       for (NumericUpdate update : fieldUpdates.values()) {
         allUpdates.add(update);
         numericUpdatesSize += update.sizeInBytes();

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/ReadersAndLiveDocs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/ReadersAndLiveDocs.java?rev=1528837&r1=1528836&r2=1528837&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/ReadersAndLiveDocs.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/ReadersAndLiveDocs.java Thu Oct  3 13:02:29 2013
@@ -409,6 +409,7 @@ class ReadersAndLiveDocs { // TODO (DVU_
               fieldsConsumer.addNumericField(fieldInfo, new Iterable<Number>() {
                 @SuppressWarnings("synthetic-access")
                 final NumericDocValues currentValues = reader.getNumericDocValues(field);
+                final Bits docsWithField = reader.getDocsWithField(field);
                 @Override
                 public Iterator<Number> iterator() {
                   return new Iterator<Number>() {
@@ -429,7 +430,10 @@ class ReadersAndLiveDocs { // TODO (DVU_
                       }
                       Long updatedValue = updates.get(curDoc);
                       if (updatedValue == null) {
-                        updatedValue = Long.valueOf(currentValues.get(curDoc));
+                        // only read the current value if the document had a value before
+                        if (currentValues != null && docsWithField.get(curDoc)) {
+                          updatedValue = currentValues.get(curDoc);
+                        }
                       } else if (updatedValue == NumericUpdate.MISSING) {
                         updatedValue = null;
                       }

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestNumericDocValuesUpdates.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestNumericDocValuesUpdates.java?rev=1528837&r1=1528836&r2=1528837&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestNumericDocValuesUpdates.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestNumericDocValuesUpdates.java Thu Oct  3 13:02:29 2013
@@ -512,7 +512,7 @@ public class TestNumericDocValuesUpdates
   }
   
   @Test
-  public void testUpdateNonDocValueField() throws Exception {
+  public void testUpdateNonNumericDocValuesField() throws Exception {
     // we don't support adding new fields or updating existing non-numeric-dv
     // fields through numeric updates
     Directory dir = newDirectory();
@@ -811,7 +811,10 @@ public class TestNumericDocValuesUpdates
     // first segment with NDV
     Document doc = new Document();
     doc.add(new StringField("id", "doc0", Store.NO));
-    doc.add(new NumericDocValuesField("ndv", 5));
+    doc.add(new NumericDocValuesField("ndv", 3));
+    writer.addDocument(doc);
+    doc = new Document();
+    doc.add(new StringField("id", "doc4", Store.NO)); // document without 'ndv' field
     writer.addDocument(doc);
     writer.commit();
     
@@ -819,9 +822,17 @@ public class TestNumericDocValuesUpdates
     doc = new Document();
     doc.add(new StringField("id", "doc1", Store.NO));
     writer.addDocument(doc);
+    doc = new Document();
+    doc.add(new StringField("id", "doc2", Store.NO)); // document that isn't updated
+    writer.addDocument(doc);
     writer.commit();
     
-    // update document in the second segment
+    // update document in the first segment - should not affect docsWithField of
+    // the document without NDV field
+    writer.updateNumericDocValue(new Term("id", "doc0"), "ndv", 5L);
+    
+    // update document in the second segment - field should be added and we should
+    // be able to handle the other document correctly (e.g. no NPE)
     writer.updateNumericDocValue(new Term("id", "doc1"), "ndv", 5L);
     writer.close();
 
@@ -829,9 +840,12 @@ public class TestNumericDocValuesUpdates
     for (AtomicReaderContext context : reader.leaves()) {
       AtomicReader r = context.reader();
       NumericDocValues ndv = r.getNumericDocValues("ndv");
-      for (int i = 0; i < r.maxDoc(); i++) {
-        assertEquals(5L, ndv.get(i));
-      }
+      Bits docsWithField = r.getDocsWithField("ndv");
+      assertNotNull(docsWithField);
+      assertTrue(docsWithField.get(0));
+      assertEquals(5L, ndv.get(0));
+      assertFalse(docsWithField.get(1));
+      assertEquals(0L, ndv.get(1));
     }
     reader.close();
 
@@ -1236,4 +1250,31 @@ public class TestNumericDocValuesUpdates
     dir.close();
   }
   
+  @Test
+  public void testUpdatesOrder() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    IndexWriter writer = new IndexWriter(dir, conf);
+    
+    Document doc = new Document();
+    doc.add(new StringField("upd", "t1", Store.NO));
+    doc.add(new StringField("upd", "t2", Store.NO));
+    doc.add(new NumericDocValuesField("f1", 1L));
+    doc.add(new NumericDocValuesField("f2", 1L));
+    writer.addDocument(doc);
+    writer.updateNumericDocValue(new Term("upd", "t1"), "f1", 2L); // update f1 to 2
+    writer.updateNumericDocValue(new Term("upd", "t1"), "f2", 2L); // update f2 to 2
+    writer.updateNumericDocValue(new Term("upd", "t2"), "f1", 3L); // update f1 to 3
+    writer.updateNumericDocValue(new Term("upd", "t2"), "f2", 3L); // update f2 to 3
+    writer.updateNumericDocValue(new Term("upd", "t1"), "f1", 4L); // update f1 to 4 (but not f2)
+    writer.close();
+    
+    DirectoryReader reader = DirectoryReader.open(dir);
+    assertEquals(4, reader.leaves().get(0).reader().getNumericDocValues("f1").get(0));
+    assertEquals(3, reader.leaves().get(0).reader().getNumericDocValues("f2").get(0));
+    reader.close();
+    
+    dir.close();
+  }
+  
 }