You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/01/30 15:21:22 UTC

svn commit: r1440429 - in /lucene/dev/branches/lucene4547/lucene/core/src: java/org/apache/lucene/index/SortedDocValuesWriter.java test/org/apache/lucene/index/Test2BSortedDocValues.java

Author: rmuir
Date: Wed Jan 30 14:21:22 2013
New Revision: 1440429

URL: http://svn.apache.org/viewvc?rev=1440429&view=rev
Log:
OOM prevention with many unique sorted values (still slooooooooooooow)

Modified:
    lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
    lucene/dev/branches/lucene4547/lucene/core/src/test/org/apache/lucene/index/Test2BSortedDocValues.java

Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java?rev=1440429&r1=1440428&r2=1440429&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java Wed Jan 30 14:21:22 2013
@@ -89,6 +89,12 @@ class SortedDocValuesWriter extends DocV
     int ord = hash.add(value);
     if (ord < 0) {
       ord = -ord-1;
+    } else {
+      // reserve additional space for each unique value:
+      // 1. when indexing, when hash is 50% full, rehash() suddenly needs 2*size ints.
+      //    TODO: can this same OOM happen in THPF?
+      // 2. when flushing, we need 1 int per value (slot in the ordMap).
+      iwBytesUsed.addAndGet(2 * RamUsageEstimator.NUM_BYTES_INT);
     }
     
     pending.add(ord);
@@ -120,9 +126,7 @@ class SortedDocValuesWriter extends DocV
 
     final int valueCount = hash.size();
 
-    // nocommit: account for both sortedValues and ordMap as-we-go...
     final int[] sortedValues = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
-    final int sortedValueRamUsage = RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + RamUsageEstimator.NUM_BYTES_INT*valueCount;
     final int[] ordMap = new int[valueCount];
 
     for(int ord=0;ord<valueCount;ord++) {

Modified: lucene/dev/branches/lucene4547/lucene/core/src/test/org/apache/lucene/index/Test2BSortedDocValues.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/test/org/apache/lucene/index/Test2BSortedDocValues.java?rev=1440429&r1=1440428&r2=1440429&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/test/org/apache/lucene/index/Test2BSortedDocValues.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/test/org/apache/lucene/index/Test2BSortedDocValues.java Wed Jan 30 14:21:22 2013
@@ -93,5 +93,67 @@ public class Test2BSortedDocValues exten
     dir.close();
   }
   
-  // TODO: variable, and also Test2BOrds
+  // indexes Integer.MAX_VALUE docs with a fixed binary field
+  // nocommit: this must be some kind of worst case for BytesRefHash / its hash fn... 
+  // or there is some other perf bug...VERY slow!
+  public void test2BOrds() throws Exception {
+    BaseDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BOrds"));
+    if (dir instanceof MockDirectoryWrapper) {
+      ((MockDirectoryWrapper)dir).setThrottling(MockDirectoryWrapper.Throttling.NEVER);
+    }
+    
+    IndexWriter w = new IndexWriter(dir,
+        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
+        .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
+        .setRAMBufferSizeMB(256.0)
+        .setMergeScheduler(new ConcurrentMergeScheduler())
+        .setMergePolicy(newLogMergePolicy(false, 10))
+        .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
+
+    Document doc = new Document();
+    byte bytes[] = new byte[4];
+    BytesRef data = new BytesRef(bytes);
+    SortedDocValuesField dvField = new SortedDocValuesField("dv", data);
+    doc.add(dvField);
+    
+    for (int i = 0; i < Integer.MAX_VALUE; i++) {
+      bytes[0] = (byte)(i >> 24);
+      bytes[1] = (byte)(i >> 16);
+      bytes[2] = (byte)(i >> 8);
+      bytes[3] = (byte) i;
+      w.addDocument(doc);
+      if (i % 100000 == 0) {
+        System.out.println("indexed: " + i);
+        System.out.flush();
+      }
+    }
+    
+    w.forceMerge(1);
+    w.close();
+    
+    System.out.println("verifying...");
+    System.out.flush();
+    
+    DirectoryReader r = DirectoryReader.open(dir);
+    int expectedValue = 0;
+    for (AtomicReaderContext context : r.leaves()) {
+      AtomicReader reader = context.reader();
+      BytesRef scratch = new BytesRef();
+      BinaryDocValues dv = reader.getSortedDocValues("dv");
+      for (int i = 0; i < reader.maxDoc(); i++) {
+        bytes[0] = (byte)(expectedValue >> 24);
+        bytes[1] = (byte)(expectedValue >> 16);
+        bytes[2] = (byte)(expectedValue >> 8);
+        bytes[3] = (byte) expectedValue;
+        dv.get(i, scratch);
+        assertEquals(data, scratch);
+        expectedValue++;
+      }
+    }
+    
+    r.close();
+    dir.close();
+  }
+  
+  // TODO: variable
 }