You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/01/30 15:21:22 UTC
svn commit: r1440429 - in /lucene/dev/branches/lucene4547/lucene/core/src:
java/org/apache/lucene/index/SortedDocValuesWriter.java
test/org/apache/lucene/index/Test2BSortedDocValues.java
Author: rmuir
Date: Wed Jan 30 14:21:22 2013
New Revision: 1440429
URL: http://svn.apache.org/viewvc?rev=1440429&view=rev
Log:
OOM prevention with many unique sorted values (still slooooooooooooow)
Modified:
lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
lucene/dev/branches/lucene4547/lucene/core/src/test/org/apache/lucene/index/Test2BSortedDocValues.java
Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java?rev=1440429&r1=1440428&r2=1440429&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java Wed Jan 30 14:21:22 2013
@@ -89,6 +89,12 @@ class SortedDocValuesWriter extends DocV
int ord = hash.add(value);
if (ord < 0) {
ord = -ord-1;
+ } else {
+ // reserve additional space for each unique value:
+ // 1. when indexing, when hash is 50% full, rehash() suddenly needs 2*size ints.
+ // TODO: can this same OOM happen in THPF?
+ // 2. when flushing, we need 1 int per value (slot in the ordMap).
+ iwBytesUsed.addAndGet(2 * RamUsageEstimator.NUM_BYTES_INT);
}
pending.add(ord);
@@ -120,9 +126,7 @@ class SortedDocValuesWriter extends DocV
final int valueCount = hash.size();
- // nocommit: account for both sortedValues and ordMap as-we-go...
final int[] sortedValues = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
- final int sortedValueRamUsage = RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + RamUsageEstimator.NUM_BYTES_INT*valueCount;
final int[] ordMap = new int[valueCount];
for(int ord=0;ord<valueCount;ord++) {
Modified: lucene/dev/branches/lucene4547/lucene/core/src/test/org/apache/lucene/index/Test2BSortedDocValues.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/test/org/apache/lucene/index/Test2BSortedDocValues.java?rev=1440429&r1=1440428&r2=1440429&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/test/org/apache/lucene/index/Test2BSortedDocValues.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/test/org/apache/lucene/index/Test2BSortedDocValues.java Wed Jan 30 14:21:22 2013
@@ -93,5 +93,67 @@ public class Test2BSortedDocValues exten
dir.close();
}
- // TODO: variable, and also Test2BOrds
+ // indexes Integer.MAX_VALUE docs, each with a distinct 4-byte sorted doc value (the big-endian loop counter), then reads every value back
+ // nocommit: this must be some kind of worst case for BytesRefHash / its hash fn...
+ // or there is some other perf bug...VERY slow!
+ public void test2BOrds() throws Exception {
+ BaseDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BOrds"));
+ if (dir instanceof MockDirectoryWrapper) {
+ ((MockDirectoryWrapper)dir).setThrottling(MockDirectoryWrapper.Throttling.NEVER); // turn off the mock directory's throttling for this test
+ }
+
+ IndexWriter w = new IndexWriter(dir,
+ new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
+ .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH) // no flush trigger based on the buffered doc count
+ .setRAMBufferSizeMB(256.0) // RAM buffer size set to 256 MB
+ .setMergeScheduler(new ConcurrentMergeScheduler())
+ .setMergePolicy(newLogMergePolicy(false, 10))
+ .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
+
+ Document doc = new Document();
+ byte bytes[] = new byte[4]; // 4-byte buffer, rewritten in place for every doc
+ BytesRef data = new BytesRef(bytes); // NOTE(review): the test relies on data reflecting later writes to bytes - presumably BytesRef wraps the array without copying; confirm
+ SortedDocValuesField dvField = new SortedDocValuesField("dv", data);
+ doc.add(dvField); // this one Document instance is what gets added on every iteration below
+
+ for (int i = 0; i < Integer.MAX_VALUE; i++) {
+ bytes[0] = (byte)(i >> 24); // store i most-significant byte first (big-endian)
+ bytes[1] = (byte)(i >> 16);
+ bytes[2] = (byte)(i >> 8);
+ bytes[3] = (byte) i;
+ w.addDocument(doc);
+ if (i % 100000 == 0) { // progress output every 100000 docs
+ System.out.println("indexed: " + i);
+ System.out.flush();
+ }
+ }
+
+ w.forceMerge(1);
+ w.close();
+
+ System.out.println("verifying...");
+ System.out.flush();
+
+ DirectoryReader r = DirectoryReader.open(dir);
+ int expectedValue = 0; // carried across leaves, so the check assumes docs come back in the order they were added
+ for (AtomicReaderContext context : r.leaves()) {
+ AtomicReader reader = context.reader();
+ BytesRef scratch = new BytesRef();
+ BinaryDocValues dv = reader.getSortedDocValues("dv");
+ for (int i = 0; i < reader.maxDoc(); i++) {
+ bytes[0] = (byte)(expectedValue >> 24); // load the expected value into bytes with the same encoding used at index time
+ bytes[1] = (byte)(expectedValue >> 16);
+ bytes[2] = (byte)(expectedValue >> 8);
+ bytes[3] = (byte) expectedValue;
+ dv.get(i, scratch); // read the stored value of doc i into scratch
+ assertEquals(data, scratch);
+ expectedValue++;
+ }
+ }
+
+ r.close();
+ dir.close();
+ }
+
+ // TODO: variable
}