You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2009/07/09 14:44:58 UTC
svn commit: r792532 - in /lucene/java/trunk: CHANGES.txt
src/java/org/apache/lucene/index/BufferedDeletes.java
src/java/org/apache/lucene/index/DocumentsWriter.java
src/java/org/apache/lucene/index/IndexWriter.java
Author: mikemccand
Date: Thu Jul 9 12:44:57 2009
New Revision: 792532
URL: http://svn.apache.org/viewvc?rev=792532&view=rev
Log:
LUCENE-1717: properly account for RAM used by buffered deletes
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/src/java/org/apache/lucene/index/BufferedDeletes.java
lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=792532&r1=792531&r2=792532&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Thu Jul 9 12:44:57 2009
@@ -127,6 +127,9 @@
is failing to close reader/writers. (Brian Groose via Mike
McCandless)
+ 9. LUCENE-1717: Fixed IndexWriter to account for RAM usage of
+ buffered deletions. (Mike McCandless)
+
API Changes
1. LUCENE-1419: Add expert API to set custom indexing chain. This API is
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/BufferedDeletes.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/BufferedDeletes.java?rev=792532&r1=792531&r2=792532&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/BufferedDeletes.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/BufferedDeletes.java Thu Jul 9 12:44:57 2009
@@ -35,6 +35,7 @@
HashMap terms = new HashMap();
HashMap queries = new HashMap();
List docIDs = new ArrayList();
+ long bytesUsed;
// Number of documents a delete term applies to.
final static class Num {
@@ -60,17 +61,21 @@
}
}
-
+ int size() {
+ // We use numTerms not terms.size() intentionally, so
+ // that deletes by the same term multiple times "count",
+ // ie if you ask to flush every 1000 deletes then even
+ // dup'd terms are counted towards that 1000
+ return numTerms + queries.size() + docIDs.size();
+ }
void update(BufferedDeletes in) {
numTerms += in.numTerms;
+ bytesUsed += in.bytesUsed;
terms.putAll(in.terms);
queries.putAll(in.queries);
docIDs.addAll(in.docIDs);
- in.terms.clear();
- in.numTerms = 0;
- in.queries.clear();
- in.docIDs.clear();
+ in.clear();
}
void clear() {
@@ -78,6 +83,11 @@
queries.clear();
docIDs.clear();
numTerms = 0;
+ bytesUsed = 0;
+ }
+
+ void addBytesUsed(long b) {
+ bytesUsed += b;
}
boolean any() {
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=792532&r1=792531&r2=792532&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java Thu Jul 9 12:44:57 2009
@@ -38,6 +38,7 @@
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Constants;
/**
* This class accepts multiple added documents and directly
@@ -887,8 +888,25 @@
}
synchronized boolean deletesFull() {
- return maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH
- && ((deletesInRAM.numTerms + deletesInRAM.queries.size() + deletesInRAM.docIDs.size()) >= maxBufferedDeleteTerms);
+ return (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH &&
+ (deletesInRAM.bytesUsed + deletesFlushed.bytesUsed + numBytesUsed) >= ramBufferSize) ||
+ (maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH &&
+ ((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms));
+ }
+
+ synchronized boolean doApplyDeletes() {
+ // Very similar to deletesFull(), except we don't count
+ // numBytesAlloc, because we are checking whether
+ // deletes (alone) are consuming too many resources now
+ // and thus should be applied. We apply deletes if RAM
+ // usage is > 1/2 of our allowed RAM buffer, to prevent
+ // too-frequent flushing of a long tail of tiny segments
+ // when merges (which always apply deletes) are
+ // infrequent.
+ return (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH &&
+ (deletesInRAM.bytesUsed + deletesFlushed.bytesUsed) >= ramBufferSize/2) ||
+ (maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH &&
+ ((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms));
}
synchronized private boolean timeToFlushDeletes() {
@@ -1015,20 +1033,24 @@
else
num.setNum(docIDUpto);
deletesInRAM.numTerms++;
+
+ deletesInRAM.addBytesUsed(BYTES_PER_DEL_TERM + term.text.length()*CHAR_NUM_BYTE);
}
// Buffer a specific docID for deletion. Currently only
// used when we hit an exception when adding a document
synchronized private void addDeleteDocID(int docID) {
deletesInRAM.docIDs.add(new Integer(flushedDocCount+docID));
+ deletesInRAM.addBytesUsed(BYTES_PER_DEL_DOCID);
}
synchronized private void addDeleteQuery(Query query, int docID) {
deletesInRAM.queries.put(query, new Integer(flushedDocCount + docID));
+ deletesInRAM.addBytesUsed(BYTES_PER_DEL_QUERY);
}
synchronized boolean doBalanceRAM() {
- return ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && !bufferIsFull && (numBytesUsed >= ramBufferSize || numBytesAlloc >= freeTrigger);
+ return ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && !bufferIsFull && (numBytesUsed+deletesInRAM.bytesUsed+deletesFlushed.bytesUsed >= ramBufferSize || numBytesAlloc >= freeTrigger);
}
/** Does the synchronized work to finish/flush the
@@ -1044,7 +1066,6 @@
assert docWriter == null || docWriter.docID == perThread.docState.docID;
-
if (aborting) {
// We are currently aborting, and another thread is
@@ -1109,7 +1130,7 @@
final SkipDocWriter skipDocWriter = new SkipDocWriter();
long getRAMUsed() {
- return numBytesUsed;
+ return numBytesUsed + deletesInRAM.bytesUsed + deletesFlushed.bytesUsed;
}
long numBytesAlloc;
@@ -1137,10 +1158,34 @@
// Coarse estimates used to measure RAM usage of buffered deletes
final static int OBJECT_HEADER_BYTES = 8;
- final static int POINTER_NUM_BYTE = 4;
+ final static int POINTER_NUM_BYTE = Constants.JRE_IS_64BIT ? 8 : 4;
final static int INT_NUM_BYTE = 4;
final static int CHAR_NUM_BYTE = 2;
+ /* Rough logic: HashMap has an array[Entry] w/ varying
+ load factor (say 2 * POINTER). Entry is object w/ Term
+ key, BufferedDeletes.Num val, int hash, Entry next
+ (OBJ_HEADER + 3*POINTER + INT). Term is object w/
+ String field and String text (OBJ_HEADER + 2*POINTER).
+ We don't count Term's field since it's interned.
+ Term's text is String (OBJ_HEADER + 4*INT + POINTER +
+ OBJ_HEADER + string.length*CHAR). BufferedDeletes.num is
+ OBJ_HEADER + INT. */
+
+ final static int BYTES_PER_DEL_TERM = 8*POINTER_NUM_BYTE + 5*OBJECT_HEADER_BYTES + 6*INT_NUM_BYTE;
+
+ /* Rough logic: del docIDs are List<Integer>. Say list
+ allocates ~2X size (2*POINTER). Integer is OBJ_HEADER
+ + int */
+ final static int BYTES_PER_DEL_DOCID = 2*POINTER_NUM_BYTE + OBJECT_HEADER_BYTES + INT_NUM_BYTE;
+
+ /* Rough logic: HashMap has an array[Entry] w/ varying
+ load factor (say 2 * POINTER). Entry is object w/
+ Query key, Integer val, int hash, Entry next
+ (OBJ_HEADER + 3*POINTER + INT). Query we often
+ undercount (say 24 bytes). Integer is OBJ_HEADER + INT. */
+ final static int BYTES_PER_DEL_QUERY = 5*POINTER_NUM_BYTE + 2*OBJECT_HEADER_BYTES + 2*INT_NUM_BYTE + 24;
+
/* Initial chunks size of the shared byte[] blocks used to
store postings data */
final static int BYTE_BLOCK_SHIFT = 15;
@@ -1285,17 +1330,20 @@
// We flush when we've used our target usage
final long flushTrigger = ramBufferSize;
- if (numBytesAlloc > freeTrigger) {
+ final long deletesRAMUsed = deletesInRAM.bytesUsed+deletesFlushed.bytesUsed;
+
+ if (numBytesAlloc+deletesRAMUsed > freeTrigger) {
if (infoStream != null)
message(" RAM: now balance allocations: usedMB=" + toMB(numBytesUsed) +
" vs trigger=" + toMB(flushTrigger) +
" allocMB=" + toMB(numBytesAlloc) +
+ " deletesMB=" + toMB(deletesRAMUsed) +
" vs trigger=" + toMB(freeTrigger) +
" byteBlockFree=" + toMB(byteBlockAllocator.freeByteBlocks.size()*BYTE_BLOCK_SIZE) +
" charBlockFree=" + toMB(freeCharBlocks.size()*CHAR_BLOCK_SIZE*CHAR_NUM_BYTE));
- final long startBytesAlloc = numBytesAlloc;
+ final long startBytesAlloc = numBytesAlloc + deletesRAMUsed;
int iter = 0;
@@ -1305,12 +1353,12 @@
boolean any = true;
- while(numBytesAlloc > freeLevel) {
+ while(numBytesAlloc+deletesRAMUsed > freeLevel) {
synchronized(this) {
if (0 == byteBlockAllocator.freeByteBlocks.size() && 0 == freeCharBlocks.size() && 0 == freeIntBlocks.size() && !any) {
// Nothing else to free -- must flush now.
- bufferIsFull = numBytesUsed > flushTrigger;
+ bufferIsFull = numBytesUsed+deletesRAMUsed > flushTrigger;
if (infoStream != null) {
if (numBytesUsed > flushTrigger)
message(" nothing to free; now set bufferIsFull");
@@ -1345,7 +1393,7 @@
}
if (infoStream != null)
- message(" after free: freedMB=" + nf.format((startBytesAlloc-numBytesAlloc)/1024./1024.) + " usedMB=" + nf.format(numBytesUsed/1024./1024.) + " allocMB=" + nf.format(numBytesAlloc/1024./1024.));
+ message(" after free: freedMB=" + nf.format((startBytesAlloc-numBytesAlloc-deletesRAMUsed)/1024./1024.) + " usedMB=" + nf.format((numBytesUsed+deletesRAMUsed)/1024./1024.) + " allocMB=" + nf.format(numBytesAlloc/1024./1024.));
} else {
// If we have not crossed the 100% mark, but have
@@ -1355,10 +1403,11 @@
// flush.
synchronized(this) {
- if (numBytesUsed > flushTrigger) {
+ if (numBytesUsed+deletesRAMUsed > flushTrigger) {
if (infoStream != null)
message(" RAM: now flush @ usedMB=" + nf.format(numBytesUsed/1024./1024.) +
" allocMB=" + nf.format(numBytesAlloc/1024./1024.) +
+ " deletesMB=" + nf.format(deletesRAMUsed/1024./1024.) +
" triggerMB=" + nf.format(flushTrigger/1024./1024.));
bufferIsFull = true;
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java?rev=792532&r1=792531&r2=792532&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java Thu Jul 9 12:44:57 2009
@@ -1729,17 +1729,28 @@
}
/** Determines the amount of RAM that may be used for
- * buffering added documents before they are flushed as a
- * new Segment. Generally for faster indexing performance
- * it's best to flush by RAM usage instead of document
- * count and use as large a RAM buffer as you can.
+ * buffering added documents and deletions before they are
+ * flushed to the Directory. Generally for faster
+ * indexing performance it's best to flush by RAM usage
+ * instead of document count and use as large a RAM buffer
+ * as you can.
*
* <p>When this is set, the writer will flush whenever
- * buffered documents use this much RAM. Pass in {@link
- * #DISABLE_AUTO_FLUSH} to prevent triggering a flush due
- * to RAM usage. Note that if flushing by document count
- * is also enabled, then the flush will be triggered by
- * whichever comes first.</p>
+ * buffered documents and deletions use this much RAM.
+ * Pass in {@link #DISABLE_AUTO_FLUSH} to prevent
+ * triggering a flush due to RAM usage. Note that if
+ * flushing by document count is also enabled, then the
+ * flush will be triggered by whichever comes first.</p>
+ *
+ <p> <b>NOTE</b>: the accounting of RAM usage for pending
+ * deletions is only approximate. Specifically, if you
+ * delete by Query, Lucene currently has no way to measure
+ the RAM usage of individual Queries, so the accounting
+ * will under-estimate and you should compensate by either
+ * calling commit() periodically yourself, or by using
+ * {@link #setMaxBufferedDeleteTerms} to flush by count
+ * instead of RAM usage (each buffered delete Query counts
+ * as one).
*
* <p> The default value is {@link #DEFAULT_RAM_BUFFER_SIZE_MB}.</p>
*
@@ -4089,7 +4100,10 @@
flushCount++;
- flushDeletes |= docWriter.deletesFull();
+ // If we are flushing because too many deletes
+ // accumulated, then we should apply the deletes to free
+ // RAM:
+ flushDeletes |= docWriter.doApplyDeletes();
// When autoCommit=true we must always flush deletes
// when flushing a segment; otherwise deletes may become