You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2011/05/05 16:52:23 UTC
svn commit: r1099830 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/backwards/ lucene/backwards/src/test/org/apache/lucene/index/
lucene/src/java/org/apache/lucene/index/
lucene/src/test/org/apache/lucene/index/ solr/
Author: mikemccand
Date: Thu May 5 14:52:22 2011
New Revision: 1099830
URL: http://svn.apache.org/viewvc?rev=1099830&view=rev
Log:
LUCENE-2897: apply delete-by-term on flushed segment while we flush (still buffer delete-by-terms for past segments) (merged from trunk)
Added:
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FrozenBufferedDeletes.java
- copied unchanged from r1065855, lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FrozenBufferedDeletes.java
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/backwards/ (props changed)
lucene/dev/branches/branch_3x/lucene/backwards/src/test/org/apache/lucene/index/TestIndexWriter.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/IndexWriter.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentMerger.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java
lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java
lucene/dev/branches/branch_3x/solr/ (props changed)
Modified: lucene/dev/branches/branch_3x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/CHANGES.txt?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/CHANGES.txt Thu May 5 14:52:22 2011
@@ -20,6 +20,9 @@ Optimizations
* LUCENE-2990: ArrayUtil/CollectionUtil.*Sort() methods now exit early
on empty or one-element lists/arrays. (Uwe Schindler)
+* LUCENE-2897: Apply deleted terms while flushing a segment. We still
+ buffer deleted terms to later apply to past segments. (Mike McCandless)
+
Bug fixes
* LUCENE-2996: addIndexes(IndexReader) did not flush before adding the new
Modified: lucene/dev/branches/branch_3x/lucene/backwards/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/backwards/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/backwards/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/backwards/src/test/org/apache/lucene/index/TestIndexWriter.java Thu May 5 14:52:22 2011
@@ -18,7 +18,6 @@ package org.apache.lucene.index;
*/
import java.io.ByteArrayOutputStream;
-import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
@@ -2715,7 +2714,7 @@ public class TestIndexWriter extends Luc
count++;
}
}
- assertTrue("flush happened too quickly during " + (doIndexing ? "indexing" : "deleting") + " count=" + count, count > 2500);
+ assertTrue("flush happened too quickly during " + (doIndexing ? "indexing" : "deleting") + " count=" + count, count > 1500);
}
w.close();
dir.close();
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java Thu May 5 14:52:22 2011
@@ -18,21 +18,23 @@ package org.apache.lucene.index;
*/
import java.util.ArrayList;
+import java.util.Iterator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.index.BufferedDeletesStream.QueryAndLimit;
-/** Holds buffered deletes, by docID, term or query for a
- * single segment. This is used to hold buffered pending
- * deletes against the to-be-flushed segment as well as
- * per-segment deletes for each segment in the index. */
+/* Holds buffered deletes, by docID, term or query for a
+ * single segment. This is used to hold buffered pending
+ * deletes against the to-be-flushed segment. Once the
+ * deletes are pushed (on flush in DocumentsWriter), these
+ * deletes are converted to a FrozenDeletes instance. */
// NOTE: we are sync'd by BufferedDeletes, ie, all access to
// instances of this class is via sync'd methods on
@@ -62,13 +64,8 @@ class BufferedDeletes {
undercount (say 24 bytes). Integer is OBJ_HEADER + INT. */
final static int BYTES_PER_DEL_QUERY = 5*RamUsageEstimator.NUM_BYTES_OBJECT_REF + 2*RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 2*RamUsageEstimator.NUM_BYTES_INT + 24;
- // TODO: many of the deletes stored here will map to
- // Integer.MAX_VALUE; we could be more efficient for this
- // case ie use a SortedSet not a SortedMap. But: Java's
- // SortedSet impls are simply backed by a Map so we won't
- // save anything unless we do something custom...
final AtomicInteger numTermDeletes = new AtomicInteger();
- final SortedMap<Term,Integer> terms = new TreeMap<Term,Integer>();
+ final Map<Term,Integer> terms;
final Map<Query,Integer> queries = new HashMap<Query,Integer>();
final List<Integer> docIDs = new ArrayList<Integer>();
@@ -80,6 +77,14 @@ class BufferedDeletes {
long gen;
+ public BufferedDeletes(boolean sortTerms) {
+ if (sortTerms) {
+ terms = new TreeMap<Term,Integer>();
+ } else {
+ terms = new HashMap<Term,Integer>();
+ }
+ }
+
@Override
public String toString() {
if (VERBOSE_DELETES) {
@@ -129,6 +134,26 @@ class BufferedDeletes {
// should already be cleared
}
+ void update(FrozenBufferedDeletes in) {
+ numTermDeletes.addAndGet(in.numTermDeletes);
+ for(Term term : in.terms) {
+ if (!terms.containsKey(term)) {
+ // only incr bytesUsed if this term wasn't already buffered:
+ bytesUsed.addAndGet(BYTES_PER_DEL_TERM);
+ }
+ terms.put(term, MAX_INT);
+ }
+
+ for(int queryIdx=0;queryIdx<in.queries.length;queryIdx++) {
+ final Query query = in.queries[queryIdx];
+ if (!queries.containsKey(query)) {
+ // only incr bytesUsed if this query wasn't already buffered:
+ bytesUsed.addAndGet(BYTES_PER_DEL_QUERY);
+ }
+ queries.put(query, MAX_INT);
+ }
+ }
+
public void addQuery(Query query, int docIDUpto) {
Integer current = queries.put(query, docIDUpto);
// increment bytes used only if the query wasn't added so far.
@@ -161,6 +186,43 @@ class BufferedDeletes {
bytesUsed.addAndGet(BYTES_PER_DEL_TERM + term.text.length() * RamUsageEstimator.NUM_BYTES_CHAR);
}
}
+
+ public Iterable<Term> termsIterable() {
+ return new Iterable<Term>() {
+ // @Override -- not until Java 1.6
+ public Iterator<Term> iterator() {
+ return terms.keySet().iterator();
+ }
+ };
+ }
+
+ public Iterable<QueryAndLimit> queriesIterable() {
+ return new Iterable<QueryAndLimit>() {
+
+ // @Override -- not until Java 1.6
+ public Iterator<QueryAndLimit> iterator() {
+ return new Iterator<QueryAndLimit>() {
+ private final Iterator<Map.Entry<Query,Integer>> iter = queries.entrySet().iterator();
+
+ // @Override -- not until Java 1.6
+ public boolean hasNext() {
+ return iter.hasNext();
+ }
+
+ // @Override -- not until Java 1.6
+ public QueryAndLimit next() {
+ final Map.Entry<Query,Integer> ent = iter.next();
+ return new QueryAndLimit(ent.getKey(), ent.getValue());
+ }
+
+ // @Override -- not until Java 1.6
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ };
+ }
+ };
+ }
void clear() {
terms.clear();
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java Thu May 5 14:52:22 2011
@@ -22,7 +22,6 @@ import java.io.PrintStream;
import java.util.List;
import java.util.ArrayList;
import java.util.Date;
-import java.util.Map.Entry;
import java.util.Comparator;
import java.util.Collections;
import java.util.concurrent.atomic.AtomicInteger;
@@ -51,7 +50,7 @@ import org.apache.lucene.search.Weight;
class BufferedDeletesStream {
// TODO: maybe linked list?
- private final List<BufferedDeletes> deletes = new ArrayList<BufferedDeletes>();
+ private final List<FrozenBufferedDeletes> deletes = new ArrayList<FrozenBufferedDeletes>();
// Starts at 1 so that SegmentInfos that have never had
// deletes applied (whose bufferedDelGen defaults to 0)
@@ -82,13 +81,13 @@ class BufferedDeletesStream {
// Appends a new packet of buffered deletes to the stream,
// setting its generation:
- public synchronized void push(BufferedDeletes packet) {
+ public synchronized void push(FrozenBufferedDeletes packet) {
assert packet.any();
assert checkDeleteStats();
- packet.gen = nextGen++;
+ assert packet.gen < nextGen;
deletes.add(packet);
- numTerms.addAndGet(packet.numTermDeletes.get());
- bytesUsed.addAndGet(packet.bytesUsed.get());
+ numTerms.addAndGet(packet.numTermDeletes);
+ bytesUsed.addAndGet(packet.bytesUsed);
if (infoStream != null) {
message("push deletes " + packet + " delGen=" + packet.gen + " packetCount=" + deletes.size());
}
@@ -181,14 +180,14 @@ class BufferedDeletesStream {
while (infosIDX >= 0) {
//System.out.println("BD: cycle delIDX=" + delIDX + " infoIDX=" + infosIDX);
- final BufferedDeletes packet = delIDX >= 0 ? deletes.get(delIDX) : null;
+ final FrozenBufferedDeletes packet = delIDX >= 0 ? deletes.get(delIDX) : null;
final SegmentInfo info = infos2.get(infosIDX);
final long segGen = info.getBufferedDeletesGen();
if (packet != null && segGen < packet.gen) {
//System.out.println(" coalesce");
if (coalescedDeletes == null) {
- coalescedDeletes = new BufferedDeletes();
+ coalescedDeletes = new BufferedDeletes(true);
}
coalescedDeletes.update(packet);
delIDX--;
@@ -201,25 +200,25 @@ class BufferedDeletesStream {
int delCount = 0;
try {
if (coalescedDeletes != null) {
- delCount += applyDeletes(coalescedDeletes, reader);
+ //System.out.println(" del coalesced");
+ delCount += applyTermDeletes(coalescedDeletes.termsIterable(), reader);
+ delCount += applyQueryDeletes(coalescedDeletes.queriesIterable(), reader);
}
- delCount += applyDeletes(packet, reader);
+ //System.out.println(" del exact");
+ // Don't delete by Term here; DocumentsWriter
+ // already did that on flush:
+ delCount += applyQueryDeletes(packet.queriesIterable(), reader);
} finally {
readerPool.release(reader);
}
anyNewDeletes |= delCount > 0;
- // We've applied doc ids, and they're only applied
- // on the current segment
- bytesUsed.addAndGet(-packet.docIDs.size() * BufferedDeletes.BYTES_PER_DEL_DOCID);
- packet.clearDocIDs();
-
if (infoStream != null) {
message("seg=" + info + " segGen=" + segGen + " segDeletes=[" + packet + "]; coalesced deletes=[" + (coalescedDeletes == null ? "null" : coalescedDeletes) + "] delCount=" + delCount);
}
if (coalescedDeletes == null) {
- coalescedDeletes = new BufferedDeletes();
+ coalescedDeletes = new BufferedDeletes(true);
}
coalescedDeletes.update(packet);
delIDX--;
@@ -235,7 +234,8 @@ class BufferedDeletesStream {
SegmentReader reader = readerPool.get(info, false);
int delCount = 0;
try {
- delCount += applyDeletes(coalescedDeletes, reader);
+ delCount += applyTermDeletes(coalescedDeletes.termsIterable(), reader);
+ delCount += applyQueryDeletes(coalescedDeletes.queriesIterable(), reader);
} finally {
readerPool.release(reader);
}
@@ -300,91 +300,91 @@ class BufferedDeletesStream {
message("pruneDeletes: prune " + count + " packets; " + (deletes.size() - count) + " packets remain");
}
for(int delIDX=0;delIDX<count;delIDX++) {
- final BufferedDeletes packet = deletes.get(delIDX);
- numTerms.addAndGet(-packet.numTermDeletes.get());
+ final FrozenBufferedDeletes packet = deletes.get(delIDX);
+ numTerms.addAndGet(-packet.numTermDeletes);
assert numTerms.get() >= 0;
- bytesUsed.addAndGet(-packet.bytesUsed.get());
+ bytesUsed.addAndGet(-packet.bytesUsed);
assert bytesUsed.get() >= 0;
}
deletes.subList(0, count).clear();
}
}
- private synchronized long applyDeletes(BufferedDeletes deletes, SegmentReader reader) throws IOException {
-
+ // Delete by Term
+ private synchronized long applyTermDeletes(Iterable<Term> termsIter, SegmentReader reader) throws IOException {
long delCount = 0;
-
+
assert checkDeleteTerm(null);
- if (deletes.terms.size() > 0) {
- final TermDocs docs = reader.termDocs();
+ final TermDocs docs = reader.termDocs();
+
+ for (Term term : termsIter) {
- for (Entry<Term,Integer> entry: deletes.terms.entrySet()) {
- Term term = entry.getKey();
- // Since we visit terms sorted, we gain performance
- // by re-using the same TermsEnum and seeking only
- // forwards
- assert checkDeleteTerm(term);
- docs.seek(term);
+ // Since we visit terms sorted, we gain performance
+ // by re-using the same TermsEnum and seeking only
+ // forwards
+ assert checkDeleteTerm(term);
+ docs.seek(term);
- final int limit = entry.getValue();
- while (docs.next()) {
- final int docID = docs.doc();
- if (docID >= limit) {
- break;
- }
- // TODO: we could/should change
- // reader.deleteDocument to return boolean
- // true if it did in fact delete, because here
- // we could be deleting an already-deleted doc
- // which makes this an upper bound:
- delCount++;
+ while (docs.next()) {
+ final int docID = docs.doc();
+ reader.deleteDocument(docID);
+ // TODO: we could/should change
+ // reader.deleteDocument to return boolean
+ // true if it did in fact delete, because here
+ // we could be deleting an already-deleted doc
+ // which makes this an upper bound:
+ delCount++;
- reader.deleteDocument(docID);
- }
}
}
- // Delete by docID
- for (Integer docIdInt : deletes.docIDs) {
- int docID = docIdInt.intValue();
- reader.deleteDocument(docID);
- delCount++;
- }
-
- // Delete by query
- if (deletes.queries.size() > 0) {
- IndexSearcher searcher = new IndexSearcher(reader);
- try {
- for (Entry<Query, Integer> entry : deletes.queries.entrySet()) {
- Query query = entry.getKey();
- int limit = entry.getValue().intValue();
- Weight weight = query.weight(searcher);
- Scorer scorer = weight.scorer(reader, true, false);
- if (scorer != null) {
- while(true) {
- int doc = scorer.nextDoc();
- if (doc >= limit)
- break;
-
- reader.deleteDocument(doc);
- // TODO: we could/should change
- // reader.deleteDocument to return boolean
- // true if it did in fact delete, because here
- // we could be deleting an already-deleted doc
- // which makes this an upper bound:
- delCount++;
- }
+ return delCount;
+ }
+
+ public static class QueryAndLimit {
+ public final Query query;
+ public final int limit;
+ public QueryAndLimit(Query query, int limit) {
+ this.query = query;
+ this.limit = limit;
+ }
+ }
+
+ // Delete by query
+ private synchronized long applyQueryDeletes(Iterable<QueryAndLimit> queriesIter, SegmentReader reader) throws IOException {
+ long delCount = 0;
+ IndexSearcher searcher = new IndexSearcher(reader);
+ try {
+ for (QueryAndLimit ent : queriesIter) {
+ Query query = ent.query;
+ int limit = ent.limit;
+ Weight weight = query.weight(searcher);
+ Scorer scorer = weight.scorer(reader, true, false);
+ if (scorer != null) {
+ while(true) {
+ int doc = scorer.nextDoc();
+ if (doc >= limit)
+ break;
+
+ reader.deleteDocument(doc);
+ // TODO: we could/should change
+ // reader.deleteDocument to return boolean
+ // true if it did in fact delete, because here
+ // we could be deleting an already-deleted doc
+ // which makes this an upper bound:
+ delCount++;
}
}
- } finally {
- searcher.close();
}
+ } finally {
+ searcher.close();
}
return delCount;
}
-
+
// used only by assert
private boolean checkDeleteTerm(Term term) {
if (term != null) {
@@ -398,9 +398,9 @@ class BufferedDeletesStream {
private boolean checkDeleteStats() {
int numTerms2 = 0;
long bytesUsed2 = 0;
- for(BufferedDeletes packet : deletes) {
- numTerms2 += packet.numTermDeletes.get();
- bytesUsed2 += packet.bytesUsed.get();
+ for(FrozenBufferedDeletes packet : deletes) {
+ numTerms2 += packet.numTermDeletes;
+ bytesUsed2 += packet.bytesUsed;
}
assert numTerms2 == numTerms.get(): "numTerms2=" + numTerms2 + " vs " + numTerms.get();
assert bytesUsed2 == bytesUsed.get(): "bytesUsed2=" + bytesUsed2 + " vs " + bytesUsed;
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java Thu May 5 14:52:22 2011
@@ -35,8 +35,10 @@ import org.apache.lucene.store.AlreadyCl
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMFile;
import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.ThreadInterruptedException;
+import org.apache.lucene.util.BitVector;
import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.ThreadInterruptedException;
+
/**
* This class accepts multiple added documents and directly
@@ -132,7 +134,7 @@ final class DocumentsWriter {
private final int maxThreadStates;
// Deletes for our still-in-RAM (to be flushed next) segment
- private BufferedDeletes pendingDeletes = new BufferedDeletes();
+ private BufferedDeletes pendingDeletes = new BufferedDeletes(false);
static class DocState {
DocumentsWriter docWriter;
@@ -323,6 +325,9 @@ final class DocumentsWriter {
return doFlush;
}
+ // TODO: we could check w/ FreqProxTermsWriter: if the
+ // term doesn't exist, don't bother buffering into the
+ // per-DWPT map (but still must go into the global map)
boolean deleteTerm(Term term, boolean skipWait) {
final boolean doFlush = flushControl.waitUpdate(0, 1, skipWait);
synchronized(this) {
@@ -469,17 +474,19 @@ final class DocumentsWriter {
private void pushDeletes(SegmentInfo newSegment, SegmentInfos segmentInfos) {
// Lock order: DW -> BD
+ final long delGen = bufferedDeletesStream.getNextGen();
if (pendingDeletes.any()) {
if (segmentInfos.size() > 0 || newSegment != null) {
+ final FrozenBufferedDeletes packet = new FrozenBufferedDeletes(pendingDeletes, delGen);
if (infoStream != null) {
message("flush: push buffered deletes");
}
- bufferedDeletesStream.push(pendingDeletes);
+ bufferedDeletesStream.push(packet);
if (infoStream != null) {
- message("flush: delGen=" + pendingDeletes.gen);
+ message("flush: delGen=" + packet.gen);
}
if (newSegment != null) {
- newSegment.setBufferedDeletesGen(pendingDeletes.gen);
+ newSegment.setBufferedDeletesGen(packet.gen);
}
} else {
if (infoStream != null) {
@@ -489,9 +496,9 @@ final class DocumentsWriter {
// there are no segments, the deletions cannot
// affect anything.
}
- pendingDeletes = new BufferedDeletes();
+ pendingDeletes.clear();
} else if (newSegment != null) {
- newSegment.setBufferedDeletesGen(bufferedDeletesStream.getNextGen());
+ newSegment.setBufferedDeletesGen(delGen);
}
}
@@ -541,7 +548,19 @@ final class DocumentsWriter {
}
final SegmentWriteState flushState = new SegmentWriteState(infoStream, directory, segment, fieldInfos,
- numDocs, writer.getConfig().getTermIndexInterval());
+ numDocs, writer.getConfig().getTermIndexInterval(),
+ pendingDeletes);
+ // Apply delete-by-docID now (delete-byDocID only
+ // happens when an exception is hit processing that
+ // doc, eg if analyzer has some problem w/ the text):
+ if (pendingDeletes.docIDs.size() > 0) {
+ flushState.deletedDocs = new BitVector(numDocs);
+ for(int delDocID : pendingDeletes.docIDs) {
+ flushState.deletedDocs.set(delDocID);
+ }
+ pendingDeletes.bytesUsed.addAndGet(-pendingDeletes.docIDs.size() * BufferedDeletes.BYTES_PER_DEL_DOCID);
+ pendingDeletes.docIDs.clear();
+ }
newSegment = new SegmentInfo(segment, numDocs, directory, false, true, fieldInfos.hasProx(), false);
@@ -553,10 +572,14 @@ final class DocumentsWriter {
double startMBUsed = bytesUsed()/1024./1024.;
consumer.flush(threads, flushState);
+
newSegment.setHasVectors(flushState.hasVectors);
if (infoStream != null) {
message("new segment has " + (flushState.hasVectors ? "vectors" : "no vectors"));
+ if (flushState.deletedDocs != null) {
+ message("new segment has " + flushState.deletedDocs.count() + " deleted docs");
+ }
message("flushedFiles=" + newSegment.files());
}
@@ -576,6 +599,30 @@ final class DocumentsWriter {
newSegment.setUseCompoundFile(true);
}
+ // Must write deleted docs after the CFS so we don't
+ // slurp the del file into CFS:
+ if (flushState.deletedDocs != null) {
+ final int delCount = flushState.deletedDocs.count();
+ assert delCount > 0;
+ newSegment.setDelCount(delCount);
+ newSegment.advanceDelGen();
+ final String delFileName = newSegment.getDelFileName();
+ boolean success2 = false;
+ try {
+ flushState.deletedDocs.write(directory, delFileName);
+ success2 = true;
+ } finally {
+ if (!success2) {
+ try {
+ directory.deleteFile(delFileName);
+ } catch (Throwable t) {
+ // suppress this so we keep throwing the
+ // original exception
+ }
+ }
+ }
+ }
+
if (infoStream != null) {
message("flush: segment=" + newSegment);
final double newSegmentSizeNoStore = newSegment.sizeInBytes(false)/1024./1024.;
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java Thu May 5 14:52:22 2011
@@ -78,6 +78,14 @@ final class FreqProxFieldMergeState {
return true;
}
+ public String termText() {
+ int upto = textOffset;
+ while(text[upto] != 0xffff) {
+ upto++;
+ }
+ return new String(text, textOffset, upto-textOffset);
+ }
+
public boolean nextDoc() throws IOException {
if (freq.eof()) {
if (postings.lastDocCodes[currentTermID] != -1) {
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java Thu May 5 14:52:22 2011
@@ -24,6 +24,7 @@ import java.util.Collection;
import java.util.Map;
import java.util.ArrayList;
import java.util.List;
+import org.apache.lucene.util.BitVector;
import org.apache.lucene.util.CollectionUtil;
final class FreqProxTermsWriter extends TermsHashConsumer {
@@ -116,7 +117,7 @@ final class FreqProxTermsWriter extends
// If this field has postings then add them to the
// segment
- appendPostings(fields, consumer);
+ appendPostings(fieldName, state, fields, consumer);
for(int i=0;i<fields.length;i++) {
TermsHashPerField perField = fields[i].termsHashPerField;
@@ -142,7 +143,8 @@ final class FreqProxTermsWriter extends
/* Walk through all unique text tokens (Posting
* instances) found in this field and serialize them
* into a single RAM segment. */
- void appendPostings(FreqProxTermsWriterPerField[] fields,
+ void appendPostings(String fieldName, SegmentWriteState state,
+ FreqProxTermsWriterPerField[] fields,
FormatPostingsFieldsConsumer consumer)
throws CorruptIndexException, IOException {
@@ -161,17 +163,31 @@ final class FreqProxTermsWriter extends
}
final FormatPostingsTermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo);
+ final Term protoTerm = new Term(fieldName);
FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];
final boolean currentFieldOmitTermFreqAndPositions = fields[0].fieldInfo.omitTermFreqAndPositions;
+ final Map<Term,Integer> segDeletes;
+ if (state.segDeletes != null && state.segDeletes.terms.size() > 0) {
+ segDeletes = state.segDeletes.terms;
+ } else {
+ segDeletes = null;
+ }
+
+ // TODO: really TermsHashPerField should take over most
+ // of this loop, including merge sort of terms from
+ // multiple threads and interacting with the
+ // TermsConsumer, only calling out to us (passing us the
+ // DocsConsumer) to handle delivery of docs/positions
while(numFields > 0) {
// Get the next term to merge
termStates[0] = mergeStates[0];
int numToMerge = 1;
+ // TODO: pqueue
for(int i=1;i<numFields;i++) {
final char[] text = mergeStates[i].text;
final int textOffset = mergeStates[i].textOffset;
@@ -186,6 +202,18 @@ final class FreqProxTermsWriter extends
final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(termStates[0].text, termStates[0].textOffset);
+ final int delDocLimit;
+ if (segDeletes != null) {
+ final Integer docIDUpto = segDeletes.get(protoTerm.createTerm(termStates[0].termText()));
+ if (docIDUpto != null) {
+ delDocLimit = docIDUpto;
+ } else {
+ delDocLimit = 0;
+ }
+ } else {
+ delDocLimit = 0;
+ }
+
// Now termStates has numToMerge FieldMergeStates
// which all share the same term. Now we must
// interleave the docID streams.
@@ -200,6 +228,28 @@ final class FreqProxTermsWriter extends
final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(minState.docID, termDocFreq);
+ // NOTE: we could check here if the docID was
+ // deleted, and skip it. However, this is somewhat
+ // dangerous because it can yield non-deterministic
+ // behavior since we may see the docID before we see
+ // the term that caused it to be deleted. This
+ // would mean some (but not all) of its postings may
+ // make it into the index, which'd alter the docFreq
+ // for those terms. We could fix this by doing two
+ // passes, ie first sweep marks all del docs, and
+ // 2nd sweep does the real flush, but I suspect
+ // that'd add too much time to flush.
+
+ if (minState.docID < delDocLimit) {
+ // Mark it deleted. TODO: we could also skip
+ // writing its postings; this would be
+ // deterministic (just for this Term's docs).
+ if (state.deletedDocs == null) {
+ state.deletedDocs = new BitVector(state.numDocs);
+ }
+ state.deletedDocs.set(minState.docID);
+ }
+
final ByteSliceReader prox = minState.prox;
// Carefully copy over the prox + payload info,
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/IndexWriter.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/IndexWriter.java Thu May 5 14:52:22 2011
@@ -434,7 +434,7 @@ public class IndexWriter implements Clos
IndexReader getReader(int termInfosIndexDivisor, boolean applyAllDeletes) throws IOException {
ensureOpen();
-
+
final long tStart = System.currentTimeMillis();
if (infoStream != null) {
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java Thu May 5 14:52:22 2011
@@ -643,7 +643,7 @@ public abstract class LogMergePolicy ext
sb.append("maxMergeSizeForOptimize=").append(maxMergeSizeForOptimize).append(", ");
sb.append("calibrateSizeByDeletes=").append(calibrateSizeByDeletes).append(", ");
sb.append("maxMergeDocs=").append(maxMergeDocs).append(", ");
- sb.append("useCompoundFile=").append(useCompoundFile);
+ sb.append("useCompoundFile=").append(useCompoundFile).append(", ");
sb.append("requireContiguousMerge=").append(requireContiguousMerge);
sb.append("]");
return sb.toString();
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentMerger.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentMerger.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentMerger.java Thu May 5 14:52:22 2011
@@ -256,8 +256,7 @@ final class SegmentMerger {
// details.
throw new RuntimeException("mergeFields produced an invalid result: docCount is " + docCount + " but fdx file size is " + fdxFileLength + " file=" + fileName + " file exists?=" + directory.fileExists(fileName) + "; now aborting this merge to prevent index corruption");
- segmentWriteState = new SegmentWriteState(null, directory, segment, fieldInfos, docCount, termIndexInterval);
-
+ segmentWriteState = new SegmentWriteState(null, directory, segment, fieldInfos, docCount, termIndexInterval, null);
return docCount;
}
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java Thu May 5 14:52:22 2011
@@ -20,6 +20,7 @@ package org.apache.lucene.index;
import java.io.PrintStream;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BitVector;
/**
* @lucene.experimental
@@ -32,6 +33,16 @@ public class SegmentWriteState {
public final int numDocs;
public boolean hasVectors;
+ // Deletes to apply while we are flushing the segment. A
+ // Term is enrolled in here if it was deleted at one
+ // point, and it's mapped to the docIDUpto, meaning any
+ // docID < docIDUpto containing this term should be
+ // deleted.
+ public final BufferedDeletes segDeletes;
+
+ // Lazily created:
+ public BitVector deletedDocs;
+
/** Expert: The fraction of terms in the "dictionary" which should be stored
* in RAM. Smaller values use more memory, but make searching slightly
* faster, while larger values use less memory and make searching slightly
@@ -52,8 +63,9 @@ public class SegmentWriteState {
public final int maxSkipLevels = 10;
public SegmentWriteState(PrintStream infoStream, Directory directory, String segmentName, FieldInfos fieldInfos,
- int numDocs, int termIndexInterval) {
+ int numDocs, int termIndexInterval, BufferedDeletes segDeletes) {
this.infoStream = infoStream;
+ this.segDeletes = segDeletes;
this.directory = directory;
this.segmentName = segmentName;
this.fieldInfos = fieldInfos;
Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java Thu May 5 14:52:22 2011
@@ -24,7 +24,6 @@ import java.util.List;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
@@ -152,6 +151,6 @@ public class TestAddIndexes extends Luce
setUpDirs(dir, aux);
IndexWriter writer = newWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
writer.setInfoStream(VERBOSE ? System.out : null);
writer.addIndexes(aux);
// Adds 10 docs, then replaces them with another 10
Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java Thu May 5 14:52:22 2011
@@ -2777,7 +2777,7 @@ public class TestIndexWriter extends Luc
count++;
}
}
- assertTrue("flush happened too quickly during " + (doIndexing ? "indexing" : "deleting") + " count=" + count, count > 2500);
+ assertTrue("flush happened too quickly during " + (doIndexing ? "indexing" : "deleting") + " count=" + count, count > 1500);
}
w.close();
dir.close();
Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java Thu May 5 14:52:22 2011
@@ -21,7 +21,6 @@ import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.IndexSearcher;
@@ -158,8 +157,6 @@ public class TestIndexWriterDelete exten
assertEquals(0, modifier.getSegmentCount());
modifier.commit();
- modifier.commit();
-
IndexReader reader = IndexReader.open(dir, true);
assertEquals(1, reader.numDocs());