You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2007/07/04 17:16:40 UTC
svn commit: r553236 [5/6] - in /lucene/java/trunk: ./
contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/ docs/
src/java/org/apache/lucene/analysis/ src/java/org/apache/lucene/index/
src/java/org/apache/lucene/store/ src/site/src/documentati...
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfo.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfo.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfo.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfo.java Wed Jul 4 08:16:38 2007
@@ -43,4 +43,9 @@
this.omitNorms = omitNorms;
this.storePayloads = storePayloads;
}
+
+ public Object clone() {
+ return new FieldInfo(name, isIndexed, number, storeTermVector, storePositionWithTermVector,
+ storeOffsetWithTermVector, omitNorms, storePayloads);
+ }
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfos.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfos.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfos.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfos.java Wed Jul 4 08:16:38 2007
@@ -62,6 +62,20 @@
}
}
+ /**
+ * Returns a deep clone of this FieldInfos instance.
+ */
+ public Object clone() {
+ FieldInfos fis = new FieldInfos();
+ final int numField = byNumber.size();
+ for(int i=0;i<numField;i++) {
+ FieldInfo fi = (FieldInfo) ((FieldInfo) byNumber.get(i)).clone();
+ fis.byNumber.add(fi);
+ fis.byName.put(fi.name, fi);
+ }
+ return fis;
+ }
+
/** Adds field info for a Document. */
public void add(Document doc) {
List fields = doc.getFields();
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/FieldsReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FieldsReader.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FieldsReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FieldsReader.java Wed Jul 4 08:16:38 2007
@@ -51,19 +51,39 @@
private int size;
private boolean closed;
+ // The docID offset where our docs begin in the index
+ // file. This will be 0 if we have our own private file.
+ private int docStoreOffset;
+
private ThreadLocal fieldsStreamTL = new ThreadLocal();
FieldsReader(Directory d, String segment, FieldInfos fn) throws IOException {
- this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE);
+ this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE, -1, 0);
}
FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize) throws IOException {
+ this(d, segment, fn, readBufferSize, -1, 0);
+ }
+
+ FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize, int docStoreOffset, int size) throws IOException {
fieldInfos = fn;
cloneableFieldsStream = d.openInput(segment + ".fdt", readBufferSize);
fieldsStream = (IndexInput)cloneableFieldsStream.clone();
indexStream = d.openInput(segment + ".fdx", readBufferSize);
- size = (int) (indexStream.length() / 8);
+
+ if (docStoreOffset != -1) {
+ // We read only a slice out of this shared fields file
+ this.docStoreOffset = docStoreOffset;
+ this.size = size;
+
+ // Verify the file is long enough to hold all of our
+ // docs
+ assert ((int) (indexStream.length()/8)) >= size + this.docStoreOffset;
+ } else {
+ this.docStoreOffset = 0;
+ this.size = (int) (indexStream.length() / 8);
+ }
}
/**
@@ -100,7 +120,7 @@
}
final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
- indexStream.seek(n * 8L);
+ indexStream.seek((n + docStoreOffset) * 8L);
long position = indexStream.readLong();
fieldsStream.seek(position);
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/FieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FieldsWriter.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FieldsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FieldsWriter.java Wed Jul 4 08:16:38 2007
@@ -24,6 +24,7 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.store.IndexOutput;
final class FieldsWriter
@@ -38,15 +39,92 @@
private IndexOutput indexStream;
+ private boolean doClose;
+
FieldsWriter(Directory d, String segment, FieldInfos fn) throws IOException {
fieldInfos = fn;
fieldsStream = d.createOutput(segment + ".fdt");
indexStream = d.createOutput(segment + ".fdx");
+ doClose = true;
+ }
+
+ FieldsWriter(IndexOutput fdx, IndexOutput fdt, FieldInfos fn) throws IOException {
+ fieldInfos = fn;
+ fieldsStream = fdt;
+ indexStream = fdx;
+ doClose = false;
+ }
+
+ // Writes the contents of buffer into the fields stream
+ // and adds a new entry for this document into the index
+ // stream. This assumes the buffer was already written
+ // in the correct fields format.
+ void flushDocument(RAMOutputStream buffer) throws IOException {
+ indexStream.writeLong(fieldsStream.getFilePointer());
+ buffer.writeTo(fieldsStream);
+ }
+
+ void flush() throws IOException {
+ indexStream.flush();
+ fieldsStream.flush();
}
final void close() throws IOException {
+ if (doClose) {
fieldsStream.close();
indexStream.close();
+ }
+ }
+
+ final void writeField(FieldInfo fi, Fieldable field) throws IOException {
+ // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
+ // and field.binaryValue() already returns the compressed value for a field
+ // with isCompressed()==true, so we disable compression in that case
+ boolean disableCompression = (field instanceof FieldsReader.FieldForMerge);
+ fieldsStream.writeVInt(fi.number);
+ byte bits = 0;
+ if (field.isTokenized())
+ bits |= FieldsWriter.FIELD_IS_TOKENIZED;
+ if (field.isBinary())
+ bits |= FieldsWriter.FIELD_IS_BINARY;
+ if (field.isCompressed())
+ bits |= FieldsWriter.FIELD_IS_COMPRESSED;
+
+ fieldsStream.writeByte(bits);
+
+ if (field.isCompressed()) {
+ // compression is enabled for the current field
+ byte[] data = null;
+
+ if (disableCompression) {
+ // optimized case for merging, the data
+ // is already compressed
+ data = field.binaryValue();
+ } else {
+ // check if it is a binary field
+ if (field.isBinary()) {
+ data = compress(field.binaryValue());
+ }
+ else {
+ data = compress(field.stringValue().getBytes("UTF-8"));
+ }
+ }
+ final int len = data.length;
+ fieldsStream.writeVInt(len);
+ fieldsStream.writeBytes(data, len);
+ }
+ else {
+ // compression is disabled for the current field
+ if (field.isBinary()) {
+ byte[] data = field.binaryValue();
+ final int len = data.length;
+ fieldsStream.writeVInt(len);
+ fieldsStream.writeBytes(data, len);
+ }
+ else {
+ fieldsStream.writeString(field.stringValue());
+ }
+ }
}
final void addDocument(Document doc) throws IOException {
@@ -64,57 +142,8 @@
fieldIterator = doc.getFields().iterator();
while (fieldIterator.hasNext()) {
Fieldable field = (Fieldable) fieldIterator.next();
- // if the field as an instanceof FieldsReader.FieldForMerge, we're in merge mode
- // and field.binaryValue() already returns the compressed value for a field
- // with isCompressed()==true, so we disable compression in that case
- boolean disableCompression = (field instanceof FieldsReader.FieldForMerge);
- if (field.isStored()) {
- fieldsStream.writeVInt(fieldInfos.fieldNumber(field.name()));
-
- byte bits = 0;
- if (field.isTokenized())
- bits |= FieldsWriter.FIELD_IS_TOKENIZED;
- if (field.isBinary())
- bits |= FieldsWriter.FIELD_IS_BINARY;
- if (field.isCompressed())
- bits |= FieldsWriter.FIELD_IS_COMPRESSED;
-
- fieldsStream.writeByte(bits);
-
- if (field.isCompressed()) {
- // compression is enabled for the current field
- byte[] data = null;
-
- if (disableCompression) {
- // optimized case for merging, the data
- // is already compressed
- data = field.binaryValue();
- } else {
- // check if it is a binary field
- if (field.isBinary()) {
- data = compress(field.binaryValue());
- }
- else {
- data = compress(field.stringValue().getBytes("UTF-8"));
- }
- }
- final int len = data.length;
- fieldsStream.writeVInt(len);
- fieldsStream.writeBytes(data, len);
- }
- else {
- // compression is disabled for the current field
- if (field.isBinary()) {
- byte[] data = field.binaryValue();
- final int len = data.length;
- fieldsStream.writeVInt(len);
- fieldsStream.writeBytes(data, len);
- }
- else {
- fieldsStream.writeString(field.stringValue());
- }
- }
- }
+ if (field.isStored())
+ writeField(fieldInfos.fieldInfo(field.name()), field);
}
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileDeleter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileDeleter.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileDeleter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileDeleter.java Wed Jul 4 08:16:38 2007
@@ -97,6 +97,7 @@
private PrintStream infoStream;
private Directory directory;
private IndexDeletionPolicy policy;
+ private DocumentsWriter docWriter;
void setInfoStream(PrintStream infoStream) {
this.infoStream = infoStream;
@@ -116,10 +117,12 @@
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
- public IndexFileDeleter(Directory directory, IndexDeletionPolicy policy, SegmentInfos segmentInfos, PrintStream infoStream)
+ public IndexFileDeleter(Directory directory, IndexDeletionPolicy policy, SegmentInfos segmentInfos, PrintStream infoStream, DocumentsWriter docWriter)
throws CorruptIndexException, IOException {
+ this.docWriter = docWriter;
this.infoStream = infoStream;
+
this.policy = policy;
this.directory = directory;
@@ -294,7 +297,7 @@
public void checkpoint(SegmentInfos segmentInfos, boolean isCommit) throws IOException {
if (infoStream != null) {
- message("now checkpoint \"" + segmentInfos.getCurrentSegmentFileName() + "\" [isCommit = " + isCommit + "]");
+ message("now checkpoint \"" + segmentInfos.getCurrentSegmentFileName() + "\" [" + segmentInfos.size() + " segments " + "; isCommit = " + isCommit + "]");
}
// Try again now to delete any previously un-deletable
@@ -310,6 +313,8 @@
// Incref the files:
incRef(segmentInfos, isCommit);
+ if (docWriter != null)
+ incRef(docWriter.files());
if (isCommit) {
// Append to our commits list:
@@ -325,9 +330,8 @@
// DecRef old files from the last checkpoint, if any:
int size = lastFiles.size();
if (size > 0) {
- for(int i=0;i<size;i++) {
+ for(int i=0;i<size;i++)
decRef((List) lastFiles.get(i));
- }
lastFiles.clear();
}
@@ -340,6 +344,8 @@
lastFiles.add(segmentInfo.files());
}
}
+ if (docWriter != null)
+ lastFiles.add(docWriter.files());
}
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileNames.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileNames.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileNames.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileNames.java Wed Jul 4 08:16:38 2007
@@ -38,18 +38,54 @@
/** Extension of norms file */
static final String NORMS_EXTENSION = "nrm";
+ /** Extension of freq postings file */
+ static final String FREQ_EXTENSION = "frq";
+
+ /** Extension of prox postings file */
+ static final String PROX_EXTENSION = "prx";
+
+ /** Extension of terms file */
+ static final String TERMS_EXTENSION = "tis";
+
+ /** Extension of terms index file */
+ static final String TERMS_INDEX_EXTENSION = "tii";
+
+ /** Extension of stored fields index file */
+ static final String FIELDS_INDEX_EXTENSION = "fdx";
+
+ /** Extension of stored fields file */
+ static final String FIELDS_EXTENSION = "fdt";
+
+ /** Extension of vectors fields file */
+ static final String VECTORS_FIELDS_EXTENSION = "tvf";
+
+ /** Extension of vectors documents file */
+ static final String VECTORS_DOCUMENTS_EXTENSION = "tvd";
+
+ /** Extension of vectors index file */
+ static final String VECTORS_INDEX_EXTENSION = "tvx";
+
/** Extension of compound file */
static final String COMPOUND_FILE_EXTENSION = "cfs";
+ /** Extension of compound file for doc store files */
+ static final String COMPOUND_FILE_STORE_EXTENSION = "cfx";
+
/** Extension of deletes */
static final String DELETES_EXTENSION = "del";
+ /** Extension of field infos */
+ static final String FIELD_INFOS_EXTENSION = "fnm";
+
/** Extension of plain norms */
static final String PLAIN_NORMS_EXTENSION = "f";
/** Extension of separate norms */
static final String SEPARATE_NORMS_EXTENSION = "s";
+ /** Extension of gen file */
+ static final String GEN_EXTENSION = "gen";
+
/**
* This array contains all filename extensions used by
* Lucene's index files, with two exceptions, namely the
@@ -59,25 +95,72 @@
* filename extension.
*/
static final String INDEX_EXTENSIONS[] = new String[] {
- "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
- "tvx", "tvd", "tvf", "gen", "nrm"
+ COMPOUND_FILE_EXTENSION,
+ FIELD_INFOS_EXTENSION,
+ FIELDS_INDEX_EXTENSION,
+ FIELDS_EXTENSION,
+ TERMS_INDEX_EXTENSION,
+ TERMS_EXTENSION,
+ FREQ_EXTENSION,
+ PROX_EXTENSION,
+ DELETES_EXTENSION,
+ VECTORS_INDEX_EXTENSION,
+ VECTORS_DOCUMENTS_EXTENSION,
+ VECTORS_FIELDS_EXTENSION,
+ GEN_EXTENSION,
+ NORMS_EXTENSION,
+ COMPOUND_FILE_STORE_EXTENSION,
};
/** File extensions that are added to a compound file
* (same as above, minus "del", "gen", "cfs"). */
static final String[] INDEX_EXTENSIONS_IN_COMPOUND_FILE = new String[] {
- "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx",
- "tvx", "tvd", "tvf", "nrm"
+ FIELD_INFOS_EXTENSION,
+ FIELDS_INDEX_EXTENSION,
+ FIELDS_EXTENSION,
+ TERMS_INDEX_EXTENSION,
+ TERMS_EXTENSION,
+ FREQ_EXTENSION,
+ PROX_EXTENSION,
+ VECTORS_INDEX_EXTENSION,
+ VECTORS_DOCUMENTS_EXTENSION,
+ VECTORS_FIELDS_EXTENSION,
+ NORMS_EXTENSION
+ };
+
+ static final String[] STORE_INDEX_EXTENSIONS = new String[] {
+ VECTORS_INDEX_EXTENSION,
+ VECTORS_FIELDS_EXTENSION,
+ VECTORS_DOCUMENTS_EXTENSION,
+ FIELDS_INDEX_EXTENSION,
+ FIELDS_EXTENSION
+ };
+
+ static final String[] NON_STORE_INDEX_EXTENSIONS = new String[] {
+ FIELD_INFOS_EXTENSION,
+ FREQ_EXTENSION,
+ PROX_EXTENSION,
+ TERMS_EXTENSION,
+ TERMS_INDEX_EXTENSION,
+ NORMS_EXTENSION
};
/** File extensions of old-style index files */
static final String COMPOUND_EXTENSIONS[] = new String[] {
- "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
+ FIELD_INFOS_EXTENSION,
+ FREQ_EXTENSION,
+ PROX_EXTENSION,
+ FIELDS_INDEX_EXTENSION,
+ FIELDS_EXTENSION,
+ TERMS_INDEX_EXTENSION,
+ TERMS_EXTENSION
};
/** File extensions for term vector support */
static final String VECTOR_EXTENSIONS[] = new String[] {
- "tvx", "tvd", "tvf"
+ VECTORS_INDEX_EXTENSION,
+ VECTORS_DOCUMENTS_EXTENSION,
+ VECTORS_FIELDS_EXTENSION
};
/**
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexModifier.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexModifier.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexModifier.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexModifier.java Wed Jul 4 08:16:38 2007
@@ -203,7 +203,8 @@
indexWriter = new IndexWriter(directory, analyzer, false);
indexWriter.setInfoStream(infoStream);
indexWriter.setUseCompoundFile(useCompoundFile);
- indexWriter.setMaxBufferedDocs(maxBufferedDocs);
+ if (maxBufferedDocs != 0)
+ indexWriter.setMaxBufferedDocs(maxBufferedDocs);
indexWriter.setMaxFieldLength(maxFieldLength);
indexWriter.setMergeFactor(mergeFactor);
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java Wed Jul 4 08:16:38 2007
@@ -783,7 +783,7 @@
// KeepOnlyLastCommitDeleter:
IndexFileDeleter deleter = new IndexFileDeleter(directory,
deletionPolicy == null ? new KeepOnlyLastCommitDeletionPolicy() : deletionPolicy,
- segmentInfos, null);
+ segmentInfos, null, null);
// Checkpoint the state we are about to change, in
// case we have to roll back:
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java Wed Jul 4 08:16:38 2007
@@ -61,14 +61,19 @@
When finished adding, deleting and updating documents, <a href="#close()"><b>close</b></a> should be called.</p>
<p>These changes are buffered in memory and periodically
- flushed to the {@link Directory} (during the above method calls). A flush is triggered when there are
- enough buffered deletes (see {@link
- #setMaxBufferedDeleteTerms}) or enough added documents
- (see {@link #setMaxBufferedDocs}) since the last flush,
- whichever is sooner. You can also force a flush by
- calling {@link #flush}. When a flush occurs, both pending
- deletes and added documents are flushed to the index. A
- flush may also trigger one or more segment merges.</p>
+ flushed to the {@link Directory} (during the above method
+ calls). A flush is triggered when there are enough
+ buffered deletes (see {@link #setMaxBufferedDeleteTerms})
+ or enough added documents since the last flush, whichever
+ is sooner. For the added documents, flushing is triggered
+ either by RAM usage of the documents (see {@link
+ #setRAMBufferSizeMB}) or the number of added documents
+ (this is the default; see {@link #setMaxBufferedDocs}).
+ For best indexing speed you should flush by RAM usage with
+ a large RAM buffer. You can also force a flush by calling
+ {@link #flush}. When a flush occurs, both pending deletes
+ and added documents are flushed to the index. A flush may
+ also trigger one or more segment merges.</p>
<a name="autoCommit"></a>
<p>The optional <code>autoCommit</code> argument to the
@@ -181,7 +186,20 @@
/**
* Default value is 10. Change using {@link #setMaxBufferedDocs(int)}.
*/
+
public final static int DEFAULT_MAX_BUFFERED_DOCS = 10;
+ /* new merge policy
+ public final static int DEFAULT_MAX_BUFFERED_DOCS = 0;
+ */
+
+ /**
+ * Default value is 0 MB (which means flush only by doc
+ * count). Change using {@link #setRAMBufferSizeMB}.
+ */
+ public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 0.0;
+ /* new merge policy
+ public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 16.0;
+ */
/**
* Default value is 1000. Change using {@link #setMaxBufferedDeleteTerms(int)}.
@@ -224,8 +242,7 @@
private boolean autoCommit = true; // false if we should commit only on close
SegmentInfos segmentInfos = new SegmentInfos(); // the segments
- SegmentInfos ramSegmentInfos = new SegmentInfos(); // the segments in ramDirectory
- private final RAMDirectory ramDirectory = new RAMDirectory(); // for temp segs
+ private DocumentsWriter docWriter;
private IndexFileDeleter deleter;
private Lock writeLock;
@@ -621,11 +638,14 @@
rollbackSegmentInfos = (SegmentInfos) segmentInfos.clone();
}
+ docWriter = new DocumentsWriter(directory, this);
+ docWriter.setInfoStream(infoStream);
+
// Default deleter (for backwards compatibility) is
// KeepOnlyLastCommitDeleter:
deleter = new IndexFileDeleter(directory,
deletionPolicy == null ? new KeepOnlyLastCommitDeletionPolicy() : deletionPolicy,
- segmentInfos, infoStream);
+ segmentInfos, infoStream, docWriter);
} catch (IOException e) {
this.writeLock.release();
@@ -683,31 +703,64 @@
return maxFieldLength;
}
- /** Determines the minimal number of documents required before the buffered
- * in-memory documents are merged and a new Segment is created.
- * Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
- * large value gives faster indexing. At the same time, mergeFactor limits
- * the number of files open in a FSDirectory.
- *
- * <p> The default value is 10.
- *
- * @throws IllegalArgumentException if maxBufferedDocs is smaller than 2
+ /** Determines the minimal number of documents required
+ * before the buffered in-memory documents are flushed as
+ * a new Segment. Large values generally give faster
+ * indexing.
+ *
+ * <p>When this is set, the writer will flush every
+ * maxBufferedDocs added documents and never flush by RAM
+ * usage.</p>
+ *
+ * <p> The default value is {@link #DEFAULT_MAX_BUFFERED_DOCS}
+ * (writer flushes by document count).</p>
+ *
+ * @throws IllegalArgumentException if maxBufferedDocs is
+ * smaller than 2
+ * @see #setRAMBufferSizeMB
*/
public void setMaxBufferedDocs(int maxBufferedDocs) {
ensureOpen();
if (maxBufferedDocs < 2)
throw new IllegalArgumentException("maxBufferedDocs must at least be 2");
- this.minMergeDocs = maxBufferedDocs;
+ docWriter.setMaxBufferedDocs(maxBufferedDocs);
}
/**
- * Returns the number of buffered added documents that will
+ * Returns 0 if this writer is flushing by RAM usage, else
+ * returns the number of buffered added documents that will
* trigger a flush.
* @see #setMaxBufferedDocs
*/
public int getMaxBufferedDocs() {
ensureOpen();
- return minMergeDocs;
+ return docWriter.getMaxBufferedDocs();
+ }
+
+ /** Determines the amount of RAM that may be used for
+ * buffering added documents before they are flushed as a
+ * new Segment. Generally for faster indexing performance
+ * it's best to flush by RAM usage instead of document
+ * count and use as large a RAM buffer as you can.
+ *
+ * <p>When this is set, the writer will flush whenever
+ * buffered documents use this much RAM.</p>
+ *
+ * <p> The default value is {@link #DEFAULT_RAM_BUFFER_SIZE_MB}.</p>
+ */
+ public void setRAMBufferSizeMB(double mb) {
+ if (mb <= 0.0)
+ throw new IllegalArgumentException("ramBufferSize should be > 0.0 MB");
+ docWriter.setRAMBufferSizeMB(mb);
+ }
+
+ /**
+ * Returns 0.0 if this writer is flushing by document
+ * count, else returns the value set by {@link
+ * #setRAMBufferSizeMB}.
+ */
+ public double getRAMBufferSizeMB() {
+ return docWriter.getRAMBufferSizeMB();
}
/**
@@ -788,6 +841,7 @@
public void setInfoStream(PrintStream infoStream) {
ensureOpen();
this.infoStream = infoStream;
+ docWriter.setInfoStream(infoStream);
deleter.setInfoStream(infoStream);
}
@@ -871,7 +925,7 @@
*/
public synchronized void close() throws CorruptIndexException, IOException {
if (!closed) {
- flushRamSegments();
+ flush(true, true);
if (commitPending) {
segmentInfos.write(directory); // now commit changes
@@ -880,18 +934,79 @@
rollbackSegmentInfos = null;
}
- ramDirectory.close();
if (writeLock != null) {
writeLock.release(); // release write lock
writeLock = null;
}
closed = true;
+ docWriter = null;
if(closeDir)
directory.close();
}
}
+ /** Tells the docWriter to close its currently open shared
+ * doc stores (stored fields & vectors files). */
+ private void flushDocStores() throws IOException {
+
+ List files = docWriter.files();
+
+ if (files.size() > 0) {
+ String docStoreSegment;
+
+ boolean success = false;
+ try {
+ docStoreSegment = docWriter.closeDocStore();
+ success = true;
+ } finally {
+ if (!success)
+ docWriter.abort();
+ }
+
+ if (useCompoundFile && docStoreSegment != null) {
+ // Now build compound doc store file
+ checkpoint();
+
+ success = false;
+
+ final int numSegments = segmentInfos.size();
+
+ try {
+ CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, docStoreSegment + "." + IndexFileNames.COMPOUND_FILE_STORE_EXTENSION);
+ final int size = files.size();
+ for(int i=0;i<size;i++)
+ cfsWriter.addFile((String) files.get(i));
+
+ // Perform the merge
+ cfsWriter.close();
+
+ for(int i=0;i<numSegments;i++) {
+ SegmentInfo si = segmentInfos.info(i);
+ if (si.getDocStoreOffset() != -1 &&
+ si.getDocStoreSegment().equals(docStoreSegment))
+ si.setDocStoreIsCompoundFile(true);
+ }
+ checkpoint();
+ success = true;
+ } finally {
+ if (!success) {
+ // Rollback to no compound file
+ for(int i=0;i<numSegments;i++) {
+ SegmentInfo si = segmentInfos.info(i);
+ if (si.getDocStoreOffset() != -1 &&
+ si.getDocStoreSegment().equals(docStoreSegment))
+ si.setDocStoreIsCompoundFile(false);
+ }
+ deleter.refresh();
+ }
+ }
+
+ deleter.checkpoint(segmentInfos, false);
+ }
+ }
+ }
+
/** Release the write lock, if needed. */
protected void finalize() throws Throwable {
try {
@@ -916,11 +1031,10 @@
return analyzer;
}
-
/** Returns the number of documents currently in this index. */
public synchronized int docCount() {
ensureOpen();
- int count = ramSegmentInfos.size();
+ int count = docWriter.getNumDocsInRAM();
for (int i = 0; i < segmentInfos.size(); i++) {
SegmentInfo si = segmentInfos.info(i);
count += si.docCount;
@@ -998,22 +1112,8 @@
*/
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
ensureOpen();
- SegmentInfo newSegmentInfo = buildSingleDocSegment(doc, analyzer);
- synchronized (this) {
- ramSegmentInfos.addElement(newSegmentInfo);
- maybeFlushRamSegments();
- }
- }
-
- SegmentInfo buildSingleDocSegment(Document doc, Analyzer analyzer)
- throws CorruptIndexException, IOException {
- DocumentWriter dw = new DocumentWriter(ramDirectory, analyzer, this);
- dw.setInfoStream(infoStream);
- String segmentName = newRamSegmentName();
- dw.addDocument(segmentName, doc);
- SegmentInfo si = new SegmentInfo(segmentName, 1, ramDirectory, false, false);
- si.setNumFields(dw.getNumFields());
- return si;
+ if (docWriter.addDocument(doc, analyzer))
+ flush(true, false);
}
/**
@@ -1025,7 +1125,7 @@
public synchronized void deleteDocuments(Term term) throws CorruptIndexException, IOException {
ensureOpen();
bufferDeleteTerm(term);
- maybeFlushRamSegments();
+ maybeFlush();
}
/**
@@ -1041,7 +1141,7 @@
for (int i = 0; i < terms.length; i++) {
bufferDeleteTerm(terms[i]);
}
- maybeFlushRamSegments();
+ maybeFlush();
}
/**
@@ -1077,16 +1177,13 @@
public void updateDocument(Term term, Document doc, Analyzer analyzer)
throws CorruptIndexException, IOException {
ensureOpen();
- SegmentInfo newSegmentInfo = buildSingleDocSegment(doc, analyzer);
synchronized (this) {
bufferDeleteTerm(term);
- ramSegmentInfos.addElement(newSegmentInfo);
- maybeFlushRamSegments();
}
- }
-
- final synchronized String newRamSegmentName() {
- return "_ram_" + Integer.toString(ramSegmentInfos.counter++, Character.MAX_RADIX);
+ if (docWriter.addDocument(doc, analyzer))
+ flush(true, false);
+ else
+ maybeFlush();
}
// for test purpose
@@ -1095,8 +1192,8 @@
}
// for test purpose
- final synchronized int getRamSegmentCount(){
- return ramSegmentInfos.size();
+ final synchronized int getNumBufferedDocuments(){
+ return docWriter.getNumDocsInRAM();
}
// for test purpose
@@ -1108,7 +1205,7 @@
}
}
- final synchronized String newSegmentName() {
+ final String newSegmentName() {
return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
}
@@ -1125,17 +1222,10 @@
*/
private int mergeFactor = DEFAULT_MERGE_FACTOR;
- /** Determines the minimal number of documents required before the buffered
- * in-memory documents are merging and a new Segment is created.
- * Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
- * large value gives faster indexing. At the same time, mergeFactor limits
- * the number of files open in a FSDirectory.
- *
- * <p> The default value is {@link #DEFAULT_MAX_BUFFERED_DOCS}.
-
+ /** Determines the amount of RAM that buffered docs may use
+ * before we trigger a flush to the index.
*/
- private int minMergeDocs = DEFAULT_MAX_BUFFERED_DOCS;
-
+ private double ramBufferSize = DEFAULT_RAM_BUFFER_SIZE_MB*1024F*1024F;
/** Determines the largest number of documents ever merged by addDocument().
* Small values (e.g., less than 10,000) are best for interactive indexing,
@@ -1151,6 +1241,7 @@
*/
private PrintStream infoStream = null;
+
private static PrintStream defaultInfoStream = null;
/** Merges all segments together into a single segment,
@@ -1219,16 +1310,16 @@
*/
public synchronized void optimize() throws CorruptIndexException, IOException {
ensureOpen();
- flushRamSegments();
+ flush();
while (segmentInfos.size() > 1 ||
(segmentInfos.size() == 1 &&
(SegmentReader.hasDeletions(segmentInfos.info(0)) ||
SegmentReader.hasSeparateNorms(segmentInfos.info(0)) ||
segmentInfos.info(0).dir != directory ||
(useCompoundFile &&
- (!SegmentReader.usesCompoundFile(segmentInfos.info(0))))))) {
+ !segmentInfos.info(0).getUseCompoundFile())))) {
int minSegment = segmentInfos.size() - mergeFactor;
- mergeSegments(segmentInfos, minSegment < 0 ? 0 : minSegment, segmentInfos.size());
+ mergeSegments(minSegment < 0 ? 0 : minSegment, segmentInfos.size());
}
}
@@ -1245,7 +1336,7 @@
localRollbackSegmentInfos = (SegmentInfos) segmentInfos.clone();
localAutoCommit = autoCommit;
if (localAutoCommit) {
- flushRamSegments();
+ flush();
// Turn off auto-commit during our local transaction:
autoCommit = false;
} else
@@ -1335,16 +1426,18 @@
segmentInfos.clear();
segmentInfos.addAll(rollbackSegmentInfos);
+ docWriter.abort();
+
// Ask deleter to locate unreferenced files & remove
// them:
deleter.checkpoint(segmentInfos, false);
deleter.refresh();
- ramSegmentInfos = new SegmentInfos();
bufferedDeleteTerms.clear();
numBufferedDeleteTerms = 0;
commitPending = false;
+ docWriter.abort();
close();
} else {
@@ -1439,7 +1532,7 @@
for (int base = start; base < segmentInfos.size(); base++) {
int end = Math.min(segmentInfos.size(), base+mergeFactor);
if (end-base > 1) {
- mergeSegments(segmentInfos, base, end);
+ mergeSegments(base, end);
}
}
}
@@ -1479,7 +1572,7 @@
// segments in S may not since they could come from multiple indexes.
// Here is the merge algorithm for addIndexesNoOptimize():
//
- // 1 Flush ram segments.
+ // 1 Flush ram.
// 2 Consider a combined sequence with segments from T followed
// by segments from S (same as current addIndexes(Directory[])).
// 3 Assume the highest level for segments in S is h. Call
@@ -1500,13 +1593,18 @@
// copy a segment, which may cause doc count to change because deleted
// docs are garbage collected.
- // 1 flush ram segments
+ // 1 flush ram
ensureOpen();
- flushRamSegments();
+ flush();
// 2 copy segment infos and find the highest level from dirs
- int startUpperBound = minMergeDocs;
+ int startUpperBound = docWriter.getMaxBufferedDocs();
+
+ /* new merge policy
+ if (startUpperBound == 0)
+ startUpperBound = 10;
+ */
boolean success = false;
@@ -1566,7 +1664,7 @@
// copy those segments from S
for (int i = segmentCount - numSegmentsToCopy; i < segmentCount; i++) {
- mergeSegments(segmentInfos, i, i + 1);
+ mergeSegments(i, i + 1);
}
if (checkNonDecreasingLevels(segmentCount - numSegmentsToCopy)) {
success = true;
@@ -1575,7 +1673,7 @@
}
// invariants do not hold, simply merge those segments
- mergeSegments(segmentInfos, segmentCount - numTailSegments, segmentCount);
+ mergeSegments(segmentCount - numTailSegments, segmentCount);
// maybe merge segments again if necessary
if (segmentInfos.info(segmentInfos.size() - 1).docCount > startUpperBound) {
@@ -1637,7 +1735,8 @@
}
segmentInfos.setSize(0); // pop old infos & add new
- info = new SegmentInfo(mergedName, docCount, directory, false, true);
+ info = new SegmentInfo(mergedName, docCount, directory, false, true,
+ -1, null, false);
segmentInfos.addElement(info);
success = true;
@@ -1720,29 +1819,19 @@
* buffered added documents or buffered deleted terms are
* large enough.
*/
- protected final void maybeFlushRamSegments() throws CorruptIndexException, IOException {
- // A flush is triggered if enough new documents are buffered or
- // if enough delete terms are buffered
- if (ramSegmentInfos.size() >= minMergeDocs || numBufferedDeleteTerms >= maxBufferedDeleteTerms) {
- flushRamSegments();
- }
+ protected final synchronized void maybeFlush() throws CorruptIndexException, IOException {
+ // We only check for flush due to number of buffered
+ // delete terms, because triggering of a flush due to
+ // too many added documents is handled by
+ // DocumentsWriter
+ if (numBufferedDeleteTerms >= maxBufferedDeleteTerms && docWriter.setFlushPending())
+ flush(true, false);
}
- /** Expert: Flushes all RAM-resident segments (buffered documents), then may merge segments. */
- private final synchronized void flushRamSegments() throws CorruptIndexException, IOException {
- flushRamSegments(true);
+ public final synchronized void flush() throws CorruptIndexException, IOException {
+ flush(true, false);
}
-
- /** Expert: Flushes all RAM-resident segments (buffered documents),
- * then may merge segments if triggerMerge==true. */
- protected final synchronized void flushRamSegments(boolean triggerMerge)
- throws CorruptIndexException, IOException {
- if (ramSegmentInfos.size() > 0 || bufferedDeleteTerms.size() > 0) {
- mergeSegments(ramSegmentInfos, 0, ramSegmentInfos.size());
- if (triggerMerge) maybeMergeSegments(minMergeDocs);
- }
- }
-
+
/**
* Flush all in-memory buffered updates (adds and deletes)
* to the Directory.
@@ -1751,9 +1840,158 @@
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
- public final synchronized void flush() throws CorruptIndexException, IOException {
+ public final synchronized void flush(boolean triggerMerge, boolean flushDocStores) throws CorruptIndexException, IOException {
ensureOpen();
- flushRamSegments();
+
+ // Make sure no threads are actively adding a document
+ docWriter.pauseAllThreads();
+
+ try {
+
+ SegmentInfo newSegment = null;
+
+ final int numDocs = docWriter.getNumDocsInRAM();
+
+ // Always flush docs if there are any
+ boolean flushDocs = numDocs > 0;
+
+ // With autoCommit=true we always must flush the doc
+ // stores when we flush
+ flushDocStores |= autoCommit;
+ String docStoreSegment = docWriter.getDocStoreSegment();
+ if (docStoreSegment == null)
+ flushDocStores = false;
+
+ // Always flush deletes if there are any delete terms.
+ // TODO: when autoCommit=false we don't have to flush
+ // deletes with every flushed segment; we can save
+ // CPU/IO by buffering longer & flushing deletes only
+ // when they are full or writer is being closed. We
+ // have to fix the "applyDeletesSelectively" logic to
+ // apply to more than just the last flushed segment
+ boolean flushDeletes = bufferedDeleteTerms.size() > 0;
+
+ if (infoStream != null)
+ infoStream.println(" flush: flushDocs=" + flushDocs +
+ " flushDeletes=" + flushDeletes +
+ " flushDocStores=" + flushDocStores +
+ " numDocs=" + numDocs);
+
+ int docStoreOffset = docWriter.getDocStoreOffset();
+ boolean docStoreIsCompoundFile = false;
+
+ // Check if the doc stores must be separately flushed
+ // because other segments, besides the one we are about
+ // to flush, reference it
+ if (flushDocStores && (!flushDocs || !docWriter.getSegment().equals(docWriter.getDocStoreSegment()))) {
+ // We must separately flush the doc store
+ if (infoStream != null)
+ infoStream.println(" flush shared docStore segment " + docStoreSegment);
+
+ flushDocStores();
+ flushDocStores = false;
+ docStoreIsCompoundFile = useCompoundFile;
+ }
+
+ String segment = docWriter.getSegment();
+
+ if (flushDocs || flushDeletes) {
+
+ SegmentInfos rollback = null;
+
+ if (flushDeletes)
+ rollback = (SegmentInfos) segmentInfos.clone();
+
+ boolean success = false;
+
+ try {
+ if (flushDocs) {
+
+ if (0 == docStoreOffset && flushDocStores) {
+ // This means we are flushing private doc stores
+ // with this segment, so it will not be shared
+ // with other segments
+ assert docStoreSegment != null;
+ assert docStoreSegment.equals(segment);
+ docStoreOffset = -1;
+ docStoreIsCompoundFile = false;
+ docStoreSegment = null;
+ }
+
+ int flushedDocCount = docWriter.flush(flushDocStores);
+
+ newSegment = new SegmentInfo(segment,
+ flushedDocCount,
+ directory, false, true,
+ docStoreOffset, docStoreSegment,
+ docStoreIsCompoundFile);
+ segmentInfos.addElement(newSegment);
+ }
+
+ if (flushDeletes) {
+ // we should be able to change this so we can
+ // buffer deletes longer and then flush them to
+ // multiple flushed segments, when
+ // autoCommit=false
+ applyDeletes(flushDocs);
+ doAfterFlush();
+ }
+
+ checkpoint();
+ success = true;
+ } finally {
+ if (!success) {
+ if (flushDeletes) {
+ // Fully replace the segmentInfos since flushed
+ // deletes could have changed any of the
+ // SegmentInfo instances:
+ segmentInfos.clear();
+ segmentInfos.addAll(rollback);
+ } else {
+ // Remove segment we added, if any:
+ if (newSegment != null &&
+ segmentInfos.size() > 0 &&
+ segmentInfos.info(segmentInfos.size()-1) == newSegment)
+ segmentInfos.remove(segmentInfos.size()-1);
+ }
+ if (flushDocs)
+ docWriter.abort();
+ deleter.checkpoint(segmentInfos, false);
+ deleter.refresh();
+ }
+ }
+
+ deleter.checkpoint(segmentInfos, autoCommit);
+
+ if (flushDocs && useCompoundFile) {
+ success = false;
+ try {
+ docWriter.createCompoundFile(segment);
+ newSegment.setUseCompoundFile(true);
+ checkpoint();
+ success = true;
+ } finally {
+ if (!success) {
+ newSegment.setUseCompoundFile(false);
+ deleter.refresh();
+ }
+ }
+
+ deleter.checkpoint(segmentInfos, autoCommit);
+ }
+
+ /* new merge policy
+ if (0 == docWriter.getMaxBufferedDocs())
+ maybeMergeSegments(mergeFactor * numDocs / 2);
+ else
+ maybeMergeSegments(docWriter.getMaxBufferedDocs());
+ */
+ maybeMergeSegments(docWriter.getMaxBufferedDocs());
+ }
+ } finally {
+ docWriter.clearFlushPending();
+ docWriter.resumeAllThreads();
+ }
}
/** Expert: Return the total size of all index files currently cached in memory.
@@ -1761,15 +1999,15 @@
*/
public final long ramSizeInBytes() {
ensureOpen();
- return ramDirectory.sizeInBytes();
+ return docWriter.getRAMUsed();
}
/** Expert: Return the number of documents whose segments are currently cached in memory.
- * Useful when calling flushRamSegments()
+ * Useful when calling flush()
*/
public final synchronized int numRamDocs() {
ensureOpen();
- return ramSegmentInfos.size();
+ return docWriter.getNumDocsInRAM();
}
/** Incremental segment merger. */
@@ -1777,6 +2015,10 @@
long lowerBound = -1;
long upperBound = startUpperBound;
+ /* new merge policy
+ if (upperBound == 0) upperBound = 10;
+ */
+
while (upperBound < maxMergeDocs) {
int minSegment = segmentInfos.size();
int maxSegment = -1;
@@ -1808,7 +2050,7 @@
while (numSegments >= mergeFactor) {
// merge the leftmost* mergeFactor segments
- int docCount = mergeSegments(segmentInfos, minSegment, minSegment + mergeFactor);
+ int docCount = mergeSegments(minSegment, minSegment + mergeFactor);
numSegments -= mergeFactor;
if (docCount > upperBound) {
@@ -1837,39 +2079,108 @@
* Merges the named range of segments, replacing them in the stack with a
* single segment.
*/
- private final int mergeSegments(SegmentInfos sourceSegments, int minSegment, int end)
+
+ private final int mergeSegments(int minSegment, int end)
throws CorruptIndexException, IOException {
- // We may be called solely because there are deletes
- // pending, in which case doMerge is false:
- boolean doMerge = end > 0;
final String mergedName = newSegmentName();
+
SegmentMerger merger = null;
-
- final List ramSegmentsToDelete = new ArrayList();
-
SegmentInfo newSegment = null;
int mergedDocCount = 0;
- boolean anyDeletes = (bufferedDeleteTerms.size() != 0);
// This is try/finally to make sure merger's readers are closed:
try {
- if (doMerge) {
- if (infoStream != null) infoStream.print("merging segments");
- merger = new SegmentMerger(this, mergedName);
-
- for (int i = minSegment; i < end; i++) {
- SegmentInfo si = sourceSegments.info(i);
- if (infoStream != null)
- infoStream.print(" " + si.name + " (" + si.docCount + " docs)");
- IndexReader reader = SegmentReader.get(si, MERGE_READ_BUFFER_SIZE); // no need to set deleter (yet)
- merger.add(reader);
- if (reader.directory() == this.ramDirectory) {
- ramSegmentsToDelete.add(si);
- }
- }
+ if (infoStream != null) infoStream.print("merging segments");
+
+ // Check whether this merge will allow us to skip
+ // merging the doc stores (stored field & vectors).
+ // This is a very substantial optimization (saves tons
+ // of IO) that can only be applied with
+ // autoCommit=false.
+
+ Directory lastDir = directory;
+ String lastDocStoreSegment = null;
+ boolean mergeDocStores = false;
+ boolean doFlushDocStore = false;
+ int next = -1;
+
+ // Test each segment to be merged
+ for (int i = minSegment; i < end; i++) {
+ SegmentInfo si = segmentInfos.info(i);
+
+ // If it has deletions we must merge the doc stores
+ if (si.hasDeletions())
+ mergeDocStores = true;
+
+ // If it has its own (private) doc stores we must
+ // merge the doc stores
+ if (-1 == si.getDocStoreOffset())
+ mergeDocStores = true;
+
+ // If it has a different doc store segment than
+ // previous segments, we must merge the doc stores
+ String docStoreSegment = si.getDocStoreSegment();
+ if (docStoreSegment == null)
+ mergeDocStores = true;
+ else if (lastDocStoreSegment == null)
+ lastDocStoreSegment = docStoreSegment;
+ else if (!lastDocStoreSegment.equals(docStoreSegment))
+ mergeDocStores = true;
+
+ // Segments' docStoreOffsets must be in-order,
+ // contiguous. For the default merge policy now
+ // this will always be the case but for an arbitrary
+ // merge policy this may not be the case
+ if (-1 == next)
+ next = si.getDocStoreOffset() + si.docCount;
+ else if (next != si.getDocStoreOffset())
+ mergeDocStores = true;
+ else
+ next = si.getDocStoreOffset() + si.docCount;
+
+ // If the segment comes from a different directory
+ // we must merge
+ if (lastDir != si.dir)
+ mergeDocStores = true;
+
+ // If the segment is referencing the current "live"
+ // doc store outputs then we must flush before merging them
+ if (si.getDocStoreOffset() != -1 && si.getDocStoreSegment().equals(docWriter.getDocStoreSegment()))
+ doFlushDocStore = true;
+ }
+
+ final int docStoreOffset;
+ final String docStoreSegment;
+ final boolean docStoreIsCompoundFile;
+ if (mergeDocStores) {
+ docStoreOffset = -1;
+ docStoreSegment = null;
+ docStoreIsCompoundFile = false;
+ } else {
+ SegmentInfo si = segmentInfos.info(minSegment);
+ docStoreOffset = si.getDocStoreOffset();
+ docStoreSegment = si.getDocStoreSegment();
+ docStoreIsCompoundFile = si.getDocStoreIsCompoundFile();
+ }
+
+ if (mergeDocStores && doFlushDocStore)
+ // SegmentMerger intends to merge the doc stores
+ // (stored fields, vectors), and at least one of the
+ // segments to be merged refers to the currently
+ // live doc stores.
+ flushDocStores();
+
+ merger = new SegmentMerger(this, mergedName);
+
+ for (int i = minSegment; i < end; i++) {
+ SegmentInfo si = segmentInfos.info(i);
+ if (infoStream != null)
+ infoStream.print(" " + si.name + " (" + si.docCount + " docs)");
+ IndexReader reader = SegmentReader.get(si, MERGE_READ_BUFFER_SIZE, mergeDocStores); // no need to set deleter (yet)
+ merger.add(reader);
}
SegmentInfos rollback = null;
@@ -1879,65 +2190,32 @@
// if we hit exception when doing the merge:
try {
- if (doMerge) {
- mergedDocCount = merger.merge();
-
- if (infoStream != null) {
- infoStream.println(" into "+mergedName+" ("+mergedDocCount+" docs)");
- }
+ mergedDocCount = merger.merge(mergeDocStores);
- newSegment = new SegmentInfo(mergedName, mergedDocCount,
- directory, false, true);
+ if (infoStream != null) {
+ infoStream.println(" into "+mergedName+" ("+mergedDocCount+" docs)");
}
+
+ newSegment = new SegmentInfo(mergedName, mergedDocCount,
+ directory, false, true,
+ docStoreOffset,
+ docStoreSegment,
+ docStoreIsCompoundFile);
- if (sourceSegments != ramSegmentInfos || anyDeletes) {
- // Now save the SegmentInfo instances that
- // we are replacing:
- rollback = (SegmentInfos) segmentInfos.clone();
- }
+ rollback = (SegmentInfos) segmentInfos.clone();
- if (doMerge) {
- if (sourceSegments == ramSegmentInfos) {
- segmentInfos.addElement(newSegment);
- } else {
- for (int i = end-1; i > minSegment; i--) // remove old infos & add new
- sourceSegments.remove(i);
+ for (int i = end-1; i > minSegment; i--) // remove old infos & add new
+ segmentInfos.remove(i);
- segmentInfos.set(minSegment, newSegment);
- }
- }
+ segmentInfos.set(minSegment, newSegment);
- if (sourceSegments == ramSegmentInfos) {
- maybeApplyDeletes(doMerge);
- doAfterFlush();
- }
-
checkpoint();
success = true;
} finally {
-
- if (success) {
- // The non-ram-segments case is already committed
- // (above), so all the remains for ram segments case
- // is to clear the ram segments:
- if (sourceSegments == ramSegmentInfos) {
- ramSegmentInfos.removeAllElements();
- }
- } else {
-
- // Must rollback so our state matches index:
- if (sourceSegments == ramSegmentInfos && !anyDeletes) {
- // Simple case: newSegment may or may not have
- // been added to the end of our segment infos,
- // so just check & remove if so:
- if (newSegment != null &&
- segmentInfos.size() > 0 &&
- segmentInfos.info(segmentInfos.size()-1) == newSegment) {
- segmentInfos.remove(segmentInfos.size()-1);
- }
- } else if (rollback != null) {
+ if (!success) {
+ if (rollback != null) {
// Rollback the individual SegmentInfo
// instances, but keep original SegmentInfos
// instance (so we don't try to write again the
@@ -1952,16 +2230,13 @@
}
} finally {
// close readers before we attempt to delete now-obsolete segments
- if (doMerge) merger.closeReaders();
+ merger.closeReaders();
}
- // Delete the RAM segments
- deleter.deleteDirect(ramDirectory, ramSegmentsToDelete);
-
// Give deleter a chance to remove files now.
deleter.checkpoint(segmentInfos, autoCommit);
- if (useCompoundFile && doMerge) {
+ if (useCompoundFile) {
boolean success = false;
@@ -1988,19 +2263,23 @@
}
// Called during flush to apply any buffered deletes. If
- // doMerge is true then a new segment was just created and
- // flushed from the ram segments.
- private final void maybeApplyDeletes(boolean doMerge) throws CorruptIndexException, IOException {
+ // flushedNewSegment is true then a new segment was just
+ // created and flushed from RAM, so we will
+ // selectively apply the deletes to that new segment.
+ private final void applyDeletes(boolean flushedNewSegment) throws CorruptIndexException, IOException {
if (bufferedDeleteTerms.size() > 0) {
if (infoStream != null)
infoStream.println("flush " + numBufferedDeleteTerms + " buffered deleted terms on "
+ segmentInfos.size() + " segments.");
- if (doMerge) {
+ if (flushedNewSegment) {
IndexReader reader = null;
try {
- reader = SegmentReader.get(segmentInfos.info(segmentInfos.size() - 1));
+ // Open readers w/o opening the stored fields /
+ // vectors because these files may still be held
+ // open for writing by docWriter
+ reader = SegmentReader.get(segmentInfos.info(segmentInfos.size() - 1), false);
// Apply delete terms to the segment just flushed from ram
// apply appropriately so that a delete term is only applied to
@@ -2018,14 +2297,14 @@
}
int infosEnd = segmentInfos.size();
- if (doMerge) {
+ if (flushedNewSegment) {
infosEnd--;
}
for (int i = 0; i < infosEnd; i++) {
IndexReader reader = null;
try {
- reader = SegmentReader.get(segmentInfos.info(i));
+ reader = SegmentReader.get(segmentInfos.info(i), false);
// Apply delete terms to disk segments
// except the one just flushed from ram.
@@ -2049,7 +2328,12 @@
private final boolean checkNonDecreasingLevels(int start) {
int lowerBound = -1;
- int upperBound = minMergeDocs;
+ int upperBound = docWriter.getMaxBufferedDocs();
+
+ /* new merge policy
+ if (upperBound == 0)
+ upperBound = 10;
+ */
for (int i = segmentInfos.size() - 1; i >= start; i--) {
int docCount = segmentInfos.info(i).docCount;
@@ -2098,10 +2382,11 @@
// well as the disk segments.
private void bufferDeleteTerm(Term term) {
Num num = (Num) bufferedDeleteTerms.get(term);
+ int numDoc = docWriter.getNumDocsInRAM();
if (num == null) {
- bufferedDeleteTerms.put(term, new Num(ramSegmentInfos.size()));
+ bufferedDeleteTerms.put(term, new Num(numDoc));
} else {
- num.setNum(ramSegmentInfos.size());
+ num.setNum(numDoc);
}
numBufferedDeleteTerms++;
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfo.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfo.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfo.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfo.java Wed Jul 4 08:16:38 2007
@@ -65,6 +65,12 @@
private List files; // cached list of files that this segment uses
// in the Directory
+ private int docStoreOffset; // if this segment shares stored fields & vectors, this
+ // offset is where in that file this segment's docs begin
+ private String docStoreSegment; // name used to derive fields/vectors file we share with
+ // other segments
+ private boolean docStoreIsCompoundFile; // whether doc store files are stored in compound file (*.cfx)
+
public SegmentInfo(String name, int docCount, Directory dir) {
this.name = name;
this.docCount = docCount;
@@ -73,13 +79,25 @@
isCompoundFile = CHECK_DIR;
preLockless = true;
hasSingleNormFile = false;
+ docStoreOffset = -1;
+ docStoreSegment = name;
+ docStoreIsCompoundFile = false;
}
public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile) {
+ this(name, docCount, dir, isCompoundFile, hasSingleNormFile, -1, null, false);
+ }
+
+ public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile,
+ int docStoreOffset, String docStoreSegment, boolean docStoreIsCompoundFile) {
this(name, docCount, dir);
this.isCompoundFile = (byte) (isCompoundFile ? YES : NO);
this.hasSingleNormFile = hasSingleNormFile;
preLockless = false;
+ this.docStoreOffset = docStoreOffset;
+ this.docStoreSegment = docStoreSegment;
+ this.docStoreIsCompoundFile = docStoreIsCompoundFile;
+ assert docStoreOffset == -1 || docStoreSegment != null;
}
/**
@@ -92,6 +110,8 @@
dir = src.dir;
preLockless = src.preLockless;
delGen = src.delGen;
+ docStoreOffset = src.docStoreOffset;
+ docStoreIsCompoundFile = src.docStoreIsCompoundFile;
if (src.normGen == null) {
normGen = null;
} else {
@@ -116,6 +136,20 @@
docCount = input.readInt();
if (format <= SegmentInfos.FORMAT_LOCKLESS) {
delGen = input.readLong();
+ if (format <= SegmentInfos.FORMAT_SHARED_DOC_STORE) {
+ docStoreOffset = input.readInt();
+ if (docStoreOffset != -1) {
+ docStoreSegment = input.readString();
+ docStoreIsCompoundFile = (1 == input.readByte());
+ } else {
+ docStoreSegment = name;
+ docStoreIsCompoundFile = false;
+ }
+ } else {
+ docStoreOffset = -1;
+ docStoreSegment = name;
+ docStoreIsCompoundFile = false;
+ }
if (format <= SegmentInfos.FORMAT_SINGLE_NORM_FILE) {
hasSingleNormFile = (1 == input.readByte());
} else {
@@ -138,6 +172,9 @@
isCompoundFile = CHECK_DIR;
preLockless = true;
hasSingleNormFile = false;
+ docStoreOffset = -1;
+ docStoreIsCompoundFile = false;
+ docStoreSegment = null;
}
}
@@ -368,6 +405,28 @@
return dir.fileExists(name + "." + IndexFileNames.COMPOUND_FILE_EXTENSION);
}
}
+
+ int getDocStoreOffset() {
+ return docStoreOffset;
+ }
+
+ boolean getDocStoreIsCompoundFile() {
+ return docStoreIsCompoundFile;
+ }
+
+ void setDocStoreIsCompoundFile(boolean v) {
+ docStoreIsCompoundFile = v;
+ files = null;
+ }
+
+ String getDocStoreSegment() {
+ return docStoreSegment;
+ }
+
+ void setDocStoreOffset(int offset) {
+ docStoreOffset = offset;
+ files = null;
+ }
/**
* Save this segment's info.
@@ -377,6 +436,12 @@
output.writeString(name);
output.writeInt(docCount);
output.writeLong(delGen);
+ output.writeInt(docStoreOffset);
+ if (docStoreOffset != -1) {
+ output.writeString(docStoreSegment);
+ output.writeByte((byte) (docStoreIsCompoundFile ? 1:0));
+ }
+
output.writeByte((byte) (hasSingleNormFile ? 1:0));
if (normGen == null) {
output.writeInt(NO);
@@ -389,6 +454,11 @@
output.writeByte(isCompoundFile);
}
+ private void addIfExists(List files, String fileName) throws IOException {
+ if (dir.fileExists(fileName))
+ files.add(fileName);
+ }
+
/*
* Return all files referenced by this SegmentInfo. The
* returns List is a locally cached List so you should not
@@ -409,13 +479,28 @@
if (useCompoundFile) {
files.add(name + "." + IndexFileNames.COMPOUND_FILE_EXTENSION);
} else {
- for (int i = 0; i < IndexFileNames.INDEX_EXTENSIONS_IN_COMPOUND_FILE.length; i++) {
- String ext = IndexFileNames.INDEX_EXTENSIONS_IN_COMPOUND_FILE[i];
- String fileName = name + "." + ext;
- if (dir.fileExists(fileName)) {
- files.add(fileName);
- }
+ final String[] exts = IndexFileNames.NON_STORE_INDEX_EXTENSIONS;
+ for(int i=0;i<exts.length;i++)
+ addIfExists(files, name + "." + exts[i]);
+ }
+
+ if (docStoreOffset != -1) {
+ // We are sharing doc stores (stored fields, term
+ // vectors) with other segments
+ assert docStoreSegment != null;
+ if (docStoreIsCompoundFile) {
+ files.add(docStoreSegment + "." + IndexFileNames.COMPOUND_FILE_STORE_EXTENSION);
+ } else {
+ final String[] exts = IndexFileNames.STORE_INDEX_EXTENSIONS;
+ for(int i=0;i<exts.length;i++)
+ addIfExists(files, docStoreSegment + "." + exts[i]);
}
+ } else if (!useCompoundFile) {
+ // We are not sharing, and, these files were not
+ // included in the compound file
+ final String[] exts = IndexFileNames.STORE_INDEX_EXTENSIONS;
+ for(int i=0;i<exts.length;i++)
+ addIfExists(files, name + "." + exts[i]);
}
String delFileName = IndexFileNames.fileNameFromGeneration(name, "." + IndexFileNames.DELETES_EXTENSION, delGen);
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfos.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfos.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfos.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfos.java Wed Jul 4 08:16:38 2007
@@ -51,8 +51,12 @@
*/
public static final int FORMAT_SINGLE_NORM_FILE = -3;
+ /** This format allows multiple segments to share a single
+ * set of term vectors and stored fields files. */
+ public static final int FORMAT_SHARED_DOC_STORE = -4;
+
/* This must always point to the most recent file format. */
- private static final int CURRENT_FORMAT = FORMAT_SINGLE_NORM_FILE;
+ private static final int CURRENT_FORMAT = FORMAT_SHARED_DOC_STORE;
public int counter = 0; // used to name new segments
/**
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java Wed Jul 4 08:16:38 2007
@@ -52,6 +52,12 @@
private int mergedDocs;
+ // Whether we should merge doc stores (stored fields and
+ // vectors files). When all segments we are merging
+ // already share the same doc store files, we don't need
+ // to merge the doc stores.
+ private boolean mergeDocStores;
+
/** This ctor used only by test code.
*
* @param dir The Directory to merge the other segments into
@@ -92,18 +98,32 @@
* @throws IOException if there is a low-level IO error
*/
final int merge() throws CorruptIndexException, IOException {
- int value;
-
+ return merge(true);
+ }
+
+ /**
+ * Merges the readers specified by the {@link #add} method
+ * into the directory passed to the constructor.
+ * @param mergeDocStores if false, we will not merge the
+ * stored fields nor vectors files
+ * @return The number of documents that were merged
+ * @throws CorruptIndexException if the index is corrupt
+ * @throws IOException if there is a low-level IO error
+ */
+ final int merge(boolean mergeDocStores) throws CorruptIndexException, IOException {
+
+ this.mergeDocStores = mergeDocStores;
+
mergedDocs = mergeFields();
mergeTerms();
mergeNorms();
- if (fieldInfos.hasVectors())
+ if (mergeDocStores && fieldInfos.hasVectors())
mergeVectors();
return mergedDocs;
}
-
+
/**
* close all IndexReaders that have been added.
* Should not be called before merge().
@@ -126,7 +146,10 @@
// Basic files
for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) {
- files.add(segment + "." + IndexFileNames.COMPOUND_EXTENSIONS[i]);
+ String ext = IndexFileNames.COMPOUND_EXTENSIONS[i];
+ if (mergeDocStores || (!ext.equals(IndexFileNames.FIELDS_EXTENSION) &&
+ !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION)))
+ files.add(segment + "." + ext);
}
// Fieldable norm files
@@ -139,7 +162,7 @@
}
// Vector files
- if (fieldInfos.hasVectors()) {
+ if (fieldInfos.hasVectors() && mergeDocStores) {
for (int i = 0; i < IndexFileNames.VECTOR_EXTENSIONS.length; i++) {
files.add(segment + "." + IndexFileNames.VECTOR_EXTENSIONS[i]);
}
@@ -173,7 +196,20 @@
* @throws IOException if there is a low-level IO error
*/
private final int mergeFields() throws CorruptIndexException, IOException {
- fieldInfos = new FieldInfos(); // merge field names
+
+ if (!mergeDocStores) {
+ // When we are not merging the doc stores, that means
+ // all segments were written as part of a single
+ // autoCommit=false IndexWriter session, so their field
+ // name -> number mappings are the same. So, we start
+ // with the fieldInfos of the last segment in this
+ // case, to keep that numbering.
+ final SegmentReader sr = (SegmentReader) readers.elementAt(readers.size()-1);
+ fieldInfos = (FieldInfos) sr.fieldInfos.clone();
+ } else {
+ fieldInfos = new FieldInfos(); // merge field names
+ }
+
int docCount = 0;
for (int i = 0; i < readers.size(); i++) {
IndexReader reader = (IndexReader) readers.elementAt(i);
@@ -187,30 +223,40 @@
}
fieldInfos.write(directory, segment + ".fnm");
- FieldsWriter fieldsWriter = // merge field values
- new FieldsWriter(directory, segment, fieldInfos);
-
- // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
- // in merge mode, we use this FieldSelector
- FieldSelector fieldSelectorMerge = new FieldSelector() {
- public FieldSelectorResult accept(String fieldName) {
- return FieldSelectorResult.LOAD_FOR_MERGE;
- }
- };
+ if (mergeDocStores) {
+
+ FieldsWriter fieldsWriter = // merge field values
+ new FieldsWriter(directory, segment, fieldInfos);
- try {
- for (int i = 0; i < readers.size(); i++) {
- IndexReader reader = (IndexReader) readers.elementAt(i);
- int maxDoc = reader.maxDoc();
- for (int j = 0; j < maxDoc; j++)
- if (!reader.isDeleted(j)) { // skip deleted docs
- fieldsWriter.addDocument(reader.document(j, fieldSelectorMerge));
- docCount++;
- }
+ // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
+ // in merge mode, we use this FieldSelector
+ FieldSelector fieldSelectorMerge = new FieldSelector() {
+ public FieldSelectorResult accept(String fieldName) {
+ return FieldSelectorResult.LOAD_FOR_MERGE;
+ }
+ };
+
+ try {
+ for (int i = 0; i < readers.size(); i++) {
+ IndexReader reader = (IndexReader) readers.elementAt(i);
+ int maxDoc = reader.maxDoc();
+ for (int j = 0; j < maxDoc; j++)
+ if (!reader.isDeleted(j)) { // skip deleted docs
+ fieldsWriter.addDocument(reader.document(j, fieldSelectorMerge));
+ docCount++;
+ }
+ }
+ } finally {
+ fieldsWriter.close();
}
- } finally {
- fieldsWriter.close();
- }
+
+ } else
+ // If we are skipping the doc stores, that means there
+ // are no deletions in any of these segments, so we
+ // just sum numDocs() of each segment to get total docCount
+ for (int i = 0; i < readers.size(); i++)
+ docCount += ((IndexReader) readers.elementAt(i)).numDocs();
+
return docCount;
}
@@ -355,6 +401,7 @@
for (int i = 0; i < n; i++) {
SegmentMergeInfo smi = smis[i];
TermPositions postings = smi.getPositions();
+ assert postings != null;
int base = smi.base;
int[] docMap = smi.getDocMap();
postings.seek(smi.termEnum);
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java Wed Jul 4 08:16:38 2007
@@ -60,6 +60,7 @@
// Compound File Reader when based on a compound file segment
CompoundFileReader cfsReader = null;
+ CompoundFileReader storeCFSReader = null;
private class Norm {
public Norm(IndexInput in, int number, long normSeek)
@@ -128,7 +129,15 @@
* @throws IOException if there is a low-level IO error
*/
public static SegmentReader get(SegmentInfo si) throws CorruptIndexException, IOException {
- return get(si.dir, si, null, false, false, BufferedIndexInput.BUFFER_SIZE);
+ return get(si.dir, si, null, false, false, BufferedIndexInput.BUFFER_SIZE, true);
+ }
+
+ /**
+ * @throws CorruptIndexException if the index is corrupt
+ * @throws IOException if there is a low-level IO error
+ */
+ public static SegmentReader get(SegmentInfo si, boolean doOpenStores) throws CorruptIndexException, IOException {
+ return get(si.dir, si, null, false, false, BufferedIndexInput.BUFFER_SIZE, doOpenStores);
}
/**
@@ -136,7 +145,15 @@
* @throws IOException if there is a low-level IO error
*/
public static SegmentReader get(SegmentInfo si, int readBufferSize) throws CorruptIndexException, IOException {
- return get(si.dir, si, null, false, false, readBufferSize);
+ return get(si.dir, si, null, false, false, readBufferSize, true);
+ }
+
+ /**
+ * @throws CorruptIndexException if the index is corrupt
+ * @throws IOException if there is a low-level IO error
+ */
+ public static SegmentReader get(SegmentInfo si, int readBufferSize, boolean doOpenStores) throws CorruptIndexException, IOException {
+ return get(si.dir, si, null, false, false, readBufferSize, doOpenStores);
}
/**
@@ -145,7 +162,7 @@
*/
public static SegmentReader get(SegmentInfos sis, SegmentInfo si,
boolean closeDir) throws CorruptIndexException, IOException {
- return get(si.dir, si, sis, closeDir, true, BufferedIndexInput.BUFFER_SIZE);
+ return get(si.dir, si, sis, closeDir, true, BufferedIndexInput.BUFFER_SIZE, true);
}
/**
@@ -157,6 +174,19 @@
boolean closeDir, boolean ownDir,
int readBufferSize)
throws CorruptIndexException, IOException {
+ return get(dir, si, sis, closeDir, ownDir, readBufferSize, true);
+ }
+
+ /**
+ * @throws CorruptIndexException if the index is corrupt
+ * @throws IOException if there is a low-level IO error
+ */
+ public static SegmentReader get(Directory dir, SegmentInfo si,
+ SegmentInfos sis,
+ boolean closeDir, boolean ownDir,
+ int readBufferSize,
+ boolean doOpenStores)
+ throws CorruptIndexException, IOException {
SegmentReader instance;
try {
instance = (SegmentReader)IMPL.newInstance();
@@ -164,11 +194,11 @@
throw new RuntimeException("cannot load SegmentReader class: " + e, e);
}
instance.init(dir, sis, closeDir, ownDir);
- instance.initialize(si, readBufferSize);
+ instance.initialize(si, readBufferSize, doOpenStores);
return instance;
}
- private void initialize(SegmentInfo si, int readBufferSize) throws CorruptIndexException, IOException {
+ private void initialize(SegmentInfo si, int readBufferSize, boolean doOpenStores) throws CorruptIndexException, IOException {
segment = si.name;
this.si = si;
@@ -178,17 +208,45 @@
// Use compound file directory for some files, if it exists
Directory cfsDir = directory();
if (si.getUseCompoundFile()) {
- cfsReader = new CompoundFileReader(directory(), segment + ".cfs", readBufferSize);
+ cfsReader = new CompoundFileReader(directory(), segment + "." + IndexFileNames.COMPOUND_FILE_EXTENSION, readBufferSize);
cfsDir = cfsReader;
}
+ final Directory storeDir;
+
+ if (doOpenStores) {
+ if (si.getDocStoreOffset() != -1) {
+ if (si.getDocStoreIsCompoundFile()) {
+ storeCFSReader = new CompoundFileReader(directory(), si.getDocStoreSegment() + "." + IndexFileNames.COMPOUND_FILE_STORE_EXTENSION, readBufferSize);
+ storeDir = storeCFSReader;
+ } else {
+ storeDir = directory();
+ }
+ } else {
+ storeDir = cfsDir;
+ }
+ } else
+ storeDir = null;
+
// No compound file exists - use the multi-file format
fieldInfos = new FieldInfos(cfsDir, segment + ".fnm");
- fieldsReader = new FieldsReader(cfsDir, segment, fieldInfos, readBufferSize);
- // Verify two sources of "maxDoc" agree:
- if (fieldsReader.size() != si.docCount) {
- throw new CorruptIndexException("doc counts differ for segment " + si.name + ": fieldsReader shows " + fieldsReader.size() + " but segmentInfo shows " + si.docCount);
+ final String fieldsSegment;
+ final Directory dir;
+
+ if (si.getDocStoreOffset() != -1)
+ fieldsSegment = si.getDocStoreSegment();
+ else
+ fieldsSegment = segment;
+
+ if (doOpenStores) {
+ fieldsReader = new FieldsReader(storeDir, fieldsSegment, fieldInfos, readBufferSize,
+ si.getDocStoreOffset(), si.docCount);
+
+ // Verify two sources of "maxDoc" agree:
+ if (si.getDocStoreOffset() == -1 && fieldsReader.size() != si.docCount) {
+ throw new CorruptIndexException("doc counts differ for segment " + si.name + ": fieldsReader shows " + fieldsReader.size() + " but segmentInfo shows " + si.docCount);
+ }
}
tis = new TermInfosReader(cfsDir, segment, fieldInfos, readBufferSize);
@@ -209,8 +267,13 @@
proxStream = cfsDir.openInput(segment + ".prx", readBufferSize);
openNorms(cfsDir, readBufferSize);
- if (fieldInfos.hasVectors()) { // open term vector files only as needed
- termVectorsReaderOrig = new TermVectorsReader(cfsDir, segment, fieldInfos, readBufferSize);
+ if (doOpenStores && fieldInfos.hasVectors()) { // open term vector files only as needed
+ final String vectorsSegment;
+ if (si.getDocStoreOffset() != -1)
+ vectorsSegment = si.getDocStoreSegment();
+ else
+ vectorsSegment = segment;
+ termVectorsReaderOrig = new TermVectorsReader(storeDir, vectorsSegment, fieldInfos, readBufferSize, si.getDocStoreOffset(), si.docCount);
}
success = true;
} finally {
@@ -273,6 +336,9 @@
if (cfsReader != null)
cfsReader.close();
+
+ if (storeCFSReader != null)
+ storeCFSReader.close();
}
static boolean hasDeletions(SegmentInfo si) throws IOException {
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsReader.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsReader.java Wed Jul 4 08:16:38 2007
@@ -33,6 +33,10 @@
private IndexInput tvd;
private IndexInput tvf;
private int size;
+
+ // The docID offset where our docs begin in the index
+ // file. This will be 0 if we have our own private file.
+ private int docStoreOffset;
private int tvdFormat;
private int tvfFormat;
@@ -44,6 +48,11 @@
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
throws CorruptIndexException, IOException {
+ this(d, segment, fieldInfos, readBufferSize, -1, 0); // pass the caller's buffer size through; docStoreOffset -1 = private files, size is then derived from the tvx length
+ }
+
+ TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
+ throws CorruptIndexException, IOException {
if (d.fileExists(segment + TermVectorsWriter.TVX_EXTENSION)) {
tvx = d.openInput(segment + TermVectorsWriter.TVX_EXTENSION, readBufferSize);
checkValidFormat(tvx);
@@ -51,7 +60,16 @@
tvdFormat = checkValidFormat(tvd);
tvf = d.openInput(segment + TermVectorsWriter.TVF_EXTENSION, readBufferSize);
tvfFormat = checkValidFormat(tvf);
- size = (int) tvx.length() / 8;
+ if (-1 == docStoreOffset) {
+ this.docStoreOffset = 0;
+ this.size = (int) (tvx.length() / 8);
+ } else {
+ this.docStoreOffset = docStoreOffset;
+ this.size = size;
+ // Verify the file is long enough to hold all of our
+ // docs
+ assert ((int) (tvx.length()/8)) >= size + docStoreOffset;
+ }
}
this.fieldInfos = fieldInfos;
@@ -102,7 +120,7 @@
//We don't need to do this in other seeks because we already have the
// file pointer
//that was written in another file
- tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
+ tvx.seek(((docNum + docStoreOffset) * 8L) + TermVectorsWriter.FORMAT_SIZE);
//System.out.println("TVX Pointer: " + tvx.getFilePointer());
long position = tvx.readLong();
@@ -154,7 +172,7 @@
// Check if no term vectors are available for this segment at all
if (tvx != null) {
//We need to offset by
- tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
+ tvx.seek(((docNum + docStoreOffset) * 8L) + TermVectorsWriter.FORMAT_SIZE);
long position = tvx.readLong();
tvd.seek(position);
Modified: lucene/java/trunk/src/java/org/apache/lucene/store/IndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/store/IndexOutput.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/store/IndexOutput.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/store/IndexOutput.java Wed Jul 4 08:16:38 2007
@@ -125,6 +125,31 @@
}
}
+ /** Writes length chars of s, beginning at start, each encoded on its own in UTF-8 style as 1 to 3 bytes (char 0 is written as two bytes).
+ * @param s the array holding the characters to write
+ * @param start index in s of the first character to write
+ * @param length how many characters to write
+ * @see IndexInput#readChars(char[],int,int)
+ */
+ public void writeChars(char[] s, int start, int length)
+ throws IOException {
+   final int limit = start + length;
+   for (int pos = start; pos < limit; pos++) {
+     final int ch = s[pos];
+     if (ch > 0x7FF) { // three-byte form
+       writeByte((byte)(0xE0 | (ch >>> 12)));
+       writeByte((byte)(0x80 | ((ch >> 6) & 0x3F)));
+       writeByte((byte)(0x80 | (ch & 0x3F)));
+     } else if (ch >= 0x80 || ch == 0) { // two-byte form, also used for char 0
+       writeByte((byte)(0xC0 | (ch >> 6)));
+       writeByte((byte)(0x80 | (ch & 0x3F)));
+     } else { // 0x01..0x7F: single byte
+       writeByte((byte)ch);
+     }
+   }
+ }
+
+
/** Forces any buffered output to be written. */
public abstract void flush() throws IOException;
Modified: lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml (original)
+++ lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml Wed Jul 4 08:16:38 2007
@@ -60,6 +60,15 @@
Lucene will not be able to read the index.
</p>
+ <p>
+ In version 2.3, the file format was changed to allow
+ segments to share a single set of doc store (term vectors and
+ stored fields) files. This allows for faster indexing
+ in certain cases. The change is fully backwards
+ compatible (in the same way as the lock-less commits
+ change in 2.1).
+ </p>
+
</section>
<section id="Definitions"><title>Definitions</title>
@@ -809,9 +818,15 @@
NormGen<sup>NumField</sup>,
IsCompoundFile><sup>SegCount</sup>
</p>
+ <p>
+ <b>2.3 and above:</b>
+ Segments --> Format, Version, NameCounter, SegCount, <SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile, NumField,
+ NormGen<sup>NumField</sup>,
+ IsCompoundFile><sup>SegCount</sup>
+ </p>
<p>
- Format, NameCounter, SegCount, SegSize, NumField --> Int32
+ Format, NameCounter, SegCount, SegSize, NumField, DocStoreOffset --> Int32
</p>
<p>
@@ -819,11 +834,11 @@
</p>
<p>
- SegName --> String
+ SegName, DocStoreSegment --> String
</p>
<p>
- IsCompoundFile, HasSingleNormFile --> Int8
+ IsCompoundFile, HasSingleNormFile, DocStoreIsCompoundFile --> Int8
</p>
<p>
@@ -889,6 +904,29 @@
"Normalization Factors" below for details.
</p>
+ <p>
+ DocStoreOffset, DocStoreSegment,
+ DocStoreIsCompoundFile: If DocStoreOffset is -1,
+ this segment has its own doc store (stored field
+ values and term vectors) files and DocStoreSegment
+ and DocStoreIsCompoundFile are not stored. In
+ this case all files for stored field values
+ (<tt>*.fdt</tt> and <tt>*.fdx</tt>) and term
+ vectors (<tt>*.tvf</tt>, <tt>*.tvd</tt> and
+ <tt>*.tvx</tt>) will be stored with this segment.
+ Otherwise, DocStoreSegment is the name of the
+ segment that has the shared doc store files;
+ DocStoreIsCompoundFile is 1 if that segment is
+ stored in compound file format (as a <tt>.cfx</tt>
+ file); and DocStoreOffset is the starting document
+ in the shared doc store files where this segment's
+ documents begin. In this case, this segment does
+ not store its own doc store files but instead
+ shares a single set of these files with other
+ segments.
+ </p>
+
+
</section>
<section id="Lock File"><title>Lock File</title>
@@ -946,6 +984,14 @@
<p>FileData --> raw file data</p>
<p>The raw file data is the data from the individual files named above.</p>
+
+ <p>Starting with Lucene 2.3, doc store files (stored
+ field values and term vectors) can be shared in a
+ single set of files for more than one segment. When
+ the compound file format is enabled, these shared files will be
+ added into a single compound file (same format as
+ above) but with the extension <tt>.cfx</tt>.
+ </p>
</section>