You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/11/10 18:54:16 UTC
svn commit: r1200452 - in
/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index: ./
codecs/
Author: rmuir
Date: Thu Nov 10 17:54:15 2011
New Revision: 1200452
URL: http://svn.apache.org/viewvc?rev=1200452&view=rev
Log:
LUCENE-2621: move bulk term vector merging to DefaultTermVectorsWriter
Modified:
lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/SegmentMerger.java
lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/SegmentReader.java
lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsReader.java
lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsWriter.java
lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/TermVectorsReader.java
lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/TermVectorsWriter.java
Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/SegmentMerger.java?rev=1200452&r1=1200451&r2=1200452&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/SegmentMerger.java (original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/SegmentMerger.java Thu Nov 10 17:54:15 2011
@@ -25,12 +25,10 @@ import java.util.List;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexReader.FieldOption;
-import org.apache.lucene.index.MergePolicy.MergeAbortedException;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.StoredFieldsWriter;
import org.apache.lucene.index.codecs.PerDocConsumer;
-import org.apache.lucene.index.codecs.TermVectorsReader;
import org.apache.lucene.index.codecs.TermVectorsWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@@ -53,10 +51,6 @@ final class SegmentMerger {
private final String segment;
private final int termIndexInterval;
- /** Maximum number of contiguous documents to bulk-copy
- when merging term vectors */
- private final static int MAX_RAW_MERGE_DOCS = 4192;
-
private final Codec codec;
private final IOContext context;
@@ -129,7 +123,8 @@ final class SegmentMerger {
mergeNorms();
if (mergeState.fieldInfos.hasVectors()) {
- mergeVectors(segmentWriteState);
+ int numMerged = mergeVectors();
+ assert numMerged == mergeState.mergedDocCount;
}
// write FIS once merge is done. IDV might change types or drop fields
mergeState.fieldInfos.write(directory, segment + "." + IndexFileNames.FIELD_INFOS_EXTENSION);
@@ -214,130 +209,29 @@ final class SegmentMerger {
* @throws IOException if there is a low-level IO error
*/
private int mergeFields() throws CorruptIndexException, IOException {
- int docCount = 0;
-
final StoredFieldsWriter fieldsWriter = codec.storedFieldsFormat().fieldsWriter(directory, segment, context);
+
try {
- docCount = fieldsWriter.merge(mergeState);
+ return fieldsWriter.merge(mergeState);
} finally {
fieldsWriter.close();
}
-
- return docCount;
}
/**
* Merge the TermVectors from each of the segments into the new one.
* @throws IOException
*/
- // nocommit: move to codec
- private final void mergeVectors(SegmentWriteState segmentWriteState) throws IOException {
- TermVectorsWriter termVectorsWriter = codec.termVectorsFormat().vectorsWriter(directory, segment, mergeState.fieldInfos, context);
- // Used for bulk-reading raw bytes for term vectors
- int rawDocLengths[] = new int[MAX_RAW_MERGE_DOCS];
- int rawDocLengths2[] = new int[MAX_RAW_MERGE_DOCS];
+ private final int mergeVectors() throws IOException {
+ final TermVectorsWriter termVectorsWriter = codec.termVectorsFormat().vectorsWriter(directory, segment, mergeState.fieldInfos, context);
+
try {
- int idx = 0;
- for (final MergeState.IndexReaderAndLiveDocs reader : mergeState.readers) {
- final SegmentReader matchingSegmentReader = mergeState.matchingSegmentReaders[idx++];
- TermVectorsReader matchingVectorsReader = null;
- if (matchingSegmentReader != null) {
- TermVectorsReader vectorsReader = matchingSegmentReader.getTermVectorsReader();
-
- // If the TV* files are an older format then they cannot read raw docs:
- if (vectorsReader != null && vectorsReader.canReadRawDocs()) {
- matchingVectorsReader = vectorsReader;
- }
- }
- if (reader.liveDocs != null) {
- copyVectorsWithDeletions(termVectorsWriter, matchingVectorsReader, reader, rawDocLengths, rawDocLengths2);
- } else {
- copyVectorsNoDeletions(termVectorsWriter, matchingVectorsReader, reader, rawDocLengths, rawDocLengths2);
- }
- }
- termVectorsWriter.finish(segmentWriteState.numDocs);
+ return termVectorsWriter.merge(mergeState);
} finally {
termVectorsWriter.close();
}
}
- private void copyVectorsWithDeletions(final TermVectorsWriter termVectorsWriter,
- final TermVectorsReader matchingVectorsReader,
- final MergeState.IndexReaderAndLiveDocs reader,
- int rawDocLengths[],
- int rawDocLengths2[])
- throws IOException, MergeAbortedException {
- final int maxDoc = reader.reader.maxDoc();
- final Bits liveDocs = reader.liveDocs;
- if (matchingVectorsReader != null) {
- // We can bulk-copy because the fieldInfos are "congruent"
- for (int docNum = 0; docNum < maxDoc;) {
- if (!liveDocs.get(docNum)) {
- // skip deleted docs
- ++docNum;
- continue;
- }
- // We can optimize this case (doing a bulk byte copy) since the field
- // numbers are identical
- int start = docNum, numDocs = 0;
- do {
- docNum++;
- numDocs++;
- if (docNum >= maxDoc) break;
- if (!liveDocs.get(docNum)) {
- docNum++;
- break;
- }
- } while(numDocs < MAX_RAW_MERGE_DOCS);
-
- matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
- termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
- mergeState.checkAbort.work(300 * numDocs);
- }
- } else {
- for (int docNum = 0; docNum < maxDoc; docNum++) {
- if (!liveDocs.get(docNum)) {
- // skip deleted docs
- continue;
- }
-
- // NOTE: it's very important to first assign to vectors then pass it to
- // termVectorsWriter.addAllDocVectors; see LUCENE-1282
- TermFreqVector[] vectors = reader.reader.getTermFreqVectors(docNum);
- termVectorsWriter.addAllDocVectors(vectors);
- mergeState.checkAbort.work(300);
- }
- }
- }
-
- private void copyVectorsNoDeletions(final TermVectorsWriter termVectorsWriter,
- final TermVectorsReader matchingVectorsReader,
- final MergeState.IndexReaderAndLiveDocs reader,
- int rawDocLengths[],
- int rawDocLengths2[])
- throws IOException, MergeAbortedException {
- final int maxDoc = reader.reader.maxDoc();
- if (matchingVectorsReader != null) {
- // We can bulk-copy because the fieldInfos are "congruent"
- int docCount = 0;
- while (docCount < maxDoc) {
- int len = Math.min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
- matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, docCount, len);
- termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, len);
- docCount += len;
- mergeState.checkAbort.work(300 * len);
- }
- } else {
- for (int docNum = 0; docNum < maxDoc; docNum++) {
- // NOTE: it's very important to first assign to vectors then pass it to
- // termVectorsWriter.addAllDocVectors; see LUCENE-1282
- TermFreqVector[] vectors = reader.reader.getTermFreqVectors(docNum);
- termVectorsWriter.addAllDocVectors(vectors);
- mergeState.checkAbort.work(300);
- }
- }
- }
-
private final void mergeTerms(SegmentWriteState segmentWriteState) throws CorruptIndexException, IOException {
int docBase = 0;
Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/SegmentReader.java?rev=1200452&r1=1200451&r2=1200452&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/SegmentReader.java Thu Nov 10 17:54:15 2011
@@ -660,8 +660,9 @@ public class SegmentReader extends Index
/**
* Create a clone from the initial TermVectorsReader and store it in the ThreadLocal.
* @return TermVectorsReader
+ * @lucene.internal
*/
- TermVectorsReader getTermVectorsReader() {
+ public TermVectorsReader getTermVectorsReader() {
TermVectorsReader tvReader = termVectorsLocal.get();
if (tvReader == null) {
TermVectorsReader orig = core.getTermVectorsReaderOrig();
Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsReader.java?rev=1200452&r1=1200451&r2=1200452&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsReader.java (original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsReader.java Thu Nov 10 17:54:15 2011
@@ -138,16 +138,12 @@ public class DefaultTermVectorsReader ex
}
// Used for bulk copy when merging
- // nocommit: not public
- @Override
- public IndexInput getTvdStream() {
+ IndexInput getTvdStream() {
return tvd;
}
// Used for bulk copy when merging
- // nocommit: not public
- @Override
- public IndexInput getTvfStream() {
+ IndexInput getTvfStream() {
return tvf;
}
@@ -155,8 +151,7 @@ public class DefaultTermVectorsReader ex
tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
}
- @Override
- public boolean canReadRawDocs() {
+ boolean canReadRawDocs() {
// we can always read raw docs, unless the term vectors
// didn't exist
return format != 0;
@@ -168,9 +163,7 @@ public class DefaultTermVectorsReader ex
* merging segments, if the field numbers are
* congruent. Once this returns, the tvf & tvd streams
* are seeked to the startDocID. */
- // nocommit: not public
- @Override
- public final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {
+ final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {
if (tvx == null) {
Arrays.fill(tvdLengths, 0);
Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsWriter.java?rev=1200452&r1=1200451&r2=1200452&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsWriter.java (original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsWriter.java Thu Nov 10 17:54:15 2011
@@ -19,12 +19,17 @@ package org.apache.lucene.index.codecs;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.MergePolicy.MergeAbortedException;
+import org.apache.lucene.index.MergeState;
+import org.apache.lucene.index.MergeState.CheckAbort;
+import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
@@ -179,8 +184,7 @@ public final class DefaultTermVectorsWri
* streams. This is used to expedite merging, if the
* field numbers are congruent.
*/
- @Override
- public void addRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs) throws IOException {
+ private void addRawDocuments(DefaultTermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs) throws IOException {
long tvdPosition = tvd.getFilePointer();
long tvfPosition = tvf.getFilePointer();
long tvdStart = tvdPosition;
@@ -198,6 +202,123 @@ public final class DefaultTermVectorsWri
}
@Override
+ public final int merge(MergeState mergeState) throws IOException {
+ // Used for bulk-reading raw bytes for term vectors
+ int rawDocLengths[] = new int[MAX_RAW_MERGE_DOCS];
+ int rawDocLengths2[] = new int[MAX_RAW_MERGE_DOCS];
+
+ int idx = 0;
+ int numDocs = 0;
+ for (final MergeState.IndexReaderAndLiveDocs reader : mergeState.readers) {
+ final SegmentReader matchingSegmentReader = mergeState.matchingSegmentReaders[idx++];
+ DefaultTermVectorsReader matchingVectorsReader = null;
+ if (matchingSegmentReader != null) {
+ TermVectorsReader vectorsReader = matchingSegmentReader.getTermVectorsReader();
+
+ if (vectorsReader != null && vectorsReader instanceof DefaultTermVectorsReader) {
+ // If the TV* files are an older format then they cannot read raw docs:
+ if (((DefaultTermVectorsReader)vectorsReader).canReadRawDocs()) {
+ matchingVectorsReader = (DefaultTermVectorsReader) vectorsReader;
+ }
+ }
+ }
+ if (reader.liveDocs != null) {
+ numDocs += copyVectorsWithDeletions(mergeState.checkAbort, matchingVectorsReader, reader, rawDocLengths, rawDocLengths2);
+ } else {
+ numDocs += copyVectorsNoDeletions(mergeState.checkAbort, matchingVectorsReader, reader, rawDocLengths, rawDocLengths2);
+ }
+ }
+ finish(numDocs);
+ return numDocs;
+ }
+
+ /** Maximum number of contiguous documents to bulk-copy
+ when merging term vectors */
+ private final static int MAX_RAW_MERGE_DOCS = 4192;
+
+ private int copyVectorsWithDeletions(CheckAbort checkAbort,
+ final DefaultTermVectorsReader matchingVectorsReader,
+ final MergeState.IndexReaderAndLiveDocs reader,
+ int rawDocLengths[],
+ int rawDocLengths2[])
+ throws IOException, MergeAbortedException {
+ final int maxDoc = reader.reader.maxDoc();
+ final Bits liveDocs = reader.liveDocs;
+ int totalNumDocs = 0;
+ if (matchingVectorsReader != null) {
+ // We can bulk-copy because the fieldInfos are "congruent"
+ for (int docNum = 0; docNum < maxDoc;) {
+ if (!liveDocs.get(docNum)) {
+ // skip deleted docs
+ ++docNum;
+ continue;
+ }
+ // We can optimize this case (doing a bulk byte copy) since the field
+ // numbers are identical
+ int start = docNum, numDocs = 0;
+ do {
+ docNum++;
+ numDocs++;
+ if (docNum >= maxDoc) break;
+ if (!liveDocs.get(docNum)) {
+ docNum++;
+ break;
+ }
+ } while(numDocs < MAX_RAW_MERGE_DOCS);
+
+ matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
+ addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
+ totalNumDocs += numDocs;
+ checkAbort.work(300 * numDocs);
+ }
+ } else {
+ for (int docNum = 0; docNum < maxDoc; docNum++) {
+ if (!liveDocs.get(docNum)) {
+ // skip deleted docs
+ continue;
+ }
+
+ // NOTE: it's very important to first assign to vectors then pass it to
+ // termVectorsWriter.addAllDocVectors; see LUCENE-1282
+ TermFreqVector[] vectors = reader.reader.getTermFreqVectors(docNum);
+ addAllDocVectors(vectors);
+ totalNumDocs++;
+ checkAbort.work(300);
+ }
+ }
+ return totalNumDocs;
+ }
+
+ private int copyVectorsNoDeletions(CheckAbort checkAbort,
+ final DefaultTermVectorsReader matchingVectorsReader,
+ final MergeState.IndexReaderAndLiveDocs reader,
+ int rawDocLengths[],
+ int rawDocLengths2[])
+ throws IOException, MergeAbortedException {
+ final int maxDoc = reader.reader.maxDoc();
+ if (matchingVectorsReader != null) {
+ // We can bulk-copy because the fieldInfos are "congruent"
+ int docCount = 0;
+ while (docCount < maxDoc) {
+ int len = Math.min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
+ matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, docCount, len);
+ addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, len);
+ docCount += len;
+ checkAbort.work(300 * len);
+ }
+ } else {
+ for (int docNum = 0; docNum < maxDoc; docNum++) {
+ // NOTE: it's very important to first assign to vectors then pass it to
+ // termVectorsWriter.addAllDocVectors; see LUCENE-1282
+ TermFreqVector[] vectors = reader.reader.getTermFreqVectors(docNum);
+ addAllDocVectors(vectors);
+ checkAbort.work(300);
+ }
+ }
+ return maxDoc;
+ }
+
+ @Override
public void finish(int numDocs) throws IOException {
if (4+((long) numDocs)*16 != tvx.getFilePointer())
// This is most likely a bug in Sun JRE 1.6.0_04/_05;
Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/TermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/TermVectorsReader.java?rev=1200452&r1=1200451&r2=1200452&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/TermVectorsReader.java (original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/TermVectorsReader.java Thu Nov 10 17:54:15 2011
@@ -22,7 +22,6 @@ import java.io.IOException;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermVectorMapper;
-import org.apache.lucene.store.IndexInput;
public abstract class TermVectorsReader implements Cloneable,Closeable {
// TODO: can we consolidate all these get's?
@@ -32,11 +31,5 @@ public abstract class TermVectorsReader
public abstract void get(int doc, TermVectorMapper mapper) throws IOException;
public abstract void get(int doc, String field, TermVectorMapper mapper) throws IOException;
- public abstract TermVectorsReader clone();
-
- // nocommit: nuke all of these methods below:
- public abstract boolean canReadRawDocs();
- public abstract void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException;
- public abstract IndexInput getTvdStream();
- public abstract IndexInput getTvfStream();
+ public abstract TermVectorsReader clone();
}
Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/TermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/TermVectorsWriter.java?rev=1200452&r1=1200451&r2=1200452&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/TermVectorsWriter.java (original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/TermVectorsWriter.java Thu Nov 10 17:54:15 2011
@@ -65,7 +65,4 @@ public abstract class TermVectorsWriter
// nocommit: this should be a sugar method only that consumes the normal api (once we have one)
public abstract void addAllDocVectors(TermFreqVector[] vectors) throws IOException;
-
- // nocommit: nuke this
- public abstract void addRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs) throws IOException;
}