You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/12/29 14:22:50 UTC
svn commit: r1648332 - in /lucene/dev/branches/branch_5x: ./ lucene/
lucene/core/ lucene/core/src/java/org/apache/lucene/codecs/
lucene/core/src/java/org/apache/lucene/codecs/compressing/
lucene/core/src/java/org/apache/lucene/util/
Author: rmuir
Date: Mon Dec 29 13:22:50 2014
New Revision: 1648332
URL: http://svn.apache.org/r1648332
Log:
LUCENE-6133: improve default stored fields merge algorithm
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/lucene/ (props changed)
lucene/dev/branches/branch_5x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/lucene/core/ (props changed)
lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java
lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1648332&r1=1648331&r2=1648332&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Mon Dec 29 13:22:50 2014
@@ -157,6 +157,9 @@ Optimizations
* LUCENE-6131: Optimize SortingMergePolicy. (Robert Muir)
+* LUCENE-6133: Improve default StoredFieldsWriter.merge() to be more efficient.
+ (Robert Muir)
+
API Changes
* LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and
Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java?rev=1648332&r1=1648331&r2=1648332&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java Mon Dec 29 13:22:50 2014
@@ -18,15 +18,19 @@ package org.apache.lucene.codecs;
import java.io.Closeable;
import java.io.IOException;
+import java.io.Reader;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.document.DocumentStoredFieldVisitor;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.MergeState;
+import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
/**
* Codec API for writing stored fields:
@@ -82,6 +86,7 @@ public abstract class StoredFieldsWriter
for (int i=0;i<mergeState.storedFieldsReaders.length;i++) {
StoredFieldsReader storedFieldsReader = mergeState.storedFieldsReaders[i];
storedFieldsReader.checkIntegrity();
+ MergeVisitor visitor = new MergeVisitor(mergeState, i);
int maxDoc = mergeState.maxDocs[i];
Bits liveDocs = mergeState.liveDocs[i];
for (int docID=0;docID<maxDoc;docID++) {
@@ -89,16 +94,9 @@ public abstract class StoredFieldsWriter
// skip deleted docs
continue;
}
- // TODO: this could be more efficient using
- // FieldVisitor instead of loading/writing entire
- // doc; ie we just have to renumber the field number
- // on the fly?
- // NOTE: it's very important to first assign to doc then pass it to
- // fieldsWriter.addDocument; see LUCENE-1282
- DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
+ startDocument();
storedFieldsReader.visitDocument(docID, visitor);
- Document doc = visitor.getDocument();
- addDocument(doc, mergeState.mergeFieldInfos);
+ finishDocument();
docCount++;
mergeState.checkAbort.work(300);
}
@@ -107,16 +105,144 @@ public abstract class StoredFieldsWriter
return docCount;
}
- /** sugar method for startDocument() + writeField() for every stored field in the document */
- protected final void addDocument(Iterable<? extends IndexableField> doc, FieldInfos fieldInfos) throws IOException {
- startDocument();
- for (IndexableField field : doc) {
- if (field.fieldType().stored()) {
- writeField(fieldInfos.fieldInfo(field.name()), field);
+ /**
+ * A visitor that adds every field it sees.
+ * <p>
+ * Use like this:
+ * <pre>
+ * MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
+ * for (...) {
+ * startDocument();
+ * storedFieldsReader.visitDocument(docID, visitor);
+ * finishDocument();
+ * }
+ * </pre>
+ */
+ protected class MergeVisitor extends StoredFieldVisitor implements IndexableField {
+ BytesRef binaryValue;
+ String stringValue;
+ Number numericValue;
+ FieldInfo currentField;
+ FieldInfos remapper;
+
+ /**
+ * Create new merge visitor.
+ */
+ public MergeVisitor(MergeState mergeState, int readerIndex) {
+ // if field numbers are aligned, we can save hash lookups
+ // on every field access. Otherwise, we need to look up the
+ // field name each time, and remap to a new number.
+ for (FieldInfo fi : mergeState.fieldInfos[readerIndex]) {
+ FieldInfo other = mergeState.mergeFieldInfos.fieldInfo(fi.number);
+ if (other == null || !other.name.equals(fi.name)) {
+ remapper = mergeState.mergeFieldInfos;
+ break;
+ }
}
}
+
+ @Override
+ public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
+ reset(fieldInfo);
+ binaryValue = new BytesRef(value);
+ write();
+ }
+
+ @Override
+ public void stringField(FieldInfo fieldInfo, String value) throws IOException {
+ reset(fieldInfo);
+ stringValue = value;
+ write();
+ }
+
+ @Override
+ public void intField(FieldInfo fieldInfo, int value) throws IOException {
+ reset(fieldInfo);
+ numericValue = value;
+ write();
+ }
+
+ @Override
+ public void longField(FieldInfo fieldInfo, long value) throws IOException {
+ reset(fieldInfo);
+ numericValue = value;
+ write();
+ }
+
+ @Override
+ public void floatField(FieldInfo fieldInfo, float value) throws IOException {
+ reset(fieldInfo);
+ numericValue = value;
+ write();
+ }
+
+ @Override
+ public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
+ reset(fieldInfo);
+ numericValue = value;
+ write();
+ }
+
+ @Override
+ public Status needsField(FieldInfo fieldInfo) throws IOException {
+ return Status.YES;
+ }
+
+ @Override
+ public String name() {
+ return currentField.name;
+ }
- finishDocument();
+ @Override
+ public IndexableFieldType fieldType() {
+ return StoredField.TYPE;
+ }
+
+ @Override
+ public BytesRef binaryValue() {
+ return binaryValue;
+ }
+
+ @Override
+ public String stringValue() {
+ return stringValue;
+ }
+
+ @Override
+ public Number numericValue() {
+ return numericValue;
+ }
+
+ @Override
+ public Reader readerValue() {
+ return null;
+ }
+
+ @Override
+ public float boost() {
+ return 1F;
+ }
+
+ @Override
+ public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) throws IOException {
+ return null;
+ }
+
+ void reset(FieldInfo field) {
+ if (remapper != null) {
+ // field numbers are not aligned, so we need to remap to the new field number
+ currentField = remapper.fieldInfo(field.name);
+ } else {
+ currentField = field;
+ }
+ binaryValue = null;
+ stringValue = null;
+ numericValue = null;
+ }
+
+ void write() throws IOException {
+ writeField(currentField, this);
+ }
}
@Override
Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java?rev=1648332&r1=1648331&r2=1648332&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java Mon Dec 29 13:22:50 2014
@@ -24,7 +24,6 @@ import org.apache.lucene.codecs.CodecUti
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsReader.SerializedDocument;
-import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
@@ -41,6 +40,7 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.GrowableByteArrayDataOutput;
import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.packed.PackedInts;
/**
@@ -74,7 +74,6 @@ public final class CompressingStoredFiel
private CompressingStoredFieldsIndexWriter indexWriter;
private IndexOutput fieldsStream;
- private final CompressionMode compressionMode;
private final Compressor compressor;
private final int chunkSize;
private final int maxDocsPerChunk;
@@ -90,7 +89,6 @@ public final class CompressingStoredFiel
String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) throws IOException {
assert directory != null;
this.segment = si.name;
- this.compressionMode = compressionMode;
this.compressor = compressionMode.newCompressor();
this.chunkSize = chunkSize;
this.maxDocsPerChunk = maxDocsPerChunk;
@@ -237,6 +235,8 @@ public final class CompressingStoredFiel
numBufferedDocs = 0;
bufferedDocs.length = 0;
}
+
+ byte scratchBytes[] = new byte[16];
@Override
public void writeField(FieldInfo info, IndexableField field)
@@ -284,7 +284,11 @@ public final class CompressingStoredFiel
bufferedDocs.writeVInt(bytes.length);
bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length);
} else if (string != null) {
- bufferedDocs.writeString(field.stringValue());
+ // this is just an optimized writeString() that re-uses scratchBytes.
+ scratchBytes = ArrayUtil.grow(scratchBytes, string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
+ int length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
+ bufferedDocs.writeVInt(length);
+ bufferedDocs.writeBytes(scratchBytes, length);
} else {
if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
bufferedDocs.writeZInt(number.intValue());
@@ -474,6 +478,7 @@ public final class CompressingStoredFiel
MatchingReaders matching = new MatchingReaders(mergeState);
for (int readerIndex=0;readerIndex<numReaders;readerIndex++) {
+ MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
CompressingStoredFieldsReader matchingFieldsReader = null;
if (matching.matchingReaders[readerIndex]) {
final StoredFieldsReader fieldsReader = mergeState.storedFieldsReaders[readerIndex];
@@ -497,9 +502,9 @@ public final class CompressingStoredFiel
if (liveDocs != null && liveDocs.get(docID) == false) {
continue;
}
- DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
+ startDocument();
storedFieldsReader.visitDocument(docID, visitor);
- addDocument(visitor.getDocument(), mergeState.mergeFieldInfos);
+ finishDocument();
++docCount;
mergeState.checkAbort.work(300);
}
Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java?rev=1648332&r1=1648331&r2=1648332&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java Mon Dec 29 13:22:50 2014
@@ -123,7 +123,7 @@ public final class UnicodeUtil {
(UNI_SUR_HIGH_START << HALF_SHIFT) - UNI_SUR_LOW_START;
/** Maximum number of UTF8 bytes per UTF16 character. */
- public static final int MAX_UTF8_BYTES_PER_CHAR = 4;
+ public static final int MAX_UTF8_BYTES_PER_CHAR = 3;
/** Encode characters from a char[] source, starting at
* offset for length chars. It is the responsibility of the