You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by br...@apache.org on 2020/06/11 17:53:45 UTC
[lucene-solr] branch branch_8x updated: LUCENE-9397: UniformSplit supports encodable fields metadata.
This is an automated email from the ASF dual-hosted git repository.
broustant pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new ac7bb4a LUCENE-9397: UniformSplit supports encodable fields metadata.
ac7bb4a is described below
commit ac7bb4a53effcd4e37174e74c89f61187f04fcc0
Author: Bruno Roustant <br...@salesforce.com>
AuthorDate: Wed Jun 10 16:09:32 2020 +0200
LUCENE-9397: UniformSplit supports encodable fields metadata.
---
lucene/CHANGES.txt | 2 +
.../uniformsplit/UniformSplitPostingsFormat.java | 12 +++---
.../uniformsplit/UniformSplitTermsReader.java | 44 ++++++++++++++++------
.../uniformsplit/UniformSplitTermsWriter.java | 17 ++++++++-
.../sharedterms/STUniformSplitPostingsFormat.java | 2 +-
.../sharedterms/STUniformSplitTermsReader.java | 5 +--
.../sharedterms/UnionFieldMetadataBuilder.java | 6 ---
.../TestUniformSplitPostingFormat.java | 19 +++++++---
...BlockReaderTest.java => TestSTBlockReader.java} | 4 +-
.../UniformSplitRot13PostingsFormat.java | 15 ++++++++
.../STUniformSplitRot13PostingsFormat.java | 7 ++++
11 files changed, 97 insertions(+), 36 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 7184c58..a5833dd 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -71,6 +71,8 @@ Improvements
* LUCENE-9392: Make FacetsConfig.DELIM_CHAR publicly accessible (Ankur Goel))
+* LUCENE-9397: UniformSplit supports encodable fields metadata. (Bruno Roustant)
+
Optimizations
---------------------
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java
index f982ed3..a58a1de 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java
@@ -47,7 +47,9 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
*/
public static final String TERMS_BLOCKS_EXTENSION = "ustb";
- public static final int VERSION_CURRENT = 0;
+ public static final int VERSION_START = 0;
+ public static final int VERSION_ENCODABLE_FIELDS_METADATA = 1;
+ public static final int VERSION_CURRENT = VERSION_ENCODABLE_FIELDS_METADATA;
public static final String NAME = "UniformSplit";
@@ -74,10 +76,10 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
* Must be greater than or equal to 0 and strictly less than {@code targetNumBlockLines}.
* The block size will be {@code targetNumBlockLines}+-{@code deltaNumLines}.
* The block size must always be less than or equal to {@link UniformSplitTermsWriter#MAX_NUM_BLOCK_LINES}.
- * @param blockEncoder Optional block encoder, may be null if none.
- * It can be used for compression or encryption.
- * @param blockDecoder Optional block decoder, may be null if none.
- * It can be used for compression or encryption.
+ * @param blockEncoder Optional block encoder, may be null if none. If present, it is used to encode all terms
+ * blocks, as well as the FST dictionary and the fields metadata.
+ * @param blockDecoder Optional block decoder, may be null if none. If present, it is used to decode all terms
+ * blocks, as well as the FST dictionary and the fields metadata.
* @param dictionaryOnHeap Whether to force loading the terms dictionary on-heap. By default it is kept off-heap without
* impact on performance. If block encoding/decoding is used, then the dictionary is always
* loaded on-heap whatever this parameter value is.
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java
index 9b2552b..377919d 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java
@@ -34,14 +34,14 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
-import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.NAME;
-import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION;
-import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION;
-import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.VERSION_CURRENT;
+import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.*;
/**
* A block-based terms index and dictionary based on the Uniform Split technique.
@@ -51,12 +51,11 @@ import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.V
*/
public class UniformSplitTermsReader extends FieldsProducer {
- protected static final int VERSION_START = 0;
-
private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(UniformSplitTermsReader.class)
+ RamUsageEstimator.shallowSizeOfInstance(IndexInput.class) * 2;
protected final PostingsReaderBase postingsReader;
+ protected final int version;
protected final IndexInput blockInput;
protected final IndexInput dictionaryInput;
@@ -93,7 +92,7 @@ public class UniformSplitTermsReader extends FieldsProducer {
String termsName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, termsBlocksExtension);
blockInput = state.directory.openInput(termsName, state.context);
- int version = CodecUtil.checkIndexHeader(blockInput, codecName, versionStart,
+ version = CodecUtil.checkIndexHeader(blockInput, codecName, versionStart,
versionCurrent, state.segmentInfo.getId(), state.segmentSuffix);
String indexName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, dictionaryExtension);
dictionaryInput = state.directory.openInput(indexName, state.context);
@@ -105,7 +104,8 @@ public class UniformSplitTermsReader extends FieldsProducer {
CodecUtil.retrieveChecksum(blockInput);
seekFieldsMetadata(blockInput);
- Collection<FieldMetadata> fieldMetadataCollection = parseFieldsMetadata(blockInput, state.fieldInfos, fieldMetadataReader, state.segmentInfo.maxDoc());
+ Collection<FieldMetadata> fieldMetadataCollection =
+ readFieldsMetadata(blockInput, blockDecoder, state.fieldInfos, fieldMetadataReader, state.segmentInfo.maxDoc());
fieldToTermsMap = new HashMap<>();
this.blockInput = blockInput;
@@ -143,16 +143,36 @@ public class UniformSplitTermsReader extends FieldsProducer {
/**
* @param indexInput {@link IndexInput} must be positioned to the fields metadata
* details by calling {@link #seekFieldsMetadata(IndexInput)} before this call.
+ * @param blockDecoder Optional block decoder, may be null if none.
*/
- protected static Collection<FieldMetadata> parseFieldsMetadata(IndexInput indexInput, FieldInfos fieldInfos,
- FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException {
+ protected Collection<FieldMetadata> readFieldsMetadata(IndexInput indexInput, BlockDecoder blockDecoder, FieldInfos fieldInfos,
+ FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException {
int numFields = indexInput.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("Illegal number of fields= " + numFields, indexInput);
}
+ return (blockDecoder != null && version >= VERSION_ENCODABLE_FIELDS_METADATA) ?
+ readEncodedFieldsMetadata(numFields, indexInput, blockDecoder, fieldInfos, fieldMetadataReader, maxNumDocs)
+ : readUnencodedFieldsMetadata(numFields, indexInput, fieldInfos, fieldMetadataReader, maxNumDocs);
+ }
+
+ protected Collection<FieldMetadata> readEncodedFieldsMetadata(int numFields, DataInput metadataInput, BlockDecoder blockDecoder,
+ FieldInfos fieldInfos, FieldMetadata.Serializer fieldMetadataReader,
+ int maxNumDocs) throws IOException {
+ long encodedLength = metadataInput.readVLong();
+ if (encodedLength < 0) {
+ throw new CorruptIndexException("Illegal encoded length: " + encodedLength, metadataInput);
+ }
+ BytesRef decodedBytes = blockDecoder.decode(metadataInput, encodedLength);
+ DataInput decodedMetadataInput = new ByteArrayDataInput(decodedBytes.bytes, 0, decodedBytes.length);
+ return readUnencodedFieldsMetadata(numFields, decodedMetadataInput, fieldInfos, fieldMetadataReader, maxNumDocs);
+ }
+
+ protected Collection<FieldMetadata> readUnencodedFieldsMetadata(int numFields, DataInput metadataInput, FieldInfos fieldInfos,
+ FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException {
Collection<FieldMetadata> fieldMetadataCollection = new ArrayList<>(numFields);
for (int i = 0; i < numFields; i++) {
- fieldMetadataCollection.add(fieldMetadataReader.read(indexInput, fieldInfos, maxNumDocs));
+ fieldMetadataCollection.add(fieldMetadataReader.read(metadataInput, fieldInfos, maxNumDocs));
}
return fieldMetadataCollection;
}
@@ -212,7 +232,7 @@ public class UniformSplitTermsReader extends FieldsProducer {
/**
* Positions the given {@link IndexInput} at the beginning of the fields metadata.
*/
- protected static void seekFieldsMetadata(IndexInput indexInput) throws IOException {
+ protected void seekFieldsMetadata(IndexInput indexInput) throws IOException {
indexInput.seek(indexInput.length() - CodecUtil.footerLength() - 8);
indexInput.seek(indexInput.readLong());
}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java
index 101b6b5..c4e089f 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java
@@ -249,11 +249,26 @@ public class UniformSplitTermsWriter extends FieldsConsumer {
protected void writeFieldsMetadata(int fieldsNumber, ByteBuffersDataOutput fieldsOutput) throws IOException {
long fieldsStartPosition = blockOutput.getFilePointer();
blockOutput.writeVInt(fieldsNumber);
- fieldsOutput.copyTo(blockOutput);
+ if (blockEncoder == null) {
+ writeUnencodedFieldsMetadata(fieldsOutput);
+ } else {
+ writeEncodedFieldsMetadata(fieldsOutput);
+ }
+ // Must be a fixed length. Read by UniformSplitTermsReader when seeking fields metadata.
blockOutput.writeLong(fieldsStartPosition);
CodecUtil.writeFooter(blockOutput);
}
+ protected void writeUnencodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
+ fieldsOutput.copyTo(blockOutput);
+ }
+
+ protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
+ BlockEncoder.WritableBytes encodedBytes = blockEncoder.encode(fieldsOutput.toDataInput(), fieldsOutput.size());
+ blockOutput.writeVLong(encodedBytes.size());
+ encodedBytes.writeTo(blockOutput);
+ }
+
/**
* @return 1 if the field was written; 0 otherwise.
*/
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java
index 57c1540..730728b 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java
@@ -54,7 +54,7 @@ public class STUniformSplitPostingsFormat extends UniformSplitPostingsFormat {
*/
public static final String TERMS_BLOCKS_EXTENSION = "stustb";
- public static final int VERSION_CURRENT = 0;
+ public static final int VERSION_CURRENT = UniformSplitPostingsFormat.VERSION_CURRENT;
public static final String NAME = "SharedTermsUniformSplit";
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java
index cc25a30c..5c2b24b 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java
@@ -30,10 +30,7 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.IndexInput;
-import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.NAME;
-import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION;
-import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION;
-import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.VERSION_CURRENT;
+import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.*;
/**
* A block-based terms index and dictionary based on the Uniform Split technique,
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java
index 85b6a27..4cf5c26 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java
@@ -33,15 +33,9 @@ public class UnionFieldMetadataBuilder {
private BytesRef maxLastTerm;
public UnionFieldMetadataBuilder() {
- reset();
- }
-
- public UnionFieldMetadataBuilder reset() {
dictionaryStartFP = -1;
minStartBlockFP = Long.MAX_VALUE;
maxEndBlockFP = Long.MIN_VALUE;
- maxLastTerm = null;
- return this;
}
public UnionFieldMetadataBuilder addFieldMetadata(FieldMetadata fieldMetadata) {
diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java
index db1d6c1..9a68a14 100644
--- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java
+++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java
@@ -51,17 +51,26 @@ public class TestUniformSplitPostingFormat extends BasePostingsFormatTestCase {
@Before
public void initialize() {
+ initializeInner();
+ }
+
+ protected void initializeInner() {
UniformSplitRot13PostingsFormat.resetEncodingFlags();
}
@After
public void checkEncodingCalled() {
if (checkEncoding) {
- assertTrue(UniformSplitRot13PostingsFormat.blocksEncoded);
- assertTrue(UniformSplitRot13PostingsFormat.dictionaryEncoded);
- if (shouldCheckDecoderWasCalled) {
- assertTrue(UniformSplitRot13PostingsFormat.decoderCalled);
- }
+ checkEncodingCalledInner();
+ }
+ }
+
+ protected void checkEncodingCalledInner() {
+ assertTrue(UniformSplitRot13PostingsFormat.blocksEncoded);
+ assertTrue(UniformSplitRot13PostingsFormat.fieldsMetadataEncoded);
+ assertTrue(UniformSplitRot13PostingsFormat.dictionaryEncoded);
+ if (shouldCheckDecoderWasCalled) {
+ assertTrue(UniformSplitRot13PostingsFormat.decoderCalled);
}
}
diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java
similarity index 98%
rename from lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java
rename to lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java
index 6d09fe3..5707fb4 100644
--- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java
+++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java
@@ -51,9 +51,9 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
-public class STBlockReaderTest extends LuceneTestCase {
+public class TestSTBlockReader extends LuceneTestCase {
- private static final String MOCK_BLOCK_OUTPUT_NAME = "STBlockReaderTest.tmp";
+ private static final String MOCK_BLOCK_OUTPUT_NAME = "TestSTBlockReader.tmp";
private FieldInfos fieldInfos;
private List<MockSTBlockLine> blockLines;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java
index 4b3a680..26d14ad 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java
@@ -28,6 +28,7 @@ import org.apache.lucene.codecs.lucene84.Lucene84PostingsReader;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
@@ -40,6 +41,7 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
public static volatile boolean encoderCalled;
public static volatile boolean decoderCalled;
public static volatile boolean blocksEncoded;
+ public static volatile boolean fieldsMetadataEncoded;
public static volatile boolean dictionaryEncoded;
protected final boolean dictionaryOnHeap;
@@ -56,6 +58,7 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
encoderCalled = false;
decoderCalled = false;
blocksEncoded = false;
+ fieldsMetadataEncoded = false;
dictionaryEncoded = false;
}
@@ -86,6 +89,11 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
super.writeDictionary(dictionaryBuilder);
recordDictionaryEncodingCall();
}
+ @Override
+ protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
+ super.writeEncodedFieldsMetadata(fieldsOutput);
+ recordFieldsMetadataEncodingCall();
+ }
};
}
@@ -96,6 +104,13 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
}
}
+ protected void recordFieldsMetadataEncodingCall() {
+ if (encoderCalled) {
+ fieldsMetadataEncoded = true;
+ encoderCalled = false;
+ }
+ }
+
protected void recordDictionaryEncodingCall() {
if (encoderCalled) {
dictionaryEncoded = true;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitRot13PostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitRot13PostingsFormat.java
index a300e36..04f3964 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitRot13PostingsFormat.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitRot13PostingsFormat.java
@@ -28,6 +28,7 @@ import org.apache.lucene.codecs.uniformsplit.UniformSplitRot13PostingsFormat;
import org.apache.lucene.codecs.uniformsplit.UniformSplitTermsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.ByteBuffersDataOutput;
/**
* {@link STUniformSplitPostingsFormat} with block encoding using ROT13 cypher.
@@ -50,6 +51,12 @@ public class STUniformSplitRot13PostingsFormat extends UniformSplitRot13Postings
super.writeDictionary(dictionaryBuilder);
recordDictionaryEncodingCall();
}
+ @Override
+ protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
+ recordBlockEncodingCall();
+ super.writeEncodedFieldsMetadata(fieldsOutput);
+ recordFieldsMetadataEncodingCall();
+ }
};
}