You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by su...@apache.org on 2019/01/02 02:04:39 UTC
[incubator-pinot] branch pinot-text-search updated: Initial changes
to support TEXT datatype and creation of lucene-search index for the same
This is an automated email from the ASF dual-hosted git repository.
sunithabeeram pushed a commit to branch pinot-text-search
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git
The following commit(s) were added to refs/heads/pinot-text-search by this push:
new d92dfea Initial changes to support TEXT datatype and creation of lucene-search index for the same
d92dfea is described below
commit d92dfea8c3323a5976479112dfe1c4676efa69dd
Author: Sunitha Beeram <sb...@linkedin.com>
AuthorDate: Tue Jan 1 18:04:07 2019 -0800
Initial changes to support TEXT datatype and creation of lucene-search index for the same
---
.../com/linkedin/pinot/common/data/FieldSpec.java | 6 +-
.../linkedin/pinot/common/data/FieldSpecTest.java | 9 ++
pinot-core/pom.xml | 5 +
.../data/readers/PinotSegmentColumnReader.java | 1 +
.../core/data/recordtransformer/PinotDataType.java | 35 +++-
.../generator/SegmentGeneratorConfig.java | 10 ++
...va => DictionaryBasedInvertedIndexCreator.java} | 15 +-
.../creator/DocBasedInvertedIndexCreator.java | 27 ++++
.../core/segment/creator/InvertedIndexCreator.java | 44 -----
.../creator/impl/SegmentColumnarIndexCreator.java | 80 ++++++---
.../creator/impl/SegmentDictionaryCreator.java | 6 +-
.../inv/OffHeapBitmapInvertedIndexCreator.java | 5 +-
.../impl/inv/OnHeapBitmapInvertedIndexCreator.java | 5 +-
.../stats/SegmentPreIndexStatsCollectorImpl.java | 1 +
.../impl/textsearch/LuceneIndexCreator.java | 85 ++++++++++
.../impl/textsearch/TextSearchIndexConfig.java | 54 +++++++
.../textsearch/TextSearchIndexCreatorFactory.java | 32 ++++
.../impl/textsearch/TextSearchIndexType.java | 50 ++++++
.../index/column/PhysicalColumnIndexContainer.java | 11 +-
.../com/linkedin/pinot/core/util/AvroUtils.java | 1 +
.../SegmentGenerationWithBytesTypeTest.java | 4 +-
...java => SegmentGenerationWithTextTypeTest.java} | 178 ++++++++-------------
22 files changed, 463 insertions(+), 201 deletions(-)
diff --git a/pinot-common/src/main/java/com/linkedin/pinot/common/data/FieldSpec.java b/pinot-common/src/main/java/com/linkedin/pinot/common/data/FieldSpec.java
index f80d356..87edc53 100644
--- a/pinot-common/src/main/java/com/linkedin/pinot/common/data/FieldSpec.java
+++ b/pinot-common/src/main/java/com/linkedin/pinot/common/data/FieldSpec.java
@@ -194,6 +194,7 @@ public abstract class FieldSpec implements Comparable<FieldSpec>, ConfigNodeLife
case DOUBLE:
return Double.valueOf(stringDefaultNullValue);
case STRING:
+ case TEXT:
return stringDefaultNullValue;
case BYTES:
try {
@@ -217,6 +218,7 @@ public abstract class FieldSpec implements Comparable<FieldSpec>, ConfigNodeLife
case DOUBLE:
return DEFAULT_METRIC_NULL_VALUE_OF_DOUBLE;
case STRING:
+ case TEXT:
return DEFAULT_METRIC_NULL_VALUE_OF_STRING;
case BYTES:
return DEFAULT_METRIC_NULL_VALUE_OF_BYTES;
@@ -237,6 +239,7 @@ public abstract class FieldSpec implements Comparable<FieldSpec>, ConfigNodeLife
case DOUBLE:
return DEFAULT_DIMENSION_NULL_VALUE_OF_DOUBLE;
case STRING:
+ case TEXT:
return DEFAULT_DIMENSION_NULL_VALUE_OF_STRING;
case BYTES:
return DEFAULT_DIMENSION_NULL_VALUE_OF_BYTES;
@@ -389,7 +392,8 @@ public abstract class FieldSpec implements Comparable<FieldSpec>, ConfigNodeLife
DOUBLE,
BOOLEAN, // Stored as STRING
STRING,
- BYTES;
+ BYTES,
+ TEXT; // Stored as STRING
/**
* Returns the data type stored in Pinot.
diff --git a/pinot-common/src/test/java/com/linkedin/pinot/common/data/FieldSpecTest.java b/pinot-common/src/test/java/com/linkedin/pinot/common/data/FieldSpecTest.java
index a2b9f70..70d622a 100644
--- a/pinot-common/src/test/java/com/linkedin/pinot/common/data/FieldSpecTest.java
+++ b/pinot-common/src/test/java/com/linkedin/pinot/common/data/FieldSpecTest.java
@@ -50,6 +50,7 @@ public class FieldSpecTest {
Assert.assertEquals(BOOLEAN.getStoredType(), STRING);
Assert.assertEquals(STRING.getStoredType(), STRING);
Assert.assertEquals(BYTES.getStoredType(), BYTES);
+ Assert.assertEquals(TEXT.getStoredType(), TEXT);
Assert.assertEquals(INT.size(), Integer.BYTES);
Assert.assertEquals(LONG.size(), Long.BYTES);
@@ -288,6 +289,14 @@ public class FieldSpecTest {
Assert.assertEquals(dimensionFieldSpec1, dimensionFieldSpec2, ERROR_MESSAGE);
Assert.assertEquals(dimensionFieldSpec1.getDefaultNullValue(), "false", ERROR_MESSAGE);
+ // Single-value Text type dimension field with default null value.
+ String[] dimensionFields2 = {"\"name\":\"dimension\"", "\"dataType\":\"TEXT\"", "\"defaultNullValue\":false"};
+ DimensionFieldSpec dimensionFieldSpec3 =
+ MAPPER.readValue(getRandomOrderJsonString(dimensionFields2), DimensionFieldSpec.class);
+ DimensionFieldSpec dimensionFieldSpec4 = new DimensionFieldSpec("dimension", TEXT, true, false);
+ Assert.assertEquals(dimensionFieldSpec3, dimensionFieldSpec4, ERROR_MESSAGE);
+ Assert.assertEquals(dimensionFieldSpec3.getDefaultNullValue(), "false", ERROR_MESSAGE);
+
// Multi-value dimension field with default null value.
dimensionFields =
new String[]{"\"name\":\"dimension\"", "\"dataType\":\"STRING\"", "\"singleValueField\":false", "\"defaultNullValue\":\"default\""};
diff --git a/pinot-core/pom.xml b/pinot-core/pom.xml
index fa608f1..de451b5 100644
--- a/pinot-core/pom.xml
+++ b/pinot-core/pom.xml
@@ -146,6 +146,11 @@
<artifactId>fastutil</artifactId>
</dependency>
<dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-core</artifactId>
+ <version>7.6.0</version>
+ </dependency>
+ <dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/data/readers/PinotSegmentColumnReader.java b/pinot-core/src/main/java/com/linkedin/pinot/core/data/readers/PinotSegmentColumnReader.java
index c126ff1..163fcb3 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/data/readers/PinotSegmentColumnReader.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/data/readers/PinotSegmentColumnReader.java
@@ -117,6 +117,7 @@ public class PinotSegmentColumnReader {
case DOUBLE:
return readDouble(docId);
case STRING:
+ case TEXT:
return readString(docId);
case BYTES:
return readBytes(docId);
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/data/recordtransformer/PinotDataType.java b/pinot-core/src/main/java/com/linkedin/pinot/core/data/recordtransformer/PinotDataType.java
index e9b233b..9eb5f5f 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/data/recordtransformer/PinotDataType.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/data/recordtransformer/PinotDataType.java
@@ -258,6 +258,33 @@ public enum PinotDataType {
}
},
+ TEXT { // TEXT values are handled as Strings: each numeric conversion casts the value to String and parses it
+ @Override
+ public Integer toInteger(Object value) {
+ return Integer.parseInt((String) value);
+ }
+
+ @Override
+ public Long toLong(Object value) {
+ return Long.parseLong((String) value);
+ }
+
+ @Override
+ public Float toFloat(Object value) {
+ return Float.parseFloat((String) value);
+ }
+
+ @Override
+ public Double toDouble(Object value) {
+ return Double.parseDouble((String) value);
+ }
+ // Produces the TEXT form of a value of another type by delegating to sourceType.toString(value)
+ @Override
+ public String convert(Object value, PinotDataType sourceType) {
+ return sourceType.toString(value);
+ }
+ },
+
OBJECT {
@Override
public Integer toInteger(Object value) {
@@ -428,7 +455,7 @@ public enum PinotDataType {
}
public Object convert(Object value, PinotDataType sourceType) {
- throw new UnsupportedOperationException("Cannot convert value: " + value + " form: " + sourceType + " to: " + this);
+ throw new UnsupportedOperationException("Cannot convert value: " + value + " from: " + sourceType + " to: " + this);
}
public boolean isSingleValue() {
@@ -473,6 +500,12 @@ public enum PinotDataType {
return fieldSpec.isSingleValueField() ? PinotDataType.DOUBLE : PinotDataType.DOUBLE_ARRAY;
case STRING:
return fieldSpec.isSingleValueField() ? PinotDataType.STRING : PinotDataType.STRING_ARRAY;
+ case TEXT:
+ if (fieldSpec.isSingleValueField()) {
+ return PinotDataType.TEXT;
+ } else {
+ throw new UnsupportedOperationException("Unsupported multi-valued type: TEXT");
+ }
case BYTES:
if (fieldSpec.isSingleValueField()) {
return PinotDataType.BYTES;
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java b/pinot-core/src/main/java/com/linkedin/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java
index 0d9f670..73e144b 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java
@@ -30,6 +30,7 @@ import com.linkedin.pinot.core.data.readers.CSVRecordReaderConfig;
import com.linkedin.pinot.core.data.readers.FileFormat;
import com.linkedin.pinot.core.data.readers.RecordReaderConfig;
import com.linkedin.pinot.core.io.compression.ChunkCompressorFactory;
+import com.linkedin.pinot.core.segment.creator.impl.textsearch.TextSearchIndexConfig;
import com.linkedin.pinot.core.segment.name.DefaultSegmentNameGenerator;
import com.linkedin.pinot.core.segment.name.SegmentNameGenerator;
import com.linkedin.pinot.core.startree.v2.builder.StarTreeV2BuilderConfig;
@@ -105,6 +106,7 @@ public class SegmentGeneratorConfig {
private String _simpleDateFormat = null;
// Use on-heap or off-heap memory to generate index (currently only affect inverted index and star-tree v2)
private boolean _onHeap = false;
+ private Map<String, TextSearchIndexConfig> _textSearchIndexConfigs = new HashMap<>();
public SegmentGeneratorConfig() {
}
@@ -570,6 +572,14 @@ public class SegmentGeneratorConfig {
_rawIndexCompressionType.putAll(rawIndexCompressionType);
}
+ /** Registers {@code config} under its column name, replacing any config previously set for that column. */
+ public void setTextSearchIndexConfig(TextSearchIndexConfig config) {
+ _textSearchIndexConfigs.put(config.getColumnName(),config);
+ }
+ /** Returns the internal (mutable, not copied) map from column name to its text-search index config. */
+ public Map<String, TextSearchIndexConfig> getTextSearchIndexConfigs() {
+ return _textSearchIndexConfigs;
+ }
@JsonIgnore
public String getMetrics() {
return getQualifyingFields(FieldType.METRIC, true);
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/InvertedIndexCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/DictionaryBasedInvertedIndexCreator.java
similarity index 88%
copy from pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/InvertedIndexCreator.java
copy to pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/DictionaryBasedInvertedIndexCreator.java
index 4af054d..c933efd 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/InvertedIndexCreator.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/DictionaryBasedInvertedIndexCreator.java
@@ -15,10 +15,6 @@
*/
package com.linkedin.pinot.core.segment.creator;
-import java.io.Closeable;
-import java.io.IOException;
-
-
/**
* Currently only support RoaringBitmap inverted index.
* <pre>
@@ -37,10 +33,10 @@ import java.io.IOException;
* |-------------------------------------------------------------------------|
* </pre>
*
- * <p>To create an inverted index:
+ * <p>To create a dictionary based inverted index:
* <ul>
* <li>
- * Construct an instance of <code>InvertedIndexCreator</code>
+ * Construct an instance of <code>DictionaryBasedInvertedIndexCreator</code>
* </li>
* <li>
* Call add() for each docId in sequence starting with 0 to add dictId (dictIds for multi-valued column) into the
@@ -51,7 +47,7 @@ import java.io.IOException;
* </li>
* </ul>
*/
-public interface InvertedIndexCreator extends Closeable {
+public interface DictionaryBasedInvertedIndexCreator extends InvertedIndexCreator {
/**
* For single-valued column, adds the dictionary Id for the next document.
@@ -62,9 +58,4 @@ public interface InvertedIndexCreator extends Closeable {
* For multi-valued column, adds the dictionary Ids for the next document.
*/
void add(int[] dictIds, int length);
-
- /**
- * Seals the index and flushes it to disk.
- */
- void seal() throws IOException;
}
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/DocBasedInvertedIndexCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/DocBasedInvertedIndexCreator.java
new file mode 100644
index 0000000..884d446
--- /dev/null
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/DocBasedInvertedIndexCreator.java
@@ -0,0 +1,27 @@
+/**
+ * Copyright (C) 2014-2018 LinkedIn Corp. (pinot-core@linkedin.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.linkedin.pinot.core.segment.creator;
+
+/**
+ * An {@link InvertedIndexCreator} that indexes whole document values instead of dictionary ids.
+ */
+public interface DocBasedInvertedIndexCreator extends InvertedIndexCreator {
+
+ /**
+ * Adds one document (e.g. the raw value of a TEXT column) to the index; {@code doc} is the value to index.
+ */
+ void add(Object doc);
+}
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/InvertedIndexCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/InvertedIndexCreator.java
index 4af054d..9943583 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/InvertedIndexCreator.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/InvertedIndexCreator.java
@@ -18,51 +18,7 @@ package com.linkedin.pinot.core.segment.creator;
import java.io.Closeable;
import java.io.IOException;
-
-/**
- * Currently only support RoaringBitmap inverted index.
- * <pre>
- * Layout for RoaringBitmap inverted index:
- * |-------------------------------------------------------------------------|
- * | Start offset of 1st bitmap |
- * | End offset of 1st bitmap (exclusive) / Start offset of 2nd bitmap |
- * | ... |
- * | End offset of 2nd last bitmap (exclusive) / Start offset of last bitmap |
- * | End offset of last bitmap (exclusive) |
- * |-------------------------------------------------------------------------|
- * | Data for 1st bitmap |
- * | Data for 2nd bitmap |
- * | ... |
- * | Data for last bitmap |
- * |-------------------------------------------------------------------------|
- * </pre>
- *
- * <p>To create an inverted index:
- * <ul>
- * <li>
- * Construct an instance of <code>InvertedIndexCreator</code>
- * </li>
- * <li>
- * Call add() for each docId in sequence starting with 0 to add dictId (dictIds for multi-valued column) into the
- * creator
- * </li>
- * <li>
- * Call seal() after all dictIds have been added
- * </li>
- * </ul>
- */
public interface InvertedIndexCreator extends Closeable {
-
- /**
- * For single-valued column, adds the dictionary Id for the next document.
- */
- void add(int dictId);
-
- /**
- * For multi-valued column, adds the dictionary Ids for the next document.
- */
- void add(int[] dictIds, int length);
-
/**
* Seals the index and flushes it to disk.
*/
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java
index 93b74b8..44cf3a1 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java
@@ -28,6 +28,8 @@ import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
import com.linkedin.pinot.core.io.compression.ChunkCompressorFactory;
import com.linkedin.pinot.core.io.util.PinotDataBitSet;
import com.linkedin.pinot.core.segment.creator.ColumnIndexCreationInfo;
+import com.linkedin.pinot.core.segment.creator.DictionaryBasedInvertedIndexCreator;
+import com.linkedin.pinot.core.segment.creator.DocBasedInvertedIndexCreator;
import com.linkedin.pinot.core.segment.creator.ForwardIndexCreator;
import com.linkedin.pinot.core.segment.creator.InvertedIndexCreator;
import com.linkedin.pinot.core.segment.creator.MultiValueForwardIndexCreator;
@@ -42,6 +44,8 @@ import com.linkedin.pinot.core.segment.creator.impl.fwd.SingleValueUnsortedForwa
import com.linkedin.pinot.core.segment.creator.impl.fwd.SingleValueVarByteRawIndexCreator;
import com.linkedin.pinot.core.segment.creator.impl.inv.OffHeapBitmapInvertedIndexCreator;
import com.linkedin.pinot.core.segment.creator.impl.inv.OnHeapBitmapInvertedIndexCreator;
+import com.linkedin.pinot.core.segment.creator.impl.textsearch.TextSearchIndexConfig;
+import com.linkedin.pinot.core.segment.creator.impl.textsearch.TextSearchIndexCreatorFactory;
import com.linkedin.pinot.startree.hll.HllConfig;
import java.io.File;
import java.io.IOException;
@@ -75,7 +79,8 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
private Map<String, ColumnIndexCreationInfo> indexCreationInfoMap;
private Map<String, SegmentDictionaryCreator> _dictionaryCreatorMap = new HashMap<>();
private Map<String, ForwardIndexCreator> _forwardIndexCreatorMap = new HashMap<>();
- private Map<String, InvertedIndexCreator> _invertedIndexCreatorMap = new HashMap<>();
+ private Map<String, DictionaryBasedInvertedIndexCreator> _invertedIndexCreatorMap = new HashMap<>();
+ private Map<String, DocBasedInvertedIndexCreator> _docBasedInvertedIndexCreatorMap = new HashMap<>();
private String segmentName;
private Schema schema;
private File _indexDir;
@@ -172,16 +177,34 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
// TODO: add support to multi-value column and inverted index
Preconditions.checkState(fieldSpec.isSingleValueField(), "Cannot create raw index for multi-value column: %s",
columnName);
- Preconditions.checkState(!invertedIndexColumns.contains(columnName),
+
+ boolean createInvertedIndex = invertedIndexColumns.contains(columnName);
+ FieldSpec.DataType type = fieldSpec.getDataType();
+ // TEXT columns can have inverted indices
+ Preconditions.checkState(type == FieldSpec.DataType.TEXT ||
+ !createInvertedIndex,
"Cannot create inverted index for raw index column: %s", columnName);
ChunkCompressorFactory.CompressionType compressionType =
getColumnCompressionType(segmentCreationSpec, fieldSpec);
- // Initialize forward index creator
- _forwardIndexCreatorMap.put(columnName,
- getRawIndexCreatorForColumn(_indexDir, compressionType, columnName, fieldSpec.getDataType(), totalDocs,
- indexCreationInfo.getLengthOfLongestEntry()));
+ boolean createFwdIndex = true;
+ if(createInvertedIndex && type == FieldSpec.DataType.TEXT) {
+ TextSearchIndexConfig config = segmentCreationSpec.getTextSearchIndexConfigs().get(columnName);
+ if (config == null) {
+ config = TextSearchIndexConfig.getDefaultConfig(columnName, _indexDir);
+ }
+ createFwdIndex = config.shouldStore();
+ _docBasedInvertedIndexCreatorMap.put(columnName, TextSearchIndexCreatorFactory.createSearchIndexer(config));
+ }
+
+ if (createFwdIndex) {
+ // Initialize forward index creator
+ _forwardIndexCreatorMap.put(columnName,
+ getRawIndexCreatorForColumn(_indexDir, compressionType, columnName, fieldSpec.getDataType(), totalDocs,
+ indexCreationInfo.getLengthOfLongestEntry()));
+
+ }
}
}
}
@@ -254,23 +277,31 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
throw new RuntimeException("Null value for column:" + columnName);
}
- SegmentDictionaryCreator dictionaryCreator = _dictionaryCreatorMap.get(columnName);
- if (schema.getFieldSpecFor(columnName).isSingleValueField()) {
- if (dictionaryCreator != null) {
- int dictId = dictionaryCreator.indexOfSV(columnValueToIndex);
- ((SingleValueForwardIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter, dictId);
- if (_invertedIndexCreatorMap.containsKey(columnName)) {
- _invertedIndexCreatorMap.get(columnName).add(dictId);
+ if (_docBasedInvertedIndexCreatorMap.containsKey(columnName)) {
+ // inverted index
+ _docBasedInvertedIndexCreatorMap.get(columnName).add(columnValueToIndex);
+ // forward index
+ ((SingleValueRawIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter,
+ columnValueToIndex);
+ } else {
+ SegmentDictionaryCreator dictionaryCreator = _dictionaryCreatorMap.get(columnName);
+ if (schema.getFieldSpecFor(columnName).isSingleValueField()) {
+ if (dictionaryCreator != null) {
+ int dictId = dictionaryCreator.indexOfSV(columnValueToIndex);
+ ((SingleValueForwardIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter, dictId);
+ if (_invertedIndexCreatorMap.containsKey(columnName)) {
+ _invertedIndexCreatorMap.get(columnName).add(dictId);
+ }
+ } else {
+ ((SingleValueRawIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter,
+ columnValueToIndex);
}
} else {
- ((SingleValueRawIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter,
- columnValueToIndex);
- }
- } else {
- int[] dictIds = dictionaryCreator.indexOfMV(columnValueToIndex);
- ((MultiValueForwardIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter, dictIds);
- if (_invertedIndexCreatorMap.containsKey(columnName)) {
- _invertedIndexCreatorMap.get(columnName).add(dictIds, dictIds.length);
+ int[] dictIds = dictionaryCreator.indexOfMV(columnValueToIndex);
+ ((MultiValueForwardIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter, dictIds);
+ if (_invertedIndexCreatorMap.containsKey(columnName)) {
+ _invertedIndexCreatorMap.get(columnName).add(dictIds, dictIds.length);
+ }
}
}
}
@@ -287,6 +318,9 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
for (InvertedIndexCreator invertedIndexCreator : _invertedIndexCreatorMap.values()) {
invertedIndexCreator.seal();
}
+ for (DocBasedInvertedIndexCreator docIndexCreator : _docBasedInvertedIndexCreatorMap.values()) {
+ docIndexCreator.seal();
+ }
writeMetadata();
}
@@ -512,6 +546,7 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
case STRING:
case BYTES:
+ case TEXT:
indexCreator =
new SingleValueVarByteRawIndexCreator(file, compressionType, column, totalDocs, lengthOfLongestEntry);
break;
@@ -534,5 +569,8 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
for (InvertedIndexCreator invertedIndexCreator : _invertedIndexCreatorMap.values()) {
invertedIndexCreator.close();
}
+ for (DocBasedInvertedIndexCreator docIndexCreator : _docBasedInvertedIndexCreatorMap.values()) {
+ docIndexCreator.close();
+ }
}
}
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentDictionaryCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentDictionaryCreator.java
index b2d0c5b..9e4950c 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentDictionaryCreator.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentDictionaryCreator.java
@@ -137,6 +137,7 @@ public class SegmentDictionaryCreator implements Closeable {
_fieldSpec.getName(), numValues, sortedDoubles[0], sortedDoubles[numValues - 1]);
return;
case STRING:
+ case TEXT:
String[] sortedStrings = (String[]) _sortedValues;
numValues = sortedStrings.length;
Preconditions.checkState(numValues > 0);
@@ -162,8 +163,9 @@ public class SegmentDictionaryCreator implements Closeable {
}
}
LOGGER.info(
- "Created dictionary for STRING column: {} with cardinality: {}, max length in bytes: {}, range: {} to {}",
- _fieldSpec.getName(), numValues, _numBytesPerEntry, sortedStrings[0], sortedStrings[numValues - 1]);
+ "Created dictionary for {} column: {} with cardinality: {}, max length in bytes: {}, range: {} to {}",
+ _fieldSpec.getDataType(),_fieldSpec.getName(), numValues, _numBytesPerEntry, sortedStrings[0],
+ sortedStrings[numValues - 1]);
return;
case BYTES:
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OffHeapBitmapInvertedIndexCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OffHeapBitmapInvertedIndexCreator.java
index 2d6d762..d847b6e 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OffHeapBitmapInvertedIndexCreator.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OffHeapBitmapInvertedIndexCreator.java
@@ -17,6 +17,7 @@ package com.linkedin.pinot.core.segment.creator.impl.inv;
import com.google.common.base.Preconditions;
import com.linkedin.pinot.common.data.FieldSpec;
+import com.linkedin.pinot.core.segment.creator.DictionaryBasedInvertedIndexCreator;
import com.linkedin.pinot.core.segment.creator.InvertedIndexCreator;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import com.linkedin.pinot.core.segment.memory.PinotDataBuffer;
@@ -30,7 +31,7 @@ import org.roaringbitmap.buffer.MutableRoaringBitmap;
/**
- * Implementation of {@link InvertedIndexCreator} that uses off-heap memory.
+ * Implementation of {@link DictionaryBasedInvertedIndexCreator} that uses off-heap memory.
* <p>We use 2 passes to create the inverted index.
* <ul>
* <li>
@@ -48,7 +49,7 @@ import org.roaringbitmap.buffer.MutableRoaringBitmap;
* </ul>
* <p>Based on the number of values we need to store, we use direct memory or MMap file to allocate the buffer.
*/
-public final class OffHeapBitmapInvertedIndexCreator implements InvertedIndexCreator {
+public final class OffHeapBitmapInvertedIndexCreator implements DictionaryBasedInvertedIndexCreator {
// Use MMapBuffer if the buffer size is larger than 100MB
private static final int NUM_VALUES_THRESHOLD_FOR_MMAP_BUFFER = 25_000_000;
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OnHeapBitmapInvertedIndexCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OnHeapBitmapInvertedIndexCreator.java
index b6d9eb1..8fdd351 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OnHeapBitmapInvertedIndexCreator.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OnHeapBitmapInvertedIndexCreator.java
@@ -16,6 +16,7 @@
package com.linkedin.pinot.core.segment.creator.impl.inv;
import com.google.common.base.Preconditions;
+import com.linkedin.pinot.core.segment.creator.DictionaryBasedInvertedIndexCreator;
import com.linkedin.pinot.core.segment.creator.InvertedIndexCreator;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import java.io.BufferedOutputStream;
@@ -28,9 +29,9 @@ import org.roaringbitmap.buffer.MutableRoaringBitmap;
/**
- * Implementation of {@link InvertedIndexCreator} that uses on-heap memory.
+ * Implementation of {@link DictionaryBasedInvertedIndexCreator} that uses on-heap memory.
*/
-public final class OnHeapBitmapInvertedIndexCreator implements InvertedIndexCreator {
+public final class OnHeapBitmapInvertedIndexCreator implements DictionaryBasedInvertedIndexCreator {
private final File _invertedIndexFile;
private final MutableRoaringBitmap[] _bitmaps;
private int _nextDocId;
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/stats/SegmentPreIndexStatsCollectorImpl.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/stats/SegmentPreIndexStatsCollectorImpl.java
index 1938be9..436b9e2 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/stats/SegmentPreIndexStatsCollectorImpl.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/stats/SegmentPreIndexStatsCollectorImpl.java
@@ -51,6 +51,7 @@ public class SegmentPreIndexStatsCollectorImpl implements SegmentPreIndexStatsCo
switch (spec.getDataType()) {
case BOOLEAN:
case STRING:
+ case TEXT:
columnStatsCollectorMap.put(spec.getName(),
new StringColumnPreIndexStatsCollector(column, _statsCollectorConfig));
break;
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/LuceneIndexCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/LuceneIndexCreator.java
new file mode 100644
index 0000000..2381113
--- /dev/null
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/LuceneIndexCreator.java
@@ -0,0 +1,85 @@
+/**
+ * Copyright (C) 2014-2018 LinkedIn Corp. (pinot-core@linkedin.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.linkedin.pinot.core.segment.creator.impl.textsearch;
+
+import com.linkedin.pinot.core.segment.creator.DocBasedInvertedIndexCreator;
+import java.io.File;
+import java.io.IOException;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/** Lucene-backed text index creator: every add() writes one document to the index under indexDir/columnName. */
+class LuceneIndexCreator implements DocBasedInvertedIndexCreator {
+  private static final Logger LOGGER = LoggerFactory.getLogger(LuceneIndexCreator.class);
+  public static final String VERSION = "7.6.0";
+  // RAM buffer size (MB) passed to setRAMBufferSizeMB(); Lucene flushes buffered documents once it is exceeded
+  private static final int MAX_BUFFER_SIZE_MB = 500;
+  private static final String DEFAULT_FIELD = "TEXT";
+  private static final Field.Store DEFAULT_STORE = Field.Store.NO;
+  private final TextSearchIndexConfig _config;
+  private final StandardAnalyzer _analyzer;
+  private final IndexWriter _writer;
+  private final IndexWriterConfig _indexWriterConfig;
+  private final Directory _indexDirectory;
+  /** Opens the index directory and writer for the configured column; I/O failures surface as RuntimeException. */
+  public LuceneIndexCreator(TextSearchIndexConfig config) {
+    _config = config;
+    _analyzer = new StandardAnalyzer();
+    _indexWriterConfig = new IndexWriterConfig(_analyzer);
+    _indexWriterConfig.setRAMBufferSizeMB(MAX_BUFFER_SIZE_MB);
+    File dir = new File(config.getIndexDir().getPath() + "/" + _config.getColumnName());
+    try {
+      _indexDirectory = FSDirectory.open(dir.toPath());
+      _writer = new IndexWriter(_indexDirectory, _indexWriterConfig);
+    } catch (IOException e) {
+      LOGGER.error("Encountered error creating LuceneIndexCreator ", e);
+      throw new RuntimeException(e);
+    }
+  }
+  /** Commits everything added so far, so the index is flushed to disk as the seal() contract requires. */
+  @Override
+  public void seal() throws IOException {
+    _writer.commit();
+  }
+  /** Indexes {@code doc.toString()} as the single, non-stored text field of a new Lucene document. */
+  @Override
+  public void add(Object doc) {
+    Document document = new Document();
+    document.add(new TextField(DEFAULT_FIELD, doc.toString(), DEFAULT_STORE));
+    try {
+      _writer.addDocument(document);
+    } catch (IOException e) {
+      LOGGER.error("Encountered exception while adding doc:{}", doc.toString(), e);
+      throw new RuntimeException(e);
+    }
+  }
+  /** Closes the writer, then always releases the directory and analyzer, even if closing the writer fails. */
+  @Override
+  public void close() throws IOException {
+    try (Directory directory = _indexDirectory; StandardAnalyzer analyzer = _analyzer) {
+      _writer.close();
+    }
+  }
+}
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexConfig.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexConfig.java
new file mode 100644
index 0000000..d8a4fbd
--- /dev/null
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexConfig.java
@@ -0,0 +1,54 @@
+/**
+ * Copyright (C) 2014-2018 LinkedIn Corp. (pinot-core@linkedin.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.linkedin.pinot.core.segment.creator.impl.textsearch;
+
+import java.io.File;
+
+
+/**
+ * Settings for building a text-search index on one column: the index type, whether values should be stored,
+ * the index directory and the column name. All values are fixed at construction time.
+ */
+public class TextSearchIndexConfig {
+
+  private final String _columnName;
+  private final File _indexDir;
+  private final TextSearchIndexType _type;
+  private final boolean _store;
+
+  /**
+   * @param type text-search index implementation to use
+   * @param store value later reported by {@link #shouldStore()}
+   * @param indexDir directory reported by {@link #getIndexDir()}
+   * @param columnName name of the column being indexed
+   */
+  public TextSearchIndexConfig(TextSearchIndexType type, boolean store, File indexDir, String columnName) {
+    _columnName = columnName;
+    _indexDir = indexDir;
+    _store = store;
+    _type = type;
+  }
+
+  /**
+   * Builds the default configuration: a {@link TextSearchIndexType#LUCENE} index with the store flag enabled.
+   * Note that the argument order here (column name first) differs from the constructor's.
+   *
+   * @param columnName name of the column being indexed
+   * @param indexDir directory reported by {@link #getIndexDir()}
+   * @return a new configuration instance
+   */
+  public static TextSearchIndexConfig getDefaultConfig(String columnName, File indexDir) {
+    final boolean storeByDefault = true;
+    return new TextSearchIndexConfig(TextSearchIndexType.LUCENE, storeByDefault, indexDir, columnName);
+  }
+
+  /** @return name of the column being indexed */
+  public String getColumnName() {
+    return _columnName;
+  }
+
+  /** @return the index directory given at construction time */
+  public File getIndexDir() {
+    return _indexDir;
+  }
+
+  /** @return the store flag given at construction time */
+  public boolean shouldStore() {
+    return _store;
+  }
+
+  /** @return the configured text-search index type */
+  public TextSearchIndexType getType() {
+    return _type;
+  }
+}
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexCreatorFactory.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexCreatorFactory.java
new file mode 100644
index 0000000..bd05b62
--- /dev/null
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexCreatorFactory.java
@@ -0,0 +1,32 @@
+/**
+ * Copyright (C) 2014-2018 LinkedIn Corp. (pinot-core@linkedin.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.linkedin.pinot.core.segment.creator.impl.textsearch;
+
+import com.linkedin.pinot.core.segment.creator.DocBasedInvertedIndexCreator;
+
+
+/**
+ * Factory that picks the text-search index creator matching the type named in a {@link TextSearchIndexConfig}.
+ */
+public class TextSearchIndexCreatorFactory {
+
+  /**
+   * Creates the index creator for {@code config.getType()}.
+   *
+   * @param config text-search index configuration; it is passed on unchanged to the creator
+   * @return a {@link LuceneIndexCreator} when the configured type is {@code LUCENE}
+   * @throws IllegalArgumentException if the configured type has no creator
+   */
+  public static DocBasedInvertedIndexCreator createSearchIndexer(TextSearchIndexConfig config) {
+    final TextSearchIndexType indexType = config.getType();
+    switch (indexType) {
+      case LUCENE:
+        return new LuceneIndexCreator(config);
+      default:
+        throw new IllegalArgumentException("Unsupported TextSearchIndexType " + indexType);
+    }
+  }
+}
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexType.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexType.java
new file mode 100644
index 0000000..fe7637f
--- /dev/null
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexType.java
@@ -0,0 +1,50 @@
+/**
+ * Copyright (C) 2014-2018 LinkedIn Corp. (pinot-core@linkedin.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.linkedin.pinot.core.segment.creator.impl.textsearch;
+
+import java.util.HashMap;
+import java.util.Map;
+
+
+/**
+ * Enum for text search-index type.
+ *
+ * <p>Every constant carries an explicit integer id, and {@link #valueOf(int)} maps an id back to its constant.
+ */
+public enum TextSearchIndexType {
+  // NOTE: Do not reorder the enums when adding new ones
+  LUCENE(1);
+
+  // Reverse lookup from integer id to constant; filled once in the static initializer below and never modified
+  // afterwards.
+  private static final Map<Integer, TextSearchIndexType> VALUE_TO_TYPE = new HashMap<>();
+
+  static {
+    for (TextSearchIndexType type : values()) {
+      VALUE_TO_TYPE.put(type._value, type);
+    }
+  }
+
+  // Integer id of this constant; final because enum constants are shared singletons.
+  private final int _value;
+
+  TextSearchIndexType(int value) {
+    _value = value;
+  }
+
+  /**
+   * Looks up the constant registered for an integer id.
+   *
+   * @param type integer id as returned by {@link #getValue()}
+   * @return the matching constant, or {@code null} when no constant uses that id
+   */
+  public static TextSearchIndexType valueOf(int type) {
+    return VALUE_TO_TYPE.get(type);
+  }
+
+  /**
+   * @return the integer id of this constant
+   */
+  public int getValue() {
+    return _value;
+  }
+}
\ No newline at end of file
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/index/column/PhysicalColumnIndexContainer.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/index/column/PhysicalColumnIndexContainer.java
index 29ef748..da1e9cd 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/index/column/PhysicalColumnIndexContainer.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/index/column/PhysicalColumnIndexContainer.java
@@ -71,6 +71,7 @@ public final class PhysicalColumnIndexContainer implements ColumnIndexContainer
}
PinotDataBuffer fwdIndexBuffer = segmentReader.getIndexFor(columnName, ColumnIndexType.FORWARD_INDEX);
+ FieldSpec.DataType type = metadata.getDataType();
if (metadata.hasDictionary()) {
//bloom filter
if (loadBloomFilter) {
@@ -110,10 +111,15 @@ public final class PhysicalColumnIndexContainer implements ColumnIndexContainer
}
} else {
// Raw index
- _forwardIndex = loadRawForwardIndex(fwdIndexBuffer, metadata.getDataType());
- _invertedIndex = null;
+ _forwardIndex = loadRawForwardIndex(fwdIndexBuffer, type);
_dictionary = null;
_bloomFilterReader = null;
+ _invertedIndex = null;
+ if (loadInvertedIndex) {
+ if (type == FieldSpec.DataType.TEXT) {
+ // TODO: load text-search inverted index
+ }
+ }
}
}
@@ -190,6 +196,7 @@ public final class PhysicalColumnIndexContainer implements ColumnIndexContainer
return new FixedByteChunkSingleValueReader(forwardIndexBuffer);
case STRING:
case BYTES:
+ case TEXT:
return new VarByteChunkSingleValueReader(forwardIndexBuffer);
default:
throw new IllegalStateException("Illegal data type for raw forward index: " + dataType);
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/util/AvroUtils.java b/pinot-core/src/main/java/com/linkedin/pinot/core/util/AvroUtils.java
index f43d676..58d6eb0 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/util/AvroUtils.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/util/AvroUtils.java
@@ -159,6 +159,7 @@ public class AvroUtils {
fieldAssembler = fieldAssembler.name(fieldSpec.getName()).type().doubleType().noDefault();
break;
case STRING:
+ case TEXT:
fieldAssembler = fieldAssembler.name(fieldSpec.getName()).type().stringType().noDefault();
break;
case BYTES:
diff --git a/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithBytesTypeTest.java b/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithBytesTypeTest.java
index 1c64375..127530e 100644
--- a/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithBytesTypeTest.java
+++ b/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithBytesTypeTest.java
@@ -65,12 +65,12 @@ public class SegmentGenerationWithBytesTypeTest {
private static final int NUM_SORTED_VALUES = 1001;
private static final String SEGMENT_DIR_NAME =
- System.getProperty("java.io.tmpdir") + File.separator + "bytesTypeTest";
+ System.getProperty("java.io.tmpdir") + File.separator + "TextTypeTest";
private static final String SEGMENT_NAME = "testSegment";
private static final String AVRO_DIR_NAME = System.getProperty("java.io.tmpdir") + File.separator + "tDigestTest";
- private static final String AVRO_NAME = "tDigest.avro";
+ private static final String AVRO_NAME = "text.avro";
private static final String FIXED_BYTE_SORTED_COLUMN = "sortedColumn";
private static final String FIXED_BYTES_UNSORTED_COLUMN = "fixedBytes";
diff --git a/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithBytesTypeTest.java b/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithTextTypeTest.java
similarity index 54%
copy from pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithBytesTypeTest.java
copy to pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithTextTypeTest.java
index 1c64375..a5440ae 100644
--- a/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithBytesTypeTest.java
+++ b/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithTextTypeTest.java
@@ -15,14 +15,11 @@
*/
package com.linkedin.pinot.core.segment.index.creator;
-import com.google.common.primitives.Ints;
import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.FieldSpec;
-import com.linkedin.pinot.common.data.MetricFieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.common.segment.SegmentMetadata;
-import com.linkedin.pinot.common.utils.primitive.ByteArray;
import com.linkedin.pinot.core.data.GenericRow;
import com.linkedin.pinot.core.data.readers.GenericRowRecordReader;
import com.linkedin.pinot.core.data.readers.PinotSegmentRecordReader;
@@ -41,6 +38,7 @@ import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
@@ -49,6 +47,7 @@ import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.RandomStringUtils;
import org.testng.Assert;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
@@ -56,26 +55,22 @@ import org.testng.annotations.Test;
/**
- * Class for testing segment generation with byte[] data type.
+ * Class for testing segment generation with text data type.
*/
-public class SegmentGenerationWithBytesTypeTest {
+public class SegmentGenerationWithTextTypeTest {
private static final int NUM_ROWS = 10001;
- private static final int FIXED_BYTE_LENGTH = 53;
- private static final int MAX_VARIABLE_BYTES_LENGTH = 101;
- private static final int NUM_SORTED_VALUES = 1001;
+ private static final int MAX_STRING_LENGTH = 1000;
private static final String SEGMENT_DIR_NAME =
- System.getProperty("java.io.tmpdir") + File.separator + "bytesTypeTest";
+ System.getProperty("java.io.tmpdir") + File.separator + "textTypeTest";
private static final String SEGMENT_NAME = "testSegment";
- private static final String AVRO_DIR_NAME = System.getProperty("java.io.tmpdir") + File.separator + "tDigestTest";
+ private static final String AVRO_DIR_NAME = System.getProperty("java.io.tmpdir") + File.separator + "textTest";
- private static final String AVRO_NAME = "tDigest.avro";
+ private static final String AVRO_NAME = "text.avro";
- private static final String FIXED_BYTE_SORTED_COLUMN = "sortedColumn";
- private static final String FIXED_BYTES_UNSORTED_COLUMN = "fixedBytes";
- private static final String FIXED_BYTES_NO_DICT_COLUMN = "fixedBytesNoDict";
- private static final String VARIABLE_BYTES_COLUMN = "variableBytes";
+ private static final String TEXT_SORTED_COLUMN = "sortedColumn";
+ private static final String TEXT_COLUMN = "textColumn";
private Random _random;
private RecordReader _recordReader;
@@ -91,14 +86,17 @@ public class SegmentGenerationWithBytesTypeTest {
public void setup() throws Exception {
_schema = new Schema();
- _schema.addField(new DimensionFieldSpec(FIXED_BYTE_SORTED_COLUMN, FieldSpec.DataType.BYTES, true));
- _schema.addField(new DimensionFieldSpec(FIXED_BYTES_UNSORTED_COLUMN, FieldSpec.DataType.BYTES, true));
- _schema.addField(new DimensionFieldSpec(FIXED_BYTES_NO_DICT_COLUMN, FieldSpec.DataType.BYTES, true));
- _schema.addField(new DimensionFieldSpec(VARIABLE_BYTES_COLUMN, FieldSpec.DataType.BYTES, true));
+ _schema.addField(new DimensionFieldSpec(TEXT_SORTED_COLUMN, FieldSpec.DataType.TEXT, true));
+ _schema.addField(new DimensionFieldSpec(TEXT_COLUMN, FieldSpec.DataType.TEXT, true));
_random = new Random(System.nanoTime());
- _recordReader = buildIndex(_schema);
- _segment = ImmutableSegmentLoader.load(new File(SEGMENT_DIR_NAME, SEGMENT_NAME), ReadMode.heap);
+ try {
+ _recordReader = buildIndex(_schema);
+ _segment = ImmutableSegmentLoader.load(new File(SEGMENT_DIR_NAME, SEGMENT_NAME), ReadMode.heap);
+ } catch (Exception e) {
+ e.printStackTrace();
+ throw e;
+ }
}
/**
@@ -122,12 +120,10 @@ public class SegmentGenerationWithBytesTypeTest {
GenericRow actualRow = pinotReader.next();
for (String column : _schema.getColumnNames()) {
- byte[] actual = (byte[]) actualRow.getValue(column);
- byte[] expected = (byte[]) expectedRow.getValue(column);
+ String actual = (String) actualRow.getValue(column);
+ String expected = (String) expectedRow.getValue(column);
- if (ByteArray.compare(actual, expected) != 0) {
- Assert.assertEquals(actualRow.getValue(column), expectedRow.getValue(column));
- }
+ Assert.assertEquals(actual, expected);
}
}
@@ -138,59 +134,30 @@ public class SegmentGenerationWithBytesTypeTest {
@Test
public void testMetadata() {
- Assert.assertTrue(_segment.getDataSource(FIXED_BYTE_SORTED_COLUMN).getDataSourceMetadata().isSorted());
- Assert.assertFalse(_segment.getSegmentMetadata().hasDictionary(FIXED_BYTES_NO_DICT_COLUMN));
- }
-
- @Test
- public void testDictionary() {
- ImmutableDictionaryReader dictionary = (ImmutableDictionaryReader) _segment.getDictionary(FIXED_BYTE_SORTED_COLUMN);
- Assert.assertEquals(dictionary.length(), NUM_SORTED_VALUES);
-
- // Test dictionary indexing.
- for (int i = 0; i < NUM_ROWS; i++) {
- int value = (i * NUM_SORTED_VALUES) / NUM_ROWS;
- // For sorted columns, values are written as 0, 0, 0.., 1, 1, 1...n, n, n
- Assert.assertEquals(dictionary.indexOf(Ints.toByteArray(value)), value % NUM_SORTED_VALUES);
- }
-
- // Test value not in dictionary.
- Assert.assertEquals(dictionary.indexOf(Ints.toByteArray(NUM_SORTED_VALUES + 1)), -1);
- Assert.assertEquals(dictionary.insertionIndexOf(Ints.toByteArray(NUM_SORTED_VALUES + 1)),
- -(dictionary.length() + 1));
-
- int[] dictIds = new int[dictionary.length()];
- for (int i = 0; i < dictIds.length; i++) {
- dictIds[i] = i;
- }
-
- byte[][] values = new byte[dictIds.length][];
- dictionary.readBytesValues(dictIds, 0, dictIds.length, values, 0);
- for (int expected = 0; expected < values.length; expected++) {
- int actual = ByteBuffer.wrap(values[expected]).asIntBuffer().get();
- Assert.assertEquals(actual, expected);
- }
+ Assert.assertTrue(_segment.getDataSource(TEXT_SORTED_COLUMN).getDataSourceMetadata().isSorted());
+ Assert.assertFalse(_segment.getSegmentMetadata().hasDictionary(TEXT_COLUMN));
+ Assert.assertFalse(_segment.getSegmentMetadata().hasDictionary(TEXT_SORTED_COLUMN));
}
/**
- * This test generates an avro with TDigest BYTES data, and tests segment generation.
+ * This test generates TEXT data, and tests segment generation.
*/
@Test
- public void testTDigestAvro() throws Exception {
+ public void testTextAvro() throws Exception {
Schema schema = new Schema();
- schema.addField(new MetricFieldSpec(FIXED_BYTES_UNSORTED_COLUMN, FieldSpec.DataType.BYTES));
- schema.addField(new MetricFieldSpec(VARIABLE_BYTES_COLUMN, FieldSpec.DataType.BYTES));
+ schema.addField(new DimensionFieldSpec(TEXT_COLUMN, FieldSpec.DataType.TEXT, true));
+ schema.addField(new DimensionFieldSpec(TEXT_SORTED_COLUMN, FieldSpec.DataType.TEXT, true));
- List<byte[]> _fixedExpected = new ArrayList<>(NUM_ROWS);
- List<byte[]> _varExpected = new ArrayList<>(NUM_ROWS);
+ String[] sortedExpected = new String[NUM_ROWS];
+ String[] unsortedExpected = new String[NUM_ROWS];
- buildAvro(schema, _fixedExpected, _varExpected);
+ buildAvro(schema, sortedExpected, unsortedExpected);
IndexSegment segment = buildSegmentFromAvro(schema, AVRO_DIR_NAME, AVRO_NAME, SEGMENT_NAME);
SegmentMetadata metadata = segment.getSegmentMetadata();
- Assert.assertTrue(metadata.hasDictionary(FIXED_BYTES_UNSORTED_COLUMN));
- Assert.assertFalse(metadata.hasDictionary(VARIABLE_BYTES_COLUMN));
+ Assert.assertFalse(metadata.hasDictionary(TEXT_COLUMN));
+ Assert.assertFalse(metadata.hasDictionary(TEXT_SORTED_COLUMN));
PinotSegmentRecordReader reader = new PinotSegmentRecordReader(new File(AVRO_DIR_NAME, SEGMENT_NAME));
GenericRow row = new GenericRow();
@@ -198,9 +165,9 @@ public class SegmentGenerationWithBytesTypeTest {
int i = 0;
while (reader.hasNext()) {
row = reader.next(row);
- Assert.assertEquals(ByteArray.compare((byte[]) row.getValue(FIXED_BYTES_UNSORTED_COLUMN), _fixedExpected.get(i)),
- 0);
- Assert.assertEquals(ByteArray.compare((byte[]) row.getValue(VARIABLE_BYTES_COLUMN), _varExpected.get(i++)), 0);
+ Assert.assertEquals(((String)row.getValue(TEXT_COLUMN)), unsortedExpected[i], "Comparison failed at index " + i);
+ Assert.assertTrue(((String)row.getValue(TEXT_SORTED_COLUMN)).equals(sortedExpected[i]), "Comparison failed at index " + i);
+ i++;
}
segment.destroy();
}
@@ -217,28 +184,23 @@ public class SegmentGenerationWithBytesTypeTest {
config.setOutDir(SEGMENT_DIR_NAME);
config.setSegmentName(SEGMENT_NAME);
- config.setRawIndexCreationColumns(Collections.singletonList(FIXED_BYTES_NO_DICT_COLUMN));
+ List<String> columns = Arrays.asList(TEXT_COLUMN, TEXT_SORTED_COLUMN);
+ config.setRawIndexCreationColumns(columns);
+ config.setInvertedIndexCreationColumns(columns);
+
+ String[] sorted = new String[NUM_ROWS];
+ for (int i = 0; i < NUM_ROWS; i++) {
+ String str = RandomStringUtils.randomAlphabetic(1 + _random.nextInt(MAX_STRING_LENGTH));
+ sorted[i] = str;
+ }
+ Arrays.sort(sorted);
List<GenericRow> rows = new ArrayList<>(NUM_ROWS);
for (int i = 0; i < NUM_ROWS; i++) {
HashMap<String, Object> map = new HashMap<>();
- // Set the value for fixed-byte sorted column.
- map.put(FIXED_BYTE_SORTED_COLUMN, Ints.toByteArray((i * NUM_SORTED_VALUES) / NUM_ROWS));
-
- // Set the value for fixed-byte unsorted column.
- byte[] fixedBytes = new byte[FIXED_BYTE_LENGTH];
- _random.nextBytes(fixedBytes);
- map.put(FIXED_BYTES_UNSORTED_COLUMN, fixedBytes);
-
- // Set the value for fixed-byte no-dictionary column.
- map.put(FIXED_BYTES_NO_DICT_COLUMN, fixedBytes);
-
- // Set the value fo variable length column. Ensure at least one zero-length byte[].
- int length = (i == 0) ? 0 : _random.nextInt(MAX_VARIABLE_BYTES_LENGTH);
- byte[] varBytes = new byte[length];
- _random.nextBytes(varBytes);
- map.put(VARIABLE_BYTES_COLUMN, varBytes);
+ map.put(TEXT_SORTED_COLUMN, sorted[i]);
+ map.put(TEXT_COLUMN, RandomStringUtils.randomAlphabetic(1 + _random.nextInt(MAX_STRING_LENGTH)));
GenericRow genericRow = new GenericRow();
genericRow.init(map);
@@ -256,14 +218,14 @@ public class SegmentGenerationWithBytesTypeTest {
}
/**
- * Build Avro file containing serialized TDigest bytes.
+ * Build Avro file containing Text.
*
* @param schema Schema of data (one fixed and one variable column)
- * @param _fixedExpected Serialized bytes of fixed length column are populated here
- * @param _varExpected Serialized bytes of variable length column are populated here
+ * @param sortedExpected Array that is populated with the values written to the sorted column
+ * @param unsortedExpected Array that is populated with the values written to the unsorted column
* @throws IOException
*/
- private void buildAvro(Schema schema, List<byte[]> _fixedExpected, List<byte[]> _varExpected) throws IOException {
+ private void buildAvro(Schema schema, String[] sortedExpected, String[] unsortedExpected) throws IOException {
org.apache.avro.Schema avroSchema = AvroUtils.getAvroSchemaFromPinotSchema(schema);
try (DataFileWriter<GenericData.Record> recordWriter = new DataFileWriter<>(new GenericDatumWriter<>(avroSchema))) {
@@ -272,31 +234,20 @@ public class SegmentGenerationWithBytesTypeTest {
throw new RuntimeException("Unable to create test directory: " + AVRO_DIR_NAME);
}
+ for (int i = 0; i < NUM_ROWS; i++) {
+ String str = RandomStringUtils.randomAlphanumeric(1 + _random.nextInt(MAX_STRING_LENGTH));
+ sortedExpected[i] = str;
+ }
+ Arrays.sort(sortedExpected);
+
recordWriter.create(avroSchema, new File(AVRO_DIR_NAME, AVRO_NAME));
for (int i = 0; i < NUM_ROWS; i++) {
GenericData.Record record = new GenericData.Record(avroSchema);
- TDigest tDigest = TDigest.createMergingDigest(PercentileTDigestAggregationFunction.DEFAULT_TDIGEST_COMPRESSION);
- tDigest.add(_random.nextDouble());
-
- ByteBuffer buffer = ByteBuffer.allocate(tDigest.byteSize());
- tDigest.asBytes(buffer);
- _fixedExpected.add(buffer.array());
-
- buffer.flip();
- record.put(FIXED_BYTES_UNSORTED_COLUMN, buffer);
-
- if (i % 2 == 0) {
- tDigest.add(_random.nextDouble());
- }
-
- buffer = ByteBuffer.allocate(tDigest.byteSize());
- tDigest.asBytes(buffer);
- _varExpected.add(buffer.array());
-
- buffer.flip();
- record.put(VARIABLE_BYTES_COLUMN, buffer);
-
+ String str = RandomStringUtils.randomAlphanumeric(1 + _random.nextInt(MAX_STRING_LENGTH));
+ unsortedExpected[i] = str;
+ record.put(TEXT_COLUMN, str);
+ record.put(TEXT_SORTED_COLUMN, sortedExpected[i]);
recordWriter.append(record);
}
}
@@ -315,6 +266,9 @@ public class SegmentGenerationWithBytesTypeTest {
config.setOutDir(dirName);
config.setSegmentName(segmentName);
config.setSchema(schema);
+ List<String> columns = Arrays.asList(TEXT_COLUMN, TEXT_SORTED_COLUMN);
+ config.setRawIndexCreationColumns(columns);
+ config.setInvertedIndexCreationColumns(columns);
SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
driver.init(config);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org