You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by si...@apache.org on 2020/11/23 19:22:12 UTC
[incubator-pinot] branch master updated: Support for text index
without raw (#6284)
This is an automated email from the ASF dual-hosted git repository.
siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git
The following commit(s) were added to refs/heads/master by this push:
new fe9d3c7 Support for text index without raw (#6284)
fe9d3c7 is described below
commit fe9d3c79d2da709e6ce15929df539f91fe960c2e
Author: Sidd <si...@gmail.com>
AuthorDate: Mon Nov 23 11:21:54 2020 -0800
Support for text index without raw (#6284)
* WIP - Support for text index without raw
* review comments
Co-authored-by: Siddharth Teotia <st...@steotia-mn1.linkedin.biz>
---
.../generator/SegmentGeneratorConfig.java | 6 +++
.../creator/impl/SegmentColumnarIndexCreator.java | 32 +++++++++++++--
.../pinot/queries/TextSearchQueriesTest.java | 48 +++++++++++++++-------
.../apache/pinot/spi/config/table/FieldConfig.java | 3 ++
4 files changed, 71 insertions(+), 18 deletions(-)
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java b/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java
index 5bc0be5..1d3324f 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java
@@ -18,6 +18,7 @@
*/
package org.apache.pinot.core.indexsegment.generator;
+import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import java.io.File;
import java.util.ArrayList;
@@ -299,6 +300,11 @@ public class SegmentGeneratorConfig {
}
}
+ @VisibleForTesting // exposed so tests can inject per-column properties directly
+ public void setColumnProperties(Map<String, Map<String, String>> columnProperties) {
+ _columnProperties = columnProperties; // replaces the existing map; the reference is stored as-is, without copying
+ }
+
public void setColumnSortOrder(List<String> sortOrder) {
Preconditions.checkNotNull(sortOrder);
_columnSortOrder.addAll(sortOrder);
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java
index 0b2845e..6aca94e 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java
@@ -95,6 +95,7 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
private int totalDocs;
private int docIdCounter;
private boolean _nullHandlingEnabled;
+ private Map<String, Map<String, String>> _columnProperties;
private final Set<String> _textIndexColumns = new HashSet<>();
@@ -105,6 +106,7 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
docIdCounter = 0;
config = segmentCreationSpec;
this.indexCreationInfoMap = indexCreationInfoMap;
+ _columnProperties = segmentCreationSpec.getColumnProperties();
// Check that the output directory does not exist
Preconditions.checkState(!outDir.exists(), "Segment output directory: %s already exists", outDir);
@@ -322,6 +324,10 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
if (isSingleValue) {
// SV column
+ // text-index enabled SV column
+ if (_textIndexColumns.contains(columnName)) {
+ _textIndexCreatorMap.get(columnName).add((String) columnValueToIndex);
+ }
if (dictionaryCreator != null) {
// dictionary encoded SV column
// get dictID from dictionary
@@ -337,6 +343,14 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
} else {
// non-dictionary encoded SV column
// store the docId -> raw value mapping in forward index
+ if (_textIndexColumns.contains(columnName) && !shouldStoreRawValueForTextIndex(columnName)) {
+ // for text index on raw columns, check the config to determine if actual raw value should
+ // be stored or not
+ columnValueToIndex = _columnProperties.get(columnName).get(FieldConfig.TEXT_INDEX_RAW_VALUE);
+ if (columnValueToIndex == null) {
+ columnValueToIndex = FieldConfig.TEXT_INDEX_DEFAULT_RAW_VALUE;
+ }
+ }
switch (forwardIndexCreator.getValueType()) {
case INT:
forwardIndexCreator.putInt((int) columnValueToIndex);
@@ -360,10 +374,6 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
throw new IllegalStateException();
}
}
- // text-index enabled SV column
- if (_textIndexColumns.contains(columnName)) {
- _textIndexCreatorMap.get(columnName).add((String) columnValueToIndex);
- }
} else {
// MV column (always dictionary encoded)
int[] dictIds = dictionaryCreator.indexOfMV(columnValueToIndex);
@@ -384,6 +394,20 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
docIdCounter++;
}
+ private boolean shouldStoreRawValueForTextIndex(String column) {
+ if (_columnProperties != null) {
+ Map<String, String> props = _columnProperties.get(column);
+ if (props != null && Boolean.parseBoolean(props.get(FieldConfig.TEXT_INDEX_NO_RAW_DATA))) {
+ // The column opted out of raw storage: its TEXT_INDEX_NO_RAW_DATA property parses as true
+ // (a missing property parses as false, so this branch is skipped by default).
+ // Returning false tells the caller to write a placeholder into the forward index instead.
+ return false;
+ }
+ }
+
+ return true; // default: no column properties, or no opt-out, so keep the actual raw value
+ }
+
@Override
public void setSegmentName(String segmentName) {
this.segmentName = segmentName;
diff --git a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
index 64610f9..4caa8e0 100644
--- a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
+++ b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
@@ -93,10 +93,11 @@ public class TextSearchQueriesTest extends BaseQueriesTest { private static fin
private static final String QUERY_LOG_TEXT_COL_NAME = "QUERY_LOG_TEXT_COL";
private static final String SKILLS_TEXT_COL_NAME = "SKILLS_TEXT_COL";
private static final String SKILLS_TEXT_COL_DICT_NAME = "SKILLS_TEXT_COL_DICT";
- private static final String SKILLS_COPY_TEXT_COL_NAME = "SKILLS_TEXT_COL_1";
+ private static final String SKILLS_TEXT_COL_MULTI_TERM_NAME = "SKILLS_TEXT_COL_1";
+ private static final String SKILLS_TEXT_NO_RAW_NAME = "SKILLS_TEXT_COL_2";
private static final String INT_COL_NAME = "INT_COL";
- private static final List<String> RAW_TEXT_INDEX_COLUMNS =
- Arrays.asList(QUERY_LOG_TEXT_COL_NAME, SKILLS_TEXT_COL_NAME, SKILLS_COPY_TEXT_COL_NAME);
+ private static final List<String> RAW_TEXT_INDEX_COLUMNS = Arrays
+ .asList(QUERY_LOG_TEXT_COL_NAME, SKILLS_TEXT_COL_NAME, SKILLS_TEXT_COL_MULTI_TERM_NAME, SKILLS_TEXT_NO_RAW_NAME);
private static final List<String> DICT_TEXT_INDEX_COLUMNS = Arrays.asList(SKILLS_TEXT_COL_DICT_NAME);
private static final int INT_BASE_VALUE = 1000;
@@ -135,7 +136,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest { private static fin
Map<String, Map<String, String>> columnProperties = new HashMap<>();
Map<String, String> props = new HashMap<>();
props.put(FieldConfig.TEXT_INDEX_USE_AND_FOR_MULTI_TERM_QUERIES, "true");
- columnProperties.put(SKILLS_COPY_TEXT_COL_NAME, props);
+ columnProperties.put(SKILLS_TEXT_COL_MULTI_TERM_NAME, props);
indexLoadingConfig.setColumnProperties(columnProperties);
ImmutableSegment immutableSegment =
ImmutableSegmentLoader.load(new File(INDEX_DIR, SEGMENT_NAME), indexLoadingConfig);
@@ -169,13 +170,19 @@ public class TextSearchQueriesTest extends BaseQueriesTest { private static fin
.addSingleValueDimension(QUERY_LOG_TEXT_COL_NAME, FieldSpec.DataType.STRING)
.addSingleValueDimension(SKILLS_TEXT_COL_NAME, FieldSpec.DataType.STRING)
.addSingleValueDimension(SKILLS_TEXT_COL_DICT_NAME, FieldSpec.DataType.STRING)
- .addSingleValueDimension(SKILLS_COPY_TEXT_COL_NAME, FieldSpec.DataType.STRING)
+ .addSingleValueDimension(SKILLS_TEXT_COL_MULTI_TERM_NAME, FieldSpec.DataType.STRING)
+ .addSingleValueDimension(SKILLS_TEXT_NO_RAW_NAME, FieldSpec.DataType.STRING)
.addMetric(INT_COL_NAME, FieldSpec.DataType.INT).build();
SegmentGeneratorConfig config = new SegmentGeneratorConfig(tableConfig, schema);
config.setOutDir(INDEX_DIR.getPath());
config.setTableName(TABLE_NAME);
config.setSegmentName(SEGMENT_NAME);
-
+ Map<String, Map<String, String>> columnProperties = new HashMap<>();
+ Map<String, String> props = new HashMap<>();
+ props.put(FieldConfig.TEXT_INDEX_NO_RAW_DATA, "true");
+ props.put(FieldConfig.TEXT_INDEX_RAW_VALUE, "ILoveCoding");
+ columnProperties.put(SKILLS_TEXT_NO_RAW_NAME, props);
+ config.setColumnProperties(columnProperties);
SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
try (RecordReader recordReader = new GenericRowRecordReader(rows)) {
driver.init(config, recordReader);
@@ -209,16 +216,18 @@ public class TextSearchQueriesTest extends BaseQueriesTest { private static fin
String line;
while ((line = reader.readLine()) != null) {
GenericRow row = new GenericRow();
- row.putField(INT_COL_NAME, INT_BASE_VALUE + counter);
- row.putField(QUERY_LOG_TEXT_COL_NAME, line);
+ row.putValue(INT_COL_NAME, INT_BASE_VALUE + counter);
+ row.putValue(QUERY_LOG_TEXT_COL_NAME, line);
if (counter >= skillCount) {
- row.putField(SKILLS_TEXT_COL_NAME, "software engineering");
- row.putField(SKILLS_TEXT_COL_DICT_NAME, "software engineering");
- row.putField(SKILLS_COPY_TEXT_COL_NAME, "software engineering");
+ row.putValue(SKILLS_TEXT_COL_NAME, "software engineering");
+ row.putValue(SKILLS_TEXT_COL_DICT_NAME, "software engineering");
+ row.putValue(SKILLS_TEXT_COL_MULTI_TERM_NAME, "software engineering");
+ row.putValue(SKILLS_TEXT_COL_MULTI_TERM_NAME, "software engineering");
} else {
- row.putField(SKILLS_TEXT_COL_NAME, skills[counter]);
- row.putField(SKILLS_TEXT_COL_DICT_NAME, skills[counter]);
- row.putField(SKILLS_COPY_TEXT_COL_NAME, skills[counter]);
+ row.putValue(SKILLS_TEXT_COL_NAME, skills[counter]);
+ row.putValue(SKILLS_TEXT_COL_DICT_NAME, skills[counter]);
+ row.putValue(SKILLS_TEXT_COL_MULTI_TERM_NAME, skills[counter]);
+ row.putValue(SKILLS_TEXT_NO_RAW_NAME, skills[counter]);
}
rows.add(row);
counter++;
@@ -568,6 +577,17 @@ public class TextSearchQueriesTest extends BaseQueriesTest { private static fin
query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" AND Java AND C++') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
+ // test for the text index column configured to not store the raw value in the forward index
+ // the text index itself is still fully built, so TEXT_MATCH returns the same documents
+ query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_2, '\"distributed systems\" AND Java AND C++') LIMIT 50000";
+ testTextSearchAggregationQueryHelper(query, expected.size());
+ // configurable default value is used
+ query = "SELECT INT_COL, SKILLS_TEXT_COL_2 FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_2, '\"distributed systems\" AND Java AND C++') LIMIT 50000";
+ expected = new ArrayList<>();
+ expected.add(new Serializable[]{1005, "ILoveCoding"});
+ expected.add(new Serializable[]{1017, "ILoveCoding"});
+ testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
+
// TEST 22: composite phrase and term query using boolean operator OR
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain ANY of the following skills:
// phrase "distributed systems" as is, term 'Java', term 'C++'. Note: OR operator is implicit when we don't specify
diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
index aecc25a..c9e1eb6 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
@@ -44,6 +44,9 @@ public class FieldConfig extends BaseJsonConfig {
// the cache improves performance of repeatable queries
public static String TEXT_INDEX_ENABLE_QUERY_CACHE = "enableQueryCacheForTextIndex";
public static String TEXT_INDEX_USE_AND_FOR_MULTI_TERM_QUERIES = "useANDForMultiTermTextIndexQueries";
+ public static String TEXT_INDEX_NO_RAW_DATA = "noRawDataForTextIndex";
+ public static String TEXT_INDEX_RAW_VALUE = "rawValueForTextIndex";
+ public static String TEXT_INDEX_DEFAULT_RAW_VALUE = "n";
@JsonCreator
public FieldConfig(@JsonProperty(value = "name", required = true) String name,
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org