You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by si...@apache.org on 2020/11/23 19:22:12 UTC

[incubator-pinot] branch master updated: Support for text index without raw (#6284)

This is an automated email from the ASF dual-hosted git repository.

siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new fe9d3c7   Support for text index without raw (#6284)
fe9d3c7 is described below

commit fe9d3c79d2da709e6ce15929df539f91fe960c2e
Author: Sidd <si...@gmail.com>
AuthorDate: Mon Nov 23 11:21:54 2020 -0800

     Support for text index without raw (#6284)
    
    * WIP - Support for text index without raw
    
    * review comments
    
    Co-authored-by: Siddharth Teotia <st...@steotia-mn1.linkedin.biz>
---
 .../generator/SegmentGeneratorConfig.java          |  6 +++
 .../creator/impl/SegmentColumnarIndexCreator.java  | 32 +++++++++++++--
 .../pinot/queries/TextSearchQueriesTest.java       | 48 +++++++++++++++-------
 .../apache/pinot/spi/config/table/FieldConfig.java |  3 ++
 4 files changed, 71 insertions(+), 18 deletions(-)

diff --git a/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java b/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java
index 5bc0be5..1d3324f 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java
@@ -18,6 +18,7 @@
  */
 package org.apache.pinot.core.indexsegment.generator;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 import java.io.File;
 import java.util.ArrayList;
@@ -299,6 +300,11 @@ public class SegmentGeneratorConfig {
     }
   }
 
+  @VisibleForTesting
+  public void setColumnProperties(Map<String, Map<String, String>> columnProperties) {
+    _columnProperties = columnProperties; // stores the caller's map by reference (no copy)
+  }
+
   public void setColumnSortOrder(List<String> sortOrder) {
     Preconditions.checkNotNull(sortOrder);
     _columnSortOrder.addAll(sortOrder);
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java
index 0b2845e..6aca94e 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java
@@ -95,6 +95,7 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
   private int totalDocs;
   private int docIdCounter;
   private boolean _nullHandlingEnabled;
+  private Map<String, Map<String, String>> _columnProperties;
 
   private final Set<String> _textIndexColumns = new HashSet<>();
 
@@ -105,6 +106,7 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
     docIdCounter = 0;
     config = segmentCreationSpec;
     this.indexCreationInfoMap = indexCreationInfoMap;
+    _columnProperties = segmentCreationSpec.getColumnProperties();
 
     // Check that the output directory does not exist
     Preconditions.checkState(!outDir.exists(), "Segment output directory: %s already exists", outDir);
@@ -322,6 +324,10 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
 
       if (isSingleValue) {
         // SV column
+        // text-index enabled SV column
+        if (_textIndexColumns.contains(columnName)) {
+          _textIndexCreatorMap.get(columnName).add((String) columnValueToIndex);
+        }
         if (dictionaryCreator != null) {
           // dictionary encoded SV column
           // get dictID from dictionary
@@ -337,6 +343,14 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
         } else {
           // non-dictionary encoded SV column
           // store the docId -> raw value mapping in forward index
+          if (_textIndexColumns.contains(columnName) && !shouldStoreRawValueForTextIndex(columnName)) {
+            // for text index on raw columns, check the config to determine if actual raw value should
+            // be stored or not
+            columnValueToIndex = _columnProperties.get(columnName).get(FieldConfig.TEXT_INDEX_RAW_VALUE);
+            if (columnValueToIndex == null) {
+              columnValueToIndex = FieldConfig.TEXT_INDEX_DEFAULT_RAW_VALUE;
+            }
+          }
           switch (forwardIndexCreator.getValueType()) {
             case INT:
               forwardIndexCreator.putInt((int) columnValueToIndex);
@@ -360,10 +374,6 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
               throw new IllegalStateException();
           }
         }
-        // text-index enabled SV column
-        if (_textIndexColumns.contains(columnName)) {
-          _textIndexCreatorMap.get(columnName).add((String) columnValueToIndex);
-        }
       } else {
         // MV column (always dictionary encoded)
         int[] dictIds = dictionaryCreator.indexOfMV(columnValueToIndex);
@@ -384,6 +394,20 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
     docIdCounter++;
   }
 
+  private boolean shouldStoreRawValueForTextIndex(String column) {
+    // Raw values are stored unless the column's properties explicitly opt out.
+    if (_columnProperties == null) {
+      return true;
+    }
+    Map<String, String> columnProps = _columnProperties.get(column);
+    if (columnProps == null) {
+      return true;
+    }
+    // When TEXT_INDEX_NO_RAW_DATA is "true" the caller writes a dummy value instead of the raw one.
+    // Boolean.parseBoolean(null) is false, so a missing key keeps the raw value.
+    return !Boolean.parseBoolean(columnProps.get(FieldConfig.TEXT_INDEX_NO_RAW_DATA));
+  }
+
   @Override
   public void setSegmentName(String segmentName) {
     this.segmentName = segmentName;
diff --git a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
index 64610f9..4caa8e0 100644
--- a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
+++ b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
@@ -93,10 +93,11 @@ public class TextSearchQueriesTest extends BaseQueriesTest {  private static fin
   private static final String QUERY_LOG_TEXT_COL_NAME = "QUERY_LOG_TEXT_COL";
   private static final String SKILLS_TEXT_COL_NAME = "SKILLS_TEXT_COL";
   private static final String SKILLS_TEXT_COL_DICT_NAME = "SKILLS_TEXT_COL_DICT";
-  private static final String SKILLS_COPY_TEXT_COL_NAME = "SKILLS_TEXT_COL_1";
+  private static final String SKILLS_TEXT_COL_MULTI_TERM_NAME = "SKILLS_TEXT_COL_1";
+  private static final String SKILLS_TEXT_NO_RAW_NAME = "SKILLS_TEXT_COL_2";
   private static final String INT_COL_NAME = "INT_COL";
-  private static final List<String> RAW_TEXT_INDEX_COLUMNS =
-      Arrays.asList(QUERY_LOG_TEXT_COL_NAME, SKILLS_TEXT_COL_NAME, SKILLS_COPY_TEXT_COL_NAME);
+  private static final List<String> RAW_TEXT_INDEX_COLUMNS = Arrays
+      .asList(QUERY_LOG_TEXT_COL_NAME, SKILLS_TEXT_COL_NAME, SKILLS_TEXT_COL_MULTI_TERM_NAME, SKILLS_TEXT_NO_RAW_NAME);
   private static final List<String> DICT_TEXT_INDEX_COLUMNS = Arrays.asList(SKILLS_TEXT_COL_DICT_NAME);
   private static final int INT_BASE_VALUE = 1000;
 
@@ -135,7 +136,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {  private static fin
     Map<String, Map<String, String>> columnProperties = new HashMap<>();
     Map<String, String> props = new HashMap<>();
     props.put(FieldConfig.TEXT_INDEX_USE_AND_FOR_MULTI_TERM_QUERIES, "true");
-    columnProperties.put(SKILLS_COPY_TEXT_COL_NAME, props);
+    columnProperties.put(SKILLS_TEXT_COL_MULTI_TERM_NAME, props);
     indexLoadingConfig.setColumnProperties(columnProperties);
     ImmutableSegment immutableSegment =
         ImmutableSegmentLoader.load(new File(INDEX_DIR, SEGMENT_NAME), indexLoadingConfig);
@@ -169,13 +170,19 @@ public class TextSearchQueriesTest extends BaseQueriesTest {  private static fin
         .addSingleValueDimension(QUERY_LOG_TEXT_COL_NAME, FieldSpec.DataType.STRING)
         .addSingleValueDimension(SKILLS_TEXT_COL_NAME, FieldSpec.DataType.STRING)
         .addSingleValueDimension(SKILLS_TEXT_COL_DICT_NAME, FieldSpec.DataType.STRING)
-        .addSingleValueDimension(SKILLS_COPY_TEXT_COL_NAME, FieldSpec.DataType.STRING)
+        .addSingleValueDimension(SKILLS_TEXT_COL_MULTI_TERM_NAME, FieldSpec.DataType.STRING)
+        .addSingleValueDimension(SKILLS_TEXT_NO_RAW_NAME, FieldSpec.DataType.STRING)
         .addMetric(INT_COL_NAME, FieldSpec.DataType.INT).build();
     SegmentGeneratorConfig config = new SegmentGeneratorConfig(tableConfig, schema);
     config.setOutDir(INDEX_DIR.getPath());
     config.setTableName(TABLE_NAME);
     config.setSegmentName(SEGMENT_NAME);
-
+    Map<String, Map<String, String>> columnProperties = new HashMap<>();
+    Map<String, String> props = new HashMap<>();
+    props.put(FieldConfig.TEXT_INDEX_NO_RAW_DATA, "true");
+    props.put(FieldConfig.TEXT_INDEX_RAW_VALUE, "ILoveCoding");
+    columnProperties.put(SKILLS_TEXT_NO_RAW_NAME, props);
+    config.setColumnProperties(columnProperties);
     SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
     try (RecordReader recordReader = new GenericRowRecordReader(rows)) {
       driver.init(config, recordReader);
@@ -209,16 +216,18 @@ public class TextSearchQueriesTest extends BaseQueriesTest {  private static fin
       String line;
       while ((line = reader.readLine()) != null) {
         GenericRow row = new GenericRow();
-        row.putField(INT_COL_NAME, INT_BASE_VALUE + counter);
-        row.putField(QUERY_LOG_TEXT_COL_NAME, line);
+        row.putValue(INT_COL_NAME, INT_BASE_VALUE + counter);
+        row.putValue(QUERY_LOG_TEXT_COL_NAME, line);
         if (counter >= skillCount) {
-          row.putField(SKILLS_TEXT_COL_NAME, "software engineering");
-          row.putField(SKILLS_TEXT_COL_DICT_NAME, "software engineering");
-          row.putField(SKILLS_COPY_TEXT_COL_NAME, "software engineering");
+          row.putValue(SKILLS_TEXT_COL_NAME, "software engineering");
+          row.putValue(SKILLS_TEXT_COL_DICT_NAME, "software engineering");
+          row.putValue(SKILLS_TEXT_COL_MULTI_TERM_NAME, "software engineering");
+          row.putValue(SKILLS_TEXT_NO_RAW_NAME, "software engineering");
         } else {
-          row.putField(SKILLS_TEXT_COL_NAME, skills[counter]);
-          row.putField(SKILLS_TEXT_COL_DICT_NAME, skills[counter]);
-          row.putField(SKILLS_COPY_TEXT_COL_NAME, skills[counter]);
+          row.putValue(SKILLS_TEXT_COL_NAME, skills[counter]);
+          row.putValue(SKILLS_TEXT_COL_DICT_NAME, skills[counter]);
+          row.putValue(SKILLS_TEXT_COL_MULTI_TERM_NAME, skills[counter]);
+          row.putValue(SKILLS_TEXT_NO_RAW_NAME, skills[counter]);
         }
         rows.add(row);
         counter++;
@@ -568,6 +577,17 @@ public class TextSearchQueriesTest extends BaseQueriesTest {  private static fin
     query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" AND Java AND C++') LIMIT 50000";
     testTextSearchAggregationQueryHelper(query, expected.size());
 
+    // test for the text index configured to not store the raw value
+    // full index is stored
+    query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_2, '\"distributed systems\" AND Java AND C++') LIMIT 50000";
+    testTextSearchAggregationQueryHelper(query, expected.size());
+    // configurable default value is used
+    query = "SELECT INT_COL, SKILLS_TEXT_COL_2 FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_2, '\"distributed systems\" AND Java AND C++') LIMIT 50000";
+    expected = new ArrayList<>();
+    expected.add(new Serializable[]{1005, "ILoveCoding"});
+    expected.add(new Serializable[]{1017, "ILoveCoding"});
+    testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
+
     // TEST 22: composite phrase and term query using boolean operator OR
     // Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain ANY of the following skills:
     // phrase "distributed systems" as is, term 'Java', term 'C++'. Note: OR operator is implicit when we don't specify
diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
index aecc25a..c9e1eb6 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
@@ -44,6 +44,9 @@ public class FieldConfig extends BaseJsonConfig {
   // the cache improves performance of repeatable queries
   public static String TEXT_INDEX_ENABLE_QUERY_CACHE = "enableQueryCacheForTextIndex";
   public static String TEXT_INDEX_USE_AND_FOR_MULTI_TERM_QUERIES = "useANDForMultiTermTextIndexQueries";
+  public static String TEXT_INDEX_NO_RAW_DATA = "noRawDataForTextIndex";
+  public static String TEXT_INDEX_RAW_VALUE = "rawValueForTextIndex";
+  public static String TEXT_INDEX_DEFAULT_RAW_VALUE = "n";
 
   @JsonCreator
   public FieldConfig(@JsonProperty(value = "name", required = true) String name,


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org