You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by su...@apache.org on 2019/01/02 02:04:39 UTC

[incubator-pinot] branch pinot-text-search updated: Initial changes to support TEXT datatype and creation of lucene-search index for the same

This is an automated email from the ASF dual-hosted git repository.

sunithabeeram pushed a commit to branch pinot-text-search
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git


The following commit(s) were added to refs/heads/pinot-text-search by this push:
     new d92dfea  Initial changes to support TEXT datatype and creation of lucene-search index for the same
d92dfea is described below

commit d92dfea8c3323a5976479112dfe1c4676efa69dd
Author: Sunitha Beeram <sb...@linkedin.com>
AuthorDate: Tue Jan 1 18:04:07 2019 -0800

    Initial changes to support TEXT datatype and creation of lucene-search index for the same
---
 .../com/linkedin/pinot/common/data/FieldSpec.java  |   6 +-
 .../linkedin/pinot/common/data/FieldSpecTest.java  |   9 ++
 pinot-core/pom.xml                                 |   5 +
 .../data/readers/PinotSegmentColumnReader.java     |   1 +
 .../core/data/recordtransformer/PinotDataType.java |  35 +++-
 .../generator/SegmentGeneratorConfig.java          |  10 ++
 ...va => DictionaryBasedInvertedIndexCreator.java} |  15 +-
 .../creator/DocBasedInvertedIndexCreator.java      |  27 ++++
 .../core/segment/creator/InvertedIndexCreator.java |  44 -----
 .../creator/impl/SegmentColumnarIndexCreator.java  |  80 ++++++---
 .../creator/impl/SegmentDictionaryCreator.java     |   6 +-
 .../inv/OffHeapBitmapInvertedIndexCreator.java     |   5 +-
 .../impl/inv/OnHeapBitmapInvertedIndexCreator.java |   5 +-
 .../stats/SegmentPreIndexStatsCollectorImpl.java   |   1 +
 .../impl/textsearch/LuceneIndexCreator.java        |  85 ++++++++++
 .../impl/textsearch/TextSearchIndexConfig.java     |  54 +++++++
 .../textsearch/TextSearchIndexCreatorFactory.java  |  32 ++++
 .../impl/textsearch/TextSearchIndexType.java       |  50 ++++++
 .../index/column/PhysicalColumnIndexContainer.java |  11 +-
 .../com/linkedin/pinot/core/util/AvroUtils.java    |   1 +
 .../SegmentGenerationWithBytesTypeTest.java        |   4 +-
 ...java => SegmentGenerationWithTextTypeTest.java} | 178 ++++++++-------------
 22 files changed, 463 insertions(+), 201 deletions(-)

diff --git a/pinot-common/src/main/java/com/linkedin/pinot/common/data/FieldSpec.java b/pinot-common/src/main/java/com/linkedin/pinot/common/data/FieldSpec.java
index f80d356..87edc53 100644
--- a/pinot-common/src/main/java/com/linkedin/pinot/common/data/FieldSpec.java
+++ b/pinot-common/src/main/java/com/linkedin/pinot/common/data/FieldSpec.java
@@ -194,6 +194,7 @@ public abstract class FieldSpec implements Comparable<FieldSpec>, ConfigNodeLife
         case DOUBLE:
           return Double.valueOf(stringDefaultNullValue);
         case STRING:
+        case TEXT:
           return stringDefaultNullValue;
         case BYTES:
           try {
@@ -217,6 +218,7 @@ public abstract class FieldSpec implements Comparable<FieldSpec>, ConfigNodeLife
             case DOUBLE:
               return DEFAULT_METRIC_NULL_VALUE_OF_DOUBLE;
             case STRING:
+            case TEXT:
               return DEFAULT_METRIC_NULL_VALUE_OF_STRING;
             case BYTES:
               return DEFAULT_METRIC_NULL_VALUE_OF_BYTES;
@@ -237,6 +239,7 @@ public abstract class FieldSpec implements Comparable<FieldSpec>, ConfigNodeLife
             case DOUBLE:
               return DEFAULT_DIMENSION_NULL_VALUE_OF_DOUBLE;
             case STRING:
+            case TEXT:
               return DEFAULT_DIMENSION_NULL_VALUE_OF_STRING;
             case BYTES:
               return DEFAULT_DIMENSION_NULL_VALUE_OF_BYTES;
@@ -389,7 +392,8 @@ public abstract class FieldSpec implements Comparable<FieldSpec>, ConfigNodeLife
     DOUBLE,
     BOOLEAN,  // Stored as STRING
     STRING,
-    BYTES;
+    BYTES,
+    TEXT; // Stored as STRING? NOTE(review): FieldSpecTest asserts TEXT.getStoredType() == TEXT -- confirm
 
     /**
      * Returns the data type stored in Pinot.
diff --git a/pinot-common/src/test/java/com/linkedin/pinot/common/data/FieldSpecTest.java b/pinot-common/src/test/java/com/linkedin/pinot/common/data/FieldSpecTest.java
index a2b9f70..70d622a 100644
--- a/pinot-common/src/test/java/com/linkedin/pinot/common/data/FieldSpecTest.java
+++ b/pinot-common/src/test/java/com/linkedin/pinot/common/data/FieldSpecTest.java
@@ -50,6 +50,7 @@ public class FieldSpecTest {
     Assert.assertEquals(BOOLEAN.getStoredType(), STRING);
     Assert.assertEquals(STRING.getStoredType(), STRING);
     Assert.assertEquals(BYTES.getStoredType(), BYTES);
+    Assert.assertEquals(TEXT.getStoredType(), TEXT);
 
     Assert.assertEquals(INT.size(), Integer.BYTES);
     Assert.assertEquals(LONG.size(), Long.BYTES);
@@ -288,6 +289,14 @@ public class FieldSpecTest {
     Assert.assertEquals(dimensionFieldSpec1, dimensionFieldSpec2, ERROR_MESSAGE);
     Assert.assertEquals(dimensionFieldSpec1.getDefaultNullValue(), "false", ERROR_MESSAGE);
 
+    // Single-value Text type dimension field with default null value.
+    String[] dimensionFields2 = {"\"name\":\"dimension\"", "\"dataType\":\"TEXT\"", "\"defaultNullValue\":false"};
+    DimensionFieldSpec dimensionFieldSpec3 =
+        MAPPER.readValue(getRandomOrderJsonString(dimensionFields2), DimensionFieldSpec.class);
+    DimensionFieldSpec dimensionFieldSpec4 = new DimensionFieldSpec("dimension", TEXT, true, false);
+    Assert.assertEquals(dimensionFieldSpec3, dimensionFieldSpec4, ERROR_MESSAGE);
+    Assert.assertEquals(dimensionFieldSpec3.getDefaultNullValue(), "false", ERROR_MESSAGE);
+
     // Multi-value dimension field with default null value.
     dimensionFields =
         new String[]{"\"name\":\"dimension\"", "\"dataType\":\"STRING\"", "\"singleValueField\":false", "\"defaultNullValue\":\"default\""};
diff --git a/pinot-core/pom.xml b/pinot-core/pom.xml
index fa608f1..de451b5 100644
--- a/pinot-core/pom.xml
+++ b/pinot-core/pom.xml
@@ -146,6 +146,11 @@
       <artifactId>fastutil</artifactId>
     </dependency>
     <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-core</artifactId>
+      <version>7.6.0</version>
+    </dependency>
+    <dependency>
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-api</artifactId>
     </dependency>
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/data/readers/PinotSegmentColumnReader.java b/pinot-core/src/main/java/com/linkedin/pinot/core/data/readers/PinotSegmentColumnReader.java
index c126ff1..163fcb3 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/data/readers/PinotSegmentColumnReader.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/data/readers/PinotSegmentColumnReader.java
@@ -117,6 +117,7 @@ public class PinotSegmentColumnReader {
       case DOUBLE:
         return readDouble(docId);
       case STRING:
+      case TEXT:
         return readString(docId);
       case BYTES:
         return readBytes(docId);
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/data/recordtransformer/PinotDataType.java b/pinot-core/src/main/java/com/linkedin/pinot/core/data/recordtransformer/PinotDataType.java
index e9b233b..9eb5f5f 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/data/recordtransformer/PinotDataType.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/data/recordtransformer/PinotDataType.java
@@ -258,6 +258,33 @@ public enum PinotDataType {
     }
   },
 
+  TEXT {
+    @Override
+    public Integer toInteger(Object value) {
+      return Integer.parseInt((String) value);
+    }
+
+    @Override
+    public Long toLong(Object value) {
+      return Long.parseLong((String) value);
+    }
+
+    @Override
+    public Float toFloat(Object value) {
+      return Float.parseFloat((String) value);
+    }
+
+    @Override
+    public Double toDouble(Object value) {
+      return Double.parseDouble((String) value);
+    }
+
+    @Override
+    public String convert(Object value, PinotDataType sourceType) {
+      return sourceType.toString(value);
+    }
+  },
+
   OBJECT {
     @Override
     public Integer toInteger(Object value) {
@@ -428,7 +455,7 @@ public enum PinotDataType {
   }
 
   public Object convert(Object value, PinotDataType sourceType) {
-    throw new UnsupportedOperationException("Cannot convert value: " + value + " form: " + sourceType + " to: " + this);
+    throw new UnsupportedOperationException("Cannot convert value: " + value + " from: " + sourceType + " to: " + this);
   }
 
   public boolean isSingleValue() {
@@ -473,6 +500,12 @@ public enum PinotDataType {
         return fieldSpec.isSingleValueField() ? PinotDataType.DOUBLE : PinotDataType.DOUBLE_ARRAY;
       case STRING:
         return fieldSpec.isSingleValueField() ? PinotDataType.STRING : PinotDataType.STRING_ARRAY;
+      case TEXT:
+        if (fieldSpec.isSingleValueField()) {
+          return PinotDataType.TEXT;
+        } else {
+          throw new UnsupportedOperationException("Unsupported multi-valued type: TEXT");
+        }
       case BYTES:
         if (fieldSpec.isSingleValueField()) {
           return PinotDataType.BYTES;
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java b/pinot-core/src/main/java/com/linkedin/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java
index 0d9f670..73e144b 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java
@@ -30,6 +30,7 @@ import com.linkedin.pinot.core.data.readers.CSVRecordReaderConfig;
 import com.linkedin.pinot.core.data.readers.FileFormat;
 import com.linkedin.pinot.core.data.readers.RecordReaderConfig;
 import com.linkedin.pinot.core.io.compression.ChunkCompressorFactory;
+import com.linkedin.pinot.core.segment.creator.impl.textsearch.TextSearchIndexConfig;
 import com.linkedin.pinot.core.segment.name.DefaultSegmentNameGenerator;
 import com.linkedin.pinot.core.segment.name.SegmentNameGenerator;
 import com.linkedin.pinot.core.startree.v2.builder.StarTreeV2BuilderConfig;
@@ -105,6 +106,7 @@ public class SegmentGeneratorConfig {
   private String _simpleDateFormat = null;
   // Use on-heap or off-heap memory to generate index (currently only affect inverted index and star-tree v2)
   private boolean _onHeap = false;
+  private Map<String, TextSearchIndexConfig> _textSearchIndexConfigs = new HashMap<>();
 
   public SegmentGeneratorConfig() {
   }
@@ -570,6 +572,14 @@ public class SegmentGeneratorConfig {
     _rawIndexCompressionType.putAll(rawIndexCompressionType);
   }
 
+  public void setTextSearchIndexConfig(TextSearchIndexConfig config) {
+    _textSearchIndexConfigs.put(config.getColumnName(),config);
+  }
+
+  public Map<String, TextSearchIndexConfig> getTextSearchIndexConfigs() {
+    return _textSearchIndexConfigs;
+  }
+
   @JsonIgnore
   public String getMetrics() {
     return getQualifyingFields(FieldType.METRIC, true);
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/InvertedIndexCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/DictionaryBasedInvertedIndexCreator.java
similarity index 88%
copy from pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/InvertedIndexCreator.java
copy to pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/DictionaryBasedInvertedIndexCreator.java
index 4af054d..c933efd 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/InvertedIndexCreator.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/DictionaryBasedInvertedIndexCreator.java
@@ -15,10 +15,6 @@
  */
 package com.linkedin.pinot.core.segment.creator;
 
-import java.io.Closeable;
-import java.io.IOException;
-
-
 /**
  * Currently only support RoaringBitmap inverted index.
  * <pre>
@@ -37,10 +33,10 @@ import java.io.IOException;
  * |-------------------------------------------------------------------------|
  * </pre>
  *
- * <p>To create an inverted index:
+ * <p>To create a dictionary based inverted index:
  * <ul>
  *   <li>
- *     Construct an instance of <code>InvertedIndexCreator</code>
+ *     Construct an instance of <code>DictionaryBasedInvertedIndexCreator</code>
  *   </li>
  *   <li>
  *     Call add() for each docId in sequence starting with 0 to add dictId (dictIds for multi-valued column) into the
@@ -51,7 +47,7 @@ import java.io.IOException;
  *   </li>
  * </ul>
  */
-public interface InvertedIndexCreator extends Closeable {
+public interface DictionaryBasedInvertedIndexCreator extends InvertedIndexCreator {
 
   /**
    * For single-valued column, adds the dictionary Id for the next document.
@@ -62,9 +58,4 @@ public interface InvertedIndexCreator extends Closeable {
    * For multi-valued column, adds the dictionary Ids for the next document.
    */
   void add(int[] dictIds, int length);
-
-  /**
-   * Seals the index and flushes it to disk.
-   */
-  void seal() throws IOException;
 }
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/DocBasedInvertedIndexCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/DocBasedInvertedIndexCreator.java
new file mode 100644
index 0000000..884d446
--- /dev/null
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/DocBasedInvertedIndexCreator.java
@@ -0,0 +1,27 @@
+/**
+ * Copyright (C) 2014-2018 LinkedIn Corp. (pinot-core@linkedin.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.linkedin.pinot.core.segment.creator;
+
+/**
+ * An {@link InvertedIndexCreator} that is fed document objects (e.g. text field values) instead of dictionary ids.
+ */
+public interface DocBasedInvertedIndexCreator extends InvertedIndexCreator {
+
+  /**
+   * Adds a document/text field value to the index.
+   */
+  void add(Object doc);
+}
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/InvertedIndexCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/InvertedIndexCreator.java
index 4af054d..9943583 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/InvertedIndexCreator.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/InvertedIndexCreator.java
@@ -18,51 +18,7 @@ package com.linkedin.pinot.core.segment.creator;
 import java.io.Closeable;
 import java.io.IOException;
 
-
-/**
- * Currently only support RoaringBitmap inverted index.
- * <pre>
- * Layout for RoaringBitmap inverted index:
- * |-------------------------------------------------------------------------|
- * |                    Start offset of 1st bitmap                           |
- * |    End offset of 1st bitmap (exclusive) / Start offset of 2nd bitmap    |
- * |                                   ...                                   |
- * | End offset of 2nd last bitmap (exclusive) / Start offset of last bitmap |
- * |                  End offset of last bitmap (exclusive)                  |
- * |-------------------------------------------------------------------------|
- * |                           Data for 1st bitmap                           |
- * |                           Data for 2nd bitmap                           |
- * |                                   ...                                   |
- * |                           Data for last bitmap                          |
- * |-------------------------------------------------------------------------|
- * </pre>
- *
- * <p>To create an inverted index:
- * <ul>
- *   <li>
- *     Construct an instance of <code>InvertedIndexCreator</code>
- *   </li>
- *   <li>
- *     Call add() for each docId in sequence starting with 0 to add dictId (dictIds for multi-valued column) into the
- *     creator
- *   </li>
- *   <li>
- *     Call seal() after all dictIds have been added
- *   </li>
- * </ul>
- */
 public interface InvertedIndexCreator extends Closeable {
-
-  /**
-   * For single-valued column, adds the dictionary Id for the next document.
-   */
-  void add(int dictId);
-
-  /**
-   * For multi-valued column, adds the dictionary Ids for the next document.
-   */
-  void add(int[] dictIds, int length);
-
   /**
    * Seals the index and flushes it to disk.
    */
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java
index 93b74b8..44cf3a1 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java
@@ -28,6 +28,8 @@ import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
 import com.linkedin.pinot.core.io.compression.ChunkCompressorFactory;
 import com.linkedin.pinot.core.io.util.PinotDataBitSet;
 import com.linkedin.pinot.core.segment.creator.ColumnIndexCreationInfo;
+import com.linkedin.pinot.core.segment.creator.DictionaryBasedInvertedIndexCreator;
+import com.linkedin.pinot.core.segment.creator.DocBasedInvertedIndexCreator;
 import com.linkedin.pinot.core.segment.creator.ForwardIndexCreator;
 import com.linkedin.pinot.core.segment.creator.InvertedIndexCreator;
 import com.linkedin.pinot.core.segment.creator.MultiValueForwardIndexCreator;
@@ -42,6 +44,8 @@ import com.linkedin.pinot.core.segment.creator.impl.fwd.SingleValueUnsortedForwa
 import com.linkedin.pinot.core.segment.creator.impl.fwd.SingleValueVarByteRawIndexCreator;
 import com.linkedin.pinot.core.segment.creator.impl.inv.OffHeapBitmapInvertedIndexCreator;
 import com.linkedin.pinot.core.segment.creator.impl.inv.OnHeapBitmapInvertedIndexCreator;
+import com.linkedin.pinot.core.segment.creator.impl.textsearch.TextSearchIndexConfig;
+import com.linkedin.pinot.core.segment.creator.impl.textsearch.TextSearchIndexCreatorFactory;
 import com.linkedin.pinot.startree.hll.HllConfig;
 import java.io.File;
 import java.io.IOException;
@@ -75,7 +79,8 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
   private Map<String, ColumnIndexCreationInfo> indexCreationInfoMap;
   private Map<String, SegmentDictionaryCreator> _dictionaryCreatorMap = new HashMap<>();
   private Map<String, ForwardIndexCreator> _forwardIndexCreatorMap = new HashMap<>();
-  private Map<String, InvertedIndexCreator> _invertedIndexCreatorMap = new HashMap<>();
+  private Map<String, DictionaryBasedInvertedIndexCreator> _invertedIndexCreatorMap = new HashMap<>();
+  private Map<String, DocBasedInvertedIndexCreator> _docBasedInvertedIndexCreatorMap = new HashMap<>();
   private String segmentName;
   private Schema schema;
   private File _indexDir;
@@ -172,16 +177,34 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
         // TODO: add support to multi-value column and inverted index
         Preconditions.checkState(fieldSpec.isSingleValueField(), "Cannot create raw index for multi-value column: %s",
             columnName);
-        Preconditions.checkState(!invertedIndexColumns.contains(columnName),
+
+        boolean createInvertedIndex = invertedIndexColumns.contains(columnName);
+        FieldSpec.DataType type = fieldSpec.getDataType();
+        // TEXT columns can have inverted indices
+        Preconditions.checkState(type == FieldSpec.DataType.TEXT ||
+                !createInvertedIndex,
             "Cannot create inverted index for raw index column: %s", columnName);
 
         ChunkCompressorFactory.CompressionType compressionType =
             getColumnCompressionType(segmentCreationSpec, fieldSpec);
 
-        // Initialize forward index creator
-        _forwardIndexCreatorMap.put(columnName,
-            getRawIndexCreatorForColumn(_indexDir, compressionType, columnName, fieldSpec.getDataType(), totalDocs,
-                indexCreationInfo.getLengthOfLongestEntry()));
+        boolean createFwdIndex = true;
+        if(createInvertedIndex && type == FieldSpec.DataType.TEXT) {
+          TextSearchIndexConfig config = segmentCreationSpec.getTextSearchIndexConfigs().get(columnName);
+          if (config == null) {
+            config = TextSearchIndexConfig.getDefaultConfig(columnName, _indexDir);
+          }
+          createFwdIndex = config.shouldStore();
+          _docBasedInvertedIndexCreatorMap.put(columnName, TextSearchIndexCreatorFactory.createSearchIndexer(config));
+        }
+
+        if (createFwdIndex) {
+          // Initialize forward index creator
+          _forwardIndexCreatorMap.put(columnName,
+              getRawIndexCreatorForColumn(_indexDir, compressionType, columnName, fieldSpec.getDataType(), totalDocs,
+                  indexCreationInfo.getLengthOfLongestEntry()));
+
+        }
       }
     }
   }
@@ -254,23 +277,31 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
         throw new RuntimeException("Null value for column:" + columnName);
       }
 
-      SegmentDictionaryCreator dictionaryCreator = _dictionaryCreatorMap.get(columnName);
-      if (schema.getFieldSpecFor(columnName).isSingleValueField()) {
-        if (dictionaryCreator != null) {
-          int dictId = dictionaryCreator.indexOfSV(columnValueToIndex);
-          ((SingleValueForwardIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter, dictId);
-          if (_invertedIndexCreatorMap.containsKey(columnName)) {
-            _invertedIndexCreatorMap.get(columnName).add(dictId);
+      if (_docBasedInvertedIndexCreatorMap.containsKey(columnName)) {
+        // inverted index
+        _docBasedInvertedIndexCreatorMap.get(columnName).add(columnValueToIndex);
+        // forward index. NOTE(review): none is registered when config.shouldStore() is false -> get() is null (NPE)
+        ((SingleValueRawIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter,
+            columnValueToIndex);
+      } else {
+        SegmentDictionaryCreator dictionaryCreator = _dictionaryCreatorMap.get(columnName);
+        if (schema.getFieldSpecFor(columnName).isSingleValueField()) {
+          if (dictionaryCreator != null) {
+            int dictId = dictionaryCreator.indexOfSV(columnValueToIndex);
+            ((SingleValueForwardIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter, dictId);
+            if (_invertedIndexCreatorMap.containsKey(columnName)) {
+              _invertedIndexCreatorMap.get(columnName).add(dictId);
+            }
+          } else {
+            ((SingleValueRawIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter,
+                columnValueToIndex);
           }
         } else {
-          ((SingleValueRawIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter,
-              columnValueToIndex);
-        }
-      } else {
-        int[] dictIds = dictionaryCreator.indexOfMV(columnValueToIndex);
-        ((MultiValueForwardIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter, dictIds);
-        if (_invertedIndexCreatorMap.containsKey(columnName)) {
-          _invertedIndexCreatorMap.get(columnName).add(dictIds, dictIds.length);
+          int[] dictIds = dictionaryCreator.indexOfMV(columnValueToIndex);
+          ((MultiValueForwardIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter, dictIds);
+          if (_invertedIndexCreatorMap.containsKey(columnName)) {
+            _invertedIndexCreatorMap.get(columnName).add(dictIds, dictIds.length);
+          }
         }
       }
     }
@@ -287,6 +318,9 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
     for (InvertedIndexCreator invertedIndexCreator : _invertedIndexCreatorMap.values()) {
       invertedIndexCreator.seal();
     }
+    for (DocBasedInvertedIndexCreator docIndexCreator : _docBasedInvertedIndexCreatorMap.values()) {
+      docIndexCreator.seal();
+    }
     writeMetadata();
   }
 
@@ -512,6 +546,7 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
 
       case STRING:
       case BYTES:
+      case TEXT:
         indexCreator =
             new SingleValueVarByteRawIndexCreator(file, compressionType, column, totalDocs, lengthOfLongestEntry);
         break;
@@ -534,5 +569,8 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
     for (InvertedIndexCreator invertedIndexCreator : _invertedIndexCreatorMap.values()) {
       invertedIndexCreator.close();
     }
+    for (DocBasedInvertedIndexCreator docIndexCreator : _docBasedInvertedIndexCreatorMap.values()) {
+      docIndexCreator.close();
+    }
   }
 }
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentDictionaryCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentDictionaryCreator.java
index b2d0c5b..9e4950c 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentDictionaryCreator.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/SegmentDictionaryCreator.java
@@ -137,6 +137,7 @@ public class SegmentDictionaryCreator implements Closeable {
             _fieldSpec.getName(), numValues, sortedDoubles[0], sortedDoubles[numValues - 1]);
         return;
       case STRING:
+      case TEXT:
         String[] sortedStrings = (String[]) _sortedValues;
         numValues = sortedStrings.length;
         Preconditions.checkState(numValues > 0);
@@ -162,8 +163,9 @@ public class SegmentDictionaryCreator implements Closeable {
           }
         }
         LOGGER.info(
-            "Created dictionary for STRING column: {} with cardinality: {}, max length in bytes: {}, range: {} to {}",
-            _fieldSpec.getName(), numValues, _numBytesPerEntry, sortedStrings[0], sortedStrings[numValues - 1]);
+            "Created dictionary for {} column: {} with cardinality: {}, max length in bytes: {}, range: {} to {}",
+            _fieldSpec.getDataType(),_fieldSpec.getName(), numValues, _numBytesPerEntry, sortedStrings[0],
+            sortedStrings[numValues - 1]);
         return;
 
       case BYTES:
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OffHeapBitmapInvertedIndexCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OffHeapBitmapInvertedIndexCreator.java
index 2d6d762..d847b6e 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OffHeapBitmapInvertedIndexCreator.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OffHeapBitmapInvertedIndexCreator.java
@@ -17,6 +17,7 @@ package com.linkedin.pinot.core.segment.creator.impl.inv;
 
 import com.google.common.base.Preconditions;
 import com.linkedin.pinot.common.data.FieldSpec;
+import com.linkedin.pinot.core.segment.creator.DictionaryBasedInvertedIndexCreator;
 import com.linkedin.pinot.core.segment.creator.InvertedIndexCreator;
 import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
 import com.linkedin.pinot.core.segment.memory.PinotDataBuffer;
@@ -30,7 +31,7 @@ import org.roaringbitmap.buffer.MutableRoaringBitmap;
 
 
 /**
- * Implementation of {@link InvertedIndexCreator} that uses off-heap memory.
+ * Implementation of {@link DictionaryBasedInvertedIndexCreator} that uses off-heap memory.
  * <p>We use 2 passes to create the inverted index.
  * <ul>
  *   <li>
@@ -48,7 +49,7 @@ import org.roaringbitmap.buffer.MutableRoaringBitmap;
  * </ul>
  * <p>Based on the number of values we need to store, we use direct memory or MMap file to allocate the buffer.
  */
-public final class OffHeapBitmapInvertedIndexCreator implements InvertedIndexCreator {
+public final class OffHeapBitmapInvertedIndexCreator implements DictionaryBasedInvertedIndexCreator {
   // Use MMapBuffer if the buffer size is larger than 100MB
   private static final int NUM_VALUES_THRESHOLD_FOR_MMAP_BUFFER = 25_000_000;
 
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OnHeapBitmapInvertedIndexCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OnHeapBitmapInvertedIndexCreator.java
index b6d9eb1..8fdd351 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OnHeapBitmapInvertedIndexCreator.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/inv/OnHeapBitmapInvertedIndexCreator.java
@@ -16,6 +16,7 @@
 package com.linkedin.pinot.core.segment.creator.impl.inv;
 
 import com.google.common.base.Preconditions;
+import com.linkedin.pinot.core.segment.creator.DictionaryBasedInvertedIndexCreator;
 import com.linkedin.pinot.core.segment.creator.InvertedIndexCreator;
 import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
 import java.io.BufferedOutputStream;
@@ -28,9 +29,9 @@ import org.roaringbitmap.buffer.MutableRoaringBitmap;
 
 
 /**
- * Implementation of {@link InvertedIndexCreator} that uses on-heap memory.
+ * Implementation of {@link DictionaryBasedInvertedIndexCreator} that uses on-heap memory.
  */
-public final class OnHeapBitmapInvertedIndexCreator implements InvertedIndexCreator {
+public final class OnHeapBitmapInvertedIndexCreator implements DictionaryBasedInvertedIndexCreator {
   private final File _invertedIndexFile;
   private final MutableRoaringBitmap[] _bitmaps;
   private int _nextDocId;
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/stats/SegmentPreIndexStatsCollectorImpl.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/stats/SegmentPreIndexStatsCollectorImpl.java
index 1938be9..436b9e2 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/stats/SegmentPreIndexStatsCollectorImpl.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/stats/SegmentPreIndexStatsCollectorImpl.java
@@ -51,6 +51,7 @@ public class SegmentPreIndexStatsCollectorImpl implements SegmentPreIndexStatsCo
       switch (spec.getDataType()) {
         case BOOLEAN:
         case STRING:
+        case TEXT:
           columnStatsCollectorMap.put(spec.getName(),
               new StringColumnPreIndexStatsCollector(column, _statsCollectorConfig));
           break;
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/LuceneIndexCreator.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/LuceneIndexCreator.java
new file mode 100644
index 0000000..2381113
--- /dev/null
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/LuceneIndexCreator.java
@@ -0,0 +1,85 @@
+/**
+ * Copyright (C) 2014-2018 LinkedIn Corp. (pinot-core@linkedin.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.linkedin.pinot.core.segment.creator.impl.textsearch;
+
+import com.linkedin.pinot.core.segment.creator.DocBasedInvertedIndexCreator;
+import java.io.File;
+import java.io.IOException;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+class LuceneIndexCreator implements DocBasedInvertedIndexCreator {
+
+  private static final Logger LOGGER = LoggerFactory.getLogger(LuceneIndexCreator.class);
+
+  public static final String VERSION = "7.6.0";
+  // Lucene flushes buffered documents to disk once its RAM buffer reaches this size (in MB)
+  private static final int MAX_BUFFER_SIZE_MB = 500;
+  private static final String DEFAULT_FIELD = "TEXT";
+  private static final Field.Store DEFAULT_STORE = Field.Store.NO;
+
+  private final TextSearchIndexConfig _config;
+  private final StandardAnalyzer _analyzer;
+  private final IndexWriter _writer;
+  private final IndexWriterConfig _indexWriterConfig;
+  private final Directory _indexDirectory;
+
+  public LuceneIndexCreator(TextSearchIndexConfig config) {
+    _config = config;
+    _analyzer = new StandardAnalyzer();
+    _indexWriterConfig = new IndexWriterConfig(_analyzer);
+    _indexWriterConfig.setRAMBufferSizeMB(MAX_BUFFER_SIZE_MB);
+    File dir = new File(config.getIndexDir().getPath() + "/" + _config.getColumnName());
+    try {
+      _indexDirectory = FSDirectory.open(dir.toPath());
+      _writer = new IndexWriter(_indexDirectory, _indexWriterConfig);
+    } catch (IOException e) {
+      LOGGER.error("Encountered error creating LuceneIndexCreator ", e);
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void seal() throws IOException {
+
+  }
+
+  @Override
+  public void add(Object doc) {
+    Document document = new Document();
+    document.add(new TextField(DEFAULT_FIELD, doc.toString(), DEFAULT_STORE));
+    try {
+      _writer.addDocument(document);
+    } catch (IOException e) {
+      LOGGER.error("Encountered exception while adding doc:{}", doc.toString(), e);
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    _writer.close();
+  }
+}
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexConfig.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexConfig.java
new file mode 100644
index 0000000..d8a4fbd
--- /dev/null
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexConfig.java
@@ -0,0 +1,54 @@
+/**
+ * Copyright (C) 2014-2018 LinkedIn Corp. (pinot-core@linkedin.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.linkedin.pinot.core.segment.creator.impl.textsearch;
+
+import java.io.File;
+
+
+public class TextSearchIndexConfig {
+
+  private final TextSearchIndexType _type;
+  private final boolean _store;
+  private final File _indexDir;
+  private final String _columnName;
+
+  public TextSearchIndexConfig(TextSearchIndexType type, boolean store, File indexDir, String columnName) {
+    _type = type;
+    _store = store;
+    _indexDir = indexDir;
+    _columnName = columnName;
+  }
+
+  public TextSearchIndexType getType() {
+    return _type;
+  }
+
+  public boolean shouldStore() {
+    return _store;
+  }
+
+  public File getIndexDir() {
+    return _indexDir;
+  }
+
+  public String getColumnName() {
+    return _columnName;
+  }
+
+  public static TextSearchIndexConfig getDefaultConfig(String columnName, File indexDir) {
+    return new TextSearchIndexConfig(TextSearchIndexType.LUCENE, true, indexDir, columnName);
+  }
+}
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexCreatorFactory.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexCreatorFactory.java
new file mode 100644
index 0000000..bd05b62
--- /dev/null
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexCreatorFactory.java
@@ -0,0 +1,32 @@
+/**
+ * Copyright (C) 2014-2018 LinkedIn Corp. (pinot-core@linkedin.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.linkedin.pinot.core.segment.creator.impl.textsearch;
+
+import com.linkedin.pinot.core.segment.creator.DocBasedInvertedIndexCreator;
+
+
+public class TextSearchIndexCreatorFactory {
+
+   public static DocBasedInvertedIndexCreator createSearchIndexer(TextSearchIndexConfig config) {
+
+     switch (config.getType()) {
+       case LUCENE:
+         return new LuceneIndexCreator(config);
+       default:
+         throw new IllegalArgumentException("Unsupported TextSearchIndexType " + config.getType());
+     }
+   }
+}
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexType.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexType.java
new file mode 100644
index 0000000..fe7637f
--- /dev/null
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/creator/impl/textsearch/TextSearchIndexType.java
@@ -0,0 +1,50 @@
+/**
+ * Copyright (C) 2014-2018 LinkedIn Corp. (pinot-core@linkedin.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.linkedin.pinot.core.segment.creator.impl.textsearch;
+
+import java.util.HashMap;
+import java.util.Map;
+
+
+/**
+ * Enum for text search-index type
+ */
+public enum TextSearchIndexType {
+  // NOTE: Do not reorder the enums when adding new ones
+  LUCENE(1);
+
+  private int _value;
+
+  private static Map<Integer, TextSearchIndexType> _indexType = new HashMap<>();
+
+  TextSearchIndexType(int value) {
+    _value = value;
+  }
+
+  static {
+    for (TextSearchIndexType type : TextSearchIndexType.values()) {
+      _indexType.put(type._value, type);
+    }
+  }
+
+  public static TextSearchIndexType valueOf(int type) {
+    return _indexType.get(type);
+  }
+
+  public int getValue() {
+    return _value;
+  }
+}
\ No newline at end of file
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/index/column/PhysicalColumnIndexContainer.java b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/index/column/PhysicalColumnIndexContainer.java
index 29ef748..da1e9cd 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/segment/index/column/PhysicalColumnIndexContainer.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/segment/index/column/PhysicalColumnIndexContainer.java
@@ -71,6 +71,7 @@ public final class PhysicalColumnIndexContainer implements ColumnIndexContainer
     }
     PinotDataBuffer fwdIndexBuffer = segmentReader.getIndexFor(columnName, ColumnIndexType.FORWARD_INDEX);
 
+    FieldSpec.DataType type = metadata.getDataType();
     if (metadata.hasDictionary()) {
       //bloom filter
       if (loadBloomFilter) {
@@ -110,10 +111,15 @@ public final class PhysicalColumnIndexContainer implements ColumnIndexContainer
       }
     } else {
       // Raw index
-      _forwardIndex = loadRawForwardIndex(fwdIndexBuffer, metadata.getDataType());
-      _invertedIndex = null;
+      _forwardIndex = loadRawForwardIndex(fwdIndexBuffer, type);
       _dictionary = null;
       _bloomFilterReader = null;
+      _invertedIndex = null;
+      if (loadInvertedIndex) {
+        if (type == FieldSpec.DataType.TEXT) {
+          // TODO: load text-search inverted index
+        }
+      }
     }
   }
 
@@ -190,6 +196,7 @@ public final class PhysicalColumnIndexContainer implements ColumnIndexContainer
         return new FixedByteChunkSingleValueReader(forwardIndexBuffer);
       case STRING:
       case BYTES:
+      case TEXT:
         return new VarByteChunkSingleValueReader(forwardIndexBuffer);
       default:
         throw new IllegalStateException("Illegal data type for raw forward index: " + dataType);
diff --git a/pinot-core/src/main/java/com/linkedin/pinot/core/util/AvroUtils.java b/pinot-core/src/main/java/com/linkedin/pinot/core/util/AvroUtils.java
index f43d676..58d6eb0 100644
--- a/pinot-core/src/main/java/com/linkedin/pinot/core/util/AvroUtils.java
+++ b/pinot-core/src/main/java/com/linkedin/pinot/core/util/AvroUtils.java
@@ -159,6 +159,7 @@ public class AvroUtils {
             fieldAssembler = fieldAssembler.name(fieldSpec.getName()).type().doubleType().noDefault();
             break;
           case STRING:
+          case TEXT:
             fieldAssembler = fieldAssembler.name(fieldSpec.getName()).type().stringType().noDefault();
             break;
           case BYTES:
diff --git a/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithBytesTypeTest.java b/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithBytesTypeTest.java
index 1c64375..127530e 100644
--- a/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithBytesTypeTest.java
+++ b/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithBytesTypeTest.java
@@ -65,12 +65,12 @@ public class SegmentGenerationWithBytesTypeTest {
   private static final int NUM_SORTED_VALUES = 1001;
 
   private static final String SEGMENT_DIR_NAME =
-      System.getProperty("java.io.tmpdir") + File.separator + "bytesTypeTest";
+      System.getProperty("java.io.tmpdir") + File.separator + "TextTypeTest";
   private static final String SEGMENT_NAME = "testSegment";
 
   private static final String AVRO_DIR_NAME = System.getProperty("java.io.tmpdir") + File.separator + "tDigestTest";
 
-  private static final String AVRO_NAME = "tDigest.avro";
+  private static final String AVRO_NAME = "text.avro";
 
   private static final String FIXED_BYTE_SORTED_COLUMN = "sortedColumn";
   private static final String FIXED_BYTES_UNSORTED_COLUMN = "fixedBytes";
diff --git a/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithBytesTypeTest.java b/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithTextTypeTest.java
similarity index 54%
copy from pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithBytesTypeTest.java
copy to pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithTextTypeTest.java
index 1c64375..a5440ae 100644
--- a/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithBytesTypeTest.java
+++ b/pinot-core/src/test/java/com/linkedin/pinot/core/segment/index/creator/SegmentGenerationWithTextTypeTest.java
@@ -15,14 +15,11 @@
  */
 package com.linkedin.pinot.core.segment.index.creator;
 
-import com.google.common.primitives.Ints;
 import com.linkedin.pinot.common.data.DimensionFieldSpec;
 import com.linkedin.pinot.common.data.FieldSpec;
-import com.linkedin.pinot.common.data.MetricFieldSpec;
 import com.linkedin.pinot.common.data.Schema;
 import com.linkedin.pinot.common.segment.ReadMode;
 import com.linkedin.pinot.common.segment.SegmentMetadata;
-import com.linkedin.pinot.common.utils.primitive.ByteArray;
 import com.linkedin.pinot.core.data.GenericRow;
 import com.linkedin.pinot.core.data.readers.GenericRowRecordReader;
 import com.linkedin.pinot.core.data.readers.PinotSegmentRecordReader;
@@ -41,6 +38,7 @@ import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
@@ -49,6 +47,7 @@ import org.apache.avro.file.DataFileWriter;
 import org.apache.avro.generic.GenericData;
 import org.apache.avro.generic.GenericDatumWriter;
 import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.RandomStringUtils;
 import org.testng.Assert;
 import org.testng.annotations.AfterClass;
 import org.testng.annotations.BeforeClass;
@@ -56,26 +55,22 @@ import org.testng.annotations.Test;
 
 
 /**
- * Class for testing segment generation with byte[] data type.
+ * Class for testing segment generation with text data type.
  */
-public class SegmentGenerationWithBytesTypeTest {
+public class SegmentGenerationWithTextTypeTest {
   private static final int NUM_ROWS = 10001;
-  private static final int FIXED_BYTE_LENGTH = 53;
-  private static final int MAX_VARIABLE_BYTES_LENGTH = 101;
-  private static final int NUM_SORTED_VALUES = 1001;
+  private static final int MAX_STRING_LENGTH = 1000;
 
   private static final String SEGMENT_DIR_NAME =
-      System.getProperty("java.io.tmpdir") + File.separator + "bytesTypeTest";
+      System.getProperty("java.io.tmpdir") + File.separator + "textTypeTest";
   private static final String SEGMENT_NAME = "testSegment";
 
-  private static final String AVRO_DIR_NAME = System.getProperty("java.io.tmpdir") + File.separator + "tDigestTest";
+  private static final String AVRO_DIR_NAME = System.getProperty("java.io.tmpdir") + File.separator + "textTest";
 
-  private static final String AVRO_NAME = "tDigest.avro";
+  private static final String AVRO_NAME = "text.avro";
 
-  private static final String FIXED_BYTE_SORTED_COLUMN = "sortedColumn";
-  private static final String FIXED_BYTES_UNSORTED_COLUMN = "fixedBytes";
-  private static final String FIXED_BYTES_NO_DICT_COLUMN = "fixedBytesNoDict";
-  private static final String VARIABLE_BYTES_COLUMN = "variableBytes";
+  private static final String TEXT_SORTED_COLUMN = "sortedColumn";
+  private static final String TEXT_COLUMN = "textColumn";
 
   private Random _random;
   private RecordReader _recordReader;
@@ -91,14 +86,17 @@ public class SegmentGenerationWithBytesTypeTest {
   public void setup() throws Exception {
 
     _schema = new Schema();
-    _schema.addField(new DimensionFieldSpec(FIXED_BYTE_SORTED_COLUMN, FieldSpec.DataType.BYTES, true));
-    _schema.addField(new DimensionFieldSpec(FIXED_BYTES_UNSORTED_COLUMN, FieldSpec.DataType.BYTES, true));
-    _schema.addField(new DimensionFieldSpec(FIXED_BYTES_NO_DICT_COLUMN, FieldSpec.DataType.BYTES, true));
-    _schema.addField(new DimensionFieldSpec(VARIABLE_BYTES_COLUMN, FieldSpec.DataType.BYTES, true));
+    _schema.addField(new DimensionFieldSpec(TEXT_SORTED_COLUMN, FieldSpec.DataType.TEXT, true));
+    _schema.addField(new DimensionFieldSpec(TEXT_COLUMN, FieldSpec.DataType.TEXT, true));
 
     _random = new Random(System.nanoTime());
-    _recordReader = buildIndex(_schema);
-    _segment = ImmutableSegmentLoader.load(new File(SEGMENT_DIR_NAME, SEGMENT_NAME), ReadMode.heap);
+    try {
+      _recordReader = buildIndex(_schema);
+      _segment = ImmutableSegmentLoader.load(new File(SEGMENT_DIR_NAME, SEGMENT_NAME), ReadMode.heap);
+    } catch (Exception e) {
+      e.printStackTrace();
+      throw e;
+    }
   }
 
   /**
@@ -122,12 +120,10 @@ public class SegmentGenerationWithBytesTypeTest {
       GenericRow actualRow = pinotReader.next();
 
       for (String column : _schema.getColumnNames()) {
-        byte[] actual = (byte[]) actualRow.getValue(column);
-        byte[] expected = (byte[]) expectedRow.getValue(column);
+        String actual = (String) actualRow.getValue(column);
+        String expected = (String) expectedRow.getValue(column);
 
-        if (ByteArray.compare(actual, expected) != 0) {
-          Assert.assertEquals(actualRow.getValue(column), expectedRow.getValue(column));
-        }
+        Assert.assertEquals(actual, expected);
       }
     }
 
@@ -138,59 +134,30 @@ public class SegmentGenerationWithBytesTypeTest {
 
   @Test
   public void testMetadata() {
-    Assert.assertTrue(_segment.getDataSource(FIXED_BYTE_SORTED_COLUMN).getDataSourceMetadata().isSorted());
-    Assert.assertFalse(_segment.getSegmentMetadata().hasDictionary(FIXED_BYTES_NO_DICT_COLUMN));
-  }
-
-  @Test
-  public void testDictionary() {
-    ImmutableDictionaryReader dictionary = (ImmutableDictionaryReader) _segment.getDictionary(FIXED_BYTE_SORTED_COLUMN);
-    Assert.assertEquals(dictionary.length(), NUM_SORTED_VALUES);
-
-    // Test dictionary indexing.
-    for (int i = 0; i < NUM_ROWS; i++) {
-      int value = (i * NUM_SORTED_VALUES) / NUM_ROWS;
-      // For sorted columns, values are written as 0, 0, 0.., 1, 1, 1...n, n, n
-      Assert.assertEquals(dictionary.indexOf(Ints.toByteArray(value)), value % NUM_SORTED_VALUES);
-    }
-
-    // Test value not in dictionary.
-    Assert.assertEquals(dictionary.indexOf(Ints.toByteArray(NUM_SORTED_VALUES + 1)), -1);
-    Assert.assertEquals(dictionary.insertionIndexOf(Ints.toByteArray(NUM_SORTED_VALUES + 1)),
-        -(dictionary.length() + 1));
-
-    int[] dictIds = new int[dictionary.length()];
-    for (int i = 0; i < dictIds.length; i++) {
-      dictIds[i] = i;
-    }
-
-    byte[][] values = new byte[dictIds.length][];
-    dictionary.readBytesValues(dictIds, 0, dictIds.length, values, 0);
-    for (int expected = 0; expected < values.length; expected++) {
-      int actual = ByteBuffer.wrap(values[expected]).asIntBuffer().get();
-      Assert.assertEquals(actual, expected);
-    }
+    Assert.assertTrue(_segment.getDataSource(TEXT_SORTED_COLUMN).getDataSourceMetadata().isSorted());
+    Assert.assertFalse(_segment.getSegmentMetadata().hasDictionary(TEXT_COLUMN));
+    Assert.assertFalse(_segment.getSegmentMetadata().hasDictionary(TEXT_SORTED_COLUMN));
   }
 
   /**
-   * This test generates an avro with TDigest BYTES data, and tests segment generation.
+   * This test generates an Avro file with TEXT data, and tests segment generation from it.
    */
   @Test
-  public void testTDigestAvro() throws Exception {
+  public void testTextAvro() throws Exception {
     Schema schema = new Schema();
-    schema.addField(new MetricFieldSpec(FIXED_BYTES_UNSORTED_COLUMN, FieldSpec.DataType.BYTES));
-    schema.addField(new MetricFieldSpec(VARIABLE_BYTES_COLUMN, FieldSpec.DataType.BYTES));
+    schema.addField(new DimensionFieldSpec(TEXT_COLUMN, FieldSpec.DataType.TEXT, true));
+    schema.addField(new DimensionFieldSpec(TEXT_SORTED_COLUMN, FieldSpec.DataType.TEXT, true));
 
-    List<byte[]> _fixedExpected = new ArrayList<>(NUM_ROWS);
-    List<byte[]> _varExpected = new ArrayList<>(NUM_ROWS);
+    String[] sortedExpected = new String[NUM_ROWS];
+    String[] unsortedExpected = new String[NUM_ROWS];
 
-    buildAvro(schema, _fixedExpected, _varExpected);
+    buildAvro(schema, sortedExpected, unsortedExpected);
 
     IndexSegment segment = buildSegmentFromAvro(schema, AVRO_DIR_NAME, AVRO_NAME, SEGMENT_NAME);
     SegmentMetadata metadata = segment.getSegmentMetadata();
 
-    Assert.assertTrue(metadata.hasDictionary(FIXED_BYTES_UNSORTED_COLUMN));
-    Assert.assertFalse(metadata.hasDictionary(VARIABLE_BYTES_COLUMN));
+    Assert.assertFalse(metadata.hasDictionary(TEXT_COLUMN));
+    Assert.assertFalse(metadata.hasDictionary(TEXT_SORTED_COLUMN));
 
     PinotSegmentRecordReader reader = new PinotSegmentRecordReader(new File(AVRO_DIR_NAME, SEGMENT_NAME));
     GenericRow row = new GenericRow();
@@ -198,9 +165,9 @@ public class SegmentGenerationWithBytesTypeTest {
     int i = 0;
     while (reader.hasNext()) {
       row = reader.next(row);
-      Assert.assertEquals(ByteArray.compare((byte[]) row.getValue(FIXED_BYTES_UNSORTED_COLUMN), _fixedExpected.get(i)),
-          0);
-      Assert.assertEquals(ByteArray.compare((byte[]) row.getValue(VARIABLE_BYTES_COLUMN), _varExpected.get(i++)), 0);
+      Assert.assertEquals(((String)row.getValue(TEXT_COLUMN)), unsortedExpected[i], "Comparison failed at index " + i);
+      Assert.assertTrue(((String)row.getValue(TEXT_SORTED_COLUMN)).equals(sortedExpected[i]), "Comparison failed at index " + i);
+      i++;
     }
     segment.destroy();
   }
@@ -217,28 +184,23 @@ public class SegmentGenerationWithBytesTypeTest {
 
     config.setOutDir(SEGMENT_DIR_NAME);
     config.setSegmentName(SEGMENT_NAME);
-    config.setRawIndexCreationColumns(Collections.singletonList(FIXED_BYTES_NO_DICT_COLUMN));
+    List<String> columns = Arrays.asList(TEXT_COLUMN, TEXT_SORTED_COLUMN);
+    config.setRawIndexCreationColumns(columns);
+    config.setInvertedIndexCreationColumns(columns);
+
+    String[] sorted = new String[NUM_ROWS];
+    for (int i = 0; i < NUM_ROWS; i++) {
+      String str = RandomStringUtils.randomAlphabetic(1 + _random.nextInt(MAX_STRING_LENGTH));
+      sorted[i] = str;
+    }
+    Arrays.sort(sorted);
 
     List<GenericRow> rows = new ArrayList<>(NUM_ROWS);
     for (int i = 0; i < NUM_ROWS; i++) {
       HashMap<String, Object> map = new HashMap<>();
 
-      // Set the value for fixed-byte sorted column.
-      map.put(FIXED_BYTE_SORTED_COLUMN, Ints.toByteArray((i * NUM_SORTED_VALUES) / NUM_ROWS));
-
-      // Set the value for fixed-byte unsorted column.
-      byte[] fixedBytes = new byte[FIXED_BYTE_LENGTH];
-      _random.nextBytes(fixedBytes);
-      map.put(FIXED_BYTES_UNSORTED_COLUMN, fixedBytes);
-
-      // Set the value for fixed-byte no-dictionary column.
-      map.put(FIXED_BYTES_NO_DICT_COLUMN, fixedBytes);
-
-      // Set the value fo variable length column. Ensure at least one zero-length byte[].
-      int length = (i == 0) ? 0 : _random.nextInt(MAX_VARIABLE_BYTES_LENGTH);
-      byte[] varBytes = new byte[length];
-      _random.nextBytes(varBytes);
-      map.put(VARIABLE_BYTES_COLUMN, varBytes);
+      map.put(TEXT_SORTED_COLUMN, sorted[i]);
+      map.put(TEXT_COLUMN, RandomStringUtils.randomAlphabetic(1 + _random.nextInt(MAX_STRING_LENGTH)));
 
       GenericRow genericRow = new GenericRow();
       genericRow.init(map);
@@ -256,14 +218,14 @@ public class SegmentGenerationWithBytesTypeTest {
   }
 
   /**
-   * Build Avro file containing serialized TDigest bytes.
+   * Builds an Avro file containing TEXT column data.
    *
    * @param schema Schema of data (one fixed and one variable column)
-   * @param _fixedExpected Serialized bytes of fixed length column are populated here
-   * @param _varExpected Serialized bytes of variable length column are populated here
+   * @param sortedExpected Values written to the sorted column are populated here
+   * @param unsortedExpected Values written to the unsorted column are populated here
    * @throws IOException
    */
-  private void buildAvro(Schema schema, List<byte[]> _fixedExpected, List<byte[]> _varExpected) throws IOException {
+  private void buildAvro(Schema schema, String[] sortedExpected, String[] unsortedExpected) throws IOException {
     org.apache.avro.Schema avroSchema = AvroUtils.getAvroSchemaFromPinotSchema(schema);
 
     try (DataFileWriter<GenericData.Record> recordWriter = new DataFileWriter<>(new GenericDatumWriter<>(avroSchema))) {
@@ -272,31 +234,20 @@ public class SegmentGenerationWithBytesTypeTest {
         throw new RuntimeException("Unable to create test directory: " + AVRO_DIR_NAME);
       }
 
+      for (int i = 0; i < NUM_ROWS; i++) {
+        String str = RandomStringUtils.randomAlphanumeric(1 + _random.nextInt(MAX_STRING_LENGTH));
+        sortedExpected[i] = str;
+      }
+      Arrays.sort(sortedExpected);
+
       recordWriter.create(avroSchema, new File(AVRO_DIR_NAME, AVRO_NAME));
       for (int i = 0; i < NUM_ROWS; i++) {
         GenericData.Record record = new GenericData.Record(avroSchema);
 
-        TDigest tDigest = TDigest.createMergingDigest(PercentileTDigestAggregationFunction.DEFAULT_TDIGEST_COMPRESSION);
-        tDigest.add(_random.nextDouble());
-
-        ByteBuffer buffer = ByteBuffer.allocate(tDigest.byteSize());
-        tDigest.asBytes(buffer);
-        _fixedExpected.add(buffer.array());
-
-        buffer.flip();
-        record.put(FIXED_BYTES_UNSORTED_COLUMN, buffer);
-
-        if (i % 2 == 0) {
-          tDigest.add(_random.nextDouble());
-        }
-
-        buffer = ByteBuffer.allocate(tDigest.byteSize());
-        tDigest.asBytes(buffer);
-        _varExpected.add(buffer.array());
-
-        buffer.flip();
-        record.put(VARIABLE_BYTES_COLUMN, buffer);
-
+        String str = RandomStringUtils.randomAlphanumeric(1 + _random.nextInt(MAX_STRING_LENGTH));
+        unsortedExpected[i] = str;
+        record.put(TEXT_COLUMN, str);
+        record.put(TEXT_SORTED_COLUMN, sortedExpected[i]);
         recordWriter.append(record);
       }
     }
@@ -315,6 +266,9 @@ public class SegmentGenerationWithBytesTypeTest {
     config.setOutDir(dirName);
     config.setSegmentName(segmentName);
     config.setSchema(schema);
+    List<String> columns = Arrays.asList(TEXT_COLUMN, TEXT_SORTED_COLUMN);
+    config.setRawIndexCreationColumns(columns);
+    config.setInvertedIndexCreationColumns(columns);
 
     SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
     driver.init(config);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org