You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by si...@apache.org on 2020/03/31 04:47:32 UTC
[incubator-pinot] branch master updated: Lucene DocId to PinotDocId cache to improve performance (#5177)

This is an automated email from the ASF dual-hosted git repository.

siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 8dfa51a  Lucene DocId to PinotDocId cache to improve performance (#5177)
8dfa51a is described below

commit 8dfa51af244f911d52824f94e95f983ffc50b5fb
Author: Sidd <si...@gmail.com>
AuthorDate: Mon Mar 30 21:47:22 2020 -0700

    Lucene DocId to PinotDocId cache to improve performance (#5177)
    
    Co-authored-by: Siddharth Teotia <st...@steotia-mn1.linkedin.biz>
---
 .../index/column/PhysicalColumnIndexContainer.java |  2 +-
 .../converter/SegmentV1V2ToV3FormatConverter.java  | 18 +++++
 .../loader/invertedindex/TextIndexHandler.java     | 59 ++++++++--------
 .../index/readers/text/LuceneTextIndexReader.java  | 79 ++++++++++++++++++----
 .../core/segment/store/SegmentDirectoryPaths.java  |  9 +++
 .../core/segment/index/loader/LoaderTest.java      | 57 ++++++++++++++--
 ...archQueries.java => TextSearchQueriesTest.java} |  3 +-
 7 files changed, 177 insertions(+), 50 deletions(-)

diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/column/PhysicalColumnIndexContainer.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/column/PhysicalColumnIndexContainer.java
index 33ba360..76ff19e 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/column/PhysicalColumnIndexContainer.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/column/PhysicalColumnIndexContainer.java
@@ -132,7 +132,7 @@ public final class PhysicalColumnIndexContainer implements ColumnIndexContainer
       _dictionary = null;
       _bloomFilterReader = null;
       if (loadTextIndex) {
-        _invertedIndex = new LuceneTextIndexReader(columnName, segmentIndexDir);
+        _invertedIndex = new LuceneTextIndexReader(columnName, segmentIndexDir, metadata.getTotalDocs());
       } else {
         _invertedIndex = null;
       }
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/converter/SegmentV1V2ToV3FormatConverter.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/converter/SegmentV1V2ToV3FormatConverter.java
index 71c56ae..f534fe9 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/converter/SegmentV1V2ToV3FormatConverter.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/converter/SegmentV1V2ToV3FormatConverter.java
@@ -35,6 +35,7 @@ import org.apache.pinot.core.indexsegment.generator.SegmentVersion;
 import org.apache.pinot.core.segment.creator.impl.V1Constants;
 import org.apache.pinot.core.segment.creator.impl.inv.text.LuceneTextIndexCreator;
 import org.apache.pinot.core.segment.index.metadata.SegmentMetadataImpl;
+import org.apache.pinot.core.segment.index.readers.text.LuceneTextIndexReader;
 import org.apache.pinot.core.segment.memory.PinotDataBuffer;
 import org.apache.pinot.core.segment.store.ColumnIndexType;
 import org.apache.pinot.core.segment.store.SegmentDirectory;
@@ -225,6 +226,7 @@ public class SegmentV1V2ToV3FormatConverter implements SegmentFormatConverter {
 
   private void copyLuceneTextIndexIfExists(File segmentDirectory, File v3Dir)
       throws IOException {
+    // TODO: see if this can be done by reusing some existing methods
     String suffix = LuceneTextIndexCreator.LUCENE_TEXT_INDEX_FILE_EXTENSION;
     File[] textIndexFiles = segmentDirectory.listFiles(new FilenameFilter() {
       @Override
@@ -241,6 +243,22 @@ public class SegmentV1V2ToV3FormatConverter implements SegmentFormatConverter {
         Files.copy(indexFile.toPath(), v3LuceneIndexFile.toPath());
       }
     }
+    // if segment reload is issued asking for up-conversion of
+    // on-disk segment format from v1/v2 to v3, then in addition
+    // to moving the lucene text index files, we need to move the
+    // docID mapping/cache file created by us in v1/v2 during an earlier
+    // load of the segment.
+    String docIDFileSuffix = LuceneTextIndexReader.LUCENE_TEXT_INDEX_DOCID_MAPPING_FILE_EXTENSION;
+    File[] textIndexDocIdMappingFiles = segmentDirectory.listFiles(new FilenameFilter() {
+      @Override
+      public boolean accept(File dir, String name) {
+        return name.endsWith(docIDFileSuffix);
+      }
+    });
+    for (File docIdMappingFile : textIndexDocIdMappingFiles) {
+      File v3DocIdMappingFile = new File(v3Dir, docIdMappingFile.getName());
+      Files.copy(docIdMappingFile.toPath(), v3DocIdMappingFile.toPath());
+    }
   }
 
   private void deleteStaleConversionDirectories(File segmentDirectory) {
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/invertedindex/TextIndexHandler.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/invertedindex/TextIndexHandler.java
index a501596..1c1786f 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/invertedindex/TextIndexHandler.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/invertedindex/TextIndexHandler.java
@@ -61,8 +61,26 @@ import static org.apache.pinot.core.segment.creator.impl.V1Constants.MetadataKey
 import static org.apache.pinot.core.segment.creator.impl.V1Constants.MetadataKeys.Column.getKeyFor;
 
 
+/**
+ * Helper class for text indexes used by {@link org.apache.pinot.core.segment.index.loader.SegmentPreProcessor}
+ * to create text index for column during segment load time. Currently text index is always
+ * created (if enabled on a column) during segment generation. The scenarios handled are:
+ *
+ * (1) A new segment with text index is created/refreshed. Server loads the segment. The handler
+ * detects the existence of text index and returns.
+ *
+ * (2) A reload is issued on an existing segment with existing text index. The handler
+ * detects the existence of text index and returns.
+ *
+ * (3) A reload is issued on an existing segment after text index is enabled on an existing
+ * column. Read the forward index to create text index.
+ *
+ * (4) A reload is issued on an existing segment after text index is enabled on a newly
+ * added column. In this case, the default column handler would have taken care of adding
+ * forward index for the new column. Read the forward index to create text index.
+ */
 public class TextIndexHandler {
-  private static final Logger LOGGER = LoggerFactory.getLogger(InvertedIndexHandler.class);
+  private static final Logger LOGGER = LoggerFactory.getLogger(TextIndexHandler.class);
 
   private final File _indexDir;
   private final SegmentDirectory.Writer _segmentWriter;
@@ -85,26 +103,6 @@ public class TextIndexHandler {
     }
   }
 
-  /**
-   * Create text index for column during segment load time. Currently text index is always
-   * created (if enabled on a column) during segment generation (true for both offline
-   * and realtime segments). So this function is a NO-OP for case when a new segment is loaded
-   * after creation. However, when segment reload is issued in the following scenarios, we generate
-   * text index.
-   *
-   * SCENARIO 1: user enables text index on an existing column (table config change)
-   * SCENARIO 2: user adds a new column and enables text index (both schema and table config change)
-   *
-   * This function is a NO-OP for the above two cases. Later we can also add a segment generator
-   * config option to not necessarily generate text index during segment generation. When we do
-   * so, this function should be able to take care of that scenario too.
-   *
-   * For scenario 2, {@link org.apache.pinot.core.segment.index.loader.defaultcolumn.V3DefaultColumnHandler}
-   * would have already added the forward index for the column with default value. We use the forward
-   * index here to get the raw data and build text index.
-   *
-   * @throws IOException
-   */
   public void createTextIndexesOnSegmentLoad()
       throws Exception {
     for (ColumnMetadata columnMetadata : _textIndexColumns) {
@@ -125,21 +123,23 @@ public class TextIndexHandler {
   private void checkUnsupportedOperationsForTextIndex(ColumnMetadata columnMetadata) {
     String column = columnMetadata.getColumnName();
     if (columnMetadata.hasDictionary()) {
-      throw new UnsupportedOperationException("Text index is currently not supported on dictionary encoded column: "+column);
+      throw new UnsupportedOperationException(
+          "Text index is currently not supported on dictionary encoded column: " + column);
     }
 
     if (columnMetadata.isSorted()) {
       // since Pinot's current implementation doesn't support raw sorted columns,
       // we need to check for this too
-      throw new UnsupportedOperationException("Text index is currently not supported on sorted columns: "+column);
+      throw new UnsupportedOperationException("Text index is currently not supported on sorted columns: " + column);
     }
 
     if (!columnMetadata.isSingleValue()) {
-      throw new UnsupportedOperationException("Text index is currently not supported on multi-value columns: "+column);
+      throw new UnsupportedOperationException(
+          "Text index is currently not supported on multi-value columns: " + column);
     }
 
     if (columnMetadata.getDataType() != FieldSpec.DataType.STRING) {
-      throw new UnsupportedOperationException("Text index is currently only supported on STRING columns: "+column);
+      throw new UnsupportedOperationException("Text index is currently only supported on STRING columns: " + column);
     }
   }
 
@@ -153,8 +153,13 @@ public class TextIndexHandler {
     }
     int numDocs = columnMetadata.getTotalDocs();
     LOGGER.info("Creating new text index for column: {} in segment: {}", column, _segmentName);
-    File segmentIndexDir = SegmentDirectoryPaths.segmentDirectoryFor(_indexDir, _segmentVersion);
-    try (LuceneTextIndexCreator textIndexCreator = new LuceneTextIndexCreator(column, segmentIndexDir, true)) {
+    File segmentDirectory = SegmentDirectoryPaths.segmentDirectoryFor(_indexDir, _segmentVersion);
+    // The handlers are always invoked by the preprocessor. Before this ImmutableSegmentLoader would have already
+    // up-converted the segment from v1/v2 -> v3 (if needed). So based on the segmentVersion, whatever
+    // segmentDirectory is indicated to us by SegmentDirectoryPaths, we create lucene index there. There is no
+    // further need to move around the lucene index directory since it is created with correct directory structure
+    // based on segmentVersion.
+    try (LuceneTextIndexCreator textIndexCreator = new LuceneTextIndexCreator(column, segmentDirectory, true)) {
       try (DataFileReader forwardIndexReader = getForwardIndexReader(columnMetadata)) {
         VarByteChunkSingleValueReader forwardIndex = (VarByteChunkSingleValueReader) forwardIndexReader;
         for (int docID = 0; docID < numDocs; docID++) {
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java
index abefe15..d55c453 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java
@@ -18,8 +18,10 @@
  */
 package org.apache.pinot.core.segment.index.readers.text;
 
+import java.io.Closeable;
 import java.io.File;
 import java.io.IOException;
+import java.nio.ByteOrder;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
@@ -32,6 +34,7 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.pinot.core.segment.creator.impl.inv.text.LuceneTextIndexCreator;
 import org.apache.pinot.core.segment.index.readers.InvertedIndexReader;
+import org.apache.pinot.core.segment.memory.PinotDataBuffer;
 import org.apache.pinot.core.segment.store.SegmentDirectoryPaths;
 import org.roaringbitmap.IntIterator;
 import org.roaringbitmap.buffer.MutableRoaringBitmap;
@@ -51,6 +54,9 @@ public class LuceneTextIndexReader implements InvertedIndexReader<MutableRoaring
   private final IndexSearcher _indexSearcher;
   private final QueryParser _queryParser;
   private final String _column;
+  private final DocIdTranslator _docIdTranslator;
+
+  public static final String LUCENE_TEXT_INDEX_DOCID_MAPPING_FILE_EXTENSION = ".lucene.mapping";
 
   /**
    * As part of loading the segment in ImmutableSegmentLoader,
@@ -58,18 +64,22 @@ public class LuceneTextIndexReader implements InvertedIndexReader<MutableRoaring
    * the reference in {@link org.apache.pinot.core.segment.index.column.PhysicalColumnIndexContainer}
    * similar to how it is done for other types of indexes.
    * @param column column name
-   * @param segmentIndexDir segment index directory
+   * @param indexDir segment index directory
+   * @param numDocs number of documents in the segment
    */
-  public LuceneTextIndexReader(String column, File segmentIndexDir) {
+  public LuceneTextIndexReader(String column, File indexDir, int numDocs) {
     _column = column;
     try {
-      File indexFile = getTextIndexFile(segmentIndexDir);
+      File indexFile = getTextIndexFile(indexDir);
       _indexDirectory = FSDirectory.open(indexFile.toPath());
       _indexReader = DirectoryReader.open(_indexDirectory);
       _indexSearcher = new IndexSearcher(_indexReader);
       // Disable Lucene query result cache. While it helps a lot with performance for
       // repeated queries, on the downside it cause heap issues.
       _indexSearcher.setQueryCache(null);
+      // TODO: consider using a threshold of num docs per segment to decide between building
+      // mapping file upfront on segment load v/s on-the-fly during query processing
+      _docIdTranslator = new DocIdTranslator(indexDir, _column, numDocs, _indexSearcher);
     } catch (Exception e) {
       LOGGER
           .error("Failed to instantiate Lucene text index reader for column {}, exception {}", column, e.getMessage());
@@ -123,9 +133,8 @@ public class LuceneTextIndexReader implements InvertedIndexReader<MutableRoaring
       _indexSearcher.search(query, docIDCollector);
       return getPinotDocIds(docIDs);
     } catch (Exception e) {
-      LOGGER.error("Failed while searching the text index for column {}, search query {}, exception {}", _column,
-          searchQuery, e.getMessage());
-      throw new RuntimeException(e);
+      String msg = "Caught excepttion while searching the text index for column:" + _column + " search query:" + searchQuery;
+      throw new RuntimeException(msg, e);
     }
   }
 
@@ -145,15 +154,10 @@ public class LuceneTextIndexReader implements InvertedIndexReader<MutableRoaring
   private MutableRoaringBitmap getPinotDocIds(MutableRoaringBitmap luceneDocIds) {
     IntIterator luceneDocIDIterator = luceneDocIds.getIntIterator();
     MutableRoaringBitmap actualDocIDs = new MutableRoaringBitmap();
-    try {
-      while (luceneDocIDIterator.hasNext()) {
-        int luceneDocId = luceneDocIDIterator.next();
-        Document document = _indexSearcher.doc(luceneDocId);
-        int pinotDocId = Integer.valueOf(document.get(LuceneTextIndexCreator.LUCENE_INDEX_DOC_ID_COLUMN_NAME));
-        actualDocIDs.add(pinotDocId);
-      }
-    } catch (Exception e) {
-      throw new RuntimeException("Error: failed while retrieving document from index: " + e);
+    while (luceneDocIDIterator.hasNext()) {
+      int luceneDocId = luceneDocIDIterator.next();
+      int pinotDocId = _docIdTranslator.getPinotDocId(luceneDocId);
+      actualDocIDs.add(pinotDocId);
     }
     return actualDocIDs;
   }
@@ -169,5 +173,50 @@ public class LuceneTextIndexReader implements InvertedIndexReader<MutableRoaring
       throws IOException {
     _indexReader.close();
     _indexDirectory.close();
+    _docIdTranslator.close();
+  }
+
+  private static class DocIdTranslator implements Closeable {
+    final PinotDataBuffer _buffer;
+
+    DocIdTranslator(File segmentIndexDir, String column, int numDocs, IndexSearcher indexSearcher)
+        throws Exception {
+      long length = (long) Integer.BYTES * numDocs; // long arithmetic: cannot overflow for huge segments
+      File docIdMappingFile = new File(SegmentDirectoryPaths.findSegmentDirectory(segmentIndexDir),
+          column + LUCENE_TEXT_INDEX_DOCID_MAPPING_FILE_EXTENSION);
+      // Mapping is local to a segment and built at load time; unless on Solaris/SPARC, use LITTLE_ENDIAN (x86).
+      String desc = "Text index docId mapping buffer: " + column;
+      if (docIdMappingFile.exists()) {
+        // we get here on segment reload and server restart (on refresh the segment is deleted/replaced)
+        // TODO: see if we can prefetch the pages
+        _buffer =
+            PinotDataBuffer.mapFile(docIdMappingFile, /* readOnly */ true, 0, length, ByteOrder.LITTLE_ENDIAN, desc);
+      } else {
+        _buffer =
+            PinotDataBuffer.mapFile(docIdMappingFile, /* readOnly */ false, 0, length, ByteOrder.LITTLE_ENDIAN, desc);
+        try {
+          for (int i = 0; i < numDocs; i++) {
+            Document document = indexSearcher.doc(i);
+            int pinotDocId = Integer.parseInt(document.get(LuceneTextIndexCreator.LUCENE_INDEX_DOC_ID_COLUMN_NAME));
+            _buffer.putInt((long) i * Integer.BYTES, pinotDocId);
+          }
+        } catch (Exception e) {
+          // drop the half-written file, else the next load would trust it as a complete mapping
+          _buffer.close();
+          docIdMappingFile.delete();
+          throw new RuntimeException("Caught exception while building doc id mapping for text index column: " + column, e);
+        }
+      }
+    }
+
+    int getPinotDocId(int luceneDocId) {
+      return _buffer.getInt((long) luceneDocId * Integer.BYTES);
+    }
+
+    @Override
+    public void close()
+        throws IOException {
+      _buffer.close();
+    }
   }
 }
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/store/SegmentDirectoryPaths.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/store/SegmentDirectoryPaths.java
index 322710c..9e1588a 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/store/SegmentDirectoryPaths.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/store/SegmentDirectoryPaths.java
@@ -18,12 +18,14 @@
  */
 package org.apache.pinot.core.segment.store;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 import java.io.File;
 import javax.annotation.Nullable;
 import org.apache.pinot.core.indexsegment.generator.SegmentVersion;
 import org.apache.pinot.core.segment.creator.impl.V1Constants;
 import org.apache.pinot.core.segment.creator.impl.inv.text.LuceneTextIndexCreator;
+import org.apache.pinot.core.segment.index.readers.text.LuceneTextIndexReader;
 
 
 public class SegmentDirectoryPaths {
@@ -83,6 +85,13 @@ public class SegmentDirectoryPaths {
     return findFormatFile(indexDir, luceneIndexDirectory);
   }
 
+  @Nullable
+  @VisibleForTesting
+  public static File findTextIndexDocIdMappingFile(File indexDir, String column) {
+    String file = column + LuceneTextIndexReader.LUCENE_TEXT_INDEX_DOCID_MAPPING_FILE_EXTENSION;
+    return findFormatFile(indexDir, file);
+  }
+
   /**
    * Find a file in any segment version.
    * <p>Index directory passed in should be top level segment directory.
diff --git a/pinot-core/src/test/java/org/apache/pinot/core/segment/index/loader/LoaderTest.java b/pinot-core/src/test/java/org/apache/pinot/core/segment/index/loader/LoaderTest.java
index 7d81b7b..c6c1e7c 100644
--- a/pinot-core/src/test/java/org/apache/pinot/core/segment/index/loader/LoaderTest.java
+++ b/pinot-core/src/test/java/org/apache/pinot/core/segment/index/loader/LoaderTest.java
@@ -20,7 +20,10 @@ package org.apache.pinot.core.segment.index.loader;
 
 import java.io.File;
 import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import org.apache.commons.io.FileUtils;
 import org.apache.pinot.common.segment.ReadMode;
 import org.apache.pinot.common.utils.TarGzCompressionUtils;
@@ -36,6 +39,7 @@ import org.apache.pinot.core.segment.index.converter.SegmentV1V2ToV3FormatConver
 import org.apache.pinot.core.segment.index.metadata.ColumnMetadata;
 import org.apache.pinot.core.segment.index.metadata.SegmentMetadataImpl;
 import org.apache.pinot.core.segment.index.readers.StringDictionary;
+import org.apache.pinot.core.segment.index.readers.text.LuceneTextIndexReader;
 import org.apache.pinot.core.segment.memory.PinotDataBuffer;
 import org.apache.pinot.core.segment.store.ColumnIndexType;
 import org.apache.pinot.core.segment.store.SegmentDirectory;
@@ -268,6 +272,9 @@ public class LoaderTest {
 
   @Test
   public void testTextIndexLoad() throws Exception {
+    // Tests for scenarios by creating on-disk segment in V3 and then loading
+    // the segment with and without specifying segmentVersion in IndexLoadingConfig
+
     // create on-disk segment in V3
     // this generates the segment in V1 but converts to V3 as part of post-creation processing
     constructSegmentWithTextIndex(SegmentVersion.v3);
@@ -288,7 +295,10 @@ public class LoaderTest {
     // CASE 1: don't set the segment version to load in IndexLoadingConfig
     // there should be no conversion done by ImmutableSegmentLoader and it should
     // be able to create text index reader with on-disk version V3
-    IndexSegment indexSegment = ImmutableSegmentLoader.load(_indexDir, ReadMode.mmap);
+    IndexLoadingConfig indexLoadingConfig = new IndexLoadingConfig();
+    indexLoadingConfig.setTextIndexColumns(new HashSet<>(Arrays.asList(TEXT_INDEX_COL_NAME)));
+    indexLoadingConfig.setReadMode(ReadMode.mmap);
+    IndexSegment indexSegment = ImmutableSegmentLoader.load(_indexDir, indexLoadingConfig);
     // check that loaded segment version is v3
     Assert.assertEquals(indexSegment.getSegmentMetadata().getVersion(), SegmentVersion.v3.toString());
     // no change/conversion should have happened in indexDir
@@ -297,16 +307,23 @@ public class LoaderTest {
     verifyIndexDirIsV3(_indexDir);
     // no change/conversion should have happened for textIndex dir
     textIndexFile = SegmentDirectoryPaths.findTextIndexIndexFile(_indexDir, TEXT_INDEX_COL_NAME);
+    // segment load should have created the docID mapping file in V3 structure
+    File textIndexDocIdMappingFile = SegmentDirectoryPaths.findTextIndexDocIdMappingFile(_indexDir, TEXT_INDEX_COL_NAME);
     Assert.assertNotNull(textIndexFile);
+    Assert.assertNotNull(textIndexDocIdMappingFile);
     Assert.assertTrue(textIndexFile.isDirectory());
+    Assert.assertFalse(textIndexDocIdMappingFile.isDirectory());
     Assert.assertEquals(textIndexFile.getName(), TEXT_INDEX_COL_NAME + LuceneTextIndexCreator.LUCENE_TEXT_INDEX_FILE_EXTENSION);
     Assert.assertEquals(textIndexFile.getParentFile().getName(), SegmentDirectoryPaths.V3_SUBDIRECTORY_NAME);
+    Assert.assertEquals(textIndexDocIdMappingFile.getName(), TEXT_INDEX_COL_NAME + LuceneTextIndexReader.LUCENE_TEXT_INDEX_DOCID_MAPPING_FILE_EXTENSION);
+    Assert.assertEquals(textIndexDocIdMappingFile.getParentFile().getName(), SegmentDirectoryPaths.V3_SUBDIRECTORY_NAME);
     indexSegment.destroy();
 
     // CASE 2: set the segment version to load in IndexLoadingConfig as V3
     // there should be no conversion done by ImmutableSegmentLoader since the segmentVersionToLoad
     // is same as the version of segment on disk (V3)
-    indexSegment = ImmutableSegmentLoader.load(_indexDir, _v3IndexLoadingConfig);
+    indexLoadingConfig.setSegmentVersion(SegmentVersion.v3);
+    indexSegment = ImmutableSegmentLoader.load(_indexDir, indexLoadingConfig);
     // check that loaded segment version is v3
     Assert.assertEquals(indexSegment.getSegmentMetadata().getVersion(), SegmentVersion.v3.toString());
     // no change/conversion should have happened in indexDir
@@ -315,12 +332,21 @@ public class LoaderTest {
     verifyIndexDirIsV3(_indexDir);
     // no change/conversion should have happened for textIndex dir
     textIndexFile = SegmentDirectoryPaths.findTextIndexIndexFile(_indexDir, TEXT_INDEX_COL_NAME);
+    // segment load should have created the docID mapping file in V3 structure
+    textIndexDocIdMappingFile = SegmentDirectoryPaths.findTextIndexDocIdMappingFile(_indexDir, TEXT_INDEX_COL_NAME);
     Assert.assertNotNull(textIndexFile);
+    Assert.assertNotNull(textIndexDocIdMappingFile);
     Assert.assertTrue(textIndexFile.isDirectory());
+    Assert.assertFalse(textIndexDocIdMappingFile.isDirectory());
     Assert.assertEquals(textIndexFile.getName(), TEXT_INDEX_COL_NAME + LuceneTextIndexCreator.LUCENE_TEXT_INDEX_FILE_EXTENSION);
     Assert.assertEquals(textIndexFile.getParentFile().getName(), SegmentDirectoryPaths.V3_SUBDIRECTORY_NAME);
+    Assert.assertEquals(textIndexDocIdMappingFile.getName(), TEXT_INDEX_COL_NAME + LuceneTextIndexReader.LUCENE_TEXT_INDEX_DOCID_MAPPING_FILE_EXTENSION);
+    Assert.assertEquals(textIndexDocIdMappingFile.getParentFile().getName(), SegmentDirectoryPaths.V3_SUBDIRECTORY_NAME);
     indexSegment.destroy();
 
+    // Test for scenarios by creating on-disk segment in V1 and then loading
+    // the segment with and without specifying segmentVersion in IndexLoadingConfig
+
     // create on-disk segment in V1
     // this generates the segment in V1 and does not convert to V3 as part of post-creation processing
     constructSegmentWithTextIndex(SegmentVersion.v1);
@@ -335,13 +361,17 @@ public class LoaderTest {
     textIndexFile = SegmentDirectoryPaths.findTextIndexIndexFile(_indexDir, TEXT_INDEX_COL_NAME);
     Assert.assertNotNull(textIndexFile);
     Assert.assertTrue(textIndexFile.isDirectory());
+    Assert.assertFalse(textIndexDocIdMappingFile.isDirectory());
     Assert.assertEquals(textIndexFile.getName(), TEXT_INDEX_COL_NAME + LuceneTextIndexCreator.LUCENE_TEXT_INDEX_FILE_EXTENSION);
     Assert.assertEquals(textIndexFile.getParentFile().getName(), new SegmentMetadataImpl(_indexDir).getName());
 
     // CASE 1: don't set the segment version to load in IndexLoadingConfig
     // there should be no conversion done by ImmutableSegmentLoader and it should
     // be able to create text index reader with on-disk version V1
-    indexSegment = ImmutableSegmentLoader.load(_indexDir, ReadMode.mmap);
+    indexLoadingConfig = new IndexLoadingConfig();
+    indexLoadingConfig.setTextIndexColumns(new HashSet<>(Arrays.asList(TEXT_INDEX_COL_NAME)));
+    indexLoadingConfig.setReadMode(ReadMode.mmap);
+    indexSegment = ImmutableSegmentLoader.load(_indexDir, indexLoadingConfig);
     // check that loaded segment version is v1
     Assert.assertEquals(indexSegment.getSegmentMetadata().getVersion(), SegmentVersion.v1.toString());
     // no change/conversion should have happened in indexDir
@@ -349,16 +379,22 @@ public class LoaderTest {
     Assert.assertFalse(SegmentDirectoryPaths.segmentDirectoryFor(_indexDir, SegmentVersion.v3).exists());
     // no change/conversion should have happened in text index Dir
     textIndexFile = SegmentDirectoryPaths.findTextIndexIndexFile(_indexDir, TEXT_INDEX_COL_NAME);
+    // segment load should have created the docID mapping file in V1 structure
+    textIndexDocIdMappingFile = SegmentDirectoryPaths.findTextIndexDocIdMappingFile(_indexDir, TEXT_INDEX_COL_NAME);
     Assert.assertNotNull(textIndexFile);
+    Assert.assertNotNull(textIndexDocIdMappingFile);
     Assert.assertTrue(textIndexFile.isDirectory());
     Assert.assertEquals(textIndexFile.getName(), TEXT_INDEX_COL_NAME + LuceneTextIndexCreator.LUCENE_TEXT_INDEX_FILE_EXTENSION);
     Assert.assertEquals(textIndexFile.getParentFile().getName(), new SegmentMetadataImpl(_indexDir).getName());
+    Assert.assertEquals(textIndexDocIdMappingFile.getName(), TEXT_INDEX_COL_NAME + LuceneTextIndexReader.LUCENE_TEXT_INDEX_DOCID_MAPPING_FILE_EXTENSION);
+    Assert.assertEquals(textIndexDocIdMappingFile.getParentFile().getName(), new SegmentMetadataImpl(_indexDir).getName());
     indexSegment.destroy();
 
     // CASE 2: set the segment version to load in IndexLoadingConfig to V1
     // there should be no conversion done by ImmutableSegmentLoader since the segmentVersionToLoad
     // is same as the version of segment on fisk
-    indexSegment = ImmutableSegmentLoader.load(_indexDir, _v1IndexLoadingConfig);
+    indexLoadingConfig.setSegmentVersion(SegmentVersion.v1);
+    indexSegment = ImmutableSegmentLoader.load(_indexDir, indexLoadingConfig);
     // check that loaded segment version is v1
     Assert.assertEquals(indexSegment.getSegmentMetadata().getVersion(), SegmentVersion.v1.toString());
     // no change/conversion should have happened in indexDir
@@ -366,16 +402,22 @@ public class LoaderTest {
     Assert.assertFalse(SegmentDirectoryPaths.segmentDirectoryFor(_indexDir, SegmentVersion.v3).exists());
     // no change/conversion should have happened in text index Dir
     textIndexFile = SegmentDirectoryPaths.findTextIndexIndexFile(_indexDir, TEXT_INDEX_COL_NAME);
+    // segment load should have created the docID mapping file in V1 structure
+    textIndexDocIdMappingFile = SegmentDirectoryPaths.findTextIndexDocIdMappingFile(_indexDir, TEXT_INDEX_COL_NAME);
     Assert.assertNotNull(textIndexFile);
+    Assert.assertNotNull(textIndexDocIdMappingFile);
     Assert.assertTrue(textIndexFile.isDirectory());
     Assert.assertEquals(textIndexFile.getName(), TEXT_INDEX_COL_NAME + LuceneTextIndexCreator.LUCENE_TEXT_INDEX_FILE_EXTENSION);
     Assert.assertEquals(textIndexFile.getParentFile().getName(), new SegmentMetadataImpl(_indexDir).getName());
+    Assert.assertEquals(textIndexDocIdMappingFile.getName(), TEXT_INDEX_COL_NAME + LuceneTextIndexReader.LUCENE_TEXT_INDEX_DOCID_MAPPING_FILE_EXTENSION);
+    Assert.assertEquals(textIndexDocIdMappingFile.getParentFile().getName(), new SegmentMetadataImpl(_indexDir).getName());
     indexSegment.destroy();
 
     // CASE 3: set the segment version to load in IndexLoadingConfig to V3
     // there should be conversion done by ImmutableSegmentLoader since the segmentVersionToLoad
     // is different than the version of segment on disk
-    indexSegment = ImmutableSegmentLoader.load(_indexDir, _v3IndexLoadingConfig);
+    indexLoadingConfig.setSegmentVersion(SegmentVersion.v3);
+    indexSegment = ImmutableSegmentLoader.load(_indexDir, indexLoadingConfig);
     // check that loaded segment version is v3
     Assert.assertEquals(indexSegment.getSegmentMetadata().getVersion(), SegmentVersion.v3.toString());
     // the index dir should exist in v3 format due to conversion
@@ -383,10 +425,15 @@ public class LoaderTest {
     verifyIndexDirIsV3(_indexDir);
     // check that text index exists under V3 subdir. It should exist and should be a subdir
     textIndexFile = SegmentDirectoryPaths.findTextIndexIndexFile(_indexDir, TEXT_INDEX_COL_NAME);
+    // segment load should have created the docID mapping file in V3 structure
+    textIndexDocIdMappingFile = SegmentDirectoryPaths.findTextIndexDocIdMappingFile(_indexDir, TEXT_INDEX_COL_NAME);
     Assert.assertNotNull(textIndexFile);
+    Assert.assertNotNull(textIndexDocIdMappingFile);
     Assert.assertTrue(textIndexFile.isDirectory());
     Assert.assertEquals(textIndexFile.getName(), TEXT_INDEX_COL_NAME + LuceneTextIndexCreator.LUCENE_TEXT_INDEX_FILE_EXTENSION);
     Assert.assertEquals(textIndexFile.getParentFile().getName(), SegmentDirectoryPaths.V3_SUBDIRECTORY_NAME);
+    Assert.assertEquals(textIndexDocIdMappingFile.getName(), TEXT_INDEX_COL_NAME + LuceneTextIndexReader.LUCENE_TEXT_INDEX_DOCID_MAPPING_FILE_EXTENSION);
+    Assert.assertEquals(textIndexDocIdMappingFile.getParentFile().getName(), SegmentDirectoryPaths.V3_SUBDIRECTORY_NAME);
     indexSegment.destroy();
   }
 
diff --git a/pinot-core/src/test/java/org/apache/pinot/queries/TestTextSearchQueries.java b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
similarity index 99%
rename from pinot-core/src/test/java/org/apache/pinot/queries/TestTextSearchQueries.java
rename to pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
index 7cba996..7ed16b7 100644
--- a/pinot-core/src/test/java/org/apache/pinot/queries/TestTextSearchQueries.java
+++ b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
@@ -56,7 +56,6 @@ import org.apache.pinot.core.operator.query.AggregationOperator;
 import org.apache.pinot.core.operator.query.SelectionOnlyOperator;
 import org.apache.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl;
 import org.apache.pinot.core.segment.index.loader.IndexLoadingConfig;
-import org.apache.pinot.pql.parsers.Pql2Compiler;
 import org.apache.pinot.spi.data.FieldSpec;
 import org.apache.pinot.spi.data.Schema;
 import org.apache.pinot.spi.data.readers.GenericRow;
@@ -75,7 +74,7 @@ import org.testng.annotations.Test;
  * The test table has a SKILLS column and QUERY_LOG column. Text index is created
  * on each of these columns.
  */
-public class TestTextSearchQueries extends BaseQueriesTest {
+public class TextSearchQueriesTest extends BaseQueriesTest {
 
   private static final File INDEX_DIR = new File(FileUtils.getTempDirectory(), "TextSearchQueries");
   private static final String TABLE_NAME = "MyTable";


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org