You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by si...@apache.org on 2022/11/02 23:12:14 UTC

[pinot] branch master updated: Customize stopword for Lucene Index (#9708)

This is an automated email from the ASF dual-hosted git repository.

siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 229c55e65a Customize stopword for Lucene Index (#9708)
229c55e65a is described below

commit 229c55e65ad7d3e0185c57b5a6da53c8adc0a9d4
Author: Jia Guo <ji...@linkedin.com>
AuthorDate: Wed Nov 2 16:12:09 2022 -0700

    Customize stopword for Lucene Index (#9708)
    
    * Customize stopword for Lucene Index
    
    * Customize stopword for Lucene Index
    
    * Customize stopword for Lucene Index
    
    * Customize stopword for Lucene Index
    
    * Customize stopword for Lucene Index
---
 .../pinot/queries/TextSearchQueriesTest.java       | 46 +++++++++++++++++-
 .../resources/data/text_search_data/skills.txt     |  6 ++-
 .../TextIndicesRealtimeClusterIntegrationTest.java |  2 +-
 .../indexsegment/mutable/MutableSegmentImpl.java   |  7 ++-
 .../invertedindex/RealtimeLuceneTextIndex.java     |  8 +++-
 .../creator/impl/DefaultIndexCreatorProvider.java  |  2 +-
 .../creator/impl/SegmentColumnarIndexCreator.java  |  4 +-
 .../creator/impl/text/LuceneTextIndexCreator.java  | 27 ++++++++---
 .../loader/invertedindex/TextIndexHandler.java     |  8 +++-
 .../index/readers/text/LuceneTextIndexReader.java  |  6 ++-
 .../local/segment/store/TextIndexUtils.java        | 54 ++++++++++++++++++++++
 .../NativeAndLuceneMutableTextIndexTest.java       |  3 +-
 .../segment/store/FilePerIndexDirectoryTest.java   | 12 +++--
 .../store/SingleFileIndexDirectoryTest.java        | 12 +++--
 .../segment/spi/creator/IndexCreationContext.java  | 27 +++++++++--
 .../apache/pinot/spi/config/table/FieldConfig.java |  3 ++
 16 files changed, 197 insertions(+), 30 deletions(-)

diff --git a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
index e730beb081..688d229023 100644
--- a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
+++ b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
@@ -145,6 +145,13 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
     Map<String, String> props = new HashMap<>();
     props.put(FieldConfig.TEXT_INDEX_USE_AND_FOR_MULTI_TERM_QUERIES, "true");
     columnProperties.put(SKILLS_TEXT_COL_MULTI_TERM_NAME, props);
+    props = new HashMap<>();
+    props.put(FieldConfig.TEXT_INDEX_STOP_WORD_INCLUDE_KEY, "coordinator");
+    props.put(FieldConfig.TEXT_INDEX_STOP_WORD_EXCLUDE_KEY, "it, those");
+    columnProperties.put(SKILLS_TEXT_COL_NAME, props);
+    props = new HashMap<>();
+    props.put(FieldConfig.TEXT_INDEX_STOP_WORD_EXCLUDE_KEY, "");
+    columnProperties.put(SKILLS_TEXT_COL_DICT_NAME, props);
     indexLoadingConfig.setColumnProperties(columnProperties);
     ImmutableSegment immutableSegment =
         ImmutableSegmentLoader.load(new File(INDEX_DIR, SEGMENT_NAME), indexLoadingConfig);
@@ -193,6 +200,13 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
     props.put(FieldConfig.TEXT_INDEX_NO_RAW_DATA, "true");
     props.put(FieldConfig.TEXT_INDEX_RAW_VALUE, "ILoveCoding");
     columnProperties.put(SKILLS_TEXT_NO_RAW_NAME, props);
+    props = new HashMap<>();
+    props.put(FieldConfig.TEXT_INDEX_STOP_WORD_INCLUDE_KEY, "coordinator");
+    props.put(FieldConfig.TEXT_INDEX_STOP_WORD_EXCLUDE_KEY, "it, those");
+    columnProperties.put(SKILLS_TEXT_COL_NAME, props);
+    props = new HashMap<>();
+    props.put(FieldConfig.TEXT_INDEX_STOP_WORD_EXCLUDE_KEY, "");
+    columnProperties.put(SKILLS_TEXT_COL_DICT_NAME, props);
     config.setColumnProperties(columnProperties);
     SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
     try (RecordReader recordReader = new GenericRowRecordReader(rows)) {
@@ -206,7 +220,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
     List<GenericRow> rows = new ArrayList<>();
 
     // read the skills file
-    String[] skills = new String[24];
+    String[] skills = new String[28];
     List<String[]> multiValueStringList = new ArrayList<>();
     int skillCount = 0;
     try (BufferedReader reader = new BufferedReader(new InputStreamReader(
@@ -217,7 +231,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
         multiValueStringList.add(StringUtils.splitByWholeSeparator(line, ", "));
       }
     }
-    assertEquals(skillCount, 24);
+    assertEquals(skillCount, 28);
 
     // read the query log file (24k queries) and build dataset
     int counter = 0;
@@ -1864,6 +1878,34 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
     query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_DICT, 'a and or in the are')";
     testInterSegmentAggregationQueryHelper(query, 0);
 
+    // query with words excluded from default stop-words. they should be indexed
+    query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"IT support\" or \"IT manager\"')";
+    testInterSegmentAggregationQueryHelper(query, 8);
+
+    // query with words excluded from default stop-words. they should be indexed
+    query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"IT\"')";
+    testInterSegmentAggregationQueryHelper(query, 16);
+
+    // query without stop-words
+    query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"support\" or \"manager\"')";
+    testInterSegmentAggregationQueryHelper(query, 12);
+
+    // query without stop-words
+    query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"supporting\"')";
+    testInterSegmentAggregationQueryHelper(query, 4);
+
+    // query with included stop-words. they should not be indexed
+    query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'coordinator')";
+    testInterSegmentAggregationQueryHelper(query, 0);
+
+    // query with default stop-words. they should not be indexed
+    query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"IT support\" or \"IT manager\"')";
+    testInterSegmentAggregationQueryHelper(query, 12);
+    query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"IT\"')";
+    testInterSegmentAggregationQueryHelper(query, 0);
+    query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_DICT, '\"support\" or \"manager\"')";
+    testInterSegmentAggregationQueryHelper(query, 12);
+
     // analyzer should prune/ignore the stop words from search expression and consider everything else for a match
     query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"learned a lot\"')";
     testInterSegmentAggregationQueryHelper(query, 4);
diff --git a/pinot-core/src/test/resources/data/text_search_data/skills.txt b/pinot-core/src/test/resources/data/text_search_data/skills.txt
index 36292748ec..bfdab88e1a 100644
--- a/pinot-core/src/test/resources/data/text_search_data/skills.txt
+++ b/pinot-core/src/test/resources/data/text_search_data/skills.txt
@@ -21,4 +21,8 @@ C++, Java, Python, realtime streaming systems, Machine learning, spark, Kubernet
 Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster management, docker image building and distribution
 Database engine, OLAP systems, OLTP transaction processing at large scale, concurrency, multi-threading, GO, building large scale systems
 GET /administrator/ HTTP/1.1 200 4263 - Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0 - NullPointerException
-Foo worked in a lot of places and learned a lot of things
\ No newline at end of file
+Foo worked in a lot of places and learned a lot of things
+IT support, python, hardware debugging
+IT manager, workspace coordinator
+manager, coordinator, IT
+IT supporting
\ No newline at end of file
diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/TextIndicesRealtimeClusterIntegrationTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/TextIndicesRealtimeClusterIntegrationTest.java
index 3ad8eda2ca..b10c994791 100644
--- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/TextIndicesRealtimeClusterIntegrationTest.java
+++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/TextIndicesRealtimeClusterIntegrationTest.java
@@ -54,7 +54,7 @@ public class TextIndicesRealtimeClusterIntegrationTest extends BaseClusterIntegr
   private static final String TEXT_COLUMN_NAME = "skills";
   private static final String TEXT_COLUMN_NAME_NATIVE = "skills_native";
   private static final String TIME_COLUMN_NAME = "millisSinceEpoch";
-  private static final int NUM_SKILLS = 24;
+  private static final int NUM_SKILLS = 28;
   private static final int NUM_MATCHING_SKILLS = 4;
   private static final int NUM_RECORDS = NUM_SKILLS * 1000;
   private static final int NUM_MATCHING_RECORDS = NUM_MATCHING_SKILLS * 1000;
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java
index 7b83864cd1..f405ebccb2 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java
@@ -319,12 +319,16 @@ public class MutableSegmentImpl implements MutableSegment {
 
       // Text index
       MutableTextIndex textIndex;
+      List<String> stopWordsInclude = null;
+      List<String> stopWordsExclude = null;
       if (textIndexColumns.contains(column)) {
         boolean useNativeTextIndex = false;
         if (_fieldConfigList != null) {
           for (FieldConfig fieldConfig : _fieldConfigList) {
             if (fieldConfig.getName().equals(column)) {
               Map<String, String> properties = fieldConfig.getProperties();
+              stopWordsInclude = TextIndexUtils.extractStopWordsInclude(properties);
+              stopWordsExclude = TextIndexUtils.extractStopWordsExclude(properties);
               if (TextIndexUtils.isFstTypeNative(properties)) {
                 useNativeTextIndex = true;
               }
@@ -340,7 +344,8 @@ public class MutableSegmentImpl implements MutableSegment {
           //  it is beyond the scope of realtime index pluggability to do this refactoring, so realtime
           //  text indexes remain statically defined. Revisit this after this refactoring has been done.
           RealtimeLuceneTextIndex luceneTextIndex =
-              new RealtimeLuceneTextIndex(column, new File(config.getConsumerDir()), _segmentName);
+              new RealtimeLuceneTextIndex(column, new File(config.getConsumerDir()), _segmentName,
+                  stopWordsInclude, stopWordsExclude);
           if (_realtimeLuceneReaders == null) {
             _realtimeLuceneReaders = new RealtimeLuceneIndexRefreshState.RealtimeLuceneReaders(_segmentName);
           }
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java
index c2a3d9c9e8..e066748335 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java
@@ -19,6 +19,7 @@
 package org.apache.pinot.segment.local.realtime.impl.invertedindex;
 
 import java.io.File;
+import java.util.List;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.IndexWriter;
@@ -57,8 +58,11 @@ public class RealtimeLuceneTextIndex implements MutableTextIndex {
    * @param column column name
    * @param segmentIndexDir realtime segment consumer dir
    * @param segmentName realtime segment name
+   * @param stopWordsInclude the words to include in addition to the default stop word list
+   * @param stopWordsExclude stop words to exclude from default stop words
    */
-  public RealtimeLuceneTextIndex(String column, File segmentIndexDir, String segmentName) {
+  public RealtimeLuceneTextIndex(String column, File segmentIndexDir, String segmentName,
+      List<String> stopWordsInclude, List<String> stopWordsExclude) {
     _column = column;
     _segmentName = segmentName;
     try {
@@ -72,7 +76,7 @@ public class RealtimeLuceneTextIndex implements MutableTextIndex {
       // for realtime
       _indexCreator =
           new LuceneTextIndexCreator(column, new File(segmentIndexDir.getAbsolutePath() + "/" + segmentName),
-              false /* commitOnClose */);
+              false /* commitOnClose */, stopWordsInclude, stopWordsExclude);
       IndexWriter indexWriter = _indexCreator.getIndexWriter();
       _searcherManager = new SearcherManager(indexWriter, false, false, null);
     } catch (Exception e) {
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/DefaultIndexCreatorProvider.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/DefaultIndexCreatorProvider.java
index f42966fcc9..6c8859d1ca 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/DefaultIndexCreatorProvider.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/DefaultIndexCreatorProvider.java
@@ -161,7 +161,7 @@ public final class DefaultIndexCreatorProvider implements IndexCreatorProvider {
         return new NativeTextIndexCreator(context.getFieldSpec().getName(), context.getIndexDir());
       } else {
         return new LuceneTextIndexCreator(context.getFieldSpec().getName(), context.getIndexDir(),
-                context.isCommitOnClose());
+                context.isCommitOnClose(), context.getStopWordsInclude(), context.getStopWordsExclude());
       }
     }
   }
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
index ba6f668e36..9c1390d26c 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
@@ -282,7 +282,9 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
           }
         }
         _textIndexCreatorMap.put(columnName,
-            _indexCreatorProvider.newTextIndexCreator(context.forTextIndex(fstType, true)));
+            _indexCreatorProvider.newTextIndexCreator(context.forTextIndex(fstType, true,
+                TextIndexUtils.extractStopWordsInclude(columnName, _columnProperties),
+                TextIndexUtils.extractStopWordsExclude(columnName, _columnProperties))));
       }
 
       if (fstIndexColumns.contains(columnName)) {
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
index b3b3cd0f24..f65e93220f 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
@@ -21,6 +21,9 @@ package org.apache.pinot.segment.local.segment.creator.impl.text;
 import java.io.File;
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import javax.annotation.Nullable;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
@@ -33,6 +36,7 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLuceneTextIndex;
 import org.apache.pinot.segment.local.segment.creator.impl.SegmentColumnarIndexCreator;
+import org.apache.pinot.segment.local.segment.store.TextIndexUtils;
 import org.apache.pinot.segment.spi.V1Constants;
 import org.apache.pinot.segment.spi.index.creator.DictionaryBasedInvertedIndexCreator;
 import org.apache.pinot.segment.spi.index.creator.TextIndexCreator;
@@ -55,10 +59,15 @@ public class LuceneTextIndexCreator implements TextIndexCreator {
 
   private int _nextDocId = 0;
 
-  public static final CharArraySet ENGLISH_STOP_WORDS_SET = new CharArraySet(Arrays
-      .asList("a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no",
-          "not", "of", "on", "or", "such", "that", "the", "their", "then", "than", "there", "these", "they", "this",
-          "to", "was", "will", "with", "those"), true);
+  public static HashSet<String> getDefaultEnglishStopWordsSet() {
+    // A new mutable set is built on every call, so each caller can add or remove words on its own copy.
+    return new HashSet<>(Arrays.asList("a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in",
+        "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "than", "there",
+        "these", "they", "this", "to", "was", "will", "with", "those"));
+  }
+
+  public static final CharArraySet ENGLISH_STOP_WORDS_SET = new CharArraySet(getDefaultEnglishStopWordsSet(), true);
+
 
   /**
    * Called by {@link SegmentColumnarIndexCreator}
@@ -82,16 +91,20 @@ public class LuceneTextIndexCreator implements TextIndexCreator {
    *               no need to commit the index from the realtime side. So when the realtime segment
    *               is destroyed (which is after the realtime segment has been committed and converted
    *               to offline), we close this lucene index writer to release resources but don't commit.
-   *               This is the reason to have commit flag part of the constructor.
+   * @param stopWordsInclude the words to include in addition to the default stop word list
+   * @param stopWordsExclude the words to exclude from the default stop word list
    */
-  public LuceneTextIndexCreator(String column, File segmentIndexDir, boolean commit) {
+  public LuceneTextIndexCreator(String column, File segmentIndexDir, boolean commit,
+      @Nullable List<String> stopWordsInclude, @Nullable List<String> stopWordsExclude) {
     _textColumn = column;
     try {
       // segment generation is always in V1 and later we convert (as part of post creation processing)
       // to V3 if segmentVersion is set to V3 in SegmentGeneratorConfig.
       File indexFile = getV1TextIndexFile(segmentIndexDir);
       _indexDirectory = FSDirectory.open(indexFile.toPath());
-      StandardAnalyzer standardAnalyzer = new StandardAnalyzer(ENGLISH_STOP_WORDS_SET);
+
+      StandardAnalyzer standardAnalyzer =
+          TextIndexUtils.getStandardAnalyzerWithCustomizedStopWords(stopWordsInclude, stopWordsExclude);
       IndexWriterConfig indexWriterConfig = new IndexWriterConfig(standardAnalyzer);
       indexWriterConfig.setRAMBufferSizeMB(LUCENE_INDEX_MAX_BUFFER_SIZE_MB);
       indexWriterConfig.setCommitOnClose(commit);
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/invertedindex/TextIndexHandler.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/invertedindex/TextIndexHandler.java
index 4f47c298bb..9af8fc4065 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/invertedindex/TextIndexHandler.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/invertedindex/TextIndexHandler.java
@@ -39,11 +39,13 @@ package org.apache.pinot.segment.local.segment.index.loader.invertedindex;
 import java.io.File;
 import java.io.IOException;
 import java.util.HashSet;
+import java.util.Map;
 import java.util.Set;
 import org.apache.pinot.segment.local.segment.index.loader.IndexHandler;
 import org.apache.pinot.segment.local.segment.index.loader.IndexLoadingConfig;
 import org.apache.pinot.segment.local.segment.index.loader.LoaderUtils;
 import org.apache.pinot.segment.local.segment.index.loader.SegmentPreProcessor;
+import org.apache.pinot.segment.local.segment.store.TextIndexUtils;
 import org.apache.pinot.segment.spi.ColumnMetadata;
 import org.apache.pinot.segment.spi.SegmentMetadata;
 import org.apache.pinot.segment.spi.creator.IndexCreationContext;
@@ -87,11 +89,13 @@ public class TextIndexHandler implements IndexHandler {
   private final SegmentMetadata _segmentMetadata;
   private final Set<String> _columnsToAddIdx;
   private final FSTType _fstType;
+  private final Map<String, Map<String, String>> _columnProperties;
 
   public TextIndexHandler(SegmentMetadata segmentMetadata, IndexLoadingConfig indexLoadingConfig) {
     _segmentMetadata = segmentMetadata;
     _fstType = indexLoadingConfig.getFSTIndexType();
     _columnsToAddIdx = indexLoadingConfig.getTextIndexColumns();
+    _columnProperties = indexLoadingConfig.getColumnProperties();
   }
 
   @Override
@@ -179,7 +183,9 @@ public class TextIndexHandler implements IndexHandler {
     try (ForwardIndexReader forwardIndexReader = LoaderUtils.getForwardIndexReader(segmentWriter, columnMetadata);
         ForwardIndexReaderContext readerContext = forwardIndexReader.createContext();
         TextIndexCreator textIndexCreator = indexCreatorProvider.newTextIndexCreator(IndexCreationContext.builder()
-            .withColumnMetadata(columnMetadata).withIndexDir(segmentDirectory).build().forTextIndex(_fstType, true))) {
+            .withColumnMetadata(columnMetadata).withIndexDir(segmentDirectory).build().forTextIndex(_fstType, true,
+                TextIndexUtils.extractStopWordsInclude(columnName, _columnProperties),
+                TextIndexUtils.extractStopWordsExclude(columnName, _columnProperties)))) {
       if (columnMetadata.isSingleValue()) {
         processSVField(segmentWriter, hasDictionary, forwardIndexReader, readerContext, textIndexCreator, numDocs,
             columnMetadata);
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java
index ae49e68f6e..2f1f445f3d 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java
@@ -37,6 +37,7 @@ import org.apache.lucene.store.FSDirectory;
 import org.apache.pinot.segment.local.indexsegment.immutable.ImmutableSegmentLoader;
 import org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
 import org.apache.pinot.segment.local.segment.index.column.PhysicalColumnIndexContainer;
+import org.apache.pinot.segment.local.segment.store.TextIndexUtils;
 import org.apache.pinot.segment.spi.V1Constants;
 import org.apache.pinot.segment.spi.index.reader.TextIndexReader;
 import org.apache.pinot.segment.spi.memory.PinotDataBuffer;
@@ -93,7 +94,10 @@ public class LuceneTextIndexReader implements TextIndexReader {
       // TODO: consider using a threshold of num docs per segment to decide between building
       // mapping file upfront on segment load v/s on-the-fly during query processing
       _docIdTranslator = new DocIdTranslator(indexDir, _column, numDocs, _indexSearcher);
-      _standardAnalyzer = new StandardAnalyzer(LuceneTextIndexCreator.ENGLISH_STOP_WORDS_SET);
+      _standardAnalyzer = TextIndexUtils.getStandardAnalyzerWithCustomizedStopWords(
+          TextIndexUtils.extractStopWordsInclude(textIndexProperties),
+          TextIndexUtils.extractStopWordsExclude(textIndexProperties)
+      );
     } catch (Exception e) {
       LOGGER
           .error("Failed to instantiate Lucene text index reader for column {}, exception {}", column, e.getMessage());
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
index bd3f51c484..aa474743c6 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
@@ -19,9 +19,18 @@
 package org.apache.pinot.segment.local.segment.store;
 
 import java.io.File;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
+import java.util.stream.Collectors;
+import javax.annotation.Nonnull;
 import javax.annotation.Nullable;
 import org.apache.commons.io.FileUtils;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
 import org.apache.pinot.segment.spi.V1Constants.Indexes;
 import org.apache.pinot.segment.spi.store.SegmentDirectoryPaths;
 import org.apache.pinot.spi.config.table.FSTType;
@@ -64,4 +73,82 @@ public class TextIndexUtils {
   public static FSTType getFSTTypeOfIndex(File indexDir, String column) {
     return SegmentDirectoryPaths.findTextIndexIndexFile(indexDir, column) != null ? FSTType.LUCENE : FSTType.NATIVE;
   }
+
+  /**
+   * Returns the additional stop words configured for a column.
+   *
+   * @param colName name of the column to look up
+   * @param columnProperties property maps keyed by column name; may be null
+   * @return the words parsed from that column's properties, empty if the column has none
+   */
+  @Nonnull
+  public static List<String> extractStopWordsInclude(String colName,
+      @Nullable Map<String, Map<String, String>> columnProperties) {
+    return extractStopWordsInclude(columnProperties == null ? null : columnProperties.get(colName));
+  }
+
+  /**
+   * Returns the words a column wants removed from the default stop word list.
+   *
+   * @param colName name of the column to look up
+   * @param columnProperties property maps keyed by column name; may be null
+   * @return the words parsed from that column's properties, empty if the column has none
+   */
+  @Nonnull
+  public static List<String> extractStopWordsExclude(String colName,
+      @Nullable Map<String, Map<String, String>> columnProperties) {
+    return extractStopWordsExclude(columnProperties == null ? null : columnProperties.get(colName));
+  }
+
+  /** Reads the value stored under {@link FieldConfig#TEXT_INDEX_STOP_WORD_INCLUDE_KEY} as a list of words. */
+  @Nonnull
+  public static List<String> extractStopWordsInclude(@Nullable Map<String, String> columnProperty) {
+    return parseEntryAsString(columnProperty, FieldConfig.TEXT_INDEX_STOP_WORD_INCLUDE_KEY);
+  }
+
+  /** Reads the value stored under {@link FieldConfig#TEXT_INDEX_STOP_WORD_EXCLUDE_KEY} as a list of words. */
+  @Nonnull
+  public static List<String> extractStopWordsExclude(@Nullable Map<String, String> columnProperty) {
+    return parseEntryAsString(columnProperty, FieldConfig.TEXT_INDEX_STOP_WORD_EXCLUDE_KEY);
+  }
+
+  /**
+   * Splits the value stored under {@code stopWordKey} on {@link FieldConfig#TEXT_INDEX_STOP_WORD_SEPERATOR}
+   * and trims every word.
+   *
+   * @return the non-blank words; empty if the map is null or holds no value for the key
+   */
+  @Nonnull
+  private static List<String> parseEntryAsString(@Nullable Map<String, String> columnProperties,
+      String stopWordKey) {
+    String words = columnProperties == null ? null : columnProperties.get(stopWordKey);
+    if (words == null) {
+      return Collections.emptyList();
+    }
+    // Splitting an empty string yields one empty token; drop blanks so they never end up in the stop word set.
+    return Arrays.stream(words.split(FieldConfig.TEXT_INDEX_STOP_WORD_SEPERATOR))
+        .map(String::trim).filter(word -> !word.isEmpty()).collect(Collectors.toList());
+  }
+
+  /**
+   * Builds a {@link StandardAnalyzer} whose stop words are the default English set plus {@code stopWordsInclude}
+   * minus {@code stopWordsExclude}.
+   *
+   * @param stopWordsInclude words to add to the default stop words; may be null
+   * @param stopWordsExclude words to remove from the resulting set, compared ignoring case; may be null
+   */
+  public static StandardAnalyzer getStandardAnalyzerWithCustomizedStopWords(@Nullable List<String> stopWordsInclude,
+      @Nullable List<String> stopWordsExclude) {
+    HashSet<String> stopWordSet = LuceneTextIndexCreator.getDefaultEnglishStopWordsSet();
+    if (stopWordsInclude != null) {
+      stopWordSet.addAll(stopWordsInclude);
+    }
+    if (stopWordsExclude != null) {
+      // The set handed to the analyzer ignores case, so exclusions are matched ignoring case as well.
+      for (String excluded : stopWordsExclude) {
+        stopWordSet.removeIf(word -> word.equalsIgnoreCase(excluded));
+      }
+    }
+    return new StandardAnalyzer(new CharArraySet(stopWordSet, true));
+  }
 }
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java
index 094ffff809..6345433d0c 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java
@@ -52,7 +52,8 @@ public class NativeAndLuceneMutableTextIndexTest {
   @BeforeClass
   public void setUp()
       throws Exception {
-    _realtimeLuceneTextIndex = new RealtimeLuceneTextIndex(TEXT_COLUMN_NAME, INDEX_DIR, "fooBar");
+    _realtimeLuceneTextIndex = new RealtimeLuceneTextIndex(TEXT_COLUMN_NAME, INDEX_DIR, "fooBar", null,
+        null);
     _nativeMutableTextIndex = new NativeMutableTextIndex(TEXT_COLUMN_NAME);
     List<String> documents = getTextData();
 
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java
index 3076cf60a7..5f9275ae95 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java
@@ -172,8 +172,10 @@ public class FilePerIndexDirectoryTest {
   public void testRemoveTextIndices()
       throws IOException {
     try (FilePerIndexDirectory fpi = new FilePerIndexDirectory(TEMP_DIR, _segmentMetadata, ReadMode.mmap);
-        LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo", TEMP_DIR, true);
-        LuceneTextIndexCreator barCreator = new LuceneTextIndexCreator("bar", TEMP_DIR, true)) {
+        LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo", TEMP_DIR, true,
+            null, null);
+        LuceneTextIndexCreator barCreator = new LuceneTextIndexCreator("bar", TEMP_DIR, true,
+            null, null)) {
       PinotDataBuffer buf = fpi.newBuffer("col1", ColumnIndexType.FORWARD_INDEX, 1024);
       buf.putInt(0, 1);
 
@@ -233,8 +235,10 @@ public class FilePerIndexDirectoryTest {
       throws IOException {
     // Write sth to buffers and flush them to index files on disk
     try (FilePerIndexDirectory fpi = new FilePerIndexDirectory(TEMP_DIR, _segmentMetadata, ReadMode.mmap);
-        LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo", TEMP_DIR, true);
-        LuceneTextIndexCreator barCreator = new LuceneTextIndexCreator("bar", TEMP_DIR, true)) {
+        LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo", TEMP_DIR, true,
+            null, null);
+        LuceneTextIndexCreator barCreator = new LuceneTextIndexCreator("bar", TEMP_DIR, true,
+            null, null)) {
       PinotDataBuffer buf = fpi.newBuffer("col1", ColumnIndexType.FORWARD_INDEX, 1024);
       buf.putInt(0, 111);
       buf = fpi.newBuffer("col2", ColumnIndexType.DICTIONARY, 1024);
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java
index 3664a66ef0..f065a191e5 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java
@@ -233,8 +233,10 @@ public class SingleFileIndexDirectoryTest {
   public void testRemoveTextIndices()
       throws IOException, ConfigurationException {
     try (SingleFileIndexDirectory sfd = new SingleFileIndexDirectory(TEMP_DIR, _segmentMetadata, ReadMode.mmap);
-        LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo", TEMP_DIR, true);
-        LuceneTextIndexCreator barCreator = new LuceneTextIndexCreator("bar", TEMP_DIR, true)) {
+        LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo", TEMP_DIR, true,
+            null, null);
+        LuceneTextIndexCreator barCreator = new LuceneTextIndexCreator("bar", TEMP_DIR, true,
+            null, null)) {
       PinotDataBuffer buf = sfd.newBuffer("col1", ColumnIndexType.FORWARD_INDEX, 1024);
       buf.putInt(0, 1);
 
@@ -336,8 +338,10 @@ public class SingleFileIndexDirectoryTest {
   public void testGetColumnIndices()
       throws Exception {
     try (SingleFileIndexDirectory sfd = new SingleFileIndexDirectory(TEMP_DIR, _segmentMetadata, ReadMode.mmap);
-        LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo", TEMP_DIR, true);
-        LuceneTextIndexCreator barCreator = new LuceneTextIndexCreator("bar", TEMP_DIR, true)) {
+        LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo", TEMP_DIR, true,
+            null, null);
+        LuceneTextIndexCreator barCreator = new LuceneTextIndexCreator("bar", TEMP_DIR, true,
+            null, null)) {
       PinotDataBuffer buf = sfd.newBuffer("col1", ColumnIndexType.FORWARD_INDEX, 1024);
       buf.putInt(0, 111);
       buf = sfd.newBuffer("col2", ColumnIndexType.DICTIONARY, 1024);
diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/IndexCreationContext.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/IndexCreationContext.java
index 9ec3215c6d..9e15345ef2 100644
--- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/IndexCreationContext.java
+++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/IndexCreationContext.java
@@ -19,6 +19,8 @@
 package org.apache.pinot.segment.spi.creator;
 
 import java.io.File;
+import java.util.Collections;
+import java.util.List;
 import java.util.Map;
 import java.util.Objects;
 import javax.annotation.Nullable;
@@ -307,8 +309,9 @@ public interface IndexCreationContext {
       return new Range(this, rangeIndexVersion);
     }
 
-    public Text forTextIndex(FSTType fstType, boolean commitOnClose) {
-      return new Text(this, fstType, commitOnClose);
+    public Text forTextIndex(FSTType fstType, boolean commitOnClose, @Nullable List<String> stopWordsInclude,
+        @Nullable List<String> stopWordsExclude) {
+      return new Text(this, fstType, commitOnClose, stopWordsInclude, stopWordsExclude);
     }
   }
 
@@ -481,15 +484,31 @@ public interface IndexCreationContext {
     private final FSTType _fstType;
     private final String[] _sortedUniqueElementsArray;
 
+    private final List<String> _stopWordsInclude;
+    private final List<String> _stopWordsExclude;
+
+    @Nullable
+    public List<String> getStopWordsInclude() {
+      return _stopWordsInclude;
+    }
+
+    @Nullable
+    public List<String> getStopWordsExclude() {
+      return _stopWordsExclude;
+    }
+
     /**
      * For text indexes
      */
-    public Text(IndexCreationContext wrapped, FSTType fstType, boolean commitOnClose) {
+    public Text(IndexCreationContext wrapped, FSTType fstType, boolean commitOnClose,
+        @Nullable List<String> stopWordsInclude, @Nullable List<String> stopWordsExclude) {
       super(wrapped);
       _commitOnClose = commitOnClose;
       _fstType = fstType;
       _sortedUniqueElementsArray = null;
       _isFst = false;
+      _stopWordsInclude = stopWordsInclude;
+      _stopWordsExclude = stopWordsExclude;
     }
 
     /**
@@ -501,6 +520,8 @@ public interface IndexCreationContext {
       _fstType = fstType;
       _sortedUniqueElementsArray = sortedUniqueElementsArray;
       _isFst = true;
+      _stopWordsInclude = Collections.emptyList();
+      _stopWordsExclude = Collections.emptyList();
     }
 
     public boolean isCommitOnClose() {
diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
index 08215a84b9..e04fac4d13 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
@@ -44,6 +44,9 @@ public class FieldConfig extends BaseJsonConfig {
   public static final String TEXT_INDEX_NO_RAW_DATA = "noRawDataForTextIndex";
   public static final String TEXT_INDEX_RAW_VALUE = "rawValueForTextIndex";
   public static final String TEXT_INDEX_DEFAULT_RAW_VALUE = "n";
+  public static final String TEXT_INDEX_STOP_WORD_INCLUDE_KEY = "stopWordInclude";
+  public static final String TEXT_INDEX_STOP_WORD_EXCLUDE_KEY = "stopWordExclude";
+  public static final String TEXT_INDEX_STOP_WORD_SEPERATOR = ",";
   // "native" for native, default is Lucene
   public static final String TEXT_FST_TYPE = "fstType";
   public static final String TEXT_NATIVE_FST_LITERAL = "native";


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org