You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by ja...@apache.org on 2021/10/30 01:02:03 UTC
[pinot] branch master updated: Allow MV Field Support For Raw Columns in Text Indices (#7638)
This is an automated email from the ASF dual-hosted git repository.
jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 5becf5b Allow MV Field Support For Raw Columns in Text Indices (#7638)
5becf5b is described below
commit 5becf5b81c996a6be39a22fd99aacfa7f1b3e1ac
Author: Atri Sharma <at...@gmail.com>
AuthorDate: Sat Oct 30 06:31:49 2021 +0530
Allow MV Field Support For Raw Columns in Text Indices (#7638)
Using the new MV raw byte forward index, allow multi value fields to be supported in text indices.
---
.../pinot/queries/TextSearchQueriesTest.java | 204 ++++++---------------
.../creator/impl/SegmentColumnarIndexCreator.java | 154 ++++++++--------
.../impl/inv/text/LuceneFSTIndexCreator.java | 5 +
.../creator/impl/text/LuceneTextIndexCreator.java | 20 ++
.../local/segment/index/loader/LoaderUtils.java | 13 +-
.../loader/invertedindex/TextIndexHandler.java | 85 ++++++---
.../utils/nativefst/NativeFSTIndexCreator.java | 5 +
.../index/loader/SegmentPreProcessorTest.java | 41 +++--
.../resources/data/newColumnsSchemaWithText.json | 10 +
.../spi/index/creator/TextIndexCreator.java | 5 +
10 files changed, 280 insertions(+), 262 deletions(-)
diff --git a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
index ef2f054..98f3bc5 100644
--- a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
+++ b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
@@ -31,9 +31,11 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Objects;
import java.util.Random;
import java.util.Set;
import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -96,14 +98,16 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
private static final String SKILLS_TEXT_COL_DICT_NAME = "SKILLS_TEXT_COL_DICT";
private static final String SKILLS_TEXT_COL_MULTI_TERM_NAME = "SKILLS_TEXT_COL_1";
private static final String SKILLS_TEXT_NO_RAW_NAME = "SKILLS_TEXT_COL_2";
+ private static final String SKILLS_TEXT_MV_COL_NAME = "SKILLS_TEXT_MV_COL";
+ private static final String SKILLS_TEXT_MV_COL_DICT_NAME = "SKILLS_TEXT_MV_COL_DICT";
private static final String INT_COL_NAME = "INT_COL";
- private static final List<String> RAW_TEXT_INDEX_COLUMNS = Arrays
- .asList(QUERY_LOG_TEXT_COL_NAME, SKILLS_TEXT_COL_NAME, SKILLS_TEXT_COL_MULTI_TERM_NAME, SKILLS_TEXT_NO_RAW_NAME);
- private static final List<String> DICT_TEXT_INDEX_COLUMNS = Arrays.asList(SKILLS_TEXT_COL_DICT_NAME);
+ private static final List<String> RAW_TEXT_INDEX_COLUMNS =
+ Arrays.asList(QUERY_LOG_TEXT_COL_NAME, SKILLS_TEXT_COL_NAME, SKILLS_TEXT_COL_MULTI_TERM_NAME,
+ SKILLS_TEXT_NO_RAW_NAME, SKILLS_TEXT_MV_COL_NAME);
+ private static final List<String> DICT_TEXT_INDEX_COLUMNS =
+ Arrays.asList(SKILLS_TEXT_COL_DICT_NAME, SKILLS_TEXT_MV_COL_DICT_NAME);
private static final int INT_BASE_VALUE = 1000;
- private final List<GenericRow> _rows = new ArrayList<>();
-
private IndexSegment _indexSegment;
private List<IndexSegment> _indexSegments;
@@ -157,8 +161,8 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
List<FieldConfig> fieldConfigs = new ArrayList<>(RAW_TEXT_INDEX_COLUMNS.size() + DICT_TEXT_INDEX_COLUMNS.size());
for (String textIndexColumn : RAW_TEXT_INDEX_COLUMNS) {
- fieldConfigs
- .add(new FieldConfig(textIndexColumn, FieldConfig.EncodingType.RAW, FieldConfig.IndexType.TEXT, null, null));
+ fieldConfigs.add(
+ new FieldConfig(textIndexColumn, FieldConfig.EncodingType.RAW, FieldConfig.IndexType.TEXT, null, null));
}
for (String textIndexColumn : DICT_TEXT_INDEX_COLUMNS) {
fieldConfigs.add(
@@ -174,6 +178,8 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
.addSingleValueDimension(SKILLS_TEXT_COL_DICT_NAME, FieldSpec.DataType.STRING)
.addSingleValueDimension(SKILLS_TEXT_COL_MULTI_TERM_NAME, FieldSpec.DataType.STRING)
.addSingleValueDimension(SKILLS_TEXT_NO_RAW_NAME, FieldSpec.DataType.STRING)
+ .addMultiValueDimension(SKILLS_TEXT_MV_COL_NAME, FieldSpec.DataType.STRING)
+ .addMultiValueDimension(SKILLS_TEXT_MV_COL_DICT_NAME, FieldSpec.DataType.STRING)
.addMetric(INT_COL_NAME, FieldSpec.DataType.INT).build();
SegmentGeneratorConfig config = new SegmentGeneratorConfig(tableConfig, schema);
config.setOutDir(INDEX_DIR.getPath());
@@ -197,24 +203,22 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
List<GenericRow> rows = new ArrayList<>();
// read the skills file
- URL resourceUrl = getClass().getClassLoader().getResource("data/text_search_data/skills.txt");
- File skillFile = new File(resourceUrl.getFile());
String[] skills = new String[100];
+ List<String[]> multiValueStringList = new ArrayList<>();
int skillCount = 0;
- try (InputStream inputStream = new FileInputStream(skillFile);
- BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream))) {
+ try (BufferedReader reader = new BufferedReader(new InputStreamReader(
+ Objects.requireNonNull(getClass().getClassLoader().getResourceAsStream("data/text_search_data/skills.txt"))))) {
String line;
while ((line = reader.readLine()) != null) {
skills[skillCount++] = line;
+ multiValueStringList.add(StringUtils.splitByWholeSeparator(line, ", "));
}
}
// read the pql query log file (24k queries) and build dataset
- resourceUrl = getClass().getClassLoader().getResource("data/text_search_data/pql_query1.txt");
- File logFile = new File(resourceUrl.getFile());
int counter = 0;
- try (InputStream inputStream = new FileInputStream(logFile);
- BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream))) {
+ try (BufferedReader reader = new BufferedReader(new InputStreamReader(Objects.requireNonNull(
+ getClass().getClassLoader().getResourceAsStream("data/text_search_data/pql_query1.txt"))))) {
String line;
while ((line = reader.readLine()) != null) {
GenericRow row = new GenericRow();
@@ -224,12 +228,16 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
row.putValue(SKILLS_TEXT_COL_NAME, "software engineering");
row.putValue(SKILLS_TEXT_COL_DICT_NAME, "software engineering");
row.putValue(SKILLS_TEXT_COL_MULTI_TERM_NAME, "software engineering");
- row.putValue(SKILLS_TEXT_COL_MULTI_TERM_NAME, "software engineering");
+ row.putValue(SKILLS_TEXT_NO_RAW_NAME, "software engineering");
+ row.putValue(SKILLS_TEXT_MV_COL_NAME, new String[]{"software", "engineering"});
+ row.putValue(SKILLS_TEXT_MV_COL_DICT_NAME, new String[]{"software", "engineering"});
} else {
row.putValue(SKILLS_TEXT_COL_NAME, skills[counter]);
row.putValue(SKILLS_TEXT_COL_DICT_NAME, skills[counter]);
row.putValue(SKILLS_TEXT_COL_MULTI_TERM_NAME, skills[counter]);
row.putValue(SKILLS_TEXT_NO_RAW_NAME, skills[counter]);
+ row.putValue(SKILLS_TEXT_MV_COL_NAME, multiValueStringList.get(counter));
+ row.putValue(SKILLS_TEXT_MV_COL_DICT_NAME, multiValueStringList.get(counter));
}
rows.add(row);
counter++;
@@ -316,13 +324,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
+ "management, docker image building and distribution"
});
- query =
- "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"Distributed systems\"') "
- + "LIMIT 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\"') LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("\"Distributed systems\"", expected);
// TEST 5: phrase query
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain phrase
@@ -339,13 +341,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
+ "management, docker image building and distribution"
});
- query =
- "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"query processing\"') LIMIT"
- + " 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"query processing\"') LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("\"query processing\"", expected);
// TEST 6: phrase query
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain the phrase "machine
@@ -395,13 +391,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
+ "management, docker image building and distribution"
});
- query =
- "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"Machine learning\"') LIMIT"
- + " 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"Machine learning\"') LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("\"Machine learning\"", expected);
// TEST 7: composite phrase query using boolean operator AND
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain two independent phrases
@@ -420,15 +410,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
+ "performance scalable systems"
});
- query =
- "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"Machine learning\" AND "
- + "\"Tensor flow\"') LIMIT 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query =
- "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"Machine learning\" AND \"Tensor flow\"') "
- + "LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("\"Machine learning\" AND \"Tensor flow\"", expected);
// TEST 8: term query
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain term 'Java'.
@@ -481,11 +463,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
+ "distributed storage, concurrency, multi-threading, apache airflow"
});
- query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'Java') LIMIT 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'Java') LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("Java", expected);
// TEST 9: composite term query using BOOLEAN operator AND
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain two independent
@@ -529,12 +507,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
+ "distributed storage, concurrency, multi-threading, apache airflow"
});
- query =
- "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'Java AND C++') LIMIT 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'Java AND C++') LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("Java AND C++", expected);
// TEST 10: phrase query
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain phrase "Java C++" as is.
@@ -563,12 +536,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
+ "multi-threading, apache airflow"
});
- query =
- "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"Java C++\"') LIMIT 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"Java C++\"') LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("\"Java C++\"", expected);
// TEST 11: composite phrase query using boolean operator AND.
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain two independent phrases
@@ -581,15 +549,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
+ "performance scalable systems"
});
- query =
- "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"Machine learning\" AND "
- + "\"gpu processing\"') LIMIT 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query =
- "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"Machine learning\" AND \"gpu "
- + "processing\"') LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("\"Machine learning\" AND \"gpu processing\"", expected);
// TEST 12: composite phrase and term query using boolean operator AND.
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain phrase "machine learning"
@@ -608,14 +568,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
+ "performance scalable systems"
});
- query =
- "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"Machine learning\" AND "
- + "gpu') LIMIT 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query =
- "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"Machine learning\" AND gpu') LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("\"Machine learning\" AND gpu", expected);
// TEST 13: composite phrase and term query using boolean operator AND
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain phrase "machine learning"
@@ -635,15 +588,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
+ "performance scalable systems"
});
- query =
- "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"Machine learning\" AND gpu"
- + " AND python') LIMIT 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query =
- "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"Machine learning\" AND gpu AND python') "
- + "LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("\"Machine learning\" AND gpu AND python", expected);
// TEST 14: term query
// Search in SKILLS_TEXT_COL column to look for documents that MUST contain term 'apache'. The expected result
@@ -678,11 +623,8 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
"Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster "
+ "management, docker image building and distribution"
});
- query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'apache') LIMIT 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
- query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'apache') LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("apache", expected);
// TEST 15: composite phrase and term query using boolean operator AND.
// search in SKILLS_TEXT_COL column to look for documents where each document MUST contain phrase "distributed
@@ -701,15 +643,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
+ "management, docker image building and distribution"
});
- query =
- "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\" AND "
- + "apache') LIMIT 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query =
- "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\" AND apache') LIMIT "
- + "50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("\"distributed systems\" AND apache", expected);
// TEST 16: term query
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain term 'database'.
@@ -744,11 +678,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
+ " building large scale systems"
});
- query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'database') LIMIT 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'database') LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("database", expected);
// TEST 17: phrase query
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain phrase "database engine"
@@ -763,13 +693,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
+ " building large scale systems"
});
- query =
- "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"database engine\"') LIMIT "
- + "50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"database engine\"') LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("\"database engine\"", expected);
// TEST 18: phrase query
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain phrase
@@ -790,13 +714,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
+ "multi-threading, C++,"
});
- query =
- "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"publish subscribe\"') "
- + "LIMIT 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"publish subscribe\"') LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("\"publish subscribe\"", expected);
// TEST 19: phrase query
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain phrase
@@ -805,14 +723,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
expected = new ArrayList<>();
expected.add(new Serializable[]{1000, "Accounts, Banking, Insurance, worked in NGO, Java"});
- query =
- "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"accounts banking "
- + "insurance\"') LIMIT 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query =
- "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"accounts banking insurance\"') LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("\"accounts banking insurance\"", expected);
// TEST 20: composite term query with boolean operator AND
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain terms 'accounts'
@@ -826,15 +737,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
expected.add(new Serializable[]{1001, "Accounts, Banking, Finance, Insurance"});
expected.add(new Serializable[]{1002, "Accounts, Finance, Banking, Insurance"});
- query =
- "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'accounts AND banking AND "
- + "insurance') LIMIT 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
- query =
- "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'accounts AND banking AND insurance') LIMIT "
- + "50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("accounts AND banking AND insurance", expected);
// TEST 21: composite phrase and term query using boolean operator AND.
// Search in SKILLS_TEXT_COL column to look for documents where each document MUST contain ALL the following skills:
@@ -853,14 +756,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
+ " concurrency, multi-threading, C++, CPU processing, Java"
});
- query =
- "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\" AND "
- + "Java AND C++') LIMIT 50000";
- testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
- query =
- "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\" AND Java AND C++') "
- + "LIMIT 50000";
- testTextSearchAggregationQueryHelper(query, expected.size());
+ testSkillsColumn("\"distributed systems\" AND Java AND C++", expected);
// test for the index configured to use AND as the default
// conjunction operator
@@ -1802,6 +1698,22 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
Assert.assertEquals(expectedCount, count);
}
+ private void testSkillsColumn(String searchQuery, List<Serializable[]> expected)
+ throws Exception {
+ for (String skillColumn : Arrays.asList(SKILLS_TEXT_COL_NAME, SKILLS_TEXT_COL_DICT_NAME,
+ SKILLS_TEXT_COL_MULTI_TERM_NAME, SKILLS_TEXT_NO_RAW_NAME, SKILLS_TEXT_MV_COL_NAME,
+ SKILLS_TEXT_MV_COL_DICT_NAME)) {
+ String query =
+ String.format("SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(%s, '%s') LIMIT 50000",
+ skillColumn, searchQuery);
+ testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
+
+ query = String.format("SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(%s, '%s') LIMIT 50000", skillColumn,
+ searchQuery);
+ testTextSearchAggregationQueryHelper(query, expected.size());
+ }
+ }
+
@Test
public void testInterSegment() {
String query =
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
index 0ea31d5..d2ac484 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
@@ -166,9 +166,8 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
Map<String, H3IndexConfig> h3IndexConfigs = _config.getH3IndexConfigs();
for (String columnName : h3IndexConfigs.keySet()) {
- Preconditions
- .checkState(schema.hasColumn(columnName), "Cannot create H3 index for column: %s because it is not in schema",
- columnName);
+ Preconditions.checkState(schema.hasColumn(columnName),
+ "Cannot create H3 index for column: %s because it is not in schema", columnName);
}
// Initialize creators for dictionary, forward index and inverted index
@@ -206,8 +205,8 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
int cardinality = indexCreationInfo.getDistinctValueCount();
if (fieldSpec.isSingleValueField()) {
if (indexCreationInfo.isSorted()) {
- _forwardIndexCreatorMap
- .put(columnName, new SingleValueSortedForwardIndexCreator(_indexDir, columnName, cardinality));
+ _forwardIndexCreatorMap.put(columnName,
+ new SingleValueSortedForwardIndexCreator(_indexDir, columnName, cardinality));
} else {
_forwardIndexCreatorMap.put(columnName,
new SingleValueUnsortedForwardIndexCreator(_indexDir, columnName, cardinality, _totalDocs));
@@ -221,8 +220,8 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
// Initialize inverted index creator; skip creating inverted index if sorted
if (invertedIndexColumns.contains(columnName) && !indexCreationInfo.isSorted()) {
if (segmentCreationSpec.isOnHeap()) {
- _invertedIndexCreatorMap
- .put(columnName, new OnHeapBitmapInvertedIndexCreator(_indexDir, columnName, cardinality));
+ _invertedIndexCreatorMap.put(columnName,
+ new OnHeapBitmapInvertedIndexCreator(_indexDir, columnName, cardinality));
} else {
_invertedIndexCreatorMap.put(columnName,
new OffHeapBitmapInvertedIndexCreator(_indexDir, fieldSpec, cardinality, _totalDocs,
@@ -254,21 +253,19 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
if (textIndexColumns.contains(columnName)) {
// Initialize text index creator
- Preconditions.checkState(fieldSpec.isSingleValueField(),
- "Text index is currently only supported on single-value columns");
- Preconditions
- .checkState(storedType == DataType.STRING, "Text index is currently only supported on STRING type columns");
- _textIndexCreatorMap
- .put(columnName, new LuceneTextIndexCreator(columnName, _indexDir, true /* commitOnClose */));
+ Preconditions.checkState(storedType == DataType.STRING,
+ "Text index is currently only supported on STRING type columns");
+ _textIndexCreatorMap.put(columnName,
+ new LuceneTextIndexCreator(columnName, _indexDir, true /* commitOnClose */));
}
if (fstIndexColumns.contains(columnName)) {
Preconditions.checkState(fieldSpec.isSingleValueField(),
"FST index is currently only supported on single-value columns");
- Preconditions
- .checkState(storedType == DataType.STRING, "FST index is currently only supported on STRING type columns");
- Preconditions
- .checkState(dictEnabledColumn, "FST index is currently only supported on dictionary-encoded columns");
+ Preconditions.checkState(storedType == DataType.STRING,
+ "FST index is currently only supported on STRING type columns");
+ Preconditions.checkState(dictEnabledColumn,
+ "FST index is currently only supported on dictionary-encoded columns");
_fstIndexCreatorMap.put(columnName, new LuceneFSTIndexCreator(_indexDir, columnName,
(String[]) indexCreationInfo.getSortedUniqueElementsArray()));
}
@@ -276,8 +273,8 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
if (jsonIndexColumns.contains(columnName)) {
Preconditions.checkState(fieldSpec.isSingleValueField(),
"Json index is currently only supported on single-value columns");
- Preconditions
- .checkState(storedType == DataType.STRING, "Json index is currently only supported on STRING columns");
+ Preconditions.checkState(storedType == DataType.STRING,
+ "Json index is currently only supported on STRING columns");
JsonIndexCreator jsonIndexCreator =
segmentCreationSpec.isOnHeap() ? new OnHeapJsonIndexCreator(_indexDir, columnName)
: new OffHeapJsonIndexCreator(_indexDir, columnName);
@@ -286,8 +283,8 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
H3IndexConfig h3IndexConfig = h3IndexConfigs.get(columnName);
if (h3IndexConfig != null) {
- Preconditions
- .checkState(fieldSpec.isSingleValueField(), "H3 index is currently only supported on single-value columns");
+ Preconditions.checkState(fieldSpec.isSingleValueField(),
+ "H3 index is currently only supported on single-value columns");
Preconditions.checkState(storedType == DataType.BYTES, "H3 index is currently only supported on BYTES columns");
H3IndexResolution resolution = h3IndexConfig.getResolution();
GeoSpatialIndexCreator h3IndexCreator =
@@ -308,8 +305,8 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
Map<String, Map<String, String>> columnProperties) {
if (columnProperties != null) {
Map<String, String> properties = columnProperties.get(columnName);
- return properties != null && Boolean
- .parseBoolean(properties.get(FieldConfig.DERIVE_NUM_DOCS_PER_CHUNK_RAW_INDEX_KEY));
+ return properties != null && Boolean.parseBoolean(
+ properties.get(FieldConfig.DERIVE_NUM_DOCS_PER_CHUNK_RAW_INDEX_KEY));
}
return false;
}
@@ -397,7 +394,22 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
// text-index
TextIndexCreator textIndexCreator = _textIndexCreatorMap.get(columnName);
if (textIndexCreator != null) {
- textIndexCreator.add((String) columnValueToIndex);
+ if (fieldSpec.isSingleValueField()) {
+ textIndexCreator.add((String) columnValueToIndex);
+ } else {
+ Object[] values = (Object[]) columnValueToIndex;
+ int length = values.length;
+ if (values instanceof String[]) {
+ textIndexCreator.add((String[]) values, length);
+ } else {
+ String[] strings = new String[length];
+ for (int i = 0; i < length; i++) {
+ strings[i] = (String) values[i];
+ }
+ textIndexCreator.add(strings, length);
+ columnValueToIndex = strings;
+ }
+ }
}
if (fieldSpec.isSingleValueField()) {
@@ -461,8 +473,7 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
//dictionary encoded
int[] dictIds = dictionaryCreator.indexOfMV(columnValueToIndex);
forwardIndexCreator.putDictIdMV(dictIds);
- DictionaryBasedInvertedIndexCreator invertedIndexCreator = _invertedIndexCreatorMap
- .get(columnName);
+ DictionaryBasedInvertedIndexCreator invertedIndexCreator = _invertedIndexCreatorMap.get(columnName);
if (invertedIndexCreator != null) {
invertedIndexCreator.add(dictIds, dictIds.length);
}
@@ -470,76 +481,69 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
// for text index on raw columns, check the config to determine if actual raw value should
// be stored or not
if (textIndexCreator != null && !shouldStoreRawValueForTextIndex(columnName)) {
- Object value = _columnProperties.get(columnName)
- .get(FieldConfig.TEXT_INDEX_RAW_VALUE);
+ Object value = _columnProperties.get(columnName).get(FieldConfig.TEXT_INDEX_RAW_VALUE);
if (value == null) {
value = FieldConfig.TEXT_INDEX_DEFAULT_RAW_VALUE;
}
if (forwardIndexCreator.getValueType().getStoredType() == DataType.STRING) {
- columnValueToIndex = new String[] {String.valueOf(value)};
+ columnValueToIndex = new String[]{String.valueOf(value)};
} else if (forwardIndexCreator.getValueType().getStoredType() == DataType.BYTES) {
- columnValueToIndex = new byte[][] {String.valueOf(value).getBytes(UTF_8)};
+ columnValueToIndex = new byte[][]{String.valueOf(value).getBytes(UTF_8)};
} else {
throw new RuntimeException("Text Index is only supported for STRING and BYTES stored type");
}
}
+ Object[] values = (Object[]) columnValueToIndex;
+ int length = values.length;
switch (forwardIndexCreator.getValueType()) {
case INT:
- if (columnValueToIndex instanceof Object[]) {
- int[] array = new int[((Object[]) columnValueToIndex).length];
- for (int i = 0; i < array.length; i++) {
- array[i] = (Integer) ((Object[]) columnValueToIndex)[i];
- }
- forwardIndexCreator.putIntMV(array);
+ int[] ints = new int[length];
+ for (int i = 0; i < length; i++) {
+ ints[i] = (Integer) values[i];
}
+ forwardIndexCreator.putIntMV(ints);
break;
case LONG:
- if (columnValueToIndex instanceof Object[]) {
- long[] array = new long[((Object[]) columnValueToIndex).length];
- for (int i = 0; i < array.length; i++) {
- array[i] = (Long) ((Object[]) columnValueToIndex)[i];
- }
- forwardIndexCreator.putLongMV(array);
+ long[] longs = new long[length];
+ for (int i = 0; i < length; i++) {
+ longs[i] = (Long) values[i];
}
+ forwardIndexCreator.putLongMV(longs);
break;
case FLOAT:
- if (columnValueToIndex instanceof Object[]) {
- float[] array = new float[((Object[]) columnValueToIndex).length];
- for (int i = 0; i < array.length; i++) {
- array[i] = (Float) ((Object[]) columnValueToIndex)[i];
- }
- forwardIndexCreator.putFloatMV(array);
+ float[] floats = new float[length];
+ for (int i = 0; i < length; i++) {
+ floats[i] = (Float) values[i];
}
+ forwardIndexCreator.putFloatMV(floats);
break;
case DOUBLE:
- if (columnValueToIndex instanceof Object[]) {
- double[] array = new double[((Object[]) columnValueToIndex).length];
- for (int i = 0; i < array.length; i++) {
- array[i] = (Double) ((Object[]) columnValueToIndex)[i];
- }
- forwardIndexCreator.putDoubleMV(array);
+ double[] doubles = new double[length];
+ for (int i = 0; i < length; i++) {
+ doubles[i] = (Double) values[i];
}
+ forwardIndexCreator.putDoubleMV(doubles);
break;
case STRING:
- if (columnValueToIndex instanceof String[]) {
- forwardIndexCreator.putStringMV((String[]) columnValueToIndex);
- } else if (columnValueToIndex instanceof Object[]) {
- String[] array = new String[((Object[]) columnValueToIndex).length];
- for (int i = 0; i < array.length; i++) {
- array[i] = (String) ((Object[]) columnValueToIndex)[i];
+ if (values instanceof String[]) {
+ forwardIndexCreator.putStringMV((String[]) values);
+ } else {
+ String[] strings = new String[length];
+ for (int i = 0; i < length; i++) {
+ strings[i] = (String) values[i];
}
- forwardIndexCreator.putStringMV(array);
+ forwardIndexCreator.putStringMV(strings);
}
break;
case BYTES:
- if (columnValueToIndex instanceof byte[][]) {
- forwardIndexCreator.putBytesMV((byte[][]) columnValueToIndex);
- } else if (columnValueToIndex instanceof Object[]) {
- byte[][] array = new byte[((Object[]) columnValueToIndex).length][];
- for (int i = 0; i < array.length; i++) {
- array[i] = (byte[]) ((Object[]) columnValueToIndex)[i];
+ if (values instanceof byte[][]) {
+ forwardIndexCreator.putBytesMV((byte[][]) values);
+ } else {
+ byte[][] bytesArray = new byte[length][];
+ for (int i = 0; i < length; i++) {
+ bytesArray[i] = (byte[]) values[i];
}
- forwardIndexCreator.putBytesMV(array);
+ forwardIndexCreator.putBytesMV(bytesArray);
}
break;
default:
@@ -740,8 +744,8 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
String.valueOf(columnIndexCreationInfo.getMaxNumberOfMultiValueElements()));
properties.setProperty(getKeyFor(column, TOTAL_NUMBER_OF_ENTRIES),
String.valueOf(columnIndexCreationInfo.getTotalNumberOfEntries()));
- properties
- .setProperty(getKeyFor(column, IS_AUTO_GENERATED), String.valueOf(columnIndexCreationInfo.isAutoGenerated()));
+ properties.setProperty(getKeyFor(column, IS_AUTO_GENERATED),
+ String.valueOf(columnIndexCreationInfo.isAutoGenerated()));
PartitionFunction partitionFunction = columnIndexCreationInfo.getPartitionFunction();
if (partitionFunction != null) {
@@ -871,17 +875,15 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
return new MultiValueVarByteRawIndexCreator(file, compressionType, column, totalDocs, dataType, writerVersion,
maxRowLengthInBytes, maxNumberOfMultiValueElements);
default:
- throw new UnsupportedOperationException(
- "Data type not supported for raw indexing: " + dataType);
+ throw new UnsupportedOperationException("Data type not supported for raw indexing: " + dataType);
}
}
@Override
public void close()
throws IOException {
- FileUtils.close(Iterables
- .concat(_dictionaryCreatorMap.values(), _forwardIndexCreatorMap.values(), _invertedIndexCreatorMap.values(),
- _textIndexCreatorMap.values(), _fstIndexCreatorMap.values(), _jsonIndexCreatorMap.values(),
- _h3IndexCreatorMap.values(), _nullValueVectorCreatorMap.values()));
+ FileUtils.close(Iterables.concat(_dictionaryCreatorMap.values(), _forwardIndexCreatorMap.values(),
+ _invertedIndexCreatorMap.values(), _textIndexCreatorMap.values(), _fstIndexCreatorMap.values(),
+ _jsonIndexCreatorMap.values(), _h3IndexCreatorMap.values(), _nullValueVectorCreatorMap.values()));
}
}
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/inv/text/LuceneFSTIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/inv/text/LuceneFSTIndexCreator.java
index 5e78289..bb772ac 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/inv/text/LuceneFSTIndexCreator.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/inv/text/LuceneFSTIndexCreator.java
@@ -78,6 +78,11 @@ public class LuceneFSTIndexCreator implements TextIndexCreator {
}
@Override
+ public void add(String[] documents, int length) {
+ throw new UnsupportedOperationException("Multiple values not supported");
+ }
+
+ @Override
public void seal()
throws IOException {
LOGGER.info("Sealing FST index: " + _fstIndexFile.getAbsolutePath());
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
index aa4e9b5..a46c481 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
@@ -121,6 +121,26 @@ public class LuceneTextIndexCreator implements TextIndexCreator {
}
@Override
+ public void add(String[] documents, int length) {
+ Document docToIndex = new Document();
+
+ // Whenever multiple fields with the same name appear in one document, both the
+ // inverted index and term vectors will logically append the tokens of the
+ // field to one another, in the order the fields were added.
+ for (int i = 0; i < length; i++) {
+ docToIndex.add(new TextField(_textColumn, documents[i], Field.Store.NO));
+ }
+ docToIndex.add(new StoredField(LUCENE_INDEX_DOC_ID_COLUMN_NAME, _nextDocId++));
+
+ try {
+ _indexWriter.addDocument(docToIndex);
+ } catch (Exception e) {
+ throw new RuntimeException(
+ "Caught exception while adding a new document to the Lucene index for column: " + _textColumn, e);
+ }
+ }
+
+ @Override
public void seal() {
try {
// Do this one time operation of combining the multiple lucene index files (if any)
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/LoaderUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/LoaderUtils.java
index a5b0843..b7083d0 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/LoaderUtils.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/LoaderUtils.java
@@ -29,7 +29,9 @@ import org.apache.pinot.segment.local.segment.index.column.PhysicalColumnIndexCo
import org.apache.pinot.segment.local.segment.index.readers.BaseImmutableDictionary;
import org.apache.pinot.segment.local.segment.index.readers.forward.FixedBitMVForwardIndexReader;
import org.apache.pinot.segment.local.segment.index.readers.forward.FixedBitSVForwardIndexReaderV2;
+import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReader;
import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkSVForwardIndexReader;
+import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkMVForwardIndexReader;
import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkSVForwardIndexReader;
import org.apache.pinot.segment.local.segment.index.readers.sorted.SortedIndexReaderImpl;
import org.apache.pinot.segment.spi.ColumnMetadata;
@@ -70,9 +72,14 @@ public class LoaderUtils {
columnMetadata.getTotalNumberOfEntries(), columnMetadata.getBitsPerElement());
}
} else {
- DataType dataType = columnMetadata.getDataType();
- return dataType.isFixedWidth() ? new FixedByteChunkSVForwardIndexReader(dataBuffer, dataType)
- : new VarByteChunkSVForwardIndexReader(dataBuffer, dataType);
+ DataType storedType = columnMetadata.getDataType().getStoredType();
+ if (columnMetadata.isSingleValue()) {
+ return storedType.isFixedWidth() ? new FixedByteChunkSVForwardIndexReader(dataBuffer, storedType)
+ : new VarByteChunkSVForwardIndexReader(dataBuffer, storedType);
+ } else {
+ return storedType.isFixedWidth() ? new FixedByteChunkMVForwardIndexReader(dataBuffer, storedType)
+ : new VarByteChunkMVForwardIndexReader(dataBuffer, storedType);
+ }
}
}
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/invertedindex/TextIndexHandler.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/invertedindex/TextIndexHandler.java
index 53134cb..c12308e 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/invertedindex/TextIndexHandler.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/invertedindex/TextIndexHandler.java
@@ -37,6 +37,7 @@
package org.apache.pinot.segment.local.segment.index.loader.invertedindex;
import java.io.File;
+import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.configuration.PropertiesConfiguration;
@@ -45,10 +46,9 @@ import org.apache.pinot.segment.local.segment.index.loader.IndexHandler;
import org.apache.pinot.segment.local.segment.index.loader.IndexLoadingConfig;
import org.apache.pinot.segment.local.segment.index.loader.LoaderUtils;
import org.apache.pinot.segment.local.segment.index.loader.SegmentPreProcessor;
-import org.apache.pinot.segment.local.segment.index.readers.forward.BaseChunkSVForwardIndexReader;
-import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkSVForwardIndexReader;
import org.apache.pinot.segment.spi.ColumnMetadata;
import org.apache.pinot.segment.spi.SegmentMetadata;
+import org.apache.pinot.segment.spi.index.creator.TextIndexCreator;
import org.apache.pinot.segment.spi.index.creator.TextIndexType;
import org.apache.pinot.segment.spi.index.metadata.SegmentMetadataImpl;
import org.apache.pinot.segment.spi.index.reader.Dictionary;
@@ -123,9 +123,8 @@ public class TextIndexHandler implements IndexHandler {
}
/**
- * Right now the text index is supported on RAW and dictionary encoded
- * single-value STRING columns. Later we can add support for text index
- * on multi-value columns and BYTE type columns
+ * Right now the text index is supported on single-value and multi-value STRING columns,
+ * both RAW and dictionary encoded. Later we can add support for text index on BYTES columns
* @param columnMetadata metadata for column
*/
private void checkUnsupportedOperationsForTextIndex(ColumnMetadata columnMetadata) {
@@ -133,10 +132,6 @@ public class TextIndexHandler implements IndexHandler {
if (columnMetadata.getDataType() != DataType.STRING) {
throw new UnsupportedOperationException("Text index is currently only supported on STRING columns: " + column);
}
- if (!columnMetadata.isSingleValue()) {
- throw new UnsupportedOperationException(
- "Text index is currently not supported on multi-value columns: " + column);
- }
}
private void createTextIndexForColumn(ColumnMetadata columnMetadata)
@@ -156,24 +151,10 @@ public class TextIndexHandler implements IndexHandler {
try (ForwardIndexReader forwardIndexReader = LoaderUtils.getForwardIndexReader(_segmentWriter, columnMetadata);
ForwardIndexReaderContext readerContext = forwardIndexReader.createContext();
LuceneTextIndexCreator textIndexCreator = new LuceneTextIndexCreator(column, segmentDirectory, true)) {
- if (!hasDictionary) {
- // text index on raw column, just read the raw forward index
- VarByteChunkSVForwardIndexReader rawIndexReader = (VarByteChunkSVForwardIndexReader) forwardIndexReader;
- BaseChunkSVForwardIndexReader.ChunkReaderContext chunkReaderContext =
- (BaseChunkSVForwardIndexReader.ChunkReaderContext) readerContext;
- for (int docId = 0; docId < numDocs; docId++) {
- textIndexCreator.add(rawIndexReader.getString(docId, chunkReaderContext));
- }
+ if (columnMetadata.isSingleValue()) {
+ processSVField(hasDictionary, forwardIndexReader, readerContext, textIndexCreator, numDocs, columnMetadata);
} else {
- // text index on dictionary encoded SV column
- // read forward index to get dictId
- // read the raw value from dictionary using dictId
- try (Dictionary dictionary = LoaderUtils.getDictionary(_segmentWriter, columnMetadata)) {
- for (int docId = 0; docId < numDocs; docId++) {
- int dictId = forwardIndexReader.getDictId(docId, readerContext);
- textIndexCreator.add(dictionary.getStringValue(dictId));
- }
- }
+ processMVField(hasDictionary, forwardIndexReader, readerContext, textIndexCreator, numDocs, columnMetadata);
}
textIndexCreator.seal();
}
@@ -183,4 +164,56 @@ public class TextIndexHandler implements IndexHandler {
properties.setProperty(getKeyFor(column, TEXT_INDEX_TYPE), TextIndexType.LUCENE.name());
properties.save();
}
+
+ private void processSVField(boolean hasDictionary, ForwardIndexReader forwardIndexReader,
+ ForwardIndexReaderContext readerContext, TextIndexCreator textIndexCreator, int numDocs,
+ ColumnMetadata columnMetadata)
+ throws IOException {
+ if (!hasDictionary) {
+ // text index on raw column, just read the raw forward index
+ for (int docId = 0; docId < numDocs; docId++) {
+ textIndexCreator.add(forwardIndexReader.getString(docId, readerContext));
+ }
+ } else {
+ // text index on dictionary encoded SV column
+ // read forward index to get dictId
+ // read the raw value from dictionary using dictId
+ try (Dictionary dictionary = LoaderUtils.getDictionary(_segmentWriter, columnMetadata)) {
+ for (int docId = 0; docId < numDocs; docId++) {
+ int dictId = forwardIndexReader.getDictId(docId, readerContext);
+ textIndexCreator.add(dictionary.getStringValue(dictId));
+ }
+ }
+ }
+ }
+
+ private void processMVField(boolean hasDictionary, ForwardIndexReader forwardIndexReader,
+ ForwardIndexReaderContext readerContext, TextIndexCreator textIndexCreator, int numDocs,
+ ColumnMetadata columnMetadata)
+ throws IOException {
+ if (!hasDictionary) {
+ // text index on raw MV column, read each doc's values from the raw forward index into a reusable buffer
+ String[] valueBuffer = new String[columnMetadata.getMaxNumberOfMultiValues()];
+ for (int docId = 0; docId < numDocs; docId++) {
+ int length = forwardIndexReader.getStringMV(docId, valueBuffer, readerContext);
+ textIndexCreator.add(valueBuffer, length);
+ }
+ } else {
+ // text index on dictionary encoded MV column
+ // read forward index to get dictId
+ // read the raw value from dictionary using dictId
+ try (Dictionary dictionary = LoaderUtils.getDictionary(_segmentWriter, columnMetadata)) {
+ int maxNumEntries = columnMetadata.getMaxNumberOfMultiValues();
+ int[] dictIdBuffer = new int[maxNumEntries];
+ String[] valueBuffer = new String[maxNumEntries];
+ for (int docId = 0; docId < numDocs; docId++) {
+ int length = forwardIndexReader.getDictIdMV(docId, dictIdBuffer, readerContext);
+ for (int i = 0; i < length; i++) {
+ valueBuffer[i] = dictionary.getStringValue(dictIdBuffer[i]);
+ }
+ textIndexCreator.add(valueBuffer, length);
+ }
+ }
+ }
+ }
}
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/nativefst/NativeFSTIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/nativefst/NativeFSTIndexCreator.java
index a97bb80..678105a 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/nativefst/NativeFSTIndexCreator.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/nativefst/NativeFSTIndexCreator.java
@@ -66,6 +66,11 @@ public class NativeFSTIndexCreator implements TextIndexCreator {
}
@Override
+ public void add(String[] document, int length) {
+ throw new UnsupportedOperationException("Multiple values not supported");
+ }
+
+ @Override
public void seal()
throws IOException {
LOGGER.info("Sealing FST index: " + _fstIndexFile.getAbsolutePath());
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/SegmentPreProcessorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/SegmentPreProcessorTest.java
index 2a16478..8a2fb82 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/SegmentPreProcessorTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/SegmentPreProcessorTest.java
@@ -85,6 +85,8 @@ public class SegmentPreProcessorTest {
private static final String EXISTING_STRING_COL_DICT = "column5";
private static final String NEWLY_ADDED_STRING_COL_RAW = "newTextColRaw";
private static final String NEWLY_ADDED_STRING_COL_DICT = "newTextColDict";
+ private static final String NEWLY_ADDED_STRING_MV_COL_RAW = "newTextMVColRaw";
+ private static final String NEWLY_ADDED_STRING_MV_COL_DICT = "newTextMVColDict";
// For create fst index tests
private static final String NEWLY_ADDED_FST_COL_DICT = "newFSTColDict";
@@ -214,8 +216,10 @@ public class SegmentPreProcessorTest {
throws Exception {
Set<String> textIndexColumns = new HashSet<>();
textIndexColumns.add(NEWLY_ADDED_STRING_COL_RAW);
+ textIndexColumns.add(NEWLY_ADDED_STRING_MV_COL_RAW);
_indexLoadingConfig.setTextIndexColumns(textIndexColumns);
_indexLoadingConfig.getNoDictionaryColumns().add(NEWLY_ADDED_STRING_COL_RAW);
+ _indexLoadingConfig.getNoDictionaryColumns().add(NEWLY_ADDED_STRING_MV_COL_RAW);
// Create a segment in V3, add a new raw column with text index enabled
constructV3Segment();
@@ -224,6 +228,8 @@ public class SegmentPreProcessorTest {
// should be null since column does not exist in the schema
assertNull(columnMetadata);
checkTextIndexCreation(NEWLY_ADDED_STRING_COL_RAW, 1, 1, _newColumnsSchemaWithText, true, true, true, 4);
+ checkTextIndexCreation(NEWLY_ADDED_STRING_MV_COL_RAW, 1, 1, _newColumnsSchemaWithText, true, true, false, 4, false,
+ 1);
// Create a segment in V1, add a new raw column with text index enabled
constructV1Segment();
@@ -303,6 +309,7 @@ public class SegmentPreProcessorTest {
throws Exception {
Set<String> textIndexColumns = new HashSet<>();
textIndexColumns.add(NEWLY_ADDED_STRING_COL_DICT);
+ textIndexColumns.add(NEWLY_ADDED_STRING_MV_COL_DICT);
_indexLoadingConfig.setTextIndexColumns(textIndexColumns);
// Create a segment in V3, add a new dict encoded column with text index enabled
@@ -313,6 +320,9 @@ public class SegmentPreProcessorTest {
assertNull(columnMetadata);
checkTextIndexCreation(NEWLY_ADDED_STRING_COL_DICT, 1, 1, _newColumnsSchemaWithText, true, true, true, 4);
+ checkTextIndexCreation(NEWLY_ADDED_STRING_MV_COL_DICT, 1, 1, _newColumnsSchemaWithText, true, true, false, 4, false,
+ 1);
+
// Create a segment in V1, add a new dict encoded column with text index enabled
constructV1Segment();
segmentMetadata = new SegmentMetadataImpl(_indexDir);
@@ -385,18 +395,27 @@ public class SegmentPreProcessorTest {
boolean isSorted, int dictionaryElementSize)
throws Exception {
checkIndexCreation(ColumnIndexType.FST_INDEX, column, cardinality, bits, schema, isAutoGenerated, true, isSorted,
- dictionaryElementSize);
+ dictionaryElementSize, true, 0);
}
private void checkTextIndexCreation(String column, int cardinality, int bits, Schema schema, boolean isAutoGenerated,
boolean hasDictionary, boolean isSorted, int dictionaryElementSize)
throws Exception {
checkIndexCreation(ColumnIndexType.TEXT_INDEX, column, cardinality, bits, schema, isAutoGenerated, hasDictionary,
- isSorted, dictionaryElementSize);
+ isSorted, dictionaryElementSize, true, 0);
+ }
+
+ private void checkTextIndexCreation(String column, int cardinality, int bits, Schema schema, boolean isAutoGenerated,
+ boolean hasDictionary, boolean isSorted, int dictionaryElementSize, boolean isSingleValue,
+ int maxNumberOfMultiValues)
+ throws Exception {
+ checkIndexCreation(ColumnIndexType.TEXT_INDEX, column, cardinality, bits, schema, isAutoGenerated, hasDictionary,
+ isSorted, dictionaryElementSize, isSingleValue, maxNumberOfMultiValues);
}
private void checkIndexCreation(ColumnIndexType indexType, String column, int cardinality, int bits, Schema schema,
- boolean isAutoGenerated, boolean hasDictionary, boolean isSorted, int dictionaryElementSize)
+ boolean isAutoGenerated, boolean hasDictionary, boolean isSorted, int dictionaryElementSize,
+ boolean isSingleValued, int maxNumberOfMultiValues)
throws Exception {
try (SegmentDirectory segmentDirectory = SegmentDirectoryLoaderRegistry.getLocalSegmentDirectoryLoader()
@@ -405,14 +424,14 @@ public class SegmentPreProcessorTest {
processor.process();
SegmentMetadataImpl segmentMetadata = new SegmentMetadataImpl(_indexDir);
ColumnMetadata columnMetadata = segmentMetadata.getColumnMetadataFor(column);
- assertEquals(columnMetadata.getFieldSpec(), new DimensionFieldSpec(column, DataType.STRING, true));
+ assertEquals(columnMetadata.getFieldSpec(), new DimensionFieldSpec(column, DataType.STRING, isSingleValued));
assertEquals(columnMetadata.getCardinality(), cardinality);
assertEquals(columnMetadata.getTotalDocs(), 100000);
assertEquals(columnMetadata.getBitsPerElement(), bits);
assertEquals(columnMetadata.getColumnMaxLength(), dictionaryElementSize);
assertEquals(columnMetadata.isSorted(), isSorted);
assertEquals(columnMetadata.hasDictionary(), hasDictionary);
- assertEquals(columnMetadata.getMaxNumberOfMultiValues(), 0);
+ assertEquals(columnMetadata.getMaxNumberOfMultiValues(), maxNumberOfMultiValues);
assertEquals(columnMetadata.getTotalNumberOfEntries(), 100000);
assertEquals(columnMetadata.isAutoGenerated(), isAutoGenerated);
@@ -761,8 +780,8 @@ public class SegmentPreProcessorTest {
Iterator<String> keys = configuration.getKeys();
while (keys.hasNext()) {
String key = keys.next();
- if (key.endsWith(V1Constants.MetadataKeys.Column.MIN_VALUE) || key
- .endsWith(V1Constants.MetadataKeys.Column.MAX_VALUE)) {
+ if (key.endsWith(V1Constants.MetadataKeys.Column.MIN_VALUE) || key.endsWith(
+ V1Constants.MetadataKeys.Column.MAX_VALUE)) {
configuration.clearProperty(key);
}
}
@@ -970,8 +989,8 @@ public class SegmentPreProcessorTest {
assertNotNull(segmentMetadata.getColumnMetadataFor("newJsonCol"));
_indexLoadingConfig = new IndexLoadingConfig();
- _indexLoadingConfig
- .setH3IndexConfigs(ImmutableMap.of("newH3Col", new H3IndexConfig(ImmutableMap.of("resolutions", "5"))));
+ _indexLoadingConfig.setH3IndexConfigs(
+ ImmutableMap.of("newH3Col", new H3IndexConfig(ImmutableMap.of("resolutions", "5"))));
_indexLoadingConfig.setJsonIndexColumns(new HashSet<>(Collections.singletonList("newJsonCol")));
// V1 use separate file for each column index.
@@ -1025,8 +1044,8 @@ public class SegmentPreProcessorTest {
long initFileSize = singleFileIndex.length();
_indexLoadingConfig = new IndexLoadingConfig();
- _indexLoadingConfig
- .setH3IndexConfigs(ImmutableMap.of("newH3Col", new H3IndexConfig(ImmutableMap.of("resolutions", "5"))));
+ _indexLoadingConfig.setH3IndexConfigs(
+ ImmutableMap.of("newH3Col", new H3IndexConfig(ImmutableMap.of("resolutions", "5"))));
_indexLoadingConfig.setJsonIndexColumns(new HashSet<>(Collections.singletonList("newJsonCol")));
// Create H3 and Json indices.
diff --git a/pinot-segment-local/src/test/resources/data/newColumnsSchemaWithText.json b/pinot-segment-local/src/test/resources/data/newColumnsSchemaWithText.json
index b2975d4..dda33ba 100644
--- a/pinot-segment-local/src/test/resources/data/newColumnsSchemaWithText.json
+++ b/pinot-segment-local/src/test/resources/data/newColumnsSchemaWithText.json
@@ -30,6 +30,16 @@
"dataType": "STRING"
},
{
+ "name": "newTextMVColRaw",
+ "dataType": "STRING",
+ "singleValueField": false
+ },
+ {
+ "name": "newTextMVColDict",
+ "dataType": "STRING",
+ "singleValueField": false
+ },
+ {
"name": "column6",
"dataType": "INT",
"singleValueField": false
diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/creator/TextIndexCreator.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/creator/TextIndexCreator.java
index 4a9d6cf..bad466e 100644
--- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/creator/TextIndexCreator.java
+++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/creator/TextIndexCreator.java
@@ -33,6 +33,11 @@ public interface TextIndexCreator extends Closeable {
void add(String document);
/**
+ * Adds the first {@code length} values of a single multi-value document to the index
+ */
+ void add(String[] document, int length);
+
+ /**
* Seals the index and flushes it to disk.
*/
void seal()
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org