You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2020/09/10 11:17:31 UTC
[lucene-solr] branch master updated: LUCENE-9464: Add
high(er)-level hit highlighter example that demonstrates and uses low-level
components (#1820)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new e2f3f62 LUCENE-9464: Add high(er)-level hit highlighter example that demonstrates and uses low-level components (#1820)
e2f3f62 is described below
commit e2f3f626ee4c7f2d2df1e09a31b971c81e95be44
Author: Dawid Weiss <da...@carrotsearch.com>
AuthorDate: Thu Sep 10 13:17:13 2020 +0200
LUCENE-9464: Add high(er)-level hit highlighter example that demonstrates and uses low-level components (#1820)
---
.../matchhighlight/FieldValueHighlighters.java | 139 ++++++
.../search/matchhighlight/MatchHighlighter.java | 308 ++++++++++++++
.../matchhighlight/MatchRegionRetriever.java | 11 +-
.../search/matchhighlight/AnalyzerWithGaps.java | 51 +++
.../lucene/search/matchhighlight/IndexBuilder.java | 105 +++++
.../matchhighlight/TestMatchHighlighter.java | 466 +++++++++++++++++++++
.../matchhighlight/TestMatchRegionRetriever.java | 311 ++++++--------
7 files changed, 1199 insertions(+), 192 deletions(-)
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/FieldValueHighlighters.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/FieldValueHighlighters.java
new file mode 100644
index 0000000..ece6693
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/FieldValueHighlighters.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.matchhighlight;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.function.BiPredicate;
+import java.util.function.Predicate;
+
+/**
+ * Static factories for the stock
+ * {@link org.apache.lucene.search.matchhighlight.MatchHighlighter.FieldValueHighlighter}
+ * implementations: verbatim values, leading-character abbreviations, highlighted passages
+ * and a catch-all that emits nothing.
+ *
+ * @see MatchHighlighter#appendFieldHighlighter
+ */
+public final class FieldValueHighlighters {
+  private FieldValueHighlighters() {
+  }
+
+  /**
+   * Common base of the highlighters created here: applicability is decided by a predicate
+   * over the (field name, hasMatches) pair, subclasses only supply the formatting.
+   */
+  private abstract static class AbstractFieldValueHighlighter implements MatchHighlighter.FieldValueHighlighter {
+    private final BiPredicate<String, Boolean> applicability;
+
+    protected AbstractFieldValueHighlighter(BiPredicate<String, Boolean> applicability) {
+      this.applicability = applicability;
+    }
+
+    @Override
+    public final boolean isApplicable(String field, boolean hasMatches) {
+      return applicability.test(field, hasMatches);
+    }
+  }
+
+  /**
+   * Creates a highlighter that displays up to {@code maxLeadingCharacters} of a field's value,
+   * regardless of whether the field contained highlights or not. It applies to the given
+   * {@code fields}, which are also reported as always-fetched.
+   */
+  public static MatchHighlighter.FieldValueHighlighter maxLeadingCharacters(int maxLeadingCharacters, String ellipsis, Set<String> fields) {
+    final PassageSelector selector = defaultPassageSelector();
+    final PassageFormatter formatter = new PassageFormatter(ellipsis, "", "");
+    final BiPredicate<String, Boolean> inFieldSet = (name, matched) -> fields.contains(name);
+
+    return new AbstractFieldValueHighlighter(inFieldSet) {
+      @Override
+      public Collection<String> alwaysFetchedFields() {
+        return fields;
+      }
+
+      @Override
+      public List<String> format(String field, String[] values, String contiguousValue,
+                                 List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
+        // The match offsets are deliberately ignored: a single passage is selected
+        // using an empty marker list.
+        return formatter.format(
+            contiguousValue,
+            selector.pickBest(contiguousValue, Collections.emptyList(), maxLeadingCharacters, 1, valueRanges),
+            valueRanges);
+      }
+    };
+  }
+
+  /**
+   * Returns a new {@link PassageSelector} configured with {@link PassageSelector#DEFAULT_SCORER}
+   * and a {@link BreakIteratorShrinkingAdjuster}.
+   */
+  public static PassageSelector defaultPassageSelector() {
+    return new PassageSelector(PassageSelector.DEFAULT_SCORER, new BreakIteratorShrinkingAdjuster());
+  }
+
+  /**
+   * Creates a highlighter for fields accepted by {@code matchFields}; it only applies
+   * when the field contained query matches.
+   */
+  public static MatchHighlighter.FieldValueHighlighter highlighted(
+      int maxPassageWindow,
+      int maxPassages,
+      PassageFormatter passageFormatter,
+      Predicate<String> matchFields) {
+    final PassageSelector selector = defaultPassageSelector();
+    final BiPredicate<String, Boolean> acceptedWithMatches = (name, matched) -> matchFields.test(name) && matched;
+
+    return new AbstractFieldValueHighlighter(acceptedWithMatches) {
+      @Override
+      public List<String> format(String field, String[] values, String contiguousValue,
+                                 List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
+        assert matchOffsets != null;
+
+        return passageFormatter.format(
+            contiguousValue,
+            selector.pickBest(contiguousValue, matchOffsets, maxPassageWindow, maxPassages, valueRanges),
+            valueRanges);
+      }
+    };
+  }
+
+  /**
+   * Creates a highlighter that always returns the raw values of the named fields;
+   * no highlighting or value truncation is applied.
+   */
+  public static MatchHighlighter.FieldValueHighlighter verbatimValue(String field, String... moreFields) {
+    final Set<String> verbatimFields = new HashSet<>(Arrays.asList(moreFields));
+    verbatimFields.add(field);
+
+    return new AbstractFieldValueHighlighter((name, matched) -> verbatimFields.contains(name)) {
+      @Override
+      public List<String> format(String field, String[] values, String contiguousValue, List<OffsetRange> valueRanges,
+                                 List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
+        return Arrays.asList(values);
+      }
+
+      @Override
+      public Collection<String> alwaysFetchedFields() {
+        return verbatimFields;
+      }
+    };
+  }
+
+  /**
+   * Creates a highlighter that is applicable to every field and omits its value from
+   * the output (so that no highlight or value is emitted).
+   */
+  public static MatchHighlighter.FieldValueHighlighter skipRemaining() {
+    return new AbstractFieldValueHighlighter((name, matched) -> true) {
+      @Override
+      public List<String> format(String field, String[] values, String contiguousValue, List<OffsetRange> valueRanges,
+                                 List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
+        // Returning null (rather than an empty list) signals "emit nothing for this field".
+        return null;
+      }
+    };
+  }
+}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchHighlighter.java
new file mode 100644
index 0000000..20938b0
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchHighlighter.java
@@ -0,0 +1,308 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.matchhighlight;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.DocumentStoredFieldVisitor;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+
+/**
+ * An example highlighter that combines several lower-level highlighting
+ * utilities in this package into a fully featured, ready-to-use component.
+ * <p>
+ * Note that if you need to customize or tweak the details of highlighting,
+ * it is better to assemble your own highlighter using those low-level
+ * building blocks, rather than extend or modify this one.
+ */
+public class MatchHighlighter {
+ private final IndexSearcher searcher; // Used to rewrite queries and to retrieve match regions.
+ private final OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies; // Handed to every MatchRegionRetriever created in highlight().
+ private final Analyzer analyzer; // Supplies the per-field offset gap between multiple values of a field.
+
+ private final HashSet<String> fieldsAlwaysReturned = new HashSet<>(); // Fields loaded for each document even if they had no matches.
+ private final List<FieldValueHighlighter> fieldHighlighters = new ArrayList<>(); // Ordered chain; the first applicable highlighter wins.
+
+ /**
+  * Strategy that turns the values of a single field into their output form. Registered
+  * highlighters are asked, in order, whether they are applicable to a particular
+  * (field, hasMatches) pair; the {@link #format} method of the first one that accepts is
+  * invoked and its result is returned as the field's value.
+  *
+  * @see FieldValueHighlighters
+  */
+ public interface FieldValueHighlighter {
+   /**
+    * Tells whether this highlighter can be applied to a given field.
+    *
+    * @param field Field name
+    * @param hasMatches {@code true} if the field has a non-empty set of match regions.
+    */
+   boolean isApplicable(String field, boolean hasMatches);
+
+   /**
+    * Formats the values of one field.
+    *
+    * @param field Field name
+    * @param values The field's individual values.
+    * @param contiguousValue All values combined into one string; multiple values are
+    *        separated by padding as long as the analyzer's offset gap for the field.
+    * @param valueRanges Offset range of each value within {@code contiguousValue}.
+    * @param matchOffsets Match regions together with the queries that caused them;
+    *        {@code null} if no matches were collected for the field.
+    * @return The formatted values, or {@code null} to leave the field out of the output.
+    */
+   List<String> format(String field, String[] values, String contiguousValue,
+                       List<OffsetRange> valueRanges, List<QueryOffsetRange> matchOffsets);
+
+   /**
+    * @return Returns a set of fields that must be fetched for each document, regardless
+    * of whether they had matches or not. This is useful to load and return certain fields
+    * that should always be included (identifiers, document titles, etc.).
+    */
+   default Collection<String> alwaysFetchedFields() {
+     return Collections.emptyList();
+   }
+
+   /**
+    * Returns a new field value highlighter that combines this one and {@code other}:
+    * it is applicable if either of the two is, formats with this highlighter when this
+    * one is applicable (and with {@code other} otherwise), and always fetches the union
+    * of both highlighters' always-fetched fields.
+    */
+   default FieldValueHighlighter or(FieldValueHighlighter other) {
+     final FieldValueHighlighter primary = this;
+
+     // The union is computed once, when the combined highlighter is created.
+     final Collection<String> combinedFields = new HashSet<>();
+     combinedFields.addAll(primary.alwaysFetchedFields());
+     combinedFields.addAll(other.alwaysFetchedFields());
+
+     return new FieldValueHighlighter() {
+       @Override
+       public boolean isApplicable(String field, boolean hasMatches) {
+         if (primary.isApplicable(field, hasMatches)) {
+           return true;
+         }
+         return other.isApplicable(field, hasMatches);
+       }
+
+       @Override
+       public List<String> format(String field, String[] values, String contiguousValue,
+                                  List<OffsetRange> valueRanges, List<QueryOffsetRange> matchOffsets) {
+         // The hasMatches flag is not passed to format(), so it is derived from the offsets.
+         boolean hasMatches = matchOffsets != null && !matchOffsets.isEmpty();
+         if (primary.isApplicable(field, hasMatches)) {
+           return primary.format(field, values, contiguousValue, valueRanges, matchOffsets);
+         }
+         return other.format(field, values, contiguousValue, valueRanges, matchOffsets);
+       }
+
+       @Override
+       public Collection<String> alwaysFetchedFields() {
+         return combinedFields;
+       }
+     };
+   }
+ }
+
+ /**
+  * Appends a new highlighter to the field highlighters chain and registers the fields it
+  * reports via {@link FieldValueHighlighter#alwaysFetchedFields()} so that they are always
+  * fetched. The order of field highlighters is important (first-matching wins).
+  *
+  * @param highlighter The highlighter to append; must not be {@code null}.
+  * @return This instance, so that calls can be chained.
+  * @throws NullPointerException if {@code highlighter} is {@code null}.
+  */
+ public MatchHighlighter appendFieldHighlighter(FieldValueHighlighter highlighter) {
+   // Validate before touching any state so that a failed call leaves the chain unmodified.
+   Objects.requireNonNull(highlighter, "highlighter");
+   fieldHighlighters.add(highlighter);
+   fieldsAlwaysReturned.addAll(highlighter.alwaysFetchedFields());
+   return this;
+ }
+
+ /**
+  * Marks the given fields as always fetched for all input documents.
+  *
+  * @param field The first field name.
+  * @param otherFields Any additional field names.
+  * @throws NullPointerException if any of the field names is {@code null}.
+  */
+ public void alwaysFetchFields(String field, String... otherFields) {
+   Stream<String> names = Stream.concat(Stream.of(field), Stream.of(otherFields));
+   names.map(Objects::requireNonNull).forEach(fieldsAlwaysReturned::add);
+ }
+
+ /**
+ * Single document's highlights: the document's identifier together with the formatted
+ * values of its fields, keyed by field name and kept in insertion order.
+ */
+ public static class DocHighlights {
+ public final int docId; // The document id passed to the constructor.
+ public final Map<String, List<String>> fields = new LinkedHashMap<>(); // Field name -> formatted values.
+
+ public DocHighlights(int docId) {
+ this.docId = docId;
+ }
+ }
+
+ /**
+ * An {@link OffsetRange} of a match, together with the source query that caused it.
+ * Instances are created by this class only (the constructor is package-private).
+ */
+ public static class QueryOffsetRange extends OffsetRange {
+ public final Query query; // The query this match range is attributed to.
+
+ QueryOffsetRange(Query query, int from, int to) {
+ super(from, to);
+ this.query = query;
+ }
+ }
+
+ /**
+  * A single hit: its document id, the leaf reader and leaf-relative document id needed
+  * to load its stored fields, and the match ranges collected so far for each field.
+  */
+ private static class DocHit {
+   final int docId;
+   private final LeafReader leafReader;
+   private final int leafDocId;
+   private final LinkedHashMap<String, List<QueryOffsetRange>> matchRanges = new LinkedHashMap<>();
+
+   DocHit(int docId, LeafReader leafReader, int leafDocId) {
+     this.docId = docId;
+     this.leafReader = leafReader;
+     this.leafDocId = leafDocId;
+   }
+
+   /**
+    * Appends the given per-field match ranges to this hit, tagging each range with
+    * the query it is attributed to.
+    */
+   void addMatches(Query query, Map<String, List<OffsetRange>> hits) {
+     for (Map.Entry<String, List<OffsetRange>> hit : hits.entrySet()) {
+       List<QueryOffsetRange> fieldRanges = matchRanges.computeIfAbsent(hit.getKey(), key -> new ArrayList<>());
+       for (OffsetRange range : hit.getValue()) {
+         fieldRanges.add(new QueryOffsetRange(query, range.from, range.to));
+       }
+     }
+   }
+
+   /**
+    * Loads this hit's document, restricted to the fields that have a chance to be
+    * highlighted: those with collected match ranges and those accepted by
+    * {@code needsField}.
+    */
+   Document document(Predicate<String> needsField) throws IOException {
+     DocumentStoredFieldVisitor fieldFilter = new DocumentStoredFieldVisitor() {
+       @Override
+       public Status needsField(FieldInfo fieldInfo) {
+         String name = fieldInfo.name;
+         if (matchRanges.containsKey(name) || needsField.test(name)) {
+           return Status.YES;
+         }
+         return Status.NO;
+       }
+     };
+
+     leafReader.document(leafDocId, fieldFilter);
+     return fieldFilter.getDocument();
+   }
+ }
+
+ /**
+  * Creates a highlighter whose offset retrieval strategies are computed by
+  * {@link MatchRegionRetriever#computeOffsetRetrievalStrategies} from the searcher's
+  * index reader and the given analyzer.
+  */
+ public MatchHighlighter(IndexSearcher searcher, Analyzer analyzer) {
+   this(
+       searcher,
+       analyzer,
+       MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
+ }
+
+ /**
+  * Creates a highlighter that uses the supplied offset retrieval strategies.
+  *
+  * @param searcher Index searcher used to rewrite queries and retrieve matches.
+  * @param analyzer Analyzer consulted for the offset gap between multiple values of a field.
+  * @param offsetsRetrievalStrategies Supplier of offset retrieval strategies, passed on to
+  *        each {@link MatchRegionRetriever} this highlighter creates.
+  */
+ public MatchHighlighter(IndexSearcher searcher,
+                         Analyzer analyzer,
+                         OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies) {
+   this.analyzer = analyzer;
+   this.searcher = searcher;
+   this.offsetsRetrievalStrategies = offsetsRetrievalStrategies;
+ }
+
+ /**
+  * Collects the match regions of each query for the documents in {@code topDocs} and
+  * returns a stream of per-document highlights, in {@code topDocs} order. Field values
+  * are loaded and formatted as the returned stream is consumed.
+  *
+  * @param topDocs The hits to highlight.
+  * @param queries The queries whose matches should be collected; each match range is
+  *        associated with the query it came from.
+  * @throws IOException declared by query rewriting and match retrieval.
+  */
+ public Stream<DocHighlights> highlight(TopDocs topDocs, Query... queries) throws IOException {
+   // Register every hit up front (with a null placeholder) so that the map, and thus the
+   // returned stream, preserves topDocs ordering; MatchRegionRetriever is optimized for
+   // streaming.
+   LinkedHashMap<Integer, DocHit> hitsByDocId = new LinkedHashMap<>();
+   for (ScoreDoc hit : topDocs.scoreDocs) {
+     hitsByDocId.put(hit.doc, null);
+   }
+
+   // Collect match ranges for each query and associate each range to the origin query.
+   for (Query query : queries) {
+     Query rewritten = searcher.rewrite(query);
+     MatchRegionRetriever retriever = new MatchRegionRetriever(searcher, rewritten, offsetsRetrievalStrategies);
+     retriever.highlightDocuments(topDocs,
+         (int docId, LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits) -> {
+           // A null placeholder counts as absent, so the first report for a document
+           // replaces it in place (the map's insertion order is unaffected).
+           hitsByDocId
+               .computeIfAbsent(docId, id -> new DocHit(id, leafReader, leafDocId))
+               .addMatches(query, hits);
+         });
+   }
+
+   // Placeholders that were never replaced are skipped (should every hit be non-null here?).
+   return hitsByDocId.values().stream()
+       .filter(Objects::nonNull)
+       .map(this::computeDocFieldValues);
+ }
+
+ /**
+  * Loads the fields of a hit (those with matches plus the always-returned ones) and
+  * formats each field with the first applicable highlighter from the chain.
+  *
+  * @param docHit The hit to process.
+  * @return The document's highlights; fields whose highlighter returned {@code null}
+  *         are left out.
+  * @throws UncheckedIOException if loading the document fails with an {@link IOException}.
+  * @throws RuntimeException if no highlighter in the chain is applicable to a field.
+  */
+ private DocHighlights computeDocFieldValues(DocHit docHit) {
+   Document doc;
+   try {
+     doc = docHit.document(fieldsAlwaysReturned::contains);
+   } catch (IOException e) {
+     // This method is used from a stream pipeline, which cannot propagate checked exceptions.
+     throw new UncheckedIOException(e);
+   }
+
+   DocHighlights docHighlights = new DocHighlights(docHit.docId);
+
+   // All values of a field are processed together, the first time its name is seen.
+   HashSet<String> unique = new HashSet<>();
+   for (IndexableField indexableField : doc) {
+     String field = indexableField.name();
+     if (!unique.add(field)) {
+       continue;
+     }
+
+     String[] values = doc.getValues(field);
+     String contiguousValue = contiguousFieldValue(field, values);
+     List<OffsetRange> valueRanges = computeValueRanges(field, values);
+     List<QueryOffsetRange> offsets = docHit.matchRanges.get(field);
+
+     // "Has matches" means a non-empty set of match regions, as documented on
+     // FieldValueHighlighter#isApplicable and as recomputed by FieldValueHighlighter#or.
+     boolean hasMatches = offsets != null && !offsets.isEmpty();
+
+     List<String> formattedValues = fieldValueHighlighter(field, hasMatches)
+         .format(field, values, contiguousValue, valueRanges, offsets);
+
+     // A null result means the highlighter chose to omit the field.
+     if (formattedValues != null) {
+       docHighlights.fields.put(field, formattedValues);
+     }
+   }
+
+   return docHighlights;
+ }
+
+ /**
+  * Computes the offset range each value occupies within the field's contiguous value,
+  * where consecutive values are separated by the analyzer's offset gap for the field.
+  *
+  * @param field Field name, used to look up the offset gap.
+  * @param values The field's values, in order.
+  * @return One range per value, in the same order as {@code values}.
+  */
+ private List<OffsetRange> computeValueRanges(String field, String[] values) {
+   // The gap depends on the field only: read it once so that all ranges are computed
+   // from a single gap value.
+   final int offsetGap = analyzer.getOffsetGap(field);
+
+   ArrayList<OffsetRange> valueRanges = new ArrayList<>(values.length);
+   int offset = 0;
+   for (String value : values) {
+     int end = offset + value.length();
+     valueRanges.add(new OffsetRange(offset, end));
+     offset = end + offsetGap;
+   }
+   return valueRanges;
+ }
+
+ /**
+  * Combines the values of a field into a single string: a lone value is returned as is,
+  * otherwise the values are joined with space padding as long as the analyzer's offset
+  * gap for the field.
+  */
+ private String contiguousFieldValue(String field, String[] values) {
+   if (values.length == 1) {
+     return values[0];
+   }
+
+   // TODO: This can be inefficient if offset gap is large but the logic
+   // of applying offsets would get much more complicated so leaving for now
+   // (would have to recalculate all offsets to omit gaps).
+   int gap = analyzer.getOffsetGap(field);
+   return String.join(" ".repeat(gap), values);
+ }
+
+ /**
+  * Returns the first highlighter in the chain that is applicable to the given
+  * (field, hasMatches) pair.
+  *
+  * @throws RuntimeException if no highlighter in the chain is applicable.
+  */
+ private FieldValueHighlighter fieldValueHighlighter(String field, boolean hasMatches) {
+   return fieldHighlighters.stream()
+       .filter(candidate -> candidate.isApplicable(field, hasMatches))
+       .findFirst()
+       .orElseThrow(() -> new RuntimeException("No field highlighter could be matched to field: " + field));
+ }
+}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java
index 16c9a11..2861ac6 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java
@@ -80,22 +80,23 @@ public class MatchRegionRetriever {
/**
* A constructor with the default offset strategy supplier.
+ *
+ * @param analyzer An analyzer that may be used to reprocess (retokenize) document fields
+ * in the absence of position offsets in the index. Note that the analyzer must return
+ * tokens (positions and offsets) identical to the ones stored in the index.
*/
public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer) throws IOException {
- this(searcher, query, analyzer, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
+ this(searcher, query, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
}
/**
* @param searcher Index searcher to be used for retrieving matches.
* @param query The query for which matches should be retrieved. The query should be rewritten
* against the provided searcher.
- * @param analyzer An analyzer that may be used to reprocess (retokenize) document fields
- * in the absence of position offsets in the index. Note that the analyzer must return
- * tokens (positions and offsets) identical to the ones stored in the index.
* @param fieldOffsetStrategySupplier A custom supplier of per-field {@link OffsetsRetrievalStrategy}
* instances.
*/
- public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer,
+ public MatchRegionRetriever(IndexSearcher searcher, Query query,
OffsetsRetrievalStrategySupplier fieldOffsetStrategySupplier)
throws IOException {
leaves = searcher.getIndexReader().leaves();
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/AnalyzerWithGaps.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/AnalyzerWithGaps.java
new file mode 100644
index 0000000..3009c91
--- /dev/null
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/AnalyzerWithGaps.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.matchhighlight;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;
+
+/**
+ * A test analyzer that delegates to another analyzer but reports a predefined offset gap
+ * and position increment gap, the same for every field.
+ */
+class AnalyzerWithGaps extends DelegatingAnalyzerWrapper {
+  private final int fixedOffsetGap;
+  private final int fixedPositionGap;
+  private final Analyzer wrapped;
+
+  /**
+   * @param offsetGap The value returned from {@link #getOffsetGap}.
+   * @param positionGap The value returned from {@link #getPositionIncrementGap}.
+   * @param delegate The wrapped analyzer; its reuse strategy is used by this wrapper, too.
+   */
+  AnalyzerWithGaps(int offsetGap, int positionGap, Analyzer delegate) {
+    super(delegate.getReuseStrategy());
+    this.fixedOffsetGap = offsetGap;
+    this.fixedPositionGap = positionGap;
+    this.wrapped = delegate;
+  }
+
+  /** Returns the wrapped analyzer, whatever the field. */
+  @Override
+  protected Analyzer getWrappedAnalyzer(String fieldName) {
+    return wrapped;
+  }
+
+  /** Returns the predefined offset gap, whatever the field. */
+  @Override
+  public int getOffsetGap(String fieldName) {
+    return fixedOffsetGap;
+  }
+
+  /** Returns the predefined position increment gap, whatever the field. */
+  @Override
+  public int getPositionIncrementGap(String fieldName) {
+    return fixedPositionGap;
+  }
+}
\ No newline at end of file
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/IndexBuilder.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/IndexBuilder.java
new file mode 100644
index 0000000..2a6e783
--- /dev/null
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/IndexBuilder.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.matchhighlight;
+
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.store.ByteBuffersDirectory;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.IOUtils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.function.BiFunction;
+import java.util.function.Consumer;
+
+/**
+ * Utility class for building an ephemeral document index
+ * and running a block of code on its reader.
+ * <p>
+ * Each document automatically receives a stored {@link #FLD_ID} field and a
+ * {@link #FLD_SORT_ORDER} doc values field, both holding the document's sequence number;
+ * the index is sorted by the latter.
+ */
+class IndexBuilder {
+  /** Name of the stored field holding the document's sequence number (as a string). */
+  public static final String FLD_ID = "id";
+  /** Name of the numeric doc values field the index is sorted by. */
+  public static final String FLD_SORT_ORDER = "id_order";
+
+  // Converts a (field name, value) pair into the field added to the document.
+  private final BiFunction<String, String, IndexableField> toField;
+  private final ArrayList<Document> documents = new ArrayList<>();
+  // Sequence number assigned to the next document.
+  private int seq;
+
+  /**
+   * Receiver of field values for a single document under construction.
+   */
+  class DocFields {
+    final Document document;
+
+    public DocFields(Document doc) {
+      this.document = doc;
+    }
+
+    /**
+     * Adds one field instance per value, created with the builder's value-to-field function.
+     */
+    public void add(String field, String... values) {
+      assert values.length > 0 : "At least one field value is required.";
+      for (String value : values) {
+        document.add(toField.apply(field, value));
+      }
+    }
+  }
+
+  /**
+   * @param valueToField Function creating an indexable field from a field name and a value.
+   */
+  IndexBuilder(BiFunction<String, String, IndexableField> valueToField) {
+    this.toField = valueToField;
+  }
+
+  /**
+   * Adds a document with a single (possibly multi-valued) field.
+   *
+   * @return This builder.
+   */
+  public IndexBuilder doc(String field, String... values) {
+    return doc(fields -> fields.add(field, values));
+  }
+
+  /**
+   * Adds a document whose fields are provided by the given callback.
+   *
+   * @return This builder.
+   */
+  public IndexBuilder doc(Consumer<DocFields> fields) {
+    Document doc = new Document();
+    doc.add(new NumericDocValuesField(FLD_SORT_ORDER, seq));
+    doc.add(new StringField(FLD_ID, Integer.toString(seq++), Field.Store.YES));
+    fields.accept(new DocFields(doc));
+    documents.add(doc);
+    return this;
+  }
+
+  /**
+   * Indexes all documents added so far into an in-memory directory and runs
+   * {@code block} on a reader opened from the index writer. The reader, the writer and
+   * the directory are closed before this method returns.
+   *
+   * @param analyzer Analyzer used for indexing.
+   * @param block Code to run against the index reader.
+   * @return This builder.
+   * @throws IOException if indexing, the block or closing any of the resources fails.
+   */
+  public IndexBuilder build(Analyzer analyzer, IOUtils.IOConsumer<DirectoryReader> block) throws IOException {
+    IndexWriterConfig config = new IndexWriterConfig(analyzer);
+    config.setIndexSort(new Sort(new SortField(FLD_SORT_ORDER, SortField.Type.LONG)));
+    // The writer is a managed resource, too, so it is closed (after the reader and
+    // before the directory) instead of being left open.
+    try (Directory directory = new ByteBuffersDirectory();
+         IndexWriter iw = new IndexWriter(directory, config)) {
+      for (Document doc : documents) {
+        iw.addDocument(doc);
+      }
+      // Randomly exercise both committed and uncommitted (flushed-only) indexes.
+      if (RandomizedTest.randomBoolean()) {
+        iw.commit();
+      }
+      iw.flush();
+
+      try (DirectoryReader reader = DirectoryReader.open(iw)) {
+        block.accept(reader);
+      }
+    }
+    return this;
+  }
+}
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java
new file mode 100644
index 0000000..d1acf98
--- /dev/null
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java
@@ -0,0 +1,466 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.matchhighlight;
+
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
+import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.intervals.IntervalQuery;
+import org.apache.lucene.queries.intervals.Intervals;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.hamcrest.Matchers;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+public class TestMatchHighlighter extends LuceneTestCase {
+ private static final String FLD_ID = "id";
+ private static final String FLD_TEXT1 = "text1";
+ private static final String FLD_TEXT2 = "text2";
+
+ private FieldType TYPE_TEXT_POSITIONS_OFFSETS;
+ private FieldType TYPE_TEXT_POSITIONS;
+
+ private PerFieldAnalyzerWrapper analyzer;
+
+ /**
+  * Prepares the two field types used by {@code toField} and a per-field analyzer that
+  * applies the same synonym-aware, gap-inserting analyzer to both text fields.
+  */
+ @Before
+ public void setup() throws IOException {
+   TYPE_TEXT_POSITIONS = TextField.TYPE_STORED;
+
+   // Same stored text type, but with offsets indexed as well.
+   FieldType withOffsets = new FieldType(TextField.TYPE_STORED);
+   withOffsets.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+   withOffsets.freeze();
+   TYPE_TEXT_POSITIONS_OFFSETS = withOffsets;
+
+   // A couple of synonyms (declared in both directions), just to showcase them.
+   SynonymMap synonyms = buildSynonymMap(new String[][]{
+       {"moon\u0000shine", "firewater"},
+       {"firewater", "moon\u0000shine"},
+   });
+
+   // The offset gap must be non-empty, otherwise the break iterator goes haywire on
+   // multiple values glued together.
+   final int offsetGap = RandomizedTest.randomIntBetween(1, 2);
+   final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
+
+   Analyzer whitespaceWithSynonyms = new Analyzer() {
+     @Override
+     protected TokenStreamComponents createComponents(String fieldName) {
+       Tokenizer source = new WhitespaceTokenizer();
+       TokenStream filtered = new SynonymGraphFilter(source, synonyms, true);
+       return new TokenStreamComponents(source, filtered);
+     }
+   };
+   Analyzer synonymsAnalyzer = new AnalyzerWithGaps(offsetGap, positionGap, whitespaceWithSynonyms);
+
+   Map<String, Analyzer> perField = new HashMap<>();
+   perField.put(FLD_TEXT1, synonymsAnalyzer);
+   perField.put(FLD_TEXT2, synonymsAnalyzer);
+
+   // Any field without an explicit entry falls through to MissingAnalyzer.
+   analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), perField);
+ }
+
+ /**
+  * Builds a synonym map from an array of two-element {input, output} entries. Every entry
+  * is added with the builder's boolean flag set to {@code true}.
+  */
+ static SynonymMap buildSynonymMap(String[][] synonyms) throws IOException {
+   SynonymMap.Builder mapBuilder = new SynonymMap.Builder();
+   for (int i = 0; i < synonyms.length; i++) {
+     String[] entry = synonyms[i];
+     assertThat(entry.length, Matchers.equalTo(2));
+
+     CharsRef input = new CharsRef(entry[0]);
+     CharsRef output = new CharsRef(entry[1]);
+     mapBuilder.add(input, output, true);
+   }
+   return mapBuilder.build();
+ }
+
+ /**
+  * Walks through the basic usage of MatchHighlighter in three steps over the same search
+  * result: highlighting a single field only when it matched, falling back to an abbreviated
+  * value when it did not match, and finally one chained highlighter covering two fields.
+  */
+ @Test
+ public void testBasicUsage() throws IOException {
+ new IndexBuilder(this::toField)
+ .doc(FLD_TEXT1, "foo bar baz")
+ .doc(FLD_TEXT1, "bar foo baz")
+ .doc(fields -> {
+ fields.add(FLD_TEXT1, "Very long content but not matching anything.");
+ fields.add(FLD_TEXT2, "no foo but bar");
+ })
+ .build(analyzer, reader -> {
+ // Matches "foo" in text1 (documents 0 and 1) or "bar" in text2 (document 2).
+ Query query = new BooleanQuery.Builder()
+ .add(new TermQuery(new Term(FLD_TEXT1, "foo")), BooleanClause.Occur.SHOULD)
+ .add(new TermQuery(new Term(FLD_TEXT2, "bar")), BooleanClause.Occur.SHOULD)
+ .build();
+
+ // In the most basic scenario, we run a search against a query, retrieve
+ // top docs...
+ IndexSearcher searcher = new IndexSearcher(reader);
+ Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
+ TopDocs topDocs = searcher.search(query, 10, sortOrder);
+
+ // ...and would want a fixed set of fields from those documents, some of them
+ // possibly highlighted if they matched the query.
+ //
+ // This configures the highlighter so that the FLD_ID field is always returned verbatim,
+ // and FLD_TEXT1 is returned *only if it contained a query match*.
+ MatchHighlighter highlighter =
+ new MatchHighlighter(searcher, analyzer)
+ .appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
+ .appendFieldHighlighter(FieldValueHighlighters.highlighted(
+ 80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
+ .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
+
+ // Note document field highlights are a stream over documents in topDocs. In the remaining code we will just
+ // collect them on the fly into a preformatted string.
+ Stream<MatchHighlighter.DocHighlights> highlights = highlighter.highlight(topDocs, query);
+ assertHighlights(toDocList(highlights),
+ " 0. id: 0",
+ " text1: >foo< bar baz",
+ " 1. id: 1",
+ " text1: bar >foo< baz",
+ " 2. id: 2");
+
+ // In a more realistic use case, you'd want to show the value of a given field *regardless* of whether it
+ // contained a highlight or not -- it is odd that document "id: 2" above doesn't have the 'text1' field
+ // shown because that field wasn't part of the query match.
+ //
+ // Let's say the field is also potentially long; if it contains a match,
+ // we would want to display the contextual snippet surrounding that match. If it does not contain any
+ // matches, we would want to display its content up to a given number of characters (lead lines).
+ //
+ // Let's do this by adding an appropriate field highlighter on FLD_TEXT1.
+ highlighter =
+ new MatchHighlighter(searcher, analyzer)
+ .appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
+ .appendFieldHighlighter(FieldValueHighlighters.highlighted(
+ 80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
+ .appendFieldHighlighter(FieldValueHighlighters.maxLeadingCharacters(10, "...", Set.of(FLD_TEXT1)))
+ .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
+
+ assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
+ " 0. id: 0",
+ " text1: >foo< bar baz",
+ " 1. id: 1",
+ " text1: bar >foo< baz",
+ " 2. id: 2",
+ " text1: Very long...");
+
+ // Field highlighters can apply to multiple fields and be chained for convenience.
+ // For example, this defines a combined highlighter over both FLD_TEXT1 and FLD_TEXT2.
+ Set<String> fields = Set.of(FLD_TEXT1, FLD_TEXT2);
+ MatchHighlighter.FieldValueHighlighter highlightedOrAbbreviated =
+ FieldValueHighlighters.highlighted(80 * 3, 1, new PassageFormatter("...", ">", "<"), fields::contains)
+ .or(FieldValueHighlighters.maxLeadingCharacters(10, "...", fields));
+
+ highlighter =
+ new MatchHighlighter(searcher, analyzer)
+ .appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
+ .appendFieldHighlighter(highlightedOrAbbreviated)
+ .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
+
+ // Document 2 now shows both fields: text1 abbreviated (no match) and text2 highlighted.
+ assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
+ " 0. id: 0",
+ " text1: >foo< bar baz",
+ " 1. id: 1",
+ " text1: bar >foo< baz",
+ " 2. id: 2",
+ " text1: Very long...",
+ " text2: no foo but >bar<");
+ });
+ }
+
+ /**
+  * Shows that synonym matches are highlighted without any extra configuration: both the
+  * single-term query and the phrase query below are expected to mark "moon shine" as well
+  * as "firewater".
+  */
+ @Test
+ public void testSynonymHighlight() throws IOException {
+ // There is nothing special needed to highlight or process complex queries, synonyms, etc.
+ // The synonyms are defined in the setup() method of this class.
+ new IndexBuilder(this::toField)
+ .doc(FLD_TEXT1, "Where the moon shine falls, firewater flows.")
+ .build(analyzer, reader -> {
+ IndexSearcher searcher = new IndexSearcher(reader);
+ Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
+
+ MatchHighlighter highlighter =
+ new MatchHighlighter(searcher, analyzer)
+ .appendFieldHighlighter(FieldValueHighlighters.highlighted(
+ 80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
+ .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
+
+ // Querying for the single-word synonym...
+ Query query = new TermQuery(new Term(FLD_TEXT1, "firewater"));
+ assertHighlights(toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
+ "0. text1: Where the >moon shine< falls, >firewater< flows.");
+
+ // ...and for the two-word phrase is expected to produce the same highlights.
+ query = new PhraseQuery(FLD_TEXT1, "moon", "shine");
+ assertHighlights(toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
+ "0. text1: Where the >moon shine< falls, >firewater< flows.");
+ });
+ }
+
+ /**
+  * Demonstrates writing a custom FieldValueHighlighter: a fallback that lists at most N
+  * values of a multivalued field and is chained behind the predefined match highlighter.
+  */
+ @Test
+ public void testCustomFieldHighlightHandling() throws IOException {
+ // Match highlighter is a showcase of individual components in this package, suitable
+ // to create any kind of field-display designs.
+ //
+ // In this example we will build a custom field highlighting handler that
+ // highlights matches over a multivalued field, shows that field's values if it received
+ // no matches and limits the number of values displayed to at most 2 (with an appropriate message).
+ new IndexBuilder(this::toField)
+ // Just one document, one field, four values.
+ .doc(FLD_TEXT1, "foo bar", "bar foo baz", "bar baz foo", "baz baz baz")
+ .build(analyzer, reader -> {
+ IndexSearcher searcher = new IndexSearcher(reader);
+ Sort sortOrder = Sort.INDEXORDER;
+
+ // Let's start with the simple predefined highlighter so that the field's value shows
+ // and is highlighted when it was part of the hit.
+ MatchHighlighter.FieldValueHighlighter highlighted = FieldValueHighlighters.highlighted(
+ 80 * 3, 2, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals);
+ MatchHighlighter highlighter =
+ new MatchHighlighter(searcher, analyzer)
+ .appendFieldHighlighter(highlighted)
+ .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
+
+ Query query = new TermQuery(new Term(FLD_TEXT1, "foo"));
+ TopDocs topDocs = searcher.search(query, 10, sortOrder);
+
+ // Note the highlighter is configured with at most 2 snippets so the match on the
+ // third value ("bar baz foo") is omitted. An ellipsis isn't inserted either because
+ // values are displayed in full.
+ assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
+ "0. text1: >foo< bar, bar >foo< baz");
+
+ // So the above works fine if the field received a match but omits it otherwise. We can
+ // force the display of this field by chaining with verbatim value highlighter:
+ highlighter =
+ new MatchHighlighter(searcher, analyzer)
+ .appendFieldHighlighter(highlighted.or(FieldValueHighlighters.verbatimValue(FLD_TEXT1)))
+ .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
+
+ // Highlighting against MatchAllDocsQuery is expected to yield the plain, unmarked values.
+ assertHighlights(toDocList(highlighter.highlight(topDocs, new MatchAllDocsQuery())),
+ "0. text1: foo bar, bar foo baz, bar baz foo, baz baz baz");
+
+ // But this is not exactly what we'd like because we want to limit the display of values to the first two.
+ // Let's just write a custom field highlighter handler that does it.
+ class AtMostNValuesHighlighter implements MatchHighlighter.FieldValueHighlighter {
+ private final String field;
+ private final int limit;
+
+ AtMostNValuesHighlighter(String field, int limit) {
+ this.field = field;
+ this.limit = limit;
+ }
+
+ // Applies to the configured field regardless of whether it received any matches.
+ @Override
+ public boolean isApplicable(String field, boolean hasMatches) {
+ return Objects.equals(field, this.field);
+ }
+
+ // Returns all values when there are at most 'limit' of them; otherwise the first
+ // 'limit' values followed by a note saying how many were left out.
+ @Override
+ public List<String> format(String field, String[] values, String contiguousValue,
+ List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
+ if (values.length <= limit) {
+ return Arrays.asList(values);
+ } else {
+ List<String> collected = Stream.of(values).limit(limit).collect(Collectors.toList());
+ int remaining = values.length - collected.size();
+ collected.add(String.format(Locale.ROOT, "[%d omitted]", remaining));
+ return collected;
+ }
+ }
+
+ // Declares the configured field as one that is always fetched.
+ @Override
+ public Collection<String> alwaysFetchedFields() {
+ return Collections.singleton(field);
+ }
+ }
+
+ // We can now chain it as usual and contemplate the result.
+ highlighter =
+ new MatchHighlighter(searcher, analyzer)
+ .appendFieldHighlighter(highlighted.or(new AtMostNValuesHighlighter(FLD_TEXT1, 2)))
+ .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
+
+ assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
+ "0. text1: >foo< bar, bar >foo< baz");
+ assertHighlights(toDocList(highlighter.highlight(topDocs, new MatchAllDocsQuery())),
+ "0. text1: foo bar, bar foo baz, [2 omitted]");
+ });
+ }
+
+ /**
+  * Highlights the matches of two separate queries over the same documents, first with a
+  * shared marker and then with a distinct marker per query via a custom field highlighter.
+  */
+ @Test
+ public void testHighlightMoreQueriesAtOnceShowoff() throws IOException {
+ // Match highlighter underlying components are powerful enough to build interesting,
+ // if not always super-practical, things. In this case, we would like to highlight
+ // a set of matches of *more than one* query over the same set of input documents. This includes
+ // highest-scoring passage resolution (from multiple hits) and different highlight markers
+ // for each query.
+ new IndexBuilder(this::toField)
+ .doc(FLD_TEXT1, "foo bar baz")
+ .doc(FLD_TEXT1, "foo baz bar")
+ .build(analyzer, reader -> {
+ // Let's start with the two queries. The first one will be an unordered
+ // query for (foo, baz) with a max gap of 1; let's use intervals for this.
+ Query q1 = new IntervalQuery(FLD_TEXT1,
+ Intervals.maxgaps(1,
+ Intervals.unordered(
+ Intervals.term("foo"),
+ Intervals.term("baz"))));
+
+ // The second one will be a simpler term query for "bar".
+ Query q2 = new TermQuery(new Term(FLD_TEXT1, "bar"));
+
+ // Let's fetch matching documents by combining the two into a Boolean query.
+ Query query = new BooleanQuery.Builder()
+ .add(q1, BooleanClause.Occur.SHOULD)
+ .add(q2, BooleanClause.Occur.SHOULD)
+ .build();
+
+ IndexSearcher searcher = new IndexSearcher(reader);
+ Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
+ TopDocs topDocs = searcher.search(query, 10, sortOrder);
+
+ // If we use the "regular" highlighter, the result will be slightly odd: a nested
+ // highlight over "bar" within the first match. Also, you can't distinguish which of the sub-queries
+ // caused which highlight marker... but if it were HTML then you could give the span
+ // some semi-translucent background and layered matches would be visible.
+ MatchHighlighter highlighter =
+ new MatchHighlighter(searcher, analyzer)
+ .appendFieldHighlighter(FieldValueHighlighters.highlighted(
+ 80 * 3, 1, new PassageFormatter("...", "<span>", "</span>"), FLD_TEXT1::equals))
+ .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
+
+ assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
+ "0. text1: <span>foo <span>bar</span> baz</span>",
+ "1. text1: <span>foo baz</span> <span>bar</span>");
+
+ // To separate highlights for multiple queries we'll pass them separately to the
+ // highlighter and differentiate highlight markers upon their application. Let's start with the customized
+ // field highlighter first. This utilizes the fact that match ranges passed from MatchHighlighter
+ // contain a reference to the original query which brought up the match.
+ class SeparateMarkerFieldHighlighter implements MatchHighlighter.FieldValueHighlighter {
+ private final String field;
+ private final Map<Query, String> queryClassMap;
+
+ SeparateMarkerFieldHighlighter(String field, Map<Query, String> queryClassMap) {
+ this.field = field;
+ this.queryClassMap = queryClassMap;
+ }
+
+ // Only applies to the configured field, and only when that field has matches.
+ @Override
+ public boolean isApplicable(String field, boolean hasMatches) {
+ return Objects.equals(field, this.field) && hasMatches;
+ }
+
+ // Picks the best passages and wraps each match in a span whose class is looked up
+ // from the query that produced the match.
+ @Override
+ public List<String> format(String field, String[] values, String contiguousValue,
+ List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
+ PassageSelector passageSelector = new PassageSelector();
+ int maxPassageWindow = 80;
+ int maxPassages = 3;
+ List<Passage> bestPassages =
+ passageSelector.pickBest(contiguousValue, matchOffsets, maxPassageWindow, maxPassages, valueRanges);
+
+ // We know the offset ranges passed to us by MatchHighlighter are instances of QueryOffsetRange
+ // so we compute the class based on that.
+ Function<OffsetRange, String> queryToClass =
+ (range) -> queryClassMap.get(((MatchHighlighter.QueryOffsetRange) range).query);
+
+ PassageFormatter passageFormatter = new PassageFormatter("...",
+ (range) -> "<span class='" + queryToClass.apply(range) + "'>",
+ (range) -> "</span>");
+
+ return passageFormatter.format(contiguousValue, bestPassages, valueRanges);
+ }
+ }
+
+ // And this is pretty much it. We now set up query classes to display, set up the highlighter...
+ Map<Query, String> queryClassMap = Map.of(q1, "q1", q2, "q2");
+ highlighter =
+ new MatchHighlighter(searcher, analyzer)
+ .appendFieldHighlighter(new SeparateMarkerFieldHighlighter(FLD_TEXT1, queryClassMap))
+ .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
+
+ // ...and run highlighting. Note that the queries passed to the highlighter are the individual
+ // sub-clauses of the Boolean query used to fetch documents.
+ assertHighlights(toDocList(highlighter.highlight(topDocs, q1, q2)),
+ "0. text1: <span class='q1'>foo <span class='q2'>bar</span> baz</span>",
+ "1. text1: <span class='q1'>foo baz</span> <span class='q2'>bar</span>");
+ });
+ }
+
+ /**
+  * Flattens per-document field lines into one list, prefixing the first line of each
+  * document with its index, and compares the result with the expected lines after
+  * trimming both sides.
+  *
+  * @throws AssertionError listing the actual and the expected lines if they differ
+  */
+ private void assertHighlights(List<List<String>> docList, String... expectedFormattedLines) {
+   List<String> actualLines = new ArrayList<>();
+   int docIndex = 0;
+   for (List<String> fields : docList) {
+     // Only the first line of a document carries the document index.
+     String prefix = String.format(Locale.ROOT, "%2d. ", docIndex++);
+     for (String fieldLine : fields) {
+       actualLines.add(prefix + fieldLine);
+       prefix = " ";
+     }
+   }
+
+   List<String> expectedTrimmed =
+       Stream.of(expectedFormattedLines).map(String::trim).collect(Collectors.toList());
+   List<String> actualTrimmed =
+       actualLines.stream().map(String::trim).collect(Collectors.toList());
+   if (expectedTrimmed.equals(actualTrimmed)) {
+     return;
+   }
+
+   throw new AssertionError("Actual hits were:\n" +
+       String.join("\n", actualLines) + "\n\n but expected them to be:\n" +
+       String.join("\n", expectedFormattedLines));
+ }
+
+ /**
+  * Consumes the stream of document highlights and renders every field of every document
+  * as a "name: value1, value2" line, one list of lines per document.
+  */
+ private List<List<String>> toDocList(Stream<MatchHighlighter.DocHighlights> highlights) {
+   List<List<String>> perDocument = new ArrayList<>();
+   highlights.forEachOrdered(docHighlights -> {
+     List<String> lines = new ArrayList<>();
+     docHighlights.fields.forEach(
+         (name, values) -> lines.add(name + ": " + String.join(", ", values)));
+     perDocument.add(lines);
+   });
+   return perDocument;
+ }
+
+ /**
+  * Field factory handed to IndexBuilder: chooses the field type from the field name and
+  * fails for any field this test does not know about.
+  */
+ private IndexableField toField(String name, String value) {
+   final FieldType type;
+   if (name.equals(FLD_TEXT1)) {
+     type = TYPE_TEXT_POSITIONS_OFFSETS;
+   } else if (name.equals(FLD_TEXT2)) {
+     type = TYPE_TEXT_POSITIONS;
+   } else {
+     throw new AssertionError("Don't know how to handle this field: " + name);
+   }
+   return new Field(name, value, type);
+ }
+}
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java
index 0fd9ca0..691877c 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java
@@ -20,21 +20,17 @@ import com.carrotsearch.randomizedtesting.RandomizedTest;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
-import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
-import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.intervals.IntervalQuery;
@@ -52,19 +48,13 @@ import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
-import org.apache.lucene.store.ByteBuffersDirectory;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.Matchers;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
-import java.io.UncheckedIOException;
import java.util.ArrayList;
-import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
@@ -75,11 +65,9 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.hamcrest.Matchers.containsInAnyOrder;
-import static org.hamcrest.Matchers.emptyArray;
-import static org.hamcrest.Matchers.not;
public class TestMatchRegionRetriever extends LuceneTestCase {
- private static final String FLD_ID = "field_id";
+ private static final String FLD_ID = IndexBuilder.FLD_ID;
private static final String FLD_TEXT_POS_OFFS1 = "field_text_offs1";
private static final String FLD_TEXT_POS_OFFS2 = "field_text_offs2";
@@ -100,7 +88,7 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
private Analyzer analyzer;
@Before
- public void setup() {
+ public void setup() throws IOException {
TYPE_STORED_WITH_OFFSETS = new FieldType(TextField.TYPE_STORED);
TYPE_STORED_WITH_OFFSETS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
TYPE_STORED_WITH_OFFSETS.freeze();
@@ -109,26 +97,24 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
TYPE_STORED_NO_POSITIONS.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
TYPE_STORED_NO_POSITIONS.freeze();
+ final int offsetGap = RandomizedTest.randomIntBetween(0, 2);
+ final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
Analyzer whitespaceAnalyzer =
- new Analyzer() {
- final int offsetGap = RandomizedTest.randomIntBetween(0, 2);
- final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
+ new AnalyzerWithGaps(offsetGap, positionGap,
+ new WhitespaceAnalyzer(WhitespaceTokenizer.DEFAULT_MAX_WORD_LEN));
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- WhitespaceTokenizer tokenizer =
- new WhitespaceTokenizer(CharTokenizer.DEFAULT_MAX_WORD_LEN);
- return new TokenStreamComponents(tokenizer);
- }
-
- @Override
- public int getOffsetGap(String fieldName) {
- return offsetGap;
- }
+ SynonymMap synonymMap = TestMatchHighlighter.buildSynonymMap(new String[][] {
+ {"foo\u0000bar", "syn1"},
+ {"baz", "syn2\u0000syn3"},
+ });
+ Analyzer synonymsAnalyzer =
+ new Analyzer() {
@Override
- public int getPositionIncrementGap(String fieldName) {
- return positionGap;
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new WhitespaceTokenizer();
+ TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
+ return new TokenStreamComponents(tokenizer, tokenStream);
}
};
@@ -138,26 +124,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
fieldAnalyzers.put(FLD_TEXT_POS_OFFS1, whitespaceAnalyzer);
fieldAnalyzers.put(FLD_TEXT_POS_OFFS2, whitespaceAnalyzer);
fieldAnalyzers.put(FLD_TEXT_NOPOS, whitespaceAnalyzer);
-
- try {
- SynonymMap.Builder b = new SynonymMap.Builder();
- b.add(new CharsRef("foo\u0000bar"), new CharsRef("syn1"), true);
- b.add(new CharsRef("baz"), new CharsRef("syn2\u0000syn3"), true);
- SynonymMap synonymMap = b.build();
- Analyzer synonymsAnalyzer =
- new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new WhitespaceTokenizer();
- TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
- return new TokenStreamComponents(tokenizer, tokenStream);
- }
- };
- fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS_OFFS, synonymsAnalyzer);
- fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS, synonymsAnalyzer);
- } catch (IOException e) {
- throw new UncheckedIOException(e);
- }
+ fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS_OFFS, synonymsAnalyzer);
+ fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS, synonymsAnalyzer);
analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), fieldAnalyzers);
}
@@ -184,13 +152,12 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
private void checkTermQuery(String field) throws IOException {
- withReader(
- List.of(
- Map.of(field, values("foo bar baz")),
- Map.of(field, values("bar foo baz")),
- Map.of(field, values("bar baz foo")),
- Map.of(field, values("bar bar bar irrelevant"))),
- reader -> {
+ new IndexBuilder(this::toField)
+ .doc(field, "foo bar baz")
+ .doc(field, "bar foo baz")
+ .doc(field, "bar baz foo")
+ .doc(field, "bar bar bar irrelevant")
+ .build(analyzer, reader -> {
assertThat(highlights(reader, new TermQuery(new Term(field, "foo"))),
containsInAnyOrder(
fmt("0: (%s: '>foo< bar baz')", field),
@@ -217,17 +184,17 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
.add(new TermQuery(new Term(field, "xyz")), BooleanClause.Occur.MUST_NOT)
.build();
- withReader(
- List.of(
- Map.of(field, values("foo bar baz abc")),
- Map.of(field, values("bar foo baz def")),
- Map.of(field, values("bar baz foo xyz"))),
- reader -> {
+ new IndexBuilder(this::toField)
+ .doc(field, "foo bar baz abc")
+ .doc(field, "bar foo baz def")
+ .doc(field, "bar baz foo xyz")
+ .build(analyzer, reader -> {
assertThat(highlights(reader, query),
containsInAnyOrder(
fmt("0: (%s: '>foo bar baz< abc')", field),
fmt("1: (%s: 'bar >foo baz< def')", field)));
- });
+ }
+ );
}
@Test
@@ -241,12 +208,11 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
private void checkVariousQueryTypes(String field) throws IOException {
- withReader(
- List.of(
- Map.of(field, values("foo bar baz abc")),
- Map.of(field, values("bar foo baz def")),
- Map.of(field, values("bar baz foo xyz"))),
- reader -> {
+ new IndexBuilder(this::toField)
+ .doc(field, "foo bar baz abc")
+ .doc(field, "bar foo baz def")
+ .doc(field, "bar baz foo xyz")
+ .build(analyzer, reader -> {
assertThat(highlights(reader, stdQueryParser.apply("foo baz", field)),
containsInAnyOrder(
fmt("0: (%s: '>foo< bar >baz< abc')", field),
@@ -297,31 +263,31 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
assertThat(highlights(reader, new MatchAllDocsQuery()),
Matchers.hasSize(0));
- });
+ }
+ );
- withReader(
- List.of(
- Map.of(field, values("foo baz foo")),
- Map.of(field, values("bas baz foo")),
- Map.of(field, values("bar baz foo xyz"))),
- reader -> {
+ new IndexBuilder(this::toField)
+ .doc(field, "foo baz foo")
+ .doc(field, "bas baz foo")
+ .doc(field, "bar baz foo xyz")
+ .build(analyzer, reader -> {
assertThat(
highlights(reader, stdQueryParser.apply("[bar TO baz] -bar", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo >baz< foo')", field), fmt("1: (%s: '>bas< >baz< foo')", field)));
- });
+ }
+ );
}
@Test
public void testIntervalQueries() throws IOException {
String field = FLD_TEXT_POS_OFFS;
- withReader(
- List.of(
- Map.of(field, values("foo baz foo")),
- Map.of(field, values("bas baz foo")),
- Map.of(field, values("bar baz foo xyz"))),
- reader -> {
+ new IndexBuilder(this::toField)
+ .doc(field, "foo baz foo")
+ .doc(field, "bas baz foo")
+ .doc(field, "bar baz foo xyz")
+ .build(analyzer, reader -> {
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.unordered(
@@ -374,7 +340,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
containsInAnyOrder(
fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)
));
- });
+ }
+ );
}
@Test
@@ -388,36 +355,37 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
public void checkMultivaluedFields(String field) throws IOException {
- withReader(
- List.of(
- Map.of(field, values("foo bar", "baz abc", "bad baz")),
- Map.of(field, values("bar foo", "baz def")),
- Map.of(field, values("bar baz", "foo xyz"))),
- reader -> {
+ new IndexBuilder(this::toField)
+ .doc(field, "foo bar", "baz abc", "bad baz")
+ .doc(field, "bar foo", "baz def")
+ .doc(field, "bar baz", "foo xyz")
+ .build(analyzer, reader -> {
assertThat(highlights(reader, stdQueryParser.apply("baz", field)),
containsInAnyOrder(
fmt("0: (%s: '>baz< abc | bad >baz<')", field),
fmt("1: (%s: '>baz< def')", field),
fmt("2: (%s: 'bar >baz<')", field)));
- });
+ }
+ );
}
@Test
public void testMultiFieldHighlights() throws IOException {
- for (String[] fields :
+ for (String[] fieldPairs :
new String[][]{
{FLD_TEXT_POS_OFFS1, FLD_TEXT_POS_OFFS2},
{FLD_TEXT_POS, FLD_TEXT_POS_OFFS2},
{FLD_TEXT_POS_OFFS1, FLD_TEXT_POS}
}) {
- String field1 = fields[0];
- String field2 = fields[1];
- withReader(
- List.of(
- Map.of(
- field1, values("foo bar", "baz abc"),
- field2, values("foo baz", "loo bar"))),
- reader -> {
+ String field1 = fieldPairs[0];
+ String field2 = fieldPairs[1];
+
+ new IndexBuilder(this::toField)
+ .doc(fields -> {
+ fields.add(field1, "foo bar", "baz abc");
+ fields.add(field2, "foo baz", "loo bar");
+ })
+ .build(analyzer, reader -> {
String ordered =
Stream.of(fmt("(%s: '>baz< abc')", field1), fmt("(%s: 'loo >bar<')", field2))
.sorted()
@@ -428,7 +396,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
reader,
stdQueryParser.apply(field1 + ":baz" + " OR " + field2 + ":bar", field1)),
containsInAnyOrder(fmt("0: %s", ordered)));
- });
+ }
+ );
}
}
@@ -440,15 +409,17 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
public void testNoRewrite() throws IOException {
String field1 = FLD_TEXT_POS_OFFS1;
String field2 = FLD_TEXT_POS_OFFS2;
- withReader(
- List.of(
- Map.of(
- field1, values("0100"),
- field2, values("loo bar")),
- Map.of(
- field1, values("0200"),
- field2, values("foo bar"))),
- reader -> {
+
+ new IndexBuilder(this::toField)
+ .doc(fields -> {
+ fields.add(field1, "0100");
+ fields.add(field2, "loo bar");
+ })
+ .doc(fields -> {
+ fields.add(field1, "0200");
+ fields.add(field2, "foo bar");
+ })
+ .build(analyzer, reader -> {
String expected = fmt("0: (%s: '>0100<')(%s: 'loo >bar<')", field1, field2);
assertThat(
highlights(
@@ -461,7 +432,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
reader,
stdQueryParser.apply(fmt("+%s:01* AND %s:bar", field1, field2), field1)),
containsInAnyOrder(expected));
- });
+ }
+ );
}
@Test
@@ -475,9 +447,9 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
public void checkNestedQueryHits(String field) throws IOException {
- withReader(
- List.of(Map.of(field, values("foo bar baz abc"))),
- reader -> {
+ new IndexBuilder(this::toField)
+ .doc(field, "foo bar baz abc")
+ .build(analyzer, reader -> {
assertThat(
highlights(
reader,
@@ -496,7 +468,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
.add(new TermQuery(new Term(field, "baz")), BooleanClause.Occur.SHOULD)
.build()),
containsInAnyOrder(fmt("0: (%s: '>foo >bar< >baz<< abc')", field)));
- });
+ }
+ );
}
@Test
@@ -510,13 +483,12 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
private void checkGraphQuery(String field) throws IOException {
- withReader(
- List.of(
- Map.of(field, values("foo bar baz")),
- Map.of(field, values("bar foo baz")),
- Map.of(field, values("bar baz foo")),
- Map.of(field, values("bar bar bar irrelevant"))),
- reader -> {
+ new IndexBuilder(this::toField)
+ .doc(field, "foo bar baz")
+ .doc(field, "bar foo baz")
+ .doc(field, "bar baz foo")
+ .doc(field, "bar bar bar irrelevant")
+ .build(analyzer, reader -> {
assertThat(highlights(reader, new TermQuery(new Term(field, "syn1"))),
containsInAnyOrder(fmt("0: (%s: '>foo bar< baz')", field)));
@@ -536,7 +508,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
assertThat(
highlights(reader, stdQueryParser.apply(field + ":\"foo syn2 syn3\"", field)),
containsInAnyOrder(fmt("1: (%s: 'bar >foo baz<')", field)));
- });
+ }
+ );
}
@Test
@@ -550,13 +523,12 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
private void checkSpanQueries(String field) throws IOException {
- withReader(
- List.of(
- Map.of(field, values("foo bar baz")),
- Map.of(field, values("bar foo baz")),
- Map.of(field, values("bar baz foo")),
- Map.of(field, values("bar bar bar irrelevant"))),
- reader -> {
+ new IndexBuilder(this::toField)
+ .doc(field, "foo bar baz")
+ .doc(field, "bar foo baz")
+ .doc(field, "bar baz foo")
+ .doc(field, "bar bar bar irrelevant")
+ .build(analyzer, reader -> {
assertThat(
highlights(
reader,
@@ -598,7 +570,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
fmt("0: (%s: '>foo bar< baz')", field),
fmt("1: (%s: '>bar foo< baz')", field),
fmt("2: (%s: '>bar baz foo<')", field)));
- });
+ }
+ );
}
/**
@@ -610,12 +583,10 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
public void testTextFieldNoPositionsOffsetFromValues() throws Exception {
String field = FLD_TEXT_NOPOS;
- withReader(
- List.of(
- Map.of(FLD_TEXT_NOPOS, values("foo bar")),
- Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz baz"))
- ),
- reader -> {
+ new IndexBuilder(this::toField)
+ .doc(FLD_TEXT_NOPOS, "foo bar")
+ .doc(FLD_TEXT_NOPOS, "foo bar", "baz baz")
+ .build(analyzer, reader -> {
OffsetsRetrievalStrategySupplier defaults = MatchRegionRetriever
.computeOffsetRetrievalStrategies(reader, analyzer);
OffsetsRetrievalStrategySupplier customSuppliers = (fld) -> {
@@ -634,7 +605,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
containsInAnyOrder(
fmt("0: (%s: '>foo bar<')", field),
fmt("1: (%s: '>foo bar< | >baz baz<')", field)));
- });
+ }
+ );
}
/**
@@ -648,13 +620,13 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
public void testTextFieldNoPositionsOffsetsFromTokens() throws Exception {
String field = FLD_TEXT_NOPOS;
- withReader(
- List.of(
- Map.of(FLD_TEXT_NOPOS, values("foo bar"),
- FLD_TEXT_POS, values("bar bar")),
- Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz bar"))
- ),
- reader -> {
+ new IndexBuilder(this::toField)
+ .doc(fields -> {
+ fields.add(FLD_TEXT_NOPOS, "foo bar");
+ fields.add(FLD_TEXT_POS, "bar bar");
+ })
+ .doc(FLD_TEXT_NOPOS, "foo bar", "baz bar")
+ .build(analyzer, reader -> {
assertThat(
highlights(
reader,
@@ -662,7 +634,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
containsInAnyOrder(
fmt("0: (%s: 'foo >bar<')", field),
fmt("1: (%s: 'foo >bar< | baz >bar<')", field)));
- });
+ }
+ );
}
private List<String> highlights(IndexReader reader, Query query) throws IOException {
@@ -702,46 +675,14 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
};
- MatchRegionRetriever highlighter = new MatchRegionRetriever(searcher, rewrittenQuery, analyzer,
- offsetsStrategySupplier);
+ MatchRegionRetriever highlighter = new MatchRegionRetriever(searcher, rewrittenQuery, offsetsStrategySupplier);
highlighter.highlightDocuments(topDocs, highlightCollector);
return highlights;
}
- private String[] values(String... values) {
- assertThat(values, not(emptyArray()));
- return values;
- }
-
- private void withReader(
- Collection<Map<String, String[]>> docs, IOUtils.IOConsumer<DirectoryReader> block)
- throws IOException {
- IndexWriterConfig config = new IndexWriterConfig(analyzer);
-
- try (Directory directory = new ByteBuffersDirectory()) {
- IndexWriter iw = new IndexWriter(directory, config);
-
- int seq = 0;
- for (Map<String, String[]> fields : docs) {
- Document doc = new Document();
- doc.add(new StringField(FLD_ID, Integer.toString(seq++), Field.Store.YES));
- for (Map.Entry<String, String[]> field : fields.entrySet()) {
- for (String value : field.getValue()) {
- doc.add(toField(field.getKey(), value));
- }
- }
- iw.addDocument(doc);
- if (RandomizedTest.randomBoolean()) {
- iw.commit();
- }
- }
- iw.flush();
-
- try (DirectoryReader reader = DirectoryReader.open(iw)) {
- block.accept(reader);
- }
- }
+ private static String fmt(String string, Object... args) {
+ return String.format(Locale.ROOT, string, args);
}
private IndexableField toField(String name, String value) {
@@ -760,8 +701,4 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
throw new AssertionError("Don't know how to handle this field: " + name);
}
}
-
- private static String fmt(String string, Object... args) {
- return String.format(Locale.ROOT, string, args);
- }
}