Posted to commits@lucene.apache.org by ro...@apache.org on 2022/09/24 10:37:33 UTC

[lucene] branch branch_9x updated: Don't try to highlight very long terms (#11808)

This is an automated email from the ASF dual-hosted git repository.

romseygeek pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new 8f41b3fe853 Don't try to highlight very long terms (#11808)
8f41b3fe853 is described below

commit 8f41b3fe8538cb1ca825425bacda770e5881ce38
Author: Alan Woodward <ro...@apache.org>
AuthorDate: Sat Sep 24 11:26:16 2022 +0100

    Don't try to highlight very long terms (#11808)
    
    The UnifiedHighlighter can throw exceptions when highlighting terms that are longer
    than the maximum size the DaciukMihovAutomatonBuilder accepts. Rather than throwing
    a confusing exception, we can instead filter out the long terms when building the
    MemoryIndexOffsetStrategy. Very long terms are likely to be junk input in any case.
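For reference, a minimal standalone sketch of the filtering idea, not part of the patch: the class name LongTermFilterSketch and the sample terms are illustrative only, and MAX_TERM_LENGTH is only accessible from outside the package once the first hunk below makes it public. It mirrors the length check the patch adds to MemoryIndexOffsetStrategy.

    import java.util.Arrays;
    import java.util.List;
    import java.util.stream.Collectors;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;

    public class LongTermFilterSketch {
      public static void main(String[] args) {
        // One ordinary term plus one term longer than the automaton builder's limit.
        BytesRef[] extracted = {
          new BytesRef("title"),
          new BytesRef("a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH + 1))
        };

        // The same filter the patch applies before building the term automaton:
        // drop terms the builder would reject instead of letting it throw.
        List<BytesRef> filteredTerms =
            Arrays.stream(extracted)
                .filter(b -> b.length < DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH)
                .collect(Collectors.toList());

        System.out.println(filteredTerms.size()); // 1 -- only the short term survives
      }
    }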
---
 .../automaton/DaciukMihovAutomatonBuilder.java     |  2 +-
 .../uhighlight/MemoryIndexOffsetStrategy.java      | 11 +++++++-
 .../search/uhighlight/TestUnifiedHighlighter.java  | 29 ++++++++++++++++++++++
 3 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
index 26952b678e3..d96a88904d0 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
@@ -35,7 +35,7 @@ public final class DaciukMihovAutomatonBuilder {
    * This builder rejects terms that are more than 1k chars long since it then uses recursion based
    * on the length of the string, which might cause stack overflows.
    */
-  static final int MAX_TERM_LENGTH = 1_000;
+  public static final int MAX_TERM_LENGTH = 1_000;
 
   /** The default constructor is private. Use static methods directly. */
   private DaciukMihovAutomatonBuilder() {
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
index 79dbcbea44b..aad68190c64 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
@@ -21,6 +21,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
+import java.util.stream.Collectors;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.FilteringTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -28,6 +29,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.queries.spans.SpanQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
 
 /**
  * Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
@@ -60,7 +63,13 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
 
     List<CharArrayMatcher> allAutomata = new ArrayList<>();
     if (components.getTerms().length > 0) {
-      allAutomata.add(CharArrayMatcher.fromTerms(Arrays.asList(components.getTerms())));
+      // Filter out any long terms that would otherwise cause exceptions if we tried
+      // to build an automaton on them
+      List<BytesRef> filteredTerms =
+          Arrays.stream(components.getTerms())
+              .filter(b -> b.length < DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH)
+              .collect(Collectors.toList());
+      allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms));
     }
     Collections.addAll(allAutomata, components.getAutomata());
     for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) {
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
index 482644d4243..6bad8ddbe5e 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
@@ -58,6 +58,7 @@ import org.apache.lucene.tests.analysis.MockAnalyzer;
 import org.apache.lucene.tests.analysis.MockTokenizer;
 import org.apache.lucene.tests.index.RandomIndexWriter;
 import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
 import org.junit.After;
 import org.junit.Before;
 
@@ -1659,4 +1660,32 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
 
     ir.close();
   }
+
+  public void testQueryWithLongTerm() throws IOException {
+    IndexReader ir = indexSomeFields();
+    IndexSearcher searcher = newSearcher(ir);
+    UnifiedHighlighter highlighter =
+        randomUnifiedHighlighter(
+            searcher, indexAnalyzer, EnumSet.of(HighlightFlag.WEIGHT_MATCHES), true);
+
+    Query query =
+        new BooleanQuery.Builder()
+            .add(
+                new TermQuery(
+                    new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH))),
+                BooleanClause.Occur.SHOULD)
+            .add(
+                new TermQuery(
+                    new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH + 1))),
+                BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD)
+            .build();
+
+    TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+
+    String[] snippets = highlighter.highlight("title", query, topDocs);
+    assertArrayEquals(new String[] {"This is the <b>title</b> field."}, snippets);
+
+    ir.close();
+  }
 }
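
For completeness, a caller-level sketch of the behaviour the new test exercises. This is not part of the commit: the class name LongTermHighlightSketch, the throwaway in-memory index, and the field content are illustrative assumptions, and the exact snippet text depends on the analyzer. It shows a query mixing a normal term with one longer than the automaton builder's limit; with this change such terms are filtered out when the highlighter builds its term automata rather than surfacing a confusing exception.

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.BooleanClause;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
    import org.apache.lucene.store.ByteBuffersDirectory;
    import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;

    public class LongTermHighlightSketch {
      public static void main(String[] args) throws Exception {
        StandardAnalyzer analyzer = new StandardAnalyzer();
        ByteBuffersDirectory dir = new ByteBuffersDirectory();

        // Index a single stored text field so the highlighter can re-analyze it.
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
          Document doc = new Document();
          doc.add(new TextField("title", "This is the title field.", Field.Store.YES));
          writer.addDocument(doc);
        }

        try (DirectoryReader reader = DirectoryReader.open(dir)) {
          IndexSearcher searcher = new IndexSearcher(reader);
          UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);

          // One SHOULD clause is an ordinary term; the other is far longer than
          // DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH and matches nothing.
          Query query =
              new BooleanQuery.Builder()
                  .add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD)
                  .add(
                      new TermQuery(
                          new Term(
                              "title",
                              "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH + 1))),
                      BooleanClause.Occur.SHOULD)
                  .build();

          TopDocs topDocs = searcher.search(query, 10);
          String[] snippets = highlighter.highlight("title", query, topDocs);
          System.out.println(snippets[0]); // expected to contain "<b>title</b>"
        }
      }
    }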