You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2021/10/23 00:40:27 UTC

[lucene] branch main updated: LUCENE-9431: UnifiedHighlighter WEIGHT_MATCHES is now true by default (#362)

This is an automated email from the ASF dual-hosted git repository.

dsmiley pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 2719cf6  LUCENE-9431: UnifiedHighlighter WEIGHT_MATCHES is now true by default (#362)
2719cf6 is described below

commit 2719cf6630eb2bd7cb37d0e8462dc912d8fafd83
Author: David Smiley <ds...@apache.org>
AuthorDate: Fri Oct 22 20:40:22 2021 -0400

    LUCENE-9431: UnifiedHighlighter WEIGHT_MATCHES is now true by default (#362)
    
    Co-authored-by: Animesh Pandey <ap...@gmail.com>
---
 lucene/CHANGES.txt                                 |  3 ++
 .../search/uhighlight/UnifiedHighlighter.java      | 16 ++++--
 .../search/uhighlight/TestUnifiedHighlighter.java  | 19 +++++++
 .../uhighlight/TestUnifiedHighlighterMTQ.java      |  9 +++-
 .../TestUnifiedHighlighterTermIntervals.java       | 58 ++++------------------
 5 files changed, 51 insertions(+), 54 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index b2ac0cb..3cf94ac 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -160,6 +160,9 @@ API Changes
 
 * LUCENE-9325: Sort is now final, and the `setSort()` method has been removed (Alan Woodward)
 
+* LUCENE-9431: The UnifiedHighlighter's WEIGHT_MATCHES flag is now set by default, provided its
+  requirements are met.  It can be disabled via over-riding getFlags (Animesh Pandey, David Smiley)
+
 * LUCENE-10158: Add a new interface Unwrappable to the utils package to allow code to
   unwrap wrappers/delegators that are added by Lucene's testing framework. This will allow
   testing new MMapDirectory implementation based on JDK Project Panama. (Uwe Schindler)
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
index fb9b9ba..da3b809 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
@@ -86,7 +86,7 @@ import org.apache.lucene.util.InPlaceMergeSorter;
  *   <li>{@link #getFormatter(String)}: Customize how snippets are formatted.
  * </ul>
  *
- * <p>This is thread-safe.
+ * <p>This is thread-safe, notwithstanding the setters.
  *
  * @lucene.experimental
  */
@@ -823,6 +823,7 @@ public class UnifiedHighlighter {
     return filteredTerms.toArray(new BytesRef[filteredTerms.size()]);
   }
 
+  /** Customize the highlighting flags to use by field. */
   protected Set<HighlightFlag> getFlags(String field) {
     Set<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
     if (shouldHandleMultiTermQuery(field)) {
@@ -834,6 +835,11 @@ public class UnifiedHighlighter {
     if (shouldPreferPassageRelevancyOverSpeed(field)) {
       highlightFlags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED);
     }
+    if (highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
+        && highlightFlags.contains(HighlightFlag.PHRASES)
+        && highlightFlags.contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED)) {
+      highlightFlags.add(HighlightFlag.WEIGHT_MATCHES);
+    }
     return highlightFlags;
   }
 
@@ -1168,9 +1174,11 @@ public class UnifiedHighlighter {
 
     /**
      * Internally use the {@link Weight#matches(LeafReaderContext, int)} API for highlighting. It's
-     * more accurate to the query, though might not calculate passage relevancy as well. Use of this
-     * flag requires {@link #MULTI_TERM_QUERY} and {@link #PHRASES}. {@link
-     * #PASSAGE_RELEVANCY_OVER_SPEED} will be ignored. False by default.
+     * more accurate to the query, and the snippets can be a little different for phrases because
+     * the whole phrase is marked up instead of each word. The passage relevancy calculation can be
+     * different (maybe worse?) and it's slower when highlighting many fields. Use of this flag
+     * requires {@link #MULTI_TERM_QUERY} and {@link #PHRASES} and {@link
+     * #PASSAGE_RELEVANCY_OVER_SPEED}. True by default because those booleans are true by default.
      */
     WEIGHT_MATCHES
 
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
index 1f23c3f..12c2259 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
@@ -460,6 +460,25 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
     ir.close();
   }
 
+  public void testHighlighterDefaultFlags() throws Exception {
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);
+    Document document = new Document();
+    document.add(new Field("body", "test body", fieldType));
+    iw.addDocument(document);
+    IndexReader ir = iw.getReader();
+    iw.close();
+    IndexSearcher searcher = newSearcher(ir);
+    UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
+    Set<HighlightFlag> flags = highlighter.getFlags("body");
+    assertTrue(flags.contains(HighlightFlag.PHRASES));
+    assertTrue(flags.contains(HighlightFlag.MULTI_TERM_QUERY));
+    assertTrue(flags.contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED));
+    assertTrue(flags.contains(HighlightFlag.WEIGHT_MATCHES));
+    // if more flags are added, bump the number below and add an assertTrue or assertFalse above
+    assertEquals(4, HighlightFlag.values().length);
+    ir.close();
+  }
+
   public void testCuriousGeorge() throws Exception {
     String text =
         "It’s the formula for success for preschoolers—Curious George and fire trucks! "
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
index 4ddf238..b48d304 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
@@ -778,10 +778,15 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
     snippets = highlighter.highlight("body", query, topDocs);
     assertEquals(1, snippets.length);
 
+    // All flags are enabled.
     assertEquals(
-        "<b>Test(body:te*)</b> a <b>one(body:*one*)</b> <b>sentence(body:zentence~~2)</b> document.",
+        "" + highlighter.getFlags("body"),
+        HighlightFlag.values().length,
+        highlighter.getFlags("body").size());
+    assertEquals(
+        "" + highlighter.getFlags("title"),
+        "<b>Test(body:te*)</b> a <b>one(body:*one*)</b> <b>sentence(sentence)</b> document.",
         snippets[0]);
-
     ir.close();
   }
 
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermIntervals.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermIntervals.java
index 8e602b1..13588c3 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermIntervals.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermIntervals.java
@@ -27,7 +27,6 @@ import java.util.Collections;
 import java.util.EnumSet;
 import java.util.List;
 import java.util.Map;
-import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import org.apache.lucene.analysis.Analyzer;
@@ -87,47 +86,10 @@ public class TestUnifiedHighlighterTermIntervals extends LuceneTestCase {
 
   static UnifiedHighlighter randomUnifiedHighlighter(
       IndexSearcher searcher, Analyzer indexAnalyzer) {
-    return randomUnifiedHighlighter(
+    return TestUnifiedHighlighter.randomUnifiedHighlighter(
         searcher, indexAnalyzer, EnumSet.noneOf(HighlightFlag.class), null);
   }
 
-  static UnifiedHighlighter randomUnifiedHighlighter(
-      IndexSearcher searcher,
-      Analyzer indexAnalyzer,
-      EnumSet<HighlightFlag> mandatoryFlags,
-      Boolean requireFieldMatch) {
-    final UnifiedHighlighter uh =
-        new UnifiedHighlighter(searcher, indexAnalyzer) {
-          Set<HighlightFlag> flags; // consistently random set of flags for this test run
-
-          @Override
-          protected Set<HighlightFlag> getFlags(String field) {
-            if (flags != null) {
-              return flags;
-            }
-            final EnumSet<HighlightFlag> result = EnumSet.copyOf(mandatoryFlags);
-            int r = random().nextInt();
-            for (HighlightFlag highlightFlag : HighlightFlag.values()) {
-              if (((1 << highlightFlag.ordinal()) & r) == 0) {
-                result.add(highlightFlag);
-              }
-            }
-            if (result.contains(HighlightFlag.WEIGHT_MATCHES)) {
-              // these two are required for WEIGHT_MATCHES
-              result.add(HighlightFlag.MULTI_TERM_QUERY);
-              result.add(HighlightFlag.PHRASES);
-            }
-            return flags = result;
-          }
-        };
-    uh.setCacheFieldValCharsThreshold(random().nextInt(100));
-    if (requireFieldMatch == Boolean.FALSE
-        || (requireFieldMatch == null && random().nextBoolean())) {
-      uh.setFieldMatcher(f -> true); // requireFieldMatch==false
-    }
-    return uh;
-  }
-
   //
   //  Tests below were ported from the PostingsHighlighter. Possibly augmented.  Far below are newer
   // tests.
@@ -1033,15 +995,15 @@ public class TestUnifiedHighlighterTermIntervals extends LuceneTestCase {
     assertEquals(1, topDocs.totalHits.value);
     String[] snippets = highlighter.highlight("title", query, topDocs, 10);
     assertEquals(1, snippets.length);
-    if (highlighter.getFlags("title").contains(HighlightFlag.WEIGHT_MATCHES)) {
-      assertEquals(
-          "" + highlighter.getFlags("title"), "<b>This is the title field</b>.", snippets[0]);
-    } else {
-      assertEquals(
-          "" + highlighter.getFlags("title"),
-          "<b>This</b> <b>is</b> <b>the</b> title <b>field</b>.",
-          snippets[0]);
-    }
+    // All flags are enabled.
+    assertEquals(
+        "" + highlighter.getFlags("title"),
+        HighlightFlag.values().length,
+        highlighter.getFlags("title").size());
+    assertEquals(
+        "" + highlighter.getFlags("title"),
+        "<b>This</b> <b>is</b> <b>the</b> title <b>field</b>.",
+        snippets[0]);
     ir.close();
   }