You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2018/08/13 19:18:09 UTC

lucene-solr:master: LUCENE-8446: DefaultPassageFormatter: merge overlapping matches

Repository: lucene-solr
Updated Branches:
  refs/heads/master bb5816357 -> 8d3f59a47


LUCENE-8446: DefaultPassageFormatter: merge overlapping matches


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/8d3f59a4
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/8d3f59a4
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/8d3f59a4

Branch: refs/heads/master
Commit: 8d3f59a47f2a4d6e53ef352e9ce436553f617070
Parents: bb58163
Author: David Smiley <ds...@apache.org>
Authored: Mon Aug 13 15:18:03 2018 -0400
Committer: David Smiley <ds...@apache.org>
Committed: Mon Aug 13 15:18:03 2018 -0400

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  3 +++
 .../uhighlight/DefaultPassageFormatter.java     | 25 +++++++++++++-------
 .../lucene/search/uhighlight/OffsetsEnum.java   |  4 ++++
 3 files changed, 23 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8d3f59a4/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 1b09260..5ae0111 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -243,6 +243,9 @@ Improvements
 
 * LUCENE-8414: Make segmentInfos private in IndexWriter (Simon Willnauer, Nhat Nguyen)
 
+* LUCENE-8446: The UnifiedHighlighter's DefaultPassageFormatter now merges overlapping matches in
+  a passage, highlighting them as if they were one larger match.  (David Smiley)
+
 Other:
 
 * LUCENE-8366: Upgrade to ICU 62.1. Emoji handling now uses Unicode 11's

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8d3f59a4/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
index 62d58df..8d2a424 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
@@ -70,17 +70,24 @@ public class DefaultPassageFormatter extends PassageFormatter {
       pos = passage.getStartOffset();
       for (int i = 0; i < passage.getNumMatches(); i++) {
         int start = passage.getMatchStarts()[i];
+        assert start >= pos && start < passage.getEndOffset();
+        //append content before this start
+        append(sb, content, pos, start);
+
         int end = passage.getMatchEnds()[i];
-        // its possible to have overlapping terms
-        if (start > pos) {
-          append(sb, content, pos, start);
-        }
-        if (end > pos) {
-          sb.append(preTag);
-          append(sb, content, Math.max(pos, start), end);
-          sb.append(postTag);
-          pos = end;
+        assert end > start;
+        // it's possible to have overlapping terms.
+        //   Look ahead to expand 'end' past all overlapping matches:
+        while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i+1] < end) {
+          end = passage.getMatchEnds()[++i];
         }
+        end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage
+
+        sb.append(preTag);
+        append(sb, content, start, end);
+        sb.append(postTag);
+
+        pos = end;
       }
       // its possible a "term" from the analyzer could span a sentence boundary.
       append(sb, content, pos, Math.max(pos, passage.getEndOffset()));

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8d3f59a4/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
index 55f3c37..bdabcc1 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
@@ -46,6 +46,10 @@ public abstract class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable
       if (cmp != 0) {
         return cmp; // vast majority of the time we return here.
       }
+      cmp = Integer.compare(endOffset(), other.endOffset());
+      if (cmp != 0) {
+        return cmp;
+      }
       final BytesRef thisTerm = this.getTerm();
       final BytesRef otherTerm = other.getTerm();
       if (thisTerm == null || otherTerm == null) {