You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2018/08/13 19:18:09 UTC
lucene-solr:master: LUCENE-8446: DefaultPassageFormatter: merge overlapping matches
Repository: lucene-solr
Updated Branches:
refs/heads/master bb5816357 -> 8d3f59a47
LUCENE-8446: DefaultPassageFormatter: merge overlapping matches
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/8d3f59a4
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/8d3f59a4
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/8d3f59a4
Branch: refs/heads/master
Commit: 8d3f59a47f2a4d6e53ef352e9ce436553f617070
Parents: bb58163
Author: David Smiley <ds...@apache.org>
Authored: Mon Aug 13 15:18:03 2018 -0400
Committer: David Smiley <ds...@apache.org>
Committed: Mon Aug 13 15:18:03 2018 -0400
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +++
.../uhighlight/DefaultPassageFormatter.java | 25 +++++++++++++-------
.../lucene/search/uhighlight/OffsetsEnum.java | 4 ++++
3 files changed, 23 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8d3f59a4/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 1b09260..5ae0111 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -243,6 +243,9 @@ Improvements
* LUCENE-8414: Make segmentInfos private in IndexWriter (Simon Willnauer, Nhat Nguyen)
+* LUCENE-8446: The UnifiedHighlighter's DefaultPassageFormatter now treats overlapping matches in
+ the passage as merged (as if one larger match). (David Smiley)
+
Other:
* LUCENE-8366: Upgrade to ICU 62.1. Emoji handling now uses Unicode 11's
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8d3f59a4/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
index 62d58df..8d2a424 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
@@ -70,17 +70,24 @@ public class DefaultPassageFormatter extends PassageFormatter {
pos = passage.getStartOffset();
for (int i = 0; i < passage.getNumMatches(); i++) {
int start = passage.getMatchStarts()[i];
+ assert start >= pos && start < passage.getEndOffset();
+ //append content before this start
+ append(sb, content, pos, start);
+
int end = passage.getMatchEnds()[i];
- // its possible to have overlapping terms
- if (start > pos) {
- append(sb, content, pos, start);
- }
- if (end > pos) {
- sb.append(preTag);
- append(sb, content, Math.max(pos, start), end);
- sb.append(postTag);
- pos = end;
+ assert end > start;
+ // its possible to have overlapping terms.
+ // Look ahead to expand 'end' past all overlapping:
+ while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i+1] < end) {
+ end = passage.getMatchEnds()[++i];
}
+ end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage
+
+ sb.append(preTag);
+ append(sb, content, start, end);
+ sb.append(postTag);
+
+ pos = end;
}
// its possible a "term" from the analyzer could span a sentence boundary.
append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8d3f59a4/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
index 55f3c37..bdabcc1 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
@@ -46,6 +46,10 @@ public abstract class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable
if (cmp != 0) {
return cmp; // vast majority of the time we return here.
}
+ cmp = Integer.compare(endOffset(), other.endOffset());
+ if (cmp != 0) {
+ return cmp;
+ }
final BytesRef thisTerm = this.getTerm();
final BytesRef otherTerm = other.getTerm();
if (thisTerm == null || otherTerm == null) {