You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by kr...@apache.org on 2016/11/18 16:42:35 UTC
[10/20] lucene-solr:jira/solr-8593: LUCENE-7559: UnifiedHighlighter:
Increase Passage visibility
LUCENE-7559: UnifiedHighlighter: Increase Passage visibility
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c51e8901
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c51e8901
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c51e8901
Branch: refs/heads/jira/solr-8593
Commit: c51e89014ac74f164e3b6962abc5f0db56ee99bb
Parents: 738198e
Author: David Smiley <ds...@apache.org>
Authored: Thu Nov 17 11:19:54 2016 -0500
Committer: David Smiley <ds...@apache.org>
Committed: Thu Nov 17 11:19:54 2016 -0500
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 ++
.../uhighlight/AnalysisOffsetStrategy.java | 3 ++
.../uhighlight/DefaultPassageFormatter.java | 14 ++++----
.../search/uhighlight/FieldHighlighter.java | 30 ++++++++--------
.../lucene/search/uhighlight/OffsetsEnum.java | 8 ++---
.../lucene/search/uhighlight/Passage.java | 38 ++++++++++++++------
.../uhighlight/TokenStreamOffsetStrategy.java | 7 ++--
.../uhighlight/TestUnifiedHighlighterMTQ.java | 14 ++++----
8 files changed, 70 insertions(+), 47 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c51e8901/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 27d6cc9..f5dfd44 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -111,6 +111,9 @@ Other
* LUCENE-7534: fix smokeTestRelease.py to run on Cygwin (Mikhail Khludnev)
+* LUCENE-7559: UnifiedHighlighter: Make Passage more exposed to allow passage creation to
+ be customized. (David Smiley)
+
Build
* LUCENE-7387: fix defaultCodec in build.xml to account for the line ending (hossman)
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c51e8901/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java
index e9db77c..162d270 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java
@@ -75,6 +75,9 @@ public abstract class AnalysisOffsetStrategy extends FieldOffsetStrategy {
*
* @lucene.internal
*/
+ // TODO we could make this go away. MemoryIndexOffsetStrategy could simply split and analyze each value into the
+ // MemoryIndex. TokenStreamOffsetStrategy's hack TokenStreamPostingsEnum could incorporate this logic,
+ // albeit with less code, less hack.
private static final class MultiValueTokenStream extends TokenFilter {
private final String fieldName;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c51e8901/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
index 77612a7..bc27a43 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
@@ -63,13 +63,13 @@ public class DefaultPassageFormatter extends PassageFormatter {
int pos = 0;
for (Passage passage : passages) {
// don't add ellipsis if its the first one, or if its connected.
- if (passage.startOffset > pos && pos > 0) {
+ if (passage.getStartOffset() > pos && pos > 0) {
sb.append(ellipsis);
}
- pos = passage.startOffset;
- for (int i = 0; i < passage.numMatches; i++) {
- int start = passage.matchStarts[i];
- int end = passage.matchEnds[i];
+ pos = passage.getStartOffset();
+ for (int i = 0; i < passage.getNumMatches(); i++) {
+ int start = passage.getMatchStarts()[i];
+ int end = passage.getMatchEnds()[i];
// its possible to have overlapping terms
if (start > pos) {
append(sb, content, pos, start);
@@ -82,8 +82,8 @@ public class DefaultPassageFormatter extends PassageFormatter {
}
}
// its possible a "term" from the analyzer could span a sentence boundary.
- append(sb, content, pos, Math.max(pos, passage.endOffset));
- pos = passage.endOffset;
+ append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
+ pos = passage.getEndOffset();
}
return sb.toString();
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c51e8901/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java
index 7c8f048..1caa739 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java
@@ -117,9 +117,9 @@ public class FieldHighlighter {
break;
}
Passage passage = new Passage();
- passage.score = Float.NaN;
- passage.startOffset = pos;
- passage.endOffset = next;
+ passage.setScore(Float.NaN);
+ passage.setStartOffset(pos);
+ passage.setEndOffset(next);
passages.add(passage);
pos = next;
}
@@ -145,12 +145,12 @@ public class FieldHighlighter {
offsetsEnumQueue.add(new OffsetsEnum(null, EMPTY)); // a sentinel for termination
PriorityQueue<Passage> passageQueue = new PriorityQueue<>(Math.min(64, maxPassages + 1), (left, right) -> {
- if (left.score < right.score) {
+ if (left.getScore() < right.getScore()) {
return -1;
- } else if (left.score > right.score) {
+ } else if (left.getScore() > right.getScore()) {
return 1;
} else {
- return left.startOffset - right.startOffset;
+ return left.getStartOffset() - right.getStartOffset();
}
});
Passage passage = new Passage(); // the current passage in-progress. Will either get reset or added to queue.
@@ -170,12 +170,12 @@ public class FieldHighlighter {
continue;
}
// See if this term should be part of a new passage.
- if (start >= passage.endOffset) {
- if (passage.startOffset >= 0) { // true if this passage has terms; otherwise couldn't find any (yet)
+ if (start >= passage.getEndOffset()) {
+ if (passage.getStartOffset() >= 0) { // true if this passage has terms; otherwise couldn't find any (yet)
// finalize passage
- passage.score *= scorer.norm(passage.startOffset);
+ passage.setScore(passage.getScore() * scorer.norm(passage.getStartOffset()));
// new sentence: first add 'passage' to queue
- if (passageQueue.size() == maxPassages && passage.score < passageQueue.peek().score) {
+ if (passageQueue.size() == maxPassages && passage.getScore() < passageQueue.peek().getScore()) {
passage.reset(); // can't compete, just reset it
} else {
passageQueue.offer(passage);
@@ -192,8 +192,8 @@ public class FieldHighlighter {
break;
}
// advance breakIterator
- passage.startOffset = Math.max(breakIterator.preceding(start + 1), 0);
- passage.endOffset = Math.min(breakIterator.following(start), contentLength);
+ passage.setStartOffset(Math.max(breakIterator.preceding(start + 1), 0));
+ passage.setEndOffset(Math.min(breakIterator.following(start), contentLength));
}
// Add this term to the passage.
int tf = 0;
@@ -209,12 +209,12 @@ public class FieldHighlighter {
off.nextPosition();
start = off.startOffset();
end = off.endOffset();
- if (start >= passage.endOffset || end > contentLength) { // it's beyond this passage
+ if (start >= passage.getEndOffset() || end > contentLength) { // it's beyond this passage
offsetsEnumQueue.offer(off);
break;
}
}
- passage.score += off.weight * scorer.tf(tf, passage.endOffset - passage.startOffset);
+ passage.setScore(passage.getScore() + off.weight * scorer.tf(tf, passage.getEndOffset() - passage.getStartOffset()));
}
Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]);
@@ -222,7 +222,7 @@ public class FieldHighlighter {
p.sort();
}
// sort in ascending order
- Arrays.sort(passages, (left, right) -> left.startOffset - right.startOffset);
+ Arrays.sort(passages, (left, right) -> left.getStartOffset() - right.getStartOffset());
return passages;
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c51e8901/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
index cbaeb90..db1ea1f 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
@@ -66,9 +66,8 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
}
BytesRef getTerm() throws IOException {
- // the dp.getPayload thing is a hack -- see MultiTermHighlighting
- return term != null ? term : postingsEnum.getPayload();
- // We don't deepcopy() because in this hack we know we don't have to.
+ // TODO TokenStreamOffsetStrategy could override OffsetsEnum; then remove this hack here
+ return term != null ? term : postingsEnum.getPayload(); // abusing payload like this is a total hack!
}
boolean hasMorePositions() throws IOException {
@@ -91,7 +90,8 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
@Override
public void close() throws IOException {
- if (postingsEnum instanceof Closeable) { // the one in MultiTermHighlighting is.
+ // TODO TokenStreamOffsetStrategy could override OffsetsEnum; then this base impl would be no-op.
+ if (postingsEnum instanceof Closeable) {
((Closeable) postingsEnum).close();
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c51e8901/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
index de37d5d..a131d86 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
@@ -30,16 +30,17 @@ import org.apache.lucene.util.RamUsageEstimator;
*
* @lucene.experimental
*/
-public final class Passage {
- int startOffset = -1;
- int endOffset = -1;
- float score = 0.0f;
+public class Passage {
+ private int startOffset = -1;
+ private int endOffset = -1;
+ private float score = 0.0f;
- int matchStarts[] = new int[8];
- int matchEnds[] = new int[8];
- BytesRef matchTerms[] = new BytesRef[8];
- int numMatches = 0;
+ private int[] matchStarts = new int[8];
+ private int[] matchEnds = new int[8];
+ private BytesRef[] matchTerms = new BytesRef[8];
+ private int numMatches = 0;
+ /** @lucene.internal */
public void addMatch(int startOffset, int endOffset, BytesRef term) {
assert startOffset >= this.startOffset && startOffset <= this.endOffset;
if (numMatches == matchStarts.length) {
@@ -61,7 +62,8 @@ public final class Passage {
numMatches++;
}
- void sort() {
+ /** @lucene.internal */
+ public void sort() {
final int starts[] = matchStarts;
final int ends[] = matchEnds;
final BytesRef terms[] = matchTerms;
@@ -89,7 +91,8 @@ public final class Passage {
}.sort(0, numMatches);
}
- void reset() {
+ /** @lucene.internal */
+ public void reset() {
startOffset = endOffset = -1;
score = 0.0f;
numMatches = 0;
@@ -158,4 +161,19 @@ public final class Passage {
public BytesRef[] getMatchTerms() {
return matchTerms;
}
+
+ /** @lucene.internal */
+ public void setStartOffset(int startOffset) {
+ this.startOffset = startOffset;
+ }
+
+ /** @lucene.internal */
+ public void setEndOffset(int endOffset) {
+ this.endOffset = endOffset;
+ }
+
+ /** @lucene.internal */
+ public void setScore(float score) {
+ this.score = score;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c51e8901/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
index 966eeef..28eb6b1 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
@@ -69,10 +69,8 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
return Collections.singletonList(new OffsetsEnum(null, mtqPostingsEnum));
}
- // but this would have a performance cost for likely little gain in the user experience, it
- // would only serve to make this method less bogus.
- // instead, we always return freq() = Integer.MAX_VALUE and let the highlighter terminate based on offset...
- // TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?
+ // See class javadocs.
+ // TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl? See TODOs in OffsetsEnum.
private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
TokenStream stream; // becomes null when closed
final CharacterRunAutomaton[] matchers;
@@ -134,6 +132,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
return currentEndOffset;
}
+ // TOTAL HACK; used in OffsetsEnum.getTerm()
@Override
public BytesRef getPayload() throws IOException {
if (matchDescriptions[currentMatch] == null) {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c51e8901/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
index be0ff1b..10f36a7 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
@@ -697,13 +697,13 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
int pos = 0;
for (Passage passage : passages) {
// don't add ellipsis if its the first one, or if its connected.
- if (passage.startOffset > pos && pos > 0) {
+ if (passage.getStartOffset() > pos && pos > 0) {
sb.append("... ");
}
- pos = passage.startOffset;
- for (int i = 0; i < passage.numMatches; i++) {
- int start = passage.matchStarts[i];
- int end = passage.matchEnds[i];
+ pos = passage.getStartOffset();
+ for (int i = 0; i < passage.getNumMatches(); i++) {
+ int start = passage.getMatchStarts()[i];
+ int end = passage.getMatchEnds()[i];
// its possible to have overlapping terms
if (start > pos) {
sb.append(content, pos, start);
@@ -719,8 +719,8 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
}
}
// its possible a "term" from the analyzer could span a sentence boundary.
- sb.append(content, pos, Math.max(pos, passage.endOffset));
- pos = passage.endOffset;
+ sb.append(content, pos, Math.max(pos, passage.getEndOffset()));
+ pos = passage.getEndOffset();
}
return sb.toString();
}