You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2015/01/04 00:23:12 UTC
svn commit: r1649263 - in /lucene/dev/trunk/lucene: CHANGES.txt
highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java
Author: dsmiley
Date: Sat Jan 3 23:23:12 2015
New Revision: 1649263
URL: http://svn.apache.org/r1649263
Log:
LUCENE-6139: TokenGroup start/end offset getters should have been returning offsets of matching tokens when there are some.
Also made the Highlighter use the getters instead of direct field access.
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1649263&r1=1649262&r2=1649263&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Sat Jan 3 23:23:12 2015
@@ -408,6 +408,10 @@ Bug Fixes
* LUCENE-6152: Fix double close problems in OutputStreamIndexOutput.
(Uwe Schindler)
+
+* LUCENE-6139: Highlighter: TokenGroup start & end offset getters should have
+ been returning the offsets of just the matching tokens in the group when
+ there's a distinction. (David Smiley)
Documentation
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java?rev=1649263&r1=1649262&r2=1649263&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java Sat Jan 3 23:23:12 2015
@@ -225,12 +225,12 @@ public class Highlighter
throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
+" exceeds length of provided text sized "+text.length());
}
- if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
+ if((tokenGroup.getNumTokens() >0)&&(tokenGroup.isDistinct()))
{
//the current token is distinct from previous tokens -
// markup the cached token group info
- startOffset = tokenGroup.matchStartOffset;
- endOffset = tokenGroup.matchEndOffset;
+ startOffset = tokenGroup.getStartOffset();
+ endOffset = tokenGroup.getEndOffset();
tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
@@ -261,11 +261,11 @@ public class Highlighter
}
currentFrag.setScore(fragmentScorer.getFragmentScore());
- if(tokenGroup.numTokens>0)
+ if(tokenGroup.getNumTokens() >0)
{
//flush the accumulated text (same code as in above loop)
- startOffset = tokenGroup.matchStartOffset;
- endOffset = tokenGroup.matchEndOffset;
+ startOffset = tokenGroup.getStartOffset();
+ endOffset = tokenGroup.getEndOffset();
tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java?rev=1649263&r1=1649262&r2=1649263&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java Sat Jan 3 23:23:12 2015
@@ -24,18 +24,20 @@ import org.apache.lucene.analysis.tokena
/**
* One, or several overlapping tokens, along with the score(s) and the scope of
- * the original text
+ * the original text.
*/
public class TokenGroup {
private static final int MAX_NUM_TOKENS_PER_GROUP = 50;
- Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP];
- float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP];
- int numTokens = 0;
- int startOffset = 0;
- int endOffset = 0;
- float tot;
- int matchStartOffset, matchEndOffset;
+
+ private Token[] tokens = new Token[MAX_NUM_TOKENS_PER_GROUP];
+ private float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP];
+ private int numTokens = 0;
+ private int startOffset = 0;
+ private int endOffset = 0;
+ private float tot;
+ private int matchStartOffset;
+ private int matchEndOffset;
private OffsetAttribute offsetAtt;
private CharTermAttribute termAtt;
@@ -47,8 +49,8 @@ public class TokenGroup {
void addToken(float score) {
if (numTokens < MAX_NUM_TOKENS_PER_GROUP) {
- int termStartOffset = offsetAtt.startOffset();
- int termEndOffset = offsetAtt.endOffset();
+ final int termStartOffset = offsetAtt.startOffset();
+ final int termEndOffset = offsetAtt.endOffset();
if (numTokens == 0) {
startOffset = matchStartOffset = termStartOffset;
endOffset = matchEndOffset = termEndOffset;
@@ -58,8 +60,8 @@ public class TokenGroup {
endOffset = Math.max(endOffset, termEndOffset);
if (score > 0) {
if (tot == 0) {
- matchStartOffset = offsetAtt.startOffset();
- matchEndOffset = offsetAtt.endOffset();
+ matchStartOffset = termStartOffset;
+ matchEndOffset = termEndOffset;
} else {
matchStartOffset = Math.min(matchStartOffset, termStartOffset);
matchEndOffset = Math.max(matchEndOffset, termEndOffset);
@@ -84,15 +86,14 @@ public class TokenGroup {
numTokens = 0;
tot = 0;
}
-
- /*
- * @param index a value between 0 and numTokens -1
- * @return the "n"th token
- */
- public Token getToken(int index)
- {
- return tokens[index];
- }
+
+ /**
+ * @param index a value between 0 and numTokens -1
+ * @return the "n"th token
+ */
+ public Token getToken(int index) {
+ return tokens[index];
+ }
/**
*
@@ -104,24 +105,26 @@ public class TokenGroup {
}
/**
- * @return the end position in the original text
+ * @return the earliest start offset in the original text of a matching token in this group (score &gt; 0), or
+ * if there are none then the earliest offset of any token in the group.
*/
- public int getEndOffset() {
- return endOffset;
+ public int getStartOffset() {
+ return matchStartOffset;
}
/**
- * @return the number of tokens in this group
+ * @return the latest end offset in the original text of a matching token in this group (score &gt; 0), or
+ * if there are none then the end offset of the first token added to the group.
*/
- public int getNumTokens() {
- return numTokens;
+ public int getEndOffset() {
+ return matchEndOffset;
}
/**
- * @return the start position in the original text
+ * @return the number of tokens in this group
*/
- public int getStartOffset() {
- return startOffset;
+ public int getNumTokens() {
+ return numTokens;
}
/**
@@ -130,4 +133,5 @@ public class TokenGroup {
public float getTotalScore() {
return tot;
}
+
}