You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2014/01/11 20:23:13 UTC
svn commit: r1557439 - in /lucene/dev/trunk/lucene: ./
highlighter/src/java/org/apache/lucene/search/highlight/
highlighter/src/test/org/apache/lucene/search/highlight/
Author: mikemccand
Date: Sat Jan 11 19:23:13 2014
New Revision: 1557439
URL: http://svn.apache.org/r1557439
Log:
LUCENE-5394: Fix TokenSources.getTokenStream to return payloads if they were indexed with term vectors
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1557439&r1=1557438&r2=1557439&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Sat Jan 11 19:23:13 2014
@@ -153,6 +153,9 @@ Bug fixes
domain-only URLs that are followed by an alphanumeric character.
(Chris Geeringh, Steve Rowe)
+* LUCENE-5394: Fix TokenSources.getTokenStream to return payloads if
+ they were indexed with the term vectors. (Mike McCandless)
+
API Changes
* LUCENE-5339: The facet module was simplified/reworked to make the
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java?rev=1557439&r1=1557438&r2=1557439&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java Sat Jan 11 19:23:13 2014
@@ -20,11 +20,16 @@ package org.apache.lucene.search.highlig
* limitations under the License.
*/
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
@@ -35,10 +40,6 @@ import org.apache.lucene.index.TermsEnum
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Comparator;
-
/**
* Hides implementation issues associated with obtaining a TokenStream for use
* with the higlighter - can obtain from TermFreqVectors with offsets and
@@ -169,11 +170,14 @@ public class TokenSources {
PositionIncrementAttribute posincAtt;
+ PayloadAttribute payloadAtt;
+
StoredTokenStream(Token tokens[]) {
this.tokens = tokens;
termAtt = addAttribute(CharTermAttribute.class);
offsetAtt = addAttribute(OffsetAttribute.class);
posincAtt = addAttribute(PositionIncrementAttribute.class);
+ payloadAtt = addAttribute(PayloadAttribute.class);
}
@Override
@@ -185,6 +189,10 @@ public class TokenSources {
clearAttributes();
termAtt.setEmpty().append(token);
offsetAtt.setOffset(token.startOffset(), token.endOffset());
+ BytesRef payload = token.getPayload();
+ if (payload != null) {
+ payloadAtt.setPayload(payload);
+ }
posincAtt
.setPositionIncrement(currentToken <= 1
|| tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
@@ -192,6 +200,9 @@ public class TokenSources {
return true;
}
}
+
+ boolean hasPayloads = tpv.hasPayloads();
+
// code to reconstruct the original sequence of Tokens
TermsEnum termsEnum = tpv.iterator(null);
int totalTokens = 0;
@@ -223,6 +234,13 @@ public class TokenSources {
final Token token = new Token(term,
dpEnum.startOffset(),
dpEnum.endOffset());
+ if (hasPayloads) {
+ // Must make a deep copy of the returned payload,
+ // since D&PEnum API is allowed to re-use on every
+ // call:
+ token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
+ }
+
if (tokenPositionsGuaranteedContiguous && pos != -1) {
// We have positions stored and a guarantee that the token position
// information is contiguous
@@ -253,9 +271,11 @@ public class TokenSources {
ArrayUtil.timSort(tokensInOriginalOrder, new Comparator<Token>() {
@Override
public int compare(Token t1, Token t2) {
- if (t1.startOffset() == t2.startOffset()) return t1.endOffset()
- - t2.endOffset();
- else return t1.startOffset() - t2.startOffset();
+ if (t1.startOffset() == t2.startOffset()) {
+ return t1.endOffset() - t2.endOffset();
+ } else {
+ return t1.startOffset() - t2.startOffset();
+ }
}
});
}
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java?rev=1557439&r1=1557438&r2=1557439&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java Sat Jan 11 19:23:13 2014
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Terms;
@@ -48,6 +49,8 @@ public final class TokenStreamFromTermPo
private OffsetAttribute offsetAttribute;
+ private PayloadAttribute payloadAttribute;
+
/**
* Constructor.
*
@@ -59,7 +62,9 @@ public final class TokenStreamFromTermPo
termAttribute = addAttribute(CharTermAttribute.class);
positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
offsetAttribute = addAttribute(OffsetAttribute.class);
+ payloadAttribute = addAttribute(PayloadAttribute.class);
final boolean hasOffsets = vector.hasOffsets();
+ final boolean hasPayloads = vector.hasPayloads();
final TermsEnum termsEnum = vector.iterator(null);
BytesRef text;
DocsAndPositionsEnum dpEnum = null;
@@ -79,6 +84,13 @@ public final class TokenStreamFromTermPo
token = new Token();
token.setEmpty().append(text.utf8ToString());
}
+ if (hasPayloads) {
+ // Must make a deep copy of the returned payload,
+ // since D&PEnum API is allowed to re-use on every
+ // call:
+ token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
+ }
+
// Yes - this is the position, not the increment! This is for
// sorting. This value
// will be corrected before use.
@@ -112,6 +124,7 @@ public final class TokenStreamFromTermPo
positionIncrementAttribute.setPositionIncrement(next
.getPositionIncrement());
offsetAttribute.setOffset(next.startOffset(), next.endOffset());
+ payloadAttribute.setPayload(next.getPayload());
return true;
}
return false;
Modified: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java?rev=1557439&r1=1557438&r2=1557439&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java Sat Jan 11 19:23:13 2014
@@ -17,10 +17,14 @@ package org.apache.lucene.search.highlig
* limitations under the License.
*/
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -29,6 +33,7 @@ import org.apache.lucene.document.TextFi
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
@@ -38,10 +43,9 @@ import org.apache.lucene.search.spans.Sp
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
-import java.io.IOException;
-
// LUCENE-2874
public class TokenSourcesTest extends LuceneTestCase {
private static final String FIELD = "text";
@@ -262,7 +266,6 @@ public class TokenSourcesTest extends Lu
public void testTermVectorWithoutOffsetsThrowsException()
throws IOException, InvalidTokenOffsetsException {
- final String TEXT = "the fox did not jump";
final Directory directory = newDirectory();
final IndexWriter indexWriter = new IndexWriter(directory,
newIndexWriterConfig(TEST_VERSION_CURRENT, null));
@@ -280,8 +283,7 @@ public class TokenSourcesTest extends Lu
final IndexReader indexReader = DirectoryReader.open(directory);
try {
assertEquals(1, indexReader.numDocs());
- final TokenStream tokenStream = TokenSources
- .getTokenStream(
+ TokenSources.getTokenStream(
indexReader.getTermVector(0, FIELD),
false);
fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
@@ -295,5 +297,68 @@ public class TokenSourcesTest extends Lu
}
}
+ int curOffset;
+ /** Just make a token with the text, and set the payload
+ * to the text as well. Offsets increment "naturally". */
+ private Token getToken(String text) {
+ Token t = new Token(text, curOffset, curOffset+text.length());
+ t.setPayload(new BytesRef(text));
+ curOffset++;
+ return t;
+ }
+
+ // LUCENE-5394
+ public void testPayloads() throws Exception {
+ Directory dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+ FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
+ myFieldType.setStoreTermVectors(true);
+ myFieldType.setStoreTermVectorOffsets(true);
+ myFieldType.setStoreTermVectorPositions(true);
+ myFieldType.setStoreTermVectorPayloads(true);
+
+ curOffset = 0;
+
+ Token[] tokens = new Token[] {
+ getToken("foxes"),
+ getToken("can"),
+ getToken("jump"),
+ getToken("high")
+ };
+
+ Document doc = new Document();
+ doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
+ writer.addDocument(doc);
+
+ IndexReader reader = writer.getReader();
+ writer.close();
+ assertEquals(1, reader.numDocs());
+
+ for(int i=0;i<2;i++) {
+ // Do this twice, once passing true and then passing
+ // false: they are entirely different code paths
+ // under-the-hood:
+ TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), i == 0);
+
+ CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
+ OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
+ PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
+
+ for(Token token : tokens) {
+ assertTrue(ts.incrementToken());
+ assertEquals(token.toString(), termAtt.toString());
+ assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
+ assertEquals(token.getPayload(), payloadAtt.getPayload());
+ assertEquals(token.startOffset(), offsetAtt.startOffset());
+ assertEquals(token.endOffset(), offsetAtt.endOffset());
+ }
+
+ assertFalse(ts.incrementToken());
+ }
+
+ reader.close();
+ dir.close();
+ }
}