You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by bu...@apache.org on 2009/07/25 06:11:34 UTC
svn commit: r797715 - in /lucene/java/trunk: ./
src/java/org/apache/lucene/analysis/
src/java/org/apache/lucene/analysis/standard/
src/java/org/apache/lucene/index/ src/test/org/apache/lucene/index/
Author: buschmi
Date: Sat Jul 25 04:11:33 2009
New Revision: 797715
URL: http://svn.apache.org/viewvc?rev=797715&view=rev
Log:
LUCENE-1448: Add TokenStream.end() to perform end-of-stream operations. Fixes offset problems when multiple fields with the same name are added to a document.
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/src/java/org/apache/lucene/analysis/Analyzer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java
lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sat Jul 25 04:11:33 2009
@@ -609,6 +609,11 @@
CONSTANT_SCORE_AUTO_REWRITE tries to pick the most performant
constant-score rewrite method. (Mike McCandless)
+34. LUCENE-1448: Added TokenStream.end(), to perform end-of-stream
+ operations. This is currently used to fix offset problems when
+ multiple fields with the same name are added to a document.
+ (Mike McCandless, Mark Miller, Michael Busch)
+
Optimizations
1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Analyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Analyzer.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Analyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Analyzer.java Sat Jul 25 04:11:33 2009
@@ -24,6 +24,8 @@
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.store.AlreadyClosedException;
+import org.apache.lucene.document.Fieldable;
+
/** An Analyzer builds TokenStreams, which analyze text. It thus represents a
* policy for extracting index terms from text.
* <p>
@@ -123,6 +125,24 @@
return 0;
}
+ /**
+ * Just like {@link #getPositionIncrementGap}, except for
+ * Token offsets instead. By default this returns 1 for
+ * tokenized fields, as if the fields were joined
+ * with an extra space character, and 0 for un-tokenized
+ * fields. This method is only called if the field
+ * produced at least one token for indexing.
+ *
+ * @param field the field just indexed
+ * @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
+ */
+ public int getOffsetGap(Fieldable field) {
+ if (field.isTokenized())
+ return 1;
+ else
+ return 0;
+ }
+
/** Frees persistent resources used by this Analyzer */
public void close() {
tokenStreams.close();
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java Sat Jul 25 04:11:33 2009
@@ -36,6 +36,7 @@
public class CachingTokenFilter extends TokenFilter {
private List cache = null;
private Iterator iterator = null;
+ private AttributeSource.State finalState;
public CachingTokenFilter(TokenStream input) {
super(input);
@@ -69,6 +70,12 @@
restoreState((AttributeSource.State) iterator.next());
return true;
}
+
+ public final void end() throws IOException {
+ if (finalState != null) {
+ restoreState(finalState);
+ }
+ }
public void reset() throws IOException {
if(cache != null) {
@@ -80,6 +87,9 @@
while(input.incrementToken()) {
cache.add(captureState());
}
+ // capture final state
+ input.end();
+ finalState = captureState();
}
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java Sat Jul 25 04:11:33 2009
@@ -63,6 +63,7 @@
offset += dataLen;
dataLen = input.read(ioBuffer);
if (dataLen == -1) {
+ dataLen = 0; // so next offset += dataLen won't decrement offset
if (length > 0)
break;
else
@@ -93,6 +94,12 @@
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
return true;
}
+
+ public final void end() {
+ // set final offset
+ int finalOffset = input.correctOffset(offset);
+ offsetAtt.setOffset(finalOffset, finalOffset);
+ }
/** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
* not be overridden. Delegates to the backwards compatibility layer. */
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java Sat Jul 25 04:11:33 2009
@@ -31,6 +31,7 @@
private static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done;
+ private int finalOffset;
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
@@ -59,11 +60,17 @@
buffer = termAtt.resizeTermBuffer(1+buffer.length);
}
termAtt.setTermLength(upto);
- offsetAtt.setOffset(input.correctOffset(0), input.correctOffset(upto));
+ finalOffset = input.correctOffset(upto);
+ offsetAtt.setOffset(input.correctOffset(0), finalOffset);
return true;
}
return false;
}
+
+ public final void end() {
+ // set final offset
+ offsetAtt.setOffset(finalOffset, finalOffset);
+ }
/** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
* not be overridden. Delegates to the backwards compatibility layer. */
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java Sat Jul 25 04:11:33 2009
@@ -149,6 +149,17 @@
return false;
}
+ public final void end() throws IOException {
+ super.end();
+ AttributeSource.State finalState = captureState();
+ for (Iterator it = sinks.iterator(); it.hasNext(); ) {
+ final SinkTokenStream sink = (SinkTokenStream) ((WeakReference) it.next()).get();
+ if (sink != null) {
+ sink.setFinalState(finalState);
+ }
+ }
+ }
+
/**
* A filter that decides which {@link AttributeSource} states to store in the sink.
*/
@@ -162,6 +173,7 @@
public static final class SinkTokenStream extends TokenStream {
private final List cachedStates = new LinkedList();
+ private AttributeSource.State finalState;
private Iterator it = null;
private SinkFilter filter;
@@ -181,6 +193,10 @@
cachedStates.add(state);
}
+ private void setFinalState(AttributeSource.State finalState) {
+ this.finalState = finalState;
+ }
+
public final boolean incrementToken() throws IOException {
// lazy init the iterator
if (it == null) {
@@ -195,12 +211,18 @@
restoreState(state);
return true;
}
+
+ public final void end() throws IOException {
+ if (finalState != null) {
+ restoreState(finalState);
+ }
+ }
public final void reset() {
it = cachedStates.iterator();
}
}
-
+
private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() {
public boolean accept(AttributeSource source) {
return true;
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java Sat Jul 25 04:11:33 2009
@@ -39,6 +39,13 @@
this.input = input;
}
+ /** Performs end-of-stream operations, if any, and then calls <code>end()</code> on the
+ * input TokenStream.<p/>
+ * <b>NOTE:</b> Be sure to call <code>super.end()</code> first when overriding this method.*/
+ public void end() throws IOException {
+ input.end();
+ }
+
/** Close the input TokenStream. */
public void close() throws IOException {
input.close();
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java Sat Jul 25 04:11:33 2009
@@ -268,6 +268,22 @@
tokenWrapper.delegate = token;
return true;
}
+
+ /**
+ * This method is called by the consumer after the last token has been consumed,
+ * i.e. after {@link #incrementToken()} returned <code>false</code> (using the new TokenStream API)
+ * or after {@link #next(Token)} or {@link #next()} returned <code>null</code> (old TokenStream API).
+ * <p/>
+ * This method can be used to perform any end-of-stream operations, such as setting the final
+ * offset of a stream. The final offset of a stream might differ from the offset of the last token
+ * e.g. when one or more whitespace characters follow the last token and a {@link WhitespaceTokenizer}
+ * is used.
+ *
+ * @throws IOException
+ */
+ public void end() throws IOException {
+ // do nothing by default
+ }
/** Returns the next token in the stream, or null at EOS.
* When possible, the input Token should be used as the
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Sat Jul 25 04:11:33 2009
@@ -183,6 +183,12 @@
posIncr++;
}
}
+
+ public final void end() {
+ // set final offset
+ int finalOffset = input.correctOffset(scanner.yychar() + scanner.yylength());
+ offsetAtt.setOffset(finalOffset, finalOffset);
+ }
/** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
* not be overridden. Delegates to the backwards compatibility layer. */
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java Sat Jul 25 04:11:33 2009
@@ -74,6 +74,8 @@
// tokenized.
if (field.isIndexed() && doInvert) {
+ final boolean anyToken;
+
if (fieldState.length > 0)
fieldState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);
@@ -95,6 +97,7 @@
fieldState.offset += valueLength;
fieldState.length++;
fieldState.position++;
+ anyToken = valueLength > 0;
} else { // tokenized field
final TokenStream stream;
final TokenStream streamValue = field.tokenStreamValue();
@@ -124,6 +127,8 @@
// reset the TokenStream to the first token
stream.reset();
+ final int startLength = fieldState.length;
+
// deprecated
final boolean allowMinus1Position = docState.allowMinus1Position;
@@ -183,12 +188,18 @@
hasMoreTokens = stream.incrementToken();
}
- fieldState.offset = offsetEnd+1;
+ // trigger streams to perform end-of-stream operations
+ stream.end();
+
+ fieldState.offset += offsetAttribute.endOffset();
+ anyToken = fieldState.length > startLength;
} finally {
stream.close();
}
}
+ if (anyToken)
+ fieldState.offset += docState.analyzer.getOffsetGap(field);
fieldState.boost *= field.getBoost();
}
}
Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=797715&r1=797714&r2=797715&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java Sat Jul 25 04:11:33 2009
@@ -17,17 +17,26 @@
* limitations under the License.
*/
-import java.io.*;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.Reader;
+import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
-import java.util.List;
-import java.util.Random;
-import java.util.Map;
import java.util.HashMap;
import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CachingTokenFilter;
+import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.SinkTokenizer;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -35,8 +44,6 @@
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.IndexSearcher;
@@ -55,7 +62,6 @@
import org.apache.lucene.store.MockRAMDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.SingleInstanceLockFactory;
-import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
@@ -4150,20 +4156,238 @@
Field f = new Field("field", "abcd", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(f);
doc.add(f);
+ Field f2 = new Field("field", "", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+ doc.add(f2);
+ doc.add(f);
w.addDocument(doc);
w.close();
IndexReader r = IndexReader.open(dir);
TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
- assertEquals(2, termOffsets.length);
+
+ // Token "" occurred once
+ assertEquals(1, termOffsets.length);
+ assertEquals(8, termOffsets[0].getStartOffset());
+ assertEquals(8, termOffsets[0].getEndOffset());
+
+ // Token "abcd" occurred three times
+ termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(1);
+ assertEquals(3, termOffsets.length);
assertEquals(0, termOffsets[0].getStartOffset());
assertEquals(4, termOffsets[0].getEndOffset());
assertEquals(4, termOffsets[1].getStartOffset());
assertEquals(8, termOffsets[1].getEndOffset());
+ assertEquals(8, termOffsets[2].getStartOffset());
+ assertEquals(12, termOffsets[2].getEndOffset());
+ r.close();
+ dir.close();
+ }
+
+ // LUCENE-1442
+ public void testDoubleOffsetCounting2() throws Exception {
+ MockRAMDirectory dir = new MockRAMDirectory();
+ IndexWriter w = new IndexWriter(dir, new SimpleAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+ Document doc = new Document();
+ Field f = new Field("field", "abcd", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+ doc.add(f);
+ doc.add(f);
+ w.addDocument(doc);
+ w.close();
+
+ IndexReader r = IndexReader.open(dir);
+ TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+ assertEquals(2, termOffsets.length);
+ assertEquals(0, termOffsets[0].getStartOffset());
+ assertEquals(4, termOffsets[0].getEndOffset());
+ assertEquals(5, termOffsets[1].getStartOffset());
+ assertEquals(9, termOffsets[1].getEndOffset());
+ r.close();
+ dir.close();
+ }
+
+ // LUCENE-1448
+ public void testEndOffsetPositionCharAnalyzer() throws Exception {
+ MockRAMDirectory dir = new MockRAMDirectory();
+ IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+ Document doc = new Document();
+ Field f = new Field("field", "abcd ", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+ doc.add(f);
+ doc.add(f);
+ w.addDocument(doc);
+ w.close();
+
+ IndexReader r = IndexReader.open(dir);
+ TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+ assertEquals(2, termOffsets.length);
+ assertEquals(0, termOffsets[0].getStartOffset());
+ assertEquals(4, termOffsets[0].getEndOffset());
+ assertEquals(8, termOffsets[1].getStartOffset());
+ assertEquals(12, termOffsets[1].getEndOffset());
+ r.close();
+ dir.close();
+ }
+
+ // LUCENE-1448
+ public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
+ MockRAMDirectory dir = new MockRAMDirectory();
+ Analyzer analyzer = new WhitespaceAnalyzer();
+ IndexWriter w = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
+ Document doc = new Document();
+ TokenStream stream = new CachingTokenFilter(analyzer.tokenStream("field", new StringReader("abcd ")));
+ Field f = new Field("field", stream, Field.TermVector.WITH_POSITIONS_OFFSETS);
+ doc.add(f);
+ doc.add(f);
+ w.addDocument(doc);
+ w.close();
+
+ IndexReader r = IndexReader.open(dir);
+ TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+ assertEquals(2, termOffsets.length);
+ assertEquals(0, termOffsets[0].getStartOffset());
+ assertEquals(4, termOffsets[0].getEndOffset());
+ assertEquals(8, termOffsets[1].getStartOffset());
+ assertEquals(12, termOffsets[1].getEndOffset());
+ r.close();
+ dir.close();
+ }
+
+ // LUCENE-1448
+ public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
+ MockRAMDirectory dir = new MockRAMDirectory();
+ Analyzer analyzer = new WhitespaceAnalyzer();
+ IndexWriter w = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
+ Document doc = new Document();
+ TeeSinkTokenFilter tee = new TeeSinkTokenFilter(analyzer.tokenStream("field", new StringReader("abcd ")));
+ TokenStream sink = tee.newSinkTokenStream();
+ Field f1 = new Field("field", tee, Field.TermVector.WITH_POSITIONS_OFFSETS);
+ Field f2 = new Field("field", sink, Field.TermVector.WITH_POSITIONS_OFFSETS);
+ doc.add(f1);
+ doc.add(f2);
+ w.addDocument(doc);
+ w.close();
+
+ IndexReader r = IndexReader.open(dir);
+ TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+ assertEquals(2, termOffsets.length);
+ assertEquals(0, termOffsets[0].getStartOffset());
+ assertEquals(4, termOffsets[0].getEndOffset());
+ assertEquals(8, termOffsets[1].getStartOffset());
+ assertEquals(12, termOffsets[1].getEndOffset());
+ r.close();
+ dir.close();
+ }
+
+ // LUCENE-1448
+ public void testEndOffsetPositionStopFilter() throws Exception {
+ MockRAMDirectory dir = new MockRAMDirectory();
+ IndexWriter w = new IndexWriter(dir, new StopAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+ Document doc = new Document();
+ Field f = new Field("field", "abcd the", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+ doc.add(f);
+ doc.add(f);
+ w.addDocument(doc);
+ w.close();
+
+ IndexReader r = IndexReader.open(dir);
+ TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
+ assertEquals(2, termOffsets.length);
+ assertEquals(0, termOffsets[0].getStartOffset());
+ assertEquals(4, termOffsets[0].getEndOffset());
+ assertEquals(9, termOffsets[1].getStartOffset());
+ assertEquals(13, termOffsets[1].getEndOffset());
+ r.close();
+ dir.close();
+ }
+
+ // LUCENE-1448
+ public void testEndOffsetPositionStandard() throws Exception {
+ MockRAMDirectory dir = new MockRAMDirectory();
+ IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+ Document doc = new Document();
+ Field f = new Field("field", "abcd the ", Field.Store.NO,
+ Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+ Field f2 = new Field("field", "crunch man", Field.Store.NO,
+ Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+ doc.add(f);
+ doc.add(f2);
+ w.addDocument(doc);
+ w.close();
+
+ IndexReader r = IndexReader.open(dir);
+ TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
+ TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
+ assertEquals(1, termOffsets.length);
+ assertEquals(0, termOffsets[0].getStartOffset());
+ assertEquals(4, termOffsets[0].getEndOffset());
+ termOffsets = tpv.getOffsets(1);
+ assertEquals(11, termOffsets[0].getStartOffset());
+ assertEquals(17, termOffsets[0].getEndOffset());
+ termOffsets = tpv.getOffsets(2);
+ assertEquals(18, termOffsets[0].getStartOffset());
+ assertEquals(21, termOffsets[0].getEndOffset());
+ r.close();
+ dir.close();
+ }
+
+ // LUCENE-1448
+ public void testEndOffsetPositionStandardEmptyField() throws Exception {
+ MockRAMDirectory dir = new MockRAMDirectory();
+ IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+ Document doc = new Document();
+ Field f = new Field("field", "", Field.Store.NO,
+ Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+ Field f2 = new Field("field", "crunch man", Field.Store.NO,
+ Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+ doc.add(f);
+ doc.add(f2);
+ w.addDocument(doc);
+ w.close();
+
+ IndexReader r = IndexReader.open(dir);
+ TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
+ TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
+ assertEquals(1, termOffsets.length);
+ assertEquals(0, termOffsets[0].getStartOffset());
+ assertEquals(6, termOffsets[0].getEndOffset());
+ termOffsets = tpv.getOffsets(1);
+ assertEquals(7, termOffsets[0].getStartOffset());
+ assertEquals(10, termOffsets[0].getEndOffset());
+ r.close();
+ dir.close();
+ }
+
+ // LUCENE-1448
+ public void testEndOffsetPositionStandardEmptyField2() throws Exception {
+ MockRAMDirectory dir = new MockRAMDirectory();
+ IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+ Document doc = new Document();
+
+ Field f = new Field("field", "abcd", Field.Store.NO,
+ Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+ doc.add(f);
+ doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+
+ Field f2 = new Field("field", "crunch", Field.Store.NO,
+ Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+ doc.add(f2);
+
+ w.addDocument(doc);
+ w.close();
+
+ IndexReader r = IndexReader.open(dir);
+ TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
+ TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
+ assertEquals(1, termOffsets.length);
+ assertEquals(0, termOffsets[0].getStartOffset());
+ assertEquals(4, termOffsets[0].getEndOffset());
+ termOffsets = tpv.getOffsets(1);
+ assertEquals(5, termOffsets[0].getStartOffset());
+ assertEquals(11, termOffsets[0].getEndOffset());
r.close();
dir.close();
}
+
// LUCENE-1468 -- make sure opening an IndexWriter with
// create=true does not remove non-index files