You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2012/01/23 23:15:17 UTC
svn commit: r1235028 [1/3] - in /lucene/dev/branches/lucene2858: ./
dev-tools/maven/solr/core/ lucene/
lucene/src/java/org/apache/lucene/codecs/lucene3x/
lucene/src/java/org/apache/lucene/codecs/lucene40/
lucene/src/test-framework/java/org/apache/lucen...
Author: uschindler
Date: Mon Jan 23 22:15:15 2012
New Revision: 1235028
URL: http://svn.apache.org/viewvc?rev=1235028&view=rev
Log:
LUCENE-2858: Reverse merged revision(s) 1-0 from lucene/dev/trunk
Added:
lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/analysis/MockCharFilter.java
- copied unchanged from r1235026, lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/MockCharFilter.java
lucene/dev/branches/lucene2858/lucene/src/test/org/apache/lucene/analysis/TestMockCharFilter.java
- copied unchanged from r1235026, lucene/dev/trunk/lucene/src/test/org/apache/lucene/analysis/TestMockCharFilter.java
lucene/dev/branches/lucene2858/solr/core/src/java/org/apache/solr/analysis/TypeTokenFilterFactory.java
- copied unchanged from r1235026, lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/TypeTokenFilterFactory.java
lucene/dev/branches/lucene2858/solr/core/src/test-files/solr/conf/stoptypes-1.txt
- copied unchanged from r1235026, lucene/dev/trunk/solr/core/src/test-files/solr/conf/stoptypes-1.txt
lucene/dev/branches/lucene2858/solr/core/src/test-files/solr/conf/stoptypes-2.txt
- copied unchanged from r1235026, lucene/dev/trunk/solr/core/src/test-files/solr/conf/stoptypes-2.txt
lucene/dev/branches/lucene2858/solr/core/src/test/org/apache/solr/analysis/TestTypeTokenFilterFactory.java
- copied unchanged from r1235026, lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestTypeTokenFilterFactory.java
Modified:
lucene/dev/branches/lucene2858/ (props changed)
lucene/dev/branches/lucene2858/dev-tools/maven/solr/core/pom.xml.template
lucene/dev/branches/lucene2858/lucene/ (props changed)
lucene/dev/branches/lucene2858/lucene/CHANGES.txt
lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene3x/SegmentTermDocs.java
lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java
lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosWriter.java
lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java
lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java
lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListReader.java
lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListWriter.java
lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWFieldsWriter.java
lucene/dev/branches/lucene2858/lucene/src/test/org/apache/lucene/index/TestPostingsOffsets.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/PatternAnalyzerTest.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestHyphenatedWordsFilter.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerTest.java
lucene/dev/branches/lucene2858/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
lucene/dev/branches/lucene2858/modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
lucene/dev/branches/lucene2858/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java
lucene/dev/branches/lucene2858/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java
lucene/dev/branches/lucene2858/solr/ (props changed)
lucene/dev/branches/lucene2858/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/lucene2858/solr/core/ (props changed)
lucene/dev/branches/lucene2858/solr/core/src/java/ (props changed)
lucene/dev/branches/lucene2858/solr/core/src/test/ (props changed)
Modified: lucene/dev/branches/lucene2858/dev-tools/maven/solr/core/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/dev-tools/maven/solr/core/pom.xml.template?rev=1235028&r1=1235027&r2=1235028&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/dev-tools/maven/solr/core/pom.xml.template (original)
+++ lucene/dev/branches/lucene2858/dev-tools/maven/solr/core/pom.xml.template Mon Jan 23 22:15:15 2012
@@ -203,6 +203,12 @@
<directory>src/test-files</directory>
</testResource>
<testResource>
+ <directory>${project.build.testSourceDirectory}</directory>
+ <excludes>
+ <exclude>**/*.java</exclude>
+ </excludes>
+ </testResource>
+ <testResource>
<directory>../solrj/src/test-files</directory>
</testResource>
</testResources>
Modified: lucene/dev/branches/lucene2858/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/lucene/CHANGES.txt?rev=1235028&r1=1235027&r2=1235028&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/lucene2858/lucene/CHANGES.txt Mon Jan 23 22:15:15 2012
@@ -790,7 +790,7 @@ New Features
input mapping to it) for FSTs that have strictly monotonic long
outputs (such as an ord). (Mike McCandless)
-* LUCENE-3121: Add TypeTokenFilter that filters tokens based on
+* LUCENE-3671: Add TypeTokenFilter that filters tokens based on
their TypeAttribute. (Tommaso Teofili via Uwe Schindler)
* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
@@ -814,9 +814,11 @@ Bug fixes
* LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
to clones/reopened readers. (Uwe Schindler)
-* LUCENE-3642: Fixed bugs in CharTokenizer, n-gram filters, and smart chinese
- where they would create invalid offsets in some situations, leading to problems
- in highlighting. (Max Beutel via Robert Muir)
+* LUCENE-3642, SOLR-2891, LUCENE-3717: Fixed bugs in CharTokenizer, n-gram filters,
+ compound token filters, thai word filter, icutokenizer, pattern analyzer,
+ wikipediatokenizer, and smart chinese where they would create invalid offsets in
+ some situations, leading to problems in highlighting.
+ (Max Beutel, Edwin Steiner via Robert Muir)
* LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
Float.MIN_VALUE when it should be Float.NaN, when there were 0
Modified: lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene3x/SegmentTermDocs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene3x/SegmentTermDocs.java?rev=1235028&r1=1235027&r2=1235028&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene3x/SegmentTermDocs.java (original)
+++ lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene3x/SegmentTermDocs.java Mon Jan 23 22:15:15 2012
@@ -206,7 +206,7 @@ public class SegmentTermDocs {
skipListReader = new Lucene40SkipListReader((IndexInput) freqStream.clone(), maxSkipLevels, skipInterval); // lazily clone
if (!haveSkipped) { // lazily initialize skip stream
- skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads);
+ skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads, false);
haveSkipped = true;
}
Modified: lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java?rev=1235028&r1=1235027&r2=1235028&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java (original)
+++ lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java Mon Jan 23 22:15:15 2012
@@ -85,11 +85,11 @@ public class Lucene40FieldInfosReader ex
// LUCENE-3027: past indices were able to write
// storePayloads=true when omitTFAP is also true,
// which is invalid. We correct that, here:
- if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+ if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
storePayloads = false;
}
hasVectors |= storeTermVector;
- hasProx |= isIndexed && indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
+ hasProx |= isIndexed && indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
hasFreq |= isIndexed && indexOptions != IndexOptions.DOCS_ONLY;
// DV Types are packed in one byte
byte val = input.readByte();
Modified: lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosWriter.java?rev=1235028&r1=1235027&r2=1235028&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosWriter.java (original)
+++ lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosWriter.java Mon Jan 23 22:15:15 2012
@@ -58,7 +58,7 @@ public class Lucene40FieldInfosWriter ex
output.writeVInt(FORMAT_CURRENT);
output.writeVInt(infos.size());
for (FieldInfo fi : infos) {
- assert fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !fi.storePayloads;
+ assert fi.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.storePayloads;
byte bits = 0x0;
if (fi.isIndexed) bits |= IS_INDEXED;
if (fi.storeTermVector) bits |= STORE_TERMVECTOR;
Modified: lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java?rev=1235028&r1=1235027&r2=1235028&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java (original)
+++ lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java Mon Jan 23 22:15:15 2012
@@ -197,7 +197,7 @@ public class Lucene40PostingsReader exte
// undefined
}
- if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+ if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
if (isFirstTerm) {
termState.proxOffset = termState.bytesReader.readVLong();
} else {
@@ -245,23 +245,23 @@ public class Lucene40PostingsReader exte
DocsAndPositionsEnum reuse, boolean needsOffsets)
throws IOException {
- if (needsOffsets) {
- // TODO: once we index offsets into postings fix this!
- return null;
+ boolean hasOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+ if (needsOffsets && !hasOffsets) {
+ return null; // not available
}
// TODO: refactor
- if (fieldInfo.storePayloads) {
- SegmentDocsAndPositionsAndPayloadsEnum docsEnum;
- if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsAndPayloadsEnum)) {
- docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
+ if (fieldInfo.storePayloads || hasOffsets) {
+ SegmentFullPositionsEnum docsEnum;
+ if (reuse == null || !(reuse instanceof SegmentFullPositionsEnum)) {
+ docsEnum = new SegmentFullPositionsEnum(freqIn, proxIn);
} else {
- docsEnum = (SegmentDocsAndPositionsAndPayloadsEnum) reuse;
+ docsEnum = (SegmentFullPositionsEnum) reuse;
if (docsEnum.startFreqIn != freqIn) {
// If you are using ParellelReader, and pass in a
// reused DocsEnum, it could have come from another
// reader also using standard codec
- docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
+ docsEnum = new SegmentFullPositionsEnum(freqIn, proxIn);
}
}
return docsEnum.reset(fieldInfo, (StandardTermState) termState, liveDocs);
@@ -295,6 +295,7 @@ public class Lucene40PostingsReader exte
protected boolean indexOmitsTF; // does current field omit term freq?
protected boolean storePayloads; // does current field store payloads?
+ protected boolean storeOffsets; // does current field store offsets?
protected int limit; // number of docs in this posting
protected int ord; // how many docs we've read
@@ -324,6 +325,7 @@ public class Lucene40PostingsReader exte
DocsEnum reset(FieldInfo fieldInfo, StandardTermState termState) throws IOException {
indexOmitsTF = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY;
storePayloads = fieldInfo.storePayloads;
+ storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
freqOffset = termState.freqOffset;
skipOffset = termState.skipOffset;
@@ -471,7 +473,7 @@ public class Lucene40PostingsReader exte
skipper.init(freqOffset + skipOffset,
freqOffset, 0,
- limit, storePayloads);
+ limit, storePayloads, storeOffsets);
skipped = true;
}
@@ -665,7 +667,7 @@ public class Lucene40PostingsReader exte
// TODO specialize DocsAndPosEnum too
- // Decodes docs & positions. payloads are not present.
+ // Decodes docs & positions. Neither payloads nor offsets are present.
private final class SegmentDocsAndPositionsEnum extends DocsAndPositionsEnum {
final IndexInput startFreqIn;
private final IndexInput freqIn;
@@ -792,7 +794,7 @@ public class Lucene40PostingsReader exte
skipper.init(freqOffset+skipOffset,
freqOffset, proxOffset,
- limit, false);
+ limit, false, false);
skipped = true;
}
@@ -868,8 +870,8 @@ public class Lucene40PostingsReader exte
}
}
- // Decodes docs & positions & payloads
- private class SegmentDocsAndPositionsAndPayloadsEnum extends DocsAndPositionsEnum {
+ // Decodes docs & positions & (payloads and/or offsets)
+ private class SegmentFullPositionsEnum extends DocsAndPositionsEnum {
final IndexInput startFreqIn;
private final IndexInput freqIn;
private final IndexInput proxIn;
@@ -895,16 +897,24 @@ public class Lucene40PostingsReader exte
Lucene40SkipListReader skipper;
private BytesRef payload;
private long lazyProxPointer;
+
+ boolean storePayloads;
+ boolean storeOffsets;
+
+ int offsetLength;
+ int startOffset;
- public SegmentDocsAndPositionsAndPayloadsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
+ public SegmentFullPositionsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
startFreqIn = freqIn;
this.freqIn = (IndexInput) freqIn.clone();
this.proxIn = (IndexInput) proxIn.clone();
}
- public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits liveDocs) throws IOException {
- assert fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
- assert fieldInfo.storePayloads;
+ public SegmentFullPositionsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits liveDocs) throws IOException {
+ storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+ storePayloads = fieldInfo.storePayloads;
+ assert fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+ assert storePayloads || storeOffsets;
if (payload == null) {
payload = new BytesRef();
payload.bytes = new byte[1];
@@ -923,6 +933,7 @@ public class Lucene40PostingsReader exte
doc = -1;
accum = 0;
position = 0;
+ startOffset = 0;
skipped = false;
posPendingCount = 0;
@@ -963,6 +974,7 @@ public class Lucene40PostingsReader exte
}
position = 0;
+ startOffset = 0;
//System.out.println("StandardR.D&PE nextDoc seg=" + segment + " return doc=" + doc);
return (doc = accum);
@@ -1001,7 +1013,7 @@ public class Lucene40PostingsReader exte
//System.out.println(" init skipper freqOffset=" + freqOffset + " skipOffset=" + skipOffset + " vs len=" + freqIn.length());
skipper.init(freqOffset+skipOffset,
freqOffset, proxOffset,
- limit, true);
+ limit, storePayloads, storeOffsets);
skipped = true;
}
@@ -1016,8 +1028,10 @@ public class Lucene40PostingsReader exte
lazyProxPointer = skipper.getProxPointer();
posPendingCount = 0;
position = 0;
+ startOffset = 0;
payloadPending = false;
payloadLength = skipper.getPayloadLength();
+ offsetLength = skipper.getOffsetLength();
}
}
@@ -1038,27 +1052,38 @@ public class Lucene40PostingsReader exte
}
if (payloadPending && payloadLength > 0) {
- // payload of last position as never retrieved -- skip it
+ // payload of last position was never retrieved -- skip it
proxIn.seek(proxIn.getFilePointer() + payloadLength);
payloadPending = false;
}
// scan over any docs that were iterated without their positions
while(posPendingCount > freq) {
-
final int code = proxIn.readVInt();
- if ((code & 1) != 0) {
- // new payload length
- payloadLength = proxIn.readVInt();
- assert payloadLength >= 0;
+ if (storePayloads) {
+ if ((code & 1) != 0) {
+ // new payload length
+ payloadLength = proxIn.readVInt();
+ assert payloadLength >= 0;
+ }
+ assert payloadLength != -1;
}
- assert payloadLength != -1;
- proxIn.seek(proxIn.getFilePointer() + payloadLength);
+ if (storeOffsets) {
+ if ((proxIn.readVInt() & 1) != 0) {
+ // new offset length
+ offsetLength = proxIn.readVInt();
+ }
+ }
+
+ if (storePayloads) {
+ proxIn.seek(proxIn.getFilePointer() + payloadLength);
+ }
posPendingCount--;
position = 0;
+ startOffset = 0;
payloadPending = false;
//System.out.println("StandardR.D&PE skipPos");
}
@@ -1069,16 +1094,28 @@ public class Lucene40PostingsReader exte
proxIn.seek(proxIn.getFilePointer()+payloadLength);
}
- final int code = proxIn.readVInt();
- if ((code & 1) != 0) {
- // new payload length
- payloadLength = proxIn.readVInt();
- assert payloadLength >= 0;
- }
- assert payloadLength != -1;
+ int code = proxIn.readVInt();
+ if (storePayloads) {
+ if ((code & 1) != 0) {
+ // new payload length
+ payloadLength = proxIn.readVInt();
+ assert payloadLength >= 0;
+ }
+ assert payloadLength != -1;
- payloadPending = true;
- position += code >>> 1;
+ payloadPending = true;
+ code >>>= 1;
+ }
+ position += code;
+
+ if (storeOffsets) {
+ int offsetCode = proxIn.readVInt();
+ if ((offsetCode & 1) != 0) {
+ // new offset length
+ offsetLength = proxIn.readVInt();
+ }
+ startOffset += offsetCode >>> 1;
+ }
posPendingCount--;
@@ -1090,32 +1127,36 @@ public class Lucene40PostingsReader exte
@Override
public int startOffset() throws IOException {
- return -1;
+ return storeOffsets ? startOffset : -1;
}
@Override
public int endOffset() throws IOException {
- return -1;
+ return storeOffsets ? startOffset + offsetLength : -1;
}
/** Returns the payload at this position, or null if no
* payload was indexed. */
@Override
public BytesRef getPayload() throws IOException {
- assert lazyProxPointer == -1;
- assert posPendingCount < freq;
- if (!payloadPending) {
- throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
- }
- if (payloadLength > payload.bytes.length) {
- payload.grow(payloadLength);
- }
+ if (storePayloads) {
+ assert lazyProxPointer == -1;
+ assert posPendingCount < freq;
+ if (!payloadPending) {
+ throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
+ }
+ if (payloadLength > payload.bytes.length) {
+ payload.grow(payloadLength);
+ }
- proxIn.readBytes(payload.bytes, 0, payloadLength);
- payload.length = payloadLength;
- payloadPending = false;
+ proxIn.readBytes(payload.bytes, 0, payloadLength);
+ payload.length = payloadLength;
+ payloadPending = false;
- return payload;
+ return payload;
+ } else {
+ throw new IOException("No payloads exist for this field!");
+ }
}
@Override
Modified: lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java?rev=1235028&r1=1235027&r2=1235028&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java (original)
+++ lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java Mon Jan 23 22:15:15 2012
@@ -73,12 +73,15 @@ public final class Lucene40PostingsWrite
IndexOptions indexOptions;
boolean storePayloads;
+ boolean storeOffsets;
// Starts a new term
long freqStart;
long proxStart;
FieldInfo fieldInfo;
int lastPayloadLength;
+ int lastOffsetLength;
int lastPosition;
+ int lastOffset;
// private String segment;
@@ -137,6 +140,8 @@ public final class Lucene40PostingsWrite
proxStart = proxOut.getFilePointer();
// force first payload to write its length
lastPayloadLength = -1;
+ // force first offset to write its length
+ lastOffsetLength = -1;
}
skipListWriter.resetSkip();
}
@@ -155,10 +160,8 @@ public final class Lucene40PostingsWrite
*/
this.fieldInfo = fieldInfo;
indexOptions = fieldInfo.indexOptions;
- if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
- throw new UnsupportedOperationException("this codec cannot index offsets");
- }
-
+
+ storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
storePayloads = fieldInfo.storePayloads;
//System.out.println(" set init blockFreqStart=" + freqStart);
//System.out.println(" set init blockProxStart=" + proxStart);
@@ -180,7 +183,7 @@ public final class Lucene40PostingsWrite
}
if ((++df % skipInterval) == 0) {
- skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
+ skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, storeOffsets, lastOffsetLength);
skipListWriter.bufferSkip(df);
}
@@ -197,31 +200,26 @@ public final class Lucene40PostingsWrite
}
lastPosition = 0;
+ lastOffset = 0;
}
/** Add a new position & payload */
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
//if (DEBUG) System.out.println("SPW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
- assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS: "invalid indexOptions: " + indexOptions;
+ assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 : "invalid indexOptions: " + indexOptions;
assert proxOut != null;
- // TODO: when we add offsets... often
- // endOffset-startOffset will be constant or near
- // constant for all docs (eg if the term wasn't stemmed
- // then this will usually be the utf16 length of the
- // term); would be nice to write that length once up
- // front and then not encode endOffset for each
- // position..
-
final int delta = position - lastPosition;
assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it)
lastPosition = position;
+ int payloadLength = 0;
+
if (storePayloads) {
- final int payloadLength = payload == null ? 0 : payload.length;
+ payloadLength = payload == null ? 0 : payload.length;
if (payloadLength != lastPayloadLength) {
lastPayloadLength = payloadLength;
@@ -230,13 +228,28 @@ public final class Lucene40PostingsWrite
} else {
proxOut.writeVInt(delta << 1);
}
-
- if (payloadLength > 0) {
- proxOut.writeBytes(payload.bytes, payload.offset, payloadLength);
- }
} else {
proxOut.writeVInt(delta);
}
+
+ if (storeOffsets) {
+ // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
+ // and the numbers aren't that much smaller anyways.
+ int offsetDelta = startOffset - lastOffset;
+ int offsetLength = endOffset - startOffset;
+ if (offsetLength != lastOffsetLength) {
+ proxOut.writeVInt(offsetDelta << 1 | 1);
+ proxOut.writeVInt(offsetLength);
+ } else {
+ proxOut.writeVInt(offsetDelta << 1);
+ }
+ lastOffset = startOffset;
+ lastOffsetLength = offsetLength;
+ }
+
+ if (payloadLength > 0) {
+ proxOut.writeBytes(payload.bytes, payload.offset, payloadLength);
+ }
}
@Override
@@ -304,7 +317,7 @@ public final class Lucene40PostingsWrite
assert firstTerm.skipOffset > 0;
bytesWriter.writeVInt(firstTerm.skipOffset);
}
- if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+ if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
bytesWriter.writeVLong(firstTerm.proxStart);
}
long lastFreqStart = firstTerm.freqStart;
@@ -319,7 +332,7 @@ public final class Lucene40PostingsWrite
assert term.skipOffset > 0;
bytesWriter.writeVInt(term.skipOffset);
}
- if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+ if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
bytesWriter.writeVLong(term.proxStart - lastProxStart);
lastProxStart = term.proxStart;
}
Modified: lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListReader.java?rev=1235028&r1=1235027&r2=1235028&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListReader.java (original)
+++ lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListReader.java Mon Jan 23 22:15:15 2012
@@ -30,13 +30,16 @@ import org.apache.lucene.store.IndexInpu
*/
public class Lucene40SkipListReader extends MultiLevelSkipListReader {
private boolean currentFieldStoresPayloads;
+ private boolean currentFieldStoresOffsets;
private long freqPointer[];
private long proxPointer[];
private int payloadLength[];
+ private int offsetLength[];
private long lastFreqPointer;
private long lastProxPointer;
private int lastPayloadLength;
+ private int lastOffsetLength;
public Lucene40SkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) {
@@ -44,17 +47,20 @@ public class Lucene40SkipListReader exte
freqPointer = new long[maxSkipLevels];
proxPointer = new long[maxSkipLevels];
payloadLength = new int[maxSkipLevels];
+ offsetLength = new int[maxSkipLevels];
}
- public void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads) {
+ public void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads, boolean storesOffsets) {
super.init(skipPointer, df);
this.currentFieldStoresPayloads = storesPayloads;
+ this.currentFieldStoresOffsets = storesOffsets;
lastFreqPointer = freqBasePointer;
lastProxPointer = proxBasePointer;
Arrays.fill(freqPointer, freqBasePointer);
Arrays.fill(proxPointer, proxBasePointer);
Arrays.fill(payloadLength, 0);
+ Arrays.fill(offsetLength, 0);
}
/** Returns the freq pointer of the doc to which the last call of
@@ -76,12 +82,20 @@ public class Lucene40SkipListReader exte
return lastPayloadLength;
}
+ /** Returns the offset length (endOffset-startOffset) of the position stored just before
+ * the doc to which the last call of {@link MultiLevelSkipListReader#skipTo(int)}
+ * has skipped. */
+ public int getOffsetLength() {
+ return lastOffsetLength;
+ }
+
@Override
protected void seekChild(int level) throws IOException {
super.seekChild(level);
freqPointer[level] = lastFreqPointer;
proxPointer[level] = lastProxPointer;
payloadLength[level] = lastPayloadLength;
+ offsetLength[level] = lastOffsetLength;
}
@Override
@@ -90,6 +104,7 @@ public class Lucene40SkipListReader exte
lastFreqPointer = freqPointer[level];
lastProxPointer = proxPointer[level];
lastPayloadLength = payloadLength[level];
+ lastOffsetLength = offsetLength[level];
}
@@ -110,6 +125,11 @@ public class Lucene40SkipListReader exte
} else {
delta = skipStream.readVInt();
}
+
+ if (currentFieldStoresOffsets) {
+ offsetLength[level] = skipStream.readVInt();
+ }
+
freqPointer[level] += skipStream.readVInt();
proxPointer[level] += skipStream.readVInt();
Modified: lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListWriter.java?rev=1235028&r1=1235027&r2=1235028&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListWriter.java (original)
+++ lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListWriter.java Mon Jan 23 22:15:15 2012
@@ -40,7 +40,9 @@ public class Lucene40SkipListWriter exte
private int curDoc;
private boolean curStorePayloads;
+ private boolean curStoreOffsets;
private int curPayloadLength;
+ private int curOffsetLength;
private long curFreqPointer;
private long curProxPointer;
@@ -58,10 +60,12 @@ public class Lucene40SkipListWriter exte
/**
* Sets the values for the current skip data.
*/
- public void setSkipData(int doc, boolean storePayloads, int payloadLength) {
+ public void setSkipData(int doc, boolean storePayloads, int payloadLength, boolean storeOffsets, int offsetLength) {
this.curDoc = doc;
this.curStorePayloads = storePayloads;
this.curPayloadLength = payloadLength;
+ this.curStoreOffsets = storeOffsets;
+ this.curOffsetLength = offsetLength;
this.curFreqPointer = freqOutput.getFilePointer();
if (proxOutput != null)
this.curProxPointer = proxOutput.getFilePointer();
@@ -116,6 +120,12 @@ public class Lucene40SkipListWriter exte
// current field does not store payloads
skipBuffer.writeVInt(curDoc - lastSkipDoc[level]);
}
+
+ // TODO: not sure it really helps to shove this somewhere else if it's the same as the last skip
+ if (curStoreOffsets) {
+ skipBuffer.writeVInt(curOffsetLength);
+ }
+
skipBuffer.writeVInt((int) (curFreqPointer - lastSkipFreqPointer[level]));
skipBuffer.writeVInt((int) (curProxPointer - lastSkipProxPointer[level]));
Modified: lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1235028&r1=1235027&r2=1235028&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Mon Jan 23 22:15:15 2012
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
+import java.io.Reader;
import java.io.StringReader;
import java.io.IOException;
import java.util.ArrayList;
@@ -289,8 +290,12 @@ public abstract class BaseTokenStreamTes
}
}
};
-
+
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
+ checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean());
+ }
+
+ public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException {
for (int i = 0; i < iterations; i++) {
String text;
switch(_TestUtil.nextInt(random, 0, 4)) {
@@ -311,7 +316,9 @@ public abstract class BaseTokenStreamTes
System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
}
- TokenStream ts = a.tokenStream("dummy", new StringReader(text));
+ int remainder = random.nextInt(10);
+ Reader reader = new StringReader(text);
+ TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
@@ -339,30 +346,38 @@ public abstract class BaseTokenStreamTes
if (VERBOSE) {
System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
}
+ reader = new StringReader(text);
+ ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
- assertAnalyzesToReuse(a, text,
+ assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
- toIntArray(positions));
+ toIntArray(positions),
+ text.length());
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
- assertAnalyzesToReuse(a, text,
+ assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
- toIntArray(positions));
+ null,
+ toIntArray(positions),
+ text.length());
} else if (offsetAtt != null) {
// offset
- assertAnalyzesToReuse(a, text,
+ assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
- toIntArray(endOffsets));
+ toIntArray(endOffsets),
+ null,
+ null,
+ text.length());
} else {
// terms only
- assertAnalyzesToReuse(a, text,
+ assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]));
}
}
Modified: lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWFieldsWriter.java?rev=1235028&r1=1235027&r2=1235028&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWFieldsWriter.java (original)
+++ lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWFieldsWriter.java Mon Jan 23 22:15:15 2012
@@ -137,7 +137,7 @@ class PreFlexRWFieldsWriter extends Fiel
}
if ((++df % termsOut.skipInterval) == 0) {
- skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
+ skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, false, 0);
skipListWriter.bufferSkip(df);
}
Modified: lucene/dev/branches/lucene2858/lucene/src/test/org/apache/lucene/index/TestPostingsOffsets.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/lucene/src/test/org/apache/lucene/index/TestPostingsOffsets.java?rev=1235028&r1=1235027&r2=1235028&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/lucene/src/test/org/apache/lucene/index/TestPostingsOffsets.java (original)
+++ lucene/dev/branches/lucene2858/lucene/src/test/org/apache/lucene/index/TestPostingsOffsets.java Mon Jan 23 22:15:15 2012
@@ -22,29 +22,46 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedAnalyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockPayloadAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericField;
+import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
+import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;
-import org.junit.Assume;
+import org.apache.lucene.util._TestUtil;
public class TestPostingsOffsets extends LuceneTestCase {
+ IndexWriterConfig iwc;
+
+ public void setUp() throws Exception {
+ super.setUp();
+ // Currently only SimpleText and Lucene40 can index offsets into postings:
+ assumeTrue("codec does not support offsets", Codec.getDefault().getName().equals("SimpleText") || Codec.getDefault().getName().equals("Lucene40"));
+ iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
+
+ if (Codec.getDefault().getName().equals("Lucene40")) {
+ // pulsing etc are not implemented
+ iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
+ }
+ }
public void testBasic() throws Exception {
-
- // Currently only SimpleText can index offsets into postings:
- Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
-
Directory dir = newDirectory();
- RandomIndexWriter w = new RandomIndexWriter(random, dir);
+
+ RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
Document doc = new Document();
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
@@ -94,16 +111,117 @@ public class TestPostingsOffsets extends
r.close();
dir.close();
}
+
+ public void testSkipping() throws Exception {
+ doTestNumbers(false);
+ }
+
+ public void testPayloads() throws Exception {
+ doTestNumbers(true);
+ }
+
+ public void doTestNumbers(boolean withPayloads) throws Exception {
+ Directory dir = newDirectory();
+ Analyzer analyzer = withPayloads ? new MockPayloadAnalyzer() : new MockAnalyzer(random);
+ iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+ if (Codec.getDefault().getName().equals("Lucene40")) {
+ // pulsing etc are not implemented
+ iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
+ }
+ iwc.setMergePolicy(newLogMergePolicy()); // will rely on docids a bit for skipping
+ RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
+
+ FieldType ft = new FieldType(TextField.TYPE_STORED);
+ ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ if (random.nextBoolean()) {
+ ft.setStoreTermVectors(true);
+ ft.setStoreTermVectorOffsets(random.nextBoolean());
+ ft.setStoreTermVectorPositions(random.nextBoolean());
+ }
+
+ int numDocs = atLeast(500);
+ for (int i = 0; i < numDocs; i++) {
+ Document doc = new Document();
+ doc.add(new Field("numbers", English.intToEnglish(i), ft));
+ doc.add(new Field("oddeven", (i % 2) == 0 ? "even" : "odd", ft));
+ doc.add(new StringField("id", "" + i));
+ w.addDocument(doc);
+ }
+
+ IndexReader reader = w.getReader();
+ w.close();
+
+ String terms[] = { "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "hundred" };
+
+ for (String term : terms) {
+ DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "numbers", new BytesRef(term), true);
+ int doc;
+ while((doc = dp.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
+ String storedNumbers = reader.document(doc).get("numbers");
+ int freq = dp.freq();
+ for (int i = 0; i < freq; i++) {
+ dp.nextPosition();
+ int start = dp.startOffset();
+ assert start >= 0;
+ int end = dp.endOffset();
+ assert end >= 0 && end >= start;
+ // check that the offsets correspond to the term in the src text
+ assertTrue(storedNumbers.substring(start, end).equals(term));
+ if (withPayloads) {
+ // check that we have a payload and it starts with "pos"
+ assertTrue(dp.hasPayload());
+ BytesRef payload = dp.getPayload();
+ assertTrue(payload.utf8ToString().startsWith("pos:"));
+ } // note: withPayloads=false doesn't necessarily mean we don't have them from MockAnalyzer!
+ }
+ }
+ }
+
+ // check we can skip correctly
+ int numSkippingTests = atLeast(50);
+
+ for (int j = 0; j < numSkippingTests; j++) {
+ int num = _TestUtil.nextInt(random, 100, Math.min(numDocs-1, 999));
+ DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "numbers", new BytesRef("hundred"), true);
+ int doc = dp.advance(num);
+ assertEquals(num, doc);
+ int freq = dp.freq();
+ for (int i = 0; i < freq; i++) {
+ String storedNumbers = reader.document(doc).get("numbers");
+ dp.nextPosition();
+ int start = dp.startOffset();
+ assert start >= 0;
+ int end = dp.endOffset();
+ assert end >= 0 && end >= start;
+ // check that the offsets correspond to the term in the src text
+ assertTrue(storedNumbers.substring(start, end).equals("hundred"));
+ if (withPayloads) {
+ // check that we have a payload and it starts with "pos"
+ assertTrue(dp.hasPayload());
+ BytesRef payload = dp.getPayload();
+ assertTrue(payload.utf8ToString().startsWith("pos:"));
+ } // note: withPayloads=false doesn't necessarily mean we don't have them from MockAnalyzer!
+ }
+ }
+
+ // check that other fields (without offsets) work correctly
+
+ for (int i = 0; i < numDocs; i++) {
+ DocsEnum dp = MultiFields.getTermDocsEnum(reader, null, "id", new BytesRef("" + i), false);
+ assertEquals(i, dp.nextDoc());
+ assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());
+ }
+
+ reader.close();
+ dir.close();
+ }
public void testRandom() throws Exception {
- // Currently only SimpleText can index offsets into postings:
- Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
-
// token -> docID -> tokens
final Map<String,Map<Integer,List<Token>>> actualTokens = new HashMap<String,Map<Integer,List<Token>>>();
Directory dir = newDirectory();
- RandomIndexWriter w = new RandomIndexWriter(random, dir);
+ RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
final int numDocs = atLeast(20);
//final int numDocs = atLeast(5);