Posted to java-commits@lucene.apache.org by us...@apache.org on 2010/04/06 21:19:36 UTC
svn commit: r931278 [6/10] - in /lucene/dev/trunk: lucene/
lucene/backwards/src/ lucene/backwards/src/java/org/apache/lucene/index/
lucene/backwards/src/java/org/apache/lucene/index/codecs/
lucene/backwards/src/java/org/apache/lucene/search/ lucene/bac...
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReader.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReader.java Tue Apr 6 19:19:27 2010
@@ -37,8 +37,16 @@ import org.apache.lucene.store.Directory
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BitVector;
+import org.apache.lucene.util.Bits;
import org.apache.lucene.util.CloseableThreadLocal;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.index.codecs.preflex.PreFlexFields;
+import org.apache.lucene.index.codecs.preflex.SegmentTermDocs;
+import org.apache.lucene.index.codecs.preflex.SegmentTermPositions;
+import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close
+import org.apache.lucene.util.BytesRef;
/**
* @lucene.experimental
@@ -83,10 +91,11 @@ public class SegmentReader extends Index
final String segment;
final FieldInfos fieldInfos;
- final IndexInput freqStream;
- final IndexInput proxStream;
- final TermInfosReader tisNoIndex;
+ final FieldsProducer fields;
+ final boolean isPreFlex;
+ final CodecProvider codecs;
+
final Directory dir;
final Directory cfsDir;
final int readBufferSize;
@@ -94,14 +103,22 @@ public class SegmentReader extends Index
private final SegmentReader origInstance;
- TermInfosReader tis;
FieldsReader fieldsReaderOrig;
TermVectorsReader termVectorsReaderOrig;
CompoundFileReader cfsReader;
CompoundFileReader storeCFSReader;
- CoreReaders(SegmentReader origInstance, Directory dir, SegmentInfo si, int readBufferSize, int termsIndexDivisor) throws IOException {
+ CoreReaders(SegmentReader origInstance, Directory dir, SegmentInfo si, int readBufferSize, int termsIndexDivisor, CodecProvider codecs) throws IOException {
+
+ if (termsIndexDivisor < 1 && termsIndexDivisor != -1) {
+ throw new IllegalArgumentException("indexDivisor must be -1 (don't load terms index) or greater than 0: got " + termsIndexDivisor);
+ }
+
segment = si.name;
+ if (codecs == null) {
+ codecs = CodecProvider.getDefault();
+ }
+ this.codecs = codecs;
this.readBufferSize = readBufferSize;
this.dir = dir;
@@ -118,23 +135,12 @@ public class SegmentReader extends Index
fieldInfos = new FieldInfos(cfsDir, IndexFileNames.segmentFileName(segment, IndexFileNames.FIELD_INFOS_EXTENSION));
this.termsIndexDivisor = termsIndexDivisor;
- TermInfosReader reader = new TermInfosReader(cfsDir, segment, fieldInfos, readBufferSize, termsIndexDivisor);
- if (termsIndexDivisor == -1) {
- tisNoIndex = reader;
- } else {
- tis = reader;
- tisNoIndex = null;
- }
- // make sure that all index files have been read or are kept open
- // so that if an index update removes them we'll still have them
- freqStream = cfsDir.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.FREQ_EXTENSION), readBufferSize);
+ // Ask codec for its Fields
+ fields = si.getCodec().fieldsProducer(new SegmentReadState(cfsDir, si, fieldInfos, readBufferSize, termsIndexDivisor));
+ assert fields != null;
- if (fieldInfos.hasProx()) {
- proxStream = cfsDir.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.PROX_EXTENSION), readBufferSize);
- } else {
- proxStream = null;
- }
+ isPreFlex = fields instanceof PreFlexFields;
success = true;
} finally {
if (!success) {
@@ -165,64 +171,12 @@ public class SegmentReader extends Index
return cfsReader;
}
- synchronized TermInfosReader getTermsReader() {
- if (tis != null) {
- return tis;
- } else {
- return tisNoIndex;
- }
- }
-
- synchronized boolean termsIndexIsLoaded() {
- return tis != null;
- }
-
- // NOTE: only called from IndexWriter when a near
- // real-time reader is opened, or applyDeletes is run,
- // sharing a segment that's still being merged. This
- // method is not fully thread safe, and relies on the
- // synchronization in IndexWriter
- synchronized void loadTermsIndex(SegmentInfo si, int termsIndexDivisor) throws IOException {
- if (tis == null) {
- Directory dir0;
- if (si.getUseCompoundFile()) {
- // In some cases, we were originally opened when CFS
- // was not used, but then we are asked to open the
- // terms reader with index, the segment has switched
- // to CFS
- if (cfsReader == null) {
- cfsReader = new CompoundFileReader(dir, IndexFileNames.segmentFileName(segment, IndexFileNames.COMPOUND_FILE_EXTENSION), readBufferSize);
- }
- dir0 = cfsReader;
- } else {
- dir0 = dir;
- }
-
- tis = new TermInfosReader(dir0, segment, fieldInfos, readBufferSize, termsIndexDivisor);
- }
- }
-
synchronized void decRef() throws IOException {
if (ref.decrementAndGet() == 0) {
- // close everything, nothing is shared anymore with other readers
- if (tis != null) {
- tis.close();
- // null so if an app hangs on to us we still free most ram
- tis = null;
- }
-
- if (tisNoIndex != null) {
- tisNoIndex.close();
- }
-
- if (freqStream != null) {
- freqStream.close();
- }
-
- if (proxStream != null) {
- proxStream.close();
+ if (fields != null) {
+ fields.close();
}
if (termVectorsReaderOrig != null) {
@@ -543,7 +497,7 @@ public class SegmentReader extends Index
* @throws IOException if there is a low-level IO error
*/
public static SegmentReader get(boolean readOnly, SegmentInfo si, int termInfosIndexDivisor) throws CorruptIndexException, IOException {
- return get(readOnly, si.dir, si, BufferedIndexInput.BUFFER_SIZE, true, termInfosIndexDivisor);
+ return get(readOnly, si.dir, si, BufferedIndexInput.BUFFER_SIZE, true, termInfosIndexDivisor, null);
}
/**
@@ -555,8 +509,13 @@ public class SegmentReader extends Index
SegmentInfo si,
int readBufferSize,
boolean doOpenStores,
- int termInfosIndexDivisor)
+ int termInfosIndexDivisor,
+ CodecProvider codecs)
throws CorruptIndexException, IOException {
+ if (codecs == null) {
+ codecs = CodecProvider.getDefault();
+ }
+
SegmentReader instance = readOnly ? new ReadOnlySegmentReader() : new SegmentReader();
instance.readOnly = readOnly;
instance.si = si;
@@ -565,7 +524,7 @@ public class SegmentReader extends Index
boolean success = false;
try {
- instance.core = new CoreReaders(instance, dir, si, readBufferSize, termInfosIndexDivisor);
+ instance.core = new CoreReaders(instance, dir, si, readBufferSize, termInfosIndexDivisor, codecs);
if (doOpenStores) {
instance.core.openDocStores(si);
}
@@ -590,6 +549,11 @@ public class SegmentReader extends Index
core.openDocStores(si);
}
+ @Override
+ public synchronized Bits getDeletedDocs() {
+ return deletedDocs;
+ }
+
private boolean checkDeletedCounts() throws IOException {
final int recomputedCount = deletedDocs.getRecomputedCount();
@@ -859,17 +823,36 @@ public class SegmentReader extends Index
List<String> files() throws IOException {
return new ArrayList<String>(si.files());
}
-
+
@Override
- public TermEnum terms() {
+ public TermEnum terms() throws IOException {
ensureOpen();
- return core.getTermsReader().terms();
+ if (core.isPreFlex) {
+ // For old API on an old segment, instead of
+ // converting old API -> new API -> old API, just give
+ // direct access to old:
+ return ((PreFlexFields) core.fields).tis.terms();
+ } else {
+ // Emulate pre-flex API on top of flex index
+ return new LegacyTermEnum(null);
+ }
}
+ /** @deprecated Please switch to the flex API ({@link
+ * #fields}) instead. */
+ @Deprecated
@Override
public TermEnum terms(Term t) throws IOException {
ensureOpen();
- return core.getTermsReader().terms(t);
+ if (core.isPreFlex) {
+ // For old API on an old segment, instead of
+ // converting old API -> new API -> old API, just give
+ // direct access to old:
+ return ((PreFlexFields) core.fields).tis.terms(t);
+ } else {
+ // Emulate pre-flex API on top of flex index
+ return new LegacyTermEnum(t);
+ }
}
FieldInfos fieldInfos() {
@@ -887,6 +870,9 @@ public class SegmentReader extends Index
return (deletedDocs != null && deletedDocs.get(n));
}
+ /** @deprecated Switch to the flex API ({@link
+ * IndexReader#termDocsEnum}) instead. */
+ @Deprecated
@Override
public TermDocs termDocs(Term term) throws IOException {
if (term == null) {
@@ -895,27 +881,73 @@ public class SegmentReader extends Index
return super.termDocs(term);
}
}
+
+ @Override
+ public Fields fields() throws IOException {
+ return core.fields;
+ }
+ /** @deprecated Switch to the flex API {@link
+ * IndexReader#termDocsEnum} instead. */
+ @Deprecated
@Override
public TermDocs termDocs() throws IOException {
ensureOpen();
- return new SegmentTermDocs(this);
+ if (core.isPreFlex) {
+ // For old API on an old segment, instead of
+ // converting old API -> new API -> old API, just give
+ // direct access to old:
+ final PreFlexFields pre = (PreFlexFields) core.fields;
+ SegmentTermDocs std = new SegmentTermDocs(pre.freqStream, pre.tis, core.fieldInfos);
+ std.setSkipDocs(deletedDocs);
+ return std;
+ } else {
+ // Emulate old API
+ return new LegacyTermDocs();
+ }
}
+ /** @deprecated Switch to the flex API {@link
+ * IndexReader#termDocsEnum} instead */
+ @Deprecated
@Override
public TermPositions termPositions() throws IOException {
ensureOpen();
- return new SegmentTermPositions(this);
+ if (core.isPreFlex) {
+ // For old API on an old segment, instead of
+ // converting old API -> new API -> old API, just give
+ // direct access to old:
+ final PreFlexFields pre = (PreFlexFields) core.fields;
+ SegmentTermPositions stp = new SegmentTermPositions(pre.freqStream, pre.proxStream, pre.tis, core.fieldInfos);
+ stp.setSkipDocs(deletedDocs);
+ return stp;
+ } else {
+ // Emulate old API
+ return new LegacyTermPositions();
+ }
}
@Override
public int docFreq(Term t) throws IOException {
ensureOpen();
- TermInfo ti = core.getTermsReader().get(t);
- if (ti != null)
- return ti.docFreq;
- else
+ Terms terms = core.fields.terms(t.field);
+ if (terms != null) {
+ return terms.docFreq(new BytesRef(t.text));
+ } else {
return 0;
+ }
+ }
+
+ @Override
+ public int docFreq(String field, BytesRef term) throws IOException {
+ ensureOpen();
+
+ Terms terms = core.fields.terms(field);
+ if (terms != null) {
+ return terms.docFreq(term);
+ } else {
+ return 0;
+ }
}
@Override
@@ -1078,17 +1110,13 @@ public class SegmentReader extends Index
}
}
- boolean termsIndexLoaded() {
- return core.termsIndexIsLoaded();
- }
-
// NOTE: only called from IndexWriter when a near
// real-time reader is opened, or applyDeletes is run,
// sharing a segment that's still being merged. This
// method is not thread safe, and relies on the
// synchronization in IndexWriter
- void loadTermsIndex(int termsIndexDivisor) throws IOException {
- core.loadTermsIndex(si, termsIndexDivisor);
+ void loadTermsIndex(int indexDivisor) throws IOException {
+ core.fields.loadTermsIndex(indexDivisor);
}
// for testing only
@@ -1266,14 +1294,9 @@ public class SegmentReader extends Index
// same entry in the FieldCache. See LUCENE-1579.
@Override
public final Object getFieldCacheKey() {
- return core.freqStream;
- }
-
- @Override
- public long getUniqueTermCount() {
- return core.getTermsReader().size();
+ return core;
}
-
+
/**
* Lotsa tests did hacks like:<br/>
* SegmentReader reader = (SegmentReader) IndexReader.open(dir);<br/>
@@ -1283,7 +1306,7 @@ public class SegmentReader extends Index
*/
@Deprecated
static SegmentReader getOnlySegmentReader(Directory dir) throws IOException {
- return getOnlySegmentReader(IndexReader.open(dir,false));
+ return getOnlySegmentReader(IndexReader.open(dir, false));
}
static SegmentReader getOnlySegmentReader(IndexReader reader) {
@@ -1305,4 +1328,372 @@ public class SegmentReader extends Index
public int getTermInfosIndexDivisor() {
return core.termsIndexDivisor;
}
+
+ // Back compat: pre-flex TermEnum API over flex API
+ @Deprecated
+ final private class LegacyTermEnum extends TermEnum {
+ FieldsEnum fields;
+ TermsEnum terms;
+ boolean done;
+ String currentField;
+ BytesRef currentTerm;
+
+ public LegacyTermEnum(Term t) throws IOException {
+ fields = core.fields.iterator();
+ currentField = fields.next();
+ if (currentField == null) {
+ // no fields
+ done = true;
+ } else if (t != null) {
+ // Pre-seek to this term
+
+ while(currentField.compareTo(t.field) < 0) {
+ currentField = fields.next();
+ if (currentField == null) {
+ // Hit end of fields
+ done = true;
+ break;
+ }
+ }
+
+ if (!done) {
+ // We found some field -- get its terms:
+ terms = fields.terms();
+
+ if (currentField == t.field) {
+ // We found exactly the requested field; now
+ // seek the term text:
+ String text = t.text();
+
+ // this is only for backwards compatibility.
+ // previously you could supply a term with unpaired surrogates,
+ // and it would return the next Term.
+ // if someone does this, tack on the lowest possible trail surrogate.
+ // this emulates the old behavior, and forms "valid UTF-8" unicode.
+ BytesRef tr = new BytesRef(UnicodeUtil.nextValidUTF16String(text));
+ TermsEnum.SeekStatus status = terms.seek(tr);
+
+ if (status == TermsEnum.SeekStatus.END) {
+ // Rollover to the next field
+ terms = null;
+ next();
+ } else if (status == TermsEnum.SeekStatus.FOUND) {
+ // Found exactly the term
+ currentTerm = tr;
+ } else {
+ // Found another term, in this same field
+ currentTerm = terms.term();
+ }
+ } else {
+ // We didn't find exact field (we found the
+ // following field); advance to first term in
+ // this field
+ next();
+ }
+ }
+ } else {
+ terms = fields.terms();
+ }
+ }
+
+ @Override
+ public boolean next() throws IOException {
+
+ if (done) {
+ return false;
+ }
+
+ while(true) {
+ if (terms == null) {
+ // Advance to the next field
+ currentField = fields.next();
+ if (currentField == null) {
+ done = true;
+ return false;
+ }
+ terms = fields.terms();
+ }
+ currentTerm = terms.next();
+ if (currentTerm != null) {
+ // This field still has terms
+ return true;
+ } else {
+ // Done producing terms from this field; advance
+ // to next field
+ terms = null;
+ }
+ }
+ }
+
+ @Override
+ public Term term() {
+ if (!done && terms != null && currentTerm != null) {
+ return new Term(currentField, currentTerm.utf8ToString());
+ }
+ return null;
+ }
+
+ @Override
+ public int docFreq() {
+ return terms == null ? 0 : terms.docFreq();
+ }
+
+ @Override
+ public void close() {}
+ }
+
+ // Back compat: emulates legacy TermDocs API on top of
+ // flex API
+ private class LegacyTermDocs implements TermDocs {
+
+ String currentField;
+ final Fields fields;
+ TermsEnum terms;
+ DocsEnum docsEnum;
+ boolean any;
+
+ LegacyTermDocs() throws IOException {
+ fields = core.fields;
+ }
+
+ public void close() {}
+
+ public void seek(TermEnum termEnum) throws IOException {
+ seek(termEnum.term());
+ }
+
+ public boolean skipTo(int target) throws IOException {
+ if (!any) {
+ return false;
+ } else {
+ return docsEnum.advance(target) != docsEnum.NO_MORE_DOCS;
+ }
+ }
+
+ public void seek(Term term) throws IOException {
+
+ any = false;
+
+ if (terms != null && !term.field.equals(currentField)) {
+ // new field
+ terms = null;
+ }
+
+ if (terms == null) {
+ currentField = term.field;
+ Terms terms1 = fields.terms(currentField);
+ if (terms1 == null) {
+ // no such field
+ return;
+ } else {
+ terms = terms1.iterator();
+ }
+ }
+
+ if (terms.seek(new BytesRef(term.text)) == TermsEnum.SeekStatus.FOUND) {
+ // Term exists
+ any = true;
+ pendingBulkResult = null;
+ docsEnum = terms.docs(deletedDocs, docsEnum);
+ }
+ }
+
+ public int doc() {
+ if (!any) {
+ return 0;
+ } else {
+ return docsEnum.docID();
+ }
+ }
+
+ private DocsEnum.BulkReadResult pendingBulkResult;
+ private int bulkCount;
+ private int pendingBulk;
+
+ public int read(int[] docs, int[] freqs) throws IOException {
+ if (any && pendingBulkResult == null) {
+ pendingBulkResult = docsEnum.getBulkResult();
+ }
+ if (!any) {
+ return 0;
+ } else if (pendingBulk > 0) {
+ final int left = bulkCount - pendingBulk;
+ if (docs.length >= left) {
+ // read all pending
+ System.arraycopy(pendingBulkResult.docs.ints, pendingBulk, docs, 0, left);
+ System.arraycopy(pendingBulkResult.freqs.ints, pendingBulk, freqs, 0, left);
+ pendingBulk = 0;
+ return left;
+ } else {
+ // read only part of pending
+ System.arraycopy(pendingBulkResult.docs.ints, pendingBulk, docs, 0, docs.length);
+ System.arraycopy(pendingBulkResult.freqs.ints, pendingBulk, freqs, 0, docs.length);
+ pendingBulk += docs.length;
+ return docs.length;
+ }
+ } else {
+ // nothing pending
+ bulkCount = docsEnum.read();
+ if (docs.length >= bulkCount) {
+ System.arraycopy(pendingBulkResult.docs.ints, 0, docs, 0, bulkCount);
+ System.arraycopy(pendingBulkResult.freqs.ints, 0, freqs, 0, bulkCount);
+ return bulkCount;
+ } else {
+ System.arraycopy(pendingBulkResult.docs.ints, 0, docs, 0, docs.length);
+ System.arraycopy(pendingBulkResult.freqs.ints, 0, freqs, 0, docs.length);
+ pendingBulk = docs.length;
+ return docs.length;
+ }
+ }
+ }
+
+ public int freq() {
+ if (!any) {
+ return 0;
+ } else {
+ return docsEnum.freq();
+ }
+ }
+
+ public boolean next() throws IOException {
+ if (!any) {
+ return false;
+ } else {
+ return docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS;
+ }
+ }
+ }
+
+ // Back compat: implements legacy TermPositions API on top
+ // of flex API
+ final private class LegacyTermPositions implements TermPositions {
+
+ String currentField;
+ final Fields fields;
+ TermsEnum terms;
+ DocsAndPositionsEnum postingsEnum;
+ DocsEnum docsEnum;
+ boolean any;
+
+ LegacyTermPositions() throws IOException {
+ fields = core.fields;
+ }
+
+ public void close() {}
+
+ public void seek(TermEnum termEnum) throws IOException {
+ seek(termEnum.term());
+ }
+
+ public boolean skipTo(int target) throws IOException {
+ if (!any) {
+ return false;
+ } else {
+ return docsEnum.advance(target) != docsEnum.NO_MORE_DOCS;
+ }
+ }
+
+ public void seek(Term term) throws IOException {
+
+ any = false;
+
+ if (terms != null && !term.field.equals(currentField)) {
+ // new field
+ terms = null;
+ }
+
+ if (terms == null) {
+ currentField = term.field;
+ Terms terms1 = fields.terms(currentField);
+ if (terms1 == null) {
+ // no such field
+ return;
+ } else {
+ terms = terms1.iterator();
+ }
+ }
+
+ if (terms.seek(new BytesRef(term.text)) == TermsEnum.SeekStatus.FOUND) {
+ // Term exists
+ any = true;
+ postingsEnum = terms.docsAndPositions(deletedDocs, postingsEnum);
+ if (postingsEnum == null) {
+ docsEnum = terms.docs(deletedDocs, postingsEnum);
+ } else {
+ docsEnum = postingsEnum;
+ }
+ }
+ }
+
+ public int doc() {
+ if (!any) {
+ return 0;
+ } else {
+ return docsEnum.docID();
+ }
+ }
+
+ public int freq() {
+ if (!any) {
+ return 0;
+ } else {
+ return docsEnum.freq();
+ }
+ }
+
+ public boolean next() throws IOException {
+ if (!any) {
+ return false;
+ } else {
+ return docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS;
+ }
+ }
+
+ public int read(int[] docs, int[] freqs) throws IOException {
+ throw new UnsupportedOperationException("TermPositions does not support processing multiple documents in one call. Use TermDocs instead.");
+ }
+
+ public int nextPosition() throws IOException {
+ if (!any || postingsEnum == null) {
+ return 0;
+ } else {
+ return postingsEnum.nextPosition();
+ }
+ }
+
+ public int getPayloadLength() {
+ if (!any || postingsEnum == null) {
+ return 0;
+ } else {
+ return postingsEnum.getPayloadLength();
+ }
+ }
+
+ public byte[] getPayload(byte[] bytes, int offset) throws IOException {
+ if (!any || postingsEnum == null) {
+ return null;
+ }
+ final BytesRef payload = postingsEnum.getPayload();
+ // old API would always use the passed-in bytes if the
+ // payload fits, else allocate new:
+ if (bytes != null && payload.length <= bytes.length - offset) {
+ System.arraycopy(payload.bytes, payload.offset, bytes, offset, payload.length);
+ return bytes;
+ } else if (payload.offset == 0 && payload.length == payload.bytes.length) {
+ return payload.bytes;
+ } else {
+ final byte[] retBytes = new byte[payload.length];
+ System.arraycopy(payload.bytes, payload.offset, retBytes, 0, payload.length);
+ return retBytes;
+ }
+ }
+
+ public boolean isPayloadAvailable() {
+ if (!any || postingsEnum == null) {
+ return false;
+ } else {
+ return postingsEnum.hasPayload();
+ }
+ }
+ }
}
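
The Legacy* classes above bridge the deprecated TermEnum/TermDocs/TermPositions APIs onto the flex API that now backs SegmentReader. Code that can migrate follows the chain Fields -> Terms -> TermsEnum -> DocsEnum directly. A minimal sketch using only calls visible in this patch; the field and term values are illustrative, and reader is assumed to be an open SegmentReader from this revision:

    Fields fields = reader.fields();
    Terms terms = fields.terms("body");            // null if the field is absent
    if (terms != null) {
      TermsEnum termsEnum = terms.iterator();
      if (termsEnum.seek(new BytesRef("lucene")) == TermsEnum.SeekStatus.FOUND) {
        // deleted docs are supplied as the skip set, as in LegacyTermDocs.seek
        DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), null);
        int doc;
        while ((doc = docsEnum.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
          int freq = docsEnum.freq();
          // consume doc/freq here
        }
      }
    }
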
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java Tue Apr 6 19:19:27 2010
@@ -19,32 +19,63 @@ package org.apache.lucene.index;
import java.util.HashSet;
import java.util.Collection;
+import java.io.PrintStream;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.lucene.index.codecs.CodecProvider;
-class SegmentWriteState {
- DocumentsWriter docWriter;
- Directory directory;
- String segmentName;
- String docStoreSegmentName;
- int numDocs;
- int termIndexInterval;
- int numDocsInStore;
- Collection<String> flushedFiles;
-
- public SegmentWriteState(DocumentsWriter docWriter, Directory directory, String segmentName, String docStoreSegmentName, int numDocs,
- int numDocsInStore, int termIndexInterval) {
- this.docWriter = docWriter;
+/**
+ * This class is not meant for public usage; it's only
+ * public in order to expose access across packages. It's
+ * used internally when updating the index.
+ * @lucene.experimental
+ */
+public class SegmentWriteState {
+ public final PrintStream infoStream;
+ public final Directory directory;
+ public final String segmentName;
+ public final FieldInfos fieldInfos;
+ public final String docStoreSegmentName;
+ public final int numDocs;
+ public int numDocsInStore;
+ public final Collection<String> flushedFiles;
+
+ // Actual codec used
+ final Codec codec;
+
+ /** Expert: The fraction of terms in the "dictionary" which should be stored
+ * in RAM. Smaller values use more memory, but make searching slightly
+ * faster, while larger values use less memory and make searching slightly
+ * slower. Searching is typically not dominated by dictionary lookup, so
+ * tweaking this is rarely useful.*/
+ public final int termIndexInterval;
+
+ /** Expert: The fraction of {@link TermDocs} entries stored in skip tables,
+ * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in
+ * smaller indexes, greater acceleration, but fewer accelerable cases, while
+ * smaller values result in bigger indexes, less acceleration and more
+ * accelerable cases. More detailed experiments would be useful here. */
+ public final int skipInterval = 16;
+
+ /** Expert: The maximum number of skip levels. Smaller values result in
+ * slightly smaller indexes, but slower skipping in big posting lists.
+ */
+ public final int maxSkipLevels = 10;
+
+ public SegmentWriteState(PrintStream infoStream, Directory directory, String segmentName, FieldInfos fieldInfos,
+ String docStoreSegmentName, int numDocs,
+ int numDocsInStore, int termIndexInterval,
+ CodecProvider codecs) {
+ this.infoStream = infoStream;
this.directory = directory;
this.segmentName = segmentName;
+ this.fieldInfos = fieldInfos;
this.docStoreSegmentName = docStoreSegmentName;
this.numDocs = numDocs;
this.numDocsInStore = numDocsInStore;
this.termIndexInterval = termIndexInterval;
+ this.codec = codecs.getWriter(this);
flushedFiles = new HashSet<String>();
}
-
- public String segmentFileName(String ext) {
- return segmentName + "." + ext;
- }
}
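
SegmentWriteState now carries everything the indexing chain hands to a codec at flush time; the constructor resolves the actual Codec via codecs.getWriter(this), and skipInterval/maxSkipLevels are fixed constants in this revision. A hedged construction sketch; every argument value below is illustrative, since the real caller is the indexing chain inside DocumentsWriter:

    SegmentWriteState state = new SegmentWriteState(
        null,                         // infoStream: no verbose logging
        dir,                          // target Directory
        "_0",                         // segment name (illustrative)
        fieldInfos,                   // FieldInfos for this segment
        null,                         // no shared doc store
        numDocs,                      // docs flushed into this segment
        0,                            // numDocsInStore
        128,                          // termIndexInterval (illustrative)
        CodecProvider.getDefault());  // resolves state.codec
    // state.flushedFiles starts empty; consumers add each file they write.
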
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java Tue Apr 6 19:19:27 2010
@@ -90,8 +90,8 @@ final class StoredFieldsWriter {
state.flushedFiles.add(fieldsName);
state.flushedFiles.add(fieldsIdxName);
- state.docWriter.removeOpenFile(fieldsName);
- state.docWriter.removeOpenFile(fieldsIdxName);
+ docWriter.removeOpenFile(fieldsName);
+ docWriter.removeOpenFile(fieldsIdxName);
if (4+((long) state.numDocsInStore)*8 != state.directory.fileLength(fieldsIdxName))
throw new RuntimeException("after flush: fdx size mismatch: " + state.numDocsInStore + " docs vs " + state.directory.fileLength(fieldsIdxName) + " length in bytes of " + fieldsIdxName + " file exists?=" + state.directory.fileExists(fieldsIdxName));
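
The size check above encodes the .fdx layout of this revision: a 4-byte format header followed by one 8-byte pointer into the .fdt file per stored document. So, for example, a store holding 1000 documents must produce a fields-index file of exactly 4 + 1000*8 = 8004 bytes; any other length means the flush lost or duplicated documents.
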
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/Term.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/Term.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/Term.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/Term.java Tue Apr 6 19:19:27 2010
@@ -1,7 +1,5 @@
package org.apache.lucene.index;
-import org.apache.lucene.util.StringHelper;
-
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -19,6 +17,8 @@ import org.apache.lucene.util.StringHelp
* limitations under the License.
*/
+import org.apache.lucene.util.StringHelper;
+
/**
A Term represents a word from text. This is the unit of search. It is
composed of two elements, the text of the word, as a string, and the name of
@@ -35,7 +35,7 @@ public final class Term implements Compa
* <p>Note that a null field or null text value results in undefined
* behavior for most Lucene APIs that accept a Term parameter. */
public Term(String fld, String txt) {
- field = StringHelper.intern(fld);
+ field = fld == null ? null : StringHelper.intern(fld);
text = txt;
}
@@ -49,7 +49,8 @@ public final class Term implements Compa
this(fld, "", true);
}
- Term(String fld, String txt, boolean intern) {
+ /** @lucene.experimental */
+ public Term(String fld, String txt, boolean intern) {
field = intern ? StringHelper.intern(fld) : fld; // field names are interned
text = txt; // unless already known to be
}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermDocs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermDocs.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermDocs.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermDocs.java Tue Apr 6 19:19:27 2010
@@ -27,8 +27,10 @@ import java.io.Closeable;
ordered by document number.
@see IndexReader#termDocs()
- */
+ @deprecated Use {@link DocsEnum} instead
+*/
+@Deprecated
public interface TermDocs extends Closeable {
/** Sets this to the data for a term.
* The enumeration is reset to the start of the data for this term.
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermEnum.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermEnum.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermEnum.java Tue Apr 6 19:19:27 2010
@@ -23,8 +23,10 @@ import java.io.Closeable;
/** Abstract class for enumerating terms.
<p>Term enumerations are always ordered by Term.compareTo(). Each term in
- the enumeration is greater than all that precede it. */
+ the enumeration is greater than all that precede it.
+* @deprecated Use TermsEnum instead */
+@Deprecated
public abstract class TermEnum implements Closeable {
/** Increments the enumeration to the next element. True if one exists.*/
public abstract boolean next() throws IOException;
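
TermEnum iterates (field, text) pairs across the whole index; its flex replacement TermsEnum is per-field and yields raw term bytes, returning null when exhausted (the contract LegacyTermEnum in SegmentReader relies on above). A minimal migration sketch; the field name is illustrative and reader is assumed open:

    Terms terms = reader.fields().terms("body");
    if (terms != null) {
      TermsEnum termsEnum = terms.iterator();
      BytesRef term;
      while ((term = termsEnum.next()) != null) {
        int df = termsEnum.docFreq();
        String text = term.utf8ToString();   // only if the String is needed
      }
    }
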
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermPositions.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermPositions.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermPositions.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermPositions.java Tue Apr 6 19:19:27 2010
@@ -26,8 +26,9 @@ import java.io.IOException;
* positions of each occurrence of a term in a document.
*
* @see IndexReader#termPositions()
+ * @deprecated Use {@link DocsAndPositionsEnum} instead
*/
-
+@Deprecated
public interface TermPositions
extends TermDocs
{
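
The flex replacement is DocsAndPositionsEnum, obtained from TermsEnum.docsAndPositions(); note it can come back null when the field was indexed without positions, which is exactly why LegacyTermPositions in SegmentReader falls back to docs(). A consuming sketch, assuming termsEnum is already positioned on a term:

    DocsAndPositionsEnum postings =
        termsEnum.docsAndPositions(reader.getDeletedDocs(), null);
    if (postings != null) {
      while (postings.nextDoc() != DocsEnum.NO_MORE_DOCS) {
        final int freq = postings.freq();
        for (int i = 0; i < freq; i++) {
          final int pos = postings.nextPosition();
          if (postings.hasPayload()) {
            BytesRef payload = postings.getPayload();  // payload for this position
          }
        }
      }
    }
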
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java Tue Apr 6 19:19:27 2010
@@ -22,7 +22,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.BytesRef;
final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
@@ -106,6 +106,8 @@ final class TermVectorsTermsWriterPerFie
final int numPostings = termsHashPerField.numPostings;
+ final BytesRef flushTerm = perThread.flushTerm;
+
assert numPostings >= 0;
if (!doVectors || numPostings == 0)
@@ -126,7 +128,9 @@ final class TermVectorsTermsWriterPerFie
perThread.doc.addField(termsHashPerField.fieldInfo.number);
TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
- final int[] termIDs = termsHashPerField.sortPostings();
+ // TODO: we may want to make this sort in same order
+ // as Codec's terms dict?
+ final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUTF16Comparator());
tvf.writeVInt(numPostings);
byte bits = 0x0;
@@ -136,46 +140,40 @@ final class TermVectorsTermsWriterPerFie
bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
tvf.writeByte(bits);
- int encoderUpto = 0;
- int lastTermBytesCount = 0;
-
+ int lastLen = 0;
+ byte[] lastBytes = null;
+ int lastStart = 0;
+
final ByteSliceReader reader = perThread.vectorSliceReader;
- final char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
+ final ByteBlockPool termBytePool = perThread.termsHashPerThread.termBytePool;
+
for(int j=0;j<numPostings;j++) {
final int termID = termIDs[j];
final int freq = postings.freqs[termID];
- final char[] text2 = charBuffers[postings.textStarts[termID] >> DocumentsWriter.CHAR_BLOCK_SHIFT];
- final int start2 = postings.textStarts[termID] & DocumentsWriter.CHAR_BLOCK_MASK;
+ // Get BytesRef
+ termBytePool.setBytesRef(flushTerm, postings.textStarts[termID]);
- // We swap between two encoders to save copying
- // last Term's byte array
- final UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];
-
- // TODO: we could do this incrementally
- UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
- final int termBytesCount = utf8Result.length;
-
- // TODO: UTF16toUTF8 could tell us this prefix
- // Compute common prefix between last term and
+ // Compute common byte prefix between last term and
// this term
int prefix = 0;
if (j > 0) {
- final byte[] lastTermBytes = perThread.utf8Results[1-encoderUpto].result;
- final byte[] termBytes = perThread.utf8Results[encoderUpto].result;
- while(prefix < lastTermBytesCount && prefix < termBytesCount) {
- if (lastTermBytes[prefix] != termBytes[prefix])
+ while(prefix < lastLen && prefix < flushTerm.length) {
+ if (lastBytes[lastStart+prefix] != flushTerm.bytes[flushTerm.offset+prefix]) {
break;
+ }
prefix++;
}
}
- encoderUpto = 1-encoderUpto;
- lastTermBytesCount = termBytesCount;
- final int suffix = termBytesCount - prefix;
+ lastLen = flushTerm.length;
+ lastBytes = flushTerm.bytes;
+ lastStart = flushTerm.offset;
+
+ final int suffix = flushTerm.length - prefix;
tvf.writeVInt(prefix);
tvf.writeVInt(suffix);
- tvf.writeBytes(utf8Result.result, prefix, suffix);
+ tvf.writeBytes(flushTerm.bytes, lastStart+prefix, suffix);
tvf.writeVInt(freq);
if (doVectorPositions) {
@@ -209,9 +207,7 @@ final class TermVectorsTermsWriterPerFie
@Override
void newTerm(final int termID) {
-
assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");
-
TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
postings.freqs[termID] = 1;
@@ -275,23 +271,25 @@ final class TermVectorsTermsWriterPerFie
int[] lastOffsets; // Last offset we saw
int[] lastPositions; // Last position where this term occurred
+ ParallelPostingsArray newInstance(int size) {
+ return new TermVectorsPostingsArray(size);
+ }
+
@Override
- ParallelPostingsArray resize(int newSize) {
- TermVectorsPostingsArray newArray = new TermVectorsPostingsArray(newSize);
- copy(this, newArray);
- return newArray;
+ void copyTo(ParallelPostingsArray toArray, int numToCopy) {
+ assert toArray instanceof TermVectorsPostingsArray;
+ TermVectorsPostingsArray to = (TermVectorsPostingsArray) toArray;
+
+ super.copyTo(toArray, numToCopy);
+
+ System.arraycopy(freqs, 0, to.freqs, 0, size);
+ System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, size);
+ System.arraycopy(lastPositions, 0, to.lastPositions, 0, size);
}
-
- void copy(TermVectorsPostingsArray fromArray, TermVectorsPostingsArray toArray) {
- super.copy(fromArray, toArray);
- System.arraycopy(fromArray.freqs, 0, toArray.freqs, 0, fromArray.freqs.length);
- System.arraycopy(fromArray.lastOffsets, 0, toArray.lastOffsets, 0, fromArray.lastOffsets.length);
- System.arraycopy(fromArray.lastPositions, 0, toArray.lastPositions, 0, fromArray.lastPositions.length);
+
+ @Override
+ int bytesPerPosting() {
+ return super.bytesPerPosting() + 3 * DocumentsWriter.INT_NUM_BYTE;
}
}
-
- @Override
- int bytesPerPosting() {
- return ParallelPostingsArray.BYTES_PER_POSTING + 3 * DocumentsWriter.INT_NUM_BYTE;
- }
}
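
The tvf writing above is shared-prefix compression: terms arrive in sorted order, and each one stores how many leading bytes it shares with the previous term (prefix), how many bytes are new (suffix), and then only the new bytes. A worked example with illustrative terms:

    sorted terms:   "foo", "foobar", "fox"
    written as (prefix, suffix, bytes):
      (0, 3, "foo")    first term, shares nothing
      (3, 3, "bar")    shares "foo" with the previous term
      (2, 1, "x")      shares "fo" with "foobar"
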
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java Tue Apr 6 19:19:27 2010
@@ -17,13 +17,14 @@ package org.apache.lucene.index;
* limitations under the License.
*/
-import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.BytesRef;
final class TermVectorsTermsWriterPerThread extends TermsHashConsumerPerThread {
final TermVectorsTermsWriter termsWriter;
final TermsHashPerThread termsHashPerThread;
final DocumentsWriter.DocState docState;
+ final BytesRef flushTerm = new BytesRef();
TermVectorsTermsWriter.PerDoc doc;
@@ -36,9 +37,6 @@ final class TermVectorsTermsWriterPerThr
// Used by perField when serializing the term vectors
final ByteSliceReader vectorSliceReader = new ByteSliceReader();
- final UnicodeUtil.UTF8Result utf8Results[] = {new UnicodeUtil.UTF8Result(),
- new UnicodeUtil.UTF8Result()};
-
@Override
public void startDocument() {
assert clearLastVectorFieldName();
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java Tue Apr 6 19:19:27 2010
@@ -19,6 +19,7 @@ package org.apache.lucene.index;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil;
@@ -28,8 +29,7 @@ final class TermVectorsWriter {
private IndexOutput tvx = null, tvd = null, tvf = null;
private FieldInfos fieldInfos;
- final UnicodeUtil.UTF8Result[] utf8Results = new UnicodeUtil.UTF8Result[] {new UnicodeUtil.UTF8Result(),
- new UnicodeUtil.UTF8Result()};
+ final BytesRef[] utf8Results = new BytesRef[] {new BytesRef(10), new BytesRef(10)};
public TermVectorsWriter(Directory directory, String segment,
FieldInfos fieldInfos)
@@ -107,14 +107,14 @@ final class TermVectorsWriter {
UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].length(), utf8Results[utf8Upto]);
- int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].result,
+ int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].bytes,
utf8Results[1-utf8Upto].length,
- utf8Results[utf8Upto].result,
+ utf8Results[utf8Upto].bytes,
utf8Results[utf8Upto].length);
int length = utf8Results[utf8Upto].length - start;
tvf.writeVInt(start); // write shared prefix length
tvf.writeVInt(length); // write delta length
- tvf.writeBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes
+ tvf.writeBytes(utf8Results[utf8Upto].bytes, start, length); // write delta bytes
utf8Upto = 1-utf8Upto;
final int termFreq = freqs[j];
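
The utf8Upto toggle above is a two-buffer trick: flipping between two BytesRefs keeps the previous term's UTF-8 bytes alive for one more iteration without copying, so bytesDifference can compute the shared prefix in place. The pattern in isolation (sortedTerms is illustrative):

    final BytesRef[] buf = { new BytesRef(10), new BytesRef(10) };
    int upto = 0;
    for (String term : sortedTerms) {
      UnicodeUtil.UTF16toUTF8(term, 0, term.length(), buf[upto]);
      int start = StringHelper.bytesDifference(
          buf[1 - upto].bytes, buf[1 - upto].length,   // previous term
          buf[upto].bytes, buf[upto].length);          // current term
      // write start (shared prefix) / delta length / delta bytes here
      upto = 1 - upto;                                 // current becomes previous
    }
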
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java Tue Apr 6 19:19:27 2010
@@ -34,8 +34,6 @@ abstract class TermsHashConsumerPerField
abstract void newTerm(int termID) throws IOException;
abstract void addTerm(int termID) throws IOException;
abstract int getStreamCount();
-
- abstract ParallelPostingsArray createPostingsArray(int size);
- abstract int bytesPerPosting();
+ abstract ParallelPostingsArray createPostingsArray(int size);
}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java Tue Apr 6 19:19:27 2010
@@ -19,10 +19,13 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.Arrays;
+import java.util.Comparator;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.RamUsageEstimator;
final class TermsHashPerField extends InvertedDocConsumerPerField {
@@ -32,12 +35,12 @@ final class TermsHashPerField extends In
final TermsHashPerThread perThread;
final DocumentsWriter.DocState docState;
final FieldInvertState fieldState;
- TermAttribute termAtt;
-
+ TermToBytesRefAttribute termAtt;
+
// Copied from our perThread
- final CharBlockPool charPool;
final IntBlockPool intPool;
final ByteBlockPool bytePool;
+ final ByteBlockPool termBytePool;
final int streamCount;
final int numPostingInt;
@@ -52,43 +55,42 @@ final class TermsHashPerField extends In
private int[] postingsHash;
ParallelPostingsArray postingsArray;
-
- private final int bytesPerPosting;
-
+ private final BytesRef utf8;
+ private Comparator<BytesRef> termComp;
+
public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
this.perThread = perThread;
intPool = perThread.intPool;
- charPool = perThread.charPool;
bytePool = perThread.bytePool;
+ termBytePool = perThread.termBytePool;
docState = perThread.docState;
+
postingsHash = new int[postingsHashSize];
Arrays.fill(postingsHash, -1);
+ bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT);
+
fieldState = docInverterPerField.fieldState;
this.consumer = perThread.consumer.addField(this, fieldInfo);
+ postingsArray = consumer.createPostingsArray(postingsHashSize/2);
+ bytesUsed(postingsArray.size * postingsArray.bytesPerPosting());
+
streamCount = consumer.getStreamCount();
numPostingInt = 2*streamCount;
+ utf8 = perThread.utf8;
this.fieldInfo = fieldInfo;
if (nextPerThread != null)
nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
else
nextPerField = null;
-
- // +3: Posting is referenced by hash, which
- // targets 25-50% fill factor; approximate this
- // as 3X # pointers
- bytesPerPosting = consumer.bytesPerPosting() + 3*DocumentsWriter.INT_NUM_BYTE;
}
-
- void initPostingsArray() {
- assert postingsArray == null;
- postingsArray = consumer.createPostingsArray(postingsHashSize);
-
+ // sugar: just forwards to DW
+ private void bytesUsed(long size) {
if (perThread.termsHash.trackAllocations) {
- perThread.termsHash.docWriter.bytesAllocated(bytesPerPosting * postingsHashSize);
+ perThread.termsHash.docWriter.bytesUsed(size);
}
}
-
+
void shrinkHash(int targetSize) {
assert postingsCompacted || numPostings == 0;
@@ -100,13 +102,20 @@ final class TermsHashPerField extends In
}
if (newSize != postingsHash.length) {
+ final long previousSize = postingsHash.length;
postingsHash = new int[newSize];
+ bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT);
Arrays.fill(postingsHash, -1);
- postingsArray = null;
postingsHashSize = newSize;
postingsHashHalfSize = newSize/2;
postingsHashMask = newSize-1;
}
+
+ if (postingsArray != null) {
+ final int startSize = postingsArray.size;
+ postingsArray = postingsArray.shrink(targetSize, false);
+ bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - startSize));
+ }
}
public void reset() {
@@ -129,14 +138,10 @@ final class TermsHashPerField extends In
nextPerField.abort();
}
- private void growParallelPostingsArray() {
- int oldSize = postingsArray.byteStarts.length;
- int newSize = (int) (oldSize * 1.5);
- this.postingsArray = this.postingsArray.resize(newSize);
-
- if (perThread.termsHash.trackAllocations) {
- perThread.termsHash.docWriter.bytesAllocated(bytesPerPosting * (newSize - oldSize));
- }
+ private final void growParallelPostingsArray() {
+ int oldSize = postingsArray.size;
+ this.postingsArray = this.postingsArray.grow();
+ bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize));
}
public void initReader(ByteSliceReader reader, int termID, int stream) {
@@ -166,7 +171,8 @@ final class TermsHashPerField extends In
}
/** Collapse the hash table & sort in-place. */
- public int[] sortPostings() {
+ public int[] sortPostings(Comparator<BytesRef> termComp) {
+ this.termComp = termComp;
compactPostings();
quickSort(postingsHash, 0, numPostings-1);
return postingsHash;
@@ -237,50 +243,48 @@ final class TermsHashPerField extends In
* returns -1 if p1 < p2; 1 if p1 > p2; else 0. */
int comparePostings(int term1, int term2) {
- if (term1 == term2)
+ if (term1 == term2) {
+ // Our quicksort does this, eg during partition
return 0;
-
- final int textStart1 = postingsArray.textStarts[term1];
- final int textStart2 = postingsArray.textStarts[term2];
-
- final char[] text1 = charPool.buffers[textStart1 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
- int pos1 = textStart1 & DocumentsWriter.CHAR_BLOCK_MASK;
- final char[] text2 = charPool.buffers[textStart2 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
- int pos2 = textStart2 & DocumentsWriter.CHAR_BLOCK_MASK;
-
- assert text1 != text2 || pos1 != pos2;
-
- while(true) {
- final char c1 = text1[pos1++];
- final char c2 = text2[pos2++];
- if (c1 != c2) {
- if (0xffff == c2)
- return 1;
- else if (0xffff == c1)
- return -1;
- else
- return c1-c2;
- } else
- // This method should never compare equal postings
- // unless p1==p2
- assert c1 != 0xffff;
}
+
+ termBytePool.setBytesRef(perThread.tr1, postingsArray.textStarts[term1]);
+ termBytePool.setBytesRef(perThread.tr2, postingsArray.textStarts[term2]);
+
+ return termComp.compare(perThread.tr1, perThread.tr2);
}
/** Test whether the text for current RawPostingList p equals
- * current tokenText. */
- private boolean postingEquals(final int termID, final char[] tokenText, final int tokenTextLen) {
+ * current tokenText in utf8. */
+ private boolean postingEquals(final int termID) {
final int textStart = postingsArray.textStarts[termID];
-
- final char[] text = perThread.charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+ final byte[] text = termBytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
assert text != null;
- int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
- int tokenPos = 0;
- for(;tokenPos<tokenTextLen;pos++,tokenPos++)
- if (tokenText[tokenPos] != text[pos])
- return false;
- return 0xffff == text[pos];
+ int pos = textStart & DocumentsWriter.BYTE_BLOCK_MASK;
+
+ final int len;
+ if ((text[pos] & 0x80) == 0) {
+ // length is 1 byte
+ len = text[pos];
+ pos += 1;
+ } else {
+ // length is 2 bytes
+ len = (text[pos]&0x7f) + ((text[pos+1]&0xff)<<7);
+ pos += 2;
+ }
+
+ if (len == utf8.length) {
+ final byte[] utf8Bytes = utf8.bytes;
+ for(int tokenPos=0;tokenPos<utf8.length;pos++,tokenPos++) {
+ if (utf8Bytes[tokenPos] != text[pos]) {
+ return false;
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
}
private boolean doCall;
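
Terms now live in the byte pool behind a 1-or-2-byte length prefix: lengths under 128 take one byte; otherwise the low seven bits go in the first byte with the high bit set and the remaining bits in the second (over-long terms are rejected earlier, so two bytes always suffice). Both directions in isolation, mirroring postingEquals above and the write path further down in this patch:

    // encode length at text[pos]
    if (len < 128) {
      text[pos] = (byte) len;                        // 1 byte
    } else {
      text[pos]   = (byte) (0x80 | (len & 0x7f));    // low 7 bits + flag
      text[pos+1] = (byte) ((len >> 7) & 0xff);      // high bits
    }

    // decode length at text[pos]
    final int decodedLen;
    if ((text[pos] & 0x80) == 0) {
      decodedLen = text[pos];                        // 1-byte form
    } else {
      decodedLen = (text[pos] & 0x7f) + ((text[pos+1] & 0xff) << 7);
    }
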
@@ -288,10 +292,14 @@ final class TermsHashPerField extends In
@Override
void start(Fieldable f) {
- if (postingsArray == null) {
- initPostingsArray();
+ if (fieldState.attributeSource.hasAttribute(TermToBytesRefAttribute.class)) {
+ termAtt = fieldState.attributeSource.getAttribute(TermToBytesRefAttribute.class);
+ } else if (fieldState.attributeSource.hasAttribute(TermAttribute.class)) {
+ perThread.legacyTermAttributeWrapper.setTermAttribute(fieldState.attributeSource.getAttribute(TermAttribute.class));
+ termAtt = perThread.legacyTermAttributeWrapper;
+ } else {
+ throw new IllegalArgumentException("Could not find a term attribute (that implements TermToBytesRefAttribute) in the TokenStream");
}
- termAtt = fieldState.attributeSource.addAttribute(TermAttribute.class);
consumer.start(f);
if (nextPerField != null) {
nextPerField.start(f);
@@ -337,12 +345,9 @@ final class TermsHashPerField extends In
// New posting
termID = numPostings++;
- if (termID >= postingsArray.textStarts.length) {
+ if (termID >= postingsArray.size) {
growParallelPostingsArray();
}
- if (perThread.termsHash.trackAllocations) {
- perThread.termsHash.docWriter.bytesUsed(bytesPerPosting);
- }
assert termID >= 0;
@@ -392,48 +397,15 @@ final class TermsHashPerField extends In
// We are first in the chain so we must "intern" the
// term text into textStart address
- // Get the text of this term.
- final char[] tokenText = termAtt.termBuffer();
- final int tokenTextLen = termAtt.termLength();
-
- // Compute hashcode & replace any invalid UTF16 sequences
- int downto = tokenTextLen;
- int code = 0;
- while (downto > 0) {
- char ch = tokenText[--downto];
-
- if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
- if (0 == downto) {
- // Unpaired
- ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
- } else {
- final char ch2 = tokenText[downto-1];
- if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) {
- // OK: high followed by low. This is a valid
- // surrogate pair.
- code = ((code*31) + ch)*31+ch2;
- downto--;
- continue;
- } else {
- // Unpaired
- ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
- }
- }
- } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END ||
- ch == 0xffff)) {
- // Unpaired or 0xffff
- ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
- }
-
- code = (code*31) + ch;
- }
+ // Get the text & hash of this term.
+ int code = termAtt.toBytesRef(utf8);
int hashPos = code & postingsHashMask;
// Locate RawPostingList in hash
int termID = postingsHash[hashPos];
- if (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)) {
+ if (termID != -1 && !postingEquals(termID)) {
// Conflict: keep searching different locations in
// the hash table.
final int inc = ((code>>8)+code)|1;
@@ -441,61 +413,86 @@ final class TermsHashPerField extends In
code += inc;
hashPos = code & postingsHashMask;
termID = postingsHash[hashPos];
- } while (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen));
+ } while (termID != -1 && !postingEquals(termID));
}
if (termID == -1) {
// First time we are seeing this token since we last
// flushed the hash.
- final int textLen1 = 1+tokenTextLen;
- if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) {
- if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) {
+ final int textLen2 = 2+utf8.length;
+ if (textLen2 + bytePool.byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE) {
+ // Not enough room in current block
+
+ if (utf8.length > DocumentsWriter.MAX_TERM_LENGTH_UTF8) {
// Just skip this term, to remain as robust as
// possible during indexing. A TokenFilter
// can be inserted into the analyzer chain if
// other behavior is wanted (pruning the term
// to a prefix, throwing an exception, etc).
-
- if (docState.maxTermPrefix == null)
- docState.maxTermPrefix = new String(tokenText, 0, 30);
+ if (docState.maxTermPrefix == null) {
+ final int saved = utf8.length;
+ try {
+ utf8.length = Math.min(30, DocumentsWriter.MAX_TERM_LENGTH_UTF8);
+ docState.maxTermPrefix = utf8.toString();
+ } finally {
+ utf8.length = saved;
+ }
+ }
consumer.skippingLongTerm();
return;
}
- charPool.nextBuffer();
+ bytePool.nextBuffer();
}
// New posting
termID = numPostings++;
- if (termID >= postingsArray.textStarts.length) {
+ if (termID >= postingsArray.size) {
growParallelPostingsArray();
}
- if (perThread.termsHash.trackAllocations) {
- perThread.termsHash.docWriter.bytesUsed(bytesPerPosting);
- }
assert termID != -1;
-
- final char[] text = charPool.buffer;
- final int textUpto = charPool.charUpto;
- postingsArray.textStarts[termID] = textUpto + charPool.charOffset;
- charPool.charUpto += textLen1;
- System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
- text[textUpto+tokenTextLen] = 0xffff;
-
assert postingsHash[hashPos] == -1;
+
postingsHash[hashPos] = termID;
- if (numPostings == postingsHashHalfSize)
+ final byte[] text = bytePool.buffer;
+ final int textUpto = bytePool.byteUpto;
+ postingsArray.textStarts[termID] = textUpto + bytePool.byteOffset;
+
+ // We first encode the length, followed by the UTF8
+ // bytes. Length is encoded as vInt, but will consume
+ // 1 or 2 bytes at most (we reject too-long terms,
+ // above).
+
+ // encode length @ start of bytes
+ if (utf8.length < 128) {
+ // 1 byte to store length
+ text[textUpto] = (byte) utf8.length;
+ bytePool.byteUpto += utf8.length + 1;
+ System.arraycopy(utf8.bytes, 0, text, textUpto+1, utf8.length);
+ } else {
+ // 2 bytes to store length
+ text[textUpto] = (byte) (0x80 | (utf8.length & 0x7f));
+ text[textUpto+1] = (byte) ((utf8.length>>7) & 0xff);
+ bytePool.byteUpto += utf8.length + 2;
+ System.arraycopy(utf8.bytes, 0, text, textUpto+2, utf8.length);
+ }
+
+ if (numPostings == postingsHashHalfSize) {
rehashPostings(2*postingsHashSize);
+ bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT);
+ }
// Init stream slices
- if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
+ if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) {
intPool.nextBuffer();
+ }
- if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
+ if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
bytePool.nextBuffer();
+ }
intUptos = intPool.buffer;
intUptoStart = intPool.intUpto;
@@ -577,16 +574,28 @@ final class TermsHashPerField extends In
int code;
if (perThread.primary) {
final int textStart = postingsArray.textStarts[termID];
- final int start = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
- final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
- int pos = start;
- while(text[pos] != 0xffff)
- pos++;
+ final int start = textStart & DocumentsWriter.BYTE_BLOCK_MASK;
+ final byte[] text = bytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
code = 0;
- while (pos > start)
- code = (code*31) + text[--pos];
- } else
+
+ final int len;
+ int pos;
+ if ((text[start] & 0x80) == 0) {
+ // length is 1 byte
+ len = text[start];
+ pos = start+1;
+ } else {
+ len = (text[start]&0x7f) + ((text[start+1]&0xff)<<7);
+ pos = start+2;
+ }
+
+ final int endPos = pos+len;
+ while(pos < endPos) {
+ code = (code*31) + text[pos++];
+ }
+ } else {
code = postingsArray.textStarts[termID];
+ }
int hashPos = code & newMask;
assert hashPos >= 0;
@@ -603,6 +612,7 @@ final class TermsHashPerField extends In
postingsHashMask = newMask;
postingsHash = newHash;
+
postingsHashSize = newSize;
postingsHashHalfSize = newSize >> 1;
}
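
The postings hash above is open-addressed: slots hold term IDs with -1 meaning empty, the first probe is code & mask over a power-of-two table, and collisions re-probe with a fixed per-key increment ((code>>8)+code)|1, forced odd so the probe sequence visits every slot. The probe loop in isolation (equalsStored is a stand-in for postingEquals):

    int hashPos = code & mask;                   // mask = tableSize - 1
    int termID = table[hashPos];
    if (termID != -1 && !equalsStored(termID)) {
      final int inc = ((code >> 8) + code) | 1;  // odd => full-cycle probing
      do {
        code += inc;
        hashPos = code & mask;
        termID = table[hashPos];
      } while (termID != -1 && !equalsStored(termID));
    }
    // termID == -1: hashPos is a free slot; otherwise the term already exists
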
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java Tue Apr 6 19:19:27 2010
@@ -17,6 +17,11 @@ package org.apache.lucene.index;
* limitations under the License.
*/
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+
import java.io.IOException;
final class TermsHashPerThread extends InvertedDocConsumerPerThread {
@@ -25,30 +30,54 @@ final class TermsHashPerThread extends I
final TermsHashConsumerPerThread consumer;
final TermsHashPerThread nextPerThread;
- final CharBlockPool charPool;
final IntBlockPool intPool;
final ByteBlockPool bytePool;
+ final ByteBlockPool termBytePool;
final boolean primary;
final DocumentsWriter.DocState docState;
+ // Used when comparing postings via termRefComp, in TermsHashPerField
+ final BytesRef tr1 = new BytesRef();
+ final BytesRef tr2 = new BytesRef();
+
+ // Used by perField:
+ final BytesRef utf8 = new BytesRef(10);
+
+ final LegacyTermAttributeWrapper legacyTermAttributeWrapper = new LegacyTermAttributeWrapper();
+
+ /** This class is used to wrap a legacy TermAttribute without support for {@link TermToBytesRefAttribute}. */
+ @Deprecated
+ static class LegacyTermAttributeWrapper implements TermToBytesRefAttribute {
+ private TermAttribute termAtt = null;
+
+ void setTermAttribute(TermAttribute termAtt) {
+ this.termAtt = termAtt;
+ }
+
+ public int toBytesRef(BytesRef target) {
+      assert target.bytes != null : "target BytesRef must have non-null bytes, because the shared utf8 scratch is filled here";
+ return UnicodeUtil.UTF16toUTF8WithHash(termAtt.termBuffer(), 0, termAtt.termLength(), target);
+ }
+ }
+
public TermsHashPerThread(DocInverterPerThread docInverterPerThread, final TermsHash termsHash, final TermsHash nextTermsHash, final TermsHashPerThread primaryPerThread) {
docState = docInverterPerThread.docState;
this.termsHash = termsHash;
this.consumer = termsHash.consumer.addThread(this);
+ intPool = new IntBlockPool(termsHash.docWriter, termsHash.trackAllocations);
+ bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator, termsHash.trackAllocations);
+
if (nextTermsHash != null) {
// We are primary
- charPool = new CharBlockPool(termsHash.docWriter);
primary = true;
+ termBytePool = bytePool;
} else {
- charPool = primaryPerThread.charPool;
primary = false;
+ termBytePool = primaryPerThread.bytePool;
}
- intPool = new IntBlockPool(termsHash.docWriter, termsHash.trackAllocations);
- bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator, termsHash.trackAllocations);
-
if (nextTermsHash != null)
nextPerThread = nextTermsHash.addThread(docInverterPerThread, this);
else
@@ -97,7 +126,8 @@ final class TermsHashPerThread extends I
intPool.reset();
bytePool.reset();
- if (primary)
- charPool.reset();
+ if (primary) {
+ bytePool.reset();
+ }
}
}
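
For token streams that expose only the legacy char-based TermAttribute, the wrapper added above adapts them to the byte-oriented TermToBytesRefAttribute. A hedged sketch of how such a wrapper would be wired up during inversion, given a TokenStream named stream; the driver code here is illustrative and not part of the commit.

  BytesRef termBytes = new BytesRef(10);
  TermToBytesRefAttribute byteAtt;
  if (stream.hasAttribute(TermToBytesRefAttribute.class)) {
    // Newer token streams can fill the BytesRef directly.
    byteAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  } else {
    // Legacy streams: wrap the char-based TermAttribute.
    LegacyTermAttributeWrapper wrapper = new LegacyTermAttributeWrapper();
    wrapper.setTermAttribute(stream.getAttribute(TermAttribute.class));
    byteAtt = wrapper;
  }
  // Per the wrapper above, toBytesRef() fills termBytes with the term's
  // UTF-8 form and returns its hash code in a single pass.
  int code = byteAtt.toBytesRef(termBytes);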
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java Tue Apr 6 19:19:27 2010
@@ -161,8 +161,8 @@ public class ConstantScoreQuery extends
/** Prints a user-readable version of this query. */
@Override
public String toString(String field) {
- return "ConstantScore(" + filter.toString()
- + (getBoost()==1.0 ? ")" : "^" + getBoost());
+ return "ConstantScore(" + filter.toString() + ")"
+ + (getBoost()==1.0 ? "" : "^" + getBoost());
}
/** Returns true if <code>o</code> is equal to this. */
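
The old toString() dropped the closing parenthesis whenever a boost was set, because the ")" lived inside the boost conditional. A quick illustration with a boost of 2.0f and a filter printing as "myFilter" (values are illustrative):

  float boost = 2.0f;
  String filter = "myFilter";
  // before: ")" was only emitted when boost == 1.0
  String before = "ConstantScore(" + filter + (boost == 1.0 ? ")" : "^" + boost);
  // -> "ConstantScore(myFilter^2.0"      (unbalanced)
  String after = "ConstantScore(" + filter + ")" + (boost == 1.0 ? "" : "^" + boost);
  // -> "ConstantScore(myFilter)^2.0"     (paren always closed)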
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java Tue Apr 6 19:19:27 2010
@@ -22,9 +22,9 @@ import org.apache.lucene.index.*;
final class ExactPhraseScorer extends PhraseScorer {
- ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets,
+ ExactPhraseScorer(Weight weight, DocsAndPositionsEnum[] postings, int[] offsets,
Similarity similarity, byte[] norms) {
- super(weight, tps, offsets, similarity, norms);
+ super(weight, postings, offsets, similarity, norms);
}
@Override
@@ -42,11 +42,11 @@ final class ExactPhraseScorer extends Ph
int freq = 0;
do { // find position w/ all terms
while (first.position < last.position) { // scan forward in first
- do {
- if (!first.nextPosition())
- return freq;
- } while (first.position < last.position);
- firstToLast();
+ do {
+ if (!first.nextPosition())
+ return freq;
+ } while (first.position < last.position);
+ firstToLast();
}
freq++; // all equal: a match
} while (last.nextPosition());
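
The loop above aligns the phrase terms' position lists: it scans the first term forward until every term sits at the same offset-adjusted position, counting each full alignment as one match. A standalone sketch of the same alignment over plain sorted int[] lists, assuming each list has already been shifted by its term's offset within the phrase; this is illustrative, not the committed scorer.

  static int phraseFreq(int[][] positions) {
    int freq = 0;
    int[] idx = new int[positions.length];   // cursor into each list
    outer:
    while (idx[0] < positions[0].length) {
      int candidate = positions[0][idx[0]];
      for (int i = 1; i < positions.length; i++) {
        // Scan forward in list i until it reaches the candidate.
        while (idx[i] < positions[i].length && positions[i][idx[i]] < candidate) {
          idx[i]++;
        }
        if (idx[i] == positions[i].length) {
          break outer;                       // a term is exhausted: done
        }
        if (positions[i][idx[i]] > candidate) {
          idx[0]++;                          // candidate failed; advance first term
          continue outer;
        }
      }
      freq++;                                // all lists agree: one phrase match
      idx[0]++;
    }
    return freq;
  }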
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/FieldCache.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/FieldCache.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/FieldCache.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/FieldCache.java Tue Apr 6 19:19:27 2010
@@ -20,6 +20,7 @@ package org.apache.lucene.search;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.document.NumericField; // for javadocs
import org.apache.lucene.analysis.NumericTokenStream; // for javadocs
@@ -100,7 +101,7 @@ public interface FieldCache {
*/
public interface ByteParser extends Parser {
/** Return a single Byte representation of this field's value. */
- public byte parseByte(String string);
+ public byte parseByte(BytesRef term);
}
/** Interface to parse shorts from document fields.
@@ -108,7 +109,7 @@ public interface FieldCache {
*/
public interface ShortParser extends Parser {
/** Return a short representation of this field's value. */
- public short parseShort(String string);
+ public short parseShort(BytesRef term);
}
/** Interface to parse ints from document fields.
@@ -116,7 +117,7 @@ public interface FieldCache {
*/
public interface IntParser extends Parser {
/** Return an integer representation of this field's value. */
- public int parseInt(String string);
+ public int parseInt(BytesRef term);
}
/** Interface to parse floats from document fields.
@@ -124,7 +125,7 @@ public interface FieldCache {
*/
public interface FloatParser extends Parser {
/** Return a float representation of this field's value. */
- public float parseFloat(String string);
+ public float parseFloat(BytesRef term);
}
/** Interface to parse longs from document fields.
@@ -132,7 +133,7 @@ public interface FieldCache {
*/
public interface LongParser extends Parser {
/** Return a long representation of this field's value. */
- public long parseLong(String string);
+ public long parseLong(BytesRef term);
}
/** Interface to parse doubles from document fields.
@@ -140,16 +141,20 @@ public interface FieldCache {
*/
public interface DoubleParser extends Parser {
/** Return a double representation of this field's value. */
- public double parseDouble(String string);
+ public double parseDouble(BytesRef term);
}
/** Expert: The cache used internally by sorting and range query classes. */
public static FieldCache DEFAULT = new FieldCacheImpl();
-
+
/** The default parser for byte values, which are encoded by {@link Byte#toString(byte)} */
public static final ByteParser DEFAULT_BYTE_PARSER = new ByteParser() {
- public byte parseByte(String value) {
- return Byte.parseByte(value);
+ public byte parseByte(BytesRef term) {
+ // TODO: would be far better to directly parse from
+ // UTF8 bytes... but really users should use
+ // NumericField, instead, which already decodes
+ // directly from byte[]
+ return Byte.parseByte(term.utf8ToString());
}
protected Object readResolve() {
return DEFAULT_BYTE_PARSER;
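
The TODO above notes that round-tripping through utf8ToString() costs an allocation per term. Since ASCII digits occupy single bytes in UTF-8, a decimal value can be read straight off the term bytes. A hedged sketch of such a direct parser (hypothetical helper; no overflow or malformed-input handling):

  static int parseIntFromUtf8(BytesRef term) {
    int i = term.offset;
    final int end = term.offset + term.length;
    final boolean negative = term.bytes[i] == '-';
    if (negative) i++;
    int value = 0;
    while (i < end) {
      value = value * 10 + (term.bytes[i++] - '0');  // ASCII digit to int
    }
    return negative ? -value : value;
  }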
@@ -162,8 +167,12 @@ public interface FieldCache {
/** The default parser for short values, which are encoded by {@link Short#toString(short)} */
public static final ShortParser DEFAULT_SHORT_PARSER = new ShortParser() {
- public short parseShort(String value) {
- return Short.parseShort(value);
+ public short parseShort(BytesRef term) {
+ // TODO: would be far better to directly parse from
+ // UTF8 bytes... but really users should use
+ // NumericField, instead, which already decodes
+ // directly from byte[]
+ return Short.parseShort(term.utf8ToString());
}
protected Object readResolve() {
return DEFAULT_SHORT_PARSER;
@@ -176,8 +185,12 @@ public interface FieldCache {
/** The default parser for int values, which are encoded by {@link Integer#toString(int)} */
public static final IntParser DEFAULT_INT_PARSER = new IntParser() {
- public int parseInt(String value) {
- return Integer.parseInt(value);
+ public int parseInt(BytesRef term) {
+ // TODO: would be far better to directly parse from
+ // UTF8 bytes... but really users should use
+ // NumericField, instead, which already decodes
+ // directly from byte[]
+ return Integer.parseInt(term.utf8ToString());
}
protected Object readResolve() {
return DEFAULT_INT_PARSER;
@@ -190,8 +203,12 @@ public interface FieldCache {
/** The default parser for float values, which are encoded by {@link Float#toString(float)} */
public static final FloatParser DEFAULT_FLOAT_PARSER = new FloatParser() {
- public float parseFloat(String value) {
- return Float.parseFloat(value);
+ public float parseFloat(BytesRef term) {
+ // TODO: would be far better to directly parse from
+ // UTF8 bytes... but really users should use
+ // NumericField, instead, which already decodes
+ // directly from byte[]
+ return Float.parseFloat(term.utf8ToString());
}
protected Object readResolve() {
return DEFAULT_FLOAT_PARSER;
@@ -204,8 +221,12 @@ public interface FieldCache {
/** The default parser for long values, which are encoded by {@link Long#toString(long)} */
public static final LongParser DEFAULT_LONG_PARSER = new LongParser() {
- public long parseLong(String value) {
- return Long.parseLong(value);
+ public long parseLong(BytesRef term) {
+ // TODO: would be far better to directly parse from
+ // UTF8 bytes... but really users should use
+ // NumericField, instead, which already decodes
+ // directly from byte[]
+ return Long.parseLong(term.utf8ToString());
}
protected Object readResolve() {
return DEFAULT_LONG_PARSER;
@@ -218,8 +239,12 @@ public interface FieldCache {
/** The default parser for double values, which are encoded by {@link Double#toString(double)} */
public static final DoubleParser DEFAULT_DOUBLE_PARSER = new DoubleParser() {
- public double parseDouble(String value) {
- return Double.parseDouble(value);
+ public double parseDouble(BytesRef term) {
+ // TODO: would be far better to directly parse from
+ // UTF8 bytes... but really users should use
+ // NumericField, instead, which already decodes
+ // directly from byte[]
+ return Double.parseDouble(term.utf8ToString());
}
protected Object readResolve() {
return DEFAULT_DOUBLE_PARSER;
@@ -231,15 +256,14 @@ public interface FieldCache {
};
/**
- * A parser instance for int values encoded by {@link NumericUtils#intToPrefixCoded(int)}, e.g. when indexed
+ * A parser instance for int values encoded by {@link NumericUtils}, e.g. when indexed
* via {@link NumericField}/{@link NumericTokenStream}.
*/
public static final IntParser NUMERIC_UTILS_INT_PARSER=new IntParser(){
- public int parseInt(String val) {
- final int shift = val.charAt(0)-NumericUtils.SHIFT_START_INT;
- if (shift>0 && shift<=31)
+ public int parseInt(BytesRef term) {
+ if (NumericUtils.getPrefixCodedIntShift(term) > 0)
throw new FieldCacheImpl.StopFillCacheException();
- return NumericUtils.prefixCodedToInt(val);
+ return NumericUtils.prefixCodedToInt(term);
}
protected Object readResolve() {
return NUMERIC_UTILS_INT_PARSER;
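
Background for the shift check above: prefix-coded numeric terms store the trie shift in their first byte (SHIFT_START_INT plus the shift, mirroring the old first-char-of-String encoding), so only shift == 0 terms carry a full-precision value; any shifted term belongs to a coarser trie level and aborts cache filling. A sketch of the shift extraction, assuming that encoding (illustrative, not the NumericUtils source):

  static int prefixCodedIntShift(BytesRef term) {
    // First byte is assumed to encode SHIFT_START_INT + shift, 0 <= shift <= 31.
    return term.bytes[term.offset] - NumericUtils.SHIFT_START_INT;
  }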
@@ -255,11 +279,10 @@ public interface FieldCache {
* via {@link NumericField}/{@link NumericTokenStream}.
*/
public static final FloatParser NUMERIC_UTILS_FLOAT_PARSER=new FloatParser(){
- public float parseFloat(String val) {
- final int shift = val.charAt(0)-NumericUtils.SHIFT_START_INT;
- if (shift>0 && shift<=31)
+ public float parseFloat(BytesRef term) {
+ if (NumericUtils.getPrefixCodedIntShift(term) > 0)
throw new FieldCacheImpl.StopFillCacheException();
- return NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(val));
+ return NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(term));
}
protected Object readResolve() {
return NUMERIC_UTILS_FLOAT_PARSER;
@@ -271,15 +294,14 @@ public interface FieldCache {
};
/**
- * A parser instance for long values encoded by {@link NumericUtils#longToPrefixCoded(long)}, e.g. when indexed
+ * A parser instance for long values encoded by {@link NumericUtils}, e.g. when indexed
* via {@link NumericField}/{@link NumericTokenStream}.
*/
public static final LongParser NUMERIC_UTILS_LONG_PARSER = new LongParser(){
- public long parseLong(String val) {
- final int shift = val.charAt(0)-NumericUtils.SHIFT_START_LONG;
- if (shift>0 && shift<=63)
+ public long parseLong(BytesRef term) {
+ if (NumericUtils.getPrefixCodedLongShift(term) > 0)
throw new FieldCacheImpl.StopFillCacheException();
- return NumericUtils.prefixCodedToLong(val);
+ return NumericUtils.prefixCodedToLong(term);
}
protected Object readResolve() {
return NUMERIC_UTILS_LONG_PARSER;
@@ -295,11 +317,10 @@ public interface FieldCache {
* via {@link NumericField}/{@link NumericTokenStream}.
*/
public static final DoubleParser NUMERIC_UTILS_DOUBLE_PARSER = new DoubleParser(){
- public double parseDouble(String val) {
- final int shift = val.charAt(0)-NumericUtils.SHIFT_START_LONG;
- if (shift>0 && shift<=63)
+ public double parseDouble(BytesRef term) {
+ if (NumericUtils.getPrefixCodedLongShift(term) > 0)
throw new FieldCacheImpl.StopFillCacheException();
- return NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(val));
+ return NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(term));
}
protected Object readResolve() {
return NUMERIC_UTILS_DOUBLE_PARSER;
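
With this change, user-supplied parsers receive the raw term as a BytesRef instead of a String. A hedged sketch of a custom parser under the new signature, for ints indexed as plain decimal text; the field name, reader, and surrounding method (which would declare IOException) are illustrative:

  FieldCache.IntParser decimalParser = new FieldCache.IntParser() {
    public int parseInt(BytesRef term) {
      // Decode the UTF-8 term bytes, then parse; a direct byte walk
      // would avoid the intermediate String (see the earlier sketch).
      return Integer.parseInt(term.utf8ToString());
    }
  };
  int[] prices = FieldCache.DEFAULT.getInts(reader, "price", decimalParser);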