Posted to commits@lucene.apache.org by sh...@apache.org on 2013/05/07 13:21:14 UTC
svn commit: r1479862 [12/38] - in /lucene/dev/branches/lucene4258: ./
dev-tools/ dev-tools/idea/.idea/ dev-tools/idea/.idea/libraries/
dev-tools/maven/ dev-tools/maven/solr/ dev-tools/maven/solr/core/src/java/
dev-tools/maven/solr/solrj/src/java/ dev-t...
Modified: lucene/dev/branches/lucene4258/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java (original)
+++ lucene/dev/branches/lucene4258/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java Tue May 7 11:20:55 2013
@@ -315,8 +315,7 @@ class SimpleTextFieldsReader extends Fie
@Override
public int advance(int target) throws IOException {
// Naive -- better to index skip data
- while(nextDoc() < target);
- return docID;
+ return slowAdvance(target);
}
@Override
@@ -422,8 +421,7 @@ class SimpleTextFieldsReader extends Fie
@Override
public int advance(int target) throws IOException {
// Naive -- better to index skip data
- while(nextDoc() < target);
- return docID;
+ return slowAdvance(target);
}
@Override
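For context: slowAdvance is inherited from DocIdSetIterator and performs the same linear scan the removed code inlined, but with a correct stopping condition. A minimal sketch of its semantics (not the verbatim Lucene source):

    protected int slowAdvance(int target) throws IOException {
      int doc;
      do {
        doc = nextDoc();        // linear scan; no skip data needed
      } while (doc < target);   // stop at first doc >= target, or NO_MORE_DOCS
      return doc;
    }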
Modified: lucene/dev/branches/lucene4258/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java (original)
+++ lucene/dev/branches/lucene4258/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java Tue May 7 11:20:55 2013
@@ -430,12 +430,8 @@ public class SimpleTextTermVectorsReader
}
@Override
- public int advance(int target) {
- if (!didNext && target == 0) {
- return nextDoc();
- } else {
- return (doc = NO_MORE_DOCS);
- }
+ public int advance(int target) throws IOException {
+ return slowAdvance(target);
}
public void reset(Bits liveDocs, int freq) {
@@ -487,12 +483,8 @@ public class SimpleTextTermVectorsReader
}
@Override
- public int advance(int target) {
- if (!didNext && target == 0) {
- return nextDoc();
- } else {
- return (doc = NO_MORE_DOCS);
- }
+ public int advance(int target) throws IOException {
+ return slowAdvance(target);
}
public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets, BytesRef payloads[]) {
Modified: lucene/dev/branches/lucene4258/lucene/codecs/src/test/org/apache/lucene/codecs/diskdv/TestDiskDocValuesFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/codecs/src/test/org/apache/lucene/codecs/diskdv/TestDiskDocValuesFormat.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/codecs/src/test/org/apache/lucene/codecs/diskdv/TestDiskDocValuesFormat.java (original)
+++ lucene/dev/branches/lucene4258/lucene/codecs/src/test/org/apache/lucene/codecs/diskdv/TestDiskDocValuesFormat.java Tue May 7 11:20:55 2013
@@ -18,13 +18,13 @@ package org.apache.lucene.codecs.diskdv;
*/
import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.index.BaseDocValuesFormatTestCase;
+import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase;
import org.apache.lucene.util._TestUtil;
/**
* Tests DiskDocValuesFormat
*/
-public class TestDiskDocValuesFormat extends BaseDocValuesFormatTestCase {
+public class TestDiskDocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
private final Codec codec = _TestUtil.alwaysDocValuesFormat(new DiskDocValuesFormat());
@Override
Modified: lucene/dev/branches/lucene4258/lucene/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/common-build.xml?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/common-build.xml (original)
+++ lucene/dev/branches/lucene4258/lucene/common-build.xml Tue May 7 11:20:55 2013
@@ -47,16 +47,26 @@
<format property="dateversion" pattern="yyyy.MM.dd.HH.mm.ss" />
</tstamp>
- <property name="name" value="${ant.project.name}"/>
<property name="Name" value="Lucene"/>
- <property name="dev.version" value="5.0-SNAPSHOT"/>
+
+ <property name="name" value="${ant.project.name}"/>
<property name="tests.luceneMatchVersion" value="5.0"/>
+ <property name="dev.version.base" value="5.0"/>
+ <property name="dev.version.suffix" value="SNAPSHOT"/>
+ <property name="dev.version" value="${dev.version.base}-${dev.version.suffix}"/>
<property name="version" value="${dev.version}"/>
<property name="spec.version" value="${version}"/>
+
<property name="year" value="2000-${current.year}"/>
+
+ <!-- Lucene modules unfortunately don't have the "lucene-" prefix, so we add it if no prefix is given in $name: -->
+ <condition property="final.name" value="${name}-${version}">
+ <matches pattern="^(lucene|solr)\b" string="${name}"/>
+ </condition>
<property name="final.name" value="lucene-${name}-${version}"/>
- <property name="common.classpath.excludes" value="**/*.txt,**/*.template,**/*.sha1" />
+ <!-- we exclude ext/*.jar because we don't want example/lib/ext logging jars on the cp -->
+ <property name="common.classpath.excludes" value="**/*.txt,**/*.template,**/*.sha1,ext/*.jar" />
<property name="ivy.bootstrap.version" value="2.3.0" />
<property name="ivy.default.configuration" value="*"/>
@@ -106,6 +116,7 @@
<property name="tests.asserts.gracious" value="false"/>
<property name="tests.verbose" value="false"/>
<property name="tests.infostream" value="${tests.verbose}"/>
+ <property name="tests.filterstacks" value="true"/>
<condition property="tests.heapsize" value="768M">
<isset property="run.clover"/>
@@ -293,8 +304,7 @@
</or>
<or>
<equals arg1="${build.java.runtime}" arg2="1.7"/>
- <!-- TODO: Current Java 8 JDKs have broken Javadocs -->
- <!--<equals arg1="${build.java.runtime}" arg2="1.8"/>-->
+ <equals arg1="${build.java.runtime}" arg2="1.8"/>
</or>
<!-- TODO: Fix this! For now only run this on 64bit, because jTIDY OOMs with default heap size: -->
<contains string="${os.arch}" substring="64"/>
@@ -986,8 +996,25 @@
maxClassNameColumns="${tests.maxClassNameColumns}"
timestamps="${tests.timestamps}"
- showNumFailures="${tests.showNumFailures}"
- />
+ showNumFailures="${tests.showNumFailures}">
+
+ <!-- Filter stack traces. The default set of filters is similar to Ant's (reflection, assertions, junit's own stuff). -->
+ <junit4:filtertrace defaults="true" enabled="${tests.filterstacks}">
+ <!-- Lucene-specific stack frames (test rules mostly). -->
+ <containsstring contains="at com.carrotsearch.randomizedtesting.RandomizedRunner" />
+ <containsstring contains="at org.apache.lucene.util.AbstractBeforeAfterRule" />
+ <containsstring contains="at com.carrotsearch.randomizedtesting.rules." />
+ <containsstring contains="at org.apache.lucene.util.TestRule" />
+ <containsstring contains="at com.carrotsearch.randomizedtesting.rules.StatementAdapter" />
+ <containsstring contains="at com.carrotsearch.randomizedtesting.ThreadLeakControl" />
+
+ <!-- Add custom filters if you like. Lines that match these will be removed. -->
+ <!--
+ <containsstring contains=".." />
+ <containsregex pattern="^(\s+at )(org\.junit\.)" />
+ -->
+ </junit4:filtertrace>
+ </junit4:report-text>
<!-- Emits full status for all tests, their relative order on slaves. -->
<junit4:report-text
@@ -1184,6 +1211,9 @@ ant -Dtests.file.encoding=XXX ...
# the test passes.
ant -Dtests.leaveTemporary=true
+# Do *not* filter stack traces emitted to the console.
+ant -Dtests.filterstacks=false
+
# Output test files and reports.
${tests-output}/tests-report.txt - full ASCII tests report
${tests-output}/tests-failures.txt - failures only (if any)
@@ -1516,6 +1546,14 @@ ${tests-output}/junit4-*.suites - pe
<pattern substring="Permission is hereby granted, free of charge, to any person obtaining a copy"/>
</rat:substringMatcher>
+ <!-- apache -->
+ <rat:substringMatcher licenseFamilyCategory="AL "
+ licenseFamilyName="Apache">
+ <pattern substring="Licensed to the Apache Software Foundation (ASF) under"/>
+ <!-- this is the old-school one under some files -->
+ <pattern substring="Licensed under the Apache License, Version 2.0 (the "License")"/>
+ </rat:substringMatcher>
+
<rat:substringMatcher licenseFamilyCategory="GEN "
licenseFamilyName="Generated">
<!-- svg files generated by gnuplot -->
@@ -1527,7 +1565,7 @@ ${tests-output}/junit4-*.suites - pe
</rat:substringMatcher>
<!-- built in approved licenses -->
- <rat:approvedLicense familyName="Apache License Version 2.0"/>
+ <rat:approvedLicense familyName="Apache"/>
<rat:approvedLicense familyName="The MIT License"/>
<rat:approvedLicense familyName="Modified BSD License"/>
<rat:approvedLicense familyName="Generated"/>
@@ -1537,7 +1575,12 @@ ${tests-output}/junit4-*.suites - pe
<echo>${rat.output}</echo>
<delete>
<fileset file="${rat.sources.logfile}">
- <containsregexp expression="^0 Unknown Licenses"/>
+ <and>
+ <containsregexp expression="^0 Unknown Licenses"/>
+ <not>
+ <containsregexp expression="^\s+!AL"/>
+ </not>
+ </and>
</fileset>
</delete>
<!-- fail if we didn't find the pattern -->
@@ -1939,7 +1982,7 @@ ${tests-output}/junit4-*.suites - pe
<!-- Forbidden API Task -->
<target name="install-forbidden-apis" unless="forbidden-apis.loaded" depends="ivy-availability-check,ivy-configure">
- <ivy:cachepath organisation="de.thetaphi" module="forbiddenapis" revision="1.2"
+ <ivy:cachepath organisation="de.thetaphi" module="forbiddenapis" revision="1.3"
inline="true" conf="default" transitive="true" pathid="forbidden-apis.classpath"/>
<taskdef name="forbidden-apis" classname="de.thetaphi.forbiddenapis.AntTask" classpathref="forbidden-apis.classpath"/>
<property name="forbidden-apis.loaded" value="true"/>
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java Tue May 7 11:20:55 2013
@@ -17,12 +17,8 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
-import java.io.FileOutputStream;
import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
@@ -43,8 +39,16 @@ import org.apache.lucene.util.automaton.
* @lucene.experimental */
public class TokenStreamToAutomaton {
+ private boolean preservePositionIncrements;
+
/** Sole constructor. */
public TokenStreamToAutomaton() {
+ this.preservePositionIncrements = true;
+ }
+
+ /** Whether to generate holes in the automaton for missing positions, <code>true</code> by default. */
+ public void setPreservePositionIncrements(boolean enablePositionIncrements) {
+ this.preservePositionIncrements = enablePositionIncrements;
}
private static class Position implements RollingBuffer.Resettable {
@@ -108,6 +112,9 @@ public class TokenStreamToAutomaton {
int maxOffset = 0;
while (in.incrementToken()) {
int posInc = posIncAtt.getPositionIncrement();
+ if (!preservePositionIncrements && posInc > 1) {
+ posInc = 1;
+ }
assert pos > -1 || posInc > 0;
if (posInc > 0) {
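A usage sketch for the new setter; toAutomaton(TokenStream) is the class's existing entry point, and tokenStream here is an assumed variable:

    TokenStreamToAutomaton ts2a = new TokenStreamToAutomaton();
    ts2a.setPreservePositionIncrements(false); // collapse stopword holes into single positions
    Automaton automaton = ts2a.toAutomaton(tokenStream);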
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/analysis/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/analysis/package.html?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/analysis/package.html (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/analysis/package.html Tue May 7 11:20:55 2013
@@ -282,18 +282,18 @@ and proximity searches (though sentence
<p>
If the selected analyzer filters the stop words "is" and "the", then for a document
containing the string "blue is the sky", only the tokens "blue", "sky" are indexed,
- with position("sky") = 1 + position("blue"). Now, a phrase query "blue is the sky"
+ with position("sky") = 3 + position("blue"). Now, a phrase query "blue is the sky"
would find that document, because the same analyzer filters the same stop words from
- that query. But also the phrase query "blue sky" would find that document.
+ that query. But the phrase query "blue sky" would not find that document because the
+ position increment between "blue" and "sky" is only 1.
</p>
<p>
- If this behavior does not fit the application needs, a modified analyzer can
- be used, that would increment further the positions of tokens following a
- removed stop word, using
- {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}.
- This can be done with something like the following (note, however, that
- StopFilter natively includes this capability by subclassing
- FilteringTokenFilter}:
+ If this behavior does not fit the application needs, the query parser needs to be
+ configured to not take position increments into account when generating phrase queries.
+</p>
+<p>
+ Note that a StopFilter MUST increment the position increment of the token following a removed token in order
+ not to generate corrupt token stream graphs. Here is the logic used by StopFilter to increment positions when filtering out tokens:
</p>
<PRE class="prettyprint">
public TokenStream tokenStream(final String fieldName, Reader reader) {
@@ -308,7 +308,7 @@ and proximity searches (though sentence
boolean hasNext = ts.incrementToken();
if (hasNext) {
if (stopWords.contains(termAtt.toString())) {
- extraIncrement++; // filter this word
+ extraIncrement += posIncrAtt.getPositionIncrement(); // filter this word
continue;
}
if (extraIncrement>0) {
@@ -323,11 +323,6 @@ and proximity searches (though sentence
}
</PRE>
<p>
- Now, with this modified analyzer, the phrase query "blue sky" would find that document.
- But note that this is yet not a perfect solution, because any phrase query "blue w1 w2 sky"
- where both w1 and w2 are stop words would match that document.
-</p>
-<p>
A few more use cases for modifying position increments are:
</p>
<ol>
@@ -338,6 +333,72 @@ and proximity searches (though sentence
As a result, all synonyms of a token would be considered to appear in exactly the
same position as that token, and so would they be seen by phrase and proximity searches.</li>
</ol>
+
+<h3>Token Position Length</h3>
+<p>
+ By default, all tokens created by Analyzers and Tokenizers have a
+ {@link org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute#getPositionLength() position length} of one.
+ This means that the token occupies a single position. This attribute is not indexed
+ and thus not taken into account for positional queries, but is used by, e.g., suggesters.
+</p>
+<p>
+ The main use case for position lengths is multi-word synonyms. With single-word
+ synonyms, setting the position increment to 0 is enough to denote the fact that two
+ words are synonyms, for example:
+</p>
+<table>
+<tr><td>Term</td><td>red</td><td>magenta</td></tr>
+<tr><td>Position increment</td><td>1</td><td>0</td></tr>
+</table>
+<p>
+ Given that position(magenta) = 0 + position(red), they are at the same position, so anything
+ working with analyzers will return the exact same result if you replace "magenta" with "red"
+ in the input. However, multi-word synonyms are more tricky. Let's say that you want to build
+ a TokenStream where "IBM" is a synonym of "International Business Machines". Position increments
+ are not enough anymore:
+</p>
+<table>
+<tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr>
+<tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr>
+</table>
+<p>
+ The problem with this token stream is that "IBM" is at the same position as "International"
+ although it is a synonym with "International Business Machines" as a whole. Setting
+ the position increment of "Business" and "Machines" to 0 wouldn't help as it would mean
+ than "International" is a synonym of "Business". The only way to solve this issue is to
+ make "IBM" span across 3 positions, this is where position lengths come to rescue.
+</p>
+<table>
+<tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr>
+<tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr>
+<tr><td>Position length</td><td>3</td><td>1</td><td>1</td><td>1</td></tr>
+</table>
+<p>
+ This new attribute makes clear that "IBM" and "International Business Machines" start and end
+ at the same positions.
+</p>
+<a name="corrupt" />
+<h3>How to not write corrupt token streams</h3>
+<p>
+ There are a few rules to observe when writing custom Tokenizers and TokenFilters:
+</p>
+<ul>
+ <li>The first position increment must be > 0.</li>
+ <li>Positions must not go backward.</li>
+ <li>Tokens that have the same start position must have the same start offset.</li>
+ <li>Tokens that have the same end position (taking into account the position length) must have the same end offset.</li>
+</ul>
+<p>
+ Although these rules might seem easy to follow, problems can quickly happen when chaining
+ badly implemented filters that play with positions and offsets, such as synonym or n-grams
+ filters. Here are good practices for writing correct filters:
+</p>
+<ul>
+ <li>Token filters should not modify offsets. If you feel that your filter would need to modify offsets, then it should probably be implemented as a tokenizer.</li>
+ <li>Token filters should not insert positions. If a filter needs to add tokens, then they should all have a position increment of 0.</li>
+ <li>When they remove tokens, token filters should increment the position increment of the following token.</li>
+ <li>Token filters should preserve position lengths.</li>
+</ul>
<h2>TokenStream API</h2>
<p>
"Flexible Indexing" summarizes the effort of making the Lucene indexer
@@ -383,6 +444,10 @@ and proximity searches (though sentence
<td>See above for detailed information about position increment.</td>
</tr>
<tr>
+ <td>{@link org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute}</td>
+ <td>The number of positions occupied by a token.</td>
+ </tr>
+ <tr>
<td>{@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}</td>
<td>The payload that a Token can optionally have.</td>
</tr>
@@ -532,20 +597,26 @@ public final class LengthFilter extends
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
- * Build a filter that removes words that are too long or too
- * short from the text.
+ * Create a new LengthFilter. This will filter out tokens whose
+ * CharTermAttribute is either too short
+ * (< min) or too long (> max).
+ * @param version the Lucene match version
+ * @param in the TokenStream to consume
+ * @param min the minimum length
+ * @param max the maximum length
*/
- public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
- super(enablePositionIncrements, in);
+ public LengthFilter(Version version, TokenStream in, int min, int max) {
+ super(version, in);
this.min = min;
this.max = max;
}
-
+
{@literal @Override}
- public boolean accept() throws IOException {
+ public boolean accept() {
final int len = termAtt.length();
- return (len >= min && len <= max);
+ return (len >= min && len <= max);
}
+
}
</pre>
<p>
@@ -573,66 +644,39 @@ public final class LengthFilter extends
public abstract class FilteringTokenFilter extends TokenFilter {
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
- public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
- super(input);
- this.enablePositionIncrements = enablePositionIncrements;
+ /**
+ * Create a new FilteringTokenFilter.
+ * @param in the TokenStream to consume
+ */
+ public FilteringTokenFilter(Version version, TokenStream in) {
+ super(in);
}
- /** Override this method and return if the current input token should be returned by {@literal {@link #incrementToken}}. */
+ /** Override this method and return if the current input token should be returned by incrementToken. */
protected abstract boolean accept() throws IOException;
{@literal @Override}
public final boolean incrementToken() throws IOException {
- if (enablePositionIncrements) {
- int skippedPositions = 0;
- while (input.incrementToken()) {
- if (accept()) {
- if (skippedPositions != 0) {
- posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
- }
- return true;
- }
- skippedPositions += posIncrAtt.getPositionIncrement();
- }
- } else {
- while (input.incrementToken()) {
- if (accept()) {
- return true;
+ int skippedPositions = 0;
+ while (input.incrementToken()) {
+ if (accept()) {
+ if (skippedPositions != 0) {
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}
+ return true;
}
+ skippedPositions += posIncrAtt.getPositionIncrement();
}
// reached EOS -- return false
return false;
}
- /**
- * {@literal @see #setEnablePositionIncrements(boolean)}
- */
- public boolean getEnablePositionIncrements() {
- return enablePositionIncrements;
+ {@literal @Override}
+ public void reset() throws IOException {
+ super.reset();
}
- /**
- * If <code>true</code>, this TokenFilter will preserve
- * positions of the incoming tokens (ie, accumulate and
- * set position increments of the removed tokens).
- * Generally, <code>true</code> is best as it does not
- * lose information (positions of the original tokens)
- * during indexing.
- *
- * <p> When set, when a token is stopped
- * (omitted), the position increment of the following
- * token is incremented.
- *
- * <p> <b>NOTE</b>: be sure to also
- * set org.apache.lucene.queryparser.classic.QueryParser#setEnablePositionIncrements if
- * you use QueryParser to create queries.
- */
- public void setEnablePositionIncrements(boolean enable) {
- this.enablePositionIncrements = enable;
- }
}
</pre>
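To connect the position-length tables above to code, here is a hypothetical fragment of a synonym filter emitting the "IBM" token; the three attribute classes are real Lucene APIs, while the surrounding filter logic is illustrative only:

    // Fields of the (hypothetical) filter:
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);

    // Inside incrementToken(), when emitting "IBM" before "International":
    termAtt.setEmpty().append("IBM");
    posIncAtt.setPositionIncrement(1); // starts a new position ("International" follows with increment 0)
    posLenAtt.setPositionLength(3);    // spans "International Business Machines"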
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java Tue May 7 11:20:55 2013
@@ -82,7 +82,7 @@ public final class CompressingStoredFiel
avgChunkDocs[blockCount] = fieldsIndexIn.readVInt();
final int bitsPerDocBase = fieldsIndexIn.readVInt();
if (bitsPerDocBase > 32) {
- throw new CorruptIndexException("Corrupted");
+ throw new CorruptIndexException("Corrupted bitsPerDocBase (resource=" + fieldsIndexIn + ")");
}
docBasesDeltas[blockCount] = PackedInts.getReaderNoHeader(fieldsIndexIn, PackedInts.Format.PACKED, packedIntsVersion, numChunks, bitsPerDocBase);
@@ -91,7 +91,7 @@ public final class CompressingStoredFiel
avgChunkSizes[blockCount] = fieldsIndexIn.readVLong();
final int bitsPerStartPointer = fieldsIndexIn.readVInt();
if (bitsPerStartPointer > 64) {
- throw new CorruptIndexException("Corrupted");
+ throw new CorruptIndexException("Corrupted bitsPerStartPointer (resource=" + fieldsIndexIn + ")");
}
startPointersDeltas[blockCount] = PackedInts.getReaderNoHeader(fieldsIndexIn, PackedInts.Format.PACKED, packedIntsVersion, numChunks, bitsPerStartPointer);
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java Tue May 7 11:20:55 2013
@@ -203,7 +203,7 @@ public final class CompressingStoredFiel
|| docBase + chunkDocs > numDocs) {
throw new CorruptIndexException("Corrupted: docID=" + docID
+ ", docBase=" + docBase + ", chunkDocs=" + chunkDocs
- + ", numDocs=" + numDocs);
+ + ", numDocs=" + numDocs + " (resource=" + fieldsStream + ")");
}
final int numStoredFields, offset, length, totalLength;
@@ -217,7 +217,7 @@ public final class CompressingStoredFiel
if (bitsPerStoredFields == 0) {
numStoredFields = fieldsStream.readVInt();
} else if (bitsPerStoredFields > 31) {
- throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields);
+ throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields + " (resource=" + fieldsStream + ")");
} else {
final long filePointer = fieldsStream.getFilePointer();
final PackedInts.Reader reader = PackedInts.getDirectReaderNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields);
@@ -231,7 +231,7 @@ public final class CompressingStoredFiel
offset = (docID - docBase) * length;
totalLength = chunkDocs * length;
} else if (bitsPerStoredFields > 31) {
- throw new CorruptIndexException("bitsPerLength=" + bitsPerLength);
+ throw new CorruptIndexException("bitsPerLength=" + bitsPerLength + " (resource=" + fieldsStream + ")");
} else {
final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);
int off = 0;
@@ -249,7 +249,7 @@ public final class CompressingStoredFiel
}
if ((length == 0) != (numStoredFields == 0)) {
- throw new CorruptIndexException("length=" + length + ", numStoredFields=" + numStoredFields);
+ throw new CorruptIndexException("length=" + length + ", numStoredFields=" + numStoredFields + " (resource=" + fieldsStream + ")");
}
if (numStoredFields == 0) {
// nothing to do
@@ -344,7 +344,7 @@ public final class CompressingStoredFiel
|| docBase + chunkDocs > numDocs) {
throw new CorruptIndexException("Corrupted: current docBase=" + this.docBase
+ ", current numDocs=" + this.chunkDocs + ", new docBase=" + docBase
- + ", new numDocs=" + chunkDocs);
+ + ", new numDocs=" + chunkDocs + " (resource=" + fieldsStream + ")");
}
this.docBase = docBase;
this.chunkDocs = chunkDocs;
@@ -363,7 +363,7 @@ public final class CompressingStoredFiel
if (bitsPerStoredFields == 0) {
Arrays.fill(numStoredFields, 0, chunkDocs, fieldsStream.readVInt());
} else if (bitsPerStoredFields > 31) {
- throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields);
+ throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields + " (resource=" + fieldsStream + ")");
} else {
final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields, 1);
for (int i = 0; i < chunkDocs; ++i) {
@@ -393,7 +393,7 @@ public final class CompressingStoredFiel
final int chunkSize = chunkSize();
decompressor.decompress(fieldsStream, chunkSize, 0, chunkSize, bytes);
if (bytes.length != chunkSize) {
- throw new CorruptIndexException("Corrupted: expected chunk size = " + chunkSize() + ", got " + bytes.length);
+ throw new CorruptIndexException("Corrupted: expected chunk size = " + chunkSize() + ", got " + bytes.length + " (resource=" + fieldsStream + ")");
}
}
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java Tue May 7 11:20:55 2013
@@ -53,6 +53,9 @@ import org.apache.lucene.util.packed.Pac
*/
public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
+ // hard limit on the maximum number of documents per chunk
+ static final int MAX_DOCUMENTS_PER_CHUNK = 128;
+
static final int STRING = 0x00;
static final int BYTE_ARR = 0x01;
static final int NUMERIC_INT = 0x02;
@@ -200,7 +203,7 @@ public final class CompressingStoredFiel
private boolean triggerFlush() {
return bufferedDocs.length >= chunkSize || // chunks of at least chunkSize bytes
- numBufferedDocs >= chunkSize; // can be necessary if most docs are empty
+ numBufferedDocs >= MAX_DOCUMENTS_PER_CHUNK;
}
private void flush() throws IOException {
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java Tue May 7 11:20:55 2013
@@ -187,7 +187,7 @@ public final class CompressingTermVector
final int docBase = vectorsStream.readVInt();
final int chunkDocs = vectorsStream.readVInt();
if (doc < docBase || doc >= docBase + chunkDocs || docBase + chunkDocs > numDocs) {
- throw new CorruptIndexException("docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc);
+ throw new CorruptIndexException("docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc + " (resource=" + vectorsStream + ")");
}
final int skip; // number of fields to skip
@@ -1030,11 +1030,7 @@ public final class CompressingTermVector
@Override
public int advance(int target) throws IOException {
- if (doc == -1 && target == 0 && (liveDocs == null || liveDocs.get(0))) {
- return (doc = 0);
- } else {
- return (doc = NO_MORE_DOCS);
- }
+ return slowAdvance(target);
}
@Override
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java Tue May 7 11:20:55 2013
@@ -56,6 +56,9 @@ import org.apache.lucene.util.packed.Pac
*/
public final class CompressingTermVectorsWriter extends TermVectorsWriter {
+ // hard limit on the maximum number of documents per chunk
+ static final int MAX_DOCUMENTS_PER_CHUNK = 128;
+
static final String VECTORS_EXTENSION = "tvd";
static final String VECTORS_INDEX_EXTENSION = "tvx";
@@ -322,7 +325,8 @@ public final class CompressingTermVector
}
private boolean triggerFlush() {
- return termSuffixes.length >= chunkSize || pendingDocs.size() >= chunkSize;
+ return termSuffixes.length >= chunkSize
+ || pendingDocs.size() >= MAX_DOCUMENTS_PER_CHUNK;
}
private void flush() throws IOException {
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java Tue May 7 11:20:55 2013
@@ -134,7 +134,7 @@ public abstract class CompressionMode {
}
final int decompressedLength = LZ4.decompress(in, offset + length, bytes.bytes, 0);
if (decompressedLength > originalLength) {
- throw new CorruptIndexException("Corrupted: lengths mismatch: " + decompressedLength + " > " + originalLength);
+ throw new CorruptIndexException("Corrupted: lengths mismatch: " + decompressedLength + " > " + originalLength + " (resource=" + in + ")");
}
bytes.offset = offset;
bytes.length = length;
@@ -222,7 +222,7 @@ public abstract class CompressionMode {
}
}
if (bytes.length != originalLength) {
- throw new CorruptIndexException("Lengths mismatch: " + bytes.length + " != " + originalLength);
+ throw new CorruptIndexException("Lengths mismatch: " + bytes.length + " != " + originalLength + " (resource=" + in + ")");
}
bytes.offset = offset;
bytes.length = length;
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java Tue May 7 11:20:55 2013
@@ -99,7 +99,7 @@ class Lucene40FieldInfosReader extends F
}
if (oldNormsType.mapping != null) {
if (oldNormsType.mapping != DocValuesType.NUMERIC) {
- throw new CorruptIndexException("invalid norm type: " + oldNormsType);
+ throw new CorruptIndexException("invalid norm type: " + oldNormsType + " (resource=" + input + ")");
}
attributes.put(LEGACY_NORM_TYPE_KEY, oldNormsType.name());
}
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java Tue May 7 11:20:55 2013
@@ -605,12 +605,8 @@ public class Lucene40TermVectorsReader e
}
@Override
- public int advance(int target) {
- if (!didNext && target == 0) {
- return nextDoc();
- } else {
- return (doc = NO_MORE_DOCS);
- }
+ public int advance(int target) throws IOException {
+ return slowAdvance(target);
}
public void reset(Bits liveDocs, int freq) {
@@ -664,12 +660,8 @@ public class Lucene40TermVectorsReader e
}
@Override
- public int advance(int target) {
- if (!didNext && target == 0) {
- return nextDoc();
- } else {
- return (doc = NO_MORE_DOCS);
- }
+ public int advance(int target) throws IOException {
+ return slowAdvance(target);
}
public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets, int[] payloadLengths, byte[] payloadBytes) {
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java Tue May 7 11:20:55 2013
@@ -34,6 +34,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.INPUT_TYPE;
@@ -49,7 +50,8 @@ import org.apache.lucene.util.packed.Pac
*/
class Lucene42DocValuesConsumer extends DocValuesConsumer {
static final int VERSION_START = 0;
- static final int VERSION_CURRENT = VERSION_START;
+ static final int VERSION_GCD_COMPRESSION = 1;
+ static final int VERSION_CURRENT = VERSION_GCD_COMPRESSION;
static final byte NUMBER = 0;
static final byte BYTES = 1;
@@ -60,6 +62,7 @@ class Lucene42DocValuesConsumer extends
static final byte DELTA_COMPRESSED = 0;
static final byte TABLE_COMPRESSED = 1;
static final byte UNCOMPRESSED = 2;
+ static final byte GCD_COMPRESSED = 3;
final IndexOutput data, meta;
final int maxDoc;
@@ -83,27 +86,53 @@ class Lucene42DocValuesConsumer extends
}
}
}
-
+
@Override
public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
+ addNumericField(field, values, true);
+ }
+
+ void addNumericField(FieldInfo field, Iterable<Number> values, boolean optimizeStorage) throws IOException {
meta.writeVInt(field.number);
meta.writeByte(NUMBER);
meta.writeLong(data.getFilePointer());
long minValue = Long.MAX_VALUE;
long maxValue = Long.MIN_VALUE;
+ long gcd = 0;
// TODO: more efficient?
- HashSet<Long> uniqueValues = new HashSet<Long>();
- for(Number nv : values) {
- long v = nv.longValue();
- minValue = Math.min(minValue, v);
- maxValue = Math.max(maxValue, v);
- if (uniqueValues != null) {
- if (uniqueValues.add(v)) {
- if (uniqueValues.size() > 256) {
- uniqueValues = null;
+ HashSet<Long> uniqueValues = null;
+ if (optimizeStorage) {
+ uniqueValues = new HashSet<>();
+
+ long count = 0;
+ for (Number nv : values) {
+ final long v = nv.longValue();
+
+ if (gcd != 1) {
+ if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) {
+ // in that case v - minValue might overflow and make the GCD computation return
+ // wrong results. Since these extreme values are unlikely, we just discard
+ // GCD computation for them
+ gcd = 1;
+ } else if (count != 0) { // minValue needs to be set first
+ gcd = MathUtil.gcd(gcd, v - minValue);
}
}
+
+ minValue = Math.min(minValue, v);
+ maxValue = Math.max(maxValue, v);
+
+ if (uniqueValues != null) {
+ if (uniqueValues.add(v)) {
+ if (uniqueValues.size() > 256) {
+ uniqueValues = null;
+ }
+ }
+ }
+
+ ++count;
}
+ assert count == maxDoc;
}
if (uniqueValues != null) {
@@ -135,6 +164,18 @@ class Lucene42DocValuesConsumer extends
}
writer.finish();
}
+ } else if (gcd != 0 && gcd != 1) {
+ meta.writeByte(GCD_COMPRESSED);
+ meta.writeVInt(PackedInts.VERSION_CURRENT);
+ data.writeLong(minValue);
+ data.writeLong(gcd);
+ data.writeVInt(BLOCK_SIZE);
+
+ final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
+ for (Number nv : values) {
+ writer.add((nv.longValue() - minValue) / gcd);
+ }
+ writer.finish();
} else {
meta.writeByte(DELTA_COMPRESSED); // delta-compressed
@@ -222,7 +263,7 @@ class Lucene42DocValuesConsumer extends
@Override
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
// write the ordinals as numerics
- addNumericField(field, docToOrd);
+ addNumericField(field, docToOrd, false);
// write the values as FST
writeFST(field, values);
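To illustrate the GCD path above: values that share a common divisor relative to the minimum (dates are the canonical case) need only a few bits per stored quotient. A self-contained sketch of the arithmetic, using BigInteger.gcd in place of the MathUtil.gcd the commit relies on:

    import java.math.BigInteger;

    public class GcdCompressionDemo {
      public static void main(String[] args) {
        long[] values = {86400000L, 259200000L, 432000000L}; // days 1, 3, 5 in millis
        long min = Long.MAX_VALUE;
        for (long v : values) min = Math.min(min, v);
        long gcd = 0;
        for (long v : values) {
          gcd = BigInteger.valueOf(gcd).gcd(BigInteger.valueOf(v - min)).longValue();
        }
        // The writer stores min, gcd and per-document quotients; the reader
        // reconstructs each value as min + gcd * quotient.
        for (long v : values) {
          long quotient = (v - min) / gcd; // 0, 1, 2 -- fits in 2 bits
          assert v == min + gcd * quotient;
        }
        System.out.println("min=" + min + ", gcd=" + gcd); // min=86400000, gcd=172800000
      }
    }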
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java Tue May 7 11:20:55 2013
@@ -44,6 +44,8 @@ import org.apache.lucene.util.packed.Blo
* <li>Uncompressed Numerics: when all values would fit into a single byte, and the
* <code>acceptableOverheadRatio</code> would pack values into 8 bits per value anyway, they
* are written as absolute values (with no indirection or packing) for performance.
+ * <li>GCD-compressed Numerics: when all numbers share a common divisor, such as dates, the greatest
+ * common divisor (GCD) is computed, and quotients are stored using Delta-compressed Numerics.
* <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
* Each document's value can be addressed by maxDoc*length.
* <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
@@ -93,6 +95,8 @@ import org.apache.lucene.util.packed.Blo
* <li>2 --> uncompressed. When the <code>acceptableOverheadRatio</code> parameter would upgrade the number
* of bits required to 8, and all values fit in a byte, these are written as absolute binary values
* for performance.
+ * <li>3 --> gcd-compressed. When all integers share a common divisor, only quotients are stored
+ * using blocks of delta-encoded ints.
* </ul>
* <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
* If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
@@ -103,7 +107,7 @@ import org.apache.lucene.util.packed.Blo
* <p>For DocValues field, this stores the actual per-document data (the heavy-lifting)</p>
* <p>DocValues data (.dvd) --> Header,<NumericData | BinaryData | SortedData><sup>NumFields</sup></p>
* <ul>
- * <li>NumericData --> DeltaCompressedNumerics | TableCompressedNumerics | UncompressedNumerics</li>
+ * <li>NumericData --> DeltaCompressedNumerics | TableCompressedNumerics | UncompressedNumerics | GCDCompressedNumerics</li>
* <li>BinaryData --> {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses</li>
* <li>SortedData --> {@link FST FST<Int64>}</li>
* <li>DeltaCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=4096)}</li>
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java Tue May 7 11:20:55 2013
@@ -17,6 +17,11 @@ package org.apache.lucene.codecs.lucene4
* limitations under the License.
*/
+import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesConsumer.DELTA_COMPRESSED;
+import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesConsumer.GCD_COMPRESSED;
+import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesConsumer.TABLE_COMPRESSED;
+import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesConsumer.UNCOMPRESSED;
+
import java.io.IOException;
import java.util.Comparator;
import java.util.HashMap;
@@ -80,14 +85,16 @@ class Lucene42DocValuesProducer extends
// read in the entries from the metadata file.
IndexInput in = state.directory.openInput(metaName, state.context);
boolean success = false;
+ final int version;
try {
- CodecUtil.checkHeader(in, metaCodec,
- Lucene42DocValuesConsumer.VERSION_START,
- Lucene42DocValuesConsumer.VERSION_START);
+ version = CodecUtil.checkHeader(in, metaCodec,
+ Lucene42DocValuesConsumer.VERSION_START,
+ Lucene42DocValuesConsumer.VERSION_CURRENT);
numerics = new HashMap<Integer,NumericEntry>();
binaries = new HashMap<Integer,BinaryEntry>();
fsts = new HashMap<Integer,FSTEntry>();
readFields(in, state.fieldInfos);
+
success = true;
} finally {
if (success) {
@@ -96,12 +103,24 @@ class Lucene42DocValuesProducer extends
IOUtils.closeWhileHandlingException(in);
}
}
-
- String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
- data = state.directory.openInput(dataName, state.context);
- CodecUtil.checkHeader(data, dataCodec,
- Lucene42DocValuesConsumer.VERSION_START,
- Lucene42DocValuesConsumer.VERSION_START);
+
+ success = false;
+ try {
+ String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
+ data = state.directory.openInput(dataName, state.context);
+ final int version2 = CodecUtil.checkHeader(data, dataCodec,
+ Lucene42DocValuesConsumer.VERSION_START,
+ Lucene42DocValuesConsumer.VERSION_CURRENT);
+ if (version != version2) {
+ throw new CorruptIndexException("Format versions mismatch");
+ }
+
+ success = true;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(this.data);
+ }
+ }
}
private void readFields(IndexInput meta, FieldInfos infos) throws IOException {
@@ -112,6 +131,15 @@ class Lucene42DocValuesProducer extends
NumericEntry entry = new NumericEntry();
entry.offset = meta.readLong();
entry.format = meta.readByte();
+ switch(entry.format) {
+ case DELTA_COMPRESSED:
+ case TABLE_COMPRESSED:
+ case GCD_COMPRESSED:
+ case UNCOMPRESSED:
+ break;
+ default:
+ throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta);
+ }
if (entry.format != Lucene42DocValuesConsumer.UNCOMPRESSED) {
entry.packedIntsVersion = meta.readVInt();
}
@@ -152,41 +180,56 @@ class Lucene42DocValuesProducer extends
private NumericDocValues loadNumeric(FieldInfo field) throws IOException {
NumericEntry entry = numerics.get(field.number);
data.seek(entry.offset);
- if (entry.format == Lucene42DocValuesConsumer.TABLE_COMPRESSED) {
- int size = data.readVInt();
- final long decode[] = new long[size];
- for (int i = 0; i < decode.length; i++) {
- decode[i] = data.readLong();
- }
- final int formatID = data.readVInt();
- final int bitsPerValue = data.readVInt();
- final PackedInts.Reader reader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatID), entry.packedIntsVersion, maxDoc, bitsPerValue);
- return new NumericDocValues() {
- @Override
- public long get(int docID) {
- return decode[(int)reader.get(docID)];
- }
- };
- } else if (entry.format == Lucene42DocValuesConsumer.DELTA_COMPRESSED) {
- final int blockSize = data.readVInt();
- final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, blockSize, maxDoc, false);
- return new NumericDocValues() {
- @Override
- public long get(int docID) {
- return reader.get(docID);
- }
- };
- } else if (entry.format == Lucene42DocValuesConsumer.UNCOMPRESSED) {
- final byte bytes[] = new byte[maxDoc];
- data.readBytes(bytes, 0, bytes.length);
- return new NumericDocValues() {
- @Override
- public long get(int docID) {
- return bytes[docID];
- }
- };
- } else {
- throw new IllegalStateException();
+ switch (entry.format) {
+ case TABLE_COMPRESSED:
+ int size = data.readVInt();
+ if (size > 256) {
+ throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, input=" + data);
+ }
+ final long decode[] = new long[size];
+ for (int i = 0; i < decode.length; i++) {
+ decode[i] = data.readLong();
+ }
+ final int formatID = data.readVInt();
+ final int bitsPerValue = data.readVInt();
+ final PackedInts.Reader ordsReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatID), entry.packedIntsVersion, maxDoc, bitsPerValue);
+ return new NumericDocValues() {
+ @Override
+ public long get(int docID) {
+ return decode[(int)ordsReader.get(docID)];
+ }
+ };
+ case DELTA_COMPRESSED:
+ final int blockSize = data.readVInt();
+ final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, blockSize, maxDoc, false);
+ return new NumericDocValues() {
+ @Override
+ public long get(int docID) {
+ return reader.get(docID);
+ }
+ };
+ case UNCOMPRESSED:
+ final byte bytes[] = new byte[maxDoc];
+ data.readBytes(bytes, 0, bytes.length);
+ return new NumericDocValues() {
+ @Override
+ public long get(int docID) {
+ return bytes[docID];
+ }
+ };
+ case GCD_COMPRESSED:
+ final long min = data.readLong();
+ final long mult = data.readLong();
+ final int quotientBlockSize = data.readVInt();
+ final BlockPackedReader quotientReader = new BlockPackedReader(data, entry.packedIntsVersion, quotientBlockSize, maxDoc, false);
+ return new NumericDocValues() {
+ @Override
+ public long get(int docID) {
+ return min + mult * quotientReader.get(docID);
+ }
+ };
+ default:
+ throw new AssertionError();
}
}
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/BaseCompositeReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/BaseCompositeReader.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/BaseCompositeReader.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/BaseCompositeReader.java Tue May 7 11:20:55 2013
@@ -51,7 +51,6 @@ public abstract class BaseCompositeReade
private final int[] starts; // 1st docno for each reader
private final int maxDoc;
private final int numDocs;
- private final boolean hasDeletions;
/** List view solely for {@link #getSequentialSubReaders()},
* for effectiveness the array is used internally. */
@@ -70,7 +69,6 @@ public abstract class BaseCompositeReade
this.subReadersList = Collections.unmodifiableList(Arrays.asList(subReaders));
starts = new int[subReaders.length + 1]; // build starts array
int maxDoc = 0, numDocs = 0;
- boolean hasDeletions = false;
for (int i = 0; i < subReaders.length; i++) {
starts[i] = maxDoc;
final IndexReader r = subReaders[i];
@@ -79,15 +77,11 @@ public abstract class BaseCompositeReade
throw new IllegalArgumentException("Too many documents, composite IndexReaders cannot exceed " + Integer.MAX_VALUE);
}
numDocs += r.numDocs(); // compute numDocs
- if (r.hasDeletions()) {
- hasDeletions = true;
- }
r.registerParentReader(this);
}
starts[subReaders.length] = maxDoc;
this.maxDoc = maxDoc;
this.numDocs = numDocs;
- this.hasDeletions = hasDeletions;
}
@Override
@@ -117,12 +111,6 @@ public abstract class BaseCompositeReade
}
@Override
- public final boolean hasDeletions() {
- // Don't call ensureOpen() here (it could affect performance)
- return hasDeletions;
- }
-
- @Override
public final int docFreq(Term term) throws IOException {
ensureOpen();
int total = 0; // sum freqs in subreaders
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/CompositeReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/CompositeReader.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/CompositeReader.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/CompositeReader.java Tue May 7 11:20:55 2013
@@ -66,7 +66,13 @@ public abstract class CompositeReader ex
@Override
public String toString() {
final StringBuilder buffer = new StringBuilder();
- buffer.append(getClass().getSimpleName());
+ // walk up through class hierarchy to get a non-empty simple name (anonymous classes have no name):
+ for (Class<?> clazz = getClass(); clazz != null; clazz = clazz.getSuperclass()) {
+ if (!clazz.isAnonymousClass()) {
+ buffer.append(clazz.getSimpleName());
+ break;
+ }
+ }
buffer.append('(');
final List<? extends IndexReader> subReaders = getSequentialSubReaders();
assert subReaders != null;
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java Tue May 7 11:20:55 2013
@@ -184,7 +184,7 @@ public class ConcurrentMergeScheduler ex
}
// Sort the merge threads in descending order.
- CollectionUtil.mergeSort(activeMerges, compareByMergeDocCount);
+ CollectionUtil.timSort(activeMerges, compareByMergeDocCount);
int pri = mergeThreadPriority;
final int activeMergeCount = activeMerges.size();
@@ -561,4 +561,13 @@ public class ConcurrentMergeScheduler ex
sb.append("mergeThreadPriority=").append(mergeThreadPriority);
return sb.toString();
}
+
+ @Override
+ public MergeScheduler clone() {
+ ConcurrentMergeScheduler clone = (ConcurrentMergeScheduler) super.clone();
+ clone.writer = null;
+ clone.dir = null;
+ clone.mergeThreads = new ArrayList<MergeThread>();
+ return clone;
+ }
}
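The clone() override matters because IndexWriterConfig.clone() deep-clones its components; a sketch of the intended usage (analyzer is an assumed variable):

    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_50, analyzer);
    IndexWriterConfig conf2 = conf.clone(); // the ConcurrentMergeScheduler is cloned too,
                                            // with writer/dir/mergeThreads reset to fresh state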
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/CorruptIndexException.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/CorruptIndexException.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/CorruptIndexException.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/CorruptIndexException.java Tue May 7 11:20:55 2013
@@ -1,3 +1,5 @@
+package org.apache.lucene.index;
+
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -15,8 +17,6 @@
* limitations under the License.
*/
-package org.apache.lucene.index;
-
import java.io.IOException;
/**
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java Tue May 7 11:20:55 2013
@@ -25,6 +25,7 @@ import java.util.List;
import org.apache.lucene.search.SearcherManager; // javadocs
import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.NoSuchDirectoryException;
/** DirectoryReader is an implementation of {@link CompositeReader}
that can read indexes in a {@link Directory}.
@@ -313,17 +314,45 @@ public abstract class DirectoryReader ex
}
/**
- * Returns <code>true</code> if an index exists at the specified directory.
+ * Returns <code>true</code> if an index likely exists at
+ * the specified directory. Note that a corrupt index, or an
+ * index still in the process of committing, may also be reported as existing.
* @param directory the directory to check for an index
* @return <code>true</code> if an index exists; <code>false</code> otherwise
*/
- public static boolean indexExists(Directory directory) {
+ public static boolean indexExists(Directory directory) throws IOException {
+ // LUCENE-2812, LUCENE-2727, LUCENE-4738: this logic will
+ // return true in cases that should arguably be false,
+ // such as only IW.prepareCommit has been called, or a
+ // corrupt first commit, but it's too deadly to make
+ // this logic "smarter" and risk accidentally returning
+ // false due to various cases like file descriptor
+ // exhaustion, access denied, etc., because in that
+ // case IndexWriter may delete the entire index. It's
+ // safer to err towards "index exists" than try to be
+ // smart about detecting not-yet-fully-committed or
+ // corrupt indices. This means that IndexWriter will
+ // throw an exception on such indices and the app must
+ // resolve the situation manually:
+ String[] files;
try {
- new SegmentInfos().read(directory);
- return true;
- } catch (IOException ioe) {
+ files = directory.listAll();
+ } catch (NoSuchDirectoryException nsde) {
+ // Directory does not exist --> no index exists
return false;
}
+
+ // Defensive: maybe a Directory impl returns null
+ // instead of throwing NoSuchDirectoryException:
+ if (files != null) {
+ String prefix = IndexFileNames.SEGMENTS + "_";
+ for(String file : files) {
+ if (file.startsWith(prefix) || file.equals(IndexFileNames.SEGMENTS_GEN)) {
+ return true;
+ }
+ }
+ }
+ return false;
}
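
For callers, the reworked indexExists now answers by listing the directory
for a segments_N or segments.gen file instead of fully reading SegmentInfos,
so it can return true for a corrupt or half-committed index. A hedged usage
sketch (the path is illustrative):

    import java.io.File;
    import java.io.IOException;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class IndexExistsSketch {
      public static void main(String[] args) throws IOException {
        Directory dir = FSDirectory.open(new File("/tmp/test-index")); // illustrative path
        if (DirectoryReader.indexExists(dir)) {
          // Likely safe to open; a truly corrupt index still fails here:
          DirectoryReader reader = DirectoryReader.open(dir);
          reader.close();
        }
        dir.close();
      }
    }
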
/**
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java Tue May 7 11:20:55 2013
@@ -214,7 +214,7 @@ final class DocFieldProcessor extends Do
// sort the subset of fields that have vectors
// enabled; we could save [small amount of] CPU
// here.
- ArrayUtil.quickSort(fields, 0, fieldCount, fieldsComp);
+ ArrayUtil.introSort(fields, 0, fieldCount, fieldsComp);
for(int i=0;i<fieldCount;i++) {
final DocFieldProcessorPerField perField = fields[i];
perField.consumer.processFields(perField.fields, perField.fieldCount, segmentInfo, trackingDirectory);
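
Several hunks in this commit migrate off the removed quickSort/mergeSort
entry points: introSort replaces quickSort (O(n log n) worst case, not
stable), and timSort replaces mergeSort (stable, adaptive on partially
sorted input). A hedged sketch with illustrative data, not the actual
DocFieldProcessor state:

    import java.util.ArrayList;
    import java.util.List;
    import org.apache.lucene.util.ArrayUtil;
    import org.apache.lucene.util.CollectionUtil;

    public class SortMigrationSketch {
      public static void main(String[] args) {
        String[] fields = {"title", "body", "id"};
        // Unstable but worst-case O(n log n); fine when equal keys need no fixed order:
        ArrayUtil.introSort(fields, 0, fields.length, String.CASE_INSENSITIVE_ORDER);

        List<String> commits = new ArrayList<String>();
        commits.add("segments_2");
        commits.add("segments_1");
        // Stable and adaptive; preserves the prior order of equal elements:
        CollectionUtil.timSort(commits);
      }
    }
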
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java Tue May 7 11:20:55 2013
@@ -18,6 +18,7 @@ package org.apache.lucene.index;
*/
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
@@ -477,21 +478,21 @@ final class DocumentsWriter {
// TODO: somehow we should fix this merge so it's
// abortable so that IW.close(false) is able to stop it
- TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(
- directory);
+ TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(directory);
- SegmentMerger merger = new SegmentMerger(info, infoStream, trackingDir,
+ final List<AtomicReader> mergeReaders = new ArrayList<AtomicReader>();
+ AtomicReader reader;
+ while ((reader = updates.nextReader()) != null) { // add new indexes
+ mergeReaders.add(reader);
+ }
+
+ SegmentMerger merger = new SegmentMerger(mergeReaders, info, infoStream, trackingDir,
interval, MergeState.CheckAbort.NONE, globalFieldNumberMap, context);
updates.startWriting(infoPerCommit.getNextUpdateGen(),
infoPerCommit.info.getDocCount(), indexWriter.getConfig()
.getReaderTermsIndexDivisor());
- AtomicReader reader;
- while ((reader = updates.nextReader()) != null) { // add new indexes
- merger.add(reader);
- }
-
Set<String> generationReplacementFilenames = null;
boolean success = false;
try {
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java Tue May 7 11:20:55 2013
@@ -645,7 +645,7 @@ class DocumentsWriterPerThread {
SegmentInfoPerCommit newSegment = flushedSegment.segmentInfo;
- IndexWriter.setDiagnostics(newSegment.info, "flush");
+ IndexWriter.setDiagnostics(newSegment.info, IndexWriter.SOURCE_FLUSH);
IOContext context = new IOContext(new FlushInfo(newSegment.info.getDocCount(), newSegment.sizeInBytes()));
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java Tue May 7 11:20:55 2013
@@ -17,15 +17,16 @@ package org.apache.lucene.index;
* limitations under the License.
*/
-import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.automaton.CompiledAutomaton;
-
import java.io.IOException;
import java.util.Comparator;
import java.util.Iterator;
+import org.apache.lucene.search.CachingWrapperFilter;
+import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+
/** A <code>FilterAtomicReader</code> contains another AtomicReader, which it
* uses as its basic source of data, possibly transforming the data along the
* way or providing additional functionality. The class
@@ -34,6 +35,15 @@ import java.util.Iterator;
* contained index reader. Subclasses of <code>FilterAtomicReader</code> may
* further override some of these methods and may also provide additional
* methods and fields.
+ * <p><b>NOTE</b>: If you override {@link #getLiveDocs()}, you will likely need
+ * to override {@link #numDocs()} as well and vice-versa.
+ * <p><b>NOTE</b>: If this {@link FilterAtomicReader} does not change the
+ * content of the contained reader, you could consider overriding
+ * {@link #getCoreCacheKey()} so that {@link FieldCache} and
+ * {@link CachingWrapperFilter} share the same entries for this atomic reader
+ * and the wrapped one. {@link #getCombinedCoreAndDeletesKey()} could be
+ * overridden as well if the {@link #getLiveDocs() live docs} are not changed
+ * either.
*/
public class FilterAtomicReader extends AtomicReader {
@@ -67,8 +77,11 @@ public class FilterAtomicReader extends
}
}
- /** Base class for filtering {@link Terms}
- * implementations. */
+ /** Base class for filtering {@link Terms} implementations.
+ * <p><b>NOTE</b>: If the order of terms and documents is not changed, and if
+ * these terms are going to be intersected with automata, you could consider
+ * overriding {@link #intersect} for better performance.
+ */
public static class FilterTerms extends Terms {
/** The underlying Terms instance. */
protected final Terms in;
@@ -85,7 +98,7 @@ public class FilterAtomicReader extends
public TermsEnum iterator(TermsEnum reuse) throws IOException {
return in.iterator(reuse);
}
-
+
@Override
public Comparator<BytesRef> getComparator() {
return in.getComparator();
@@ -110,11 +123,6 @@ public class FilterAtomicReader extends
public int getDocCount() throws IOException {
return in.getDocCount();
}
-
- @Override
- public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws java.io.IOException {
- return in.intersect(automaton, bytes);
- }
@Override
public boolean hasOffsets() {
@@ -144,8 +152,8 @@ public class FilterAtomicReader extends
public FilterTermsEnum(TermsEnum in) { this.in = in; }
@Override
- public boolean seekExact(BytesRef text, boolean useCache) throws IOException {
- return in.seekExact(text, useCache);
+ public AttributeSource attributes() {
+ return in.attributes();
}
@Override
@@ -197,21 +205,6 @@ public class FilterAtomicReader extends
public Comparator<BytesRef> getComparator() {
return in.getComparator();
}
-
- @Override
- public void seekExact(BytesRef term, TermState state) throws IOException {
- in.seekExact(term, state);
- }
-
- @Override
- public TermState termState() throws IOException {
- return in.termState();
- }
-
- @Override
- public AttributeSource attributes() {
- return in.attributes();
- }
}
/** Base class for filtering {@link DocsEnum} implementations. */
@@ -228,6 +221,11 @@ public class FilterAtomicReader extends
}
@Override
+ public AttributeSource attributes() {
+ return in.attributes();
+ }
+
+ @Override
public int docID() {
return in.docID();
}
@@ -246,11 +244,6 @@ public class FilterAtomicReader extends
public int advance(int target) throws IOException {
return in.advance(target);
}
-
- @Override
- public AttributeSource attributes() {
- return in.attributes();
- }
@Override
public long cost() {
@@ -272,6 +265,11 @@ public class FilterAtomicReader extends
}
@Override
+ public AttributeSource attributes() {
+ return in.attributes();
+ }
+
+ @Override
public int docID() {
return in.docID();
}
@@ -312,11 +310,6 @@ public class FilterAtomicReader extends
}
@Override
- public AttributeSource attributes() {
- return in.attributes();
- }
-
- @Override
public long cost() {
return in.cost();
}
@@ -373,12 +366,6 @@ public class FilterAtomicReader extends
}
@Override
- public boolean hasDeletions() {
- ensureOpen();
- return in.hasDeletions();
- }
-
- @Override
protected void doClose() throws IOException {
in.close();
}
@@ -389,24 +376,6 @@ public class FilterAtomicReader extends
return in.fields();
}
- /** {@inheritDoc}
- * <p>If the subclass of FilteredIndexReader modifies the
- * contents (but not liveDocs) of the index, you must override this
- * method to provide a different key. */
- @Override
- public Object getCoreCacheKey() {
- return in.getCoreCacheKey();
- }
-
- /** {@inheritDoc}
- * <p>If the subclass of FilteredIndexReader modifies the
- * liveDocs, you must override this
- * method to provide a different key. */
- @Override
- public Object getCombinedCoreAndDeletesKey() {
- return in.getCombinedCoreAndDeletesKey();
- }
-
@Override
public String toString() {
final StringBuilder buffer = new StringBuilder("FilterAtomicReader(");
@@ -444,4 +413,5 @@ public class FilterAtomicReader extends
ensureOpen();
return in.getNormValues(field);
}
+
}
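
Following the new class javadoc, a filter that changes neither content nor
live docs can forward both cache keys so FieldCache and CachingWrapperFilter
share entries with the wrapped reader. A minimal sketch under that
assumption ("in" is FilterAtomicReader's protected delegate field):

    import org.apache.lucene.index.AtomicReader;
    import org.apache.lucene.index.FilterAtomicReader;

    public class PassThroughReader extends FilterAtomicReader {
      public PassThroughReader(AtomicReader in) {
        super(in);
      }

      @Override
      public Object getCoreCacheKey() {
        return in.getCoreCacheKey(); // share FieldCache entries with the delegate
      }

      @Override
      public Object getCombinedCoreAndDeletesKey() {
        return in.getCombinedCoreAndDeletesKey(); // live docs are unchanged too
      }
    }
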
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java Tue May 7 11:20:55 2013
@@ -55,7 +55,7 @@ final class FreqProxTermsWriter extends
final int numAllFields = allFields.size();
// Sort by field name
- CollectionUtil.quickSort(allFields);
+ CollectionUtil.introSort(allFields);
final FieldsConsumer consumer = state.segmentInfo.getCodec().postingsFormat().fieldsConsumer(state);
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/FrozenBufferedDeletes.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/FrozenBufferedDeletes.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/FrozenBufferedDeletes.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/FrozenBufferedDeletes.java Tue May 7 11:20:55 2013
@@ -60,36 +60,23 @@ class FrozenBufferedDeletes {
public FrozenBufferedDeletes(BufferedDeletes deletes, BufferedUpdates updates, boolean isSegmentPrivate) {
this.isSegmentPrivate = isSegmentPrivate;
- int localBytesUsed = 0;
- if (deletes != null) {
- assert !isSegmentPrivate || deletes.terms.size() == 0 : "segment private package should only have del queries";
- Term termsArray[] = deletes.terms.keySet().toArray(
- new Term[deletes.terms.size()]);
- termCount = termsArray.length;
- ArrayUtil.mergeSort(termsArray);
- PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
- for (Term term : termsArray) {
- builder.add(term);
- }
- terms = builder.finish();
- localBytesUsed += (int) terms.getSizeInBytes();
-
- queries = new Query[deletes.queries.size()];
- queryLimits = new int[deletes.queries.size()];
- int upto = 0;
- for (Map.Entry<Query,Integer> ent : deletes.queries.entrySet()) {
- queries[upto] = ent.getKey();
- queryLimits[upto] = ent.getValue();
- upto++;
- }
-
- localBytesUsed += queries.length * BYTES_PER_DEL_QUERY;
- numTermDeletes = deletes.numTermDeletes.get();
- } else {
- terms = null;
- numTermDeletes = 0;
- queries = null;
- queryLimits = null;
+ assert !isSegmentPrivate || deletes.terms.size() == 0 : "segment private packet should only have del queries";
+ Term termsArray[] = deletes.terms.keySet().toArray(new Term[deletes.terms.size()]);
+ termCount = termsArray.length;
+ ArrayUtil.timSort(termsArray);
+ PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
+ for (Term term : termsArray) {
+ builder.add(term);
+ }
+ terms = builder.finish();
+
+ queries = new Query[deletes.queries.size()];
+ queryLimits = new int[deletes.queries.size()];
+ int upto = 0;
+ for(Map.Entry<Query,Integer> ent : deletes.queries.entrySet()) {
+ queries[upto] = ent.getKey();
+ queryLimits[upto] = ent.getValue();
+ upto++;
}
// freeze updates
@@ -100,10 +87,10 @@ class FrozenBufferedDeletes {
for (SortedSet<FieldsUpdate> list : updates.terms.values()) {
allUpdates.addAll(list);
}
- localBytesUsed += 100;
}
- bytesUsed = localBytesUsed;
+ bytesUsed = (int) terms.getSizeInBytes() + queries.length * BYTES_PER_DEL_QUERY + 100 /* updates */;
+ numTermDeletes = deletes.numTermDeletes.get();
}
public void setDelGen(long gen) {
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/IndexDeletionPolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/IndexDeletionPolicy.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/IndexDeletionPolicy.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/IndexDeletionPolicy.java Tue May 7 11:20:55 2013
@@ -48,7 +48,10 @@ import java.io.IOException;
* for details.</p>
*/
-public interface IndexDeletionPolicy {
+public abstract class IndexDeletionPolicy implements Cloneable {
+
+ /** Sole constructor, typically called by sub-classes constructors. */
+ protected IndexDeletionPolicy() {}
/**
* <p>This is called once when a writer is first
@@ -70,7 +73,7 @@ public interface IndexDeletionPolicy {
* {@link IndexCommit point-in-time commits},
* sorted by age (the 0th one is the oldest commit).
*/
- public void onInit(List<? extends IndexCommit> commits) throws IOException;
+ public abstract void onInit(List<? extends IndexCommit> commits) throws IOException;
/**
* <p>This is called each time the writer completed a commit.
@@ -94,5 +97,15 @@ public interface IndexDeletionPolicy {
* @param commits List of {@link IndexCommit},
* sorted by age (the 0th one is the oldest commit).
*/
- public void onCommit(List<? extends IndexCommit> commits) throws IOException;
+ public abstract void onCommit(List<? extends IndexCommit> commits) throws IOException;
+
+ @Override
+ public IndexDeletionPolicy clone() {
+ try {
+ return (IndexDeletionPolicy) super.clone();
+ } catch (CloneNotSupportedException e) {
+ throw new Error(e);
+ }
+ }
+
}
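
Since IndexDeletionPolicy is now an abstract class, custom policies extend
rather than implement it. A minimal sketch of a policy that keeps only the
newest commit (the commits list is sorted oldest to newest, per the javadoc
above):

    import java.io.IOException;
    import java.util.List;
    import org.apache.lucene.index.IndexCommit;
    import org.apache.lucene.index.IndexDeletionPolicy;

    public class KeepOnlyNewestPolicy extends IndexDeletionPolicy {
      @Override
      public void onInit(List<? extends IndexCommit> commits) throws IOException {
        onCommit(commits);
      }

      @Override
      public void onCommit(List<? extends IndexCommit> commits) throws IOException {
        // Delete every commit except the last (newest) one:
        for (int i = 0; i < commits.size() - 1; i++) {
          commits.get(i).delete();
        }
      }
    }
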
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java Tue May 7 11:20:55 2013
@@ -123,7 +123,7 @@ final class IndexFileDeleter implements
* @throws IOException if there is a low-level IO error
*/
public IndexFileDeleter(Directory directory, IndexDeletionPolicy policy, SegmentInfos segmentInfos,
- InfoStream infoStream, IndexWriter writer) throws IOException {
+ InfoStream infoStream, IndexWriter writer, boolean initialIndexExists) throws IOException {
this.infoStream = infoStream;
this.writer = writer;
@@ -209,7 +209,7 @@ final class IndexFileDeleter implements
}
}
- if (currentCommitPoint == null && currentSegmentsFile != null) {
+ if (currentCommitPoint == null && currentSegmentsFile != null && initialIndexExists) {
// We did not in fact see the segments_N file
// corresponding to the segmentInfos that was passed
// in. Yet, it must exist, because our caller holds
@@ -221,7 +221,7 @@ final class IndexFileDeleter implements
try {
sis.read(directory, currentSegmentsFile);
} catch (IOException e) {
- throw new CorruptIndexException("failed to locate current segments_N file");
+ throw new CorruptIndexException("failed to locate current segments_N file \"" + currentSegmentsFile + "\"");
}
if (infoStream.isEnabled("IFD")) {
infoStream.message("IFD", "forced open of current segments file " + segmentInfos.getSegmentsFileName());
@@ -232,7 +232,7 @@ final class IndexFileDeleter implements
}
// We keep commits list in sorted order (oldest to newest):
- CollectionUtil.mergeSort(commits);
+ CollectionUtil.timSort(commits);
// Now delete anything with ref count at 0. These are
// presumably abandoned files eg due to crash of
@@ -250,9 +250,7 @@ final class IndexFileDeleter implements
// Finally, give policy a chance to remove things on
// startup:
- if (currentSegmentsFile != null) {
- policy.onInit(commits);
- }
+ policy.onInit(commits);
// Always protect the incoming segmentInfos since
// sometime it may not be the most recent commit
Modified: lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/IndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/IndexReader.java?rev=1479862&r1=1479861&r2=1479862&view=diff
==============================================================================
--- lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/IndexReader.java (original)
+++ lucene/dev/branches/lucene4258/lucene/core/src/java/org/apache/lucene/index/IndexReader.java Tue May 7 11:20:55 2013
@@ -22,15 +22,14 @@ import java.io.IOException;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
-import java.util.WeakHashMap;
import java.util.Set;
+import java.util.WeakHashMap;
import java.util.concurrent.atomic.AtomicInteger;
-import org.apache.lucene.document.Document;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
-import org.apache.lucene.search.SearcherManager; // javadocs
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.util.Bits;
+// javadocs
/** IndexReader is an abstract class, providing an interface for accessing an
index. Search of an index is done entirely through this abstract interface,
@@ -179,7 +178,7 @@ public abstract class IndexReader implem
* and returns <code>true</code> iff the refCount was
* successfully incremented, otherwise <code>false</code>.
* If this method returns <code>false</code> the reader is either
- * already closed or is currently been closed. Either way this
+ * already closed or is currently being closed. Either way this
* reader instance shouldn't be used by an application unless
* <code>true</code> is returned.
* <p>
@@ -361,8 +360,12 @@ public abstract class IndexReader implem
return visitor.getDocument();
}
- /** Returns true if any documents have been deleted */
- public abstract boolean hasDeletions();
+ /** Returns true if any documents have been deleted. Implementers should
+ * consider overriding this method if {@link #maxDoc()} or {@link #numDocs()}
+ * are not constant-time operations. */
+ public boolean hasDeletions() {
+ return numDeletedDocs() > 0;
+ }
/**
* Closes files associated with this index.
@@ -415,7 +418,7 @@ public abstract class IndexReader implem
* it again.
* This key must not have equals()/hashCode() methods, so "equals" means "identical". */
public Object getCoreCacheKey() {
- // Don't can ensureOpen since FC calls this (to evict)
+ // Don't call ensureOpen since FC calls this (to evict)
// on close
return this;
}
@@ -424,7 +427,7 @@ public abstract class IndexReader implem
* so FieldCache/CachingWrapperFilter can find it again.
* This key must not have equals()/hashCode() methods, so "equals" means "identical". */
public Object getCombinedCoreAndDeletesKey() {
- // Don't can ensureOpen since FC calls this (to evict)
+ // Don't call ensureOpen since FC calls this (to evict)
// on close
return this;
}
@@ -438,12 +441,11 @@ public abstract class IndexReader implem
*/
public abstract int docFreq(Term term) throws IOException;
- /** Returns the number of documents containing the term
- * <code>term</code>. This method returns 0 if the term or
- * field does not exists, or -1 if the Codec does not support
- * the measure. This method does not take into account deleted
- * documents that have not yet been merged away.
- * @see TermsEnum#totalTermFreq()
+ /**
+ * Returns the total number of occurrences of {@code term} across all
+ * documents (the sum of the freq() for each doc that has this term). This
+ * will be -1 if the codec doesn't support this measure. Note that, like other
+ * term measures, this measure does not take deleted documents into account.
*/
public abstract long totalTermFreq(Term term) throws IOException;
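
A hedged usage sketch contrasting the two term-level statistics documented
above; the field and term values are illustrative, and neither measure
accounts for deleted documents:

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;

    public class TermStatsSketch {
      static void printStats(IndexReader reader) throws IOException {
        Term term = new Term("body", "lucene"); // illustrative field/value
        int df = reader.docFreq(term);          // number of docs containing the term
        long ttf = reader.totalTermFreq(term);  // total occurrences, or -1 if unsupported
        System.out.println("docFreq=" + df + ", totalTermFreq=" + ttf);
      }
    }
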