You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2013/05/30 09:53:46 UTC
svn commit: r1487777 [15/50] - in /lucene/dev/branches/security: ./
dev-tools/ dev-tools/eclipse/dot.settings/ dev-tools/idea/.idea/
dev-tools/idea/.idea/libraries/ dev-tools/idea/lucene/replicator/
dev-tools/maven/ dev-tools/maven/lucene/ dev-tools/ma...
Modified: lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java (original)
+++ lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java Thu May 30 07:53:18 2013
@@ -416,21 +416,23 @@ public final class MemoryPostingsFormat
}
@Override
- public int advance(int target) {
+ public int advance(int target) throws IOException {
// TODO: we could make more efficient version, but, it
// should be rare that this will matter in practice
// since usually apps will not store "big" fields in
// this codec!
- //System.out.println("advance start docID=" + docID + " target=" + target);
- while(nextDoc() < target) {
- }
- return docID;
+ return slowAdvance(target);
}
@Override
public int freq() {
return freq;
}
+
+ @Override
+ public long cost() {
+ return numDocs;
+ }
}
private final static class FSTDocsAndPositionsEnum extends DocsAndPositionsEnum {
@@ -602,22 +604,23 @@ public final class MemoryPostingsFormat
}
@Override
- public int advance(int target) {
+ public int advance(int target) throws IOException {
// TODO: we could make more efficient version, but, it
// should be rare that this will matter in practice
// since usually apps will not store "big" fields in
// this codec!
- //System.out.println("advance target=" + target);
- while(nextDoc() < target) {
- }
- //System.out.println(" return " + docID);
- return docID;
+ return slowAdvance(target);
}
@Override
public int freq() {
return freq;
}
+
+ @Override
+ public long cost() {
+ return numDocs;
+ }
}
private final static class FSTTermsEnum extends TermsEnum {
Modified: lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java (original)
+++ lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsReader.java Thu May 30 07:53:18 2013
@@ -260,6 +260,7 @@ public class PulsingPostingsReader exten
private int accum;
private int freq;
private int payloadLength;
+ private int cost;
public PulsingDocsEnum(FieldInfo fieldInfo) {
indexOptions = fieldInfo.getIndexOptions();
@@ -283,6 +284,7 @@ public class PulsingPostingsReader exten
docID = -1;
accum = 0;
freq = 1;
+ cost = termState.docFreq;
payloadLength = 0;
this.liveDocs = liveDocs;
return this;
@@ -360,12 +362,12 @@ public class PulsingPostingsReader exten
@Override
public int advance(int target) throws IOException {
- int doc;
- while((doc=nextDoc()) != NO_MORE_DOCS) {
- if (doc >= target)
- return doc;
- }
- return docID = NO_MORE_DOCS;
+ return docID = slowAdvance(target);
+ }
+
+ @Override
+ public long cost() {
+ return cost;
}
}
@@ -390,6 +392,7 @@ public class PulsingPostingsReader exten
private int offsetLength;
private boolean payloadRetrieved;
+ private int cost;
public PulsingDocsAndPositionsEnum(FieldInfo fieldInfo) {
indexOptions = fieldInfo.getIndexOptions();
@@ -415,6 +418,7 @@ public class PulsingPostingsReader exten
posPending = 0;
docID = -1;
accum = 0;
+ cost = termState.docFreq;
startOffset = storeOffsets ? 0 : -1; // always return -1 if no offsets are stored
offsetLength = 0;
//System.out.println("PR d&p reset storesPayloads=" + storePayloads + " bytes=" + bytes.length + " this=" + this);
@@ -465,13 +469,7 @@ public class PulsingPostingsReader exten
@Override
public int advance(int target) throws IOException {
- int doc;
- while((doc=nextDoc()) != NO_MORE_DOCS) {
- if (doc >= target) {
- return docID = doc;
- }
- }
- return docID = NO_MORE_DOCS;
+ return docID = slowAdvance(target);
}
@Override
@@ -551,6 +549,11 @@ public class PulsingPostingsReader exten
return null;
}
}
+
+ @Override
+ public long cost() {
+ return cost;
+ }
}
@Override
Modified: lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/sep/IntIndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/sep/IntIndexOutput.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/sep/IntIndexOutput.java (original)
+++ lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/sep/IntIndexOutput.java Thu May 30 07:53:18 2013
@@ -1,7 +1,7 @@
package org.apache.lucene.codecs.sep;
-/**
- * LICENSED to the Apache Software Foundation (ASF) under one or more
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
Modified: lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsReader.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsReader.java (original)
+++ lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsReader.java Thu May 30 07:53:18 2013
@@ -441,6 +441,11 @@ public class SepPostingsReader extends P
return doc;
}
+
+ @Override
+ public long cost() {
+ return docFreq;
+ }
}
class SepDocsAndPositionsEnum extends DocsAndPositionsEnum {
@@ -717,5 +722,10 @@ public class SepPostingsReader extends P
pendingPayloadBytes = 0;
return payload;
}
+
+ @Override
+ public long cost() {
+ return docFreq;
+ }
}
}
Modified: lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java (original)
+++ lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java Thu May 30 07:53:18 2013
@@ -165,7 +165,7 @@ class SimpleTextDocValuesReader extends
try {
bd = (BigDecimal) decoder.parse(scratch.utf8ToString());
} catch (ParseException pe) {
- CorruptIndexException e = new CorruptIndexException("failed to parse BigDecimal value");
+ CorruptIndexException e = new CorruptIndexException("failed to parse BigDecimal value (resource=" + in + ")");
e.initCause(pe);
throw e;
}
@@ -203,7 +203,7 @@ class SimpleTextDocValuesReader extends
try {
len = decoder.parse(new String(scratch.bytes, scratch.offset + LENGTH.length, scratch.length - LENGTH.length, "UTF-8")).intValue();
} catch (ParseException pe) {
- CorruptIndexException e = new CorruptIndexException("failed to parse int length");
+ CorruptIndexException e = new CorruptIndexException("failed to parse int length (resource=" + in + ")");
e.initCause(pe);
throw e;
}
@@ -243,7 +243,7 @@ class SimpleTextDocValuesReader extends
try {
return ordDecoder.parse(scratch.utf8ToString()).intValue();
} catch (ParseException pe) {
- CorruptIndexException e = new CorruptIndexException("failed to parse ord");
+ CorruptIndexException e = new CorruptIndexException("failed to parse ord (resource=" + in + ")");
e.initCause(pe);
throw e;
}
@@ -265,7 +265,7 @@ class SimpleTextDocValuesReader extends
try {
len = decoder.parse(new String(scratch.bytes, scratch.offset + LENGTH.length, scratch.length - LENGTH.length, "UTF-8")).intValue();
} catch (ParseException pe) {
- CorruptIndexException e = new CorruptIndexException("failed to parse int length");
+ CorruptIndexException e = new CorruptIndexException("failed to parse int length (resource=" + in + ")");
e.initCause(pe);
throw e;
}
@@ -343,7 +343,7 @@ class SimpleTextDocValuesReader extends
try {
len = decoder.parse(new String(scratch.bytes, scratch.offset + LENGTH.length, scratch.length - LENGTH.length, "UTF-8")).intValue();
} catch (ParseException pe) {
- CorruptIndexException e = new CorruptIndexException("failed to parse int length");
+ CorruptIndexException e = new CorruptIndexException("failed to parse int length (resource=" + in + ")");
e.initCause(pe);
throw e;
}
Modified: lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java (original)
+++ lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java Thu May 30 07:53:18 2013
@@ -199,7 +199,7 @@ class SimpleTextFieldsReader extends Fie
} else {
docsEnum = new SimpleTextDocsEnum();
}
- return docsEnum.reset(docsStart, liveDocs, indexOptions == IndexOptions.DOCS_ONLY);
+ return docsEnum.reset(docsStart, liveDocs, indexOptions == IndexOptions.DOCS_ONLY, docFreq);
}
@Override
@@ -216,7 +216,7 @@ class SimpleTextFieldsReader extends Fie
} else {
docsAndPositionsEnum = new SimpleTextDocsAndPositionsEnum();
}
- return docsAndPositionsEnum.reset(docsStart, liveDocs, indexOptions);
+ return docsAndPositionsEnum.reset(docsStart, liveDocs, indexOptions, docFreq);
}
@Override
@@ -234,6 +234,7 @@ class SimpleTextFieldsReader extends Fie
private Bits liveDocs;
private final BytesRef scratch = new BytesRef(10);
private final CharsRef scratchUTF16 = new CharsRef(10);
+ private int cost;
public SimpleTextDocsEnum() {
this.inStart = SimpleTextFieldsReader.this.in;
@@ -244,12 +245,13 @@ class SimpleTextFieldsReader extends Fie
return in == inStart;
}
- public SimpleTextDocsEnum reset(long fp, Bits liveDocs, boolean omitTF) throws IOException {
+ public SimpleTextDocsEnum reset(long fp, Bits liveDocs, boolean omitTF, int docFreq) throws IOException {
this.liveDocs = liveDocs;
in.seek(fp);
this.omitTF = omitTF;
docID = -1;
tf = 1;
+ cost = docFreq;
return this;
}
@@ -313,8 +315,12 @@ class SimpleTextFieldsReader extends Fie
@Override
public int advance(int target) throws IOException {
// Naive -- better to index skip data
- while(nextDoc() < target);
- return docID;
+ return slowAdvance(target);
+ }
+
+ @Override
+ public long cost() {
+ return cost;
}
}
@@ -334,6 +340,7 @@ class SimpleTextFieldsReader extends Fie
private boolean readPositions;
private int startOffset;
private int endOffset;
+ private int cost;
public SimpleTextDocsAndPositionsEnum() {
this.inStart = SimpleTextFieldsReader.this.in;
@@ -344,7 +351,7 @@ class SimpleTextFieldsReader extends Fie
return in == inStart;
}
- public SimpleTextDocsAndPositionsEnum reset(long fp, Bits liveDocs, IndexOptions indexOptions) {
+ public SimpleTextDocsAndPositionsEnum reset(long fp, Bits liveDocs, IndexOptions indexOptions, int docFreq) {
this.liveDocs = liveDocs;
nextDocStart = fp;
docID = -1;
@@ -354,6 +361,7 @@ class SimpleTextFieldsReader extends Fie
startOffset = -1;
endOffset = -1;
}
+ cost = docFreq;
return this;
}
@@ -413,8 +421,7 @@ class SimpleTextFieldsReader extends Fie
@Override
public int advance(int target) throws IOException {
// Naive -- better to index skip data
- while(nextDoc() < target);
- return docID;
+ return slowAdvance(target);
}
@Override
@@ -471,6 +478,11 @@ class SimpleTextFieldsReader extends Fie
public BytesRef getPayload() {
return payload;
}
+
+ @Override
+ public long cost() {
+ return cost;
+ }
}
static class TermData {
Modified: lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java (original)
+++ lucene/dev/branches/security/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java Thu May 30 07:53:18 2013
@@ -430,12 +430,8 @@ public class SimpleTextTermVectorsReader
}
@Override
- public int advance(int target) {
- if (!didNext && target == 0) {
- return nextDoc();
- } else {
- return (doc = NO_MORE_DOCS);
- }
+ public int advance(int target) throws IOException {
+ return slowAdvance(target);
}
public void reset(Bits liveDocs, int freq) {
@@ -444,6 +440,11 @@ public class SimpleTextTermVectorsReader
this.doc = -1;
didNext = false;
}
+
+ @Override
+ public long cost() {
+ return 1;
+ }
}
private static class SimpleTVDocsAndPositionsEnum extends DocsAndPositionsEnum {
@@ -482,12 +483,8 @@ public class SimpleTextTermVectorsReader
}
@Override
- public int advance(int target) {
- if (!didNext && target == 0) {
- return nextDoc();
- } else {
- return (doc = NO_MORE_DOCS);
- }
+ public int advance(int target) throws IOException {
+ return slowAdvance(target);
}
public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets, BytesRef payloads[]) {
@@ -535,5 +532,10 @@ public class SimpleTextTermVectorsReader
return endOffsets[nextPos-1];
}
}
+
+ @Override
+ public long cost() {
+ return 1;
+ }
}
}
Modified: lucene/dev/branches/security/lucene/codecs/src/test/org/apache/lucene/codecs/diskdv/TestDiskDocValuesFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/codecs/src/test/org/apache/lucene/codecs/diskdv/TestDiskDocValuesFormat.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/codecs/src/test/org/apache/lucene/codecs/diskdv/TestDiskDocValuesFormat.java (original)
+++ lucene/dev/branches/security/lucene/codecs/src/test/org/apache/lucene/codecs/diskdv/TestDiskDocValuesFormat.java Thu May 30 07:53:18 2013
@@ -18,13 +18,13 @@ package org.apache.lucene.codecs.diskdv;
*/
import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.index.BaseDocValuesFormatTestCase;
+import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase;
import org.apache.lucene.util._TestUtil;
/**
* Tests DiskDocValuesFormat
*/
-public class TestDiskDocValuesFormat extends BaseDocValuesFormatTestCase {
+public class TestDiskDocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
private final Codec codec = _TestUtil.alwaysDocValuesFormat(new DiskDocValuesFormat());
@Override
Modified: lucene/dev/branches/security/lucene/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/common-build.xml?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/common-build.xml (original)
+++ lucene/dev/branches/security/lucene/common-build.xml Thu May 30 07:53:18 2013
@@ -47,16 +47,26 @@
<format property="dateversion" pattern="yyyy.MM.dd.HH.mm.ss" />
</tstamp>
- <property name="name" value="${ant.project.name}"/>
<property name="Name" value="Lucene"/>
- <property name="dev.version" value="5.0-SNAPSHOT"/>
+
+ <property name="name" value="${ant.project.name}"/>
<property name="tests.luceneMatchVersion" value="5.0"/>
+ <property name="dev.version.base" value="5.0"/>
+ <property name="dev.version.suffix" value="SNAPSHOT"/>
+ <property name="dev.version" value="${dev.version.base}-${dev.version.suffix}"/>
<property name="version" value="${dev.version}"/>
<property name="spec.version" value="${version}"/>
+
<property name="year" value="2000-${current.year}"/>
+
+ <!-- Lucene modules unfortunately don't have the "lucene-" prefix, so we add it if no prefix is given in $name: -->
+ <condition property="final.name" value="${name}-${version}">
+ <matches pattern="^(lucene|solr)\b" string="${name}"/>
+ </condition>
<property name="final.name" value="lucene-${name}-${version}"/>
- <property name="common.classpath.excludes" value="**/*.txt,**/*.template,**/*.sha1" />
+ <!-- we exclude ext/*.jar because we don't want example/lib/ext logging jars on the cp -->
+ <property name="common.classpath.excludes" value="**/*.txt,**/*.template,**/*.sha1,ext/*.jar" />
<property name="ivy.bootstrap.version" value="2.3.0" />
<property name="ivy.default.configuration" value="*"/>
@@ -106,6 +116,7 @@
<property name="tests.asserts.gracious" value="false"/>
<property name="tests.verbose" value="false"/>
<property name="tests.infostream" value="${tests.verbose}"/>
+ <property name="tests.filterstacks" value="true"/>
<condition property="tests.heapsize" value="768M">
<isset property="run.clover"/>
@@ -143,16 +154,16 @@
<property name="javac.deprecation" value="off"/>
<property name="javac.debug" value="on"/>
- <property name="javac.source" value="1.6"/>
- <property name="javac.target" value="1.6"/>
- <property name="javac.source.backwards" value="1.6"/>
- <property name="javac.target.backwards" value="1.6"/>
+ <property name="javac.source" value="1.7"/>
+ <property name="javac.target" value="1.7"/>
+ <property name="javac.source.backwards" value="1.7"/>
+ <property name="javac.target.backwards" value="1.7"/>
<property name="javac.args" value="-Xlint -Xlint:-deprecation -Xlint:-serial -Xlint:-options"/>
<property name="bootclasspath" value=""/>
- <property name="javadoc.link" value="http://download.oracle.com/javase/6/docs/api/"/>
+ <property name="javadoc.link" value="http://download.oracle.com/javase/7/docs/api/"/>
<property name="javadoc.link.junit" value="http://junit.sourceforge.net/javadoc/"/>
<property name="javadoc.packagelist.dir" location="${common.dir}/tools/javadoc"/>
- <available file="${javadoc.packagelist.dir}/java6/package-list" property="javadoc.java6.packagelist.exists"/>
+ <available file="${javadoc.packagelist.dir}/java7/package-list" property="javadoc.java7.packagelist.exists"/>
<property name="javadoc.access" value="protected"/>
<property name="javadoc.charset" value="utf-8"/>
<property name="javadoc.dir" location="${common.dir}/build/docs"/>
@@ -253,7 +264,7 @@
</propertyset>
<patternset id="lucene.local.src.package.patterns"
- excludes="**/pom.xml,**/*.iml,**/*.jar,build/**,dist/**,benchmark/work/**,benchmark/temp/**,tools/javadoc/java6/**,tools/clover/**"
+ excludes="**/pom.xml,**/*.iml,**/*.jar,build/**,dist/**,benchmark/work/**,benchmark/temp/**,tools/javadoc/java7/**,tools/clover/**"
/>
<!-- Default exclude sources and javadoc jars from Ivy fetch to save time and bandwidth -->
@@ -282,10 +293,7 @@
<condition property="build.java.runtime" value="1.7">
<hasmethod classname="java.lang.Throwable" method="getSuppressed"/>
</condition>
- <condition property="build.java.runtime" value="1.6">
- <hasmethod classname="java.lang.String" method="isEmpty"/>
- </condition>
- <fail message="Minimum supported Java version is 1.6." unless="build.java.runtime"/>
+ <fail message="Minimum supported Java version is 1.7." unless="build.java.runtime"/>
<condition property="documentation-lint.supported">
<and>
@@ -296,8 +304,7 @@
</or>
<or>
<equals arg1="${build.java.runtime}" arg2="1.7"/>
- <!-- TODO: Current Java 8 JDKs have broken Javadocs -->
- <!--<equals arg1="${build.java.runtime}" arg2="1.8"/>-->
+ <equals arg1="${build.java.runtime}" arg2="1.8"/>
</or>
<!-- TODO: Fix this! For now only run this on 64bit, because jTIDY OOMs with default heap size: -->
<contains string="${os.arch}" substring="64"/>
@@ -989,8 +996,25 @@
maxClassNameColumns="${tests.maxClassNameColumns}"
timestamps="${tests.timestamps}"
- showNumFailures="${tests.showNumFailures}"
- />
+ showNumFailures="${tests.showNumFailures}">
+
+ <!-- Filter stack traces. The default set of filters is similar to Ant's (reflection, assertions, junit's own stuff). -->
+ <junit4:filtertrace defaults="true" enabled="${tests.filterstacks}">
+ <!-- Lucene-specific stack frames (test rules mostly). -->
+ <containsstring contains="at com.carrotsearch.randomizedtesting.RandomizedRunner" />
+ <containsstring contains="at org.apache.lucene.util.AbstractBeforeAfterRule" />
+ <containsstring contains="at com.carrotsearch.randomizedtesting.rules." />
+ <containsstring contains="at org.apache.lucene.util.TestRule" />
+ <containsstring contains="at com.carrotsearch.randomizedtesting.rules.StatementAdapter" />
+ <containsstring contains="at com.carrotsearch.randomizedtesting.ThreadLeakControl" />
+
+ <!-- Add custom filters if you like. Lines that match these will be removed. -->
+ <!--
+ <containsstring contains=".." />
+ <containsregex pattern="^(\s+at )(org\.junit\.)" />
+ -->
+ </junit4:filtertrace>
+ </junit4:report-text>
<!-- Emits full status for all tests, their relative order on slaves. -->
<junit4:report-text
@@ -1187,6 +1211,9 @@ ant -Dtests.file.encoding=XXX ...
# the test passes.
ant -Dtests.leaveTemporary=true
+# Do *not* filter stack traces emitted to the console.
+ant -Dtests.filterstacks=false
+
# Output test files and reports.
${tests-output}/tests-report.txt - full ASCII tests report
${tests-output}/tests-failures.txt - failures only (if any)
@@ -1239,7 +1266,7 @@ ${tests-output}/junit4-*.suites - pe
]]></fail>
<echo>Code coverage with Atlassian Clover enabled.</echo>
- <ivy:cachepath organisation="com.cenqua.clover" module="clover" revision="2.6.3"
+ <ivy:cachepath organisation="com.cenqua.clover" module="clover" revision="3.1.10"
inline="true" conf="master" type="jar" pathid="clover.classpath"/>
<taskdef resource="cloverlib.xml" classpathref="clover.classpath" />
<mkdir dir="${clover.db.dir}"/>
@@ -1387,7 +1414,7 @@ ${tests-output}/junit4-*.suites - pe
</sequential>
</target>
- <target name="-validate-maven-dependencies.init">
+ <target name="-validate-maven-dependencies.init" depends="filter-pom-templates">
<!-- find the correct pom.xml path and assigns it to property pom.xml -->
<property name="top.level.dir" location="${common.dir}/.."/>
<pathconvert property="maven.pom.xml">
@@ -1414,6 +1441,11 @@ ${tests-output}/junit4-*.suites - pe
<target name="-validate-maven-dependencies" depends="-validate-maven-dependencies.init">
<m2-validate-dependencies pom.xml="${maven.pom.xml}" licenseDirectory="${license.dir}">
+ <additional-filters>
+ <replaceregex pattern="jetty([^/]+)$" replace="jetty" flags="gi" />
+ <replaceregex pattern="slf4j-([^/]+)$" replace="slf4j" flags="gi" />
+ <replaceregex pattern="javax\.servlet([^/]+)$" replace="javax.servlet" flags="gi" />
+ </additional-filters>
<excludes>
<rsel:name name="**/lucene-*-${maven.version.glob}.jar" handledirsep="true"/>
</excludes>
@@ -1519,6 +1551,14 @@ ${tests-output}/junit4-*.suites - pe
<pattern substring="Permission is hereby granted, free of charge, to any person obtaining a copy"/>
</rat:substringMatcher>
+ <!-- apache -->
+ <rat:substringMatcher licenseFamilyCategory="AL "
+ licenseFamilyName="Apache">
+ <pattern substring="Licensed to the Apache Software Foundation (ASF) under"/>
+ <!-- this is the old-school one under some files -->
+ <pattern substring="Licensed under the Apache License, Version 2.0 (the "License")"/>
+ </rat:substringMatcher>
+
<rat:substringMatcher licenseFamilyCategory="GEN "
licenseFamilyName="Generated">
<!-- svg files generated by gnuplot -->
@@ -1530,7 +1570,7 @@ ${tests-output}/junit4-*.suites - pe
</rat:substringMatcher>
<!-- built in approved licenses -->
- <rat:approvedLicense familyName="Apache License Version 2.0"/>
+ <rat:approvedLicense familyName="Apache"/>
<rat:approvedLicense familyName="The MIT License"/>
<rat:approvedLicense familyName="Modified BSD License"/>
<rat:approvedLicense familyName="Generated"/>
@@ -1540,7 +1580,12 @@ ${tests-output}/junit4-*.suites - pe
<echo>${rat.output}</echo>
<delete>
<fileset file="${rat.sources.logfile}">
- <containsregexp expression="^0 Unknown Licenses"/>
+ <and>
+ <containsregexp expression="^0 Unknown Licenses"/>
+ <not>
+ <containsregexp expression="^\s+!AL"/>
+ </not>
+ </and>
</fileset>
</delete>
<!-- fail if we didnt find the pattern -->
@@ -1585,21 +1630,43 @@ ${tests-output}/junit4-*.suites - pe
</sequential>
</macrodef>
- <target name="-ecj-javadoc-lint" depends="-ecj-javadoc-lint-src,-ecj-javadoc-lint-tests"/>
+ <!-- ECJ Javadoc linting: -->
+
+ <condition property="ecj-javadoc-lint.supported">
+ <not><equals arg1="${build.java.runtime}" arg2="1.8"/></not>
+ </condition>
+
+ <condition property="ecj-javadoc-lint-tests.supported">
+ <and>
+ <isset property="ecj-javadoc-lint.supported"/>
+ <isset property="module.has.tests"/>
+ </and>
+ </condition>
+
+ <target name="-ecj-javadoc-lint-unsupported" unless="ecj-javadoc-lint.supported">
+ <fail message="Linting documentation with ECJ is not supported on this Java version (${build.java.runtime}).">
+ <condition>
+ <not><isset property="is.jenkins.build"/></not>
+ </condition>
+ </fail>
+ <echo level="warning" message="WARN: Linting documentation with ECJ is not supported on this Java version (${build.java.runtime}). NOTHING DONE!"/>
+ </target>
+
+ <target name="-ecj-javadoc-lint" depends="-ecj-javadoc-lint-unsupported,-ecj-javadoc-lint-src,-ecj-javadoc-lint-tests"/>
- <target name="-ecj-javadoc-lint-src" depends="-ecj-resolve">
+ <target name="-ecj-javadoc-lint-src" depends="-ecj-resolve" if="ecj-javadoc-lint.supported">
<ecj-macro srcdir="${src.dir}" configuration="${common.dir}/tools/javadoc/ecj.javadocs.prefs">
<classpath refid="classpath"/>
</ecj-macro>
</target>
- <target name="-ecj-javadoc-lint-tests" depends="-ecj-resolve" if="module.has.tests">
+ <target name="-ecj-javadoc-lint-tests" depends="-ecj-resolve" if="ecj-javadoc-lint-tests.supported">
<ecj-macro srcdir="${tests.src.dir}" configuration="${common.dir}/tools/javadoc/ecj.javadocs.prefs">
<classpath refid="test.classpath"/>
</ecj-macro>
</target>
- <target name="-ecj-resolve" unless="ecj.loaded" depends="ivy-availability-check,ivy-configure">
+ <target name="-ecj-resolve" unless="ecj.loaded" depends="ivy-availability-check,ivy-configure" if="ecj-javadoc-lint.supported">
<ivy:cachepath organisation="org.eclipse.jdt.core.compiler" module="ecj" revision="3.7.2"
inline="true" conf="master" type="jar" pathid="ecj.classpath" />
<componentdef classname="org.eclipse.jdt.core.JDTCompilerAdapter"
@@ -1662,7 +1729,7 @@ ${tests-output}/junit4-*.suites - pe
<attribute name="overview" default="${src.dir}/overview.html"/>
<attribute name="linksource" default="no"/>
<sequential>
- <antcall target="download-java6-javadoc-packagelist"/>
+ <antcall target="download-java7-javadoc-packagelist"/>
<delete file="@{destdir}/stylesheet.css" failonerror="false"/>
<copy todir="@{destdir}" file="${prettify.dir}/prettify.js" overwrite="false" />
<record name="@{destdir}/log_javadoc.txt" action="start" append="no"/>
@@ -1691,8 +1758,8 @@ ${tests-output}/junit4-*.suites - pe
description="WARNING: This API is experimental and might change in incompatible ways in the next release."/>
<tag name="lucene.internal"
description="NOTE: This API is for internal purposes only and might change in incompatible ways in the next release."/>
- <link offline="true" packagelistLoc="${javadoc.dir}"/>
- <link offline="true" href="${javadoc.link}" packagelistLoc="${javadoc.packagelist.dir}/java6"/>
+ <link offline="true" packagelistLoc="${javadoc.dir}"/>
+ <link offline="true" href="${javadoc.link}" packagelistLoc="${javadoc.packagelist.dir}/java7"/>
<bottom><![CDATA[
<i>Copyright © ${year} Apache Software Foundation. All Rights Reserved.</i>
<script src='{@docRoot}/prettify.js' type='text/javascript'></script>
@@ -1762,10 +1829,10 @@ ${tests-output}/junit4-*.suites - pe
</sequential>
</macrodef>
- <target name="download-java6-javadoc-packagelist" unless="javadoc.java6.packagelist.exists">
- <mkdir dir="${javadoc.packagelist.dir}/java6"/>
+ <target name="download-java7-javadoc-packagelist" unless="javadoc.java7.packagelist.exists">
+ <mkdir dir="${javadoc.packagelist.dir}/java7"/>
<get src="${javadoc.link}/package-list"
- dest="${javadoc.packagelist.dir}/java6/package-list" ignoreerrors="true"/>
+ dest="${javadoc.packagelist.dir}/java7/package-list" ignoreerrors="true"/>
</target>
<!-- VALIDATION work -->
@@ -1942,7 +2009,7 @@ ${tests-output}/junit4-*.suites - pe
<!-- Forbidden API Task -->
<target name="install-forbidden-apis" unless="forbidden-apis.loaded" depends="ivy-availability-check,ivy-configure">
- <ivy:cachepath organisation="de.thetaphi" module="forbiddenapis" revision="1.2"
+ <ivy:cachepath organisation="de.thetaphi" module="forbiddenapis" revision="1.3"
inline="true" conf="default" transitive="true" pathid="forbidden-apis.classpath"/>
<taskdef name="forbidden-apis" classname="de.thetaphi.forbiddenapis.AntTask" classpathref="forbidden-apis.classpath"/>
<property name="forbidden-apis.loaded" value="true"/>
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java Thu May 30 07:53:18 2013
@@ -17,12 +17,9 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
-import java.io.FileOutputStream;
import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
@@ -37,11 +34,21 @@ import org.apache.lucene.util.automaton.
/** Consumes a TokenStream and creates an {@link Automaton}
* where the transition labels are UTF8 bytes from the {@link
* TermToBytesRefAttribute}. Between tokens we insert
- * POS_SEP and for holes we insert HOLE. */
+ * POS_SEP and for holes we insert HOLE.
+ *
+ * @lucene.experimental */
public class TokenStreamToAutomaton {
+ private boolean preservePositionIncrements;
+
/** Sole constructor. */
public TokenStreamToAutomaton() {
+ this.preservePositionIncrements = true;
+ }
+
+ /** Whether to generate holes in the automaton for missing positions, <code>true</code> by default. */
+ public void setPreservePositionIncrements(boolean enablePositionIncrements) {
+ this.preservePositionIncrements = enablePositionIncrements;
}
private static class Position implements RollingBuffer.Resettable {
@@ -89,6 +96,7 @@ public class TokenStreamToAutomaton {
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
+ final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);
final BytesRef term = termBytesAtt.getBytesRef();
@@ -101,9 +109,12 @@ public class TokenStreamToAutomaton {
int pos = -1;
Position posData = null;
-
+ int maxOffset = 0;
while (in.incrementToken()) {
int posInc = posIncAtt.getPositionIncrement();
+ if (!preservePositionIncrements && posInc > 1) {
+ posInc = 1;
+ }
assert pos > -1 || posInc > 0;
if (posInc > 0) {
@@ -157,13 +168,26 @@ public class TokenStreamToAutomaton {
state.addTransition(new Transition(term2.bytes[term2.offset + byteIDX] & 0xff, nextState));
state = nextState;
}
+
+ maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
+ }
+
+ in.end();
+ State endState = null;
+ if (offsetAtt.endOffset() > maxOffset) {
+ endState = new State();
+ endState.setAccept(true);
}
pos++;
while (pos <= positions.getMaxPos()) {
posData = positions.get(pos);
if (posData.arriving != null) {
- posData.arriving.setAccept(true);
+ if (endState != null) {
+ posData.arriving.addTransition(new Transition(POS_SEP, endState));
+ } else {
+ posData.arriving.setAccept(true);
+ }
}
pos++;
}
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java Thu May 30 07:53:18 2013
@@ -47,13 +47,6 @@ public abstract class Tokenizer extends
this.input = input;
}
- /** Construct a token stream processing the given input using the given AttributeSource. */
- protected Tokenizer(AttributeSource source, Reader input) {
- super(source);
- assert input != null: "input must not be null";
- this.input = input;
- }
-
/**
* {@inheritDoc}
* <p>
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/analysis/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/analysis/package.html?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/analysis/package.html (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/analysis/package.html Thu May 30 07:53:18 2013
@@ -282,18 +282,18 @@ and proximity searches (though sentence
<p>
If the selected analyzer filters the stop words "is" and "the", then for a document
containing the string "blue is the sky", only the tokens "blue", "sky" are indexed,
- with position("sky") = 1 + position("blue"). Now, a phrase query "blue is the sky"
+ with position("sky") = 3 + position("blue"). Now, a phrase query "blue is the sky"
would find that document, because the same analyzer filters the same stop words from
- that query. But also the phrase query "blue sky" would find that document.
+ that query. But the phrase query "blue sky" would not find that document because the
+ position increment between "blue" and "sky" is only 1.
</p>
<p>
- If this behavior does not fit the application needs, a modified analyzer can
- be used, that would increment further the positions of tokens following a
- removed stop word, using
- {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}.
- This can be done with something like the following (note, however, that
- StopFilter natively includes this capability by subclassing
- FilteringTokenFilter}:
+ If this behavior does not fit the application needs, the query parser needs to be
+ configured to not take position increments into account when generating phrase queries.
+</p>
+<p>
+ Note that a StopFilter MUST increment the position increment in order not to generate corrupt
+ tokenstream graphs. Here is the logic used by StopFilter to increment positions when filtering out tokens:
</p>
<PRE class="prettyprint">
public TokenStream tokenStream(final String fieldName, Reader reader) {
@@ -308,7 +308,7 @@ and proximity searches (though sentence
boolean hasNext = ts.incrementToken();
if (hasNext) {
if (stopWords.contains(termAtt.toString())) {
- extraIncrement++; // filter this word
+ extraIncrement += posIncrAtt.getPositionIncrement(); // filter this word
continue;
}
if (extraIncrement>0) {
@@ -323,11 +323,6 @@ and proximity searches (though sentence
}
</PRE>
<p>
- Now, with this modified analyzer, the phrase query "blue sky" would find that document.
- But note that this is yet not a perfect solution, because any phrase query "blue w1 w2 sky"
- where both w1 and w2 are stop words would match that document.
-</p>
-<p>
A few more use cases for modifying position increments are:
</p>
<ol>
@@ -338,6 +333,72 @@ and proximity searches (though sentence
As result, all synonyms of a token would be considered to appear in exactly the
same position as that token, and so would they be seen by phrase and proximity searches.</li>
</ol>
+
+<h3>Token Position Length</h3>
+<p>
+ By default, all tokens created by Analyzers and Tokenizers have a
+ {@link org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute#getPositionLength() position length} of one.
+ This means that the token occupies a single position. This attribute is not indexed
+ and thus not taken into account for positional queries, but is used by e.g. suggesters.
+</p>
+<p>
+ The main use case for positions lengths is multi-word synonyms. With single-word
+ synonyms, setting the position increment to 0 is enough to denote the fact that two
+ words are synonyms, for example:
+</p>
+<table>
+<tr><td>Term</td><td>red</td><td>magenta</td></tr>
+<tr><td>Position increment</td><td>1</td><td>0</td></tr>
+</table>
+<p>
+ Given that position(magenta) = 0 + position(red), they are at the same position, so anything
+ working with analyzers will return the exact same result if you replace "magenta" with "red"
+ in the input. However, multi-word synonyms are more tricky. Let's say that you want to build
+ a TokenStream where "IBM" is a synonym of "International Business Machines". Position increments
+ are not enough anymore:
+</p>
+<table>
+<tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr>
+<tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr>
+</table>
+<p>
+ The problem with this token stream is that "IBM" is at the same position as "International"
+ although it is a synonym with "International Business Machines" as a whole. Setting
+ the position increment of "Business" and "Machines" to 0 wouldn't help as it would mean
+ that "International" is a synonym of "Business". The only way to solve this issue is to
+ make "IBM" span across 3 positions; this is where position lengths come to the rescue.
+</p>
+<table>
+<tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr>
+<tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr>
+<tr><td>Position length</td><td>3</td><td>1</td><td>1</td><td>1</td></tr>
+</table>
+<p>
+ This new attribute makes clear that "IBM" and "International Business Machines" start and end
+ at the same positions.
+</p>
+<a name="corrupt" />
+<h3>How to not write corrupt token streams</h3>
+<p>
+ There are a few rules to observe when writing custom Tokenizers and TokenFilters:
+</p>
+<ul>
+ <li>The first position increment must be > 0.</li>
+ <li>Positions must not go backward.</li>
+ <li>Tokens that have the same start position must have the same start offset.</li>
+ <li>Tokens that have the same end position (taking into account the position length) must have the same end offset.</li>
+</ul>
+<p>
+ Although these rules might seem easy to follow, problems can quickly happen when chaining
+ badly implemented filters that play with positions and offsets, such as synonym or n-grams
+ filters. Here are good practices for writing correct filters:
+</p>
+<ul>
+ <li>Token filters should not modify offsets. If you feel that your filter would need to modify offsets, then it should probably be implemented as a tokenizer.</li>
+ <li>Token filters should not insert positions. If a filter needs to add tokens, then they should all have a position increment of 0.</li>
+ <li>When they remove tokens, token filters should increment the position increment of the following token.</li>
+ <li>Token filters should preserve position lengths.</li>
+</ul>
<h2>TokenStream API</h2>
<p>
"Flexible Indexing" summarizes the effort of making the Lucene indexer
@@ -383,6 +444,10 @@ and proximity searches (though sentence
<td>See above for detailed information about position increment.</td>
</tr>
<tr>
+ <td>{@link org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute}</td>
+ <td>The number of positions occupied by a token.</td>
+ </tr>
+ <tr>
<td>{@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}</td>
<td>The payload that a Token can optionally have.</td>
</tr>
@@ -532,20 +597,26 @@ public final class LengthFilter extends
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
- * Build a filter that removes words that are too long or too
- * short from the text.
+ * Create a new LengthFilter. This will filter out tokens whose
+ * CharTermAttribute is either too short
+ * (< min) or too long (> max).
+ * @param version the Lucene match version
+ * @param in the TokenStream to consume
+ * @param min the minimum length
+ * @param max the maximum length
*/
- public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
- super(enablePositionIncrements, in);
+ public LengthFilter(Version version, TokenStream in, int min, int max) {
+ super(version, in);
this.min = min;
this.max = max;
}
-
+
{@literal @Override}
- public boolean accept() throws IOException {
+ public boolean accept() {
final int len = termAtt.length();
- return (len >= min && len <= max);
+ return (len >= min && len <= max);
}
+
}
</pre>
<p>
@@ -573,66 +644,39 @@ public final class LengthFilter extends
public abstract class FilteringTokenFilter extends TokenFilter {
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
- public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
- super(input);
- this.enablePositionIncrements = enablePositionIncrements;
+ /**
+ * Create a new FilteringTokenFilter.
+ * @param in the TokenStream to consume
+ */
+ public FilteringTokenFilter(Version version, TokenStream in) {
+ super(in);
}
- /** Override this method and return if the current input token should be returned by {@literal {@link #incrementToken}}. */
+ /** Override this method and return if the current input token should be returned by incrementToken. */
protected abstract boolean accept() throws IOException;
{@literal @Override}
public final boolean incrementToken() throws IOException {
- if (enablePositionIncrements) {
- int skippedPositions = 0;
- while (input.incrementToken()) {
- if (accept()) {
- if (skippedPositions != 0) {
- posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
- }
- return true;
- }
- skippedPositions += posIncrAtt.getPositionIncrement();
- }
- } else {
- while (input.incrementToken()) {
- if (accept()) {
- return true;
+ int skippedPositions = 0;
+ while (input.incrementToken()) {
+ if (accept()) {
+ if (skippedPositions != 0) {
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}
+ return true;
}
+ skippedPositions += posIncrAtt.getPositionIncrement();
}
// reached EOS -- return false
return false;
}
- /**
- * {@literal @see #setEnablePositionIncrements(boolean)}
- */
- public boolean getEnablePositionIncrements() {
- return enablePositionIncrements;
+ {@literal @Override}
+ public void reset() throws IOException {
+ super.reset();
}
- /**
- * If <code>true</code>, this TokenFilter will preserve
- * positions of the incoming tokens (ie, accumulate and
- * set position increments of the removed tokens).
- * Generally, <code>true</code> is best as it does not
- * lose information (positions of the original tokens)
- * during indexing.
- *
- * <p> When set, when a token is stopped
- * (omitted), the position increment of the following
- * token is incremented.
- *
- * <p> <b>NOTE</b>: be sure to also
- * set org.apache.lucene.queryparser.classic.QueryParser#setEnablePositionIncrements if
- * you use QueryParser to create queries.
- */
- public void setEnablePositionIncrements(boolean enable) {
- this.enablePositionIncrements = enable;
- }
}
</pre>
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java Thu May 30 07:53:18 2013
@@ -32,9 +32,7 @@ import org.apache.lucene.index.MultiDocV
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SortedDocValues;
-import org.apache.lucene.index.SortedDocValuesTermsEnum;
import org.apache.lucene.index.SortedSetDocValues;
-import org.apache.lucene.index.SortedSetDocValuesTermsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
@@ -269,7 +267,7 @@ public abstract class DocValuesConsumer
SortedDocValues dv = dvs[sub];
Bits liveDocs = reader.getLiveDocs();
if (liveDocs == null) {
- liveTerms[sub] = new SortedDocValuesTermsEnum(dv);
+ liveTerms[sub] = dv.termsEnum();
} else {
OpenBitSet bitset = new OpenBitSet(dv.getValueCount());
for (int i = 0; i < reader.maxDoc(); i++) {
@@ -277,7 +275,7 @@ public abstract class DocValuesConsumer
bitset.set(dv.getOrd(i));
}
}
- liveTerms[sub] = new BitsFilteredTermsEnum(new SortedDocValuesTermsEnum(dv), bitset);
+ liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
}
}
@@ -401,7 +399,7 @@ public abstract class DocValuesConsumer
SortedSetDocValues dv = dvs[sub];
Bits liveDocs = reader.getLiveDocs();
if (liveDocs == null) {
- liveTerms[sub] = new SortedSetDocValuesTermsEnum(dv);
+ liveTerms[sub] = dv.termsEnum();
} else {
OpenBitSet bitset = new OpenBitSet(dv.getValueCount());
for (int i = 0; i < reader.maxDoc(); i++) {
@@ -413,7 +411,7 @@ public abstract class DocValuesConsumer
}
}
}
- liveTerms[sub] = new BitsFilteredTermsEnum(new SortedSetDocValuesTermsEnum(dv), bitset);
+ liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
}
}
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsAndPositionsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsAndPositionsEnum.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsAndPositionsEnum.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsAndPositionsEnum.java Thu May 30 07:53:18 2013
@@ -134,5 +134,14 @@ public final class MappingMultiDocsAndPo
public BytesRef getPayload() throws IOException {
return current.getPayload();
}
+
+ @Override
+ public long cost() {
+ long cost = 0;
+ for (EnumWithSlice enumWithSlice : subs) {
+ cost += enumWithSlice.docsAndPositionsEnum.cost();
+ }
+ return cost;
+ }
}
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsEnum.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsEnum.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsEnum.java Thu May 30 07:53:18 2013
@@ -114,5 +114,14 @@ public final class MappingMultiDocsEnum
}
}
}
+
+ @Override
+ public long cost() {
+ long cost = 0;
+ for (EnumWithSlice enumWithSlice : subs) {
+ cost += enumWithSlice.docsEnum.cost();
+ }
+ return cost;
+ }
}
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java Thu May 30 07:53:18 2013
@@ -82,7 +82,7 @@ public final class CompressingStoredFiel
avgChunkDocs[blockCount] = fieldsIndexIn.readVInt();
final int bitsPerDocBase = fieldsIndexIn.readVInt();
if (bitsPerDocBase > 32) {
- throw new CorruptIndexException("Corrupted");
+ throw new CorruptIndexException("Corrupted bitsPerDocBase (resource=" + fieldsIndexIn + ")");
}
docBasesDeltas[blockCount] = PackedInts.getReaderNoHeader(fieldsIndexIn, PackedInts.Format.PACKED, packedIntsVersion, numChunks, bitsPerDocBase);
@@ -91,7 +91,7 @@ public final class CompressingStoredFiel
avgChunkSizes[blockCount] = fieldsIndexIn.readVLong();
final int bitsPerStartPointer = fieldsIndexIn.readVInt();
if (bitsPerStartPointer > 64) {
- throw new CorruptIndexException("Corrupted");
+ throw new CorruptIndexException("Corrupted bitsPerStartPointer (resource=" + fieldsIndexIn + ")");
}
startPointersDeltas[blockCount] = PackedInts.getReaderNoHeader(fieldsIndexIn, PackedInts.Format.PACKED, packedIntsVersion, numChunks, bitsPerStartPointer);
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java Thu May 30 07:53:18 2013
@@ -202,7 +202,7 @@ public final class CompressingStoredFiel
|| docBase + chunkDocs > numDocs) {
throw new CorruptIndexException("Corrupted: docID=" + docID
+ ", docBase=" + docBase + ", chunkDocs=" + chunkDocs
- + ", numDocs=" + numDocs);
+ + ", numDocs=" + numDocs + " (resource=" + fieldsStream + ")");
}
final int numStoredFields, offset, length, totalLength;
@@ -216,7 +216,7 @@ public final class CompressingStoredFiel
if (bitsPerStoredFields == 0) {
numStoredFields = fieldsStream.readVInt();
} else if (bitsPerStoredFields > 31) {
- throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields);
+ throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields + " (resource=" + fieldsStream + ")");
} else {
final long filePointer = fieldsStream.getFilePointer();
final PackedInts.Reader reader = PackedInts.getDirectReaderNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields);
@@ -230,7 +230,7 @@ public final class CompressingStoredFiel
offset = (docID - docBase) * length;
totalLength = chunkDocs * length;
} else if (bitsPerStoredFields > 31) {
- throw new CorruptIndexException("bitsPerLength=" + bitsPerLength);
+ throw new CorruptIndexException("bitsPerLength=" + bitsPerLength + " (resource=" + fieldsStream + ")");
} else {
final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);
int off = 0;
@@ -248,7 +248,7 @@ public final class CompressingStoredFiel
}
if ((length == 0) != (numStoredFields == 0)) {
- throw new CorruptIndexException("length=" + length + ", numStoredFields=" + numStoredFields);
+ throw new CorruptIndexException("length=" + length + ", numStoredFields=" + numStoredFields + " (resource=" + fieldsStream + ")");
}
if (numStoredFields == 0) {
// nothing to do
@@ -338,7 +338,7 @@ public final class CompressingStoredFiel
|| docBase + chunkDocs > numDocs) {
throw new CorruptIndexException("Corrupted: current docBase=" + this.docBase
+ ", current numDocs=" + this.chunkDocs + ", new docBase=" + docBase
- + ", new numDocs=" + chunkDocs);
+ + ", new numDocs=" + chunkDocs + " (resource=" + fieldsStream + ")");
}
this.docBase = docBase;
this.chunkDocs = chunkDocs;
@@ -357,7 +357,7 @@ public final class CompressingStoredFiel
if (bitsPerStoredFields == 0) {
Arrays.fill(numStoredFields, 0, chunkDocs, fieldsStream.readVInt());
} else if (bitsPerStoredFields > 31) {
- throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields);
+ throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields + " (resource=" + fieldsStream + ")");
} else {
final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields, 1);
for (int i = 0; i < chunkDocs; ++i) {
@@ -387,7 +387,7 @@ public final class CompressingStoredFiel
final int chunkSize = chunkSize();
decompressor.decompress(fieldsStream, chunkSize, 0, chunkSize, bytes);
if (bytes.length != chunkSize) {
- throw new CorruptIndexException("Corrupted: expected chunk size = " + chunkSize() + ", got " + bytes.length);
+ throw new CorruptIndexException("Corrupted: expected chunk size = " + chunkSize() + ", got " + bytes.length + " (resource=" + fieldsStream + ")");
}
}
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java Thu May 30 07:53:18 2013
@@ -53,6 +53,9 @@ import org.apache.lucene.util.packed.Pac
*/
public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
+ // hard limit on the maximum number of documents per chunk
+ static final int MAX_DOCUMENTS_PER_CHUNK = 128;
+
static final int STRING = 0x00;
static final int BYTE_ARR = 0x01;
static final int NUMERIC_INT = 0x02;
@@ -200,7 +203,7 @@ public final class CompressingStoredFiel
private boolean triggerFlush() {
return bufferedDocs.length >= chunkSize || // chunks of at least chunkSize bytes
- numBufferedDocs >= chunkSize; // can be necessary if most docs are empty
+ numBufferedDocs >= MAX_DOCUMENTS_PER_CHUNK;
}
private void flush() throws IOException {
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java Thu May 30 07:53:18 2013
@@ -187,7 +187,7 @@ public final class CompressingTermVector
final int docBase = vectorsStream.readVInt();
final int chunkDocs = vectorsStream.readVInt();
if (doc < docBase || doc >= docBase + chunkDocs || docBase + chunkDocs > numDocs) {
- throw new CorruptIndexException("docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc);
+ throw new CorruptIndexException("docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc + " (resource=" + vectorsStream + ")");
}
final int skip; // number of fields to skip
@@ -1030,13 +1030,13 @@ public final class CompressingTermVector
@Override
public int advance(int target) throws IOException {
- if (doc == -1 && target == 0 && (liveDocs == null || liveDocs.get(0))) {
- return (doc = 0);
- } else {
- return (doc = NO_MORE_DOCS);
- }
+ return slowAdvance(target);
}
+ @Override
+ public long cost() {
+ return 1;
+ }
}
private static int sum(int[] arr) {
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java Thu May 30 07:53:18 2013
@@ -56,6 +56,9 @@ import org.apache.lucene.util.packed.Pac
*/
public final class CompressingTermVectorsWriter extends TermVectorsWriter {
+ // hard limit on the maximum number of documents per chunk
+ static final int MAX_DOCUMENTS_PER_CHUNK = 128;
+
static final String VECTORS_EXTENSION = "tvd";
static final String VECTORS_INDEX_EXTENSION = "tvx";
@@ -322,7 +325,8 @@ public final class CompressingTermVector
}
private boolean triggerFlush() {
- return termSuffixes.length >= chunkSize || pendingDocs.size() >= chunkSize;
+ return termSuffixes.length >= chunkSize
+ || pendingDocs.size() >= MAX_DOCUMENTS_PER_CHUNK;
}
private void flush() throws IOException {
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java Thu May 30 07:53:18 2013
@@ -134,7 +134,7 @@ public abstract class CompressionMode {
}
final int decompressedLength = LZ4.decompress(in, offset + length, bytes.bytes, 0);
if (decompressedLength > originalLength) {
- throw new CorruptIndexException("Corrupted: lengths mismatch: " + decompressedLength + " > " + originalLength);
+ throw new CorruptIndexException("Corrupted: lengths mismatch: " + decompressedLength + " > " + originalLength + " (resource=" + in + ")");
}
bytes.offset = offset;
bytes.length = length;
@@ -222,7 +222,7 @@ public abstract class CompressionMode {
}
}
if (bytes.length != originalLength) {
- throw new CorruptIndexException("Lengths mismatch: " + bytes.length + " != " + originalLength);
+ throw new CorruptIndexException("Lengths mismatch: " + bytes.length + " != " + originalLength + " (resource=" + in + ")");
}
bytes.offset = offset;
bytes.length = length;
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java Thu May 30 07:53:18 2013
@@ -99,7 +99,7 @@ class Lucene40FieldInfosReader extends F
}
if (oldNormsType.mapping != null) {
if (oldNormsType.mapping != DocValuesType.NUMERIC) {
- throw new CorruptIndexException("invalid norm type: " + oldNormsType);
+ throw new CorruptIndexException("invalid norm type: " + oldNormsType + " (resource=" + input + ")");
}
attributes.put(LEGACY_NORM_TYPE_KEY, oldNormsType.name());
}
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java Thu May 30 07:53:18 2013
@@ -513,6 +513,11 @@ public class Lucene40PostingsReader exte
}
return scanTo(target);
}
+
+ @Override
+ public long cost() {
+ return limit;
+ }
}
private final class AllDocsSegmentDocsEnum extends SegmentDocsEnumBase {
@@ -886,6 +891,11 @@ public class Lucene40PostingsReader exte
public BytesRef getPayload() throws IOException {
return null;
}
+
+ @Override
+ public long cost() {
+ return limit;
+ }
}
// Decodes docs & positions & (payloads and/or offsets)
@@ -1179,5 +1189,10 @@ public class Lucene40PostingsReader exte
return null;
}
}
+
+ @Override
+ public long cost() {
+ return limit;
+ }
}
}
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java Thu May 30 07:53:18 2013
@@ -605,12 +605,8 @@ public class Lucene40TermVectorsReader e
}
@Override
- public int advance(int target) {
- if (!didNext && target == 0) {
- return nextDoc();
- } else {
- return (doc = NO_MORE_DOCS);
- }
+ public int advance(int target) throws IOException {
+ return slowAdvance(target);
}
public void reset(Bits liveDocs, int freq) {
@@ -619,6 +615,11 @@ public class Lucene40TermVectorsReader e
this.doc = -1;
didNext = false;
}
+
+ @Override
+ public long cost() {
+ return 1;
+ }
}
private static class TVDocsAndPositionsEnum extends DocsAndPositionsEnum {
@@ -659,12 +660,8 @@ public class Lucene40TermVectorsReader e
}
@Override
- public int advance(int target) {
- if (!didNext && target == 0) {
- return nextDoc();
- } else {
- return (doc = NO_MORE_DOCS);
- }
+ public int advance(int target) throws IOException {
+ return slowAdvance(target);
}
public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets, int[] payloadLengths, byte[] payloadBytes) {
@@ -726,6 +723,11 @@ public class Lucene40TermVectorsReader e
return endOffsets[nextPos-1];
}
}
+
+ @Override
+ public long cost() {
+ return 1;
+ }
}
@Override
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java Thu May 30 07:53:18 2013
@@ -599,6 +599,11 @@ public final class Lucene41PostingsReade
return nextDoc();
}
}
+
+ @Override
+ public long cost() {
+ return docFreq;
+ }
}
@@ -1010,6 +1015,11 @@ public final class Lucene41PostingsReade
public BytesRef getPayload() {
return null;
}
+
+ @Override
+ public long cost() {
+ return docFreq;
+ }
}
// Also handles payloads + offsets
@@ -1588,5 +1598,10 @@ public final class Lucene41PostingsReade
return payload;
}
}
+
+ @Override
+ public long cost() {
+ return docFreq;
+ }
}
}
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java Thu May 30 07:53:18 2013
@@ -34,6 +34,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.INPUT_TYPE;
@@ -49,7 +50,8 @@ import org.apache.lucene.util.packed.Pac
*/
class Lucene42DocValuesConsumer extends DocValuesConsumer {
static final int VERSION_START = 0;
- static final int VERSION_CURRENT = VERSION_START;
+ static final int VERSION_GCD_COMPRESSION = 1;
+ static final int VERSION_CURRENT = VERSION_GCD_COMPRESSION;
static final byte NUMBER = 0;
static final byte BYTES = 1;
@@ -60,6 +62,7 @@ class Lucene42DocValuesConsumer extends
static final byte DELTA_COMPRESSED = 0;
static final byte TABLE_COMPRESSED = 1;
static final byte UNCOMPRESSED = 2;
+ static final byte GCD_COMPRESSED = 3;
final IndexOutput data, meta;
final int maxDoc;
@@ -83,27 +86,53 @@ class Lucene42DocValuesConsumer extends
}
}
}
-
+
@Override
public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
+ addNumericField(field, values, true);
+ }
+
+ void addNumericField(FieldInfo field, Iterable<Number> values, boolean optimizeStorage) throws IOException {
meta.writeVInt(field.number);
meta.writeByte(NUMBER);
meta.writeLong(data.getFilePointer());
long minValue = Long.MAX_VALUE;
long maxValue = Long.MIN_VALUE;
+ long gcd = 0;
// TODO: more efficient?
- HashSet<Long> uniqueValues = new HashSet<Long>();
- for(Number nv : values) {
- long v = nv.longValue();
- minValue = Math.min(minValue, v);
- maxValue = Math.max(maxValue, v);
- if (uniqueValues != null) {
- if (uniqueValues.add(v)) {
- if (uniqueValues.size() > 256) {
- uniqueValues = null;
+ HashSet<Long> uniqueValues = null;
+ if (optimizeStorage) {
+ uniqueValues = new HashSet<>();
+
+ long count = 0;
+ for (Number nv : values) {
+ final long v = nv.longValue();
+
+ if (gcd != 1) {
+ if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) {
+ // in that case v - minValue might overflow and make the GCD computation return
+ // wrong results. Since these extreme values are unlikely, we just discard
+ // GCD computation for them
+ gcd = 1;
+ } else if (count != 0) { // minValue needs to be set first
+ gcd = MathUtil.gcd(gcd, v - minValue);
}
}
+
+ minValue = Math.min(minValue, v);
+ maxValue = Math.max(maxValue, v);
+
+ if (uniqueValues != null) {
+ if (uniqueValues.add(v)) {
+ if (uniqueValues.size() > 256) {
+ uniqueValues = null;
+ }
+ }
+ }
+
+ ++count;
}
+ assert count == maxDoc;
}
if (uniqueValues != null) {
@@ -135,6 +164,18 @@ class Lucene42DocValuesConsumer extends
}
writer.finish();
}
+ } else if (gcd != 0 && gcd != 1) {
+ meta.writeByte(GCD_COMPRESSED);
+ meta.writeVInt(PackedInts.VERSION_CURRENT);
+ data.writeLong(minValue);
+ data.writeLong(gcd);
+ data.writeVInt(BLOCK_SIZE);
+
+ final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
+ for (Number nv : values) {
+ writer.add((nv.longValue() - minValue) / gcd);
+ }
+ writer.finish();
} else {
meta.writeByte(DELTA_COMPRESSED); // delta-compressed
@@ -222,7 +263,7 @@ class Lucene42DocValuesConsumer extends
@Override
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
// write the ordinals as numerics
- addNumericField(field, docToOrd);
+ addNumericField(field, docToOrd, false);
// write the values as FST
writeFST(field, values);
Modified: lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java (original)
+++ lucene/dev/branches/security/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java Thu May 30 07:53:18 2013
@@ -44,6 +44,8 @@ import org.apache.lucene.util.packed.Blo
* <li>Uncompressed Numerics: when all values would fit into a single byte, and the
* <code>acceptableOverheadRatio</code> would pack values into 8 bits per value anyway, they
* are written as absolute values (with no indirection or packing) for performance.
+ * <li>GCD-compressed Numerics: when all numbers share a common divisor, such as dates, the greatest
+ * common denominator (GCD) is computed, and quotients are stored using Delta-compressed Numerics.
* <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
* Each document's value can be addressed by maxDoc*length.
* <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
@@ -93,6 +95,8 @@ import org.apache.lucene.util.packed.Blo
* <li>2 --> uncompressed. When the <code>acceptableOverheadRatio</code> parameter would upgrade the number
* of bits required to 8, and all values fit in a byte, these are written as absolute binary values
* for performance.
+ * <li>3 -->, gcd-compressed. When all integers share a common divisor, only quotients are stored
+ * using blocks of delta-encoded ints.
* </ul>
* <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
* If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
@@ -103,7 +107,7 @@ import org.apache.lucene.util.packed.Blo
* <p>For DocValues field, this stores the actual per-document data (the heavy-lifting)</p>
* <p>DocValues data (.dvd) --> Header,<NumericData | BinaryData | SortedData><sup>NumFields</sup></p>
* <ul>
- * <li>NumericData --> DeltaCompressedNumerics | TableCompressedNumerics | UncompressedNumerics</li>
+ * <li>NumericData --> DeltaCompressedNumerics | TableCompressedNumerics | UncompressedNumerics | GCDCompressedNumerics</li>
* <li>BinaryData --> {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses</li>
* <li>SortedData --> {@link FST FST<Int64>}</li>
* <li>DeltaCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=4096)}</li>