You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2015/08/01 09:48:54 UTC
svn commit: r1693682 [1/2] - in /lucene/dev/trunk/lucene: ./
sandbox/src/java/org/apache/lucene/rangetree/
sandbox/src/resources/META-INF/services/
sandbox/src/test/org/apache/lucene/rangetree/
Author: mikemccand
Date: Sat Aug 1 07:48:53 2015
New Revision: 1693682
URL: http://svn.apache.org/r1693682
Log:
LUCENE-6697: add range tree for fast numeric and binary range filtering
Added:
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/GrowingHeapSliceWriter.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/HeapSliceReader.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/HeapSliceWriter.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/NumericRangeTreeQuery.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/OfflineSliceReader.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/OfflineSliceWriter.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeDocValuesConsumer.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeDocValuesFormat.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeDocValuesProducer.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeReader.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeSortedNumericDocValues.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeSortedSetDocValues.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeWriter.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/SliceReader.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/SliceWriter.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/SortedSetRangeTreeQuery.java (with props)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/package.html (with props)
lucene/dev/trunk/lucene/sandbox/src/test/org/apache/lucene/rangetree/
lucene/dev/trunk/lucene/sandbox/src/test/org/apache/lucene/rangetree/TestRangeTree.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1693682&r1=1693681&r2=1693682&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Sat Aug 1 07:48:53 2015
@@ -141,6 +141,12 @@ New Features
* LUCENE-6695: Added a new BlendedTermQuery to blend statistics across several
terms. (Simon Willnauer, Adrien Grand)
+* LUCENE-6697: Add experimental range tree doc values format and
+ queries, based on a 1D version of the spatial BKD tree, for a faster
+ and smaller alternative to postings-based numeric and binary term
+ filtering. Range trees can also handle values larger than 64 bits.
+ (Adrien Grand, Mike McCandless)
+
API Changes
* LUCENE-6508: Simplify Lock api, there is now just
Added: lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/GrowingHeapSliceWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/GrowingHeapSliceWriter.java?rev=1693682&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/GrowingHeapSliceWriter.java (added)
+++ lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/GrowingHeapSliceWriter.java Sat Aug 1 07:48:53 2015
@@ -0,0 +1,84 @@
+package org.apache.lucene.rangetree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+
+final class GrowingHeapSliceWriter implements SliceWriter {
+ long[] values;
+ int[] docIDs;
+ long[] ords;
+ private int nextWrite;
+ final int maxSize;
+
+ public GrowingHeapSliceWriter(int maxSize) {
+ values = new long[16];
+ docIDs = new int[16];
+ ords = new long[16];
+ this.maxSize = maxSize;
+ }
+
+ private int[] growExact(int[] arr, int size) {
+ assert size > arr.length;
+ int[] newArr = new int[size];
+ System.arraycopy(arr, 0, newArr, 0, arr.length);
+ return newArr;
+ }
+
+ private long[] growExact(long[] arr, int size) {
+ assert size > arr.length;
+ long[] newArr = new long[size];
+ System.arraycopy(arr, 0, newArr, 0, arr.length);
+ return newArr;
+ }
+
+ @Override
+ public void append(long value, long ord, int docID) {
+ assert ord == nextWrite;
+ if (values.length == nextWrite) {
+ int nextSize = Math.min(maxSize, ArrayUtil.oversize(nextWrite+1, RamUsageEstimator.NUM_BYTES_INT));
+ assert nextSize > nextWrite: "nextSize=" + nextSize + " vs nextWrite=" + nextWrite;
+ values = growExact(values, nextSize);
+ ords = growExact(ords, nextSize);
+ docIDs = growExact(docIDs, nextSize);
+ }
+ values[nextWrite] = value;
+ ords[nextWrite] = ord;
+ docIDs[nextWrite] = docID;
+ nextWrite++;
+ }
+
+ @Override
+ public SliceReader getReader(long start) {
+ return new HeapSliceReader(values, ords, docIDs, (int) start, nextWrite);
+ }
+
+ @Override
+ public void close() {
+ }
+
+ @Override
+ public void destroy() {
+ }
+
+ @Override
+ public String toString() {
+ return "GrowingHeapSliceWriter(count=" + nextWrite + " alloc=" + values.length + ")";
+ }
+}
Added: lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/HeapSliceReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/HeapSliceReader.java?rev=1693682&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/HeapSliceReader.java (added)
+++ lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/HeapSliceReader.java Sat Aug 1 07:48:53 2015
@@ -0,0 +1,60 @@
+package org.apache.lucene.rangetree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+final class HeapSliceReader implements SliceReader {
+ private int curRead;
+ final long[] values;
+ final long[] ords;
+ final int[] docIDs;
+ final int end;
+
+ HeapSliceReader(long[] values, long[] ords, int[] docIDs, int start, int end) {
+ this.values = values;
+ this.ords = ords;
+ this.docIDs = docIDs;
+ curRead = start-1;
+ this.end = end;
+ }
+
+ @Override
+ public boolean next() {
+ curRead++;
+ return curRead < end;
+ }
+
+ @Override
+ public long value() {
+ return values[curRead];
+ }
+
+ @Override
+ public int docID() {
+ return docIDs[curRead];
+ }
+
+ @Override
+ public long ord() {
+ return ords[curRead];
+ }
+
+ @Override
+ public void close() {
+ }
+}
+
Added: lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/HeapSliceWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/HeapSliceWriter.java?rev=1693682&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/HeapSliceWriter.java (added)
+++ lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/HeapSliceWriter.java Sat Aug 1 07:48:53 2015
@@ -0,0 +1,60 @@
+package org.apache.lucene.rangetree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+final class HeapSliceWriter implements SliceWriter {
+ final long[] values;
+ final int[] docIDs;
+ final long[] ords;
+ private int nextWrite;
+
+ public HeapSliceWriter(int count) {
+ values = new long[count];
+ docIDs = new int[count];
+ ords = new long[count];
+ }
+
+ @Override
+ public void append(long value, long ord, int docID) {
+ values[nextWrite] = value;
+ ords[nextWrite] = ord;
+ docIDs[nextWrite] = docID;
+ nextWrite++;
+ }
+
+ @Override
+ public SliceReader getReader(long start) {
+ return new HeapSliceReader(values, ords, docIDs, (int) start, values.length);
+ }
+
+ @Override
+ public void close() {
+ if (nextWrite != values.length) {
+ throw new IllegalStateException("only wrote " + nextWrite + " values, but expected " + values.length);
+ }
+ }
+
+ @Override
+ public void destroy() {
+ }
+
+ @Override
+ public String toString() {
+ return "HeapSliceWriter(count=" + values.length + ")";
+ }
+}
Added: lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/NumericRangeTreeQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/NumericRangeTreeQuery.java?rev=1693682&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/NumericRangeTreeQuery.java (added)
+++ lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/NumericRangeTreeQuery.java Sat Aug 1 07:48:53 2015
@@ -0,0 +1,159 @@
+package org.apache.lucene.rangetree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.document.SortedNumericDocValuesField;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.search.ConstantScoreScorer;
+import org.apache.lucene.search.ConstantScoreWeight;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.util.ToStringUtils;
+
+import java.io.IOException;
+
+/** Finds all previously indexed long values that fall within the specified range.
+ *
+ * <p>The field must be indexed with {@link RangeTreeDocValuesFormat}, and {@link SortedNumericDocValuesField} added per document.
+ *
+ * @lucene.experimental */
+
+public class NumericRangeTreeQuery extends Query {
+ final String field;
+ final Long minValue;
+ final Long maxValue;
+ final boolean minInclusive;
+ final boolean maxInclusive;
+
+ // TODO: sugar for all numeric conversions?
+
+ /** Matches all values in the specified long range. */
+ public NumericRangeTreeQuery(String field, Long minValue, boolean minInclusive, Long maxValue, boolean maxInclusive) {
+ this.field = field;
+ this.minInclusive = minInclusive;
+ this.minValue = minValue;
+ this.maxInclusive = maxInclusive;
+ this.maxValue = maxValue;
+ }
+
+ @Override
+ public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
+
+ // I don't use RandomAccessWeight here: it's no good to approximate with "match all docs"; this is an inverted structure and should be
+ // used in the first pass:
+
+ return new ConstantScoreWeight(this) {
+
+ @Override
+ public Scorer scorer(LeafReaderContext context) throws IOException {
+ LeafReader reader = context.reader();
+ SortedNumericDocValues sdv = reader.getSortedNumericDocValues(field);
+ if (sdv == null) {
+ // No docs in this segment had this field
+ return null;
+ }
+
+ if (sdv instanceof RangeTreeSortedNumericDocValues == false) {
+ throw new IllegalStateException("field \"" + field + "\" was not indexed with RangeTreeDocValuesFormat: got: " + sdv);
+ }
+ RangeTreeSortedNumericDocValues treeDV = (RangeTreeSortedNumericDocValues) sdv;
+ RangeTreeReader tree = treeDV.getRangeTreeReader();
+
+ // lower
+ long minBoundIncl = (minValue == null) ? Long.MIN_VALUE : minValue.longValue();
+
+ if (minInclusive == false && minValue != null) {
+ if (minBoundIncl == Long.MAX_VALUE) {
+ return null;
+ }
+ minBoundIncl++;
+ }
+
+ // upper
+ long maxBoundIncl = (maxValue == null) ? Long.MAX_VALUE : maxValue.longValue();
+ if (maxInclusive == false && maxValue != null) {
+ if (maxBoundIncl == Long.MIN_VALUE) {
+ return null;
+ }
+ maxBoundIncl--;
+ }
+
+ if (maxBoundIncl < minBoundIncl) {
+ return null;
+ }
+
+ DocIdSet result = tree.intersect(minBoundIncl, maxBoundIncl, treeDV.delegate, context.reader().maxDoc());
+
+ final DocIdSetIterator disi = result.iterator();
+
+ return new ConstantScoreScorer(this, score(), disi);
+ }
+ };
+ }
+
+ @Override
+ public int hashCode() {
+ int hash = super.hashCode();
+ if (minValue != null) hash += minValue.hashCode()^0x14fa55fb;
+ if (maxValue != null) hash += maxValue.hashCode()^0x733fa5fe;
+ return hash +
+ (Boolean.valueOf(minInclusive).hashCode()^0x14fa55fb)+
+ (Boolean.valueOf(maxInclusive).hashCode()^0x733fa5fe);
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (super.equals(other)) {
+ final NumericRangeTreeQuery q = (NumericRangeTreeQuery) other;
+ return (
+ (q.minValue == null ? minValue == null : q.minValue.equals(minValue)) &&
+ (q.maxValue == null ? maxValue == null : q.maxValue.equals(maxValue)) &&
+ minInclusive == q.minInclusive &&
+ maxInclusive == q.maxInclusive
+ );
+ }
+
+ return false;
+ }
+
+ @Override
+ public String toString(String field) {
+ final StringBuilder sb = new StringBuilder();
+ sb.append(getClass().getSimpleName());
+ sb.append(':');
+ if (this.field.equals(field) == false) {
+ sb.append("field=");
+ sb.append(this.field);
+ sb.append(':');
+ }
+
+ return sb.append(minInclusive ? '[' : '{')
+ .append((minValue == null) ? "*" : minValue.toString())
+ .append(" TO ")
+ .append((maxValue == null) ? "*" : maxValue.toString())
+ .append(maxInclusive ? ']' : '}')
+ .append(ToStringUtils.boost(getBoost()))
+ .toString();
+ }
+}
Added: lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/OfflineSliceReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/OfflineSliceReader.java?rev=1693682&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/OfflineSliceReader.java (added)
+++ lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/OfflineSliceReader.java Sat Aug 1 07:48:53 2015
@@ -0,0 +1,82 @@
+package org.apache.lucene.rangetree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.lucene.store.InputStreamDataInput;
+
+final class OfflineSliceReader implements SliceReader {
+ final InputStreamDataInput in;
+ long countLeft;
+ private long value;
+ private long ord;
+ private int docID;
+
+ OfflineSliceReader(Path tempFile, long start, long count) throws IOException {
+ InputStream fis = Files.newInputStream(tempFile);
+ long seekFP = start * RangeTreeWriter.BYTES_PER_DOC;
+ long skipped = 0;
+ while (skipped < seekFP) {
+ long inc = fis.skip(seekFP - skipped);
+ skipped += inc;
+ if (inc == 0) {
+ throw new RuntimeException("skip returned 0");
+ }
+ }
+ in = new InputStreamDataInput(new BufferedInputStream(fis));
+ this.countLeft = count;
+ }
+
+ @Override
+ public boolean next() throws IOException {
+ if (countLeft == 0) {
+ return false;
+ }
+ countLeft--;
+ value = in.readLong();
+ ord = in.readLong();
+ docID = in.readInt();
+ return true;
+ }
+
+ @Override
+ public long value() {
+ return value;
+ }
+
+ @Override
+ public long ord() {
+ return ord;
+ }
+
+ @Override
+ public int docID() {
+ return docID;
+ }
+
+ @Override
+ public void close() throws IOException {
+ in.close();
+ }
+}
+
Added: lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/OfflineSliceWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/OfflineSliceWriter.java?rev=1693682&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/OfflineSliceWriter.java (added)
+++ lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/OfflineSliceWriter.java Sat Aug 1 07:48:53 2015
@@ -0,0 +1,75 @@
+package org.apache.lucene.rangetree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedOutputStream;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.store.OutputStreamDataOutput;
+import org.apache.lucene.util.IOUtils;
+
+final class OfflineSliceWriter implements SliceWriter {
+
+ final Path tempFile;
+ final byte[] scratchBytes = new byte[RangeTreeWriter.BYTES_PER_DOC];
+ final ByteArrayDataOutput scratchBytesOutput = new ByteArrayDataOutput(scratchBytes);
+ final OutputStreamDataOutput out;
+ final long count;
+ private long countWritten;
+
+ public OfflineSliceWriter(Path tempDir, long count) throws IOException {
+ tempFile = Files.createTempFile(tempDir, "size" + count + ".", "");
+ out = new OutputStreamDataOutput(new BufferedOutputStream(Files.newOutputStream(tempFile)));
+ this.count = count;
+ }
+
+ @Override
+ public void append(long value, long ord, int docID) throws IOException {
+ out.writeLong(value);
+ out.writeLong(ord);
+ out.writeInt(docID);
+ countWritten++;
+ }
+
+ @Override
+ public SliceReader getReader(long start) throws IOException {
+ return new OfflineSliceReader(tempFile, start, count-start);
+ }
+
+ @Override
+ public void close() throws IOException {
+ out.close();
+ if (count != countWritten) {
+ throw new IllegalStateException("wrote " + countWritten + " values, but expected " + count);
+ }
+ }
+
+ @Override
+ public void destroy() throws IOException {
+ IOUtils.rm(tempFile);
+ }
+
+ @Override
+ public String toString() {
+ return "OfflineSliceWriter(count=" + count + " tempFile=" + tempFile + ")";
+ }
+}
+
Added: lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeDocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeDocValuesConsumer.java?rev=1693682&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeDocValuesConsumer.java (added)
+++ lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeDocValuesConsumer.java Sat Aug 1 07:48:53 2015
@@ -0,0 +1,148 @@
+package org.apache.lucene.rangetree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+
+class RangeTreeDocValuesConsumer extends DocValuesConsumer implements Closeable {
+ final DocValuesConsumer delegate;
+ final int maxPointsInLeafNode;
+ final int maxPointsSortInHeap;
+ final IndexOutput out;
+ final Map<Integer,Long> fieldIndexFPs = new HashMap<>();
+ final SegmentWriteState state;
+
+ public RangeTreeDocValuesConsumer(DocValuesConsumer delegate, SegmentWriteState state, int maxPointsInLeafNode, int maxPointsSortInHeap) throws IOException {
+ RangeTreeWriter.verifyParams(maxPointsInLeafNode, maxPointsSortInHeap);
+ this.delegate = delegate;
+ this.maxPointsInLeafNode = maxPointsInLeafNode;
+ this.maxPointsSortInHeap = maxPointsSortInHeap;
+ this.state = state;
+ String datFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, RangeTreeDocValuesFormat.DATA_EXTENSION);
+ out = state.directory.createOutput(datFileName, state.context);
+ CodecUtil.writeIndexHeader(out, RangeTreeDocValuesFormat.DATA_CODEC_NAME, RangeTreeDocValuesFormat.DATA_VERSION_CURRENT,
+ state.segmentInfo.getId(), state.segmentSuffix);
+ }
+
+ @Override
+ public void close() throws IOException {
+ boolean success = false;
+ try {
+ CodecUtil.writeFooter(out);
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(delegate, out);
+ } else {
+ IOUtils.closeWhileHandlingException(delegate, out);
+ }
+ }
+
+ String metaFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, RangeTreeDocValuesFormat.META_EXTENSION);
+ IndexOutput metaOut = state.directory.createOutput(metaFileName, state.context);
+ success = false;
+ try {
+ CodecUtil.writeIndexHeader(metaOut, RangeTreeDocValuesFormat.META_CODEC_NAME, RangeTreeDocValuesFormat.META_VERSION_CURRENT,
+ state.segmentInfo.getId(), state.segmentSuffix);
+ metaOut.writeVInt(fieldIndexFPs.size());
+ for(Map.Entry<Integer,Long> ent : fieldIndexFPs.entrySet()) {
+ metaOut.writeVInt(ent.getKey());
+ metaOut.writeVLong(ent.getValue());
+ }
+ CodecUtil.writeFooter(metaOut);
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(metaOut);
+ } else {
+ IOUtils.closeWhileHandlingException(metaOut);
+ }
+ }
+ }
+
+ @Override
+ public void addSortedNumericField(FieldInfo field, Iterable<Number> docToValueCount, Iterable<Number> values) throws IOException {
+ delegate.addSortedNumericField(field, docToValueCount, values);
+ RangeTreeWriter writer = new RangeTreeWriter(maxPointsInLeafNode, maxPointsSortInHeap);
+ Iterator<Number> valueIt = values.iterator();
+ Iterator<Number> valueCountIt = docToValueCount.iterator();
+ //System.out.println("\nSNF: field=" + field.name);
+ for (int docID=0;docID<state.segmentInfo.maxDoc();docID++) {
+ assert valueCountIt.hasNext();
+ int count = valueCountIt.next().intValue();
+ for(int i=0;i<count;i++) {
+ assert valueIt.hasNext();
+ writer.add(valueIt.next().longValue(), docID);
+ }
+ }
+
+ long indexStartFP = writer.finish(out);
+
+ fieldIndexFPs.put(field.number, indexStartFP);
+ }
+
+ @Override
+ public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
+ throw new UnsupportedOperationException("use either SortedNumericDocValuesField or SortedSetDocValuesField");
+ }
+
+ @Override
+ public void addBinaryField(FieldInfo field, Iterable<BytesRef> values) {
+ throw new UnsupportedOperationException("use either SortedNumericDocValuesField or SortedSetDocValuesField");
+ }
+
+ @Override
+ public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) {
+ throw new UnsupportedOperationException("use either SortedNumericDocValuesField or SortedSetDocValuesField");
+ }
+
+ @Override
+ public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrdCount, Iterable<Number> ords) throws IOException {
+ delegate.addSortedSetField(field, values, docToOrdCount, ords);
+ RangeTreeWriter writer = new RangeTreeWriter(maxPointsInLeafNode, maxPointsSortInHeap);
+ Iterator<Number> docToOrdCountIt = docToOrdCount.iterator();
+ Iterator<Number> ordsIt = ords.iterator();
+ //System.out.println("\nSSF: field=" + field.name);
+ for (int docID=0;docID<state.segmentInfo.maxDoc();docID++) {
+ assert docToOrdCountIt.hasNext();
+ int count = docToOrdCountIt.next().intValue();
+ for(int i=0;i<count;i++) {
+ assert ordsIt.hasNext();
+ long ord = ordsIt.next().longValue();
+ writer.add(ord, docID);
+ }
+ }
+
+ long indexStartFP = writer.finish(out);
+
+ fieldIndexFPs.put(field.number, indexStartFP);
+ }
+}
Added: lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeDocValuesFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeDocValuesFormat.java?rev=1693682&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeDocValuesFormat.java (added)
+++ lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeDocValuesFormat.java Sat Aug 1 07:48:53 2015
@@ -0,0 +1,112 @@
+package org.apache.lucene.rangetree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat;
+import org.apache.lucene.document.SortedNumericDocValuesField;
+import org.apache.lucene.document.SortedSetDocValuesField; // javadocs
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+
+import java.io.IOException;
+
+/**
+ * A {@link DocValuesFormat} to efficiently index numeric values from
+ * from {@link SortedNumericDocValuesField} or BytesRef values from {@link SortedSetDocValuesField}
+ * for numeric range queries using ({@link NumericRangeTreeQuery}) and arbitrary binary
+ * range queries using {@link SortedSetRangeTreeQuery}.
+ *
+ * <p>This wraps {@link Lucene50DocValuesFormat}, but saves its own numeric tree
+ * structures to disk for fast query-time intersection. See <a
+ * href="https://www.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf">this paper</a>
+ * for details.
+ *
+ * <p>The numeric tree slices up 1D space into smaller and
+ * smaller ranges, until the smallest ranges have approximately
+ * between X/2 and X (X default is 1024) values in them, at which point
+ * such leaf cells are written as a block to disk, while the index tree
+ * structure records how space was sub-divided is loaded into HEAP
+ * at search time. At search time, the tree is recursed based on whether
+ * each of left or right child overlap with the query range, and once
+ * a leaf block is reached, all documents in that leaf block are collected
+ * if the cell is fully enclosed by the query shape, or filtered and then
+ * collected, if not.
+ *
+ * <p>The index is also quite compact, because docs only appear once in
+ * the tree (no "prefix terms").
+ *
+ * <p>In addition to the files written by {@link Lucene50DocValuesFormat}, this format writes:
+ * <ol>
+ * <li><tt>.ndd</tt>: numeric tree leaf data and index</li>
+ * <li><tt>.ndm</tt>: numeric tree metadata</li>
+ * </ol>
+ *
+ * <p>The disk format is experimental and free to change suddenly, and this code likely has new and exciting bugs!
+ *
+ * @lucene.experimental */
+
+public class RangeTreeDocValuesFormat extends DocValuesFormat {
+
+ static final String DATA_CODEC_NAME = "NumericTreeData";
+ static final int DATA_VERSION_START = 0;
+ static final int DATA_VERSION_CURRENT = DATA_VERSION_START;
+ static final String DATA_EXTENSION = "ndd";
+
+ static final String META_CODEC_NAME = "NumericTreeMeta";
+ static final int META_VERSION_START = 0;
+ static final int META_VERSION_CURRENT = META_VERSION_START;
+ static final String META_EXTENSION = "ndm";
+
+ private final int maxPointsInLeafNode;
+ private final int maxPointsSortInHeap;
+
+ private final DocValuesFormat delegate = new Lucene50DocValuesFormat();
+
+ /** Default constructor */
+ public RangeTreeDocValuesFormat() {
+ this(RangeTreeWriter.DEFAULT_MAX_VALUES_IN_LEAF_NODE, RangeTreeWriter.DEFAULT_MAX_VALUES_SORT_IN_HEAP);
+ }
+
+ /** Creates this with custom configuration.
+ *
+ * @param maxPointsInLeafNode Maximum number of points in each leaf cell. Smaller values create a deeper tree with larger in-heap index and possibly
+ * faster searching. The default is 1024.
+ * @param maxPointsSortInHeap Maximum number of points where in-heap sort can be used. When the number of points exceeds this, a (slower)
+ * offline sort is used. The default is 128 * 1024.
+ *
+ * @lucene.experimental */
+ public RangeTreeDocValuesFormat(int maxPointsInLeafNode, int maxPointsSortInHeap) {
+ super("NumericTree");
+ RangeTreeWriter.verifyParams(maxPointsInLeafNode, maxPointsSortInHeap);
+ this.maxPointsInLeafNode = maxPointsInLeafNode;
+ this.maxPointsSortInHeap = maxPointsSortInHeap;
+ }
+
+ @Override
+ public DocValuesConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {
+ return new RangeTreeDocValuesConsumer(delegate.fieldsConsumer(state), state, maxPointsInLeafNode, maxPointsSortInHeap);
+ }
+
+ @Override
+ public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
+ return new RangeTreeDocValuesProducer(delegate.fieldsProducer(state), state);
+ }
+}
Added: lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeDocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeDocValuesProducer.java?rev=1693682&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeDocValuesProducer.java (added)
+++ lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeDocValuesProducer.java Sat Aug 1 07:48:53 2015
@@ -0,0 +1,190 @@
+package org.apache.lucene.rangetree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.Accountables;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.RamUsageEstimator;
+
+class RangeTreeDocValuesProducer extends DocValuesProducer {
+
+ private final Map<String,RangeTreeReader> treeReaders = new HashMap<>();
+ private final Map<Integer,Long> fieldToIndexFPs = new HashMap<>();
+
+ private final IndexInput datIn;
+ private final AtomicLong ramBytesUsed;
+ private final int maxDoc;
+ private final DocValuesProducer delegate;
+ private final boolean merging;
+
+ public RangeTreeDocValuesProducer(DocValuesProducer delegate, SegmentReadState state) throws IOException {
+ String metaFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, RangeTreeDocValuesFormat.META_EXTENSION);
+ ChecksumIndexInput metaIn = state.directory.openChecksumInput(metaFileName, state.context);
+ CodecUtil.checkIndexHeader(metaIn, RangeTreeDocValuesFormat.META_CODEC_NAME, RangeTreeDocValuesFormat.META_VERSION_START, RangeTreeDocValuesFormat.META_VERSION_CURRENT,
+ state.segmentInfo.getId(), state.segmentSuffix);
+ int fieldCount = metaIn.readVInt();
+ for(int i=0;i<fieldCount;i++) {
+ int fieldNumber = metaIn.readVInt();
+ long indexFP = metaIn.readVLong();
+ fieldToIndexFPs.put(fieldNumber, indexFP);
+ }
+ CodecUtil.checkFooter(metaIn);
+ metaIn.close();
+
+ String datFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, RangeTreeDocValuesFormat.DATA_EXTENSION);
+ datIn = state.directory.openInput(datFileName, state.context);
+ CodecUtil.checkIndexHeader(datIn, RangeTreeDocValuesFormat.DATA_CODEC_NAME, RangeTreeDocValuesFormat.DATA_VERSION_START, RangeTreeDocValuesFormat.DATA_VERSION_CURRENT,
+ state.segmentInfo.getId(), state.segmentSuffix);
+ ramBytesUsed = new AtomicLong(RamUsageEstimator.shallowSizeOfInstance(getClass()));
+ maxDoc = state.segmentInfo.maxDoc();
+ this.delegate = delegate;
+ merging = false;
+ }
+
+ // clone for merge: we don't hang onto the NumericTrees we load
+ RangeTreeDocValuesProducer(RangeTreeDocValuesProducer orig) throws IOException {
+ assert Thread.holdsLock(orig);
+ datIn = orig.datIn.clone();
+ ramBytesUsed = new AtomicLong(orig.ramBytesUsed.get());
+ delegate = orig.delegate.getMergeInstance();
+ fieldToIndexFPs.putAll(orig.fieldToIndexFPs);
+ treeReaders.putAll(orig.treeReaders);
+ merging = true;
+ maxDoc = orig.maxDoc;
+ }
+
+ @Override
+ public synchronized SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
+ RangeTreeReader treeReader = treeReaders.get(field.name);
+ if (treeReader == null) {
+ // Lazy load
+ Long fp = fieldToIndexFPs.get(field.number);
+ // FieldInfos checks has already ensured we are a DV field of this type, and Codec ensures
+ // this DVFormat was used at write time:
+ assert fp != null;
+ datIn.seek(fp);
+ treeReader = new RangeTreeReader(datIn);
+
+ // Only hang onto the reader when we are not merging:
+ if (merging == false) {
+ treeReaders.put(field.name, treeReader);
+ ramBytesUsed.addAndGet(treeReader.ramBytesUsed());
+ }
+ }
+
+ return new RangeTreeSortedNumericDocValues(treeReader, delegate.getSortedNumeric(field));
+ }
+
+ @Override
+ public void close() throws IOException {
+ IOUtils.close(datIn, delegate);
+ }
+
+ @Override
+ public void checkIntegrity() throws IOException {
+ CodecUtil.checksumEntireFile(datIn);
+ }
+
+ @Override
+ public NumericDocValues getNumeric(FieldInfo field) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public BinaryDocValues getBinary(FieldInfo field) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public SortedDocValues getSorted(FieldInfo field) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public synchronized SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
+ RangeTreeReader treeReader = treeReaders.get(field.name);
+ if (treeReader == null) {
+ // Lazy load
+ Long fp = fieldToIndexFPs.get(field.number);
+
+ // FieldInfos checks has already ensured we are a DV field of this type, and Codec ensures
+ // this DVFormat was used at write time:
+ assert fp != null;
+
+ datIn.seek(fp);
+ //System.out.println("load field=" + field.name);
+ treeReader = new RangeTreeReader(datIn);
+
+ // Only hang onto the reader when we are not merging:
+ if (merging == false) {
+ treeReaders.put(field.name, treeReader);
+ ramBytesUsed.addAndGet(treeReader.ramBytesUsed());
+ }
+ }
+
+ return new RangeTreeSortedSetDocValues(treeReader, delegate.getSortedSet(field));
+ }
+
+ @Override
+ public Bits getDocsWithField(FieldInfo field) throws IOException {
+ return delegate.getDocsWithField(field);
+ }
+
+ @Override
+ public synchronized Collection<Accountable> getChildResources() {
+ List<Accountable> resources = new ArrayList<>();
+ for(Map.Entry<String,RangeTreeReader> ent : treeReaders.entrySet()) {
+ resources.add(Accountables.namedAccountable("field " + ent.getKey(), ent.getValue()));
+ }
+ resources.add(Accountables.namedAccountable("delegate", delegate));
+
+ return resources;
+ }
+
+ @Override
+ public synchronized DocValuesProducer getMergeInstance() throws IOException {
+ return new RangeTreeDocValuesProducer(this);
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return ramBytesUsed.get() + delegate.ramBytesUsed();
+ }
+}
Added: lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeReader.java?rev=1693682&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeReader.java (added)
+++ lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeReader.java Sat Aug 1 07:48:53 2015
@@ -0,0 +1,202 @@
+package org.apache.lucene.rangetree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.DocIdSetBuilder;
+import org.apache.lucene.util.RamUsageEstimator;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/** Handles intersection of a range with a numeric tree previously written with {@link RangeTreeWriter}.
+ *
+ * @lucene.experimental */
+
+final class RangeTreeReader implements Accountable {
+ final private long[] blockFPs;
+ final private long[] blockMinValues;
+ final IndexInput in;
+ final long globalMaxValue;
+ final int approxDocsPerBlock;
+
+ public RangeTreeReader(IndexInput in) throws IOException {
+
+ // Read index:
+ int numLeaves = in.readVInt();
+ approxDocsPerBlock = in.readVInt();
+
+ blockMinValues = new long[numLeaves];
+ for(int i=0;i<numLeaves;i++) {
+ blockMinValues[i] = in.readLong();
+ }
+ blockFPs = new long[numLeaves];
+ for(int i=0;i<numLeaves;i++) {
+ blockFPs[i] = in.readVLong();
+ }
+ globalMaxValue = in.readLong();
+
+ this.in = in;
+ }
+
+ public long getMinValue() {
+ return blockMinValues[0];
+ }
+
+ public long getMaxValue() {
+ return globalMaxValue;
+ }
+
+ private static final class QueryState {
+ final IndexInput in;
+ final DocIdSetBuilder docs;
+ final long minValueIncl;
+ final long maxValueIncl;
+ final SortedNumericDocValues sndv;
+
+ public QueryState(IndexInput in, int maxDoc,
+ long minValueIncl, long maxValueIncl,
+ SortedNumericDocValues sndv) {
+ this.in = in;
+ this.docs = new DocIdSetBuilder(maxDoc);
+ this.minValueIncl = minValueIncl;
+ this.maxValueIncl = maxValueIncl;
+ this.sndv = sndv;
+ }
+ }
+
+ public DocIdSet intersect(long minIncl, long maxIncl, SortedNumericDocValues sndv, int maxDoc) throws IOException {
+
+ if (minIncl > maxIncl) {
+ return DocIdSet.EMPTY;
+ }
+
+ if (minIncl > globalMaxValue || maxIncl < blockMinValues[0]) {
+ return DocIdSet.EMPTY;
+ }
+
+ QueryState state = new QueryState(in.clone(), maxDoc,
+ minIncl, maxIncl,
+ sndv);
+
+ int startBlockIncl = Arrays.binarySearch(blockMinValues, minIncl);
+ if (startBlockIncl >= 0) {
+ // There can be dups here, when the same value is added many
+ // times. Also, we need the first block whose min is < minIncl:
+ while (startBlockIncl > 0 && blockMinValues[startBlockIncl] == minIncl) {
+ startBlockIncl--;
+ }
+ } else {
+ startBlockIncl = Math.max(-startBlockIncl-2, 0);
+ }
+
+ int endBlockIncl = Arrays.binarySearch(blockMinValues, maxIncl);
+ if (endBlockIncl >= 0) {
+ // There can be dups here, when the same value is added many
+ // times. Also, we need the first block whose max is > minIncl:
+ while (endBlockIncl < blockMinValues.length-1 && blockMinValues[endBlockIncl] == maxIncl) {
+ endBlockIncl++;
+ }
+ } else {
+ endBlockIncl = Math.max(-endBlockIncl-2, 0);
+ }
+
+ assert startBlockIncl <= endBlockIncl;
+
+ state.in.seek(blockFPs[startBlockIncl]);
+
+ //System.out.println("startBlockIncl=" + startBlockIncl + " endBlockIncl=" + endBlockIncl);
+
+ // Rough estimate of how many hits we'll see. Note that in the degenerate case
+ // (index same value many times) this could be a big over-estimate, but in the typical
+ // case it's good:
+ state.docs.grow(approxDocsPerBlock * (endBlockIncl - startBlockIncl + 1));
+
+ int hitCount = 0;
+ for (int block=startBlockIncl;block<=endBlockIncl;block++) {
+ boolean doFilter = blockMinValues[block] <= minIncl || block == blockMinValues.length-1 || blockMinValues[block+1] >= maxIncl;
+ //System.out.println(" block=" + block + " min=" + blockMinValues[block] + " doFilter=" + doFilter);
+
+ int newCount;
+ if (doFilter) {
+ // We must filter each hit:
+ newCount = addSome(state);
+ } else {
+ newCount = addAll(state);
+ }
+
+ hitCount += newCount;
+ }
+
+ // NOTE: hitCount is an over-estimate in the multi-valued case:
+ return state.docs.build(hitCount);
+ }
+
+ /** Adds all docs from the current block. */
+ private int addAll(QueryState state) throws IOException {
+ // How many values are stored in this leaf cell:
+ int count = state.in.readVInt();
+ state.docs.grow(count);
+ for(int i=0;i<count;i++) {
+ int docID = state.in.readInt();
+ state.docs.add(docID);
+ }
+
+ return count;
+ }
+
+ /** Adds docs from the current block, filtering each hit against the query min/max. This
+ * is only needed on the boundary blocks. */
+ private int addSome(QueryState state) throws IOException {
+ int hitCount = 0;
+
+ // How many points are stored in this leaf cell:
+ int count = state.in.readVInt();
+ state.docs.grow(count);
+ for(int i=0;i<count;i++) {
+ int docID = state.in.readInt();
+ state.sndv.setDocument(docID);
+
+ // How many values this doc has:
+ int docValueCount = state.sndv.count();
+
+ for(int j=0;j<docValueCount;j++) {
+ long value = state.sndv.valueAt(j);
+
+ if (value >= state.minValueIncl && value <= state.maxValueIncl) {
+ state.docs.add(docID);
+ hitCount++;
+
+ // Stop processing values for this doc:
+ break;
+ }
+ }
+ }
+
+ return hitCount;
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return blockMinValues.length * RamUsageEstimator.NUM_BYTES_LONG +
+ blockFPs.length * RamUsageEstimator.NUM_BYTES_LONG;
+ }
+}
Added: lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeSortedNumericDocValues.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeSortedNumericDocValues.java?rev=1693682&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeSortedNumericDocValues.java (added)
+++ lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeSortedNumericDocValues.java Sat Aug 1 07:48:53 2015
@@ -0,0 +1,49 @@
+package org.apache.lucene.rangetree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.SortedNumericDocValues;
+
+class RangeTreeSortedNumericDocValues extends SortedNumericDocValues {
+ final RangeTreeReader rangeTreeReader;
+ final SortedNumericDocValues delegate;
+
+ public RangeTreeSortedNumericDocValues(RangeTreeReader rangeTreeReader, SortedNumericDocValues delegate) {
+ this.rangeTreeReader = rangeTreeReader;
+ this.delegate = delegate;
+ }
+
+ public RangeTreeReader getRangeTreeReader() {
+ return rangeTreeReader;
+ }
+
+ @Override
+ public void setDocument(int doc) {
+ delegate.setDocument(doc);
+ }
+
+ @Override
+ public long valueAt(int index) {
+ return delegate.valueAt(index);
+ }
+
+ @Override
+ public int count() {
+ return delegate.count();
+ }
+}
Added: lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeSortedSetDocValues.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeSortedSetDocValues.java?rev=1693682&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeSortedSetDocValues.java (added)
+++ lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeSortedSetDocValues.java Sat Aug 1 07:48:53 2015
@@ -0,0 +1,66 @@
+package org.apache.lucene.rangetree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.BytesRef;
+
+class RangeTreeSortedSetDocValues extends SortedSetDocValues {
+ final RangeTreeReader rangeTreeReader;
+ final SortedSetDocValues delegate;
+
+ public RangeTreeSortedSetDocValues(RangeTreeReader rangeTreeReader, SortedSetDocValues delegate) {
+ this.rangeTreeReader = rangeTreeReader;
+ this.delegate = delegate;
+ }
+
+ public RangeTreeReader getRangeTreeReader() {
+ return rangeTreeReader;
+ }
+
+ @Override
+ public long nextOrd() {
+ return delegate.nextOrd();
+ }
+
+ @Override
+ public void setDocument(int doc) {
+ delegate.setDocument(doc);
+ }
+
+ @Override
+ public BytesRef lookupOrd(long ord) {
+ return delegate.lookupOrd(ord);
+ }
+
+ @Override
+ public long getValueCount() {
+ return delegate.getValueCount();
+ }
+
+ @Override
+ public long lookupTerm(BytesRef key) {
+ return delegate.lookupTerm(key);
+ }
+
+ @Override
+ public TermsEnum termsEnum() {
+ return delegate.termsEnum();
+ }
+}
Added: lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeWriter.java?rev=1693682&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeWriter.java (added)
+++ lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/RangeTreeWriter.java Sat Aug 1 07:48:53 2015
@@ -0,0 +1,591 @@
+package org.apache.lucene.rangetree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.Comparator;
+
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.InPlaceMergeSorter;
+import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
+import org.apache.lucene.util.OfflineSorter;
+import org.apache.lucene.util.RamUsageEstimator;
+
+// TODO
+// - could we just "use postings" to map leaf -> docIDs?
+// - we could also index "auto-prefix terms" here, and use better compression
+// - the index could be efficiently encoded as an FST, so we don't have wasteful
+// (monotonic) long[] leafBlockFPs; or we could use MonotonicLongValues ... but then
+// the index is already plenty small: 60M OSM points --> 1.1 MB with 128 points
+// per leaf, and you can reduce that by putting more points per leaf
+// - we can quantize the split values to 2 bytes (short): http://people.csail.mit.edu/tmertens/papers/qkdtree.pdf
+
+/** Recursively builds a 1d BKD tree to assign all incoming {@code long} values to smaller
+ * and smaller ranges until the number of points in a given
+ * range is <= the <code>maxPointsInLeafNode</code>. The tree is
+ * fully balanced, which means the leaf nodes will have between 50% and 100% of
+ * the requested <code>maxPointsInLeafNode</code>, except for the adversarial case
+ * of indexing exactly the same value many times.
+ *
+ * <p>
+ * See <a href="https://www.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf">this paper</a> for details.
+ *
+ * <p>This consumes heap during writing: for any nodes with fewer than <code>maxPointsSortInHeap</code>, it holds
+ * the points in memory as simple java arrays.
+ *
+ * <p>
+ * <b>NOTE</b>: This can write at most Integer.MAX_VALUE * <code>maxPointsInLeafNode</code> total values,
+ * which should be plenty since a Lucene index can have at most Integer.MAX_VALUE-1 documents.
+ *
+ * @lucene.experimental */
+
+class RangeTreeWriter {
+
+ // value (long) + ord (long) + docID (int)
+ static final int BYTES_PER_DOC = 2 * RamUsageEstimator.NUM_BYTES_LONG + RamUsageEstimator.NUM_BYTES_INT;
+
+ public static final int DEFAULT_MAX_VALUES_IN_LEAF_NODE = 1024;
+
+ /** This works out to max of ~10 MB peak heap tied up during writing: */
+ public static final int DEFAULT_MAX_VALUES_SORT_IN_HEAP = 128*1024;;
+
+ private final byte[] scratchBytes = new byte[BYTES_PER_DOC];
+ private final ByteArrayDataOutput scratchBytesOutput = new ByteArrayDataOutput(scratchBytes);
+
+ private OfflineSorter.ByteSequencesWriter writer;
+ private GrowingHeapSliceWriter heapWriter;
+
+ private Path tempInput;
+ private Path tempDir;
+ private final int maxValuesInLeafNode;
+ private final int maxValuesSortInHeap;
+
+ private long valueCount;
+ private long globalMinValue = Long.MAX_VALUE;
+ private long globalMaxValue = Long.MIN_VALUE;
+
+ public RangeTreeWriter() throws IOException {
+ this(DEFAULT_MAX_VALUES_IN_LEAF_NODE, DEFAULT_MAX_VALUES_SORT_IN_HEAP);
+ }
+
+ // TODO: instead of maxValuesSortInHeap, change to maxMBHeap ... the mapping is non-obvious:
+ public RangeTreeWriter(int maxValuesInLeafNode, int maxValuesSortInHeap) throws IOException {
+ verifyParams(maxValuesInLeafNode, maxValuesSortInHeap);
+ this.maxValuesInLeafNode = maxValuesInLeafNode;
+ this.maxValuesSortInHeap = maxValuesSortInHeap;
+
+ // We write first maxValuesSortInHeap in heap, then cutover to offline for additional points:
+ heapWriter = new GrowingHeapSliceWriter(maxValuesSortInHeap);
+ }
+
+ public static void verifyParams(int maxValuesInLeafNode, int maxValuesSortInHeap) {
+ if (maxValuesInLeafNode <= 0) {
+ throw new IllegalArgumentException("maxValuesInLeafNode must be > 0; got " + maxValuesInLeafNode);
+ }
+ if (maxValuesInLeafNode > ArrayUtil.MAX_ARRAY_LENGTH) {
+ throw new IllegalArgumentException("maxValuesInLeafNode must be <= ArrayUtil.MAX_ARRAY_LENGTH (= " + ArrayUtil.MAX_ARRAY_LENGTH + "); got " + maxValuesInLeafNode);
+ }
+ if (maxValuesSortInHeap < maxValuesInLeafNode) {
+ throw new IllegalArgumentException("maxValuesSortInHeap must be >= maxValuesInLeafNode; got " + maxValuesSortInHeap + " vs maxValuesInLeafNode="+ maxValuesInLeafNode);
+ }
+ if (maxValuesSortInHeap > ArrayUtil.MAX_ARRAY_LENGTH) {
+ throw new IllegalArgumentException("maxValuesSortInHeap must be <= ArrayUtil.MAX_ARRAY_LENGTH (= " + ArrayUtil.MAX_ARRAY_LENGTH + "); got " + maxValuesSortInHeap);
+ }
+ }
+
+ /** If the current segment has too many points then we switchover to temp files / offline sort. */
+ private void switchToOffline() throws IOException {
+
+ // OfflineSorter isn't thread safe, but our own private tempDir works around this:
+ tempDir = Files.createTempDirectory(OfflineSorter.defaultTempDir(), RangeTreeWriter.class.getSimpleName());
+
+ // For each .add we just append to this input file, then in .finish we sort this input and resursively build the tree:
+ tempInput = tempDir.resolve("in");
+ writer = new OfflineSorter.ByteSequencesWriter(tempInput);
+ for(int i=0;i<valueCount;i++) {
+ scratchBytesOutput.reset(scratchBytes);
+ scratchBytesOutput.writeLong(heapWriter.values[i]);
+ scratchBytesOutput.writeVInt(heapWriter.docIDs[i]);
+ scratchBytesOutput.writeVLong(i);
+ // TODO: can/should OfflineSorter optimize the fixed-width case?
+ writer.write(scratchBytes, 0, scratchBytes.length);
+ }
+
+ heapWriter = null;
+ }
+
+ void add(long value, int docID) throws IOException {
+ if (valueCount >= maxValuesSortInHeap) {
+ if (writer == null) {
+ switchToOffline();
+ }
+ scratchBytesOutput.reset(scratchBytes);
+ scratchBytesOutput.writeLong(value);
+ scratchBytesOutput.writeVInt(docID);
+ scratchBytesOutput.writeVLong(valueCount);
+ writer.write(scratchBytes, 0, scratchBytes.length);
+ } else {
+ // Not too many points added yet, continue using heap:
+ heapWriter.append(value, valueCount, docID);
+ }
+
+ valueCount++;
+ globalMaxValue = Math.max(value, globalMaxValue);
+ globalMinValue = Math.min(value, globalMinValue);
+ }
+
+ /** Changes incoming {@link ByteSequencesWriter} file to to fixed-width-per-entry file, because we need to be able to slice
+ * as we recurse in {@link #build}. */
+ private SliceWriter convertToFixedWidth(Path in) throws IOException {
+ BytesRefBuilder scratch = new BytesRefBuilder();
+ scratch.grow(BYTES_PER_DOC);
+ BytesRef bytes = scratch.get();
+ ByteArrayDataInput dataReader = new ByteArrayDataInput();
+
+ OfflineSorter.ByteSequencesReader reader = null;
+ SliceWriter sortedWriter = null;
+ boolean success = false;
+ try {
+ reader = new OfflineSorter.ByteSequencesReader(in);
+ sortedWriter = getWriter(valueCount);
+ for (long i=0;i<valueCount;i++) {
+ boolean result = reader.read(scratch);
+ assert result;
+ dataReader.reset(bytes.bytes, bytes.offset, bytes.length);
+ long value = dataReader.readLong();
+ int docID = dataReader.readVInt();
+ assert docID >= 0: "docID=" + docID;
+ long ord = dataReader.readVLong();
+ sortedWriter.append(value, ord, docID);
+ }
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(sortedWriter, reader);
+ } else {
+ IOUtils.closeWhileHandlingException(reader);
+ try {
+ sortedWriter.destroy();
+ } catch (Throwable t) {
+ // Suppress to keep throwing original exc
+ }
+ }
+ }
+
+ return sortedWriter;
+ }
+
+ private SliceWriter sort() throws IOException {
+ if (heapWriter != null) {
+
+ assert valueCount < Integer.MAX_VALUE;
+
+ // All buffered points are still in heap
+ new InPlaceMergeSorter() {
+ @Override
+ protected void swap(int i, int j) {
+ int docID = heapWriter.docIDs[i];
+ heapWriter.docIDs[i] = heapWriter.docIDs[j];
+ heapWriter.docIDs[j] = docID;
+
+ long ord = heapWriter.ords[i];
+ heapWriter.ords[i] = heapWriter.ords[j];
+ heapWriter.ords[j] = ord;
+
+ long value = heapWriter.values[i];
+ heapWriter.values[i] = heapWriter.values[j];
+ heapWriter.values[j] = value;
+ }
+
+ @Override
+ protected int compare(int i, int j) {
+ int cmp = Long.compare(heapWriter.values[i], heapWriter.values[j]);
+ if (cmp != 0) {
+ return cmp;
+ }
+
+ // Tie-break
+ cmp = Integer.compare(heapWriter.docIDs[i], heapWriter.docIDs[j]);
+ if (cmp != 0) {
+ return cmp;
+ }
+
+ return Long.compare(heapWriter.ords[i], heapWriter.ords[j]);
+ }
+ }.sort(0, (int) valueCount);
+
+ HeapSliceWriter sorted = new HeapSliceWriter((int) valueCount);
+ for(int i=0;i<valueCount;i++) {
+ sorted.append(heapWriter.values[i],
+ heapWriter.ords[i],
+ heapWriter.docIDs[i]);
+ }
+
+ return sorted;
+ } else {
+
+ // Offline sort:
+ assert tempDir != null;
+
+ final ByteArrayDataInput reader = new ByteArrayDataInput();
+ Comparator<BytesRef> cmp = new Comparator<BytesRef>() {
+ private final ByteArrayDataInput readerB = new ByteArrayDataInput();
+
+ @Override
+ public int compare(BytesRef a, BytesRef b) {
+ reader.reset(a.bytes, a.offset, a.length);
+ final long valueA = reader.readLong();
+ final int docIDA = reader.readVInt();
+ final long ordA = reader.readVLong();
+
+ reader.reset(b.bytes, b.offset, b.length);
+ final long valueB = reader.readLong();
+ final int docIDB = reader.readVInt();
+ final long ordB = reader.readVLong();
+
+ int cmp = Long.compare(valueA, valueB);
+ if (cmp != 0) {
+ return cmp;
+ }
+
+ // Tie-break
+ cmp = Integer.compare(docIDA, docIDB);
+ if (cmp != 0) {
+ return cmp;
+ }
+
+ return Long.compare(ordA, ordB);
+ }
+ };
+
+ Path sorted = tempDir.resolve("sorted");
+ boolean success = false;
+ try {
+ OfflineSorter sorter = new OfflineSorter(cmp, OfflineSorter.BufferSize.automatic(), tempDir, OfflineSorter.MAX_TEMPFILES);
+ sorter.sort(tempInput, sorted);
+ SliceWriter writer = convertToFixedWidth(sorted);
+ success = true;
+ return writer;
+ } finally {
+ if (success) {
+ IOUtils.rm(sorted);
+ } else {
+ IOUtils.deleteFilesIgnoringExceptions(sorted);
+ }
+ }
+ }
+ }
+
+ /** Writes the 1d BKD tree to the provided {@link IndexOutput} and returns the file offset where index was written. */
+ public long finish(IndexOutput out) throws IOException {
+
+ if (writer != null) {
+ writer.close();
+ }
+
+ if (valueCount == 0) {
+ throw new IllegalStateException("at least one value must be indexed");
+ }
+
+ // TODO: we should use in-memory sort here, if number of points is small enough:
+
+ long countPerLeaf = valueCount;
+ long innerNodeCount = 1;
+
+ while (countPerLeaf > maxValuesInLeafNode) {
+ countPerLeaf /= 2;
+ innerNodeCount *= 2;
+ }
+
+ //System.out.println("innerNodeCount=" + innerNodeCount);
+
+ if (1+2*innerNodeCount >= Integer.MAX_VALUE) {
+ throw new IllegalStateException("too many nodes; increase maxValuesInLeafNode (currently " + maxValuesInLeafNode + ") and reindex");
+ }
+
+ innerNodeCount--;
+
+ int numLeaves = (int) (innerNodeCount+1);
+
+ // Indexed by nodeID, but first (root) nodeID is 1
+ long[] blockMinValues = new long[numLeaves];
+
+ // +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g. 7)
+ long[] leafBlockFPs = new long[numLeaves];
+
+ // Make sure the math above "worked":
+ assert valueCount / blockMinValues.length <= maxValuesInLeafNode: "valueCount=" + valueCount + " blockMinValues.length=" + blockMinValues.length + " maxValuesInLeafNode=" + maxValuesInLeafNode;
+ //System.out.println(" avg pointsPerLeaf=" + (valueCount/blockMinValues.length));
+
+ // Sort all docs by value:
+ SliceWriter sortedWriter = null;
+
+ boolean success = false;
+ try {
+ sortedWriter = sort();
+ heapWriter = null;
+
+ build(1, numLeaves,
+ new PathSlice(sortedWriter, 0, valueCount),
+ out,
+ globalMinValue, globalMaxValue,
+ blockMinValues,
+ leafBlockFPs);
+ success = true;
+ } finally {
+ if (success) {
+ sortedWriter.destroy();
+ IOUtils.rm(tempInput);
+ } else {
+ try {
+ sortedWriter.destroy();
+ } catch (Throwable t) {
+ // Suppress to keep throwing original exc
+ }
+ IOUtils.deleteFilesIgnoringExceptions(tempInput);
+ }
+ }
+
+ //System.out.println("Total nodes: " + innerNodeCount);
+
+ // Write index:
+ long indexFP = out.getFilePointer();
+ out.writeVInt(numLeaves);
+ out.writeVInt((int) (valueCount / numLeaves));
+
+ for (int i=0;i<blockMinValues.length;i++) {
+ out.writeLong(blockMinValues[i]);
+ }
+ for (int i=0;i<leafBlockFPs.length;i++) {
+ out.writeVLong(leafBlockFPs[i]);
+ }
+ out.writeLong(globalMaxValue);
+
+ if (tempDir != null) {
+ // If we had to go offline, we should have removed all temp files we wrote:
+ assert directoryIsEmpty(tempDir);
+ IOUtils.rm(tempDir);
+ }
+
+ return indexFP;
+ }
+
+ // Called only from assert
+ private boolean directoryIsEmpty(Path in) {
+ try (DirectoryStream<Path> dir = Files.newDirectoryStream(in)) {
+ for (Path path : dir) {
+ assert false: "dir=" + in + " still has file=" + path;
+ return false;
+ }
+ } catch (IOException ioe) {
+ // Just ignore: we are only called from assert
+ }
+ return true;
+ }
+
+ /** Sliced reference to points in an OfflineSorter.ByteSequencesWriter file. */
+ private static final class PathSlice {
+ final SliceWriter writer;
+ final long start;
+ final long count;
+
+ public PathSlice(SliceWriter writer, long start, long count) {
+ this.writer = writer;
+ this.start = start;
+ this.count = count;
+ }
+
+ @Override
+ public String toString() {
+ return "PathSlice(start=" + start + " count=" + count + " writer=" + writer + ")";
+ }
+ }
+
+ private long getSplitValue(PathSlice source, long leftCount, long minValue, long maxValue) throws IOException {
+
+ // Read the split value:
+ SliceReader reader = source.writer.getReader(source.start + leftCount);
+ boolean success = false;
+ long splitValue;
+ try {
+ boolean result = reader.next();
+ assert result;
+ splitValue = reader.value();
+ assert splitValue >= minValue && splitValue <= maxValue: "splitValue=" + splitValue + " minValue=" + minValue + " maxValue=" + maxValue + " reader=" + reader;
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(reader);
+ } else {
+ IOUtils.closeWhileHandlingException(reader);
+ }
+ }
+
+ return splitValue;
+ }
+
+ /** The incoming PathSlice for the dim we will split is already partitioned/sorted. */
+ private void build(int nodeID, int leafNodeOffset,
+ PathSlice source,
+ IndexOutput out,
+ long minValue, long maxValue,
+ long[] blockMinValues,
+ long[] leafBlockFPs) throws IOException {
+
+ long count = source.count;
+
+ if (source.writer instanceof OfflineSliceWriter && count <= maxValuesSortInHeap) {
+ // Cutover to heap:
+ SliceWriter writer = new HeapSliceWriter((int) count);
+ SliceReader reader = source.writer.getReader(source.start);
+ for(int i=0;i<count;i++) {
+ boolean hasNext = reader.next();
+ assert hasNext;
+ writer.append(reader.value(), reader.ord(), reader.docID());
+ }
+ source = new PathSlice(writer, 0, count);
+ }
+
+ // We should never hit dead-end nodes on recursion even in the adversarial cases:
+ assert count > 0;
+
+ if (nodeID >= leafNodeOffset) {
+ // Leaf node: write block
+ assert maxValue >= minValue;
+
+ //System.out.println("\nleaf:\n lat range: " + ((long) maxLatEnc-minLatEnc));
+ //System.out.println(" lon range: " + ((long) maxLonEnc-minLonEnc));
+
+ // Sort by docID in the leaf so we can .or(DISI) at search time:
+ SliceReader reader = source.writer.getReader(source.start);
+
+ int[] docIDs = new int[(int) count];
+
+ boolean success = false;
+ try {
+ for (int i=0;i<source.count;i++) {
+
+ // NOTE: we discard ord at this point; we only needed it temporarily
+ // during building to uniquely identify each point to properly handle
+ // the multi-valued case (one docID having multiple values):
+
+ // We also discard lat/lon, since at search time, we reside on the
+ // wrapped doc values for this:
+
+ boolean result = reader.next();
+ assert result;
+ docIDs[i] = reader.docID();
+ }
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(reader);
+ } else {
+ IOUtils.closeWhileHandlingException(reader);
+ }
+ }
+
+ // TODO: not clear we need to do this anymore (we used to make a DISI over
+ // the block at search time), but maybe it buys some memory
+ // locality/sequentiality at search time?
+ Arrays.sort(docIDs);
+
+ // Dedup docIDs: for the multi-valued case where more than one value for the doc
+ // wound up in this leaf cell, we only need to store the docID once:
+ int lastDocID = -1;
+ int uniqueCount = 0;
+ for(int i=0;i<docIDs.length;i++) {
+ int docID = docIDs[i];
+ if (docID != lastDocID) {
+ uniqueCount++;
+ lastDocID = docID;
+ }
+ }
+ assert uniqueCount <= count;
+
+ // TODO: in theory we could compute exactly what this fp will be, since we fixed-width (writeInt) encode docID, and up-front we know
+ // how many docIDs are in every leaf since we don't do anything special about multiple splitValue boundary case?
+ long startFP = out.getFilePointer();
+ out.writeVInt(uniqueCount);
+
+ // Save the block file pointer:
+ int blockID = nodeID - leafNodeOffset;
+ leafBlockFPs[blockID] = startFP;
+ //System.out.println(" leafFP=" + startFP);
+
+ blockMinValues[blockID] = minValue;
+
+ lastDocID = -1;
+ for (int i=0;i<docIDs.length;i++) {
+ // Absolute int encode; with "vInt of deltas" encoding, the .kdd size dropped from
+ // 697 MB -> 539 MB, but query time for 225 queries went from 1.65 sec -> 2.64 sec.
+ // I think if we also indexed prefix terms here we could do less costly compression
+ // on those lists:
+ int docID = docIDs[i];
+ if (docID != lastDocID) {
+ out.writeInt(docID);
+ lastDocID = docID;
+ }
+ }
+ //long endFP = out.getFilePointer();
+ //System.out.println(" bytes/doc: " + ((endFP - startFP) / count));
+ } else {
+ // Inner node: sort, partition/recurse
+
+ assert nodeID < blockMinValues.length: "nodeID=" + nodeID + " blockMinValues.length=" + blockMinValues.length;
+
+ assert source.count == count;
+
+ long leftCount = source.count / 2;
+
+ // NOTE: we don't tweak leftCount for the boundary cases, which means at search time if we are looking for exactly splitValue then we
+ // must search both left and right trees:
+ long splitValue = getSplitValue(source, leftCount, minValue, maxValue);
+
+ build(2*nodeID, leafNodeOffset,
+ new PathSlice(source.writer, source.start, leftCount),
+ out,
+ minValue, splitValue,
+ blockMinValues, leafBlockFPs);
+
+ build(2*nodeID+1, leafNodeOffset,
+ new PathSlice(source.writer, source.start+leftCount, count-leftCount),
+ out,
+ splitValue, maxValue,
+ blockMinValues, leafBlockFPs);
+ }
+ }
+
+ SliceWriter getWriter(long count) throws IOException {
+ if (count < maxValuesSortInHeap) {
+ return new HeapSliceWriter((int) count);
+ } else {
+ return new OfflineSliceWriter(tempDir, count);
+ }
+ }
+}
Added: lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/SliceReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/SliceReader.java?rev=1693682&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/SliceReader.java (added)
+++ lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/SliceReader.java Sat Aug 1 07:48:53 2015
@@ -0,0 +1,31 @@
+package org.apache.lucene.rangetree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Closeable;
+import java.io.IOException;
+
+/** Iterates over one slice of the sorted values. This abstracts away whether
+ * OfflineSorter or simple arrays in heap are used. */
+interface SliceReader extends Closeable {
+ boolean next() throws IOException;
+ long value();
+ long ord();
+ int docID();
+}
+
Added: lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/SliceWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/SliceWriter.java?rev=1693682&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/SliceWriter.java (added)
+++ lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/rangetree/SliceWriter.java Sat Aug 1 07:48:53 2015
@@ -0,0 +1,29 @@
+package org.apache.lucene.rangetree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Closeable;
+import java.io.IOException;
+
+/** Abstracts away whether OfflineSorter or simple arrays in heap are used. */
+interface SliceWriter extends Closeable {
+ void append(long value, long ord, int docID) throws IOException;
+ SliceReader getReader(long start) throws IOException;
+ void destroy() throws IOException;
+}
+