You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2016/12/19 12:52:03 UTC
[19/23] lucene-solr:feature/metrics: LUCENE-7590: add DocValuesStats
for SortedNumeric DV fields
LUCENE-7590: add DocValuesStats for SortedNumeric DV fields
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/944b8e07
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/944b8e07
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/944b8e07
Branch: refs/heads/feature/metrics
Commit: 944b8e07f557b9320895998fe33d71cae5199eee
Parents: dcf202a
Author: Shai Erera <sh...@apache.org>
Authored: Sat Dec 17 21:17:14 2016 +0200
Committer: Shai Erera <sh...@apache.org>
Committed: Sun Dec 18 08:39:42 2016 +0200
----------------------------------------------------------------------
.../apache/lucene/search/DocValuesStats.java | 128 +++++++++++++++-
.../search/TestDocValuesStatsCollector.java | 153 ++++++++++++++++++-
2 files changed, 271 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/944b8e07/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java
----------------------------------------------------------------------
diff --git a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java
index c8b7752..9dd97a6 100644
--- a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java
+++ b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java
@@ -20,6 +20,7 @@ import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.SortedNumericDocValues;
/** Holds statistics for a DocValues field. */
public abstract class DocValuesStats<T> {
@@ -95,7 +96,7 @@ public abstract class DocValuesStats<T> {
}
/** Holds statistics for a numeric DocValues field. */
- public static abstract class NumericDocValuesStats<T extends Number> extends DocValuesStats<T> {
+ static abstract class NumericDocValuesStats<T extends Number> extends DocValuesStats<T> {
protected double mean = 0.0;
protected double variance = 0.0;
@@ -113,7 +114,7 @@ public abstract class DocValuesStats<T> {
}
@Override
- protected boolean hasValue(int doc) throws IOException {
+ protected final boolean hasValue(int doc) throws IOException {
return ndv.advanceExact(doc);
}
@@ -199,4 +200,127 @@ public abstract class DocValuesStats<T> {
}
}
+ /**
+  * Base class for statistics gathered over a sorted-numeric DocValues field, i.e. a field for which
+  * a single document may carry several values.
+  */
+ static abstract class SortedNumericDocValuesStats<T extends Number> extends DocValuesStats<T> {
+
+   /** Values source of the current segment; assigned by {@link #init(LeafReaderContext)}. */
+   protected SortedNumericDocValues sndv;
+
+   /** Running mean, maintained by subclasses while they accumulate values. */
+   protected double mean = 0.0;
+
+   /** Running accumulator maintained by subclasses; {@link #variance()} divides it by {@code count()}. */
+   protected double variance = 0.0;
+
+   /** Number of individual values seen so far, maintained by subclasses. */
+   protected long valuesCount = 0;
+
+   protected SortedNumericDocValuesStats(String field, T initialMin, T initialMax) {
+     super(field, initialMin, initialMax);
+   }
+
+   /** Fetches the sorted-numeric values of the given segment; returns false if the segment has none. */
+   @Override
+   protected final boolean init(LeafReaderContext context) throws IOException {
+     final SortedNumericDocValues segmentValues = context.reader().getSortedNumericDocValues(field);
+     sndv = segmentValues;
+     return segmentValues != null;
+   }
+
+   /** Positions {@link #sndv} on {@code doc} and reports whether it has any value. */
+   @Override
+   protected final boolean hasValue(int doc) throws IOException {
+     return sndv.advanceExact(doc);
+   }
+
+   /** The mean of all values of the field. */
+   public final double mean() {
+     return mean;
+   }
+
+   /** Returns the variance of all values of the field. */
+   public final double variance() {
+     // NOTE(review): the divisor is count(), while the subclasses in this file add one term per
+     // individual value (valuesCount of them) - confirm that count() rather than valuesCount is intended.
+     final int divisor = count();
+     if (divisor > 0) {
+       return variance / divisor;
+     }
+     return 0;
+   }
+
+   /** Returns the stdev of all values of the field. */
+   public final double stdev() {
+     final double var = variance();
+     return Math.sqrt(var);
+   }
+
+   /** Returns the total number of values for this field. */
+   public final long valuesCount() {
+     return valuesCount;
+   }
+
+   /** Returns the sum of values of the field. Note that if the values are large, the {@code sum} might overflow. */
+   public abstract T sum();
+ }
+
+ /** Statistics over a sorted-numeric DocValues field whose values are plain {@code long}s. */
+ public static final class SortedLongDocValuesStats extends SortedNumericDocValuesStats<Long> {
+
+   // Kept as a primitive field so that summing does not box a 'Long' for every value.
+   private long sum = 0;
+
+   public SortedLongDocValuesStats(String field) {
+     super(field, Long.MAX_VALUE, Long.MIN_VALUE);
+   }
+
+   /** Folds every value of the current document into min, max, sum, mean and the variance accumulator. */
+   @Override
+   protected void doAccumulate(int count) throws IOException {
+     for (int remaining = sndv.docValueCount(); remaining > 0; remaining--) {
+       final long value = sndv.nextValue();
+       if (value < min) {
+         min = value;
+       }
+       if (value > max) {
+         max = value;
+       }
+       sum += value;
+       // For a correct running average, valuesCount is increased with each value rather than once
+       // before the loop starts.
+       ++valuesCount;
+       final double previousMean = mean;
+       mean += (value - previousMean) / valuesCount;
+       variance += (value - mean) * (value - previousMean);
+     }
+   }
+
+   @Override
+   public Long sum() {
+     return sum;
+   }
+ }
+
+ /**
+  * Holds DocValues statistics for a sorted-numeric field storing {@code double} values. Each stored
+  * {@code long} is decoded with {@link Double#longBitsToDouble(long)}.
+  */
+ public static final class SortedDoubleDocValuesStats extends SortedNumericDocValuesStats<Double> {
+
+   // To avoid boxing 'double' to 'Double' while the sum is computed, declare it as private variable.
+   private double sum = 0;
+
+   public SortedDoubleDocValuesStats(String field) {
+     // The initial maximum must be the most negative finite double. Double.MIN_VALUE is the smallest
+     // *positive* double, so starting from it would leave max wrong for fields that only hold zero
+     // or negative values.
+     super(field, Double.MAX_VALUE, -Double.MAX_VALUE);
+   }
+
+   /** Folds every value of the current document into min, max, sum, mean and the variance accumulator. */
+   @Override
+   protected void doAccumulate(int count) throws IOException {
+     int numValues = sndv.docValueCount();
+     while (numValues-- > 0) {
+       double val = Double.longBitsToDouble(sndv.nextValue());
+       if (Double.compare(val, max) > 0) {
+         max = val;
+       }
+       if (Double.compare(val, min) < 0) {
+         min = val;
+       }
+       sum += val;
+       double oldMean = mean;
+       // for correct "running average computation", increase valuesCount with each value, rather than once before the
+       // loop starts.
+       ++valuesCount;
+       mean += (val - mean) / valuesCount;
+       variance += (val - mean) * (val - oldMean);
+     }
+   }
+
+   @Override
+   public Double sum() {
+     return sum;
+   }
+ }
+
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/944b8e07/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java
----------------------------------------------------------------------
diff --git a/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java b/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java
index 8f8b09e..5fa4b04 100644
--- a/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java
+++ b/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java
@@ -20,19 +20,24 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.DoubleSummaryStatistics;
import java.util.LongSummaryStatistics;
+import java.util.function.Predicate;
import java.util.stream.DoubleStream;
import java.util.stream.LongStream;
+import java.util.stream.Stream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleDocValuesField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocValuesStats.DoubleDocValuesStats;
import org.apache.lucene.search.DocValuesStats.LongDocValuesStats;
+import org.apache.lucene.search.DocValuesStats.SortedDoubleDocValuesStats;
+import org.apache.lucene.search.DocValuesStats.SortedLongDocValuesStats;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
@@ -185,20 +190,136 @@ public class TestDocValuesStatsCollector extends LuceneTestCase {
}
}
- private static LongStream getPositiveValues(long[] docValues) {
- return Arrays.stream(docValues).filter(v -> v > 0);
+ /**
+  * Indexes documents that carry zero or several {@code long} values, deletes some of them in a
+  * fraction of the runs, and compares the collected statistics against values computed from the
+  * in-memory copy.
+  */
+ public void testDocsWithMultipleLongValues() throws IOException {
+   try (Directory dir = newDirectory();
+       IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
+     final String field = "numeric";
+     final int numDocs = TestUtil.nextInt(random(), 1, 100);
+     final long[][] docValues = new long[numDocs][];
+     long nextVal = 1;
+     for (int docIdx = 0; docIdx < numDocs; docIdx++) {
+       final Document doc = new Document();
+       if (random().nextBoolean()) { // not all documents have a value
+         final long[] values = new long[TestUtil.nextInt(random(), 1, 5)];
+         docValues[docIdx] = values;
+         for (int k = 0; k < values.length; k++, nextVal++) {
+           doc.add(new SortedNumericDocValuesField(field, nextVal));
+           values[k] = nextVal;
+         }
+         // only documents with values get an id, so only those can be deleted below
+         doc.add(new StringField("id", "doc" + docIdx, Store.NO));
+       }
+       indexWriter.addDocument(doc);
+     }
+
+     // 20% of cases delete some docs
+     final boolean deleteSomeDocs = random().nextDouble() < 0.2;
+     if (deleteSomeDocs) {
+       for (int docIdx = 0; docIdx < numDocs; docIdx++) {
+         if (random().nextBoolean()) {
+           indexWriter.deleteDocuments(new Term("id", "doc" + docIdx));
+           docValues[docIdx] = null;
+         }
+       }
+     }
+
+     try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
+       final SortedLongDocValuesStats stats = new SortedLongDocValuesStats(field);
+       new IndexSearcher(reader).search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
+
+       final long docsWithValues = filterValues(docValues, (v) -> v != null).count();
+       final long docsWithoutValues = filterValues(docValues, (v) -> v == null).count();
+       assertEquals(docsWithValues, stats.count());
+       assertEquals(docsWithoutValues - reader.numDeletedDocs(), stats.missing());
+       if (stats.count() > 0) {
+         final LongSummaryStatistics expected =
+             filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics();
+         assertEquals(expected.getMax(), stats.max().longValue());
+         assertEquals(expected.getMin(), stats.min().longValue());
+         assertEquals(expected.getAverage(), stats.mean(), 0.00001);
+         assertEquals(expected.getSum(), stats.sum().longValue());
+         assertEquals(expected.getCount(), stats.valuesCount());
+         final double expectedVariance =
+             computeVariance(filterAndFlatValues(docValues, (v) -> v != null), stats.mean(), stats.count());
+         assertEquals(expectedVariance, stats.variance(), 0.00001);
+         assertEquals(Math.sqrt(expectedVariance), stats.stdev(), 0.00001);
+       }
+     }
+   }
}
- private static DoubleStream getPositiveValues(double[] docValues) {
- return Arrays.stream(docValues).filter(v -> v > 0);
+ /**
+  * Indexes documents that carry zero or several {@code double} values (stored as raw long bits),
+  * deletes some of them in a fraction of the runs, and compares the collected statistics against
+  * values computed from the in-memory copy.
+  */
+ public void testDocsWithMultipleDoubleValues() throws IOException {
+   try (Directory dir = newDirectory();
+       IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
+     String field = "numeric";
+     int numDocs = TestUtil.nextInt(random(), 1, 100);
+     double[][] docValues = new double[numDocs][];
+     double nextVal = 1;
+     for (int i = 0; i < numDocs; i++) {
+       Document doc = new Document();
+       if (random().nextBoolean()) { // not all documents have a value
+         int numValues = TestUtil.nextInt(random(), 1, 5);
+         docValues[i] = new double[numValues];
+         for (int j = 0; j < numValues; j++) {
+           doc.add(new SortedNumericDocValuesField(field, Double.doubleToRawLongBits(nextVal)));
+           docValues[i][j] = nextVal;
+           ++nextVal;
+         }
+         // only documents with values get an id, so only those can be deleted below
+         doc.add(new StringField("id", "doc" + i, Store.NO));
+       }
+       indexWriter.addDocument(doc);
+     }
+
+     // 20% of cases delete some docs
+     if (random().nextDouble() < 0.2) {
+       for (int i = 0; i < numDocs; i++) {
+         if (random().nextBoolean()) {
+           indexWriter.deleteDocuments(new Term("id", "doc" + i));
+           docValues[i] = null;
+         }
+       }
+     }
+
+     try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
+       IndexSearcher searcher = new IndexSearcher(reader);
+       SortedDoubleDocValuesStats stats = new SortedDoubleDocValuesStats(field);
+       searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
+
+       assertEquals(filterValues(docValues, (v) -> v != null).count(), stats.count());
+       assertEquals(filterValues(docValues, (v) -> v == null).count() - reader.numDeletedDocs(), stats.missing());
+       if (stats.count() > 0) {
+         DoubleSummaryStatistics sumStats = filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics();
+         // compare min/max as doubles: longValue() would truncate any fractional part before comparing
+         assertEquals(sumStats.getMax(), stats.max().doubleValue(), 0.00001);
+         assertEquals(sumStats.getMin(), stats.min().doubleValue(), 0.00001);
+         assertEquals(sumStats.getAverage(), stats.mean(), 0.00001);
+         assertEquals(sumStats.getSum(), stats.sum().doubleValue(), 0.00001);
+         assertEquals(sumStats.getCount(), stats.valuesCount());
+         double variance = computeVariance(filterAndFlatValues(docValues, (v) -> v != null), stats.mean(), stats.count());
+         assertEquals(variance, stats.variance(), 0.00001);
+         assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001);
+       }
+     }
+   }
+ }
+
+ /** Streams only the strictly positive entries of {@code values}. */
+ private static LongStream getPositiveValues(long[] values) {
+   final LongStream all = LongStream.of(values);
+   return all.filter(candidate -> candidate > 0);
+ }
+
+ /** Streams only the strictly positive entries of {@code values}. */
+ private static DoubleStream getPositiveValues(double[] values) {
+   final DoubleStream all = DoubleStream.of(values);
+   return all.filter(candidate -> candidate > 0);
+ }
+
+ /** Streams only the entries of {@code values} that are equal to zero. */
+ private static LongStream getZeroValues(long[] values) {
+   final LongStream all = LongStream.of(values);
+   return all.filter(candidate -> candidate == 0);
}
- private static LongStream getZeroValues(long[] docValues) {
- return Arrays.stream(docValues).filter(v -> v == 0);
+ /** Streams only the entries of {@code values} that are equal to zero. */
+ private static DoubleStream getZeroValues(double[] values) {
+   final DoubleStream all = DoubleStream.of(values);
+   return all.filter(candidate -> candidate == 0);
}
- private static DoubleStream getZeroValues(double[] docValues) {
- return Arrays.stream(docValues).filter(v -> v == 0);
+ /** Streams the per-document value arrays (possibly {@code null} entries) that satisfy {@code p}. */
+ private static Stream<long[]> filterValues(long[][] values, Predicate<? super long[]> p) {
+   final Stream<long[]> perDocValues = Arrays.stream(values);
+   return perDocValues.filter(p);
+ }
+
+ /** Streams the per-document value arrays (possibly {@code null} entries) that satisfy {@code p}. */
+ private static Stream<double[]> filterValues(double[][] values, Predicate<? super double[]> p) {
+   final Stream<double[]> perDocValues = Arrays.stream(values);
+   return perDocValues.filter(p);
}
private static double computeVariance(long[] values, double mean, int count) {
@@ -209,4 +330,20 @@ public class TestDocValuesStatsCollector extends LuceneTestCase {
return getPositiveValues(values).map(v -> (v - mean) * (v-mean)).sum() / count;
}
+ /**
+  * Keeps the per-document arrays accepted by {@code p} and flattens them into a single stream of
+  * values. {@code null} arrays (documents without values) are always skipped, since they cannot be
+  * flattened.
+  */
+ private static LongStream filterAndFlatValues(long[][] values, Predicate<? super long[]> p) {
+   // honor the caller's predicate (it used to be ignored), then drop nulls before flattening
+   return filterValues(values, p).filter(v -> v != null).flatMapToLong(Arrays::stream);
+ }
+
+ /**
+  * Keeps the per-document arrays accepted by {@code p} and flattens them into a single stream of
+  * values. {@code null} arrays (documents without values) are always skipped, since they cannot be
+  * flattened.
+  */
+ private static DoubleStream filterAndFlatValues(double[][] values, Predicate<? super double[]> p) {
+   // honor the caller's predicate (it used to be ignored), then drop nulls before flattening
+   return filterValues(values, p).filter(v -> v != null).flatMapToDouble(Arrays::stream);
+ }
+
+ /** Sums the squared differences between each value and {@code mean}, then divides by {@code count}. */
+ private static double computeVariance(LongStream values, double mean, int count) {
+   final double squaredDeviations = values.mapToDouble(v -> {
+     final double delta = v - mean;
+     return delta * delta;
+   }).sum();
+   return squaredDeviations / count;
+ }
+
+ /** Sums the squared differences between each value and {@code mean}, then divides by {@code count}. */
+ private static double computeVariance(DoubleStream values, double mean, int count) {
+   final double squaredDeviations = values.map(v -> {
+     final double delta = v - mean;
+     return delta * delta;
+   }).sum();
+   return squaredDeviations / count;
+ }
+
}