You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2015/05/07 19:58:59 UTC
svn commit: r1678245 [1/2] - in /lucene/dev/trunk: lucene/ solr/ solr/core/
solr/core/src/java/org/apache/solr/handler/component/
solr/core/src/test/org/apache/solr/
solr/core/src/test/org/apache/solr/handler/component/ solr/licenses/
solr/solrj/src/ja...
Author: hossman
Date: Thu May 7 17:58:58 2015
New Revision: 1678245
URL: http://svn.apache.org/r1678245
Log:
SOLR-6968: New 'cardinality' option for stats.field, uses HyperLogLog to efficiently estimate the cardinality of a field w/bounded RAM
Added:
lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/component/TestDistributedStatsComponentCardinality.java (with props)
lucene/dev/trunk/solr/licenses/fastutil-6.5.11.jar.sha1 (with props)
lucene/dev/trunk/solr/licenses/fastutil-LICENSE-ASL.txt (with props)
lucene/dev/trunk/solr/licenses/fastutil-NOTICE.txt (with props)
lucene/dev/trunk/solr/licenses/hll-1.6.0.jar.sha1 (with props)
lucene/dev/trunk/solr/licenses/hll-LICENSE-ASL.txt (with props)
lucene/dev/trunk/solr/licenses/hll-NOTICE.txt (with props)
Modified:
lucene/dev/trunk/lucene/ivy-versions.properties
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/core/ivy.xml
lucene/dev/trunk/solr/core/src/java/org/apache/solr/handler/component/StatsField.java
lucene/dev/trunk/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/TestDistributedSearch.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java
lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/client/solrj/response/FieldStatsInfo.java
Modified: lucene/dev/trunk/lucene/ivy-versions.properties
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/ivy-versions.properties?rev=1678245&r1=1678244&r2=1678245&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/ivy-versions.properties (original)
+++ lucene/dev/trunk/lucene/ivy-versions.properties Thu May 7 17:58:58 2015
@@ -69,6 +69,7 @@ com.sun.jersey.version = 1.9
/dom4j/dom4j = 1.6.1
/hsqldb/hsqldb = 1.8.0.10
/io.netty/netty = 3.7.0.Final
+/it.unimi.dsi/fastutil = 6.5.11
/jakarta-regexp/jakarta-regexp = 1.4
/javax.activation/activation = 1.1.1
/javax.inject/javax.inject= 1
@@ -80,6 +81,7 @@ com.sun.jersey.version = 1.9
/log4j/log4j = 1.2.17
/mecab/mecab-ipadic = 2.7.0-20070801
/mecab/mecab-naist-jdic = 0.6.3b-20111013
+/net.agkn/hll = 1.6.0
/net.arnx/jsonic = 1.2.7
/net.sf.saxon/Saxon-HE = 9.6.0-2
/net.sourceforge.argparse4j/argparse4j = 0.4.3
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1678245&r1=1678244&r2=1678245&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Thu May 7 17:58:58 2015
@@ -169,6 +169,8 @@ New Features
* SOLR-6220: Rule Based Replica Assignment during collection creation (Noble Paul)
+* SOLR-6968: New 'cardinality' option for stats.field, uses HyperLogLog to efficiently
+ estimate the cardinality of a field w/bounded RAM. (hossman)
Bug Fixes
----------------------
Modified: lucene/dev/trunk/solr/core/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/ivy.xml?rev=1678245&r1=1678244&r2=1678245&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/ivy.xml (original)
+++ lucene/dev/trunk/solr/core/ivy.xml Thu May 7 17:58:58 2015
@@ -89,6 +89,10 @@
<!-- StatsComponents percentiles Dependencies-->
<dependency org="com.tdunning" name="t-digest" rev="${/com.tdunning/t-digest}" conf="compile->*"/>
+ <!-- StatsComponents HLL Dependencies-->
+ <dependency org="net.agkn" name="hll" rev="${/net.agkn/hll}" conf="compile->*"/>
+ <dependency org="it.unimi.dsi" name="fastutil" rev="${/it.unimi.dsi/fastutil}" conf="compile->*"/>
+
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/handler/component/StatsField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/handler/component/StatsField.java?rev=1678245&r1=1678244&r2=1678245&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/handler/component/StatsField.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/handler/component/StatsField.java Thu May 7 17:58:58 2015
@@ -30,6 +30,7 @@ import java.util.Map;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
+import org.apache.lucene.document.FieldType.NumericType;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.function.FunctionQuery;
import org.apache.lucene.queries.function.ValueSource;
@@ -55,6 +56,10 @@ import org.apache.solr.search.QueryParsi
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SyntaxError;
+import net.agkn.hll.HLL;
+import com.google.common.hash.Hashing;
+import com.google.common.hash.HashFunction;
+
/**
* Models all of the information associated with a single {@link StatsParams#STATS_FIELD}
* instance.
@@ -107,6 +112,19 @@ public class StatsField {
}
return false;
}
+ },
+ cardinality(true) {
+ /** special for cardinality: parses the HLL options from the local params **/
+ boolean parseParams(StatsField sf) {
+ try {
+ sf.hllOpts = HllOptions.parseHllOptions(sf.localParams, sf.schemaField);
+ return (null != sf.hllOpts);
+ } catch (Exception e) {
+ throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse "
+ + StatsParams.STATS_FIELD + " local params: " + sf.localParams + " due to: "
+ + e.getMessage(), e);
+ }
+ }
};
private final List<Stat> distribDeps;
@@ -150,7 +168,10 @@ public class StatsField {
return EnumSet.copyOf(this.distribDeps);
}
- /** return value of true means user is requesting this stat */
+ /**
+ * Called when the name of a stat is found as a local param on this {@link StatsField}
+ * @return true if the user is requesting this stat, else false
+ */
boolean parseParams(StatsField sf) {
return sf.localParams.getBool(this.name(), false);
}
@@ -180,7 +201,7 @@ public class StatsField {
private final boolean isShard;
private double tdigestCompression = 100.0D;
-
+ private HllOptions hllOpts;
/**
* @param rb the current request/response
@@ -549,4 +570,163 @@ public class StatsField {
public double getTdigestCompression() {
return tdigestCompression;
}
+
+ public HllOptions getHllOptions() {
+ return hllOpts;
+ }
+
+ /**
+ * Helper Struct for parsing and encapsulating all of the options related to building a {@link HLL}
+ *
+ * @see Stat#cardinality
+ * @lucene.internal
+ */
+ public static final class HllOptions {
+ final HashFunction hasher;
+
+ // NOTE: this explanation linked to from the java-hll jdocs...
+ // https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning
+ // ..if i'm understanding the regwidth chart correctly, a value of 6 should be enough
+ // to support any max cardinality given that we're always dealing with hashes and
+ // the cardinality of the set of all long values is 2**64 == 1.9e19
+ //
+ // But i guess that assumes a *perfect* hash and high log2m? ... if the hash algo is imperfect
+ // and/or log2m is low (ie: user is less concerned about accuracy), then many diff hash values
+ // might fall in the same register (ie: bucket) and having a wider register to count more of
+ // them may be useful
+
+ final int log2m;
+ final int regwidth;
+
+ final static String ERR = "cardinality must be specified as 'true' (for default tunning) or decimal number between 0 and 1 to adjust accuracy vs memory usage (large number is more memory and more accuracy)";
+
+ private HllOptions(int log2m, int regwidth, HashFunction hasher) {
+ this.log2m = log2m;
+ this.regwidth = regwidth;
+ this.hasher = hasher;
+ }
+ /**
+ * Creates an HllOptions based on the (local) params specified (if appropriate).
+ *
+ * @param localParams the LocalParams for this {@link StatsField}
+ * @param field the field corresponding to this {@link StatsField}, may be null if these stats are over a value source
+ * @return the {@link HllOptions} to use based on the params, or null if no {@link HLL} should be computed
+ * @throws SolrException if there are invalid options
+ */
+ public static HllOptions parseHllOptions(SolrParams localParams, SchemaField field)
+ throws SolrException {
+
+ String cardinalityOpt = localParams.get(Stat.cardinality.name());
+ if (StringUtils.isBlank(cardinalityOpt)) {
+ return null;
+ }
+
+ final NumericType hashableNumType = getHashableNumericType(field);
+
+ // some sane defaults
+ int log2m = 13; // roughly equivalent to "cardinality='0.33'"
+ int regwidth = 6; // with decent hash, this is plenty for all valid long hashes
+
+ if (NumericType.FLOAT.equals(hashableNumType) || NumericType.INT.equals(hashableNumType)) {
+ // for 32bit values, we can adjust our default regwidth down a bit
+ regwidth--;
+
+ // NOTE: EnumField uses NumericType.INT, and in theory we could be super conservative
+ // with it, but there's no point - just let the EXPLICIT HLL handle it
+ }
+
+ // TODO: we could attempt additional reductions in the default regwidth based on index
+ // statistics -- but that doesn't seem worth the effort. for tiny indexes, the
+ // EXPLICIT and SPARSE HLL representations have us nicely covered, and in general we don't
+ // want to be too aggressive about lowering regwidth or we could get really poor results if
+ // log2m is also low and there is heavy hashkey collision
+
+ try {
+ // NFE will short out here if it's not a number
+ final double accuracyOpt = Double.parseDouble(cardinalityOpt);
+
+ // if a float between 0 and 1 is specified, treat it as a preference of accuracy
+ // - 0 means accuracy is not a concern, save RAM
+ // - 1 means be as accurate as possible, using as much RAM as needed.
+ // NOTE: negated range check so that NaN (which parses as a double) is rejected too
+ if (!(0D <= accuracyOpt && accuracyOpt <= 1.0D)) {
+ throw new SolrException(ErrorCode.BAD_REQUEST, ERR);
+ }
+
+ // use accuracyOpt as a scaling factor between min & max legal log2m values
+ log2m = HLL.MINIMUM_LOG2M_PARAM
+ + (int) Math.round(accuracyOpt * (HLL.MAXIMUM_LOG2M_PARAM - HLL.MINIMUM_LOG2M_PARAM));
+
+ // use accuracyOpt as a scaling factor for regwidth as well, BUT...
+ // be more conservative -- HLL.MINIMUM_REGWIDTH_PARAM is too absurdly low to be useful
+ // use previously computed (hashableNumType) default regwidth -1 as lower bound for scaling
+ final int MIN_HEURISTIC_REGWIDTH = regwidth-1;
+ regwidth = MIN_HEURISTIC_REGWIDTH
+ + (int) Math.round(accuracyOpt * (HLL.MAXIMUM_REGWIDTH_PARAM - MIN_HEURISTIC_REGWIDTH));
+
+ } catch (NumberFormatException nfe) {
+ // param value isn't a number -- let's check for simple true/false
+ if (! localParams.getBool(Stat.cardinality.name(), false)) {
+ return null;
+ }
+ }
+
+ // let explicit params override both the default and/or any accuracy specification
+ log2m = localParams.getInt("hllLog2m", log2m);
+ regwidth = localParams.getInt("hllRegwidth", regwidth);
+
+ // validate legal values
+ if (log2m < HLL.MINIMUM_LOG2M_PARAM || HLL.MAXIMUM_LOG2M_PARAM < log2m) {
+ throw new SolrException(ErrorCode.BAD_REQUEST, "hllLog2m must be at least " +
+ HLL.MINIMUM_LOG2M_PARAM + " and at most " + HLL.MAXIMUM_LOG2M_PARAM
+ + " (" + log2m +")");
+ }
+ if (regwidth < HLL.MINIMUM_REGWIDTH_PARAM || HLL.MAXIMUM_REGWIDTH_PARAM < regwidth) {
+ throw new SolrException(ErrorCode.BAD_REQUEST, "hllRegwidth must be at least " +
+ HLL.MINIMUM_REGWIDTH_PARAM + " and at most " + HLL.MAXIMUM_REGWIDTH_PARAM + " (" + regwidth + ")");
+ }
+
+ HashFunction hasher = localParams.getBool("hllPreHashed", false) ? null : Hashing.murmur3_128();
+
+ if (null == hasher) {
+ // if this is a function, or a non Long field, pre-hashed is invalid
+ // NOTE: we ignore hashableNumType - it's LONG for non numerics like Strings
+ if (null == field || !NumericType.LONG.equals(field.getType().getNumericType())) {
+ throw new SolrException(ErrorCode.BAD_REQUEST, "hllPreHashed is only supported with Long based fields");
+ }
+ }
+
+ // if we're still here, then we need an HLL...
+ return new HllOptions(log2m, regwidth, hasher);
+ }
+ /** @see HLL */
+ public int getLog2m() {
+ return log2m;
+ }
+ /** @see HLL */
+ public int getRegwidth() {
+ return regwidth;
+ }
+ /** May be null if user has indicated that field values are pre-hashed */
+ public HashFunction getHasher() {
+ return hasher;
+ }
+ public HLL newHLL() {
+ return new HLL(getLog2m(), getRegwidth());
+ }
+ }
+
+ /**
+ * Returns the effective {@link NumericType} for the field for the purposes of hash values.
+ * ie: If the field has an explicit NumericType that is returned; If the field has no explicit
+ * NumericType then {@link NumericType#LONG} is returned; If field is null, then
+ * {@link NumericType#FLOAT} is assumed for ValueSource.
+ */
+ private static NumericType getHashableNumericType(SchemaField field) {
+ if (null == field) {
+ return NumericType.FLOAT;
+ }
+ final NumericType result = field.getType().getNumericType();
+ return null == result ? NumericType.LONG : result;
+ }
}
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java?rev=1678245&r1=1678244&r2=1678245&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java Thu May 7 17:58:58 2015
@@ -34,6 +34,10 @@ import org.apache.solr.schema.*;
import com.tdunning.math.stats.AVLTreeDigest;
+import net.agkn.hll.HLL;
+import com.google.common.hash.Hashing;
+import com.google.common.hash.HashFunction;
+
/**
* Factory class for creating instance of
* {@link org.apache.solr.handler.component.StatsValues}
@@ -105,6 +109,7 @@ abstract class AbstractStatsValues<T> im
final protected boolean computeMin;
final protected boolean computeMax;
final protected boolean computeMinOrMax;
+ final protected boolean computeCardinality;
/**
* Either a function value source to collect from, or the ValueSource associated
@@ -129,7 +134,13 @@ abstract class AbstractStatsValues<T> im
protected long count;
protected long countDistinct;
protected final Set<T> distinctValues;
-
+
+ /**
+ * Hash function that must be used by implementations of {@link #hash}
+ */
+ protected final HashFunction hasher;
+ private final HLL hll;
+
// facetField facetValue
protected Map<String,Map<String, StatsValues>> facets = new HashMap<>();
@@ -141,9 +152,20 @@ abstract class AbstractStatsValues<T> im
this.computeMin = statsField.calculateStats(Stat.min);
this.computeMax = statsField.calculateStats(Stat.max);
this.computeMinOrMax = computeMin || computeMax;
-
+
this.distinctValues = computeCalcDistinct ? new TreeSet<>() : null;
+ this.computeCardinality = statsField.calculateStats(Stat.cardinality);
+ if ( computeCardinality ) {
+
+ hasher = statsField.getHllOptions().getHasher();
+ hll = statsField.getHllOptions().newHLL();
+ assert null != hll : "Cardinality requires an HLL";
+ } else {
+ hll = null;
+ hasher = null;
+ }
+
// alternatively, we could refactor a common base class that doesn't know/care
// about either SchemaField or ValueSource - but then there would be a lot of
// duplicate code between "NumericSchemaFieldStatsValues" and
@@ -186,6 +208,12 @@ abstract class AbstractStatsValues<T> im
if (computeMinOrMax) {
updateMinMax((T) stv.get("min"), (T) stv.get("max"));
}
+
+ if (computeCardinality) {
+ byte[] data = (byte[]) stv.get("cardinality");
+ hll.union(HLL.fromBytes(data));
+ }
+
updateTypeSpecificStats(stv);
NamedList f = (NamedList) stv.get(FACETS);
@@ -228,6 +256,8 @@ abstract class AbstractStatsValues<T> im
}
public void accumulate(T value, int count) {
+ assert null != value : "Can't accumulate null";
+
if (computeCount) {
this.count += count;
}
@@ -238,6 +268,14 @@ abstract class AbstractStatsValues<T> im
if (computeMinOrMax) {
updateMinMax(value, value);
}
+ if (computeCardinality) {
+ if (null == hasher) {
+ assert value instanceof Number : "pre-hashed value support only works with numeric longs";
+ hll.addRaw(((Number)value).longValue());
+ } else {
+ hll.addRaw(hash(value));
+ }
+ }
updateTypeSpecificStats(value, count);
}
@@ -290,6 +328,13 @@ abstract class AbstractStatsValues<T> im
res.add("distinctValues", distinctValues);
res.add("countDistinct", countDistinct);
}
+ if (statsField.includeInResponse(Stat.cardinality)) {
+ if (statsField.getIsShard()) {
+ res.add("cardinality", hll.toBytes());
+ } else {
+ res.add("cardinality", hll.cardinality());
+ }
+ }
addTypeSpecificStats(res);
@@ -326,6 +371,18 @@ abstract class AbstractStatsValues<T> im
}
/**
+ * Hash function to be used for computing cardinality.
+ *
+ * This method will not be called in cases where the user has indicated the values
+ * are already hashed. If this method is called, then {@link #hasher} will be non-null,
+ * and should be used to generate the appropriate hash value.
+ *
+ * @see Stat#cardinality
+ * @see #hasher
+ */
+ protected abstract long hash(T value);
+
+ /**
* Updates the minimum and maximum statistics based on the given values
*
* @param min
@@ -388,9 +445,31 @@ class NumericStatsValues extends Abstrac
this.computePercentiles = statsField.calculateStats(Stat.percentiles);
if ( computePercentiles ) {
-
tdigest = new AVLTreeDigest(statsField.getTdigestCompression());
}
+
+ }
+
+ @Override
+ public long hash(Number v) {
+ // have to dispatch on the runtime type (instanceof) to ensure good hash values since
+ // we don't have truly type specific stats
+ if (v instanceof Long) {
+ return hasher.hashLong(v.longValue()).asLong();
+ } else if (v instanceof Integer) {
+ return hasher.hashInt(v.intValue()).asLong();
+ } else if (v instanceof Double) {
+ return hasher.hashLong(Double.doubleToRawLongBits(v.doubleValue())).asLong();
+ } else if (v instanceof Float) {
+ return hasher.hashInt(Float.floatToRawIntBits(v.floatValue())).asLong();
+ } else if (v instanceof Byte) {
+ return hasher.newHasher().putByte(v.byteValue()).hash().asLong();
+ } else if (v instanceof Short) {
+ return hasher.newHasher().putShort(v.shortValue()).hash().asLong();
+ }
+ // else: none of the supported Number subclasses matched
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ "Unsupported Numeric Type ("+v.getClass()+") for hashing: " +statsField);
 }
@Override
@@ -540,6 +619,11 @@ class EnumStatsValues extends AbstractSt
super(statsField);
}
+ @Override
+ public long hash(EnumFieldValue v) {
+ return hasher.hashInt(v.toInt().intValue()).asLong();
+ }
+
/**
* {@inheritDoc}
*/
@@ -617,6 +701,11 @@ class DateStatsValues extends AbstractSt
this.computeSum = statsField.calculateStats(Stat.sum);
this.computeSumOfSquares = statsField.calculateStats(Stat.sumOfSquares);
}
+
+ @Override
+ public long hash(Date v) {
+ return hasher.hashLong(v.getTime()).asLong();
+ }
@Override
public void accumulate(int docID) {
@@ -716,6 +805,12 @@ class StringStatsValues extends Abstract
public StringStatsValues(StatsField statsField) {
super(statsField);
}
+
+ @Override
+ public long hash(String v) {
+ // NOTE: renamed hashUnencodedChars starting with guava 15
+ return hasher.hashString(v).asLong();
+ }
@Override
public void accumulate(int docID) {
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/TestDistributedSearch.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/TestDistributedSearch.java?rev=1678245&r1=1678244&r2=1678245&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/TestDistributedSearch.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/TestDistributedSearch.java Thu May 7 17:58:58 2015
@@ -422,7 +422,47 @@ public class TestDistributedSearch exten
query("q","*:*", "sort",i1+" desc", "stats", "true", "stats.field", i1);
query("q","*:*", "sort",i1+" desc", "stats", "true", "stats.field", tdate_a);
query("q","*:*", "sort",i1+" desc", "stats", "true", "stats.field", tdate_b);
-
+
+
+ rsp = query("q", "*:*", "sort", i1 + " desc", "stats", "true",
+ "stats.field", "{!cardinality='true'}" + oddField,
+ "stats.field", "{!cardinality='true'}" + tlong);
+
+ { // don't leak variables
+
+ // long
+ FieldStatsInfo s = rsp.getFieldStatsInfo().get(tlong);
+ assertNotNull("missing stats", s);
+ assertEquals("wrong cardinality", new Long(13), s.getCardinality());
+ //
+ assertNull("expected null for min", s.getMin());
+ assertNull("expected null for mean", s.getMean());
+ assertNull("expected null for count", s.getCount());
+ assertNull("expected null for calcDistinct", s.getCountDistinct());
+ assertNull("expected null for distinct vals", s.getDistinctValues());
+ assertNull("expected null for max", s.getMax());
+ assertNull("expected null for missing", s.getMissing());
+ assertNull("expected null for stddev", s.getStddev());
+ assertNull("expected null for sum", s.getSum());
+ assertNull("expected null for percentiles", s.getPercentiles());
+
+ // string
+ s = rsp.getFieldStatsInfo().get(oddField);
+ assertNotNull("missing stats", s);
+ assertEquals("wrong cardinality", new Long(1), s.getCardinality());
+ //
+ assertNull("expected null for min", s.getMin());
+ assertNull("expected null for mean", s.getMean());
+ assertNull("expected null for count", s.getCount());
+ assertNull("expected null for calcDistinct", s.getCountDistinct());
+ assertNull("expected null for distinct vals", s.getDistinctValues());
+ assertNull("expected null for max", s.getMax());
+ assertNull("expected null for missing", s.getMissing());
+ assertNull("expected null for stddev", s.getStddev());
+ assertNull("expected null for sum", s.getSum());
+ assertNull("expected null for percentiles", s.getPercentiles());
+ }
+
query("q", "*:*", "sort", i1 + " desc", "stats", "true", "stats.field",
"{!percentiles='1,2,3,4,5'}" + i1);
@@ -510,6 +550,7 @@ public class TestDistributedSearch exten
assertNull("expected null for stddev", s.getStddev());
assertNull("expected null for sum", s.getSum());
assertNull("expected null for percentiles", s.getPercentiles());
+ assertNull("expected null for cardinality", s.getCardinality());
// sanity check deps relationship
for (Stat dep : EnumSet.of(Stat.sum, Stat.count)) {
@@ -566,6 +607,7 @@ public class TestDistributedSearch exten
assertNull("expected null for missing", s.getMissing());
assertNull("expected null for sum", s.getSum());
assertNull("expected null for percentiles", s.getPercentiles());
+ assertNull("expected null for cardinality", s.getCardinality());
}
// request stats, but disable them all via param refs
@@ -587,6 +629,7 @@ public class TestDistributedSearch exten
assertNull("expected null for missing", s.getMissing());
assertNull("expected null for sum", s.getSum());
assertNull("expected null for percentiles", s.getPercentiles());
+ assertNull("expected null for cardinality", s.getCardinality());
}
final String[] stats = new String[] {
@@ -672,6 +715,7 @@ public class TestDistributedSearch exten
assertNull(p+" expected null for stddev", s.getStddev());
assertNull(p+" expected null for sum", s.getSum());
assertNull(p+" expected null for percentiles", s.getPercentiles());
+ assertNull(p+" expected null for cardinality", s.getCardinality());
}
@@ -706,7 +750,8 @@ public class TestDistributedSearch exten
assertNull(p+" expected null for missing", s.getMissing());
assertNull(p+" expected null for stddev", s.getStddev());
assertNull(p+" expected null for sum", s.getSum());
- assertNull(p+"expected null for percentiles", s.getPercentiles());
+ assertNull(p+" expected null for percentiles", s.getPercentiles());
+ assertNull(p+" expected null for cardinality", s.getCardinality());
}
@@ -732,6 +777,7 @@ public class TestDistributedSearch exten
assertNull("expected null for missing", s.getMissing());
assertNull("expected null for sum", s.getSum());
assertNull("expected null for percentiles", s.getPercentiles());
+ assertNull("expected null for cardinality", s.getCardinality());
}
// look at stats on non numeric fields
@@ -793,7 +839,7 @@ public class TestDistributedSearch exten
}
assertEquals("Sanity check failed: either test broke, or test changed, or you adjusted Stat enum" +
" (adjust constant accordingly if intentional)",
- 3465, numTotalStatQueries);
+ 4235, numTotalStatQueries);
/*** TODO: the failure may come back in "exception"
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java?rev=1678245&r1=1678244&r2=1678245&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java Thu May 7 17:58:58 2015
@@ -19,12 +19,14 @@ package org.apache.solr.handler.componen
import java.nio.ByteBuffer;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
+import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
import java.util.EnumSet;
import java.util.HashMap;
+import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@@ -33,6 +35,8 @@ import java.util.TimeZone;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.queries.function.valuesource.QueryValueSource;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.SolrParams;
@@ -42,6 +46,7 @@ import org.apache.solr.common.util.Named
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.StatsField.Stat;
+import org.apache.solr.handler.component.StatsField.HllOptions;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
@@ -50,6 +55,9 @@ import org.apache.solr.util.AbstractSolr
import org.apache.commons.math3.util.Combinations;
import com.tdunning.math.stats.AVLTreeDigest;
+import net.agkn.hll.HLL;
+import com.google.common.hash.Hashing;
+import com.google.common.hash.HashFunction;
import org.junit.BeforeClass;
@@ -196,7 +204,6 @@ public class StatsComponentTest extends
, kpre + "double[@name='stddev'][.='12.909944487358056']"
);
-
}
}
@@ -257,6 +264,17 @@ public class StatsComponentTest extends
, kpre + "double[@name='mean'][.='-50.0']"
, kpre + "double[@name='stddev'][.='25.81988897471611']"
);
+
+ // simple cardinality over a numeric field
+ assertQ("test function statistics & key override",
+ // NOTE: baseParams aren't used, we're looking only at the cardinality
+ req("q", "*:*", "stats", "true",
+ "fq", "{!tag=key_ex_tag}-id:4",
+ "stats.field", "{!key="+key+" cardinality=true}"+f)
+
+ , kpre + "long[@name='cardinality'][.='3']"
+ , "count(" + kpre + "/*)=1"
+ );
}
@@ -358,6 +376,10 @@ public class StatsComponentTest extends
);
}
+ assertQ("cardinality"
+ , req("q", "*:*", "rows", "0", "stats", "true", "stats.field", "{!cardinality=true}" + f)
+ , "//long[@name='cardinality'][.='8']"
+ );
}
public void testFieldStatisticsResultsStringField() throws Exception {
@@ -384,6 +406,13 @@ public class StatsComponentTest extends
"//long[@name='countDistinct'][.='3']",
"count(//arr[@name='distinctValues']/str)=3");
+ assertQ("test string cardinality"
+ , req("q", "*:*",
+ "rows", "0",
+ "stats","true",
+ "stats.field","{!cardinality=true}active_s")
+ , "//long[@name='cardinality'][.='3']");
+
// stats over a string function
assertQ("strdist func stats",
req("q", "*:*",
@@ -430,6 +459,11 @@ public class StatsComponentTest extends
// "//date[@name='sum'][.='1970-01-13T20:38:30Z']", // sometimes 29.999Z
// "//date[@name='mean'][.='1970-01-07T10:19:15Z']" // sometiems 14.999Z
);
+
+ assertQ("cardinality",
+ req("q","*:*", "stats", "true", "stats.field", "{!cardinality=true}active_dt")
+ , "//lst[@name='active_dt']/long[@name='cardinality'][.='2']");
+
}
@@ -595,6 +629,16 @@ public class StatsComponentTest extends
, pre+"/lst[@name='false']/double[@name='stddev'][.='7.0710678118654755']"
);
}
+
+ assertQ("stats.facet w/ cardinality"
+ , req("q", "*:*", "stats", "true",
+ "fq", "-other_s:bar",
+ "stats.facet", "active_s",
+ "stats.field", "{!cardinality=true}"+f)
+ , pre+"/lst[@name='true' ]/long[@name='cardinality'][.='1']"
+ , pre+"/lst[@name='false']/long[@name='cardinality'][.='2']"
+ );
+
}
public void doTestFacetStatisticsMissingResult(String f, SolrParams[] baseParamsSet) throws Exception {
@@ -637,6 +681,13 @@ public class StatsComponentTest extends
);
}
+ assertQ("stats.facet w/ cardinality"
+ , req("q", "*:*", "stats", "true",
+ "stats.facet", "active_s",
+ "stats.field", "{!cardinality=true}"+f)
+ , "//lst[@name='active_s']/lst[@name='true' ]/long[@name='cardinality'][.='2']"
+ , "//lst[@name='active_s']/lst[@name='false']/long[@name='cardinality'][.='1']"
+ );
}
public void testFieldStatisticsResultsNumericFieldAlwaysMissing() throws Exception {
@@ -669,6 +720,14 @@ public class StatsComponentTest extends
,"count(//lst[@name='active_i']/*)=8"
);
+
+ // NOTE: empty set percentiles covered in testPercentiles()
+
+ assertQ("test cardinality of missing"
+ , req("q", "*:*", "stats", "true", "stats.field", "{!cardinality=true}active_i")
+ ,"//lst[@name='active_i']/long[@name='cardinality'][.='0']"
+ );
+
}
public void testFieldStatisticsResultsStringFieldAlwaysMissing() throws Exception {
@@ -695,7 +754,13 @@ public class StatsComponentTest extends
,"//lst[@name='active_s']/null[@name='max']"
// if new stats are supported, this will break - update test to assert values for each
,"count(//lst[@name='active_s']/*)=4"
- );
+ );
+
+ assertQ("test string statistics values"
+ , req("q", "*:*", "stats", "true", "stats.field", "{!cardinality=true}active_s")
+ ,"//lst[@name='active_s']/long[@name='cardinality'][.='0']"
+ );
+
}
//SOLR-3160
@@ -729,6 +794,12 @@ public class StatsComponentTest extends
// if new stats are supported, this will break - update test to assert values for each
,"count(//lst[@name='active_dt']/*)=8"
);
+
+ assertQ("cardinality"
+ , req("q","*:*", "stats", "true", "stats.field", "{!cardinality=true}active_dt")
+ ,"//lst[@name='active_dt']/long[@name='cardinality'][.='0']"
+ );
+
}
public void testStatsFacetMultivaluedErrorHandling() throws Exception {
@@ -822,6 +893,10 @@ public class StatsComponentTest extends
, "//lst[@name='cat_docValues']/str[@name='min'][.='test']"
, "//lst[@name='cat_docValues']/str[@name='max'][.='testtw']");
+ assertQ("cardinality",
+ req("q","*:*", "stats", "true", "stats.field", "{!cardinality=true}cat_docValues")
+ , "//lst[@name='cat_docValues']/long[@name='cardinality'][.='3']");
+
}
public void testFieldStatisticsDocValuesAndMultiValuedInteger() throws Exception {
@@ -868,7 +943,11 @@ public class StatsComponentTest extends
, "//lst[@name='" + fieldName + "']/double[@name='sumOfSquares'][.='470.0']"
, "//lst[@name='" + fieldName + "']/long[@name='missing'][.='0']");
- }
+ assertQ("cardinality",
+ req("q","*:*", "stats", "true", "stats.field", "{!cardinality=true}" + fieldName)
+ , "//lst[@name='"+fieldName+"']/long[@name='cardinality'][.='9']");
+
+ }
public void testFieldStatisticsDocValuesAndMultiValuedIntegerFacetStats() throws Exception {
SolrCore core = h.getCore();
@@ -1054,6 +1133,11 @@ public class StatsComponentTest extends
,"count(//lst[@name='" + fieldName + "']/*)=10"
);
}
+
+ assertQ("cardinality",
+ req("q","*:*", "stats", "true", "stats.field", "{!cardinality=true}"+fieldName)
+ , "//lst[@name='"+fieldName+"']/long[@name='cardinality'][.='9']");
+
}
public void testEnumFieldTypeStatus() throws Exception {
@@ -1088,7 +1172,10 @@ public class StatsComponentTest extends
, "//lst[@name='" + fieldName + "']/str[@name='max'][.='Critical']"
, "//lst[@name='" + fieldName + "']/long[@name='count'][.='15']"
, "//lst[@name='" + fieldName + "']/long[@name='missing'][.='11']");
-
+
+ assertQ("cardinality",
+ req("q","*:*", "stats", "true", "stats.field", "{!cardinality=true}"+fieldName)
+ , "//lst[@name='" + fieldName + "']/long[@name='cardinality'][.='5']");
assertQ("enum calcdistinct", req("q","*:*", "stats", "true", "stats.field", fieldName,
StatsParams.STATS_CALC_DISTINCT, "true")
@@ -1139,12 +1226,60 @@ public class StatsComponentTest extends
return cat_docValues;
}
+ /** Convenience struct used in {@link #testIndividualStatLocalParams} */
+ private static final class ExpectedStat {
+ public final static String KPRE = XPRE + "lst[@name='stats_fields']/lst[@name='k']/";
+ public final Stat stat;
+ public final String input;
+ public final int numResponseKeys; // all because calcdistinct is obnoxious
+ public final List<String> perShardXpaths;
+ public final List<String> finalXpaths;
+
+ public final static Map<Stat,ExpectedStat> ALL = new LinkedHashMap<Stat,ExpectedStat>();
+ private ExpectedStat(Stat stat, String input, int numResponseKeys,
+ List<String> perShardXpaths, List<String> finalXpaths) {
+ this.stat = stat;
+ this.input = input;
+ this.numResponseKeys = numResponseKeys;
+ this.perShardXpaths = perShardXpaths;
+ this.finalXpaths = finalXpaths;
+ }
+
+ public static void createSimple(Stat stat, String input, String type, String result) {
+ EnumSet<Stat> deps = stat.getDistribDeps();
+ List<String> perShardXpaths = new ArrayList<String>(deps.size());
+ String xpath = KPRE + type + "[@name='" + stat + "'][.='" + result + "']";
+ for (Stat dep : deps) {
+ if (dep.equals(stat)) { // self dependency
+ perShardXpaths.add(xpath);;
+ } else {
+ ExpectedStat expectedDep = ALL.get(dep);
+ assertNotNull("can't find dep in ExpectedStat.ALL", expectedDep);
+ perShardXpaths.addAll(expectedDep.perShardXpaths);
+ }
+ }
+ ALL.put(stat, new ExpectedStat(stat, input, 1,
+ perShardXpaths, Collections.singletonList(xpath)));
+ }
+ public static void create(Stat stat, String input, int numResponseKeys,
+ List<String> perShardXpaths, List<String> finalXpaths) {
+ ALL.put(stat, new ExpectedStat(stat, input, numResponseKeys, perShardXpaths, finalXpaths));
+ }
+ }
+
public void testIndividualStatLocalParams() throws Exception {
- final String kpre = XPRE + "lst[@name='stats_fields']/lst[@name='k']/";
+ final String kpre = ExpectedStat.KPRE;
assertU(adoc("id", "1", "a_f", "2.3", "b_f", "9.7", "a_i", "9", "foo_t", "how now brown cow"));
assertU(commit());
+
+ SolrCore core = h.getCore();
+ SchemaField field = core.getLatestSchema().getField("a_i");
+ HllOptions hllOpts = HllOptions.parseHllOptions(params("cardinality","true"), field);
+ HLL hll = hllOpts.newHLL();
+ HashFunction hasher = hllOpts.getHasher();
+
AVLTreeDigest tdigest = new AVLTreeDigest(100);
// some quick sanity check assertions...
@@ -1156,7 +1291,7 @@ public class StatsComponentTest extends
, kpre + "double[@name='min'][.='9.0']"
, "count(" + kpre + "*)=2"
);
-
+
// for stats that are true/false, sanity check false does it's job
assertQ("min=true & max=false: only min should come back",
req("q","*:*", "stats", "true",
@@ -1173,147 +1308,127 @@ public class StatsComponentTest extends
// ...but be empty
, "count(" + kpre + "*)=0"
);
-
+
double sum = 0;
double sumOfSquares = 0;
final int count = 20;
for (int i = 0; i < count; i++) {
+ int a_i = i % 10;
assertU(adoc("id", String.valueOf(i), "a_f", "2.3", "b_f", "9.7", "a_i",
- String.valueOf(i % 10), "foo_t", "how now brown cow"));
- tdigest.add(i % 10);
- sum += i % 10;
- sumOfSquares += (i % 10) * (i % 10);
+ String.valueOf(a_i), "foo_t", "how now brown cow"));
+ tdigest.add(a_i);
+ hll.addRaw(hasher.hashInt(a_i).asLong());
+ sum += a_i;
+ sumOfSquares += (a_i) * (a_i);
}
-
+ double stddev = Math.sqrt(((count * sumOfSquares) - (sum * sum))/ (20 * (count - 1.0D)));
+
assertU(commit());
- ByteBuffer buf = ByteBuffer.allocate(tdigest.smallByteSize());
- tdigest.asSmallBytes(buf);
+ ByteBuffer tdigestBuf = ByteBuffer.allocate(tdigest.smallByteSize());
+ tdigest.asSmallBytes(tdigestBuf);
+ byte[] hllBytes = hll.toBytes();
+
EnumSet<Stat> allStats = EnumSet.allOf(Stat.class);
- Map<Stat,String> expectedStats = new HashMap<>();
- expectedStats.put(Stat.min, "0.0");
- expectedStats.put(Stat.max, "9.0");
- expectedStats.put(Stat.missing, "0");
- expectedStats.put(Stat.sum, String.valueOf(sum));
- expectedStats.put(Stat.count, String.valueOf(count));
- expectedStats.put(Stat.mean, String.valueOf(sum / count));
- expectedStats.put(Stat.sumOfSquares, String.valueOf(sumOfSquares));
- expectedStats.put(Stat.stddev, String.valueOf(Math.sqrt(((count * sumOfSquares) - (sum * sum))/ (20 * (count - 1.0D)))));
- expectedStats.put(Stat.calcdistinct, "10");
- // NOTE: per shard expected value
- expectedStats.put(Stat.percentiles, Base64.byteArrayToBase64(buf.array(), 0, buf.array().length));
-
- Map<Stat,String> expectedType = new HashMap<>();
- expectedType.put(Stat.min, "double");
- expectedType.put(Stat.max, "double");
- expectedType.put(Stat.missing, "long");
- expectedType.put(Stat.sum, "double");
- expectedType.put(Stat.count, "long");
- expectedType.put(Stat.mean, "double");
- expectedType.put(Stat.sumOfSquares, "double");
- expectedType.put(Stat.stddev, "double");
- expectedType.put(Stat.calcdistinct, "long");
- expectedType.put(Stat.percentiles, "str");
-
- Map<Stat,String> localParasInput = new HashMap<>();
- localParasInput.put(Stat.min, "true");
- localParasInput.put(Stat.max, "true");
- localParasInput.put(Stat.missing, "true");
- localParasInput.put(Stat.sum, "true");
- localParasInput.put(Stat.count, "true");
- localParasInput.put(Stat.mean, "true");
- localParasInput.put(Stat.sumOfSquares, "true");
- localParasInput.put(Stat.stddev, "true");
- localParasInput.put(Stat.calcdistinct, "true");
- localParasInput.put(Stat.percentiles, "'90, 99'");
-
- // canary in the coal mine
- assertEquals("size of expectedStats doesn't match all known stats; " +
- "enum was updated w/o updating test?",
- expectedStats.size(), allStats.size());
- assertEquals("size of expectedType doesn't match all known stats; " +
- "enum was updated w/o updating test?",
- expectedType.size(), allStats.size());
-
- // whitebox test: explicitly ask for isShard=true with an individual stat
- for (Stat stat : expectedStats.keySet()) {
- EnumSet<Stat> distribDeps = stat.getDistribDeps();
-
- StringBuilder exclude = new StringBuilder();
- List<String> testParas = new ArrayList<String>(distribDeps.size() + 2);
- int calcdistinctFudge = 0;
-
- for (Stat perShardStat : distribDeps ){
- String key = perShardStat.toString();
- if (perShardStat.equals(Stat.calcdistinct)) {
- // this abomination breaks all the rules - uses a diff response key and triggers
- // the additional "distinctValues" stat
- key = "countDistinct";
- calcdistinctFudge++;
- testParas.add("count(" + kpre + "arr[@name='distinctValues']/*)=10");
- }
- testParas.add(kpre + expectedType.get(perShardStat) +
- "[@name='" + key + "'][.='" + expectedStats.get(perShardStat) + "']");
- // even if we go out of our way to exclude the dependent stats,
- // the shard should return them since they are a dependency for the requested stat
- if (!stat.equals(Stat.percentiles)){
- exclude.append(perShardStat + "=false ");
- }
- }
- testParas.add("count(" + kpre + "*)=" + (distribDeps.size() + calcdistinctFudge));
+ final List<ExpectedStat> expected = new ArrayList<ExpectedStat>(allStats.size());
+ ExpectedStat.createSimple(Stat.min, "true", "double", "0.0");
+ ExpectedStat.createSimple(Stat.max, "true", "double", "9.0");
+ ExpectedStat.createSimple(Stat.missing, "true", "long", "0");
+ ExpectedStat.createSimple(Stat.sum, "true", "double", String.valueOf(sum));
+ ExpectedStat.createSimple(Stat.count, "true", "long", String.valueOf(count));
+ ExpectedStat.createSimple(Stat.mean, "true", "double", String.valueOf(sum / count));
+ ExpectedStat.createSimple(Stat.sumOfSquares, "true", "double", String.valueOf(sumOfSquares));
+ ExpectedStat.createSimple(Stat.stddev, "true", "double", String.valueOf(stddev));
+ final String countDistinctXpath = kpre + "long[@name='countDistinct'][.='10']";
+ ExpectedStat.create(Stat.calcdistinct, "true", 2,
+ Arrays.asList("count(" + kpre + "arr[@name='distinctValues']/*)=10",
+ countDistinctXpath),
+ Collections.singletonList(countDistinctXpath));
+ final String percentileShardXpath = kpre + "str[@name='percentiles'][.='"
+ + Base64.byteArrayToBase64(tdigestBuf.array(), 0, tdigestBuf.array().length) + "']";
+ final String p90 = "" + tdigest.quantile(0.90D);
+ final String p99 = "" + tdigest.quantile(0.99D);
+ ExpectedStat.create(Stat.percentiles, "'90, 99'", 1,
+ Collections.singletonList(percentileShardXpath),
+ Arrays.asList("count(" + kpre + "lst[@name='percentiles']/*)=2",
+ kpre + "lst[@name='percentiles']/double[@name='90.0'][.="+p90+"]",
+ kpre + "lst[@name='percentiles']/double[@name='99.0'][.="+p99+"]"));
+ final String cardinalityShardXpath = kpre + "str[@name='cardinality'][.='"
+ + Base64.byteArrayToBase64(hllBytes, 0, hllBytes.length) + "']";
+ final String cardinalityXpath = kpre + "long[@name='cardinality'][.='10']";
+ ExpectedStat.create(Stat.cardinality, "true", 1,
+ Collections.singletonList(cardinalityShardXpath),
+ Collections.singletonList(cardinalityXpath));
+
+ // canary in the coal mine
+ assertEquals("num of ExpectedStat doesn't match all known stats; " +
+ "enum was updated w/o updating test?",
+ ExpectedStat.ALL.size(), allStats.size());
+
+ // whitebox test: explicitly ask for isShard=true with each individual stat
+ for (ExpectedStat expect : ExpectedStat.ALL.values()) {
+ Stat stat = expect.stat;
+
+ StringBuilder exclude = new StringBuilder();
+ List<String> testXpaths = new ArrayList<String>(5 + expect.perShardXpaths.size());
+ testXpaths.addAll(expect.perShardXpaths);
+
+ int numKeysExpected = 0;
+ EnumSet<Stat> distribDeps = stat.getDistribDeps();
+ for (Stat perShardDep : distribDeps) {
+ numKeysExpected += ExpectedStat.ALL.get(perShardDep).numResponseKeys;
+
+ // even if we go out of our way to exclude the dependent stats,
+ // the shard should return them since they are a dependency for the requested stat
+ if (!stat.equals(perShardDep)){
+ // NOTE: this only works because all the cases where there are distribDeps
+ // beyond a self dependency are simple true/false options
+ exclude.append(perShardDep + "=false ");
+ }
+ }
+ // we don't want to find anything we aren't expecting
+ testXpaths.add("count(" + kpre + "*)=" + numKeysExpected);
- assertQ("ask for only "+stat+", with isShard=true, and expect only deps: " + distribDeps,
- req("q", "*:*", "isShard", "true", "stats", "true",
- "stats.field", "{!key=k " + exclude + stat +"=" + localParasInput.get(stat) + "}a_i")
- , testParas.toArray(new String[testParas.size()])
- );
- }
-
- // test all the possible combinations (of all possible sizes) of stats params
- for (int numParams = 1; numParams <= allStats.size(); numParams++) {
- for (EnumSet<Stat> set : new StatSetCombinations(numParams, allStats)) {
-
- // EnumSets use natural ordering, we want to randomize the order of the params
- List<Stat> combo = new ArrayList<Stat>(set);
- Collections.shuffle(combo, random());
-
- StringBuilder paras = new StringBuilder("{!key=k ");
- List<String> testParas = new ArrayList<String>(numParams + 2);
-
- int calcdistinctFudge = 0;
- for (Stat stat : combo) {
- String key = stat.toString();
- if (stat.equals(Stat.calcdistinct)) {
- // this abomination breaks all the rules - uses a diff response key and triggers
- // the additional "distinctValues" stat
- key = "countDistinct";
- calcdistinctFudge++;
- testParas.add("count(" + kpre + "arr[@name='distinctValues']/*)=10");
- }
- paras.append(stat + "=" + localParasInput.get(stat)+ " ");
-
- if (!stat.equals(Stat.percentiles)){
- testParas.add(kpre + expectedType.get(stat) + "[@name='" + key + "'][.='" + expectedStats.get(stat) + "']");
- } else {
- testParas.add("count(" + kpre + "lst[@name='percentiles']/*)=2");
- String p90 = "" + tdigest.quantile(0.90D);
- String p99 = "" + tdigest.quantile(0.99D);
- testParas.add(kpre + "lst[@name='percentiles']/double[@name='90.0'][.="+p90+"]");
- testParas.add(kpre + "lst[@name='percentiles']/double[@name='99.0'][.="+p99+"]");
- }
- }
-
- paras.append("}a_i");
- testParas.add("count(" + kpre + "*)=" + (combo.size() + calcdistinctFudge));
-
- assertQ("ask for an get only: "+ combo,
- req("q","*:*", "stats", "true",
- "stats.field", paras.toString())
- , testParas.toArray(new String[testParas.size()])
- );
- }
- }
+ assertQ("ask for only "+stat+", with isShard=true, and expect only deps: " + distribDeps,
+ req("q", "*:*", "isShard", "true", "stats", "true",
+ "stats.field", "{!key=k " + exclude + stat +"=" + expect.input + "}a_i")
+ , testXpaths.toArray(new String[testXpaths.size()])
+ );
+ }
+
+ // test all the possible combinations (of all possible sizes) of stats params
+ for (int numParams = 1; numParams <= allStats.size(); numParams++) {
+ for (EnumSet<Stat> set : new StatSetCombinations(numParams, allStats)) {
+ // EnumSets use natural ordering, we want to randomize the order of the params
+ List<Stat> combo = new ArrayList<Stat>(set);
+ Collections.shuffle(combo, random());
+
+ StringBuilder paras = new StringBuilder("{!key=k ");
+ List<String> testXpaths = new ArrayList<String>(numParams + 5);
+
+ int numKeysExpected = 0;
+ for (Stat stat : combo) {
+ ExpectedStat expect = ExpectedStat.ALL.get(stat);
+
+ paras.append(stat + "=" + expect.input + " ");
+
+ numKeysExpected += expect.numResponseKeys;
+ testXpaths.addAll(expect.finalXpaths);
+ }
+
+ paras.append("}a_i");
+
+ // we don't want to find anything we aren't expecting
+ testXpaths.add("count(" + kpre + "*)=" + numKeysExpected);
+
+ assertQ("ask for and get only: "+ combo,
+ req("q","*:*", "stats", "true",
+ "stats.field", paras.toString())
+ , testXpaths.toArray(new String[testXpaths.size()])
+ );
+ }
+ }
}
// Test for Solr-6349
@@ -1436,6 +1551,285 @@ public class StatsComponentTest extends
}
}
+ /** Helper used in {@link #testCardinality} */
+ public static String cardinalityXpath(String key, int cardinality) {
+ return XPRE + "lst[@name='stats_fields']/lst[@name='" + key +
+ "']/long[@name='cardinality'][.='"+cardinality+"']";
+ }
+
+ /** @see #testHllOptions */
+ public void testCardinality() throws Exception {
+ SolrCore core = h.getCore();
+ // ensure we have the same hasher a_l would use
+ HashFunction hasher = HllOptions.parseHllOptions
+ (params("cardinality","true"), core.getLatestSchema().getField("a_l")).getHasher();
+
+ String[] baseParams = new String[] { "q","*:*", "stats","true", "indent","true", "rows","0" };
+ assertQ("empty cardinalities"
+ , req(params("stats.field","{!key=a cardinality=true}a_l",
+ "stats.field","{!key=pa cardinality=true}prehashed_a_l",
+ "stats.field","{!key=b cardinality=true}b_l",
+ "stats.field","{!key=c cardinality=true}c_l"),
+ baseParams)
+ , cardinalityXpath("a", 0)
+ , cardinalityXpath("pa", 0)
+ , cardinalityXpath("b", 0)
+ , cardinalityXpath("c", 0)
+ );
+
+ int id = 0;
+ // add trivial docs to test basic cardinality
+ for (int i = 0; i < 100; i++) {
+ // add the same values multiple times (diff docs)
+ for (int j =0; j < 5; j++) {
+ ++id;
+ assertU(adoc("id", ""+id,
+ "a_l", ""+i, "prehashed_a_l", ""+hasher.hashLong((long)i).asLong(),
+ "b_l", ""+(i % 7), "c_l", ""+id));
+ }
+ }
+ assertU(commit());
+
+ assertQ("various cardinalities"
+ , req(params("stats.field","{!key=a cardinality=true}a_l",
+ "stats.field","{!key=pa hllPreHashed=true cardinality=true}prehashed_a_l",
+ "stats.field","{!key=b cardinality=true}b_l",
+ "stats.field","{!key=c cardinality=true}c_l"),
+ baseParams)
+ , cardinalityXpath("a", 100)
+ , cardinalityXpath("pa", 100)
+ , cardinalityXpath("b", 7)
+ , cardinalityXpath("c", 500)
+ );
+
+ // various ways of explicitly saying "don't bother to compute cardinality"
+ for (SolrParams p : new SolrParams[] {
+ params("stats.field","{!key=a min=true cardinality=false}a_l"),
+ params("stats.field","{!key=a min=true cardinality=$doit}a_l", "doit", "false"),
+ params("stats.field","{!key=a min=true cardinality=$doit}a_l"), // missing doit param
+ // other tuning options shouldn't change things
+ params("stats.field","{!key=a min=true hllPreHashed=true cardinality=false}a_l"),
+ params("stats.field","{!key=a min=true hllRegwidth=4 cardinality=$doit}a_l", "doit", "false"),
+ params("stats.field","{!key=a min=true hllLog2m=18 cardinality=$doit}a_l"), // missing doit param
+ }) {
+ assertQ("min w/cardinality explicitly disabled", req(p, baseParams),
+ "count(//lst[@name='stats_fields']/lst[@name='a']/double[@name='min'])=1",
+ "count(//lst[@name='stats_fields']/lst[@name='a']/long[@name='cardinality'])=0");
+ }
+ }
+
+ /**
+ * whitebox test that HLL Option parsing does the right thing
+ * @see #testCardinality
+ * @see #testHllOptionsErrors
+ */
+ public void testHllOptions() throws Exception {
+ SolrCore core = h.getCore();
+
+ SchemaField field_l = core.getLatestSchema().getField("field_l");
+ SchemaField field_d = core.getLatestSchema().getField("field_d");
+ SchemaField field_dt = core.getLatestSchema().getField("field_dt");
+ SchemaField field_s = core.getLatestSchema().getField("field_s");
+ SchemaField field_i = core.getLatestSchema().getField("field_i");
+ SchemaField field_f = core.getLatestSchema().getField("field_f");
+ SchemaField field_severity = core.getLatestSchema().getField("severity");
+
+ // simple cases that shouldn't use HLL
+ assertNull(HllOptions.parseHllOptions(params(), field_l));
+ assertNull(HllOptions.parseHllOptions(params("cardinality","false"), field_l));
+
+ // sanity check, future proof against the HLL library changing stuff on us
+ assertEquals("HLL Changed definition min for log2m, " +
+ "need to note in upgrade instructions and maybe adjust accuracy hueristic",
+ 4, HLL.MINIMUM_LOG2M_PARAM);
+ // NOTE: https://github.com/aggregateknowledge/java-hll/issues/14
+ assertEquals("HLL Changed definition max for log2m, " +
+ "need to note in upgrade instructions and maybe adjust accuracy hueristic",
+ 30, HLL.MAXIMUM_LOG2M_PARAM);
+ assertEquals("HLL Changed definition min for regwidth, " +
+ "need to note in upgrade instructions and probably adjust hueristic",
+ 1, HLL.MINIMUM_REGWIDTH_PARAM);
+ assertEquals("HLL Changed definition max for regwidth, " +
+ "need to note in upgrade instructions and probably adjust hueristic",
+ 8, HLL.MAXIMUM_REGWIDTH_PARAM);
+
+ // all of these should produce equivalent HLLOptions (Long, Double, or String using defaults)
+ SolrParams[] longDefaultParams = new SolrParams[] {
+ // basic usage
+ params("cardinality","true"),
+ params("cardinality","0.33"),
+
+ // expert level options
+ params("cardinality","true", "hllLog2m","13"),
+ params("cardinality","true", "hllRegwidth","6"),
+ params("cardinality","true", "hllPreHash","false"),
+ params("cardinality","true", "hllLog2m","13", "hllRegwidth","6", "hllPreHash", "false"),
+
+ // explicit hllLog2m should override numeric arg
+ params("cardinality","1.0", "hllLog2m","13", "hllRegwidth","6"),
+ params("cardinality","0.0", "hllLog2m","13", "hllRegwidth","6", "hllPreHash","false")
+ };
+ for (SchemaField field : new SchemaField[] { field_l, field_d, field_dt, field_s }) {
+ final String f = field.getName();
+ for (SolrParams p : longDefaultParams) {
+ HllOptions opts = HllOptions.parseHllOptions(p, field);
+ assertEquals(f + " long defaults: " + p, 13, opts.getLog2m());
+ assertEquals(f + " long defaults: " + p, 6, opts.getRegwidth());
+ assertNotNull(f + " long defaults: " + p, opts.getHasher());
+ }
+
+ // non defaults: lower/upper accuracy bounds should give min/max log2m & adjusted regwidth
+ HllOptions optsMin = HllOptions.parseHllOptions(params("cardinality","0"), field);
+ assertEquals(f + " min log2m", HLL.MINIMUM_LOG2M_PARAM, optsMin.getLog2m());
+ assertEquals(f + " min regwidth", 5, optsMin.getRegwidth()); // lowest hueristic for 64bit
+
+ HllOptions optsMax = HllOptions.parseHllOptions(params("cardinality","1"), field);
+ assertEquals(f + " max log2m", HLL.MAXIMUM_LOG2M_PARAM, optsMax.getLog2m());
+ assertEquals(f + " max regwidth", HLL.MAXIMUM_REGWIDTH_PARAM, optsMax.getRegwidth());
+
+ }
+
+ // all of these should produce equivalent HLLOptions (Int, Float, or ValueSource using defaults)
+ SolrParams[] intDefaultParams = new SolrParams[] {
+ // basic usage
+ params("cardinality","true"),
+ params("cardinality","0.33"),
+
+ // expert level options
+ params("cardinality","true", "hllLog2m","13"),
+ params("cardinality","true", "hllRegwidth","5"),
+ params("cardinality","true", "hllPreHash","false"),
+ params("cardinality","true", "hllLog2m","13", "hllRegwidth","5", "hllPreHash", "false"),
+
+ // explicit hllLog2m & hllRegwidth should override heuristic float arg
+ params("cardinality","1.0", "hllLog2m","13", "hllRegwidth","5"),
+ params("cardinality","0.0", "hllLog2m","13", "hllRegwidth","5", "hllPreHash","false")
+ };
+ for (SchemaField field : new SchemaField[] { field_i, field_f, field_severity, null }) {
+ final String f = null == field ? "(func)" : field.getName();
+ for (SolrParams p : intDefaultParams) {
+ HllOptions opts = HllOptions.parseHllOptions(p, field);
+ assertEquals(f + " int defaults: " + p, 13, opts.getLog2m());
+ assertEquals(f + " int defaults: " + p, 5, opts.getRegwidth());
+ assertNotNull(f + " int defaults: " + p, opts.getHasher());
+ }
+
+ // non defaults: lower/upper accuracy bounds should give min/max log2m & adjusted regwidth
+ HllOptions optsMin = HllOptions.parseHllOptions(params("cardinality","0"), field);
+ assertEquals(f + " min log2m", HLL.MINIMUM_LOG2M_PARAM, optsMin.getLog2m());
+ assertEquals(f + " min regwidth", 4, optsMin.getRegwidth()); // lowest hueristic for 32bit
+
+ HllOptions optsMax = HllOptions.parseHllOptions(params("cardinality","1"), field);
+ assertEquals(f + " max log2m", HLL.MAXIMUM_LOG2M_PARAM, optsMax.getLog2m());
+ assertEquals(f + " max regwidth", HLL.MAXIMUM_REGWIDTH_PARAM, optsMax.getRegwidth());
+
+ }
+
+ // basic pre-hashed arg check specifically for long fields
+ assertNotNull(HllOptions.parseHllOptions(params("cardinality","true"), field_l).getHasher());
+ assertNotNull(HllOptions.parseHllOptions(params("cardinality","true", "hllPreHashed", "false"),
+ field_l).getHasher());
+ assertNull(HllOptions.parseHllOptions(params("cardinality","true", "hllPreHashed", "true"),
+ field_l).getHasher());
+
+ }
+
+ /**
+ * Test user input errors (split into its own test to isolate ignored exceptions)
+ * @see #testCardinality
+ * @see #testHllOptions
+ */
+ public void testHllOptionsErrors() throws Exception {
+ String[] baseParams = new String[] { "q","*:*", "stats","true", "indent","true", "rows","0" };
+ SolrCore core = h.getCore();
+ SchemaField foo_s = core.getLatestSchema().getField("foo_s");
+ SchemaField foo_i = core.getLatestSchema().getField("foo_i");
+
+ ignoreException("hllPreHashed");
+ for (SchemaField field : new SchemaField[] { foo_s, foo_i }) {
+ // whitebox - field
+ try {
+ HllOptions.parseHllOptions(params("cardinality","true", "hllPreHashed", "true"), field);
+ fail("hllPreHashed should have failed for " + field.getName());
+ } catch (SolrException e) {
+ assertTrue("MSG: " + e.getMessage(),
+ e.getMessage().contains("hllPreHashed is only supported with Long"));
+ }
+ // blackbox - field
+ assertQEx("hllPreHashed " + field.getName(), "hllPreHashed is only supported with Long",
+ req(params("stats.field","{!cardinality=true hllPreHashed=true}" + field.getName()),
+ baseParams),
+ ErrorCode.BAD_REQUEST);
+ }
+ // whitebox - function
+ try {
+ HllOptions.parseHllOptions(params("cardinality","true", "hllPreHashed", "true"), null);
+ fail("hllPreHashed should have failed for function");
+ } catch (SolrException e) {
+ assertTrue("MSG: " + e.getMessage(),
+ e.getMessage().contains("hllPreHashed is only supported with Long"));
+ }
+ // blackbox - function
+ assertQEx("hllPreHashed function", "hllPreHashed is only supported with Long",
+ req(params("stats.field","{!func cardinality=true hllPreHashed=true}sum(foo_i,foo_l)"),
+ baseParams),
+ ErrorCode.BAD_REQUEST);
+
+
+ ignoreException("accuracy");
+ for (String invalid : new String[] { "-1", "1.1", "100" }) {
+ // whitebox
+ try {
+ Object trash = HllOptions.parseHllOptions(params("cardinality",invalid), foo_s);
+ fail("Should have failed: " + invalid);
+ } catch (SolrException e) {
+ assertTrue("MSG: " + e.getMessage(),
+ e.getMessage().contains("number between 0 and 1"));
+ }
+ // blackbox
+ assertQEx("cardinality="+invalid, "number between 0 and 1",
+ req(params("stats.field","{!cardinality="+invalid+"}foo_s"),
+ baseParams),
+ ErrorCode.BAD_REQUEST);
+ }
+
+ ignoreException("hllLog2m must be");
+ for (int invalid : new int[] { HLL.MINIMUM_LOG2M_PARAM-1, HLL.MAXIMUM_LOG2M_PARAM+11 }) {
+ // whitebox
+ try {
+ Object trash = HllOptions.parseHllOptions(params("cardinality","true",
+ "hllLog2m", ""+invalid), foo_s);
+ fail("Should have failed: " + invalid);
+ } catch (SolrException e) {
+ assertTrue("MSG: " + e.getMessage(),
+ e.getMessage().contains("hllLog2m must be"));
+ }
+ // blackbox
+ assertQEx("hllLog2m="+invalid, "hllLog2m must be",
+ req(params("stats.field","{!cardinality=true hllLog2m="+invalid+"}foo_s"),
+ baseParams),
+ ErrorCode.BAD_REQUEST);
+ }
+
+ ignoreException("hllRegwidth must be");
+ for (int invalid : new int[] { HLL.MINIMUM_REGWIDTH_PARAM-1, HLL.MAXIMUM_REGWIDTH_PARAM+1 }) {
+ // whitebox
+ try {
+ Object trash = HllOptions.parseHllOptions(params("cardinality","true",
+ "hllRegwidth", ""+invalid), foo_s);
+ fail("Should have failed: " + invalid);
+ } catch (SolrException e) {
+ assertTrue("MSG: " + e.getMessage(),
+ e.getMessage().contains("hllRegwidth must be"));
+ }
+ // blackbox
+ assertQEx("hllRegwidth="+invalid, "hllRegwidth must be",
+ req(params("stats.field","{!cardinality=true hllRegwidth="+invalid+"}foo_s"),
+ baseParams),
+ ErrorCode.BAD_REQUEST);
+ }
+ }
+
// simple percentiles test
public void testPercentiles() throws Exception {
@@ -1553,4 +1947,5 @@ public class StatsComponentTest extends
};
}
}
+
}
Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/component/TestDistributedStatsComponentCardinality.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/component/TestDistributedStatsComponentCardinality.java?rev=1678245&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/component/TestDistributedStatsComponentCardinality.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/component/TestDistributedStatsComponentCardinality.java Thu May 7 17:58:58 2015
@@ -0,0 +1,284 @@
+package org.apache.solr.handler.component;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.LuceneTestCase.Slow;
+
+import org.apache.solr.BaseDistributedSearchTestCase;
+import org.apache.solr.client.solrj.response.FieldStatsInfo;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.params.ModifiableSolrParams;
+
+import net.agkn.hll.HLL;
+import com.google.common.hash.Hashing;
+import com.google.common.hash.HashFunction;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+ /**
+  * Distributed test of the <code>cardinality</code> option of <code>stats.field</code>.
+  * Every doc indexed has a unique value in each field tested, so the number of docs
+  * matching a query is the true cardinality that the HLL based estimates are compared against.
+  */
+ @Slow
+ public class TestDistributedStatsComponentCardinality extends BaseDistributedSearchTestCase {
+
+   public static final Logger log
+     = LoggerFactory.getLogger(TestDistributedStatsComponentCardinality.class);
+
+   /** used at index time to compute the values of the "_prehashed_l" fields */
+   final static HashFunction HASHER = Hashing.murmur3_128();
+
+   /** difference between the long_l values of two consecutively indexed docs */
+   final static long BIG_PRIME = 982451653L;
+
+   // NOTE(review): nothing in this class reads these two bounds, the constructor picks
+   // NUM_DOCS using literal values -- confirm whether they are still needed
+   final static int MIN_NUM_DOCS = 10000;
+   final static int MAX_NUM_DOCS = MIN_NUM_DOCS * 2;
+
+   /**
+    * How many theoretical standard errors an estimate may be off by before the test fails.
+    * Hundreds of estimates are checked per run, so a bound of roughly one standard error
+    * would regularly be exceeded by chance alone.
+    */
+   private final static double MAX_STD_ERRORS = 5.0D;
+
+   /** fields to compute cardinality on, each one also has a "_prehashed_l" variant */
+   final static List<String> STAT_FIELDS =
+     Collections.unmodifiableList(Arrays.asList( "int_i", "long_l", "string_s" ));
+
+   /** number of docs indexed, ids run from 1 to NUM_DOCS */
+   final int NUM_DOCS;
+   /** largest long_l value, used by the first doc indexed */
+   final long MAX_LONG;
+   /** smallest long_l value, used by the last doc indexed */
+   final long MIN_LONG;
+
+   public TestDistributedStatsComponentCardinality() {
+     super();
+     // we want some randomness in the shard number, but we don't want multiple iterations
+     fixShardCount(TEST_NIGHTLY ? 7 : random().nextInt(3) + 1);
+
+     handle.put("maxScore", SKIPVAL);
+     NUM_DOCS = TestUtil.nextInt(random(), 10000, 15000);
+     MAX_LONG = TestUtil.nextLong(random(), 0, NUM_DOCS * BIG_PRIME);
+     MIN_LONG = MAX_LONG - (((long)NUM_DOCS-1) * BIG_PRIME);
+   }
+
+   /** CAUTION: this builds a very large index */
+   public void buildIndex() throws Exception {
+     log.info("Building an index of {} docs", NUM_DOCS);
+
+     // we want a big spread in the long values we use, decrement by BIG_PRIME as we index
+     long longValue = MAX_LONG;
+
+     for (int i = 1; i <= NUM_DOCS; i++) {
+       // with these values, we know that every doc indexed has a unique value in all of the
+       // fields we will compute cardinality against.
+       // which means the number of docs matching a query is the true cardinality for each field
+
+       final String strValue = "s"+longValue;
+       indexDoc(sdoc("id","" + i,
+                     "int_i", ""+i,
+                     "int_i_prehashed_l", ""+HASHER.hashInt(i).asLong(),
+                     "long_l", ""+longValue,
+                     "long_l_prehashed_l", ""+HASHER.hashLong(longValue).asLong(),
+                     "string_s", strValue,
+                     // NOTE: renamed hashUnencodedChars starting with guava 15
+                     "string_s_prehashed_l", ""+HASHER.hashString(strValue).asLong()));
+
+       longValue -= BIG_PRIME;
+     }
+
+     commit();
+   }
+
+   /**
+    * Builds the index, then runs two rounds of randomized range queries:
+    * the first uses explicit log2m/regwidth options and checks that hashed and prehashed
+    * fields agree and that each estimate is within the tolerated relative error;
+    * the second uses low and high accuracy options and checks that the high accuracy
+    * estimate is never further from the truth than the low accuracy estimate.
+    */
+   public void test() throws Exception {
+     buildIndex();
+
+     { // simple sanity checks - don't leak variables
+       QueryResponse rsp = query(params("rows", "0", "q", "id:42"));
+       assertEquals(1, rsp.getResults().getNumFound());
+
+       rsp = query(params("rows", "0", "q", "*:*",
+                          "stats","true", "stats.field", "{!min=true max=true}long_l"));
+       assertEquals(NUM_DOCS, rsp.getResults().getNumFound());
+       assertEquals(MIN_LONG, Math.round((double) rsp.getFieldStatsInfo().get("long_l").getMin()));
+       assertEquals(MAX_LONG, Math.round((double) rsp.getFieldStatsInfo().get("long_l").getMax()));
+     }
+
+     final int NUM_QUERIES = atLeast(100);
+
+     // Some Randomized queries with randomized log2m and max regwidth
+     for (int i = 0; i < NUM_QUERIES; i++) {
+
+       // testing shows that on random data, at the size we're dealing with,
+       // MINIMUM_LOG2M_PARAM is just too absurdly small to give anything remotely close to
+       // the theoretically expected relative error.
+       //
+       // So we have to use a slightly higher lower bound on what log2m values we randomly test
+       final int log2m = TestUtil.nextInt(random(),
+                                          2 + HLL.MINIMUM_LOG2M_PARAM,
+                                          HLL.MAXIMUM_LOG2M_PARAM);
+
+       // use max regwidth to try and prevent hash collisions from introducing problems
+       final int regwidth = HLL.MAXIMUM_REGWIDTH_PARAM;
+
+       final int lowId = TestUtil.nextInt(random(), 1, NUM_DOCS-2000);
+       final int highId = TestUtil.nextInt(random(), lowId+1000, NUM_DOCS);
+       final int numMatches = 1+highId-lowId;
+
+       SolrParams p = buildCardinalityQ(lowId, highId, log2m, regwidth);
+       QueryResponse rsp = query(p);
+       assertEquals("sanity check num matches, p="+p, numMatches, rsp.getResults().getNumFound());
+
+       Map<String,FieldStatsInfo> stats = rsp.getFieldStatsInfo();
+
+       for (String f : STAT_FIELDS) {
+         // regardless of log2m and regwidth, the estimated cardinality of the
+         // hashed vs prehashed values should be exactly the same for each field
+
+         assertEquals(f + ": hashed vs prehashed, real="+ numMatches + ", p=" + p,
+                      stats.get(f).getCardinality().longValue(),
+                      stats.get(f+"_prehashed_l").getCardinality().longValue());
+       }
+
+       for (String f : STAT_FIELDS) {
+         // check the relative error of the estimate returned against the known truth
+
+         final double relErr = expectedRelativeError(log2m);
+         final long estimate = stats.get(f).getCardinality().longValue();
+         // divide as doubles: long/int division truncates, so any error smaller than
+         // 100% would be seen as 0 and the assertion could (almost) never fail
+         final double actualRelErr = Math.abs(numMatches - estimate) / (double) numMatches;
+         assertTrue(f + ": relativeErr="+relErr+", estimate="+estimate+", real="+numMatches+", p=" + p,
+                    actualRelErr < relErr);
+       }
+     }
+
+     // Some Randomized queries with both low and high accuracy options
+     for (int i = 0; i < NUM_QUERIES; i++) {
+
+       final int lowId = TestUtil.nextInt(random(), 1, NUM_DOCS-2000);
+       final int highId = TestUtil.nextInt(random(), lowId+1000, NUM_DOCS);
+       final int numMatches = 1+highId-lowId;
+
+       // WTF? - https://github.com/aggregateknowledge/java-hll/issues/15
+       //
+       // apparently we can't rely on estimates always being more accurate with higher log2m values?
+       // so for now, just try testing accuracy values that differ by at least 0.5
+       //
+       // (that should give us a significant enough log2m diff that the "highAccuracy" is always
+       // more accurate -- if not, then the entire premise of the float value is fundamentally bogus)
+       //
+       final double lowAccuracy = random().nextDouble() / 2;
+       final double highAccuracy = Math.min(1.0D, lowAccuracy + 0.5D);
+
+       SolrParams p = buildCardinalityQ(lowId, highId, lowAccuracy, highAccuracy);
+       QueryResponse rsp = query(p);
+       assertEquals("sanity check num matches, p="+p, numMatches, rsp.getResults().getNumFound());
+
+       Map<String,FieldStatsInfo> stats = rsp.getFieldStatsInfo();
+
+       // can't use STAT_FIELDS here ...
+       //
+       // heuristic differences for regwidth on 32 bit values mean we get differences
+       // between estimates for the normal field vs the prehashed (long) field
+       //
+       // so we settle for only testing things where the regwidth is consistent
+       // w/the prehashed long...
+       for (String f : new String[] { "long_l", "string_s" }) {
+
+         // regardless of accuracy, the estimated cardinality of the
+         // hashed vs prehashed values should be exactly the same for each field
+
+         assertEquals(f + ": hashed vs prehashed (low), real="+ numMatches + ", p=" + p,
+                      stats.get("low_"+f).getCardinality().longValue(),
+                      stats.get("low_"+f+"_prehashed_l").getCardinality().longValue());
+         assertEquals(f + ": hashed vs prehashed (high), real="+ numMatches + ", p=" + p,
+                      stats.get("high_"+f).getCardinality().longValue(),
+                      stats.get("high_"+f+"_prehashed_l").getCardinality().longValue());
+       }
+
+       for (String f : STAT_FIELDS) {
+         for (String ff : new String[] { f, f+"_prehashed_l"}) {
+           // for both the prehashed and regular fields, the high accuracy option
+           // should always produce an estimate at least as good as the low accuracy option
+
+           long poorEst = stats.get("low_"+ff).getCardinality();
+           long goodEst = stats.get("high_"+ff).getCardinality();
+           assertTrue(ff + ": goodEst="+goodEst+", poorEst="+poorEst+", real="+numMatches+", p=" + p,
+                      Math.abs(numMatches - goodEst) <= Math.abs(numMatches - poorEst));
+         }
+       }
+     }
+   }
+
+   /**
+    * Returns the max relative error tolerated for an estimate computed with the specified
+    * log2m: {@link #MAX_STD_ERRORS} times the theoretical standard error of the HLL algorithm.
+    */
+   private static double expectedRelativeError(final int log2m) {
+     // shift a long, not an int, so that a log2m of 31 or more can't overflow
+     final long m = 1L << log2m;
+     // theoretical standard error is 1.04D / sqrt(m)
+     return MAX_STD_ERRORS * 1.04D / Math.sqrt(m);
+   }
+
+   /**
+    * Helper utility for building up a set of query params.
+    *
+    * The main query is a simple range query against the id field (using lowId TO highId).
+    * 2 stats.field params are generated for every field in {@link #STAT_FIELDS} --
+    * both with and w/o a prehashed_l suffix -- using the specified log2m and regwidth.
+    *
+    * The response keys will be the full field names
+    */
+   private static SolrParams buildCardinalityQ(final int lowId,
+                                               final int highId,
+                                               final int log2m,
+                                               final int regwidth) {
+     ModifiableSolrParams p = params("q", "id:["+lowId+" TO "+highId+"]",
+                                     "rows", "0", "stats", "true");
+     final String prefix = "{!cardinality=true hllLog2m="+log2m+" hllRegwidth="+regwidth;
+     for (String f : STAT_FIELDS) {
+       p.add("stats.field", prefix+"}"+f);
+       p.add("stats.field", prefix+" hllPreHashed=true}"+f+"_prehashed_l");
+     }
+     return p;
+   }
+
+   /**
+    * Helper utility for building up a set of query params.
+    *
+    * The main query is a simple range query against the id field (using lowId TO highId).
+    * 4 stats.field params are generated for every field in {@link #STAT_FIELDS} --
+    * both with and w/o a prehashed_l suffix, and using both the low and high accuracy values
+    *
+    * The response keys will be the full field names with either a "low_" or "high_" prefix
+    */
+   private static SolrParams buildCardinalityQ(final int lowId,
+                                               final int highId,
+                                               final double lowAccuracy,
+                                               final double highAccuracy) {
+     ModifiableSolrParams p = params("q", "id:["+lowId+" TO "+highId+"]",
+                                     "rows", "0", "stats", "true");
+     final String[] prefixes = new String[] {
+       "{!cardinality=" + lowAccuracy + " key=low_",
+       "{!cardinality=" + highAccuracy + " key=high_"
+     };
+
+     for (String f : STAT_FIELDS) {
+       for (String prefix : prefixes) {
+         p.add("stats.field", prefix+f+"}"+f);
+         p.add("stats.field", prefix+f+"_prehashed_l hllPreHashed=true}"+f+"_prehashed_l");
+       }
+     }
+     return p;
+   }
+ }
Added: lucene/dev/trunk/solr/licenses/fastutil-6.5.11.jar.sha1
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/licenses/fastutil-6.5.11.jar.sha1?rev=1678245&view=auto
==============================================================================
--- lucene/dev/trunk/solr/licenses/fastutil-6.5.11.jar.sha1 (added)
+++ lucene/dev/trunk/solr/licenses/fastutil-6.5.11.jar.sha1 Thu May 7 17:58:58 2015
@@ -0,0 +1 @@
+403289e76a91394944ded6056095bdf52b457249