You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jb...@apache.org on 2018/08/13 19:28:26 UTC
lucene-solr:branch_7x: SOLR-12660: Add outliers Stream Evaluator to
support outlier detection
Repository: lucene-solr
Updated Branches:
refs/heads/branch_7x 0e5abcede -> bcf3a5d18
SOLR-12660: Add outliers Stream Evaluator to support outlier detection
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/bcf3a5d1
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/bcf3a5d1
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/bcf3a5d1
Branch: refs/heads/branch_7x
Commit: bcf3a5d187299f18cc651c16732fe3a44a9da557
Parents: 0e5abce
Author: Joel Bernstein <jb...@apache.org>
Authored: Mon Aug 13 15:21:16 2018 -0400
Committer: Joel Bernstein <jb...@apache.org>
Committed: Mon Aug 13 15:25:37 2018 -0400
----------------------------------------------------------------------
.../org/apache/solr/client/solrj/io/Lang.java | 1 +
.../client/solrj/io/eval/OutliersEvaluator.java | 142 +++++++++++++++++++
.../solrj/io/stream/MathExpressionTest.java | 53 +++++++
3 files changed, 196 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bcf3a5d1/solr/solrj/src/java/org/apache/solr/client/solrj/io/Lang.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/Lang.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/Lang.java
index cedf062..c6485bb 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/Lang.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/Lang.java
@@ -248,6 +248,7 @@ public class Lang {
.withFunctionName("setValue", SetValueEvaluator.class)
.withFunctionName("knnRegress", KnnRegressionEvaluator.class)
.withFunctionName("gaussfit", GaussFitEvaluator.class)
+ .withFunctionName("outliers", OutliersEvaluator.class)
// Boolean Stream Evaluators
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bcf3a5d1/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/OutliersEvaluator.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/OutliersEvaluator.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/OutliersEvaluator.java
new file mode 100644
index 0000000..85d298f
--- /dev/null
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/OutliersEvaluator.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.client.solrj.io.eval;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.commons.math3.distribution.IntegerDistribution;
+import org.apache.commons.math3.distribution.AbstractRealDistribution;
+import org.apache.solr.client.solrj.io.stream.expr.StreamExpression;
+import org.apache.solr.client.solrj.io.stream.expr.StreamFactory;
+import org.apache.solr.client.solrj.io.Tuple;
+
+/**
+ * Stream evaluator behind the {@code outliers} function. It scores every value of a numeric
+ * array against a probability distribution and returns the tuples of the values whose
+ * cumulative probability is at or below a low threshold or at or above a high threshold.
+ *
+ * <p>Parameters, in order:
+ * <ol>
+ *   <li>an {@link IntegerDistribution} or {@link AbstractRealDistribution};</li>
+ *   <li>the list of numbers to test;</li>
+ *   <li>the low threshold (a negative number disables the low check);</li>
+ *   <li>the high threshold (a negative number disables the high check);</li>
+ *   <li>optionally, a list of tuples paired by index with the numbers. When it is omitted, an
+ *       empty tuple is created for every number.</li>
+ * </ol>
+ *
+ * <p>The tuple of each outlier is annotated in place with {@code lowOutlier} /
+ * {@code lowOutlierValue} and/or {@code highOutlier} / {@code highOutlierValue}, plus the
+ * cumulative probability under the key {@code cumulativeProbablity}. The misspelling of that
+ * key is kept on purpose because existing consumers read it.
+ */
+public class OutliersEvaluator extends RecursiveObjectEvaluator implements ManyValueWorker {
+  protected static final long serialVersionUID = 1L;
+
+  public OutliersEvaluator(StreamExpression expression, StreamFactory factory) throws IOException {
+    super(expression, factory);
+  }
+
+  /**
+   * Finds the outliers in the numeric array.
+   *
+   * @param values the evaluated parameters described in the class documentation
+   * @return a {@code List<Tuple>} holding one annotated tuple per outlier, in input order
+   * @throws IOException if fewer than four parameters are given, if a parameter has the wrong
+   *     type, or if the optional tuple list is shorter than the numeric array
+   */
+  @Override
+  public Object doWork(Object... values) throws IOException {
+
+    if (values.length < 4) {
+      throw new IOException("The outliers function requires 4 parameters");
+    }
+
+    Object dist = values[0];
+
+    if (!(values[1] instanceof List)) {
+      throw new IOException("The second parameter of the outliers function is the numeric array to be tested for outliers.");
+    }
+    @SuppressWarnings("unchecked") // element type cannot be verified at runtime
+    List<Number> vec = (List<Number>) values[1];
+
+    if (!(values[2] instanceof Number)) {
+      throw new IOException("The third parameter of the outliers function is a number for the low outlier threshold.");
+    }
+    double low = ((Number) values[2]).doubleValue();
+
+    if (!(values[3] instanceof Number)) {
+      throw new IOException("The fourth parameter of the outliers function is a number for the high outlier threshold");
+    }
+    double hi = ((Number) values[3]).doubleValue();
+
+    List<Tuple> tuples;
+    // "> 4" rather than "== 5": a supplied fifth parameter is never silently dropped.
+    if (values.length > 4) {
+      if (!(values[4] instanceof List)) {
+        throw new IOException("The optional fifth parameter of the outliers function is an array of Tuples that are paired with the numeric array of values to be tested.");
+      }
+      @SuppressWarnings("unchecked") // element type cannot be verified at runtime
+      List<Tuple> paired = (List<Tuple>) values[4];
+      // Fail up front with a clear message instead of an IndexOutOfBoundsException in the
+      // middle of the loop, after some of the caller's tuples have already been modified.
+      if (paired.size() < vec.size()) {
+        throw new IOException("The optional fifth parameter of the outliers function must contain a Tuple for each of the "
+            + vec.size() + " values to be tested, but only " + paired.size() + " were provided.");
+      }
+      tuples = paired;
+    } else {
+      tuples = new ArrayList<>(vec.size());
+      for (int i = 0; i < vec.size(); i++) {
+        tuples.add(new Tuple(new HashMap<>()));
+      }
+    }
+
+    // Checked last so the parameter error messages keep their original precedence.
+    if (!(dist instanceof IntegerDistribution) && !(dist instanceof AbstractRealDistribution)) {
+      throw new IOException("The first parameter of the outliers function must be a real or integer probability distribution");
+    }
+
+    List<Tuple> outliers = new ArrayList<>();
+
+    for (int i = 0; i < vec.size(); i++) {
+      Number n = vec.get(i);
+      Tuple t = tuples.get(i);
+
+      double cumProb = cumulativeProbability(dist, n);
+      boolean lowOutlier = low >= 0 && cumProb <= low;
+      boolean highOutlier = hi >= 0 && cumProb >= hi;
+
+      if (lowOutlier) {
+        t.put("lowOutlier", true);
+        t.put("lowOutlierValue", n);
+      }
+
+      if (highOutlier) {
+        t.put("highOutlier", true);
+        t.put("highOutlierValue", n);
+      }
+
+      // A value can satisfy both thresholds (for example low >= hi); report its tuple once.
+      if (lowOutlier || highOutlier) {
+        t.put("cumulativeProbablity", cumProb);
+        outliers.add(t);
+      }
+    }
+
+    return outliers;
+  }
+
+  /**
+   * Returns P(X &lt;= n) for a distribution already known to be an {@link IntegerDistribution}
+   * or an {@link AbstractRealDistribution}. Integer distributions take precedence and are
+   * evaluated at {@code n.intValue()}.
+   */
+  private static double cumulativeProbability(Object dist, Number n) {
+    if (dist instanceof IntegerDistribution) {
+      return ((IntegerDistribution) dist).cumulativeProbability(n.intValue());
+    }
+    return ((AbstractRealDistribution) dist).cumulativeProbability(n.doubleValue());
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bcf3a5d1/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/MathExpressionTest.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/MathExpressionTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/MathExpressionTest.java
index a27ad2d..be51050 100644
--- a/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/MathExpressionTest.java
+++ b/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/MathExpressionTest.java
@@ -3272,6 +3272,59 @@ public class MathExpressionTest extends SolrCloudTestCase {
assertEquals(out1.get(7).doubleValue(), 61.5, 0.0001);
}
+
+ @Test
+ public void testOutliers() throws Exception {
+   // "e" pairs the tested values with the tuples in "a"; "f" lets the evaluator create
+   // empty tuples. With normalDistribution(100, 5), only 110 (high) and 90 (low) fall
+   // outside the .05 / .95 cumulative-probability thresholds.
+   String cexpr = "let(echo=true," +
+       " a=list(tuple(id=0.0), tuple(id=1), tuple(id=2), tuple(id=3)), " +
+       " b=normalDistribution(100, 5)," +
+       " d=array(100, 110, 90, 99), " +
+       " e=outliers(b, d, .05, .95, a)," +
+       " f=outliers(b, d, .05, .95))";
+
+   ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
+   paramsLoc.set("expr", cexpr);
+   paramsLoc.set("qt", "/stream");
+   String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS;
+   TupleStream solrStream = new SolrStream(url, paramsLoc);
+   StreamContext context = new StreamContext();
+   solrStream.setStreamContext(context);
+   List<Tuple> tuples = getTuples(solrStream);
+   assertEquals(1, tuples.size());
+
+   @SuppressWarnings("unchecked")
+   List<Map> withIds = (List<Map>) tuples.get(0).get("e");
+   assertEquals(2, withIds.size());
+   assertOutlier(withIds.get(0), "1", "high", 110.0, 0.9772498680518208);
+   assertOutlier(withIds.get(1), "2", "low", 90.0, 0.022750131948179167);
+
+   @SuppressWarnings("unchecked")
+   List<Map> withoutIds = (List<Map>) tuples.get(0).get("f");
+   assertEquals(2, withoutIds.size());
+   // No tuples were supplied, so the generated ones must not carry an id.
+   assertOutlier(withoutIds.get(0), null, "high", 110.0, 0.9772498680518208);
+   assertOutlier(withoutIds.get(1), null, "low", 90.0, 0.022750131948179167);
+ }
+
+ /**
+  * Asserts the fields the outliers function writes onto one returned tuple.
+  *
+  * @param outlier the tuple as returned by the stream handler
+  * @param expectedId the expected "id" value, or null when the tuple must have no id
+  * @param side "high" or "low"; selects the {side}Outlier / {side}OutlierValue keys
+  * @param expectedValue the tested value that was flagged
+  * @param expectedCumProb the expected cumulative probability of that value
+  */
+ private static void assertOutlier(Map outlier, String expectedId, String side,
+                                   double expectedValue, double expectedCumProb) {
+   // JUnit assertions instead of the Java assert keyword: they run even without -ea.
+   assertEquals(expectedId, outlier.get("id"));
+   assertEquals(expectedCumProb, ((Number) outlier.get("cumulativeProbablity")).doubleValue(), 0.0);
+   assertEquals(expectedValue, ((Number) outlier.get(side + "OutlierValue")).doubleValue(), 0.0);
+   assertEquals(Boolean.TRUE, outlier.get(side + "Outlier"));
+ }
+
@Test
public void testLerp() throws Exception {
String cexpr = "let(echo=true," +