You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jb...@apache.org on 2018/08/13 19:28:26 UTC

lucene-solr:branch_7x: SOLR-12660: Add outliers Stream Evaluator to support outlier detection

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_7x 0e5abcede -> bcf3a5d18


SOLR-12660: Add outliers Stream Evaluator to support outlier detection


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/bcf3a5d1
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/bcf3a5d1
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/bcf3a5d1

Branch: refs/heads/branch_7x
Commit: bcf3a5d187299f18cc651c16732fe3a44a9da557
Parents: 0e5abce
Author: Joel Bernstein <jb...@apache.org>
Authored: Mon Aug 13 15:21:16 2018 -0400
Committer: Joel Bernstein <jb...@apache.org>
Committed: Mon Aug 13 15:25:37 2018 -0400

----------------------------------------------------------------------
 .../org/apache/solr/client/solrj/io/Lang.java   |   1 +
 .../client/solrj/io/eval/OutliersEvaluator.java | 142 +++++++++++++++++++
 .../solrj/io/stream/MathExpressionTest.java     |  53 +++++++
 3 files changed, 196 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bcf3a5d1/solr/solrj/src/java/org/apache/solr/client/solrj/io/Lang.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/Lang.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/Lang.java
index cedf062..c6485bb 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/Lang.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/Lang.java
@@ -248,6 +248,7 @@ public class Lang {
         .withFunctionName("setValue", SetValueEvaluator.class)
         .withFunctionName("knnRegress", KnnRegressionEvaluator.class)
         .withFunctionName("gaussfit", GaussFitEvaluator.class)
+        .withFunctionName("outliers", OutliersEvaluator.class)
 
         // Boolean Stream Evaluators
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bcf3a5d1/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/OutliersEvaluator.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/OutliersEvaluator.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/OutliersEvaluator.java
new file mode 100644
index 0000000..85d298f
--- /dev/null
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/OutliersEvaluator.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.client.solrj.io.eval;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.commons.math3.distribution.IntegerDistribution;
+import org.apache.commons.math3.distribution.AbstractRealDistribution;
+import org.apache.solr.client.solrj.io.stream.expr.StreamExpression;
+import org.apache.solr.client.solrj.io.stream.expr.StreamFactory;
+import org.apache.solr.client.solrj.io.Tuple;
+
+/**
+ * Stream evaluator for the {@code outliers} function: scores each value of a numeric array
+ * against a probability distribution and returns the Tuples of the values whose cumulative
+ * probability is at or below the low threshold or at or above the high threshold.
+ *
+ * <p>A negative threshold disables the corresponding check. A value that satisfies both checks
+ * has its Tuple added to the result twice.
+ */
+public class OutliersEvaluator extends RecursiveObjectEvaluator implements ManyValueWorker {
+  protected static final long serialVersionUID = 1L;
+
+  public OutliersEvaluator(StreamExpression expression, StreamFactory factory) throws IOException{
+    super(expression, factory);
+  }
+
+  /**
+   * Validates the parameters, then scores every value and collects the flagged Tuples.
+   * @param values the distribution, the numeric array to test, the low threshold, the high
+   *               threshold and, optionally, a list of Tuples paired by position with the array
+   * @return the list of Tuples flagged as low and/or high outliers
+   * @throws IOException if a parameter is missing or has the wrong type, or if fewer Tuples than
+   *                     values were supplied
+   */
+  @Override
+  @SuppressWarnings("unchecked")
+  public Object doWork(Object... values) throws IOException{
+
+    if(values.length < 4) {
+      throw new IOException("The outliers function requires 4 parameters");
+    }
+
+    Object dist = values[0];
+
+    if(!(values[1] instanceof List)) {
+      throw new IOException("The second parameter of the outliers function is the numeric array to be tested for outliers.");
+    }
+    List<Number> vec = (List<Number>)values[1];
+
+    if(!(values[2] instanceof Number)) {
+      throw new IOException("The third parameter of the outliers function is a number for the low outlier threshold.");
+    }
+    double low = ((Number)values[2]).doubleValue();
+
+    if(!(values[3] instanceof Number)) {
+      throw new IOException("The fourth parameter of the outliers function is a number for the high outlier threshold");
+    }
+    double hi = ((Number)values[3]).doubleValue();
+
+    List<Tuple> tuples;
+
+    if(values.length == 5) {
+      if(!(values[4] instanceof List)) {
+        throw new IOException("The optional fifth parameter of the outliers function is an array of Tuples that are paired with the numeric array of values to be tested.");
+      }
+      tuples = (List<Tuple>) values[4];
+
+      // Report a short Tuple list up front rather than failing with an
+      // IndexOutOfBoundsException after some Tuples have already been modified.
+      if(tuples.size() < vec.size()) {
+        throw new IOException("The array of Tuples passed to the outliers function must contain a Tuple for each value in the numeric array.");
+      }
+    } else {
+      // No Tuples supplied: report each outlier on a fresh, empty Tuple.
+      tuples = new ArrayList<>(vec.size());
+      for(int i=0; i<vec.size(); i++) {
+        tuples.add(new Tuple(new HashMap<>()));
+      }
+    }
+
+    // Validate the distribution once, before any Tuple is touched.
+    if(!(dist instanceof IntegerDistribution) && !(dist instanceof AbstractRealDistribution)) {
+      throw new IOException("The first parameter of the outliers function must be a real or integer probability distribution");
+    }
+
+    List<Tuple> outliers = new ArrayList<>();
+
+    for(int i=0; i<vec.size(); i++) {
+
+      Number n = vec.get(i);
+      Tuple t = tuples.get(i);
+
+      double cumProb = cumulativeProbability(dist, n);
+
+      if(low >= 0 && cumProb <= low) {
+        markOutlier(t, "lowOutlier", "lowOutlierValue", n, cumProb);
+        outliers.add(t);
+      }
+
+      if(hi >= 0 && cumProb >= hi) {
+        markOutlier(t, "highOutlier", "highOutlierValue", n, cumProb);
+        outliers.add(t);
+      }
+    }
+
+    return outliers;
+  }
+
+  /** Cumulative probability of {@code n}; integer distributions are evaluated at {@code n.intValue()}. */
+  private static double cumulativeProbability(Object dist, Number n) {
+    if(dist instanceof IntegerDistribution) {
+      return ((IntegerDistribution)dist).cumulativeProbability(n.intValue());
+    }
+    return ((AbstractRealDistribution)dist).cumulativeProbability(n.doubleValue());
+  }
+
+  /** Writes the outlier flag, the tested value and its cumulative probability onto the Tuple. */
+  private static void markOutlier(Tuple t, String flagKey, String valueKey, Number n, double cumProb) {
+    t.put(flagKey, true);
+    t.put(valueKey, n);
+    // The key's spelling is part of the function's output, so it is kept unchanged.
+    t.put("cumulativeProbablity", cumProb);
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bcf3a5d1/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/MathExpressionTest.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/MathExpressionTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/MathExpressionTest.java
index a27ad2d..be51050 100644
--- a/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/MathExpressionTest.java
+++ b/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/MathExpressionTest.java
@@ -3272,6 +3272,59 @@ public class MathExpressionTest extends SolrCloudTestCase {
     assertEquals(out1.get(7).doubleValue(), 61.5, 0.0001);
   }
 
+
+  @Test
+  public void testOutliers() throws Exception {
+    String cexpr = "let(echo=true," +
+        "               a=list(tuple(id=0.0), tuple(id=1), tuple(id=2), tuple(id=3)), " +
+        "               b=normalDistribution(100, 5)," +
+        "               d=array(100, 110, 90, 99), " +
+        "               e=outliers(b, d, .05, .95, a)," +
+        "               f=outliers(b, d, .05, .95))";
+    // e pairs the values in d with the Tuples in a; f omits the Tuples, so its results carry no id.
+    ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
+    paramsLoc.set("expr", cexpr);
+    paramsLoc.set("qt", "/stream");
+    String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS;
+    TupleStream solrStream = new SolrStream(url, paramsLoc);
+    StreamContext context = new StreamContext();
+    solrStream.setStreamContext(context);
+    List<Tuple> tuples = getTuples(solrStream);
+    assertEquals(1, tuples.size());
+    List<Map> out = (List<Map>)tuples.get(0).get("e");
+    assertEquals(2, out.size());
+    Map high = out.get(0);
+    assertEquals("1", high.get("id"));
+
+    assertEquals(0.9772498680518208, ((Number)high.get("cumulativeProbablity")).doubleValue(), 0.0);
+    assertEquals(110.0, ((Number)high.get("highOutlierValue")).doubleValue(), 0.0);
+    assertEquals(Boolean.TRUE, high.get("highOutlier"));
+
+    // The second outlier is the value 90, flagged against the .05 low threshold.
+    Map low = out.get(1);
+    assertEquals("2", low.get("id"));
+    assertEquals(0.022750131948179167, ((Number)low.get("cumulativeProbablity")).doubleValue(), 0.0);
+    assertEquals(90.0, ((Number)low.get("lowOutlierValue")).doubleValue(), 0.0);
+    assertEquals(Boolean.TRUE, low.get("lowOutlier"));
+
+    // Without the fifth parameter the same two outliers are expected, on Tuples that have no id.
+    List<Map> out1 = (List<Map>)tuples.get(0).get("f");
+    assertEquals(2, out1.size());
+    Map high1 = out1.get(0);
+    assertNull(high1.get("id"));
+    assertEquals(0.9772498680518208, ((Number)high1.get("cumulativeProbablity")).doubleValue(), 0.0);
+    assertEquals(110.0, ((Number)high1.get("highOutlierValue")).doubleValue(), 0.0);
+    assertEquals(Boolean.TRUE, high1.get("highOutlier"));
+
+
+    Map low1 = out1.get(1);
+    assertNull(low1.get("id"));
+    assertEquals(0.022750131948179167, ((Number)low1.get("cumulativeProbablity")).doubleValue(), 0.0);
+    assertEquals(90.0, ((Number)low1.get("lowOutlierValue")).doubleValue(), 0.0);
+    assertEquals(Boolean.TRUE, low1.get("lowOutlier"));
+
+  }
+
   @Test
   public void testLerp() throws Exception {
     String cexpr = "let(echo=true," +