You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jb...@apache.org on 2018/01/23 21:02:40 UTC
lucene-solr:branch_7x: SOLR-11890: Add multiKmeans Stream Evaluator
Repository: lucene-solr
Updated Branches:
refs/heads/branch_7x 1fdcafcad -> 47ce3bd00
SOLR-11890: Add multiKmeans Stream Evaluator
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/47ce3bd0
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/47ce3bd0
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/47ce3bd0
Branch: refs/heads/branch_7x
Commit: 47ce3bd000d1d888cfb4786491312ca00c37b259
Parents: 1fdcafc
Author: Joel Bernstein <jb...@apache.org>
Authored: Tue Jan 23 15:54:37 2018 -0500
Committer: Joel Bernstein <jb...@apache.org>
Committed: Tue Jan 23 15:58:44 2018 -0500
----------------------------------------------------------------------
.../org/apache/solr/handler/StreamHandler.java | 1 +
.../solrj/io/eval/MultiKmeansEvaluator.java | 108 +++++++++++++++++++
.../solrj/io/stream/StreamExpressionTest.java | 81 ++++++++++++++
3 files changed, 190 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/47ce3bd0/solr/core/src/java/org/apache/solr/handler/StreamHandler.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/StreamHandler.java b/solr/core/src/java/org/apache/solr/handler/StreamHandler.java
index eda53ea..c9616af 100644
--- a/solr/core/src/java/org/apache/solr/handler/StreamHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/StreamHandler.java
@@ -312,6 +312,7 @@ public class StreamHandler extends RequestHandlerBase implements SolrCoreAware,
.withFunctionName("rowCount", RowCountEvaluator.class)
.withFunctionName("fuzzyKmeans", FuzzyKmeansEvaluator.class)
.withFunctionName("getMembershipMatrix", GetMembershipMatrixEvaluator.class)
+ .withFunctionName("multiKmeans", MultiKmeansEvaluator.class)
// Boolean Stream Evaluators
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/47ce3bd0/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/MultiKmeansEvaluator.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/MultiKmeansEvaluator.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/MultiKmeansEvaluator.java
new file mode 100644
index 0000000..86f1d85
--- /dev/null
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/MultiKmeansEvaluator.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.client.solrj.io.eval;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.HashMap;
+
+import org.apache.commons.math3.ml.clustering.KMeansPlusPlusClusterer;
+import org.apache.commons.math3.ml.clustering.MultiKMeansPlusPlusClusterer;
+import org.apache.solr.client.solrj.io.stream.expr.StreamExpression;
+import org.apache.solr.client.solrj.io.stream.expr.StreamExpressionNamedParameter;
+import org.apache.solr.client.solrj.io.stream.expr.StreamFactory;
+
+public class MultiKmeansEvaluator extends RecursiveObjectEvaluator implements ManyValueWorker {
+ protected static final long serialVersionUID = 1L;
+
+ private int maxIterations = 1000;
+
+ public MultiKmeansEvaluator(StreamExpression expression, StreamFactory factory) throws IOException{
+ super(expression, factory);
+
+ List<StreamExpressionNamedParameter> namedParams = factory.getNamedOperands(expression);
+
+ for(StreamExpressionNamedParameter namedParam : namedParams){
+ if(namedParam.getName().equals("maxIterations")) {
+ this.maxIterations = Integer.parseInt(namedParam.getParameter().toString().trim());
+ } else {
+ throw new IOException("Unexpected named parameter:"+namedParam.getName());
+ }
+ }
+ }
+
+ @Override
+ public Object doWork(Object... values) throws IOException {
+
+ if(values.length != 3) {
+ throw new IOException("The multiKmeans function expects three parameters; a matrix to cluster, k and number of trials.");
+ }
+
+ Object value1 = values[0];
+ Object value2 = values[1];
+ Object value3 = values[2];
+
+ Matrix matrix = null;
+ int k = 0;
+ int trials=0;
+
+ if(value1 instanceof Matrix) {
+ matrix = (Matrix)value1;
+ } else {
+ throw new IOException("The first parameter for multiKmeans should be the observation matrix.");
+ }
+
+ if(value2 instanceof Number) {
+ k = ((Number)value2).intValue();
+ } else {
+ throw new IOException("The second parameter for multiKmeans should be k.");
+ }
+
+ if(value3 instanceof Number) {
+ trials= ((Number)value3).intValue();
+ } else {
+ throw new IOException("The third parameter for multiKmeans should be trials.");
+ }
+
+ KMeansPlusPlusClusterer<KmeansEvaluator.ClusterPoint> kmeans = new KMeansPlusPlusClusterer(k, maxIterations);
+ MultiKMeansPlusPlusClusterer multiKmeans = new MultiKMeansPlusPlusClusterer(kmeans, trials);
+
+ List<KmeansEvaluator.ClusterPoint> points = new ArrayList();
+ double[][] data = matrix.getData();
+
+ List<String> ids = matrix.getRowLabels();
+
+ for(int i=0; i<data.length; i++) {
+ double[] vec = data[i];
+ points.add(new KmeansEvaluator.ClusterPoint(ids.get(i), vec));
+ }
+
+ Map fields = new HashMap();
+
+ fields.put("k", k);
+ fields.put("trials", trials);
+ fields.put("distance", "euclidean");
+ fields.put("maxIterations", maxIterations);
+
+ return new KmeansEvaluator.ClusterTuple(fields, multiKmeans.cluster(points), matrix.getColumnLabels());
+ }
+
+}
+
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/47ce3bd0/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java
index ea2a7ab..2738a7c 100644
--- a/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java
+++ b/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java
@@ -7076,6 +7076,87 @@ public class StreamExpressionTest extends SolrCloudTestCase {
}
}
+ @Test
+ public void testMultiKmeans() throws Exception {
+ String cexpr = "let(echo=true," +
+ " a=array(1,1,1,0,0,0)," +
+ " b=array(1,1,1,0,0,0)," +
+ " c=array(0,0,0,1,1,1)," +
+ " d=array(0,0,0,1,1,1)," +
+ " e=setRowLabels(matrix(a,b,c,d), " +
+ " array(doc1, doc2, doc3, doc4))," +
+ " f=multiKmeans(e, 2, 5)," +
+ " g=getCluster(f, 0)," +
+ " h=getCluster(f, 1)," +
+ " i=getCentroids(f)," +
+ " j=getRowLabels(g)," +
+ " k=getRowLabels(h))";
+ ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
+ paramsLoc.set("expr", cexpr);
+ paramsLoc.set("qt", "/stream");
+ String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS;
+ TupleStream solrStream = new SolrStream(url, paramsLoc);
+ StreamContext context = new StreamContext();
+ solrStream.setStreamContext(context);
+ List<Tuple> tuples = getTuples(solrStream);
+ assertTrue(tuples.size() == 1);
+ List<List<Number>> cluster1 = (List<List<Number>>)tuples.get(0).get("g");
+ List<List<Number>> cluster2 = (List<List<Number>>)tuples.get(0).get("h");
+ List<List<Number>> centroids = (List<List<Number>>)tuples.get(0).get("i");
+ List<String> labels1 = (List<String>)tuples.get(0).get("j");
+ List<String> labels2 = (List<String>)tuples.get(0).get("k");
+
+ assertEquals(cluster1.size(), 2);
+ assertEquals(cluster2.size(), 2);
+ assertEquals(centroids.size(), 2);
+
+ //Assert that the docs are not in both clusters
+ assertTrue(!(labels1.contains("doc1") && labels2.contains("doc1")));
+ assertTrue(!(labels1.contains("doc2") && labels2.contains("doc2")));
+ assertTrue(!(labels1.contains("doc3") && labels2.contains("doc3")));
+ assertTrue(!(labels1.contains("doc4") && labels2.contains("doc4")));
+
+ //Assert that (doc1 and doc2) or (doc3 and doc4) are in labels1
+ assertTrue((labels1.contains("doc1") && labels1.contains("doc2")) ||
+ ((labels1.contains("doc3") && labels1.contains("doc4"))));
+
+ //Assert that (doc1 and doc2) or (doc3 and doc4) are in labels2
+ assertTrue((labels2.contains("doc1") && labels2.contains("doc2")) ||
+ ((labels2.contains("doc3") && labels2.contains("doc4"))));
+
+ if(labels1.contains("doc1")) {
+ assertEquals(centroids.get(0).get(0).doubleValue(), 1.0, 0.0);
+ assertEquals(centroids.get(0).get(1).doubleValue(), 1.0, 0.0);
+ assertEquals(centroids.get(0).get(2).doubleValue(), 1.0, 0.0);
+ assertEquals(centroids.get(0).get(3).doubleValue(), 0.0, 0.0);
+ assertEquals(centroids.get(0).get(4).doubleValue(), 0.0, 0.0);
+ assertEquals(centroids.get(0).get(5).doubleValue(), 0.0, 0.0);
+
+ assertEquals(centroids.get(1).get(0).doubleValue(), 0.0, 0.0);
+ assertEquals(centroids.get(1).get(1).doubleValue(), 0.0, 0.0);
+ assertEquals(centroids.get(1).get(2).doubleValue(), 0.0, 0.0);
+ assertEquals(centroids.get(1).get(3).doubleValue(), 1.0, 0.0);
+ assertEquals(centroids.get(1).get(4).doubleValue(), 1.0, 0.0);
+ assertEquals(centroids.get(1).get(5).doubleValue(), 1.0, 0.0);
+ } else {
+ assertEquals(centroids.get(0).get(0).doubleValue(), 0.0, 0.0);
+ assertEquals(centroids.get(0).get(1).doubleValue(), 0.0, 0.0);
+ assertEquals(centroids.get(0).get(2).doubleValue(), 0.0, 0.0);
+ assertEquals(centroids.get(0).get(3).doubleValue(), 1.0, 0.0);
+ assertEquals(centroids.get(0).get(4).doubleValue(), 1.0, 0.0);
+ assertEquals(centroids.get(0).get(5).doubleValue(), 1.0, 0.0);
+
+ assertEquals(centroids.get(1).get(0).doubleValue(), 1.0, 0.0);
+ assertEquals(centroids.get(1).get(1).doubleValue(), 1.0, 0.0);
+ assertEquals(centroids.get(1).get(2).doubleValue(), 1.0, 0.0);
+ assertEquals(centroids.get(1).get(3).doubleValue(), 0.0, 0.0);
+ assertEquals(centroids.get(1).get(4).doubleValue(), 0.0, 0.0);
+ assertEquals(centroids.get(1).get(5).doubleValue(), 0.0, 0.0);
+ }
+ }
+
+
+
@Test
public void testFuzzyKmeans() throws Exception {