You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ra...@apache.org on 2018/06/27 14:52:18 UTC
[50/51] [partial] mahout git commit: MAHOUT-2042 and MAHOUT-2045
Delete directories which were moved/no longer in use
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/ssvd.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/ssvd.props b/community/mahout-mr/conf/ssvd.props
new file mode 100644
index 0000000..26a52c7
--- /dev/null
+++ b/community/mahout-mr/conf/ssvd.props
@@ -0,0 +1,14 @@
+#i|input =
+#o|output =
+#k|rank =
+#t|tempDir =
+#p|oversampling =
+#r|blockHeight =
+#s|minSplitSize =
+#U|computeU =
+#uhs|uHalfSigma =
+#V|computeV =
+#vhs|vHalfSigma =
+#t|reduceTasks =
+#w|wide =
+#q|powerIter =
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/svd.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/svd.props b/community/mahout-mr/conf/svd.props
new file mode 100644
index 0000000..8c9a467
--- /dev/null
+++ b/community/mahout-mr/conf/svd.props
@@ -0,0 +1,6 @@
+#i|input =
+#o|output =
+#nr|numRows =
+#nc|numCols =
+#r|rank =
+#t|tempDir =
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/trainlogistic.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/trainlogistic.props b/community/mahout-mr/conf/trainlogistic.props
new file mode 100644
index 0000000..f474942
--- /dev/null
+++ b/community/mahout-mr/conf/trainlogistic.props
@@ -0,0 +1,2 @@
+#lambda|lambda =
+#passes|passes =
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/transpose.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/transpose.props b/community/mahout-mr/conf/transpose.props
new file mode 100644
index 0000000..025f945
--- /dev/null
+++ b/community/mahout-mr/conf/transpose.props
@@ -0,0 +1,2 @@
+#i|input =
+#o|output =
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/vectordump.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/vectordump.props b/community/mahout-mr/conf/vectordump.props
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/community/mahout-mr/conf/vectordump.props
@@ -0,0 +1 @@
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/bin/prep_asf_mail_archives.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/bin/prep_asf_mail_archives.sh b/community/mahout-mr/integration/bin/prep_asf_mail_archives.sh
new file mode 100755
index 0000000..77f5d13
--- /dev/null
+++ b/community/mahout-mr/integration/bin/prep_asf_mail_archives.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+#
+# Performs the setup procedures for clustering the ASF mail archives
+# described in Taming Text.
+#
+# Command-line Parameters ($1 is required; $2 and $3 are optional):
+#
+# $1 - Path to this script's working directory, you will need about
+# 22GB of free space to run this script.
+#
+# $2 - Path to where the ASF Public Archive data is, untarred.
+# If you are running Hadoop and the files are in HDFS, then
+# this will need to be an HDFS path. Default is $1/input
+# $3 - Path to where this script saves the SequenceFile output.
+# If you are running Hadoop and you want the sequence files
+# saved to your HDFS then you need to set this value to an
+# HDFS path and make sure you set HADOOP_HOME so Mahout can
+# find Hadoop. Default is $1/sequence-files
+#
+#
+# Required Environment Variables:
+#
+# MAHOUT_HOME
+# Root directory of your Mahout distribution
+#
+# HADOOP_HOME
+# Only needed if you want to send output to HDFS
+#
+# Example:
+# ./prep_asf_mail_archives.sh /mnt/asf-mail-archives /mnt/asf-archives/asf-mail-archives-7-18-2011 /mnt/asf-mail-archives/output
+#
+# This runs the Mahout org.apache.mahout.text.SequenceFilesFromMailArchives
+# job over the already-extracted archives in the input path to create
+# Hadoop SequenceFiles in /mnt/asf-mail-archives/output
+#
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+
+if [ "$MAHOUT_HOME" = "" ]; then
+  echo "Error: MAHOUT_HOME is not set."
+  exit 1
+fi
+
+if [ "$1" = "" ]; then
+  # printf expands \n and \t; bash's builtin echo prints them literally without -e
+  printf 'Error: Please pass the path to your prep directory, such as /mnt/asf-mail-archives.\n\n\tUsage: %s workingDir inputPath outputPath\n\n' "$0"
+  exit 1
+fi
+
+# Location where this script saves files
+PREP_DIR="$1"
+
+if [ "$2" != "" ]; then
+  SEQFILE_INPUT_DIR="$2"
+else
+  SEQFILE_INPUT_DIR="$PREP_DIR/input"
+fi
+
+# Change this to an HDFS path if you are running Hadoop
+if [ "$3" != "" ]; then
+  SEQFILE_OUTPUT_DIR="$3"
+else
+  SEQFILE_OUTPUT_DIR="$PREP_DIR/sequence-files"
+fi
+
+# If output sent to HDFS, clear MAHOUT_LOCAL and make sure HADOOP_HOME is set
+if [[ "$SEQFILE_OUTPUT_DIR" = hdfs://* ]]; then
+  export MAHOUT_LOCAL=
+  if [ "$HADOOP_HOME" = "" ]; then
+    echo "Error: HADOOP_HOME must be set if you want to send output to HDFS."
+    exit 1
+  fi
+else
+  export MAHOUT_LOCAL="$PREP_DIR"
+fi
+
+echo "Running $0 with:
+ PREP_DIR = $PREP_DIR
+ SEQFILE_INPUT_DIR = $SEQFILE_INPUT_DIR
+ SEQFILE_OUTPUT_DIR = $SEQFILE_OUTPUT_DIR
+ MAHOUT_LOCAL = $MAHOUT_LOCAL
+ HADOOP_HOME = $HADOOP_HOME"
+
+# Local vs. Hadoop mode is selected by the MAHOUT_LOCAL export above;
+# pass an hdfs:// output path to have the sequence files stored in HDFS.
+
+
+# convert the extracted gz files into Hadoop SequenceFiles
+echo "Converting extracted directories to SequenceFiles ..."
+"$MAHOUT_HOME/bin/mahout" org.apache.mahout.text.SequenceFilesFromMailArchives \
+--input "$SEQFILE_INPUT_DIR" --output "$SEQFILE_OUTPUT_DIR" --subject --body \
+-c UTF-8 -chunk 1024 -prefix asf_archives
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/pom.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/pom.xml b/community/mahout-mr/integration/pom.xml
new file mode 100644
index 0000000..cb0c19a
--- /dev/null
+++ b/community/mahout-mr/integration/pom.xml
@@ -0,0 +1,198 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.mahout</groupId>
+ <artifactId>mahout</artifactId>
+ <version>0.13.1-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+
+ <artifactId>mahout-integration</artifactId>
+ <name>Mahout Integration</name>
+ <description>Optional components of Mahout which generally support interaction with third party systems,
+ formats, APIs, etc.</description>
+
+ <packaging>jar</packaging>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-remote-resources-plugin</artifactId>
+ <configuration>
+ <appendedResourcesDirectory>../community/mahout-mr/src/appended-resources</appendedResourcesDirectory>
+ <resourceBundles>
+ <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle>
+ </resourceBundles>
+ <supplementalModels>
+ <supplementalModel>supplemental-models.xml</supplementalModel>
+ </supplementalModels>
+ </configuration>
+ </plugin>
+
+ <plugin>
+ <artifactId>maven-javadoc-plugin</artifactId>
+ </plugin>
+
+ <plugin>
+ <artifactId>maven-source-plugin</artifactId>
+ </plugin>
+
+ </plugins>
+
+ </build>
+
+ <dependencies>
+
+ <!-- own modules -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mahout-hdfs</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mahout-mr</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mahout-hdfs</artifactId>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mahout-mr</artifactId>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mahout-math</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mahout-math</artifactId>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+
+ <!-- 3rd party -->
+
+ <dependency>
+ <groupId>commons-dbcp</groupId>
+ <artifactId>commons-dbcp</artifactId>
+ <optional>true</optional>
+ </dependency>
+
+ <dependency>
+ <groupId>commons-pool</groupId>
+ <artifactId>commons-pool</artifactId>
+ <optional>true</optional>
+ </dependency>
+
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.solr</groupId>
+ <artifactId>solr-commons-csv</artifactId>
+ <version>3.5.0</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-benchmark</artifactId>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-common</artifactId>
+ <optional>true</optional>
+ </dependency>
+
+ <dependency>
+ <groupId>org.mongodb</groupId>
+ <artifactId>mongo-java-driver</artifactId>
+ <version>2.11.2</version>
+ <optional>true</optional>
+ </dependency>
+
+ <dependency>
+ <groupId>org.mongodb</groupId>
+ <artifactId>bson</artifactId>
+ <version>2.11.2</version>
+ <optional>true</optional>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.hbase</groupId>
+ <artifactId>hbase-client</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.hectorclient</groupId>
+ <artifactId>hector-core</artifactId>
+ <version>1.1-4</version>
+ <optional>true</optional>
+ </dependency>
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-jcl</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>com.carrotsearch.randomizedtesting</groupId>
+ <artifactId>randomizedtesting-runner</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.easymock</groupId>
+ <artifactId>easymock</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ </dependencies>
+
+</project>
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java
new file mode 100644
index 0000000..549cf2c
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.TimingStatistics;
+import org.apache.mahout.math.Vector;
+
+import com.google.common.base.Function;
+
+/** Drives a benchmark function with random bucket indices and returns the collected TimingStatistics. */
+public final class BenchmarkRunner {
+  private static final int BUCKET_SIZE = 10000;
+  private static final Random R = RandomUtils.getRandom();
+  private final long maxTimeNanos;
+  private final long leadTimeNanos;
+
+  /** Takes both limits in milliseconds and stores them converted to nanoseconds. */
+  public BenchmarkRunner(long leadTimeMs, long maxTimeMs) {
+    maxTimeNanos = TimeUnit.MILLISECONDS.toNanos(maxTimeMs);
+    leadTimeNanos = TimeUnit.MILLISECONDS.toNanos(leadTimeMs);
+  }
+
+  /** Benchmark body that receives a random bucket index and yields a boolean. */
+  public abstract static class BenchmarkFn implements Function<Integer, Boolean> {
+    protected int randIndex() {
+      return BenchmarkRunner.randIndex();
+    }
+
+    protected boolean randBool() {
+      return BenchmarkRunner.randBool();
+    }
+
+    /** Adds a random data dependency so that JVM does not remove dead code. */
+    protected boolean depends(Vector v) {
+      return randIndex() < v.getNumNondefaultElements();
+    }
+  }
+
+  /** Benchmark body that receives a random bucket index and yields a double. */
+  public abstract static class BenchmarkFnD implements Function<Integer, Double> {
+    protected int randIndex() {
+      return BenchmarkRunner.randIndex();
+    }
+
+    protected boolean randBool() {
+      return BenchmarkRunner.randBool();
+    }
+
+    /** Adds a random data dependency so that JVM does not remove dead code. */
+    protected boolean depends(Vector v) {
+      return randIndex() < v.getNumNondefaultElements();
+    }
+  }
+
+  private static int randIndex() {
+    return R.nextInt(BUCKET_SIZE);
+  }
+
+  private static boolean randBool() {
+    return R.nextBoolean();
+  }
+
+  /** Applies {@code function} to random indices until {@code Call.end} returns true, then prints the result. */
+  public TimingStatistics benchmark(BenchmarkFn function) {
+    TimingStatistics stats = new TimingStatistics();
+    boolean result = false;
+    TimingStatistics.Call call;
+    do {
+      int index = R.nextInt(BUCKET_SIZE);
+      call = stats.newCall(leadTimeNanos);
+      result ^= function.apply(index);
+    } while (!call.end(maxTimeNanos));
+    // consume the folded result, as benchmarkD does, so the accumulation is not itself dead code
+    System.err.println("Result = " + result);
+    return stats;
+  }
+
+  /** Same loop as {@link #benchmark(BenchmarkFn)} for functions that yield a double. */
+  public TimingStatistics benchmarkD(BenchmarkFnD function) {
+    TimingStatistics stats = new TimingStatistics();
+    double result = 0;
+    TimingStatistics.Call call;
+    do {
+      int index = R.nextInt(BUCKET_SIZE);
+      call = stats.newCall(leadTimeNanos);
+      result += function.apply(index);
+    } while (!call.end(maxTimeNanos));
+    // print result to prevent hotspot from eliminating deadcode
+    System.err.println("Result = " + result);
+    return stats;
+  }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java
new file mode 100644
index 0000000..5e6ab4d
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
+
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
+
+/**
+ * Benchmarks {@code clone()} on the vectors in rows 0, 1 and 2 of {@code mark.vectors}.
+ */
+public class CloneBenchmark {
+  public static final String CLONE = "Clone";
+  private final VectorBenchmarks mark;
+
+  /** @param mark benchmark context supplying the vectors, the runner and the stats printer */
+  public CloneBenchmark(VectorBenchmarks mark) {
+    this.mark = mark;
+  }
+
+  /** Runs the clone benchmark for each of the three vector rows and prints the statistics. */
+  public void benchmark() {
+    mark.printStats(mark.getRunner().benchmark(cloneFn(0)), CLONE, DENSE_VECTOR);
+    mark.printStats(mark.getRunner().benchmark(cloneFn(1)), CLONE, RAND_SPARSE_VECTOR);
+    mark.printStats(mark.getRunner().benchmark(cloneFn(2)), CLONE, SEQ_SPARSE_VECTOR);
+  }
+
+  /**
+   * Builds the function timed for one row of {@code mark.vectors}.
+   *
+   * @param row index of the vector row whose elements are cloned
+   * @return function that replaces the selected vector by its clone
+   */
+  private BenchmarkFn cloneFn(final int row) {
+    return new BenchmarkFn() {
+      @Override
+      public Boolean apply(Integer i) {
+        // overwrite the slot with a copy of itself ...
+        mark.vectors[row][mark.vIndex(i)] = mark.vectors[row][mark.vIndex(i)].clone();
+        // ... and derive the return value from the stored copy
+        return depends(mark.vectors[row][mark.vIndex(i)]);
+      }
+    };
+  }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java
new file mode 100644
index 0000000..b1c2ded
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import java.io.IOException;
+import java.util.Random;
+
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.TimingStatistics;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.SparseMatrix;
+import org.apache.mahout.math.Vector;
+
+/** Benchmarks closest-centroid search with and without pruning by centroid-to-centroid distances. */
+public class ClosestCentroidBenchmark {
+  private final VectorBenchmarks mark;
+
+  public ClosestCentroidBenchmark(VectorBenchmarks mark) {
+    this.mark = mark;
+  }
+
+  /** Times both searches over {@code mark.vectors[1]}; prints the statistics and distance count of each. */
+  public void benchmark(DistanceMeasure measure) throws IOException {
+    SparseMatrix clusterDistances = new SparseMatrix(mark.numClusters, mark.numClusters);
+    for (int i = 0; i < mark.numClusters; i++) {
+      for (int j = 0; j < mark.numClusters; j++) {
+        double distance = i == j ? Double.POSITIVE_INFINITY : measure.distance(mark.clusters[i], mark.clusters[j]);
+        clusterDistances.setQuick(i, j, distance);
+      }
+    }
+
+    long distanceCalculations = 0;
+    TimingStatistics stats = new TimingStatistics();
+    for (int l = 0; l < mark.loop; l++) {
+      TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
+      for (int i = 0; i < mark.numVectors; i++) {
+        Vector vector = mark.vectors[1][mark.vIndex(i)];
+        double minDistance = Double.MAX_VALUE;
+        for (int k = 0; k < mark.numClusters; k++) {
+          double distance = measure.distance(vector, mark.clusters[k]);
+          distanceCalculations++;
+          if (distance < minDistance) {
+            minDistance = distance;
+          }
+        }
+      }
+      if (call.end(mark.maxTimeUsec)) {
+        break;
+      }
+    }
+    mark.printStats(stats, measure.getClass().getName(), "Closest C w/o Elkan's trick", "distanceCalculations = "
+        + distanceCalculations);
+
+    distanceCalculations = 0;
+    stats = new TimingStatistics();
+    Random rand = RandomUtils.getRandom();
+    for (int l = 0; l < mark.loop; l++) {
+      TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
+      for (int i = 0; i < mark.numVectors; i++) {
+        Vector vector = mark.vectors[1][mark.vIndex(i)];
+        int closestCentroid = rand.nextInt(mark.numClusters);
+        double dist = measure.distance(vector, mark.clusters[closestCentroid]);
+        distanceCalculations++;
+        for (int k = 0; k < mark.numClusters; k++) {
+          // measure only centroids closer than 2 * dist to the current best; adopt one only if it is closer
+          if (closestCentroid != k && clusterDistances.getQuick(k, closestCentroid) < 2 * dist) {
+            double candidate = measure.distance(vector, mark.clusters[k]);
+            distanceCalculations++;
+            if (candidate < dist) {
+              dist = candidate;
+              closestCentroid = k;
+            }
+          }
+        }
+      }
+      if (call.end(mark.maxTimeUsec)) {
+        break;
+      }
+    }
+    mark.printStats(stats, measure.getClass().getName(), "Closest C w/ Elkan's trick", "distanceCalculations = "
+        + distanceCalculations);
+  }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java
new file mode 100644
index 0000000..25d0ad7
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
+
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFnD;
+import org.apache.mahout.common.distance.DistanceMeasure;
+
+/**
+ * Benchmarks a {@link DistanceMeasure} on every combination of rows 0, 1 and 2 of
+ * {@code mark.vectors}; results are labelled with the {@code VectorBenchmarks} constants.
+ */
+public class DistanceBenchmark {
+  private final VectorBenchmarks mark;
+
+  /** @param mark benchmark context supplying the vectors, the runner and the stats printer */
+  public DistanceBenchmark(VectorBenchmarks mark) {
+    this.mark = mark;
+  }
+
+  /**
+   * Runs nine benchmarks, one per ordered pair of vector rows, and prints each result under the
+   * measure's class name and the label of the pairing.
+   * The first operand is chosen by the runner-supplied index, the second by {@code randIndex()}.
+   *
+   * @param measure distance measure to time
+   */
+  public void benchmark(final DistanceMeasure measure) {
+    // both operands from the same row
+    mark.printStats(mark.getRunner().benchmarkD(pairFn(measure, 0, 0)),
+        measure.getClass().getName(), DENSE_VECTOR);
+
+    mark.printStats(mark.getRunner().benchmarkD(pairFn(measure, 1, 1)),
+        measure.getClass().getName(), RAND_SPARSE_VECTOR);
+
+    mark.printStats(mark.getRunner().benchmarkD(pairFn(measure, 2, 2)),
+        measure.getClass().getName(), SEQ_SPARSE_VECTOR);
+
+    // first operand from row 0
+    mark.printStats(mark.getRunner().benchmarkD(pairFn(measure, 0, 1)),
+        measure.getClass().getName(), DENSE_FN_RAND);
+
+    mark.printStats(mark.getRunner().benchmarkD(pairFn(measure, 0, 2)),
+        measure.getClass().getName(), DENSE_FN_SEQ);
+
+    // first operand from row 1
+    mark.printStats(mark.getRunner().benchmarkD(pairFn(measure, 1, 0)),
+        measure.getClass().getName(), RAND_FN_DENSE);
+
+    mark.printStats(mark.getRunner().benchmarkD(pairFn(measure, 1, 2)),
+        measure.getClass().getName(), RAND_FN_SEQ);
+
+    // first operand from row 2
+    mark.printStats(mark.getRunner().benchmarkD(pairFn(measure, 2, 0)),
+        measure.getClass().getName(), SEQ_FN_DENSE);
+
+    mark.printStats(mark.getRunner().benchmarkD(pairFn(measure, 2, 1)),
+        measure.getClass().getName(), SEQ_FN_RAND);
+  }
+
+  /**
+   * Builds the timed function for one pairing of vector rows.
+   *
+   * @param measure distance measure to apply
+   * @param left row of the operand picked by the runner-supplied index
+   * @param right row of the operand picked by {@code randIndex()}
+   * @return function returning the distance between the two selected vectors
+   */
+  private BenchmarkFnD pairFn(final DistanceMeasure measure, final int left, final int right) {
+    return new BenchmarkFnD() {
+      @Override
+      public Double apply(Integer i) {
+        // operands are looked up in the same order as written: left first, then right
+        return measure.distance(
+            mark.vectors[left][mark.vIndex(i)],
+            mark.vectors[right][mark.vIndex(randIndex())]);
+      }
+    };
+  }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java
new file mode 100644
index 0000000..fc7f911
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
+
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFnD;
+
+public class DotBenchmark {
+ private static final String DOT_PRODUCT = "DotProduct";
+ private static final String NORM1 = "Norm1";
+ private static final String NORM2 = "Norm2";
+ private static final String LOG_NORMALIZE = "LogNormalize";
+ private final VectorBenchmarks mark;
+
+ public DotBenchmark(VectorBenchmarks mark) {
+ this.mark = mark;
+ }
+
+ public void benchmark() {
+ benchmarkDot();
+ benchmarkNorm1();
+ benchmarkNorm2();
+ benchmarkLogNormalize();
+ }
+
+ private void benchmarkLogNormalize() {
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ return depends(mark.vectors[0][mark.vIndex(i)].logNormalize());
+ }
+ }), LOG_NORMALIZE, DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ return depends(mark.vectors[1][mark.vIndex(i)].logNormalize());
+ }
+ }), LOG_NORMALIZE, RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ return depends(mark.vectors[2][mark.vIndex(i)].logNormalize());
+ }
+ }), LOG_NORMALIZE, SEQ_SPARSE_VECTOR);
+ }
+
+ private void benchmarkNorm1() {
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[0][mark.vIndex(i)].norm(1);
+ }
+ }), NORM1, DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[1][mark.vIndex(i)].norm(1);
+ }
+ }), NORM1, RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[2][mark.vIndex(i)].norm(1);
+ }
+ }), NORM1, SEQ_SPARSE_VECTOR);
+ }
+
+ private void benchmarkNorm2() {
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[0][mark.vIndex(i)].norm(2);
+ }
+ }), NORM2, DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[1][mark.vIndex(i)].norm(2);
+ }
+ }), NORM2, RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[2][mark.vIndex(i)].norm(2);
+ }
+ }), NORM2, SEQ_SPARSE_VECTOR);
+ }
+
+ private void benchmarkDot() {
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, SEQ_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, DENSE_FN_RAND);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, DENSE_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, RAND_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, RAND_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, SEQ_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, SEQ_FN_RAND);
+ }
+
+ public static void main(String[] args) {
+ VectorBenchmarks mark = new VectorBenchmarks(1000000, 100, 1000, 10, 1);
+ mark.createData();
+ new DotBenchmark(mark).benchmarkNorm2();
+ System.out.println(mark);
+ }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java
new file mode 100644
index 0000000..82fb693
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
+
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
+import org.apache.mahout.math.Vector;
+
+public class MinusBenchmark {
+
+ private static final String MINUS = "Minus";
+ private final VectorBenchmarks mark;
+
+ public MinusBenchmark(VectorBenchmarks mark) {
+ this.mark = mark;
+ }
+
+ public void benchmark() {
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, SEQ_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, DENSE_FN_RAND);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, DENSE_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, RAND_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, RAND_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, SEQ_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, SEQ_FN_RAND);
+ }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java
new file mode 100644
index 0000000..bd76e94
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
+
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
+import org.apache.mahout.math.Vector;
+
+public class PlusBenchmark {
+
+ private static final String PLUS = "Plus";
+ private final VectorBenchmarks mark;
+
+ public PlusBenchmark(VectorBenchmarks mark) {
+ this.mark = mark;
+ }
+
+ public void benchmark() {
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].plus(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].plus(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].plus(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, SEQ_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].plus(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, DENSE_FN_RAND);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].plus(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, DENSE_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].plus(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, RAND_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].plus(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, RAND_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].plus(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, SEQ_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].plus(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, SEQ_FN_RAND);
+ }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java
new file mode 100644
index 0000000..cd403c2
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.TimingStatistics;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
+
+/**
+ * Times writing the three vector sets of a {@link VectorBenchmarks} instance to sequence files and
+ * reading them back. {@link #serializeBenchmark()} must have produced the files before
+ * {@link #deserializeBenchmark()} can read them; {@link #benchmark()} runs both in that order.
+ */
+public class SerializationBenchmark {
+  public static final String SERIALIZE = "Serialize";
+  public static final String DESERIALIZE = "Deserialize";
+
+  // One file per vector set; shared by the write and read sides so the two cannot drift apart.
+  private static final String DENSE_PATH = "/tmp/dense-vector";
+  private static final String RAND_SPARSE_PATH = "/tmp/randsparse-vector";
+  private static final String SEQ_SPARSE_PATH = "/tmp/seqsparse-vector";
+
+  private final VectorBenchmarks mark;
+
+  /**
+   * @param mark benchmark context supplying the vectors, loop/time limits and the stats sink
+   */
+  public SerializationBenchmark(VectorBenchmarks mark) {
+    this.mark = mark;
+  }
+
+  /**
+   * Runs the serialize benchmarks followed by the deserialize benchmarks.
+   *
+   * @throws IOException if a sequence file cannot be written or read
+   */
+  public void benchmark() throws IOException {
+    serializeBenchmark();
+    deserializeBenchmark();
+  }
+
+  /**
+   * Writes {@code mark.vectors[0]}, {@code [1]} and {@code [2]} to their respective files and
+   * reports the timings under the dense, random-sparse and sequential-sparse labels.
+   *
+   * @throws IOException if the file system cannot be obtained or a file cannot be written
+   */
+  public void serializeBenchmark() throws IOException {
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(conf);
+
+    doSerializeBenchmark(fs, conf, 0, DENSE_VECTOR, DENSE_PATH);
+    doSerializeBenchmark(fs, conf, 1, RAND_SPARSE_VECTOR, RAND_SPARSE_PATH);
+    doSerializeBenchmark(fs, conf, 2, SEQ_SPARSE_VECTOR, SEQ_SPARSE_PATH);
+  }
+
+  /**
+   * Reads back the three files written by {@link #serializeBenchmark()} and reports the timings.
+   *
+   * @throws IOException if a file cannot be opened or read
+   */
+  public void deserializeBenchmark() throws IOException {
+    doDeserializeBenchmark(DENSE_VECTOR, DENSE_PATH);
+    doDeserializeBenchmark(RAND_SPARSE_VECTOR, RAND_SPARSE_PATH);
+    doDeserializeBenchmark(SEQ_SPARSE_VECTOR, SEQ_SPARSE_PATH);
+  }
+
+  /**
+   * Appends up to {@code mark.loop} vectors of one vector set to a sequence file, timing each
+   * append, and prints the collected statistics.
+   *
+   * @param fs file system the sequence file is created on
+   * @param conf Hadoop configuration handed to the writer
+   * @param vectorType first index into {@code mark.vectors} selecting the vector set
+   * @param name implementation label used when printing the statistics
+   * @param pathString destination of the sequence file
+   * @throws IOException if the file cannot be created or written
+   */
+  private void doSerializeBenchmark(FileSystem fs, Configuration conf, int vectorType, String name,
+                                    String pathString) throws IOException {
+    Writable key = new IntWritable(0);
+    VectorWritable value = new VectorWritable();
+    TimingStatistics stats = new TimingStatistics();
+
+    try (SequenceFile.Writer writer =
+             new SequenceFile.Writer(fs, conf, new Path(pathString),
+                                     IntWritable.class, VectorWritable.class)) {
+      for (int i = 0; i < mark.loop; i++) {
+        TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
+        value.set(mark.vectors[vectorType][mark.vIndex(i)]);
+        writer.append(key, value);
+        // end(...) returning true stops the run before mark.loop iterations are reached.
+        if (call.end(mark.maxTimeUsec)) {
+          break;
+        }
+      }
+    }
+    mark.printStats(stats, SERIALIZE, name);
+  }
+
+  /**
+   * Iterates over every value of one sequence file, timing each step, and prints the statistics.
+   * The iterator is closed even when reading fails part-way through.
+   *
+   * @param name implementation label used when printing the statistics
+   * @param pathString location of the sequence file to read
+   * @throws IOException if the file cannot be opened, read or closed
+   */
+  private void doDeserializeBenchmark(String name, String pathString) throws IOException {
+    TimingStatistics stats = new TimingStatistics();
+    // The first call is started before the iterator is constructed, so the first sample also
+    // covers opening the file. Each later call spans one hasNext()/next() pair; the call started
+    // after the final element is never ended.
+    TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
+    try (SequenceFileValueIterator<Writable> iterator =
+             new SequenceFileValueIterator<>(new Path(pathString), true, new Configuration())) {
+      while (iterator.hasNext()) {
+        iterator.next();
+        call.end();
+        call = stats.newCall(mark.leadTimeUsec);
+      }
+    }
+    mark.printStats(stats, DESERIALIZE, name);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java
new file mode 100644
index 0000000..bf81228
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
+
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
+import org.apache.mahout.math.Vector;
+
+public class TimesBenchmark {
+
+ private static final String TIMES = "Times";
+ private final VectorBenchmarks mark;
+
+ public TimesBenchmark(VectorBenchmarks mark) {
+ this.mark = mark;
+ }
+
+ public void benchmark() {
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].times(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].times(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].times(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, SEQ_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].times(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, DENSE_FN_RAND);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].times(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, DENSE_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].times(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, RAND_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].times(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, RAND_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].times(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, SEQ_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].times(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, SEQ_FN_RAND);
+ }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java
new file mode 100644
index 0000000..a076322
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java
@@ -0,0 +1,497 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.TimingStatistics;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.ChebyshevDistanceMeasure;
+import org.apache.mahout.common.distance.CosineDistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
+import org.apache.mahout.common.distance.MinkowskiDistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.apache.mahout.common.distance.TanimotoDistanceMeasure;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.text.DecimalFormat;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Pattern;
+
+/**
+ * Entry point and shared state for the vector micro-benchmarks.
+ * <p>
+ * An instance holds randomly generated source vectors, copies of them in three vector
+ * implementations, and the timing results recorded through {@code printStats}. The individual
+ * benchmark classes started from {@code runBenchmark} receive this object as their context.
+ */
+public class VectorBenchmarks {
+ // Durations handed to BenchmarkRunner and, converted to nanoseconds, to TimingStatistics calls.
+ private static final int MAX_TIME_MS = 5000;
+ private static final int LEAD_TIME_MS = 15000;
+ // Benchmark names used as keys of statsMap.
+ public static final String CLUSTERS = "Clusters";
+ public static final String CREATE_INCREMENTALLY = "Create (incrementally)";
+ public static final String CREATE_COPY = "Create (copy)";
+
+ // Implementation labels used as keys of implType.
+ public static final String DENSE_FN_SEQ = "Dense.fn(Seq)";
+ public static final String RAND_FN_DENSE = "Rand.fn(Dense)";
+ public static final String SEQ_FN_RAND = "Seq.fn(Rand)";
+ public static final String RAND_FN_SEQ = "Rand.fn(Seq)";
+ public static final String SEQ_FN_DENSE = "Seq.fn(Dense)";
+ public static final String DENSE_FN_RAND = "Dense.fn(Rand)";
+ public static final String SEQ_SPARSE_VECTOR = "SeqSparseVector";
+ public static final String RAND_SPARSE_VECTOR = "RandSparseVector";
+ public static final String DENSE_VECTOR = "DenseVector";
+
+ private static final Logger log = LoggerFactory.getLogger(VectorBenchmarks.class);
+ // Splits the textual statistics into one token per tab- or newline-separated entry.
+ private static final Pattern TAB_NEWLINE_PATTERN = Pattern.compile("[\n\t]");
+ // Placeholder row for implementations that have no result for a given benchmark.
+ private static final String[] EMPTY = new String[0];
+ private static final DecimalFormat DF = new DecimalFormat("#.##");
+
+ /* package private */
+ // vectors[0] = DenseVector, vectors[1] = RandomAccessSparseVector,
+ // vectors[2] = SequentialAccessSparseVector copies (see createData()).
+ final Vector[][] vectors;
+ final Vector[] clusters;
+ final int cardinality;
+ final int numNonZeros;
+ final int numVectors;
+ final int numClusters;
+ // Upper bound for the incremental-creation loops; they normally stop earlier via the timer.
+ final int loop = Integer.MAX_VALUE;
+ // Stored from the constructor argument; not read anywhere in this class.
+ final int opsPerUnit;
+ // NOTE: despite the "Usec" suffix both values are assigned in nanoseconds
+ // (TimeUnit.MILLISECONDS.toNanos) by the constructor.
+ final long maxTimeUsec;
+ final long leadTimeUsec;
+
+ // Parallel lists filled by setUpVectors(): entry k holds the k-th source vector,
+ // its non-zero indices and the matching values.
+ private final List<Vector> randomVectors = new ArrayList<>();
+ private final List<int[]> randomVectorIndices = new ArrayList<>();
+ private final List<double[]> randomVectorValues = new ArrayList<>();
+ // Implementation label -> id, assigned in first-seen order by printStats().
+ private final Map<String, Integer> implType = new HashMap<>();
+ // Benchmark name -> tokenized statistics, indexed by implementation id.
+ private final Map<String, List<String[]>> statsMap = new HashMap<>();
+ private final BenchmarkRunner runner;
+ private final Random r = RandomUtils.getRandom();
+
+ /**
+  * Creates the benchmark context and pre-generates the random source vectors.
+  *
+  * @param cardinality dimension of every generated vector
+  * @param numNonZeros number of non-zero entries per generated vector
+  * @param numVectors  number of source vectors (and slots per implementation in {@code vectors})
+  * @param numClusters number of slots in {@code clusters}
+  * @param opsPerUnit  stored as-is in {@code opsPerUnit}
+  */
+ public VectorBenchmarks(int cardinality, int numNonZeros, int numVectors, int numClusters,
+     int opsPerUnit) {
+   this.runner = new BenchmarkRunner(LEAD_TIME_MS, MAX_TIME_MS);
+   // Both timer limits are kept in nanoseconds, whatever the field names suggest.
+   this.leadTimeUsec = TimeUnit.MILLISECONDS.toNanos(LEAD_TIME_MS);
+   this.maxTimeUsec = TimeUnit.MILLISECONDS.toNanos(MAX_TIME_MS);
+
+   this.opsPerUnit = opsPerUnit;
+   this.numClusters = numClusters;
+   this.numVectors = numVectors;
+   this.numNonZeros = numNonZeros;
+   this.cardinality = cardinality;
+
+   // Fill the random source lists before allocating the (still empty) result arrays.
+   setUpVectors(cardinality, numNonZeros, numVectors);
+
+   this.vectors = new Vector[3][numVectors];
+   this.clusters = new Vector[numClusters];
+ }
+
+ /**
+  * Generates {@code numVectors} random sparse vectors and records, for each of them, the
+  * vector itself plus its non-zero indices and values in the three parallel
+  * {@code randomVector*} lists.
+  * <p>
+  * Each vector gets exactly {@code numNonZeros} distinct indices drawn uniformly from
+  * {@code [0, cardinality)} with Gaussian values.
+  *
+  * @param cardinality dimension of each vector
+  * @param numNonZeros number of distinct non-zero entries per vector
+  * @param numVectors  number of vectors to generate
+  * @throws IllegalArgumentException if vectors are requested and {@code numNonZeros} is
+  *         negative or larger than {@code cardinality}
+  */
+ private void setUpVectors(int cardinality, int numNonZeros, int numVectors) {
+   // The rejection loop below needs numNonZeros distinct indices below cardinality; when that
+   // is impossible it would spin forever, so fail fast with a clear message instead.
+   if (numVectors > 0 && (numNonZeros < 0 || numNonZeros > cardinality)) {
+     throw new IllegalArgumentException("numNonZeros (" + numNonZeros
+         + ") must be between 0 and the vector cardinality (" + cardinality + ')');
+   }
+
+   for (int i = 0; i < numVectors; i++) {
+     Vector v = new SequentialAccessSparseVector(cardinality, numNonZeros); // sparsity!
+     BitSet featureSpace = new BitSet(cardinality); // indices already taken for this vector
+     int[] indexes = new int[numNonZeros];
+     double[] values = new double[numNonZeros];
+     int j = 0;
+     while (j < numNonZeros) {
+       double value = r.nextGaussian();
+       int index = r.nextInt(cardinality);
+       // Retry on a duplicate index or an exact zero so every slot is a real non-zero.
+       if (!featureSpace.get(index) && value != 0) {
+         featureSpace.set(index);
+         indexes[j] = index;
+         values[j++] = value;
+         v.set(index, value);
+       }
+     }
+     randomVectorIndices.add(indexes);
+     randomVectorValues.add(values);
+     randomVectors.add(v);
+   }
+ }
+
+ /**
+  * Logs and records a benchmark result together with extra free-text {@code content},
+  * using a multiplier of 1.
+  *
+  * @param stats         timing statistics of the run
+  * @param benchmarkName name of the benchmark (row key)
+  * @param implName      label of the implementation (column key)
+  * @param content       additional text included in the log message
+  */
+ void printStats(TimingStatistics stats, String benchmarkName, String implName, String content) {
+ printStats(stats, benchmarkName, implName, content, 1);
+ }
+
+ /**
+  * Logs and records a benchmark result with empty extra content and a multiplier of 1.
+  *
+  * @param stats         timing statistics of the run
+  * @param benchmarkName name of the benchmark (row key)
+  * @param implName      label of the implementation (column key)
+  */
+ void printStats(TimingStatistics stats, String benchmarkName, String implName) {
+ printStats(stats, benchmarkName, implName, "", 1);
+ }
+
+ /**
+  * Logs one benchmark result and stores its tokenized text in {@code statsMap}.
+  * <p>
+  * The implementation label is given an id on first use; the result is stored at that
+  * position of the benchmark's list, padding skipped positions with {@code EMPTY}.
+  *
+  * @param stats         timing statistics of the run
+  * @param benchmarkName name of the benchmark (key of {@code statsMap})
+  * @param implName      label of the implementation (key of {@code implType})
+  * @param content       additional text included in the log message
+  * @param multiplier    factor applied to the reported data rate
+  */
+ private void printStats(TimingStatistics stats, String benchmarkName, String implName,
+     String content, int multiplier) {
+   // The factor 12 is presumably the bytes per non-zero entry (int index + double value) -
+   // TODO confirm. getSumTime() is treated as nanoseconds by the 1e9 factor below.
+   float speed = multiplier * stats.getNCalls() * (numNonZeros * 1000.0f * 12 / stats.getSumTime());
+   float opsPerSec = stats.getNCalls() * 1000000000.0f / stats.getSumTime();
+   String opsText = DF.format(opsPerSec);
+   String speedText = DF.format(speed);
+
+   log.info("{} {} \n{} {} \nOps = {} Units/sec\nIOps = {} MBytes/sec", benchmarkName,
+       implName, content, stats.toString(), opsText, speedText);
+
+   // Look up the implementation id, allocating the next free one for a new label.
+   Integer implId = implType.get(implName);
+   if (implId == null) {
+     implId = implType.size();
+     implType.put(implName, implId);
+   }
+
+   List<String[]> implStats = statsMap.get(benchmarkName);
+   if (implStats == null) {
+     implStats = new ArrayList<String[]>();
+     statsMap.put(benchmarkName, implStats);
+   }
+
+   // Make sure position implId exists before overwriting it.
+   for (int slot = implStats.size(); slot <= implId; slot++) {
+     implStats.add(EMPTY);
+   }
+
+   String line = stats + "\tSpeed = " + opsText + " /sec\tRate = " + speedText + " MB/s";
+   implStats.set(implId, TAB_NEWLINE_PATTERN.split(line));
+ }
+
+ public void createData() {
+ for (int i = 0; i < Math.max(numVectors, numClusters); ++i) {
+ vectors[0][vIndex(i)] = new DenseVector(randomVectors.get(vIndex(i)));
+ vectors[1][vIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i)));
+ vectors[2][vIndex(i)] = new SequentialAccessSparseVector(randomVectors.get(vIndex(i)));
+ if (numClusters > 0) {
+ clusters[cIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i)));
+ }
+ }
+ }
+
+ public void createBenchmark() {
+ printStats(runner.benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ vectors[0][vIndex(i)] = new DenseVector(randomVectors.get(vIndex(i)));
+ return depends(vectors[0][vIndex(i)]);
+ }
+ }), CREATE_COPY, DENSE_VECTOR);
+
+ printStats(runner.benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ vectors[1][vIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i)));
+ return depends(vectors[1][vIndex(i)]);
+ }
+ }), CREATE_COPY, RAND_SPARSE_VECTOR);
+
+ printStats(runner.benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ vectors[2][vIndex(i)] = new SequentialAccessSparseVector(randomVectors.get(vIndex(i)));
+ return depends(vectors[2][vIndex(i)]);
+ }
+ }), CREATE_COPY, SEQ_SPARSE_VECTOR);
+
+ if (numClusters > 0) {
+ printStats(runner.benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ clusters[cIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i)));
+ return depends(clusters[cIndex(i)]);
+ }
+ }), CREATE_COPY, CLUSTERS);
+ }
+ }
+
+ /**
+  * Fills {@code v} entry by entry, in shuffled order, with the non-zero elements recorded for
+  * source vector {@code randomIndex}, timing only the insertion loop.
+  * <p>
+  * The shuffle draws from the class's {@code RandomUtils} generator rather than an unseeded
+  * default source, so the insertion order follows the same seed as the generated data.
+  *
+  * @param stats       statistics object that receives the timed call
+  * @param randomIndex position of the source vector in the {@code randomVector*} lists
+  * @param v           vector to fill
+  * @param useSetQuick {@code true} to insert with {@code setQuick}, {@code false} for {@code set}
+  * @return the result of {@code call.end(maxTimeUsec)}; callers stop looping when it is
+  *         {@code true}
+  */
+ private boolean buildVectorIncrementally(TimingStatistics stats, int randomIndex, Vector v, boolean useSetQuick) {
+   int[] indexes = randomVectorIndices.get(randomIndex);
+   double[] values = randomVectorValues.get(randomIndex);
+
+   // Build a random permutation of the entry positions outside the timed section.
+   List<Integer> randomOrder = new ArrayList<>(indexes.length);
+   for (int i = 0; i < indexes.length; i++) {
+     randomOrder.add(i);
+   }
+   Collections.shuffle(randomOrder, r);
+   // Unbox once so the timed loop does not pay for Integer unboxing.
+   int[] permutation = new int[randomOrder.size()];
+   for (int i = 0; i < permutation.length; i++) {
+     permutation[i] = randomOrder.get(i);
+   }
+
+   TimingStatistics.Call call = stats.newCall(leadTimeUsec);
+   if (useSetQuick) {
+     for (int i : permutation) {
+       v.setQuick(indexes[i], values[i]);
+     }
+   } else {
+     for (int i : permutation) {
+       v.set(indexes[i], values[i]);
+     }
+   }
+   return call.end(maxTimeUsec);
+ }
+
+ public void incrementalCreateBenchmark() {
+ TimingStatistics stats = new TimingStatistics();
+ for (int i = 0; i < loop; i++) {
+ vectors[0][vIndex(i)] = new DenseVector(cardinality);
+ if (buildVectorIncrementally(stats, vIndex(i), vectors[0][vIndex(i)], false)) {
+ break;
+ }
+ }
+ printStats(stats, CREATE_INCREMENTALLY, DENSE_VECTOR);
+
+ stats = new TimingStatistics();
+ for (int i = 0; i < loop; i++) {
+ vectors[1][vIndex(i)] = new RandomAccessSparseVector(cardinality);
+ if (buildVectorIncrementally(stats, vIndex(i), vectors[1][vIndex(i)], false)) {
+ break;
+ }
+ }
+ printStats(stats, CREATE_INCREMENTALLY, RAND_SPARSE_VECTOR);
+
+ stats = new TimingStatistics();
+ for (int i = 0; i < loop; i++) {
+ vectors[2][vIndex(i)] = new SequentialAccessSparseVector(cardinality);
+ if (buildVectorIncrementally(stats, vIndex(i), vectors[2][vIndex(i)], false)) {
+ break;
+ }
+ }
+ printStats(stats, CREATE_INCREMENTALLY, SEQ_SPARSE_VECTOR);
+
+ if (numClusters > 0) {
+ stats = new TimingStatistics();
+ for (int i = 0; i < loop; i++) {
+ clusters[cIndex(i)] = new RandomAccessSparseVector(cardinality);
+ if (buildVectorIncrementally(stats, vIndex(i), clusters[cIndex(i)], false)) {
+ break;
+ }
+ }
+ printStats(stats, CREATE_INCREMENTALLY, CLUSTERS);
+ }
+ }
+
+ /**
+  * Wraps a running counter into the range of vector slots.
+  *
+  * @param i running counter
+  * @return {@code i % numVectors}
+  * @throws ArithmeticException if {@code numVectors} is zero
+  */
+ public int vIndex(int i) {
+ return i % numVectors;
+ }
+
+ /**
+  * Wraps a running counter into the range of cluster slots.
+  *
+  * @param i running counter
+  * @return {@code i % numClusters}
+  * @throws ArithmeticException if {@code numClusters} is zero
+  */
+ public int cIndex(int i) {
+ return i % numClusters;
+ }
+
+ /**
+  * Command-line entry point: parses the options, builds a {@link VectorBenchmarks} context,
+  * runs all benchmarks and logs the results as CSV.
+  * <p>
+  * Prints the help text and returns when help is requested or the options cannot be parsed.
+  *
+  * @param args command-line arguments
+  * @throws IOException propagated from the benchmarks
+  * @throws NumberFormatException if a numeric option value is not a valid integer
+  */
+ public static void main(String[] args) throws IOException {
+   DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+   ArgumentBuilder abuilder = new ArgumentBuilder();
+   GroupBuilder gbuilder = new GroupBuilder();
+
+   Option vectorSizeOpt = obuilder
+       .withLongName("vectorSize")
+       .withRequired(false)
+       .withArgument(abuilder.withName("vs").withDefault(1000000).create())
+       .withDescription("Cardinality of the vector. Default: 1000000").withShortName("vs").create();
+   // The help text used to say "Size of the vector", which describes vectorSize, not this option.
+   Option numNonZeroOpt = obuilder
+       .withLongName("numNonZero")
+       .withRequired(false)
+       .withArgument(abuilder.withName("nz").withDefault(1000).create())
+       .withDescription("Number of non-zero elements in each vector. Default: 1000")
+       .withShortName("nz").create();
+   Option numVectorsOpt = obuilder
+       .withLongName("numVectors")
+       .withRequired(false)
+       .withArgument(abuilder.withName("nv").withDefault(25).create())
+       .withDescription("Number of Vectors to create. Default: 25").withShortName("nv").create();
+   Option numClustersOpt = obuilder
+       .withLongName("numClusters")
+       .withRequired(false)
+       .withArgument(abuilder.withName("nc").withDefault(0).create())
+       .withDescription("Number of clusters to create. Set to non zero to run cluster benchmark. Default: 0")
+       .withShortName("nc").create();
+   Option numOpsOpt = obuilder
+       .withLongName("numOps")
+       .withRequired(false)
+       .withArgument(abuilder.withName("numOps").withDefault(10).create())
+       .withDescription(
+           "Number of operations to do per timer. "
+               + "E.g In distance measure, the distance is calculated numOps times"
+               + " and the total time is measured. Default: 10").withShortName("no").create();
+
+   Option helpOpt = DefaultOptionCreator.helpOption();
+
+   Group group = gbuilder.withName("Options").withOption(vectorSizeOpt).withOption(numNonZeroOpt)
+       .withOption(numVectorsOpt).withOption(numOpsOpt).withOption(numClustersOpt).withOption(helpOpt).create();
+
+   try {
+     Parser parser = new Parser();
+     parser.setGroup(group);
+     CommandLine cmdLine = parser.parse(args);
+
+     if (cmdLine.hasOption(helpOpt)) {
+       CommandLineUtil.printHelpWithGenericOptions(group);
+       return;
+     }
+
+     int cardinality = intOption(cmdLine, vectorSizeOpt, 1000000);
+     int numClusters = intOption(cmdLine, numClustersOpt, 0);
+     int numNonZero = intOption(cmdLine, numNonZeroOpt, 1000);
+     int numVectors = intOption(cmdLine, numVectorsOpt, 25);
+     int numOps = intOption(cmdLine, numOpsOpt, 10);
+
+     VectorBenchmarks mark = new VectorBenchmarks(cardinality, numNonZero, numVectors, numClusters, numOps);
+     runBenchmark(mark);
+
+     // log.info("\n{}", mark);
+     log.info("\n{}", mark.asCsvString());
+   } catch (OptionException e) {
+     CommandLineUtil.printHelp(group);
+   }
+ }
+
+ /**
+  * Reads an integer option, falling back to {@code fallback} when the option is absent.
+  * <p>
+  * The value is converted through {@code String.valueOf} instead of a {@code (String)} cast:
+  * when an option is given without a value, {@code getValue} may return the {@code Integer}
+  * default registered with {@code withDefault(...)}, and the cast would fail with a
+  * {@code ClassCastException}.
+  */
+ private static int intOption(CommandLine cmdLine, Option option, int fallback) {
+   if (!cmdLine.hasOption(option)) {
+     return fallback;
+   }
+   return Integer.parseInt(String.valueOf(cmdLine.getValue(option)));
+ }
+
+ /**
+  * Runs every benchmark against the given context in a fixed order: data set-up, creation,
+  * the per-operation benchmarks, the distance measures and, when clusters are enabled, the
+  * closest-centroid benchmarks.
+  *
+  * @param mark benchmark context that supplies the data and collects the results
+  * @throws IOException propagated from the benchmarks
+  */
+ private static void runBenchmark(VectorBenchmarks mark) throws IOException {
+   // The vectors (and clusters) must exist before any benchmark touches them.
+   mark.createData();
+   mark.createBenchmark();
+
+   // Incremental creation is skipped for cardinalities of 200000 and above: too slow.
+   boolean smallEnough = mark.cardinality < 200000;
+   if (smallEnough) {
+     mark.incrementalCreateBenchmark();
+   }
+
+   new CloneBenchmark(mark).benchmark();
+   new DotBenchmark(mark).benchmark();
+   new PlusBenchmark(mark).benchmark();
+   new MinusBenchmark(mark).benchmark();
+   new TimesBenchmark(mark).benchmark();
+   new SerializationBenchmark(mark).benchmark();
+
+   DistanceBenchmark distances = new DistanceBenchmark(mark);
+   distances.benchmark(new CosineDistanceMeasure());
+   distances.benchmark(new SquaredEuclideanDistanceMeasure());
+   distances.benchmark(new EuclideanDistanceMeasure());
+   distances.benchmark(new ManhattanDistanceMeasure());
+   distances.benchmark(new TanimotoDistanceMeasure());
+   distances.benchmark(new ChebyshevDistanceMeasure());
+   distances.benchmark(new MinkowskiDistanceMeasure());
+
+   // Centroid benchmarks need at least one cluster.
+   if (mark.numClusters <= 0) {
+     return;
+   }
+
+   ClosestCentroidBenchmark centroids = new ClosestCentroidBenchmark(mark);
+   centroids.benchmark(new CosineDistanceMeasure());
+   centroids.benchmark(new SquaredEuclideanDistanceMeasure());
+   centroids.benchmark(new EuclideanDistanceMeasure());
+   centroids.benchmark(new ManhattanDistanceMeasure());
+   centroids.benchmark(new TanimotoDistanceMeasure());
+   centroids.benchmark(new ChebyshevDistanceMeasure());
+   centroids.benchmark(new MinkowskiDistanceMeasure());
+ }
+
+ /**
+  * Renders the recorded results as CSV lines of the form
+  * {@code benchmarkName,implementationName,value}, sorted by benchmark name and followed by a
+  * trailing blank line.
+  * <p>
+  * The value is taken from token 7 of the stored statistics, between {@code '='} and
+  * {@code '/'}. That token is expected to be the {@code "Speed = ... /sec"} entry appended by
+  * {@code printStats}; its position depends on the {@code TimingStatistics.toString()} layout
+  * (TODO confirm). Entries with fewer than 8 tokens, i.e. the placeholders for
+  * implementations that did not run a benchmark, are skipped.
+  *
+  * @return the CSV text
+  */
+ private String asCsvString() {
+   List<String> keys = new ArrayList<>(statsMap.keySet());
+   Collections.sort(keys);
+
+   // Invert implType: implementation id -> label.
+   Map<Integer,String> implMap = new HashMap<>();
+   for (Entry<String,Integer> e : implType.entrySet()) {
+     implMap.put(e.getValue(), e.getKey());
+   }
+
+   StringBuilder sb = new StringBuilder(1000);
+   for (String benchmarkName : keys) {
+     List<String[]> perImpl = statsMap.get(benchmarkName);
+     // A list position IS the implementation id (see printStats), so the label must be looked
+     // up by position. Counting only the non-skipped rows mislabels every benchmark whose
+     // list starts with placeholder entries.
+     for (int implId = 0; implId < perImpl.size(); implId++) {
+       String[] stats = perImpl.get(implId);
+       if (stats.length < 8) {
+         continue;
+       }
+       sb.append(benchmarkName).append(',');
+       sb.append(implMap.get(implId)).append(',');
+       sb.append(stats[7].trim().split("=|/")[1].trim());
+       sb.append('\n');
+     }
+   }
+   sb.append('\n');
+   return sb.toString();
+ }
+
+ /**
+  * Renders the recorded results as a fixed-width text table.
+  * <p>
+  * The header row holds {@code "BenchMarks"} followed by the implementation labels in id order,
+  * each padded or cut to 24 characters. Each benchmark (sorted by name) then contributes one
+  * row per statistics token, with the benchmark name in the first column of its first row,
+  * followed by a blank line.
+  *
+  * @return the table text
+  */
+ @Override
+ public String toString() {
+   final int pad = 24;
+   StringBuilder sb = new StringBuilder(1000);
+
+   // Header: order the labels by their id (ids are 0..size-1, assigned by printStats).
+   sb.append(StringUtils.rightPad("BenchMarks", pad));
+   String[] implNames = new String[implType.size()];
+   for (Entry<String,Integer> e : implType.entrySet()) {
+     implNames[e.getValue()] = e.getKey();
+   }
+   for (String implName : implNames) {
+     sb.append(StringUtils.rightPad(implName, pad).substring(0, pad));
+   }
+   sb.append('\n');
+
+   List<String> keys = new ArrayList<>(statsMap.keySet());
+   Collections.sort(keys);
+   for (String benchmarkName : keys) {
+     List<String[]> implTokenizedStats = statsMap.get(benchmarkName);
+
+     // Number of rows = longest token list among the implementations.
+     int maxStats = 0;
+     for (String[] stat : implTokenizedStats) {
+       if (stat.length > maxStats) {
+         maxStats = stat.length;
+       }
+     }
+
+     for (int row = 0; row < maxStats; row++) {
+       // First column: benchmark name on the first row, blank afterwards.
+       String label = row == 0 ? benchmarkName : "";
+       sb.append(StringUtils.rightPad(label, pad));
+       for (String[] stats : implTokenizedStats) {
+         String cell = row < stats.length ? stats[row] : "";
+         sb.append(StringUtils.rightPad(cell, pad));
+       }
+       sb.append('\n');
+     }
+     sb.append('\n');
+   }
+   return sb.toString();
+ }
+
+ /**
+  * Returns the runner created in the constructor with {@code LEAD_TIME_MS} and
+  * {@code MAX_TIME_MS}.
+  *
+  * @return the benchmark runner of this context
+  */
+ public BenchmarkRunner getRunner() {
+ return runner;
+ }
+}