You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ki...@apache.org on 2015/02/22 01:22:43 UTC
[text] Initial implementation of the cosine distance for strings (not
sequences)
Repository: commons-text
Updated Branches:
refs/heads/NEW-METRICS 0404dbf4b -> ff1959c84
Initial implementation of the cosine distance for strings (not sequences)
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/ff1959c8
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/ff1959c8
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/ff1959c8
Branch: refs/heads/NEW-METRICS
Commit: ff1959c84dce2eac6f2e8432623d2a2a270a5f32
Parents: 0404dbf
Author: Bruno P. Kinoshita <ki...@apache.org>
Authored: Sat Feb 21 22:22:28 2015 -0200
Committer: Bruno P. Kinoshita <ki...@apache.org>
Committed: Sat Feb 21 22:22:28 2015 -0200
----------------------------------------------------------------------
.../text/similarity/CosineSimilarity.java | 48 +++++++++++++++++-
.../text/similarity/CosineSimilarityTest.java | 51 ++++++++++++++++++++
2 files changed, 98 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/ff1959c8/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
index 4589c2d..ca9d087 100644
--- a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
+++ b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
@@ -16,6 +16,52 @@
*/
package org.apache.commons.text.similarity;
-public class CosineSimilarity {
+/**
+ * <p>Measures the Cosine similarity of two CharSequences. It treats the CharSequences as
+ * two vectors of an inner product space and compares the angle between them.</p>
+ *
+ * <p>
+ * For further explanation about the Cosine Similarity, take a look at its
+ * Wikipedia page at http://en.wikipedia.org/wiki/Cosine_similarity.
+ * </p>
+ *
+ * @since 0.1
+ */
+public class CosineSimilarity implements StringMetric<Double> {
+
+ @Override
+ public Double compare(CharSequence left, CharSequence right) {
+ if (left == null || right == null) {
+ throw new IllegalArgumentException("String parameters must not be null");
+ }
+ long dotProduct = dot(left, right);
+ double d1 = 0.0d;
+ for (int i = 0; i < left.length(); ++i) {
+ d1 += Math.pow(((int) left.charAt(i)), 2);
+ }
+ double d2 = 0.0d;
+ for (int i = 0; i < right.length(); ++i) {
+ d2 += Math.pow(((int) right.charAt(i)), 2);
+ }
+ double cosineSimilarity = dotProduct / (double) (Math.sqrt(d1) * Math.sqrt(d2));
+ return cosineSimilarity;
+ }
+
+ /**
+ * Computes the dot product of two CharSequences. It ignores remaining characters. It means
+ * that if a string is longer than other, then a smaller part of it will be used to compute
+ * the dot product.
+ *
+ * @param left left string
+ * @param right right string
+ * @return the dot product
+ */
+ protected long dot(CharSequence left, CharSequence right) {
+ long dotProduct = 0;
+ for (int i = 0; i < left.length() && i < right.length(); ++i) {
+ dotProduct += (((int) left.charAt(i)) * ((int) right.charAt(i)));
+ }
+ return dotProduct;
+ }
}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/ff1959c8/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java
new file mode 100644
index 0000000..aa08057
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.junit.Assert.assertEquals;
+
+import java.math.BigDecimal;
+import java.math.RoundingMode;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link org.apache.commons.text.similarity.CosineSimilarity}.
+ */
+public class CosineSimilarityTest {
+
+ private static CosineSimilarity cosineSimilarity;
+
+ @BeforeClass
+ public static void setUp() {
+ cosineSimilarity = new CosineSimilarity();
+ }
+
+ @Test
+ public void testCosineSimilarity() {
+ assertEquals(Double.valueOf(0.62d), roundValue(cosineSimilarity.compare("ABCDE", "AB")));
+ assertEquals(Double.valueOf(1.00d), roundValue(cosineSimilarity.compare("AB", "AB")));
+ }
+
+ // --- Utility methods
+
+ private Double roundValue(Double value) {
+ return (Double) new BigDecimal(value).setScale(2, RoundingMode.HALF_UP).doubleValue();
+ }
+
+}
Re: [text] Initial implementation of the cosine distance for strings
(not sequences)
Posted by Benedikt Ritter <br...@apache.org>.
Hello Bruno,
2015-02-22 1:22 GMT+01:00 <ki...@apache.org>:
> Repository: commons-text
> Updated Branches:
> refs/heads/NEW-METRICS 0404dbf4b -> ff1959c84
>
>
> Initial implementation of the cosine distance for strings (not sequences)
>
>
> Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
> Commit:
> http://git-wip-us.apache.org/repos/asf/commons-text/commit/ff1959c8
> Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/ff1959c8
> Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/ff1959c8
>
> Branch: refs/heads/NEW-METRICS
> Commit: ff1959c84dce2eac6f2e8432623d2a2a270a5f32
> Parents: 0404dbf
> Author: Bruno P. Kinoshita <ki...@apache.org>
> Authored: Sat Feb 21 22:22:28 2015 -0200
> Committer: Bruno P. Kinoshita <ki...@apache.org>
> Committed: Sat Feb 21 22:22:28 2015 -0200
>
> ----------------------------------------------------------------------
> .../text/similarity/CosineSimilarity.java | 48 +++++++++++++++++-
> .../text/similarity/CosineSimilarityTest.java | 51 ++++++++++++++++++++
> 2 files changed, 98 insertions(+), 1 deletion(-)
> ----------------------------------------------------------------------
>
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/ff1959c8/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
> ----------------------------------------------------------------------
> diff --git
> a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
> b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
> index 4589c2d..ca9d087 100644
> ---
> a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
> +++
> b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
> @@ -16,6 +16,52 @@
> */
> package org.apache.commons.text.similarity;
>
> -public class CosineSimilarity {
> +/**
> + * <p>Measures the Cosine similarity of two CharSequences. It treats the
> CharSequences as
> + * two vectors of an inner product space and compares the angle between
> them.</p>
> + *
> + * <p>
> + * For further explanation about the Cosine Similarity, take a look at its
> + * Wikipedia page at http://en.wikipedia.org/wiki/Cosine_similarity.
> + * </p>
> + *
> + * @since 0.1
> + */
> +public class CosineSimilarity implements StringMetric<Double> {
> +
> + @Override
> + public Double compare(CharSequence left, CharSequence right) {
> + if (left == null || right == null) {
> + throw new IllegalArgumentException("String parameters must
> not be null");
> + }
> + long dotProduct = dot(left, right);
> + double d1 = 0.0d;
> + for (int i = 0; i < left.length(); ++i) {
> + d1 += Math.pow(((int) left.charAt(i)), 2);
> + }
> + double d2 = 0.0d;
> + for (int i = 0; i < right.length(); ++i) {
> + d2 += Math.pow(((int) right.charAt(i)), 2);
> + }
> + double cosineSimilarity = dotProduct / (double) (Math.sqrt(d1) *
> Math.sqrt(d2));
> + return cosineSimilarity;
> + }
> +
> + /**
> + * Computes the dot product of two CharSequences. It ignores
> remaining characters. It means
> + * that if a string is longer than other, then a smaller part of it
> will be used to compute
> + * the dot product.
> + *
> + * @param left left string
> + * @param right right string
> + * @return the dot product
> + */
> + protected long dot(CharSequence left, CharSequence right) {
> + long dotProduct = 0;
> + for (int i = 0; i < left.length() && i < right.length(); ++i) {
> + dotProduct += (((int) left.charAt(i)) * ((int)
> right.charAt(i)));
> + }
> + return dotProduct;
> + }
>
> }
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/ff1959c8/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java
> ----------------------------------------------------------------------
> diff --git
> a/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java
> b/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java
> new file mode 100644
> index 0000000..aa08057
> --- /dev/null
> +++
> b/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java
> @@ -0,0 +1,51 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.commons.text.similarity;
> +
> +import static org.junit.Assert.assertEquals;
> +
> +import java.math.BigDecimal;
> +import java.math.RoundingMode;
> +
> +import org.junit.BeforeClass;
> +import org.junit.Test;
> +
> +/**
> + * Unit tests for {@link
> org.apache.commons.text.similarity.CosineSimilarity}.
> + */
> +public class CosineSimilarityTest {
> +
> + private static CosineSimilarity cosineSimilarity;
> +
> + @BeforeClass
> + public static void setUp() {
> + cosineSimilarity = new CosineSimilarity();
> + }
> +
> + @Test
> + public void testCosineSimilarity() {
> + assertEquals(Double.valueOf(0.62d),
> roundValue(cosineSimilarity.compare("ABCDE", "AB")));
> + assertEquals(Double.valueOf(1.00d),
> roundValue(cosineSimilarity.compare("AB", "AB")));
>
I don't understand this test case. Why can't we check for the actual values?
> + }
> +
> + // --- Utility methods
> +
> + private Double roundValue(Double value) {
> + return (Double) new BigDecimal(value).setScale(2,
> RoundingMode.HALF_UP).doubleValue();
> + }
> +
> +}
>
>
--
http://people.apache.org/~britter/
http://www.systemoutprintln.de/
http://twitter.com/BenediktRitter
http://github.com/britter