You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ki...@apache.org on 2015/04/12 14:49:54 UTC
[text] [SANDBOX-492] Create StringDistanceFrom class that contains a
StringMetric and the "left" side string. This would have a method that
accepts the "right" side string to test.
Repository: commons-text
Updated Branches:
refs/heads/master a5ac07106 -> 6280d46c5
[SANDBOX-492] Create StringDistanceFrom class that contains a StringMetric and the "left" side string.
This would have a method that accepts the "right" side string to test.
This fixes #3 from GitHub. Thanks to Jonathan Baker.
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/6280d46c
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/6280d46c
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/6280d46c
Branch: refs/heads/master
Commit: 6280d46c5d5e87e0491573efa969fcf4a6397cd7
Parents: a5ac071
Author: j--baker <j-...@users.noreply.github.com>
Authored: Wed Mar 4 15:20:18 2015 -0500
Committer: Bruno P. Kinoshita <br...@yahoo.com.br>
Committed: Mon Apr 13 00:48:45 2015 +1200
----------------------------------------------------------------------
src/changes/changes.xml | 1 +
.../text/similarity/StringMetricFrom.java | 111 +++++++++++++++++++
.../ParameterizedStringMetricFromTest.java | 92 +++++++++++++++
.../text/similarity/StringMetricFromTest.java | 71 ++++++++++++
4 files changed, 275 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/6280d46c/src/changes/changes.xml
----------------------------------------------------------------------
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 4ac588a..45ac073 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -22,6 +22,7 @@
<body>
<release version="1.0" date="tba" description="tba">
+ <action issue="SANDBOX-492" type="fix" dev="kinow" due-to="Jonathan Baker">Create StringDistanceFrom class that contains a StringMetric and the "left" side string. This would have a method that accepts the "right" side string to test.</action>
<action issue="SANDBOX-490" type="add" dev="kinow">Add Cosine Similarity and Cosine Distance</action>
<action issue="SANDBOX-493" type="fix" dev="kinow" due-to="Jonathan Baker">Change (R) StringMetric.compare(CS left, CS right) to "apply" so that it is consistent with BiFunction.</action>
<action issue="SANDBOX-491" type="fix" dev="kinow" due-to="Jonathan Baker">Allow extra information (e.g. Levenshtein threshold) to be stored as (final) fields in the StringMetric instance.</action>
http://git-wip-us.apache.org/repos/asf/commons-text/blob/6280d46c/src/main/java/org/apache/commons/text/similarity/StringMetricFrom.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/StringMetricFrom.java b/src/main/java/org/apache/commons/text/similarity/StringMetricFrom.java
new file mode 100644
index 0000000..3b2a871
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/StringMetricFrom.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+/**
+ * <p>
+ * This stores a {@link StringMetric} implementation and a {@link CharSequence} "left" string.
+ * The {@link #apply(CharSequence right)} method accepts the "right" string and invokes the
+ * comparison function for the pair of strings.
+ * </p>
+ *
+ * <p>
+ * The following is an example which finds the most similar string:
+ * </p>
+ * <pre>{@code
+ * StringMetric<Integer> metric = new LevenshteinDistance();
+ * String target = "Apache";
+ * StringMetricFrom<Integer> metricFrom =
+ * new StringMetricFrom<Integer>(metric, target);
+ * String mostSimilar = null;
+ * Integer shortestDistance = null;
+ *
+ * for (String test : new String[] { "Appaloosa", "a patchy", "apple" }) {
+ * Integer distance = metricFrom.apply(test);
+ * if (shortestDistance == null || distance < shortestDistance) {
+ * shortestDistance = distance;
+ * mostSimilar = test;
+ * }
+ * }
+ *
+ * System.out.println("The string most similar to \"" + target + "\" "
+ * + "is \"" + mostSimilar + "\" because "
+ * + "its distance is only " + shortestDistance + ".");
+ * }</pre>
+ *
+ * @param <R> This is the type of similarity score used by the StringMetric function.
+ */
+public class StringMetricFrom<R> {
+
+ /**
+ * Metric applied to the stored "left" string and each "right" string; never null.
+ */
+ private final StringMetric<R> metric;
+ /**
+ * Left string passed as the first argument to the metric; may be null.
+ */
+ private final CharSequence left;
+
+ /**
+ * <p>This accepts the metric implementation and the "left" string.</p>
+ *
+ * @param metric This may not be null.
+ * @param left This may be null here, but the {@link StringMetric#apply(CharSequence, CharSequence)}
+ * implementation may not accept nulls.
+ * @throws IllegalArgumentException if {@code metric} is null
+ */
+ public StringMetricFrom(final StringMetric<R> metric, final CharSequence left) {
+ if (metric == null) {
+ throw new IllegalArgumentException("The metric may not be null.");
+ }
+
+ this.metric = metric;
+ this.left = left;
+ }
+
+ /**
+ * <p>
+ * This compares the stored "left" string against the "right" parameter
+ * by delegating to the "metric" implementation as {@code metric.apply(left, right)}.
+ * </p>
+ *
+ * @param right the second CharSequence, passed to the metric unchanged
+ * @return the value the metric returns for the stored left string and {@code right}
+ */
+ public R apply(CharSequence right) {
+ return metric.apply(left, right);
+ }
+
+ /**
+ * Gets the left parameter.
+ *
+ * @return the left parameter given to the constructor; may be null
+ */
+ public CharSequence getLeft() {
+ return left;
+ }
+
+ /**
+ * Gets the metric.
+ *
+ * @return the metric given to the constructor; never null
+ */
+ public StringMetric<R> getMetric() {
+ return metric;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/6280d46c/src/test/java/org/apache/commons/text/similarity/ParameterizedStringMetricFromTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/ParameterizedStringMetricFromTest.java b/src/test/java/org/apache/commons/text/similarity/ParameterizedStringMetricFromTest.java
new file mode 100644
index 0000000..36c03bb
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/ParameterizedStringMetricFromTest.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.hamcrest.core.IsEqual.equalTo;
+import static org.junit.Assert.assertThat;
+
+import java.util.Arrays;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+/**
+ * Parameterized unit tests for {@link org.apache.commons.text.similarity.StringMetricFrom}.
+ * Each row of {@link #parameters()} holds a metric, a "left" string, a "right" string and the expected result.
+ * @param <R> The {@link StringMetric} return type.
+ */
+@RunWith(Parameterized.class)
+public class ParameterizedStringMetricFromTest<R> {
+
+ private final StringMetric<R> metric; // metric handed to StringMetricFrom
+ private final CharSequence left; // string bound in the StringMetricFrom constructor
+ private final CharSequence right; // string passed to StringMetricFrom.apply
+ private final R distance; // value apply(right) is expected to equal
+
+ public ParameterizedStringMetricFromTest(
+ final StringMetric<R> metric,
+ final CharSequence left, final CharSequence right,
+ final R distance) {
+
+ this.metric = metric;
+ this.left = left;
+ this.right = right;
+ this.distance = distance;
+ }
+
+ @Parameters // row layout: { metric, left, right, expected result }
+ public static Iterable<Object[]> parameters() {
+ return Arrays.asList( new Object[][] {
+
+ /* TODO: When SANDBOX-491 is ready, add a few FuzzyScore tests. */
+
+ { new HammingDistance(), "Sam I am.", "Ham I am.", 1 },
+ { new HammingDistance(), "Japtheth, Ham, Shem", "Japtheth, HAM, Shem", 2 },
+ { new HammingDistance(), "Hamming", "Hamming", 0 },
+
+ { new JaroWrinklerDistance(), "elephant", "hippo", 0.44 },
+ { new JaroWrinklerDistance(), "hippo", "elephant", 0.44 },
+ { new JaroWrinklerDistance(), "hippo", "zzzzzzzz", 0.0 },
+
+ /* TODO: When SANDBOX-491 is ready, add a few limited/threshold tests. */
+ { new LevenshteinDistance(), "Apache", "a patchy", 4 },
+ { new LevenshteinDistance(), "go", "no go", 3 },
+ { new LevenshteinDistance(), "go", "go", 0 },
+
+ { // ad hoc metric with a non-numeric result: true for the same reference or when left.equals(right)
+ new StringMetric<Boolean>() {
+ public Boolean apply(CharSequence left, CharSequence right) {
+ return left == right || (left != null && left.equals(right));
+ }
+ },
+ "Bob's your uncle.",
+ "Every good boy does fine.",
+ false
+ }
+
+ } );
+ }
+
+ @Test
+ public void test() { // binds "left", applies "right", and compares with the expected value of the row
+ StringMetricFrom<R> metricFrom = new StringMetricFrom<R>(metric, left);
+ assertThat(metricFrom.apply(right), equalTo(distance));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/6280d46c/src/test/java/org/apache/commons/text/similarity/StringMetricFromTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/StringMetricFromTest.java b/src/test/java/org/apache/commons/text/similarity/StringMetricFromTest.java
new file mode 100644
index 0000000..e268366
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/StringMetricFromTest.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.hamcrest.core.IsEqual.equalTo;
+import static org.junit.Assert.assertThat;
+
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link org.apache.commons.text.similarity.StringMetricFrom}: delegation, the Javadoc example, and null-metric rejection.
+ */
+public class StringMetricFromTest {
+
+ @Test
+ public void testEquivalence() { // apply(right) must equal metric.apply(left, right)
+ StringMetric<Integer> metric = new LevenshteinDistance();
+ String left = "Apache";
+ String right = "a patchy";
+ Integer distance = 4; // expected distance for this pair
+ StringMetricFrom<Integer> metricFrom = new StringMetricFrom<Integer>(metric, left);
+
+ assertThat(metricFrom.apply(right), equalTo(distance));
+ assertThat(metricFrom.apply(right), equalTo(metric.apply(left, right)));
+ }
+
+ @Test
+ public void testJavadocExample() { // same code as the example in the StringMetricFrom class Javadoc
+ StringMetric<Integer> metric = new LevenshteinDistance();
+ String target = "Apache";
+ StringMetricFrom<Integer> metricFrom =
+ new StringMetricFrom<Integer>(metric, target);
+ String mostSimilar = null;
+ Integer shortestDistance = null;
+
+ for (String test : new String[] { "Appaloosa", "a patchy", "apple" }) {
+ Integer distance = metricFrom.apply(test);
+ if (shortestDistance == null || distance < shortestDistance) { // strict '<' keeps the first candidate on ties
+ shortestDistance = distance;
+ mostSimilar = test;
+ }
+ }
+
+ System.out.println("The string most similar to \"" + target + "\" " // output is illustrative; the assertions below do the checking
+ + "is \"" + mostSimilar + "\" because "
+ + "its distance is only " + shortestDistance + ".");
+
+ assertThat(mostSimilar, equalTo("a patchy"));
+ assertThat(shortestDistance, equalTo(4));
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testMissingMetric() { // the constructor must reject a null metric
+ new StringMetricFrom<Number>(null, "no go");
+ }
+
+}