You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ki...@apache.org on 2015/04/12 14:49:54 UTC

[text] [SANDBOX-492] Create StringDistanceFrom class that contains a StringMetric and the "left" side string. This would have a method that accepts the "right" side string to test.

Repository: commons-text
Updated Branches:
  refs/heads/master a5ac07106 -> 6280d46c5


[SANDBOX-492] Create StringDistanceFrom class that contains a StringMetric and the "left" side string.
This would have a method that accepts the "right" side string to test.

This fixes #3 from GitHub. Thanks to Jonathan Baker.


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/6280d46c
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/6280d46c
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/6280d46c

Branch: refs/heads/master
Commit: 6280d46c5d5e87e0491573efa969fcf4a6397cd7
Parents: a5ac071
Author: j--baker <j-...@users.noreply.github.com>
Authored: Wed Mar 4 15:20:18 2015 -0500
Committer: Bruno P. Kinoshita <br...@yahoo.com.br>
Committed: Mon Apr 13 00:48:45 2015 +1200

----------------------------------------------------------------------
 src/changes/changes.xml                         |   1 +
 .../text/similarity/StringMetricFrom.java       | 111 +++++++++++++++++++
 .../ParameterizedStringMetricFromTest.java      |  92 +++++++++++++++
 .../text/similarity/StringMetricFromTest.java   |  71 ++++++++++++
 4 files changed, 275 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/6280d46c/src/changes/changes.xml
----------------------------------------------------------------------
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 4ac588a..45ac073 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -22,6 +22,7 @@
   <body>
 
   <release version="1.0" date="tba" description="tba">
+    <action issue="SANDBOX-492" type="fix" dev="kinow" due-to="Jonathan Baker">Create StringDistanceFrom class that contains a StringMetric and the "left" side string.  This would have a method that accepts the "right" side string to test.</action>
     <action issue="SANDBOX-490" type="add" dev="kinow">Add Cosine Similarity and Cosine Distance</action>
     <action issue="SANDBOX-493" type="fix" dev="kinow" due-to="Jonathan Baker">Change (R) StringMetric.compare(CS left, CS right) to "apply" so that it is consistent with BiFunction.</action>
     <action issue="SANDBOX-491" type="fix" dev="kinow" due-to="Jonathan Baker">Allow extra information (e.g. Levenshtein threshold) to be stored as (final) fields in the StringMetric instance.</action>

http://git-wip-us.apache.org/repos/asf/commons-text/blob/6280d46c/src/main/java/org/apache/commons/text/similarity/StringMetricFrom.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/StringMetricFrom.java b/src/main/java/org/apache/commons/text/similarity/StringMetricFrom.java
new file mode 100644
index 0000000..3b2a871
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/StringMetricFrom.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+/**
+ * <p>
+ * This stores a {@link StringMetric} implementation and a {@link CharSequence} "left" string.
+ * The {@link #apply(CharSequence right)} method accepts the "right" string and invokes the
+ * metric's {@code apply(left, right)} function for the pair of strings.
+ * </p>
+ *
+ * <p>
+ * The following is an example which finds the most similar string:
+ * </p>
+ * <pre>
+ * StringMetric&lt;Integer&gt; metric = new LevenshteinDistance();
+ * String target = "Apache";
+ * StringMetricFrom&lt;Integer&gt; metricFrom =
+ *     new StringMetricFrom&lt;Integer&gt;(metric, target);
+ * String mostSimilar = null;
+ * Integer shortestDistance = null;
+ *
+ * for (String test : new String[] { "Appaloosa", "a patchy", "apple" }) {
+ *     Integer distance = metricFrom.apply(test);
+ *     if (shortestDistance == null || distance &lt; shortestDistance) {
+ *         shortestDistance = distance;
+ *         mostSimilar = test;
+ *     }
+ * }
+ *
+ * System.out.println("The string most similar to \"" + target + "\" "
+ *     + "is \"" + mostSimilar + "\" because "
+ *     + "its distance is only " + shortestDistance + ".");
+ * </pre>
+ *
+ * @param <R> This is the type of similarity score used by the StringMetric function.
+ */
+public class StringMetricFrom<R> {
+
+    /**
+     * String metric that every {@link #apply(CharSequence)} call delegates to; never null.
+     */
+    private final StringMetric<R> metric;
+    /**
+     * Left argument passed to the metric on every call; stored as given, so it may be null.
+     */
+    private final CharSequence left;
+
+    /**
+     * <p>This accepts the metric implementation and the "left" string.</p>
+     *
+     * @param metric This may not be null.
+     * @param left This may be null here, but the StringMetric#apply(CharSequence left,
+     *             CharSequence right) implementation may not accept nulls.
+     * @throws IllegalArgumentException if the metric is null.
+     */
+    public StringMetricFrom(final StringMetric<R> metric, final CharSequence left) {
+        if (metric == null) {
+            throw new IllegalArgumentException("The metric may not be null.");
+        }
+
+        this.metric = metric;
+        this.left = left;
+    }
+
+    /**
+     * <p>
+     * This compares "left" field against the "right" parameter
+     * using the "metric" implementation.
+     * </p>
+     *
+     * @param right the second CharSequence; passed to the metric unchecked
+     * @return the value returned by the metric for the stored "left" and the given "right"
+     */
+    public R apply(CharSequence right) {
+        return metric.apply(left, right);
+    }
+
+    /**
+     * Gets the left parameter.
+     *
+     * @return the left parameter
+     */
+    public CharSequence getLeft() {
+        return left;
+    }
+
+    /**
+     * Gets the metric.
+     *
+     * @return the metric
+     */
+    public StringMetric<R> getMetric() {
+        return metric;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/6280d46c/src/test/java/org/apache/commons/text/similarity/ParameterizedStringMetricFromTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/ParameterizedStringMetricFromTest.java b/src/test/java/org/apache/commons/text/similarity/ParameterizedStringMetricFromTest.java
new file mode 100644
index 0000000..36c03bb
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/ParameterizedStringMetricFromTest.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.hamcrest.core.IsEqual.equalTo;
+import static org.junit.Assert.assertThat;
+
+import java.util.Arrays;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+/**
+ * Parameterized tests: for each (metric, left, right, expected) row from {@link #parameters()},
+ * {@link StringMetricFrom#apply(CharSequence)} called with "right" must equal the expected value.
+ * @param <R> The {@link StringMetric} return type.
+ */
+@RunWith(Parameterized.class)
+public class ParameterizedStringMetricFromTest<R> {
+
+    private final StringMetric<R> metric;
+    private final CharSequence left;
+    private final CharSequence right;
+    private final R distance;
+
+    public ParameterizedStringMetricFromTest(
+        final StringMetric<R> metric,
+        final CharSequence left, final CharSequence right,
+        final R distance) {
+
+        this.metric = metric;
+        this.left = left;
+        this.right = right;
+        this.distance = distance;
+    }
+
+    @Parameters
+    public static Iterable<Object[]> parameters() {
+        return Arrays.asList( new Object[][] {
+
+            /* TODO: When SANDBOX-491 is ready, add a few FuzzyScore tests. */
+
+            { new HammingDistance(), "Sam I am.", "Ham I am.", 1 },
+            { new HammingDistance(), "Japtheth, Ham, Shem", "Japtheth, HAM, Shem", 2 },
+            { new HammingDistance(), "Hamming", "Hamming", 0 },
+
+            { new JaroWrinklerDistance(), "elephant", "hippo", 0.44 },
+            { new JaroWrinklerDistance(), "hippo", "elephant",  0.44 },
+            { new JaroWrinklerDistance(), "hippo", "zzzzzzzz", 0.0 },
+
+            /* TODO: When SANDBOX-491 is ready, add a few limited/threshold tests. */
+            { new LevenshteinDistance(), "Apache", "a patchy", 4 },
+            { new LevenshteinDistance(), "go", "no go", 3 },
+            { new LevenshteinDistance(), "go", "go", 0 },
+
+            {
+                new StringMetric<Boolean>() {
+                    public Boolean apply(CharSequence left, CharSequence right) {
+                        return left == right || (left != null && left.equals(right));
+                    }
+                },
+                "Bob's your uncle.",
+                "Every good boy does fine.",
+                false
+            }
+
+        } );
+    }
+
+    @Test
+    public void test() {
+        StringMetricFrom<R> metricFrom = new StringMetricFrom<R>(metric, left);
+        assertThat(metricFrom.apply(right), equalTo(distance));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/6280d46c/src/test/java/org/apache/commons/text/similarity/StringMetricFromTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/StringMetricFromTest.java b/src/test/java/org/apache/commons/text/similarity/StringMetricFromTest.java
new file mode 100644
index 0000000..e268366
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/StringMetricFromTest.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.hamcrest.core.IsEqual.equalTo;
+import static org.junit.Assert.assertThat;
+
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link StringMetricFrom}: delegation, the Javadoc example, and null-metric rejection.
+ */
+public class StringMetricFromTest {
+
+    @Test
+    public void testEquivalence() {
+        StringMetric<Integer> metric = new LevenshteinDistance();
+        String left = "Apache";
+        String right = "a patchy";
+        Integer distance = 4;
+        StringMetricFrom<Integer> metricFrom = new StringMetricFrom<Integer>(metric, left);
+
+        assertThat(metricFrom.apply(right), equalTo(distance));
+        assertThat(metricFrom.apply(right), equalTo(metric.apply(left, right)));
+    }
+
+    @Test
+    public void testJavadocExample() {
+        StringMetric<Integer> metric = new LevenshteinDistance();
+        String target = "Apache";
+        StringMetricFrom<Integer> metricFrom =
+            new StringMetricFrom<Integer>(metric, target);
+        String mostSimilar = null;
+        Integer shortestDistance = null;
+        
+        for (String test : new String[] { "Appaloosa", "a patchy", "apple" }) {
+            Integer distance = metricFrom.apply(test);
+            if (shortestDistance == null || distance < shortestDistance) {
+                shortestDistance = distance;
+                mostSimilar = test;
+            }
+        }
+       
+        System.out.println("The string most similar to \"" + target + "\" "
+            + "is \"" + mostSimilar + "\" because "
+            + "its distance is only " + shortestDistance + ".");
+
+        assertThat(mostSimilar, equalTo("a patchy"));
+        assertThat(shortestDistance, equalTo(4));
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testMissingMetric() {
+        new StringMetricFrom<Number>(null, "no go");
+    }
+
+}