You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by ki...@apache.org on 2014/11/20 05:54:46 UTC
commons-text git commit: SANDBOX-483 Incorporate String algorithms
from Commons Lang
Repository: commons-text
Updated Branches:
refs/heads/master 71b59af90 -> c0e8c0210
SANDBOX-483 Incorporate String algorithms from Commons Lang
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/c0e8c021
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/c0e8c021
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/c0e8c021
Branch: refs/heads/master
Commit: c0e8c021059d8300f90cf2123528a35d7c468b57
Parents: 71b59af
Author: Bruno P. Kinoshita <br...@yahoo.com.br>
Authored: Thu Nov 20 02:23:05 2014 -0200
Committer: Bruno P. Kinoshita <br...@yahoo.com.br>
Committed: Thu Nov 20 02:23:05 2014 -0200
----------------------------------------------------------------------
.../commons/text/similarity/FuzzyDistance.java | 130 +++++++
.../text/similarity/JaroWrinklerDistance.java | 373 +++++++++++++++++++
.../text/similarity/LevenshteinDistance.java | 258 +++++++++++++
.../commons/text/similarity/StringMetric.java | 45 +++
.../commons/text/similarity/package-info.java | 22 ++
.../text/similarity/TestFuzzyDistance.java | 75 ++++
.../similarity/TestJaroWrinklerDistance.java | 62 +++
.../similarity/TestLevenshteinDistance.java | 146 ++++++++
8 files changed, 1111 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java b/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java
new file mode 100644
index 0000000..8e9228a
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.Locale;
+
+/**
+ * <p>
+ * This string matching algorithm is similar to the algorithms of editors such
+ * as Sublime Text, TextMate, Atom and others. One point is given for every
+ * matched character. Subsequent matches yield two bonus points. A higher score
+ * indicates a higher similarity.
+ * </p>
+ *
+ * @since 1.0
+ */
+public class FuzzyDistance implements StringMetric<Integer> {
+
+ /**
+ * <p>
+ * Find the Fuzzy Distance which indicates the similarity score between two
+ * Strings. This method uses the default locale.
+ * </p>
+ *
+ * @param term a full term that should be matched against, must not be null
+ * @param query the query that will be matched against a term, must not be
+ * null
+ * @return result score
+ * @throws IllegalArgumentException if either String input {@code null}
+ */
+ @Override
+ public Integer compare(CharSequence term, CharSequence query) {
+ return compare(term, query, Locale.getDefault());
+ }
+
+ /**
+ * <p>
+ * Find the Fuzzy Distance which indicates the similarity score between two
+ * Strings.
+ * </p>
+ *
+ * <pre>
+ * StringUtils.getFuzzyDistance(null, null, null) = IllegalArgumentException
+ * StringUtils.getFuzzyDistance("", "", Locale.ENGLISH) = 0
+ * StringUtils.getFuzzyDistance("Workshop", "b", Locale.ENGLISH) = 0
+ * StringUtils.getFuzzyDistance("Room", "o", Locale.ENGLISH) = 1
+ * StringUtils.getFuzzyDistance("Workshop", "w", Locale.ENGLISH) = 1
+ * StringUtils.getFuzzyDistance("Workshop", "ws", Locale.ENGLISH) = 2
+ * StringUtils.getFuzzyDistance("Workshop", "wo", Locale.ENGLISH) = 4
+ * StringUtils.getFuzzyDistance("Apache Software Foundation", "asf", Locale.ENGLISH) = 3
+ * </pre>
+ *
+ * @param term a full term that should be matched against, must not be null
+ * @param query the query that will be matched against a term, must not be
+ * null
+ * @param locale This string matching logic is case insensitive. A locale is
+ * necessary to normalize both Strings to lower case.
+ * @return result score
+ * @throws IllegalArgumentException if either String input {@code null} or
+ * Locale input {@code null}
+ */
+ public Integer compare(CharSequence term, CharSequence query, Locale locale) {
+ if (term == null || query == null) {
+ throw new IllegalArgumentException("Strings must not be null");
+ } else if (locale == null) {
+ throw new IllegalArgumentException("Locale must not be null");
+ }
+
+ // fuzzy logic is case insensitive. We normalize the Strings to lower
+ // case right from the start. Turning characters to lower case
+ // via Character.toLowerCase(char) is unfortunately insufficient
+ // as it does not accept a locale.
+ final String termLowerCase = term.toString().toLowerCase(locale);
+ final String queryLowerCase = query.toString().toLowerCase(locale);
+
+ // the resulting score
+ int score = 0;
+
+ // the position in the term which will be scanned next for potential
+ // query character matches
+ int termIndex = 0;
+
+ // index of the previously matched character in the term
+ int previousMatchingCharacterIndex = Integer.MIN_VALUE;
+
+ for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) {
+ final char queryChar = queryLowerCase.charAt(queryIndex);
+
+ boolean termCharacterMatchFound = false;
+ for (; termIndex < termLowerCase.length()
+ && !termCharacterMatchFound; termIndex++) {
+ final char termChar = termLowerCase.charAt(termIndex);
+
+ if (queryChar == termChar) {
+ // simple character matches result in one point
+ score++;
+
+ // subsequent character matches further improve
+ // the score.
+ if (previousMatchingCharacterIndex + 1 == termIndex) {
+ score += 2;
+ }
+
+ previousMatchingCharacterIndex = termIndex;
+
+ // we can leave the nested loop. Every character in the
+ // query can match at most one character in the term.
+ termCharacterMatchFound = true;
+ }
+ }
+ }
+
+ return score;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java b/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
new file mode 100644
index 0000000..2dcd51c
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
@@ -0,0 +1,373 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+/**
+ * <p>
+ * The Jaro measure is the weighted sum of percentage of matched characters
+ * from each file and transposed characters. Winkler increased this measure
+ * for matching initial characters.
+ * </p>
+ *
+ * <p>
+ * This implementation is based on the Jaro Winkler similarity algorithm
+ * from <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
+ * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
+ * </p>
+ *
+ * <p>
+ * This code has been adapted from Apache Commons Lang 3.3.
+ * </p>
+ *
+ * @since 1.0
+ */
+public class JaroWrinklerDistance implements StringMetric<Double> {
+
+ /**
+ * Represents a failed index search.
+ */
+ public static final int INDEX_NOT_FOUND = -1;
+
+ /**
+ * <p>
+ * Find the Jaro Winkler Distance which indicates the similarity score
+ * between two Strings.
+ * </p>
+ *
+ * <pre>
+ * StringUtils.getJaroWinklerDistance(null, null) = IllegalArgumentException
+ * StringUtils.getJaroWinklerDistance("","") = 0.0
+ * StringUtils.getJaroWinklerDistance("","a") = 0.0
+ * StringUtils.getJaroWinklerDistance("aaapppp", "") = 0.0
+ * StringUtils.getJaroWinklerDistance("frog", "fog") = 0.93
+ * StringUtils.getJaroWinklerDistance("fly", "ant") = 0.0
+ * StringUtils.getJaroWinklerDistance("elephant", "hippo") = 0.44
+ * StringUtils.getJaroWinklerDistance("hippo", "elephant") = 0.44
+ * StringUtils.getJaroWinklerDistance("hippo", "zzzzzzzz") = 0.0
+ * StringUtils.getJaroWinklerDistance("hello", "hallo") = 0.88
+ * StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp") = 0.91
+ * StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.93
+ * StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.94
+ * StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA") = 0.9
+ * </pre>
+ *
+ * @param left the first String, must not be null
+ * @param right the second String, must not be null
+ * @return result distance
+ * @throws IllegalArgumentException if either String input {@code null}
+ */
+ @Override
+ public Double compare(CharSequence left, CharSequence right) {
+ final double DEFAULT_SCALING_FACTOR = 0.1;
+
+ if (left == null || right == null) {
+ throw new IllegalArgumentException("Strings must not be null");
+ }
+
+ final double jaro = score(left, right);
+ final int cl = commonPrefixLength(left, right);
+ final double matchScore = Math.round(jaro + (DEFAULT_SCALING_FACTOR
+ * cl * (1.0 - jaro)) * 100.0) / 100.0;
+
+ return matchScore;
+ }
+
+ // TODO: we can move these methods to a Util class, keep them here,
+ // create a common abstract class, shade lang-3.3...
+
+ /**
+ * Calculates the number of characters from the beginning of the strings
+ * that match exactly one-to-one, up to a maximum of four (4) characters.
+ *
+ * @param first The first string.
+ * @param second The second string.
+ * @return A number between 0 and 4.
+ */
+ private static int commonPrefixLength(final CharSequence first,
+ final CharSequence second) {
+ final int result = getCommonPrefix(first.toString(), second.toString())
+ .length();
+
+ // Limit the result to 4.
+ return result > 4 ? 4 : result;
+ }
+
+ /**
+ * <p>
+ * Compares all Strings in an array and returns the initial sequence of
+ * characters that is common to all of them.
+ * </p>
+ *
+ * <p>
+ * For example,
+ * <code>getCommonPrefix(new String[] {"i am a machine", "i am a robot"}) -> "i am a "</code>
+ * </p>
+ *
+ * <pre>
+ * StringUtils.getCommonPrefix(null) = ""
+ * StringUtils.getCommonPrefix(new String[] {}) = ""
+ * StringUtils.getCommonPrefix(new String[] {"abc"}) = "abc"
+ * StringUtils.getCommonPrefix(new String[] {null, null}) = ""
+ * StringUtils.getCommonPrefix(new String[] {"", ""}) = ""
+ * StringUtils.getCommonPrefix(new String[] {"", null}) = ""
+ * StringUtils.getCommonPrefix(new String[] {"abc", null, null}) = ""
+ * StringUtils.getCommonPrefix(new String[] {null, null, "abc"}) = ""
+ * StringUtils.getCommonPrefix(new String[] {"", "abc"}) = ""
+ * StringUtils.getCommonPrefix(new String[] {"abc", ""}) = ""
+ * StringUtils.getCommonPrefix(new String[] {"abc", "abc"}) = "abc"
+ * StringUtils.getCommonPrefix(new String[] {"abc", "a"}) = "a"
+ * StringUtils.getCommonPrefix(new String[] {"ab", "abxyz"}) = "ab"
+ * StringUtils.getCommonPrefix(new String[] {"abcde", "abxyz"}) = "ab"
+ * StringUtils.getCommonPrefix(new String[] {"abcde", "xyz"}) = ""
+ * StringUtils.getCommonPrefix(new String[] {"xyz", "abcde"}) = ""
+ * StringUtils.getCommonPrefix(new String[] {"i am a machine", "i am a robot"}) = "i am a "
+ * </pre>
+ *
+ * @param strs array of String objects, entries may be null
+ * @return the initial sequence of characters that are common to all Strings
+ * in the array; empty String if the array is null, the elements are
+ * all null or if there is no common prefix.
+ * @since 2.4
+ */
+ public static String getCommonPrefix(final String... strs) {
+ if (strs == null || strs.length == 0) {
+ return "";
+ }
+ final int smallestIndexOfDiff = indexOfDifference(strs);
+ if (smallestIndexOfDiff == INDEX_NOT_FOUND) {
+ // all strings were identical
+ if (strs[0] == null) {
+ return "";
+ }
+ return strs[0];
+ } else if (smallestIndexOfDiff == 0) {
+ // there were no common initial characters
+ return "";
+ } else {
+ // we found a common initial character sequence
+ return strs[0].substring(0, smallestIndexOfDiff);
+ }
+ }
+
+ /**
+ * This method returns the Jaro-Winkler score for string matching.
+ *
+ * @param first the first string to be matched
+ * @param second the second string to be machted
+ * @return matching score without scaling factor impact
+ */
+ protected static double score(final CharSequence first,
+ final CharSequence second) {
+ String shorter;
+ String longer;
+
+ // Determine which String is longer.
+ if (first.length() > second.length()) {
+ longer = first.toString().toLowerCase();
+ shorter = second.toString().toLowerCase();
+ } else {
+ longer = second.toString().toLowerCase();
+ shorter = first.toString().toLowerCase();
+ }
+
+ // Calculate the half length() distance of the shorter String.
+ final int halflength = shorter.length() / 2 + 1;
+
+ // Find the set of matching characters between the shorter and longer
+ // strings. Note that
+ // the set of matching characters may be different depending on the
+ // order of the strings.
+ final String m1 = getSetOfMatchingCharacterWithin(shorter, longer,
+ halflength);
+ final String m2 = getSetOfMatchingCharacterWithin(longer, shorter,
+ halflength);
+
+ // If one or both of the sets of common characters is empty, then
+ // there is no similarity between the two strings.
+ if (m1.length() == 0 || m2.length() == 0) {
+ return 0.0;
+ }
+
+ // If the set of common characters is not the same size, then
+ // there is no similarity between the two strings, either.
+ if (m1.length() != m2.length()) {
+ return 0.0;
+ }
+
+ // Calculate the number of transposition between the two sets
+ // of common characters.
+ final int transpositions = transpositions(m1, m2);
+
+ // Calculate the distance.
+ final double dist = (m1.length() / ((double) shorter.length())
+ + m2.length() / ((double) longer.length()) + (m1.length() - transpositions)
+ / ((double) m1.length())) / 3.0;
+ return dist;
+ }
+
+ /**
+ * Calculates the number of transposition between two strings.
+ *
+ * @param first The first string.
+ * @param second The second string.
+ * @return The number of transposition between the two strings.
+ */
+ protected static int transpositions(final CharSequence first,
+ final CharSequence second) {
+ int transpositions = 0;
+ for (int i = 0; i < first.length(); i++) {
+ if (first.charAt(i) != second.charAt(i)) {
+ transpositions++;
+ }
+ }
+ return transpositions / 2;
+ }
+
+ /**
+ * <p>
+ * Compares all CharSequences in an array and returns the index at which the
+ * CharSequences begin to differ.
+ * </p>
+ *
+ * <p>
+ * For example,
+ * <code>indexOfDifference(new String[] {"i am a machine", "i am a robot"}) -> 7</code>
+ * </p>
+ *
+ * <pre>
+ * StringUtils.indexOfDifference(null) = -1
+ * StringUtils.indexOfDifference(new String[] {}) = -1
+ * StringUtils.indexOfDifference(new String[] {"abc"}) = -1
+ * StringUtils.indexOfDifference(new String[] {null, null}) = -1
+ * StringUtils.indexOfDifference(new String[] {"", ""}) = -1
+ * StringUtils.indexOfDifference(new String[] {"", null}) = 0
+ * StringUtils.indexOfDifference(new String[] {"abc", null, null}) = 0
+ * StringUtils.indexOfDifference(new String[] {null, null, "abc"}) = 0
+ * StringUtils.indexOfDifference(new String[] {"", "abc"}) = 0
+ * StringUtils.indexOfDifference(new String[] {"abc", ""}) = 0
+ * StringUtils.indexOfDifference(new String[] {"abc", "abc"}) = -1
+ * StringUtils.indexOfDifference(new String[] {"abc", "a"}) = 1
+ * StringUtils.indexOfDifference(new String[] {"ab", "abxyz"}) = 2
+ * StringUtils.indexOfDifference(new String[] {"abcde", "abxyz"}) = 2
+ * StringUtils.indexOfDifference(new String[] {"abcde", "xyz"}) = 0
+ * StringUtils.indexOfDifference(new String[] {"xyz", "abcde"}) = 0
+ * StringUtils.indexOfDifference(new String[] {"i am a machine", "i am a robot"}) = 7
+ * </pre>
+ *
+ * @param css array of CharSequences, entries may be null
+ * @return the index where the strings begin to differ; -1 if they are all
+ * equal
+ * @since 2.4
+ * @since 3.0 Changed signature from indexOfDifference(String...) to
+ * indexOfDifference(CharSequence...)
+ */
+ protected static int indexOfDifference(final CharSequence... css) {
+ if (css == null || css.length <= 1) {
+ return INDEX_NOT_FOUND;
+ }
+ boolean anyStringNull = false;
+ boolean allStringsNull = true;
+ final int arrayLen = css.length;
+ int shortestStrLen = Integer.MAX_VALUE;
+ int longestStrLen = 0;
+
+ // find the min and max string lengths; this avoids checking to make
+ // sure we are not exceeding the length of the string each time through
+ // the bottom loop.
+ for (int i = 0; i < arrayLen; i++) {
+ if (css[i] == null) {
+ anyStringNull = true;
+ shortestStrLen = 0;
+ } else {
+ allStringsNull = false;
+ shortestStrLen = Math.min(css[i].length(), shortestStrLen);
+ longestStrLen = Math.max(css[i].length(), longestStrLen);
+ }
+ }
+
+ // handle lists containing all nulls or all empty strings
+ if (allStringsNull || longestStrLen == 0 && !anyStringNull) {
+ return INDEX_NOT_FOUND;
+ }
+
+ // handle lists containing some nulls or some empty strings
+ if (shortestStrLen == 0) {
+ return 0;
+ }
+
+ // find the position with the first difference across all strings
+ int firstDiff = -1;
+ for (int stringPos = 0; stringPos < shortestStrLen; stringPos++) {
+ final char comparisonChar = css[0].charAt(stringPos);
+ for (int arrayPos = 1; arrayPos < arrayLen; arrayPos++) {
+ if (css[arrayPos].charAt(stringPos) != comparisonChar) {
+ firstDiff = stringPos;
+ break;
+ }
+ }
+ if (firstDiff != -1) {
+ break;
+ }
+ }
+
+ if (firstDiff == -1 && shortestStrLen != longestStrLen) {
+ // we compared all of the characters up to the length of the
+ // shortest string and didn't find a match, but the string lengths
+ // vary, so return the length of the shortest string.
+ return shortestStrLen;
+ }
+ return firstDiff;
+ }
+
+ /**
+ * Gets a set of matching characters between two strings.
+ *
+ * <p>
+ * Two characters from the first string and the second string are
+ * considered matching if the character's respective positions are no
+ * farther than the limit value.
+ * </p>
+ *
+ * @param first The first string.
+ * @param second The second string.
+ * @param limit The maximum distance to consider.
+ * @return A string contain the set of common characters.
+ */
+ protected static String getSetOfMatchingCharacterWithin(
+ final CharSequence first, final CharSequence second, final int limit) {
+ final StringBuilder common = new StringBuilder();
+ final StringBuilder copy = new StringBuilder(second);
+
+ for (int i = 0; i < first.length(); i++) {
+ final char ch = first.charAt(i);
+ boolean found = false;
+
+ // See if the character is within the limit positions away from the
+ // original position of that character.
+ for (int j = Math.max(0, i - limit); !found
+ && j < Math.min(i + limit, second.length()); j++) {
+ if (copy.charAt(j) == ch) {
+ found = true;
+ common.append(ch);
+ copy.setCharAt(j, '*');
+ }
+ }
+ }
+ return common.toString();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
new file mode 100644
index 0000000..1793f1e
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
@@ -0,0 +1,258 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.Arrays;
+
+/**
+ * <p>
+ * A string metric for measuring the difference between two sequences.
+ * </p>
+ *
+ * <p>
+ * This code has been adapted from Apache Commons Lang 3.3.
+ * </p>
+ *
+ * @since 1.0
+ */
+public class LevenshteinDistance implements StringMetric<Integer> {
+
+ /**
+ * <p>
+ * Find the Levenshtein distance between two Strings.
+ * </p>
+ *
+ * <p>
+ * This is the number of changes needed to change one String into another,
+ * where each change is a single character modification (deletion, insertion
+ * or substitution).
+ * </p>
+ *
+ * <p>
+ * The previous implementation of the Levenshtein distance algorithm was
+ * from <a
+ * href="http://www.merriampark.com/ld.htm">http://www.merriampark.com
+ * /ld.htm</a>
+ * </p>
+ *
+ * <p>
+ * Chas Emerick has written an implementation in Java, which avoids an
+ * OutOfMemoryError which can occur when my Java implementation is used with
+ * very large strings.<br>
+ * This implementation of the Levenshtein distance algorithm is from <a
+ * href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/
+ * ldjava.htm</a>
+ * </p>
+ *
+ * <pre>
+ * StringUtils.getLevenshteinDistance(null, *) = IllegalArgumentException
+ * StringUtils.getLevenshteinDistance(*, null) = IllegalArgumentException
+ * StringUtils.getLevenshteinDistance("","") = 0
+ * StringUtils.getLevenshteinDistance("","a") = 1
+ * StringUtils.getLevenshteinDistance("aaapppp", "") = 7
+ * StringUtils.getLevenshteinDistance("frog", "fog") = 1
+ * StringUtils.getLevenshteinDistance("fly", "ant") = 3
+ * StringUtils.getLevenshteinDistance("elephant", "hippo") = 7
+ * StringUtils.getLevenshteinDistance("hippo", "elephant") = 7
+ * StringUtils.getLevenshteinDistance("hippo", "zzzzzzzz") = 8
+ * StringUtils.getLevenshteinDistance("hello", "hallo") = 1
+ * </pre>
+ *
+ * @param left the first string, must not be null
+ * @param right the second string, must not be null
+ * @return result distance
+ * @throws IllegalArgumentException if either String input {@code null}
+ */
+ @Override
+ public Integer compare(CharSequence left, CharSequence right) {
+ return compare(left, right, Integer.MAX_VALUE);
+ }
+
+ /**
+ * <p>
+ * Find the Levenshtein distance between two Strings if it's less than or
+ * equal to a given threshold.
+ * </p>
+ *
+ * <p>
+ * This is the number of changes needed to change one String into another,
+ * where each change is a single character modification (deletion, insertion
+ * or substitution).
+ * </p>
+ *
+ * <p>
+ * This implementation follows from Algorithms on Strings, Trees and
+ * Sequences by Dan Gusfield and Chas Emerick's implementation of the
+ * Levenshtein distance algorithm from <a
+ * href="http://www.merriampark.com/ld.htm"
+ * >http://www.merriampark.com/ld.htm</a>
+ * </p>
+ *
+ * <pre>
+ * StringUtils.getLevenshteinDistance(null, *, *) = IllegalArgumentException
+ * StringUtils.getLevenshteinDistance(*, null, *) = IllegalArgumentException
+ * StringUtils.getLevenshteinDistance(*, *, -1) = IllegalArgumentException
+ * StringUtils.getLevenshteinDistance("","", 0) = 0
+ * StringUtils.getLevenshteinDistance("aaapppp", "", 8) = 7
+ * StringUtils.getLevenshteinDistance("aaapppp", "", 7) = 7
+ * StringUtils.getLevenshteinDistance("aaapppp", "", 6)) = -1
+ * StringUtils.getLevenshteinDistance("elephant", "hippo", 7) = 7
+ * StringUtils.getLevenshteinDistance("elephant", "hippo", 6) = -1
+ * StringUtils.getLevenshteinDistance("hippo", "elephant", 7) = 7
+ * StringUtils.getLevenshteinDistance("hippo", "elephant", 6) = -1
+ * </pre>
+ *
+ * @param left the first string, must not be null
+ * @param right the second string, must not be null
+ * @param threshold the target threshold, must not be negative
+ * @return result distance
+ * @throws IllegalArgumentException if either String input {@code null} or
+ * negative threshold
+ */
+ public Integer compare(CharSequence left, CharSequence right, int threshold) {
+ if (left == null || right == null) {
+ throw new IllegalArgumentException("Strings must not be null");
+ }
+ if (threshold < 0) {
+ throw new IllegalArgumentException("Threshold must not be negative");
+ }
+
+ /*
+ * This implementation only computes the distance if it's less than or
+ * equal to the threshold value, returning -1 if it's greater. The
+ * advantage is performance: unbounded distance is O(nm), but a bound of
+ * k allows us to reduce it to O(km) time by only computing a diagonal
+ * stripe of width 2k + 1 of the cost table. It is also possible to use
+ * this to compute the unbounded Levenshtein distance by starting the
+ * threshold at 1 and doubling each time until the distance is found;
+ * this is O(dm), where d is the distance.
+ *
+ * One subtlety comes from needing to ignore entries on the border of
+ * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry
+ * to the left of the leftmost member We must ignore the entry above the
+ * rightmost member
+ *
+ * Another subtlety comes from our stripe running off the matrix if the
+ * strings aren't of the same size. Since string s is always swapped to
+ * be the shorter of the two, the stripe will always run off to the
+ * upper right instead of the lower left of the matrix.
+ *
+ * As a concrete example, suppose s is of length 5, t is of length 7,
+ * and our threshold is 1. In this case we're going to walk a stripe of
+ * length 3. The matrix would look like so:
+ *
+ * 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | | |#|#|#| 5 |
+ * | | |#|#| 6 | | | | |#| 7 | | | | | |
+ *
+ * Note how the stripe leads off the table as there is no possible way
+ * to turn a string of length 5 into one of length 7 in edit distance of
+ * 1.
+ *
+ * Additionally, this implementation decreases memory usage by using two
+ * single-dimensional arrays and swapping them back and forth instead of
+ * allocating an entire n by m matrix. This requires a few minor
+ * changes, such as immediately returning when it's detected that the
+ * stripe has run off the matrix and initially filling the arrays with
+ * large values so that entries we don't compute are ignored.
+ *
+ * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for
+ * some discussion.
+ */
+
+ int n = left.length(); // length of s
+ int m = right.length(); // length of t
+
+ // if one string is empty, the edit distance is necessarily the length
+ // of the other
+ if (n == 0) {
+ return m <= threshold ? m : -1;
+ } else if (m == 0) {
+ return n <= threshold ? n : -1;
+ }
+
+ if (n > m) {
+ // swap the two strings to consume less memory
+ final CharSequence tmp = left;
+ left = right;
+ right = tmp;
+ n = m;
+ m = right.length();
+ }
+
+ int[] p = new int[n + 1]; // 'previous' cost array, horizontally
+ int[] d = new int[n + 1]; // cost array, horizontally
+ int[] _d; // placeholder to assist in swapping p and d
+
+ // fill in starting table values
+ final int boundary = Math.min(n, threshold) + 1;
+ for (int i = 0; i < boundary; i++) {
+ p[i] = i;
+ }
+ // these fills ensure that the value above the rightmost entry of our
+ // stripe will be ignored in following loop iterations
+ Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE);
+ Arrays.fill(d, Integer.MAX_VALUE);
+
+ // iterates through t
+ for (int j = 1; j <= m; j++) {
+ final char t_j = right.charAt(j - 1); // jth character of t
+ d[0] = j;
+
+ // compute stripe indices, constrain to array size
+ final int min = Math.max(1, j - threshold);
+ final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min(
+ n, j + threshold);
+
+ // the stripe may lead off of the table if s and t are of different
+ // sizes
+ if (min > max) {
+ return -1;
+ }
+
+ // ignore entry left of leftmost
+ if (min > 1) {
+ d[min - 1] = Integer.MAX_VALUE;
+ }
+
+ // iterates through [min, max] in s
+ for (int i = min; i <= max; i++) {
+ if (left.charAt(i - 1) == t_j) {
+ // diagonally left and up
+ d[i] = p[i - 1];
+ } else {
+ // 1 + minimum of cell to the left, to the top, diagonally
+ // left and up
+ d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]);
+ }
+ }
+
+ // copy current distance counts to 'previous row' distance counts
+ _d = p;
+ p = d;
+ d = _d;
+ }
+
+ // if p[n] is greater than the threshold, there's no guarantee on it
+ // being the correct
+ // distance
+ if (p[n] <= threshold) {
+ return p[n];
+ }
+ return -1;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/StringMetric.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/StringMetric.java b/src/main/java/org/apache/commons/text/similarity/StringMetric.java
new file mode 100644
index 0000000..6765bbd
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/StringMetric.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+/**
+ * <p>
+ * Marker interface for <a
+ * href='http://en.wikipedia.org/wiki/String_metric'>String Metrics</a>
+ * </p>
+ *
+ * <p>
+ * A string metric measures the distance between two text strings.
+ * </p>
+ *
+ * @param <R> return type of the edit distance
+ * @since 1.0
+ */
+public interface StringMetric<R> {
+
+ /**
+ * <p>
+ * Compares two strings.
+ * </p>
+ *
+ * @param left the first string
+ * @param right the second string
+ * @return result distance
+ */
+ R compare(CharSequence left, CharSequence right);
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/package-info.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/package-info.java b/src/main/java/org/apache/commons/text/similarity/package-info.java
new file mode 100644
index 0000000..8e9d478
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * <p>Provides algorithms for string similarity.</p>
+ *
+ * @since 1.0
+ */
+package org.apache.commons.text.similarity;
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java b/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java
new file mode 100644
index 0000000..30b1dfc
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Locale;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link org.apache.commons.text.FuzzyDistance}.
+ */
+public class TestFuzzyDistance {
+
+ private static FuzzyDistance distance;
+
+ @BeforeClass
+ public static void setUp() {
+ distance = new FuzzyDistance();
+ }
+
+ @Test
+ public void testGetFuzzyDistance() throws Exception {
+ assertEquals(0, (int) distance.compare("", "", Locale.ENGLISH));
+ assertEquals(0,
+ (int) distance.compare("Workshop", "b", Locale.ENGLISH));
+ assertEquals(1,
+ (int) distance.compare("Room", "o", Locale.ENGLISH));
+ assertEquals(1,
+ (int) distance.compare("Workshop", "w", Locale.ENGLISH));
+ assertEquals(2,
+ (int) distance.compare("Workshop", "ws", Locale.ENGLISH));
+ assertEquals(4,
+ (int) distance.compare("Workshop", "wo", Locale.ENGLISH));
+ assertEquals(3, (int) distance.compare(
+ "Apache Software Foundation", "asf", Locale.ENGLISH));
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetFuzzyDistance_NullNullNull() throws Exception {
+ distance.compare(null, null, null);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetFuzzyDistance_StringNullLoclae() throws Exception {
+ distance.compare(" ", null, Locale.ENGLISH);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetFuzzyDistance_NullStringLocale() throws Exception {
+ distance.compare(null, "clear", Locale.ENGLISH);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetFuzzyDistance_StringStringNull() throws Exception {
+ distance.compare(" ", "clear", null);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java b/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java
new file mode 100644
index 0000000..6477de0
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link org.apache.commons.text.JaroWrinklerDistance}.
+ */
+public class TestJaroWrinklerDistance {
+
+ private static JaroWrinklerDistance distance;
+
+ @BeforeClass
+ public static void setUp() {
+ distance = new JaroWrinklerDistance();
+ }
+
+ @Test
+ public void testGetJaroWinklerDistance_StringString() {
+ assertEquals(0.93d, (double) distance.compare("frog", "fog"), 0.0d);
+ assertEquals(0.0d, (double) distance.compare("fly", "ant"), 0.0d);
+ assertEquals(0.44d, (double) distance.compare("elephant", "hippo"), 0.0d);
+ assertEquals(0.91d, (double) distance.compare("ABC Corporation", "ABC Corp"), 0.0d);
+ assertEquals(0.93d, (double) distance.compare("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d);
+ assertEquals(0.94d, (double) distance.compare("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d);
+ assertEquals(0.9d, (double) distance.compare("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetJaroWinklerDistance_NullNull() throws Exception {
+ distance.compare(null, null);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetJaroWinklerDistance_StringNull() throws Exception {
+ distance.compare(" ", null);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetJaroWinklerDistance_NullString() throws Exception {
+ distance.compare(null, "clear");
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java b/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java
new file mode 100644
index 0000000..7790b05
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link org.apache.commons.text.LevenshteinDistance}.
+ */
+public class TestLevenshteinDistance {
+
+ private static LevenshteinDistance distance;
+
+ @BeforeClass
+ public static void setUp() {
+ distance = new LevenshteinDistance();
+ }
+
+ @Test
+ public void testGetLevenshteinDistance_StringString() {
+ assertEquals(0, (int) distance.compare("", ""));
+ assertEquals(1, (int) distance.compare("", "a"));
+ assertEquals(7, (int) distance.compare("aaapppp", ""));
+ assertEquals(1, (int) distance.compare("frog", "fog"));
+ assertEquals(3, (int) distance.compare("fly", "ant"));
+ assertEquals(7, (int) distance.compare("elephant", "hippo"));
+ assertEquals(7, (int) distance.compare("hippo", "elephant"));
+ assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz"));
+ assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo"));
+ assertEquals(1, (int) distance.compare("hello", "hallo"));
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetLevenshteinDistance_NullString() throws Exception {
+ distance.compare("a", null);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetLevenshteinDistance_StringNull() throws Exception {
+ distance.compare(null, "a");
+ }
+
+ @Test
+ public void testGetLevenshteinDistance_StringStringInt() {
+ // empty strings
+ assertEquals(0, (int) distance.compare("", "", 0));
+ assertEquals(7, (int) distance.compare("aaapppp", "", 8));
+ assertEquals(7, (int) distance.compare("aaapppp", "", 7));
+ assertEquals(-1, (int) distance.compare("aaapppp", "", 6));
+
+ // unequal strings, zero threshold
+ assertEquals(-1, (int) distance.compare("b", "a", 0));
+ assertEquals(-1, (int) distance.compare("a", "b", 0));
+
+ // equal strings
+ assertEquals(0, (int) distance.compare("aa", "aa", 0));
+ assertEquals(0, (int) distance.compare("aa", "aa", 2));
+
+ // same length
+ assertEquals(-1, (int) distance.compare("aaa", "bbb", 2));
+ assertEquals(3, (int) distance.compare("aaa", "bbb", 3));
+
+ // big stripe
+ assertEquals(6, (int) distance.compare("aaaaaa", "b", 10));
+
+ // distance less than threshold
+ assertEquals(7, (int) distance.compare("aaapppp", "b", 8));
+ assertEquals(3, (int) distance.compare("a", "bbb", 4));
+
+ // distance equal to threshold
+ assertEquals(7, (int) distance.compare("aaapppp", "b", 7));
+ assertEquals(3, (int) distance.compare("a", "bbb", 3));
+
+ // distance greater than threshold
+ assertEquals(-1, (int) distance.compare("a", "bbb", 2));
+ assertEquals(-1, (int) distance.compare("bbb", "a", 2));
+ assertEquals(-1, (int) distance.compare("aaapppp", "b", 6));
+
+ // stripe runs off array, strings not similar
+ assertEquals(-1, (int) distance.compare("a", "bbb", 1));
+ assertEquals(-1, (int) distance.compare("bbb", "a", 1));
+
+ // stripe runs off array, strings are similar
+ assertEquals(-1, (int) distance.compare("12345", "1234567", 1));
+ assertEquals(-1, (int) distance.compare("1234567", "12345", 1));
+
+ // old getLevenshteinDistance test cases
+ assertEquals(1, (int) distance.compare("frog", "fog", 1));
+ assertEquals(3, (int) distance.compare("fly", "ant", 3));
+ assertEquals(7, (int) distance.compare("elephant", "hippo", 7));
+ assertEquals(-1, (int) distance.compare("elephant", "hippo", 6));
+ assertEquals(7, (int) distance.compare("hippo", "elephant", 7));
+ assertEquals(-1, (int) distance.compare("hippo", "elephant", 6));
+ assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz", 8));
+ assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo", 8));
+ assertEquals(1, (int) distance.compare("hello", "hallo", 1));
+
+ assertEquals(1,
+ (int) distance.compare("frog", "fog", Integer.MAX_VALUE));
+ assertEquals(3, (int) distance.compare("fly", "ant", Integer.MAX_VALUE));
+ assertEquals(7,
+ (int) distance.compare("elephant", "hippo", Integer.MAX_VALUE));
+ assertEquals(7,
+ (int) distance.compare("hippo", "elephant", Integer.MAX_VALUE));
+ assertEquals(8,
+ (int) distance.compare("hippo", "zzzzzzzz", Integer.MAX_VALUE));
+ assertEquals(8,
+ (int) distance.compare("zzzzzzzz", "hippo", Integer.MAX_VALUE));
+ assertEquals(1,
+ (int) distance.compare("hello", "hallo", Integer.MAX_VALUE));
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetLevenshteinDistance_NullStringInt() throws Exception {
+ distance.compare(null, "a", 0);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetLevenshteinDistance_StringNullInt() throws Exception {
+ distance.compare("a", null, 0);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetLevenshteinDistance_StringStringNegativeInt()
+ throws Exception {
+ distance.compare("a", "a", -1);
+ }
+
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
For additional commands, e-mail: dev-help@commons.apache.org
Re: commons-text git commit: SANDBOX-483 Incorporate String
algorithms from Commons Lang
Posted by Benedikt Ritter <br...@apache.org>.
Works now! Thanks!
2014-11-27 2:56 GMT+01:00 Bruno P. Kinoshita <br...@yahoo.com.br>:
> Hi Benedikt!
> I had a look at some checkstyle issues, and one of them warned me about
> unnecessary parenthesis. Guess I removed one too many parenthesis.
> Can you update your repo and give it a try again, please? Thanks!Bruno
>
> From: Benedikt Ritter <br...@apache.org>
> To: Commons Developers List <de...@commons.apache.org>
> Sent: Wednesday, November 26, 2014 6:00 AM
> Subject: Re: commons-text git commit: SANDBOX-483 Incorporate String
> algorithms from Commons Lang
>
> Hello Bruno,
>
> when building commons-text I get:
>
>
> testGetJaroWinklerDistance_StringString(org.apache.commons.text.similarity.TestJaroWrinklerDistance)
> Time elapsed: 0.015 sec <<< FAILURE!
> java.lang.AssertionError: expected:<0.93> but was:<0.02>
> at org.junit.Assert.fail(Assert.java:88)
> at org.junit.Assert.failNotEquals(Assert.java:743)
> at org.junit.Assert.assertEquals(Assert.java:494)
> at org.junit.Assert.assertEquals(Assert.java:592)
> at
>
> org.apache.commons.text.similarity.TestJaroWrinklerDistance.testGetJaroWinklerDistance_StringString(TestJaroWrinklerDistance.java:38)
>
> Running org.apache.commons.text.similarity.TestLevenshteinDistance
> Tests run: 7, Failures: 0, Errors: 0, Skipped: 0, Time elapsed: 0.006 sec -
> in org.apache.commons.text.similarity.TestLevenshteinDistance
>
> Results :
>
> Failed tests:
> TestJaroWrinklerDistance.testGetJaroWinklerDistance_StringString:38
> expected:<0.93> but was:<0.02>
>
> Tests run: 16, Failures: 1, Errors: 0, Skipped: 0
>
> can you check?
>
> thanks!
> Benedikt
>
> 2014-11-20 5:54 GMT+01:00 <ki...@apache.org>:
>
> > Repository: commons-text
> > Updated Branches:
> > refs/heads/master 71b59af90 -> c0e8c0210
> >
> >
> > SANDBOX-483 Incorporate String algorithms from Commons Lang
> >
> >
> > Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
> > Commit:
> > http://git-wip-us.apache.org/repos/asf/commons-text/commit/c0e8c021
> > Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/c0e8c021
> > Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/c0e8c021
> >
> > Branch: refs/heads/master
> > Commit: c0e8c021059d8300f90cf2123528a35d7c468b57
> > Parents: 71b59af
> > Author: Bruno P. Kinoshita <br...@yahoo.com.br>
> > Authored: Thu Nov 20 02:23:05 2014 -0200
> > Committer: Bruno P. Kinoshita <br...@yahoo.com.br>
> > Committed: Thu Nov 20 02:23:05 2014 -0200
> >
> > ----------------------------------------------------------------------
> > .../commons/text/similarity/FuzzyDistance.java | 130 +++++++
> > .../text/similarity/JaroWrinklerDistance.java | 373 +++++++++++++++++++
> > .../text/similarity/LevenshteinDistance.java | 258 +++++++++++++
> > .../commons/text/similarity/StringMetric.java | 45 +++
> > .../commons/text/similarity/package-info.java | 22 ++
> > .../text/similarity/TestFuzzyDistance.java | 75 ++++
> > .../similarity/TestJaroWrinklerDistance.java | 62 +++
> > .../similarity/TestLevenshteinDistance.java | 146 ++++++++
> > 8 files changed, 1111 insertions(+)
> > ----------------------------------------------------------------------
> >
> >
> >
> >
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java
> > ----------------------------------------------------------------------
> > diff --git
> > a/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java
> > b/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java
> > new file mode 100644
> > index 0000000..8e9228a
> > --- /dev/null
> > +++ b/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java
> > @@ -0,0 +1,130 @@
> > +/*
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements. See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version
> 2.0
> > + * (the "License"); you may not use this file except in compliance with
> > + * the License. You may obtain a copy of the License at
> > + *
> > + * http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +package org.apache.commons.text.similarity;
> > +
> > +import java.util.Locale;
> > +
> > +/**
> > + * <p>
> > + * This string matching algorithm is similar to the algorithms of
> editors
> > such
> > + * as Sublime Text, TextMate, Atom and others. One point is given for
> > every
> > + * matched character. Subsequent matches yield two bonus points. A
> higher
> > score
> > + * indicates a higher similarity.
> > + * </p>
> > + *
> > + * @since 1.0
> > + */
> > +public class FuzzyDistance implements StringMetric<Integer> {
> > +
> > + /**
> > + * <p>
> > + * Find the Fuzzy Distance which indicates the similarity score
> > between two
> > + * Strings. This method uses the default locale.
> > + * </p>
> > + *
> > + * @param term a full term that should be matched against, must not
> > be null
> > + * @param query the query that will be matched against a term, must
> > not be
> > + * null
> > + * @return result score
> > + * @throws IllegalArgumentException if either String input {@code
> > null}
> > + */
> > + @Override
> > + public Integer compare(CharSequence term, CharSequence query) {
> > + return compare(term, query, Locale.getDefault());
> > + }
> > +
> > + /**
> > + * <p>
> > + * Find the Fuzzy Distance which indicates the similarity score
> > between two
> > + * Strings.
> > + * </p>
> > + *
> > + * <pre>
> > + * StringUtils.getFuzzyDistance(null, null, null)
> > = IllegalArgumentException
> > + * StringUtils.getFuzzyDistance("", "", Locale.ENGLISH)
> > = 0
> > + * StringUtils.getFuzzyDistance("Workshop", "b", Locale.ENGLISH)
> > = 0
> > + * StringUtils.getFuzzyDistance("Room", "o", Locale.ENGLISH)
> > = 1
> > + * StringUtils.getFuzzyDistance("Workshop", "w", Locale.ENGLISH)
> > = 1
> > + * StringUtils.getFuzzyDistance("Workshop", "ws", Locale.ENGLISH)
> > = 2
> > + * StringUtils.getFuzzyDistance("Workshop", "wo", Locale.ENGLISH)
> > = 4
> > + * StringUtils.getFuzzyDistance("Apache Software Foundation", "asf",
> > Locale.ENGLISH) = 3
> > + * </pre>
> > + *
> > + * @param term a full term that should be matched against, must not
> > be null
> > + * @param query the query that will be matched against a term, must
> > not be
> > + * null
> > + * @param locale This string matching logic is case insensitive. A
> > locale is
> > + * necessary to normalize both Strings to lower case.
> > + * @return result score
> > + * @throws IllegalArgumentException if either String input {@code
> > null} or
> > + * Locale input {@code null}
> > + */
> > + public Integer compare(CharSequence term, CharSequence query, Locale
> > locale) {
> > + if (term == null || query == null) {
> > + throw new IllegalArgumentException("Strings must not be
> > null");
> > + } else if (locale == null) {
> > + throw new IllegalArgumentException("Locale must not be
> null");
> > + }
> > +
> > + // fuzzy logic is case insensitive. We normalize the Strings to
> > lower
> > + // case right from the start. Turning characters to lower case
> > + // via Character.toLowerCase(char) is unfortunately insufficient
> > + // as it does not accept a locale.
> > + final String termLowerCase =
> term.toString().toLowerCase(locale);
> > + final String queryLowerCase =
> > query.toString().toLowerCase(locale);
> > +
> > + // the resulting score
> > + int score = 0;
> > +
> > + // the position in the term which will be scanned next for
> > potential
> > + // query character matches
> > + int termIndex = 0;
> > +
> > + // index of the previously matched character in the term
> > + int previousMatchingCharacterIndex = Integer.MIN_VALUE;
> > +
> > + for (int queryIndex = 0; queryIndex < queryLowerCase.length();
> > queryIndex++) {
> > + final char queryChar = queryLowerCase.charAt(queryIndex);
> > +
> > + boolean termCharacterMatchFound = false;
> > + for (; termIndex < termLowerCase.length()
> > + && !termCharacterMatchFound; termIndex++) {
> > + final char termChar = termLowerCase.charAt(termIndex);
> > +
> > + if (queryChar == termChar) {
> > + // simple character matches result in one point
> > + score++;
> > +
> > + // subsequent character matches further improve
> > + // the score.
> > + if (previousMatchingCharacterIndex + 1 ==
> termIndex) {
> > + score += 2;
> > + }
> > +
> > + previousMatchingCharacterIndex = termIndex;
> > +
> > + // we can leave the nested loop. Every character in
> > the
> > + // query can match at most one character in the
> term.
> > + termCharacterMatchFound = true;
> > + }
> > + }
> > + }
> > +
> > + return score;
> > + }
> > +
> > +}
> >
> >
> >
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
> > ----------------------------------------------------------------------
> > diff --git
> >
> a/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
> >
> b/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
> > new file mode 100644
> > index 0000000..2dcd51c
> > --- /dev/null
> > +++
> >
> b/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
> > @@ -0,0 +1,373 @@
> > +/*
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements. See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version
> 2.0
> > + * (the "License"); you may not use this file except in compliance with
> > + * the License. You may obtain a copy of the License at
> > + *
> > + * http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +package org.apache.commons.text.similarity;
> > +
> > +/**
> > + * <p>
> > + * The Jaro measure is the weighted sum of percentage of matched
> > characters
> > + * from each file and transposed characters. Winkler increased this
> > measure
> > + * for matching initial characters.
> > + * </p>
> > + *
> > + * <p>
> > + * This implementation is based on the Jaro Winkler similarity algorithm
> > + * from <a href="
> > http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
> > + * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
> > + * </p>
> > + *
> > + * <p>
> > + * This code has been adapted from Apache Commons Lang 3.3.
> > + * </p>
> > + *
> > + * @since 1.0
> > + */
> > +public class JaroWrinklerDistance implements StringMetric<Double> {
> > +
> > + /**
> > + * Represents a failed index search.
> > + */
> > + public static final int INDEX_NOT_FOUND = -1;
> > +
> > + /**
> > + * <p>
> > + * Find the Jaro Winkler Distance which indicates the similarity
> score
> > + * between two Strings.
> > + * </p>
> > + *
> > + * <pre>
> > + * StringUtils.getJaroWinklerDistance(null, null) =
> > IllegalArgumentException
> > + * StringUtils.getJaroWinklerDistance("","") = 0.0
> > + * StringUtils.getJaroWinklerDistance("","a") = 0.0
> > + * StringUtils.getJaroWinklerDistance("aaapppp", "") = 0.0
> > + * StringUtils.getJaroWinklerDistance("frog", "fog") = 0.93
> > + * StringUtils.getJaroWinklerDistance("fly", "ant") = 0.0
> > + * StringUtils.getJaroWinklerDistance("elephant", "hippo") = 0.44
> > + * StringUtils.getJaroWinklerDistance("hippo", "elephant") = 0.44
> > + * StringUtils.getJaroWinklerDistance("hippo", "zzzzzzzz") = 0.0
> > + * StringUtils.getJaroWinklerDistance("hello", "hallo") = 0.88
> > + * StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp")
> > = 0.91
> > + * StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D
> > & H Enterprises, Inc.") = 0.93
> > + * StringUtils.getJaroWinklerDistance("My Gym Children's Fitness
> > Center", "My Gym. Childrens Fitness") = 0.94
> > + * StringUtils.getJaroWinklerDistance("PENNSYLVANIA",
> > "PENNCISYLVNIA") = 0.9
> > + * </pre>
> > + *
> > + * @param left the first String, must not be null
> > + * @param right the second String, must not be null
> > + * @return result distance
> > + * @throws IllegalArgumentException if either String input {@code
> > null}
> > + */
> > + @Override
> > + public Double compare(CharSequence left, CharSequence right) {
> > + final double DEFAULT_SCALING_FACTOR = 0.1;
> > +
> > + if (left == null || right == null) {
> > + throw new IllegalArgumentException("Strings must not be
> > null");
> > + }
> > +
> > + final double jaro = score(left, right);
> > + final int cl = commonPrefixLength(left, right);
> > + final double matchScore = Math.round(jaro +
> > (DEFAULT_SCALING_FACTOR
> > + * cl * (1.0 - jaro)) * 100.0) / 100.0;
> > +
> > + return matchScore;
> > + }
> > +
> > + // TODO: we can move these methods to a Util class, keep them here,
> > + // create a common abstract class, shade lang-3.3...
> > +
> > + /**
> > + * Calculates the number of characters from the beginning of the
> > strings
> > + * that match exactly one-to-one, up to a maximum of four (4)
> > characters.
> > + *
> > + * @param first The first string.
> > + * @param second The second string.
> > + * @return A number between 0 and 4.
> > + */
> > + private static int commonPrefixLength(final CharSequence first,
> > + final CharSequence second) {
> > + final int result = getCommonPrefix(first.toString(),
> > second.toString())
> > + .length();
> > +
> > + // Limit the result to 4.
> > + return result > 4 ? 4 : result;
> > + }
> > +
> > + /**
> > + * <p>
> > + * Compares all Strings in an array and returns the initial sequence
> > of
> > + * characters that is common to all of them.
> > + * </p>
> > + *
> > + * <p>
> > + * For example,
> > + * <code>getCommonPrefix(new String[] {"i am a machine", "i am a
> > robot"}) -> "i am a "</code>
> > + * </p>
> > + *
> > + * <pre>
> > + * StringUtils.getCommonPrefix(null) = ""
> > + * StringUtils.getCommonPrefix(new String[] {}) = ""
> > + * StringUtils.getCommonPrefix(new String[] {"abc"}) = "abc"
> > + * StringUtils.getCommonPrefix(new String[] {null, null}) = ""
> > + * StringUtils.getCommonPrefix(new String[] {"", ""}) = ""
> > + * StringUtils.getCommonPrefix(new String[] {"", null}) = ""
> > + * StringUtils.getCommonPrefix(new String[] {"abc", null, null}) = ""
> > + * StringUtils.getCommonPrefix(new String[] {null, null, "abc"}) = ""
> > + * StringUtils.getCommonPrefix(new String[] {"", "abc"}) = ""
> > + * StringUtils.getCommonPrefix(new String[] {"abc", ""}) = ""
> > + * StringUtils.getCommonPrefix(new String[] {"abc", "abc"}) = "abc"
> > + * StringUtils.getCommonPrefix(new String[] {"abc", "a"}) = "a"
> > + * StringUtils.getCommonPrefix(new String[] {"ab", "abxyz"}) = "ab"
> > + * StringUtils.getCommonPrefix(new String[] {"abcde", "abxyz"}) =
> "ab"
> > + * StringUtils.getCommonPrefix(new String[] {"abcde", "xyz"}) = ""
> > + * StringUtils.getCommonPrefix(new String[] {"xyz", "abcde"}) = ""
> > + * StringUtils.getCommonPrefix(new String[] {"i am a machine", "i am
> > a robot"}) = "i am a "
> > + * </pre>
> > + *
> > + * @param strs array of String objects, entries may be null
> > + * @return the initial sequence of characters that are common to all
> > Strings
> > + * in the array; empty String if the array is null, the
> > elements are
> > + * all null or if there is no common prefix.
> > + * @since 2.4
> > + */
> > + public static String getCommonPrefix(final String... strs) {
> > + if (strs == null || strs.length == 0) {
> > + return "";
> > + }
> > + final int smallestIndexOfDiff = indexOfDifference(strs);
> > + if (smallestIndexOfDiff == INDEX_NOT_FOUND) {
> > + // all strings were identical
> > + if (strs[0] == null) {
> > + return "";
> > + }
> > + return strs[0];
> > + } else if (smallestIndexOfDiff == 0) {
> > + // there were no common initial characters
> > + return "";
> > + } else {
> > + // we found a common initial character sequence
> > + return strs[0].substring(0, smallestIndexOfDiff);
> > + }
> > + }
> > +
> > + /**
> > + * This method returns the Jaro-Winkler score for string matching.
> > + *
> > + * @param first the first string to be matched
> > + * @param second the second string to be machted
> > + * @return matching score without scaling factor impact
> > + */
> > + protected static double score(final CharSequence first,
> > + final CharSequence second) {
> > + String shorter;
> > + String longer;
> > +
> > + // Determine which String is longer.
> > + if (first.length() > second.length()) {
> > + longer = first.toString().toLowerCase();
> > + shorter = second.toString().toLowerCase();
> > + } else {
> > + longer = second.toString().toLowerCase();
> > + shorter = first.toString().toLowerCase();
> > + }
> > +
> > + // Calculate the half length() distance of the shorter String.
> > + final int halflength = shorter.length() / 2 + 1;
> > +
> > + // Find the set of matching characters between the shorter and
> > longer
> > + // strings. Note that
> > + // the set of matching characters may be different depending on
> > the
> > + // order of the strings.
> > + final String m1 = getSetOfMatchingCharacterWithin(shorter,
> longer,
> > + halflength);
> > + final String m2 = getSetOfMatchingCharacterWithin(longer,
> shorter,
> > + halflength);
> > +
> > + // If one or both of the sets of common characters is empty,
> then
> > + // there is no similarity between the two strings.
> > + if (m1.length() == 0 || m2.length() == 0) {
> > + return 0.0;
> > + }
> > +
> > + // If the set of common characters is not the same size, then
> > + // there is no similarity between the two strings, either.
> > + if (m1.length() != m2.length()) {
> > + return 0.0;
> > + }
> > +
> > + // Calculate the number of transposition between the two sets
> > + // of common characters.
> > + final int transpositions = transpositions(m1, m2);
> > +
> > + // Calculate the distance.
> > + final double dist = (m1.length() / ((double) shorter.length())
> > + + m2.length() / ((double) longer.length()) +
> (m1.length()
> > - transpositions)
> > + / ((double) m1.length())) / 3.0;
> > + return dist;
> > + }
> > +
> > + /**
> > + * Calculates the number of transposition between two strings.
> > + *
> > + * @param first The first string.
> > + * @param second The second string.
> > + * @return The number of transposition between the two strings.
> > + */
> > + protected static int transpositions(final CharSequence first,
> > + final CharSequence second) {
> > + int transpositions = 0;
> > + for (int i = 0; i < first.length(); i++) {
> > + if (first.charAt(i) != second.charAt(i)) {
> > + transpositions++;
> > + }
> > + }
> > + return transpositions / 2;
> > + }
> > +
> > + /**
> > + * <p>
> > + * Compares all CharSequences in an array and returns the index at
> > which the
> > + * CharSequences begin to differ.
> > + * </p>
> > + *
> > + * <p>
> > + * For example,
> > + * <code>indexOfDifference(new String[] {"i am a machine", "i am a
> > robot"}) -> 7</code>
> > + * </p>
> > + *
> > + * <pre>
> > + * StringUtils.indexOfDifference(null) = -1
> > + * StringUtils.indexOfDifference(new String[] {}) = -1
> > + * StringUtils.indexOfDifference(new String[] {"abc"}) = -1
> > + * StringUtils.indexOfDifference(new String[] {null, null}) = -1
> > + * StringUtils.indexOfDifference(new String[] {"", ""}) = -1
> > + * StringUtils.indexOfDifference(new String[] {"", null}) = 0
> > + * StringUtils.indexOfDifference(new String[] {"abc", null, null}) =
> 0
> > + * StringUtils.indexOfDifference(new String[] {null, null, "abc"}) =
> 0
> > + * StringUtils.indexOfDifference(new String[] {"", "abc"}) = 0
> > + * StringUtils.indexOfDifference(new String[] {"abc", ""}) = 0
> > + * StringUtils.indexOfDifference(new String[] {"abc", "abc"}) = -1
> > + * StringUtils.indexOfDifference(new String[] {"abc", "a"}) = 1
> > + * StringUtils.indexOfDifference(new String[] {"ab", "abxyz"}) = 2
> > + * StringUtils.indexOfDifference(new String[] {"abcde", "abxyz"}) = 2
> > + * StringUtils.indexOfDifference(new String[] {"abcde", "xyz"}) = 0
> > + * StringUtils.indexOfDifference(new String[] {"xyz", "abcde"}) = 0
> > + * StringUtils.indexOfDifference(new String[] {"i am a machine", "i
> > am a robot"}) = 7
> > + * </pre>
> > + *
> > + * @param css array of CharSequences, entries may be null
> > + * @return the index where the strings begin to differ; -1 if they
> > are all
> > + * equal
> > + * @since 2.4
> > + * @since 3.0 Changed signature from indexOfDifference(String...) to
> > + * indexOfDifference(CharSequence...)
> > + */
> > + protected static int indexOfDifference(final CharSequence... css) {
> > + if (css == null || css.length <= 1) {
> > + return INDEX_NOT_FOUND;
> > + }
> > + boolean anyStringNull = false;
> > + boolean allStringsNull = true;
> > + final int arrayLen = css.length;
> > + int shortestStrLen = Integer.MAX_VALUE;
> > + int longestStrLen = 0;
> > +
> > + // find the min and max string lengths; this avoids checking to
> > make
> > + // sure we are not exceeding the length of the string each time
> > through
> > + // the bottom loop.
> > + for (int i = 0; i < arrayLen; i++) {
> > + if (css[i] == null) {
> > + anyStringNull = true;
> > + shortestStrLen = 0;
> > + } else {
> > + allStringsNull = false;
> > + shortestStrLen = Math.min(css[i].length(),
> > shortestStrLen);
> > + longestStrLen = Math.max(css[i].length(),
> longestStrLen);
> > + }
> > + }
> > +
> > + // handle lists containing all nulls or all empty strings
> > + if (allStringsNull || longestStrLen == 0 && !anyStringNull) {
> > + return INDEX_NOT_FOUND;
> > + }
> > +
> > + // handle lists containing some nulls or some empty strings
> > + if (shortestStrLen == 0) {
> > + return 0;
> > + }
> > +
> > + // find the position with the first difference across all
> strings
> > + int firstDiff = -1;
> > + for (int stringPos = 0; stringPos < shortestStrLen;
> stringPos++) {
> > + final char comparisonChar = css[0].charAt(stringPos);
> > + for (int arrayPos = 1; arrayPos < arrayLen; arrayPos++) {
> > + if (css[arrayPos].charAt(stringPos) != comparisonChar) {
> > + firstDiff = stringPos;
> > + break;
> > + }
> > + }
> > + if (firstDiff != -1) {
> > + break;
> > + }
> > + }
> > +
> > + if (firstDiff == -1 && shortestStrLen != longestStrLen) {
> > + // we compared all of the characters up to the length of the
> > + // shortest string and didn't find a match, but the string
> > lengths
> > + // vary, so return the length of the shortest string.
> > + return shortestStrLen;
> > + }
> > + return firstDiff;
> > + }
> > +
> > + /**
> > + * Gets a set of matching characters between two strings.
> > + *
> > + * <p>
> > + * Two characters from the first string and the second string are
> > + * considered matching if the character's respective positions are no
> > + * farther than the limit value.
> > + * </p>
> > + *
> > + * @param first The first string.
> > + * @param second The second string.
> > + * @param limit The maximum distance to consider.
> > + * @return A string contain the set of common characters.
> > + */
> > + protected static String getSetOfMatchingCharacterWithin(
> > + final CharSequence first, final CharSequence second, final
> > int limit) {
> > + final StringBuilder common = new StringBuilder();
> > + final StringBuilder copy = new StringBuilder(second);
> > +
> > + for (int i = 0; i < first.length(); i++) {
> > + final char ch = first.charAt(i);
> > + boolean found = false;
> > +
> > + // See if the character is within the limit positions away
> > from the
> > + // original position of that character.
> > + for (int j = Math.max(0, i - limit); !found
> > + && j < Math.min(i + limit, second.length()); j++) {
> > + if (copy.charAt(j) == ch) {
> > + found = true;
> > + common.append(ch);
> > + copy.setCharAt(j, '*');
> > + }
> > + }
> > + }
> > + return common.toString();
> > + }
> > +
> > +}
> >
> >
> >
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
> > ----------------------------------------------------------------------
> > diff --git
> >
> a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
> >
> b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
> > new file mode 100644
> > index 0000000..1793f1e
> > --- /dev/null
> > +++
> >
> b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
> > @@ -0,0 +1,258 @@
> > +/*
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements. See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version
> 2.0
> > + * (the "License"); you may not use this file except in compliance with
> > + * the License. You may obtain a copy of the License at
> > + *
> > + * http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +package org.apache.commons.text.similarity;
> > +
> > +import java.util.Arrays;
> > +
> > +/**
> > + * <p>
> > + * A string metric for measuring the difference between two sequences.
> > + * </p>
> > + *
> > + * <p>
> > + * This code has been adapted from Apache Commons Lang 3.3.
> > + * </p>
> > + *
> > + * @since 1.0
> > + */
> > +public class LevenshteinDistance implements StringMetric<Integer> {
> > +
> > + /**
> > + * <p>
> > + * Find the Levenshtein distance between two Strings.
> > + * </p>
> > + *
> > + * <p>
> > + * This is the number of changes needed to change one String into
> > another,
> > + * where each change is a single character modification (deletion,
> > insertion
> > + * or substitution).
> > + * </p>
> > + *
> > + * <p>
> > + * The previous implementation of the Levenshtein distance algorithm
> > was
> > + * from <a
> > + * href="http://www.merriampark.com/ld.htm">
> > http://www.merriampark.com
> > + * /ld.htm</a>
> > + * </p>
> > + *
> > + * <p>
> > + * Chas Emerick has written an implementation in Java, which avoids
> an
> > + * OutOfMemoryError which can occur when my Java implementation is
> > used with
> > + * very large strings.<br>
> > + * This implementation of the Levenshtein distance algorithm is from
> > <a
> > + * href="http://www.merriampark.com/ldjava.htm">
> > http://www.merriampark.com/
> > + * ldjava.htm</a>
> > + * </p>
> > + *
> > + * <pre>
> > + * StringUtils.getLevenshteinDistance(null, *) =
> > IllegalArgumentException
> > + * StringUtils.getLevenshteinDistance(*, null) =
> > IllegalArgumentException
> > + * StringUtils.getLevenshteinDistance("","") = 0
> > + * StringUtils.getLevenshteinDistance("","a") = 1
> > + * StringUtils.getLevenshteinDistance("aaapppp", "") = 7
> > + * StringUtils.getLevenshteinDistance("frog", "fog") = 1
> > + * StringUtils.getLevenshteinDistance("fly", "ant") = 3
> > + * StringUtils.getLevenshteinDistance("elephant", "hippo") = 7
> > + * StringUtils.getLevenshteinDistance("hippo", "elephant") = 7
> > + * StringUtils.getLevenshteinDistance("hippo", "zzzzzzzz") = 8
> > + * StringUtils.getLevenshteinDistance("hello", "hallo") = 1
> > + * </pre>
> > + *
> > + * @param left the first string, must not be null
> > + * @param right the second string, must not be null
> > + * @return result distance
> > + * @throws IllegalArgumentException if either String input {@code
> > null}
> > + */
> > + @Override
> > + public Integer compare(CharSequence left, CharSequence right) {
> > + return compare(left, right, Integer.MAX_VALUE);
> > + }
> > +
> > + /**
> > + * <p>
> > + * Find the Levenshtein distance between two Strings if it's less
> > than or
> > + * equal to a given threshold.
> > + * </p>
> > + *
> > + * <p>
> > + * This is the number of changes needed to change one String into
> > another,
> > + * where each change is a single character modification (deletion,
> > insertion
> > + * or substitution).
> > + * </p>
> > + *
> > + * <p>
> > + * This implementation follows from Algorithms on Strings, Trees and
> > + * Sequences by Dan Gusfield and Chas Emerick's implementation of the
> > + * Levenshtein distance algorithm from <a
> > + * href="http://www.merriampark.com/ld.htm"
> > + * >http://www.merriampark.com/ld.htm</a>
> > + * </p>
> > + *
> > + * <pre>
> > + * StringUtils.getLevenshteinDistance(null, *, *) =
> > IllegalArgumentException
> > + * StringUtils.getLevenshteinDistance(*, null, *) =
> > IllegalArgumentException
> > + * StringUtils.getLevenshteinDistance(*, *, -1) =
> > IllegalArgumentException
> > + * StringUtils.getLevenshteinDistance("","", 0) = 0
> > + * StringUtils.getLevenshteinDistance("aaapppp", "", 8) = 7
> > + * StringUtils.getLevenshteinDistance("aaapppp", "", 7) = 7
> > + * StringUtils.getLevenshteinDistance("aaapppp", "", 6)) = -1
> > + * StringUtils.getLevenshteinDistance("elephant", "hippo", 7) = 7
> > + * StringUtils.getLevenshteinDistance("elephant", "hippo", 6) = -1
> > + * StringUtils.getLevenshteinDistance("hippo", "elephant", 7) = 7
> > + * StringUtils.getLevenshteinDistance("hippo", "elephant", 6) = -1
> > + * </pre>
> > + *
> > + * @param left the first string, must not be null
> > + * @param right the second string, must not be null
> > + * @param threshold the target threshold, must not be negative
> > + * @return result distance
> > + * @throws IllegalArgumentException if either String input {@code
> > null} or
> > + * negative threshold
> > + */
> > + public Integer compare(CharSequence left, CharSequence right, int
> > threshold) {
> > + if (left == null || right == null) {
> > + throw new IllegalArgumentException("Strings must not be
> > null");
> > + }
> > + if (threshold < 0) {
> > + throw new IllegalArgumentException("Threshold must not be
> > negative");
> > + }
> > +
> > + /*
> > + * This implementation only computes the distance if it's less
> > than or
> > + * equal to the threshold value, returning -1 if it's greater.
> The
> > + * advantage is performance: unbounded distance is O(nm), but a
> > bound of
> > + * k allows us to reduce it to O(km) time by only computing a
> > diagonal
> > + * stripe of width 2k + 1 of the cost table. It is also possible
> > to use
> > + * this to compute the unbounded Levenshtein distance by starting
> > the
> > + * threshold at 1 and doubling each time until the distance is
> > found;
> > + * this is O(dm), where d is the distance.
> > + *
> > + * One subtlety comes from needing to ignore entries on the
> > border of
> > + * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore
> > the entry
> > + * to the left of the leftmost member We must ignore the entry
> > above the
> > + * rightmost member
> > + *
> > + * Another subtlety comes from our stripe running off the matrix
> > if the
> > + * strings aren't of the same size. Since string s is always
> > swapped to
> > + * be the shorter of the two, the stripe will always run off to
> > the
> > + * upper right instead of the lower left of the matrix.
> > + *
> > + * As a concrete example, suppose s is of length 5, t is of
> > length 7,
> > + * and our threshold is 1. In this case we're going to walk a
> > stripe of
> > + * length 3. The matrix would look like so:
> > + *
> > + * 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | |
> > |#|#|#| 5 |
> > + * | | |#|#| 6 | | | | |#| 7 | | | | | |
> > + *
> > + * Note how the stripe leads off the table as there is no
> > possible way
> > + * to turn a string of length 5 into one of length 7 in edit
> > distance of
> > + * 1.
> > + *
> > + * Additionally, this implementation decreases memory usage by
> > using two
> > + * single-dimensional arrays and swapping them back and forth
> > instead of
> > + * allocating an entire n by m matrix. This requires a few minor
> > + * changes, such as immediately returning when it's detected that
> > the
> > + * stripe has run off the matrix and initially filling the arrays
> > with
> > + * large values so that entries we don't compute are ignored.
> > + *
> > + * See Algorithms on Strings, Trees and Sequences by Dan Gusfield
> > for
> > + * some discussion.
> > + */
> > +
> > + int n = left.length(); // length of s
> > + int m = right.length(); // length of t
> > +
> > + // if one string is empty, the edit distance is necessarily the
> > length
> > + // of the other
> > + if (n == 0) {
> > + return m <= threshold ? m : -1;
> > + } else if (m == 0) {
> > + return n <= threshold ? n : -1;
> > + }
> > +
> > + if (n > m) {
> > + // swap the two strings to consume less memory
> > + final CharSequence tmp = left;
> > + left = right;
> > + right = tmp;
> > + n = m;
> > + m = right.length();
> > + }
> > +
> > + int[] p = new int[n + 1]; // 'previous' cost array, horizontally
> > + int[] d = new int[n + 1]; // cost array, horizontally
> > + int[] _d; // placeholder to assist in swapping p and d
> > +
> > + // fill in starting table values
> > + final int boundary = Math.min(n, threshold) + 1;
> > + for (int i = 0; i < boundary; i++) {
> > + p[i] = i;
> > + }
> > + // these fills ensure that the value above the rightmost entry
> of
> > our
> > + // stripe will be ignored in following loop iterations
> > + Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE);
> > + Arrays.fill(d, Integer.MAX_VALUE);
> > +
> > + // iterates through t
> > + for (int j = 1; j <= m; j++) {
> > + final char t_j = right.charAt(j - 1); // jth character of t
> > + d[0] = j;
> > +
> > + // compute stripe indices, constrain to array size
> > + final int min = Math.max(1, j - threshold);
> > + final int max = j > Integer.MAX_VALUE - threshold ? n :
> > Math.min(
> > + n, j + threshold);
> > +
> > + // the stripe may lead off of the table if s and t are of
> > different
> > + // sizes
> > + if (min > max) {
> > + return -1;
> > + }
> > +
> > + // ignore entry left of leftmost
> > + if (min > 1) {
> > + d[min - 1] = Integer.MAX_VALUE;
> > + }
> > +
> > + // iterates through [min, max] in s
> > + for (int i = min; i <= max; i++) {
> > + if (left.charAt(i - 1) == t_j) {
> > + // diagonally left and up
> > + d[i] = p[i - 1];
> > + } else {
> > + // 1 + minimum of cell to the left, to the top,
> > diagonally
> > + // left and up
> > + d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i -
> > 1]);
> > + }
> > + }
> > +
> > + // copy current distance counts to 'previous row' distance
> > counts
> > + _d = p;
> > + p = d;
> > + d = _d;
> > + }
> > +
> > + // if p[n] is greater than the threshold, there's no guarantee
> on
> > it
> > + // being the correct
> > + // distance
> > + if (p[n] <= threshold) {
> > + return p[n];
> > + }
> > + return -1;
> > + }
> > +
> > +}
> >
> >
> >
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/StringMetric.java
> > ----------------------------------------------------------------------
> > diff --git
> > a/src/main/java/org/apache/commons/text/similarity/StringMetric.java
> > b/src/main/java/org/apache/commons/text/similarity/StringMetric.java
> > new file mode 100644
> > index 0000000..6765bbd
> > --- /dev/null
> > +++ b/src/main/java/org/apache/commons/text/similarity/StringMetric.java
> > @@ -0,0 +1,45 @@
> > +/*
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements. See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version
> 2.0
> > + * (the "License"); you may not use this file except in compliance with
> > + * the License. You may obtain a copy of the License at
> > + *
> > + * http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +package org.apache.commons.text.similarity;
> > +
> > +/**
> > + * <p>
> > + * Marker interface for <a
> > + * href='http://en.wikipedia.org/wiki/String_metric'>String Metrics</a>
> > + * </p>
> > + *
> > + * <p>
> > + * A string metric measures the distance between two text strings.
> > + * </p>
> > + *
> > + * @param <R> return type of the edit distance
> > + * @since 1.0
> > + */
> > +public interface StringMetric<R> {
> > +
> > + /**
> > + * <p>
> > + * Compares two strings.
> > + * </p>
> > + *
> > + * @param left the first string
> > + * @param right the second string
> > + * @return result distance
> > + */
> > + R compare(CharSequence left, CharSequence right);
> > +
> > +}
> >
> >
> >
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/package-info.java
> > ----------------------------------------------------------------------
> > diff --git
> > a/src/main/java/org/apache/commons/text/similarity/package-info.java
> > b/src/main/java/org/apache/commons/text/similarity/package-info.java
> > new file mode 100644
> > index 0000000..8e9d478
> > --- /dev/null
> > +++ b/src/main/java/org/apache/commons/text/similarity/package-info.java
> > @@ -0,0 +1,22 @@
> > +/*
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements. See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version
> 2.0
> > + * (the "License"); you may not use this file except in compliance with
> > + * the License. You may obtain a copy of the License at
> > + *
> > + * http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +/**
> > + * <p>Provides algorithms for string similarity.</p>
> > + *
> > + * @since 1.0
> > + */
> > +package org.apache.commons.text.similarity;
> >
> >
> >
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java
> > ----------------------------------------------------------------------
> > diff --git
> > a/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java
> > b/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java
> > new file mode 100644
> > index 0000000..30b1dfc
> > --- /dev/null
> > +++
> > b/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java
> > @@ -0,0 +1,75 @@
> > +/*
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements. See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version
> 2.0
> > + * (the "License"); you may not use this file except in compliance with
> > + * the License. You may obtain a copy of the License at
> > + *
> > + * http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +package org.apache.commons.text.similarity;
> > +
> > +import static org.junit.Assert.assertEquals;
> > +
> > +import java.util.Locale;
> > +
> > +import org.junit.BeforeClass;
> > +import org.junit.Test;
> > +
> > +/**
> > + * Unit tests for {@link org.apache.commons.text.FuzzyDistance}.
> > + */
> > +public class TestFuzzyDistance {
> > +
> > + private static FuzzyDistance distance;
> > +
> > + @BeforeClass
> > + public static void setUp() {
> > + distance = new FuzzyDistance();
> > + }
> > +
> > + @Test
> > + public void testGetFuzzyDistance() throws Exception {
> > + assertEquals(0, (int) distance.compare("", "", Locale.ENGLISH));
> > + assertEquals(0,
> > + (int) distance.compare("Workshop", "b",
> Locale.ENGLISH));
> > + assertEquals(1,
> > + (int) distance.compare("Room", "o", Locale.ENGLISH));
> > + assertEquals(1,
> > + (int) distance.compare("Workshop", "w",
> Locale.ENGLISH));
> > + assertEquals(2,
> > + (int) distance.compare("Workshop", "ws",
> Locale.ENGLISH));
> > + assertEquals(4,
> > + (int) distance.compare("Workshop", "wo",
> Locale.ENGLISH));
> > + assertEquals(3, (int) distance.compare(
> > + "Apache Software Foundation", "asf", Locale.ENGLISH));
> > + }
> > +
> > + @Test(expected = IllegalArgumentException.class)
> > + public void testGetFuzzyDistance_NullNullNull() throws Exception {
> > + distance.compare(null, null, null);
> > + }
> > +
> > + @Test(expected = IllegalArgumentException.class)
> > + public void testGetFuzzyDistance_StringNullLoclae() throws
> Exception {
> > + distance.compare(" ", null, Locale.ENGLISH);
> > + }
> > +
> > + @Test(expected = IllegalArgumentException.class)
> > + public void testGetFuzzyDistance_NullStringLocale() throws
> Exception {
> > + distance.compare(null, "clear", Locale.ENGLISH);
> > + }
> > +
> > + @Test(expected = IllegalArgumentException.class)
> > + public void testGetFuzzyDistance_StringStringNull() throws
> Exception {
> > + distance.compare(" ", "clear", null);
> > + }
> > +
> > +}
> >
> >
> >
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java
> > ----------------------------------------------------------------------
> > diff --git
> >
> a/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java
> >
> b/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java
> > new file mode 100644
> > index 0000000..6477de0
> > --- /dev/null
> > +++
> >
> b/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java
> > @@ -0,0 +1,62 @@
> > +/*
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements. See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version
> 2.0
> > + * (the "License"); you may not use this file except in compliance with
> > + * the License. You may obtain a copy of the License at
> > + *
> > + * http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +package org.apache.commons.text.similarity;
> > +
> > +import static org.junit.Assert.assertEquals;
> > +
> > +import org.junit.BeforeClass;
> > +import org.junit.Test;
> > +
> > +/**
> > + * Unit tests for {@link org.apache.commons.text.JaroWrinklerDistance}.
> > + */
> > +public class TestJaroWrinklerDistance {
> > +
> > + private static JaroWrinklerDistance distance;
> > +
> > + @BeforeClass
> > + public static void setUp() {
> > + distance = new JaroWrinklerDistance();
> > + }
> > +
> > + @Test
> > + public void testGetJaroWinklerDistance_StringString() {
> > + assertEquals(0.93d, (double) distance.compare("frog", "fog"),
> > 0.0d);
> > + assertEquals(0.0d, (double) distance.compare("fly", "ant"),
> 0.0d);
> > + assertEquals(0.44d, (double) distance.compare("elephant",
> > "hippo"), 0.0d);
> > + assertEquals(0.91d, (double) distance.compare("ABC Corporation",
> > "ABC Corp"), 0.0d);
> > + assertEquals(0.93d, (double) distance.compare("D N H Enterprises
> > Inc", "D & H Enterprises, Inc."), 0.0d);
> > + assertEquals(0.94d, (double) distance.compare("My Gym Children's
> > Fitness Center", "My Gym. Childrens Fitness"), 0.0d);
> > + assertEquals(0.9d, (double) distance.compare("PENNSYLVANIA",
> > "PENNCISYLVNIA"), 0.0d);
> > + }
> > +
> > + @Test(expected = IllegalArgumentException.class)
> > + public void testGetJaroWinklerDistance_NullNull() throws Exception {
> > + distance.compare(null, null);
> > + }
> > +
> > + @Test(expected = IllegalArgumentException.class)
> > + public void testGetJaroWinklerDistance_StringNull() throws
> Exception {
> > + distance.compare(" ", null);
> > + }
> > +
> > + @Test(expected = IllegalArgumentException.class)
> > + public void testGetJaroWinklerDistance_NullString() throws
> Exception {
> > + distance.compare(null, "clear");
> > + }
> > +
> > +}
> >
> >
> >
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java
> > ----------------------------------------------------------------------
> > diff --git
> >
> a/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java
> >
> b/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java
> > new file mode 100644
> > index 0000000..7790b05
> > --- /dev/null
> > +++
> >
> b/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java
> > @@ -0,0 +1,146 @@
> > +/*
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements. See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version
> 2.0
> > + * (the "License"); you may not use this file except in compliance with
> > + * the License. You may obtain a copy of the License at
> > + *
> > + * http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +package org.apache.commons.text.similarity;
> > +
> > +import static org.junit.Assert.assertEquals;
> > +
> > +import org.junit.BeforeClass;
> > +import org.junit.Test;
> > +
> > +/**
> > + * Unit tests for {@link org.apache.commons.text.LevenshteinDistance}.
> > + */
> > +public class TestLevenshteinDistance {
> > +
> > + private static LevenshteinDistance distance;
> > +
> > + @BeforeClass
> > + public static void setUp() {
> > + distance = new LevenshteinDistance();
> > + }
> > +
> > + @Test
> > + public void testGetLevenshteinDistance_StringString() {
> > + assertEquals(0, (int) distance.compare("", ""));
> > + assertEquals(1, (int) distance.compare("", "a"));
> > + assertEquals(7, (int) distance.compare("aaapppp", ""));
> > + assertEquals(1, (int) distance.compare("frog", "fog"));
> > + assertEquals(3, (int) distance.compare("fly", "ant"));
> > + assertEquals(7, (int) distance.compare("elephant", "hippo"));
> > + assertEquals(7, (int) distance.compare("hippo", "elephant"));
> > + assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz"));
> > + assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo"));
> > + assertEquals(1, (int) distance.compare("hello", "hallo"));
> > + }
> > +
> > + @Test(expected = IllegalArgumentException.class)
> > + public void testGetLevenshteinDistance_NullString() throws
> Exception {
> > + distance.compare("a", null);
> > + }
> > +
> > + @Test(expected = IllegalArgumentException.class)
> > + public void testGetLevenshteinDistance_StringNull() throws
> Exception {
> > + distance.compare(null, "a");
> > + }
> > +
> > + @Test
> > + public void testGetLevenshteinDistance_StringStringInt() {
> > + // empty strings
> > + assertEquals(0, (int) distance.compare("", "", 0));
> > + assertEquals(7, (int) distance.compare("aaapppp", "", 8));
> > + assertEquals(7, (int) distance.compare("aaapppp", "", 7));
> > + assertEquals(-1, (int) distance.compare("aaapppp", "", 6));
> > +
> > + // unequal strings, zero threshold
> > + assertEquals(-1, (int) distance.compare("b", "a", 0));
> > + assertEquals(-1, (int) distance.compare("a", "b", 0));
> > +
> > + // equal strings
> > + assertEquals(0, (int) distance.compare("aa", "aa", 0));
> > + assertEquals(0, (int) distance.compare("aa", "aa", 2));
> > +
> > + // same length
> > + assertEquals(-1, (int) distance.compare("aaa", "bbb", 2));
> > + assertEquals(3, (int) distance.compare("aaa", "bbb", 3));
> > +
> > + // big stripe
> > + assertEquals(6, (int) distance.compare("aaaaaa", "b", 10));
> > +
> > + // distance less than threshold
> > + assertEquals(7, (int) distance.compare("aaapppp", "b", 8));
> > + assertEquals(3, (int) distance.compare("a", "bbb", 4));
> > +
> > + // distance equal to threshold
> > + assertEquals(7, (int) distance.compare("aaapppp", "b", 7));
> > + assertEquals(3, (int) distance.compare("a", "bbb", 3));
> > +
> > + // distance greater than threshold
> > + assertEquals(-1, (int) distance.compare("a", "bbb", 2));
> > + assertEquals(-1, (int) distance.compare("bbb", "a", 2));
> > + assertEquals(-1, (int) distance.compare("aaapppp", "b", 6));
> > +
> > + // stripe runs off array, strings not similar
> > + assertEquals(-1, (int) distance.compare("a", "bbb", 1));
> > + assertEquals(-1, (int) distance.compare("bbb", "a", 1));
> > +
> > + // stripe runs off array, strings are similar
> > + assertEquals(-1, (int) distance.compare("12345", "1234567", 1));
> > + assertEquals(-1, (int) distance.compare("1234567", "12345", 1));
> > +
> > + // old getLevenshteinDistance test cases
> > + assertEquals(1, (int) distance.compare("frog", "fog", 1));
> > + assertEquals(3, (int) distance.compare("fly", "ant", 3));
> > + assertEquals(7, (int) distance.compare("elephant", "hippo", 7));
> > + assertEquals(-1, (int) distance.compare("elephant", "hippo",
> 6));
> > + assertEquals(7, (int) distance.compare("hippo", "elephant", 7));
> > + assertEquals(-1, (int) distance.compare("hippo", "elephant",
> 6));
> > + assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz", 8));
> > + assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo", 8));
> > + assertEquals(1, (int) distance.compare("hello", "hallo", 1));
> > +
> > + assertEquals(1,
> > + (int) distance.compare("frog", "fog",
> Integer.MAX_VALUE));
> > + assertEquals(3, (int) distance.compare("fly", "ant",
> > Integer.MAX_VALUE));
> > + assertEquals(7,
> > + (int) distance.compare("elephant", "hippo",
> > Integer.MAX_VALUE));
> > + assertEquals(7,
> > + (int) distance.compare("hippo", "elephant",
> > Integer.MAX_VALUE));
> > + assertEquals(8,
> > + (int) distance.compare("hippo", "zzzzzzzz",
> > Integer.MAX_VALUE));
> > + assertEquals(8,
> > + (int) distance.compare("zzzzzzzz", "hippo",
> > Integer.MAX_VALUE));
> > + assertEquals(1,
> > + (int) distance.compare("hello", "hallo",
> > Integer.MAX_VALUE));
> > + }
> > +
> > + @Test(expected = IllegalArgumentException.class)
> > + public void testGetLevenshteinDistance_NullStringInt() throws
> > Exception {
> > + distance.compare(null, "a", 0);
> > + }
> > +
> > + @Test(expected = IllegalArgumentException.class)
> > + public void testGetLevenshteinDistance_StringNullInt() throws
> > Exception {
> > + distance.compare("a", null, 0);
> > + }
> > +
> > + @Test(expected = IllegalArgumentException.class)
> > + public void testGetLevenshteinDistance_StringStringNegativeInt()
> > + throws Exception {
> > + distance.compare("a", "a", -1);
> > + }
> > +
> > +}
> >
> >
> > ---------------------------------------------------------------------
> > To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
> > For additional commands, e-mail: dev-help@commons.apache.org
> >
> >
>
>
> --
> http://people.apache.org/~britter/
> http://www.systemoutprintln.de/
> http://twitter.com/BenediktRitter
> http://github.com/britter
>
>
>
>
--
http://people.apache.org/~britter/
http://www.systemoutprintln.de/
http://twitter.com/BenediktRitter
http://github.com/britter
Re: commons-text git commit: SANDBOX-483 Incorporate String
algorithms from Commons Lang
Posted by "Bruno P. Kinoshita" <br...@yahoo.com.br>.
Hi Benedikt!
I had a look at some checkstyle issues, and one of them warned me about unnecessary parenthesis. Guess I removed one too many parenthesis.
Can you update your repo and give it a try again, please? Thanks!Bruno
From: Benedikt Ritter <br...@apache.org>
To: Commons Developers List <de...@commons.apache.org>
Sent: Wednesday, November 26, 2014 6:00 AM
Subject: Re: commons-text git commit: SANDBOX-483 Incorporate String algorithms from Commons Lang
Hello Bruno,
when building commons-text I get:
testGetJaroWinklerDistance_StringString(org.apache.commons.text.similarity.TestJaroWrinklerDistance)
Time elapsed: 0.015 sec <<< FAILURE!
java.lang.AssertionError: expected:<0.93> but was:<0.02>
at org.junit.Assert.fail(Assert.java:88)
at org.junit.Assert.failNotEquals(Assert.java:743)
at org.junit.Assert.assertEquals(Assert.java:494)
at org.junit.Assert.assertEquals(Assert.java:592)
at
org.apache.commons.text.similarity.TestJaroWrinklerDistance.testGetJaroWinklerDistance_StringString(TestJaroWrinklerDistance.java:38)
Running org.apache.commons.text.similarity.TestLevenshteinDistance
Tests run: 7, Failures: 0, Errors: 0, Skipped: 0, Time elapsed: 0.006 sec -
in org.apache.commons.text.similarity.TestLevenshteinDistance
Results :
Failed tests:
TestJaroWrinklerDistance.testGetJaroWinklerDistance_StringString:38
expected:<0.93> but was:<0.02>
Tests run: 16, Failures: 1, Errors: 0, Skipped: 0
can you check?
thanks!
Benedikt
2014-11-20 5:54 GMT+01:00 <ki...@apache.org>:
> Repository: commons-text
> Updated Branches:
> refs/heads/master 71b59af90 -> c0e8c0210
>
>
> SANDBOX-483 Incorporate String algorithms from Commons Lang
>
>
> Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
> Commit:
> http://git-wip-us.apache.org/repos/asf/commons-text/commit/c0e8c021
> Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/c0e8c021
> Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/c0e8c021
>
> Branch: refs/heads/master
> Commit: c0e8c021059d8300f90cf2123528a35d7c468b57
> Parents: 71b59af
> Author: Bruno P. Kinoshita <br...@yahoo.com.br>
> Authored: Thu Nov 20 02:23:05 2014 -0200
> Committer: Bruno P. Kinoshita <br...@yahoo.com.br>
> Committed: Thu Nov 20 02:23:05 2014 -0200
>
> ----------------------------------------------------------------------
> .../commons/text/similarity/FuzzyDistance.java | 130 +++++++
> .../text/similarity/JaroWrinklerDistance.java | 373 +++++++++++++++++++
> .../text/similarity/LevenshteinDistance.java | 258 +++++++++++++
> .../commons/text/similarity/StringMetric.java | 45 +++
> .../commons/text/similarity/package-info.java | 22 ++
> .../text/similarity/TestFuzzyDistance.java | 75 ++++
> .../similarity/TestJaroWrinklerDistance.java | 62 +++
> .../similarity/TestLevenshteinDistance.java | 146 ++++++++
> 8 files changed, 1111 insertions(+)
> ----------------------------------------------------------------------
>
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java
> ----------------------------------------------------------------------
> diff --git
> a/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java
> b/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java
> new file mode 100644
> index 0000000..8e9228a
> --- /dev/null
> +++ b/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java
> @@ -0,0 +1,130 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.commons.text.similarity;
> +
> +import java.util.Locale;
> +
> +/**
> + * <p>
> + * This string matching algorithm is similar to the algorithms of editors
> such
> + * as Sublime Text, TextMate, Atom and others. One point is given for
> every
> + * matched character. Subsequent matches yield two bonus points. A higher
> score
> + * indicates a higher similarity.
> + * </p>
> + *
> + * @since 1.0
> + */
> +public class FuzzyDistance implements StringMetric<Integer> {
> +
> + /**
> + * <p>
> + * Find the Fuzzy Distance which indicates the similarity score
> between two
> + * Strings. This method uses the default locale.
> + * </p>
> + *
> + * @param term a full term that should be matched against, must not
> be null
> + * @param query the query that will be matched against a term, must
> not be
> + * null
> + * @return result score
> + * @throws IllegalArgumentException if either String input {@code
> null}
> + */
> + @Override
> + public Integer compare(CharSequence term, CharSequence query) {
> + return compare(term, query, Locale.getDefault());
> + }
> +
> + /**
> + * <p>
> + * Find the Fuzzy Distance which indicates the similarity score
> between two
> + * Strings.
> + * </p>
> + *
> + * <pre>
> + * StringUtils.getFuzzyDistance(null, null, null)
> = IllegalArgumentException
> + * StringUtils.getFuzzyDistance("", "", Locale.ENGLISH)
> = 0
> + * StringUtils.getFuzzyDistance("Workshop", "b", Locale.ENGLISH)
> = 0
> + * StringUtils.getFuzzyDistance("Room", "o", Locale.ENGLISH)
> = 1
> + * StringUtils.getFuzzyDistance("Workshop", "w", Locale.ENGLISH)
> = 1
> + * StringUtils.getFuzzyDistance("Workshop", "ws", Locale.ENGLISH)
> = 2
> + * StringUtils.getFuzzyDistance("Workshop", "wo", Locale.ENGLISH)
> = 4
> + * StringUtils.getFuzzyDistance("Apache Software Foundation", "asf",
> Locale.ENGLISH) = 3
> + * </pre>
> + *
> + * @param term a full term that should be matched against, must not
> be null
> + * @param query the query that will be matched against a term, must
> not be
> + * null
> + * @param locale This string matching logic is case insensitive. A
> locale is
> + * necessary to normalize both Strings to lower case.
> + * @return result score
> + * @throws IllegalArgumentException if either String input {@code
> null} or
> + * Locale input {@code null}
> + */
> + public Integer compare(CharSequence term, CharSequence query, Locale
> locale) {
> + if (term == null || query == null) {
> + throw new IllegalArgumentException("Strings must not be
> null");
> + } else if (locale == null) {
> + throw new IllegalArgumentException("Locale must not be null");
> + }
> +
> + // fuzzy logic is case insensitive. We normalize the Strings to
> lower
> + // case right from the start. Turning characters to lower case
> + // via Character.toLowerCase(char) is unfortunately insufficient
> + // as it does not accept a locale.
> + final String termLowerCase = term.toString().toLowerCase(locale);
> + final String queryLowerCase =
> query.toString().toLowerCase(locale);
> +
> + // the resulting score
> + int score = 0;
> +
> + // the position in the term which will be scanned next for
> potential
> + // query character matches
> + int termIndex = 0;
> +
> + // index of the previously matched character in the term
> + int previousMatchingCharacterIndex = Integer.MIN_VALUE;
> +
> + for (int queryIndex = 0; queryIndex < queryLowerCase.length();
> queryIndex++) {
> + final char queryChar = queryLowerCase.charAt(queryIndex);
> +
> + boolean termCharacterMatchFound = false;
> + for (; termIndex < termLowerCase.length()
> + && !termCharacterMatchFound; termIndex++) {
> + final char termChar = termLowerCase.charAt(termIndex);
> +
> + if (queryChar == termChar) {
> + // simple character matches result in one point
> + score++;
> +
> + // subsequent character matches further improve
> + // the score.
> + if (previousMatchingCharacterIndex + 1 == termIndex) {
> + score += 2;
> + }
> +
> + previousMatchingCharacterIndex = termIndex;
> +
> + // we can leave the nested loop. Every character in
> the
> + // query can match at most one character in the term.
> + termCharacterMatchFound = true;
> + }
> + }
> + }
> +
> + return score;
> + }
> +
> +}
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
> ----------------------------------------------------------------------
> diff --git
> a/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
> b/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
> new file mode 100644
> index 0000000..2dcd51c
> --- /dev/null
> +++
> b/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
> @@ -0,0 +1,373 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.commons.text.similarity;
> +
> +/**
> + * <p>
> + * The Jaro measure is the weighted sum of percentage of matched
> characters
> + * from each file and transposed characters. Winkler increased this
> measure
> + * for matching initial characters.
> + * </p>
> + *
> + * <p>
> + * This implementation is based on the Jaro Winkler similarity algorithm
> + * from <a href="
> http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
> + * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
> + * </p>
> + *
> + * <p>
> + * This code has been adapted from Apache Commons Lang 3.3.
> + * </p>
> + *
> + * @since 1.0
> + */
> +public class JaroWrinklerDistance implements StringMetric<Double> {
> +
> + /**
> + * Represents a failed index search.
> + */
> + public static final int INDEX_NOT_FOUND = -1;
> +
> + /**
> + * <p>
> + * Find the Jaro Winkler Distance which indicates the similarity score
> + * between two Strings.
> + * </p>
> + *
> + * <pre>
> + * StringUtils.getJaroWinklerDistance(null, null) =
> IllegalArgumentException
> + * StringUtils.getJaroWinklerDistance("","") = 0.0
> + * StringUtils.getJaroWinklerDistance("","a") = 0.0
> + * StringUtils.getJaroWinklerDistance("aaapppp", "") = 0.0
> + * StringUtils.getJaroWinklerDistance("frog", "fog") = 0.93
> + * StringUtils.getJaroWinklerDistance("fly", "ant") = 0.0
> + * StringUtils.getJaroWinklerDistance("elephant", "hippo") = 0.44
> + * StringUtils.getJaroWinklerDistance("hippo", "elephant") = 0.44
> + * StringUtils.getJaroWinklerDistance("hippo", "zzzzzzzz") = 0.0
> + * StringUtils.getJaroWinklerDistance("hello", "hallo") = 0.88
> + * StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp")
> = 0.91
> + * StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D
> & H Enterprises, Inc.") = 0.93
> + * StringUtils.getJaroWinklerDistance("My Gym Children's Fitness
> Center", "My Gym. Childrens Fitness") = 0.94
> + * StringUtils.getJaroWinklerDistance("PENNSYLVANIA",
> "PENNCISYLVNIA") = 0.9
> + * </pre>
> + *
> + * @param left the first String, must not be null
> + * @param right the second String, must not be null
> + * @return result distance
> + * @throws IllegalArgumentException if either String input {@code
> null}
> + */
> + @Override
> + public Double compare(CharSequence left, CharSequence right) {
> + final double DEFAULT_SCALING_FACTOR = 0.1;
> +
> + if (left == null || right == null) {
> + throw new IllegalArgumentException("Strings must not be
> null");
> + }
> +
> + final double jaro = score(left, right);
> + final int cl = commonPrefixLength(left, right);
> + final double matchScore = Math.round(jaro +
> (DEFAULT_SCALING_FACTOR
> + * cl * (1.0 - jaro)) * 100.0) / 100.0;
> +
> + return matchScore;
> + }
> +
> + // TODO: we can move these methods to a Util class, keep them here,
> + // create a common abstract class, shade lang-3.3...
> +
> + /**
> + * Calculates the number of characters from the beginning of the
> strings
> + * that match exactly one-to-one, up to a maximum of four (4)
> characters.
> + *
> + * @param first The first string.
> + * @param second The second string.
> + * @return A number between 0 and 4.
> + */
> + private static int commonPrefixLength(final CharSequence first,
> + final CharSequence second) {
> + final int result = getCommonPrefix(first.toString(),
> second.toString())
> + .length();
> +
> + // Limit the result to 4.
> + return result > 4 ? 4 : result;
> + }
> +
> + /**
> + * <p>
> + * Compares all Strings in an array and returns the initial sequence
> of
> + * characters that is common to all of them.
> + * </p>
> + *
> + * <p>
> + * For example,
> + * <code>getCommonPrefix(new String[] {"i am a machine", "i am a
> robot"}) -> "i am a "</code>
> + * </p>
> + *
> + * <pre>
> + * StringUtils.getCommonPrefix(null) = ""
> + * StringUtils.getCommonPrefix(new String[] {}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"abc"}) = "abc"
> + * StringUtils.getCommonPrefix(new String[] {null, null}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"", ""}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"", null}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"abc", null, null}) = ""
> + * StringUtils.getCommonPrefix(new String[] {null, null, "abc"}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"", "abc"}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"abc", ""}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"abc", "abc"}) = "abc"
> + * StringUtils.getCommonPrefix(new String[] {"abc", "a"}) = "a"
> + * StringUtils.getCommonPrefix(new String[] {"ab", "abxyz"}) = "ab"
> + * StringUtils.getCommonPrefix(new String[] {"abcde", "abxyz"}) = "ab"
> + * StringUtils.getCommonPrefix(new String[] {"abcde", "xyz"}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"xyz", "abcde"}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"i am a machine", "i am
> a robot"}) = "i am a "
> + * </pre>
> + *
> + * @param strs array of String objects, entries may be null
> + * @return the initial sequence of characters that are common to all
> Strings
> + * in the array; empty String if the array is null, the
> elements are
> + * all null or if there is no common prefix.
> + * @since 2.4
> + */
> + public static String getCommonPrefix(final String... strs) {
> + if (strs == null || strs.length == 0) {
> + return "";
> + }
> + final int smallestIndexOfDiff = indexOfDifference(strs);
> + if (smallestIndexOfDiff == INDEX_NOT_FOUND) {
> + // all strings were identical
> + if (strs[0] == null) {
> + return "";
> + }
> + return strs[0];
> + } else if (smallestIndexOfDiff == 0) {
> + // there were no common initial characters
> + return "";
> + } else {
> + // we found a common initial character sequence
> + return strs[0].substring(0, smallestIndexOfDiff);
> + }
> + }
> +
> + /**
> + * This method returns the Jaro-Winkler score for string matching.
> + *
> + * @param first the first string to be matched
> + * @param second the second string to be machted
> + * @return matching score without scaling factor impact
> + */
> + protected static double score(final CharSequence first,
> + final CharSequence second) {
> + String shorter;
> + String longer;
> +
> + // Determine which String is longer.
> + if (first.length() > second.length()) {
> + longer = first.toString().toLowerCase();
> + shorter = second.toString().toLowerCase();
> + } else {
> + longer = second.toString().toLowerCase();
> + shorter = first.toString().toLowerCase();
> + }
> +
> + // Calculate the half length() distance of the shorter String.
> + final int halflength = shorter.length() / 2 + 1;
> +
> + // Find the set of matching characters between the shorter and
> longer
> + // strings. Note that
> + // the set of matching characters may be different depending on
> the
> + // order of the strings.
> + final String m1 = getSetOfMatchingCharacterWithin(shorter, longer,
> + halflength);
> + final String m2 = getSetOfMatchingCharacterWithin(longer, shorter,
> + halflength);
> +
> + // If one or both of the sets of common characters is empty, then
> + // there is no similarity between the two strings.
> + if (m1.length() == 0 || m2.length() == 0) {
> + return 0.0;
> + }
> +
> + // If the set of common characters is not the same size, then
> + // there is no similarity between the two strings, either.
> + if (m1.length() != m2.length()) {
> + return 0.0;
> + }
> +
> + // Calculate the number of transposition between the two sets
> + // of common characters.
> + final int transpositions = transpositions(m1, m2);
> +
> + // Calculate the distance.
> + final double dist = (m1.length() / ((double) shorter.length())
> + + m2.length() / ((double) longer.length()) + (m1.length()
> - transpositions)
> + / ((double) m1.length())) / 3.0;
> + return dist;
> + }
> +
> + /**
> + * Calculates the number of transposition between two strings.
> + *
> + * @param first The first string.
> + * @param second The second string.
> + * @return The number of transposition between the two strings.
> + */
> + protected static int transpositions(final CharSequence first,
> + final CharSequence second) {
> + int transpositions = 0;
> + for (int i = 0; i < first.length(); i++) {
> + if (first.charAt(i) != second.charAt(i)) {
> + transpositions++;
> + }
> + }
> + return transpositions / 2;
> + }
> +
> + /**
> + * <p>
> + * Compares all CharSequences in an array and returns the index at
> which the
> + * CharSequences begin to differ.
> + * </p>
> + *
> + * <p>
> + * For example,
> + * <code>indexOfDifference(new String[] {"i am a machine", "i am a
> robot"}) -> 7</code>
> + * </p>
> + *
> + * <pre>
> + * StringUtils.indexOfDifference(null) = -1
> + * StringUtils.indexOfDifference(new String[] {}) = -1
> + * StringUtils.indexOfDifference(new String[] {"abc"}) = -1
> + * StringUtils.indexOfDifference(new String[] {null, null}) = -1
> + * StringUtils.indexOfDifference(new String[] {"", ""}) = -1
> + * StringUtils.indexOfDifference(new String[] {"", null}) = 0
> + * StringUtils.indexOfDifference(new String[] {"abc", null, null}) = 0
> + * StringUtils.indexOfDifference(new String[] {null, null, "abc"}) = 0
> + * StringUtils.indexOfDifference(new String[] {"", "abc"}) = 0
> + * StringUtils.indexOfDifference(new String[] {"abc", ""}) = 0
> + * StringUtils.indexOfDifference(new String[] {"abc", "abc"}) = -1
> + * StringUtils.indexOfDifference(new String[] {"abc", "a"}) = 1
> + * StringUtils.indexOfDifference(new String[] {"ab", "abxyz"}) = 2
> + * StringUtils.indexOfDifference(new String[] {"abcde", "abxyz"}) = 2
> + * StringUtils.indexOfDifference(new String[] {"abcde", "xyz"}) = 0
> + * StringUtils.indexOfDifference(new String[] {"xyz", "abcde"}) = 0
> + * StringUtils.indexOfDifference(new String[] {"i am a machine", "i
> am a robot"}) = 7
> + * </pre>
> + *
> + * @param css array of CharSequences, entries may be null
> + * @return the index where the strings begin to differ; -1 if they
> are all
> + * equal
> + * @since 2.4
> + * @since 3.0 Changed signature from indexOfDifference(String...) to
> + * indexOfDifference(CharSequence...)
> + */
> + protected static int indexOfDifference(final CharSequence... css) {
> + if (css == null || css.length <= 1) {
> + return INDEX_NOT_FOUND;
> + }
> + boolean anyStringNull = false;
> + boolean allStringsNull = true;
> + final int arrayLen = css.length;
> + int shortestStrLen = Integer.MAX_VALUE;
> + int longestStrLen = 0;
> +
> + // find the min and max string lengths; this avoids checking to
> make
> + // sure we are not exceeding the length of the string each time
> through
> + // the bottom loop.
> + for (int i = 0; i < arrayLen; i++) {
> + if (css[i] == null) {
> + anyStringNull = true;
> + shortestStrLen = 0;
> + } else {
> + allStringsNull = false;
> + shortestStrLen = Math.min(css[i].length(),
> shortestStrLen);
> + longestStrLen = Math.max(css[i].length(), longestStrLen);
> + }
> + }
> +
> + // handle lists containing all nulls or all empty strings
> + if (allStringsNull || longestStrLen == 0 && !anyStringNull) {
> + return INDEX_NOT_FOUND;
> + }
> +
> + // handle lists containing some nulls or some empty strings
> + if (shortestStrLen == 0) {
> + return 0;
> + }
> +
> + // find the position with the first difference across all strings
> + int firstDiff = -1;
> + for (int stringPos = 0; stringPos < shortestStrLen; stringPos++) {
> + final char comparisonChar = css[0].charAt(stringPos);
> + for (int arrayPos = 1; arrayPos < arrayLen; arrayPos++) {
> + if (css[arrayPos].charAt(stringPos) != comparisonChar) {
> + firstDiff = stringPos;
> + break;
> + }
> + }
> + if (firstDiff != -1) {
> + break;
> + }
> + }
> +
> + if (firstDiff == -1 && shortestStrLen != longestStrLen) {
> + // we compared all of the characters up to the length of the
> + // shortest string and didn't find a match, but the string
> lengths
> + // vary, so return the length of the shortest string.
> + return shortestStrLen;
> + }
> + return firstDiff;
> + }
> +
> + /**
> + * Gets a set of matching characters between two strings.
> + *
> + * <p>
> + * Two characters from the first string and the second string are
> + * considered matching if the character's respective positions are no
> + * farther than the limit value.
> + * </p>
> + *
> + * @param first The first string.
> + * @param second The second string.
> + * @param limit The maximum distance to consider.
> + * @return A string contain the set of common characters.
> + */
> + protected static String getSetOfMatchingCharacterWithin(
> + final CharSequence first, final CharSequence second, final
> int limit) {
> + final StringBuilder common = new StringBuilder();
> + final StringBuilder copy = new StringBuilder(second);
> +
> + for (int i = 0; i < first.length(); i++) {
> + final char ch = first.charAt(i);
> + boolean found = false;
> +
> + // See if the character is within the limit positions away
> from the
> + // original position of that character.
> + for (int j = Math.max(0, i - limit); !found
> + && j < Math.min(i + limit, second.length()); j++) {
> + if (copy.charAt(j) == ch) {
> + found = true;
> + common.append(ch);
> + copy.setCharAt(j, '*');
> + }
> + }
> + }
> + return common.toString();
> + }
> +
> +}
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
> ----------------------------------------------------------------------
> diff --git
> a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
> b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
> new file mode 100644
> index 0000000..1793f1e
> --- /dev/null
> +++
> b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
> @@ -0,0 +1,258 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.commons.text.similarity;
> +
> +import java.util.Arrays;
> +
> +/**
> + * <p>
> + * A string metric for measuring the difference between two sequences.
> + * </p>
> + *
> + * <p>
> + * This code has been adapted from Apache Commons Lang 3.3.
> + * </p>
> + *
> + * @since 1.0
> + */
> +public class LevenshteinDistance implements StringMetric<Integer> {
> +
> + /**
> + * <p>
> + * Find the Levenshtein distance between two Strings.
> + * </p>
> + *
> + * <p>
> + * This is the number of changes needed to change one String into
> another,
> + * where each change is a single character modification (deletion,
> insertion
> + * or substitution).
> + * </p>
> + *
> + * <p>
> + * The previous implementation of the Levenshtein distance algorithm
> was
> + * from <a
> + * href="http://www.merriampark.com/ld.htm">
> http://www.merriampark.com
> + * /ld.htm</a>
> + * </p>
> + *
> + * <p>
> + * Chas Emerick has written an implementation in Java, which avoids an
> + * OutOfMemoryError which can occur when my Java implementation is
> used with
> + * very large strings.<br>
> + * This implementation of the Levenshtein distance algorithm is from
> <a
> + * href="http://www.merriampark.com/ldjava.htm">
> http://www.merriampark.com/
> + * ldjava.htm</a>
> + * </p>
> + *
> + * <pre>
> + * StringUtils.getLevenshteinDistance(null, *) =
> IllegalArgumentException
> + * StringUtils.getLevenshteinDistance(*, null) =
> IllegalArgumentException
> + * StringUtils.getLevenshteinDistance("","") = 0
> + * StringUtils.getLevenshteinDistance("","a") = 1
> + * StringUtils.getLevenshteinDistance("aaapppp", "") = 7
> + * StringUtils.getLevenshteinDistance("frog", "fog") = 1
> + * StringUtils.getLevenshteinDistance("fly", "ant") = 3
> + * StringUtils.getLevenshteinDistance("elephant", "hippo") = 7
> + * StringUtils.getLevenshteinDistance("hippo", "elephant") = 7
> + * StringUtils.getLevenshteinDistance("hippo", "zzzzzzzz") = 8
> + * StringUtils.getLevenshteinDistance("hello", "hallo") = 1
> + * </pre>
> + *
> + * @param left the first string, must not be null
> + * @param right the second string, must not be null
> + * @return result distance
> + * @throws IllegalArgumentException if either String input {@code
> null}
> + */
> + @Override
> + public Integer compare(CharSequence left, CharSequence right) {
> + return compare(left, right, Integer.MAX_VALUE);
> + }
> +
> + /**
> + * <p>
> + * Find the Levenshtein distance between two Strings if it's less
> than or
> + * equal to a given threshold.
> + * </p>
> + *
> + * <p>
> + * This is the number of changes needed to change one String into
> another,
> + * where each change is a single character modification (deletion,
> insertion
> + * or substitution).
> + * </p>
> + *
> + * <p>
> + * This implementation follows from Algorithms on Strings, Trees and
> + * Sequences by Dan Gusfield and Chas Emerick's implementation of the
> + * Levenshtein distance algorithm from <a
> + * href="http://www.merriampark.com/ld.htm"
> + * >http://www.merriampark.com/ld.htm</a>
> + * </p>
> + *
> + * <pre>
> + * StringUtils.getLevenshteinDistance(null, *, *) =
> IllegalArgumentException
> + * StringUtils.getLevenshteinDistance(*, null, *) =
> IllegalArgumentException
> + * StringUtils.getLevenshteinDistance(*, *, -1) =
> IllegalArgumentException
> + * StringUtils.getLevenshteinDistance("","", 0) = 0
> + * StringUtils.getLevenshteinDistance("aaapppp", "", 8) = 7
> + * StringUtils.getLevenshteinDistance("aaapppp", "", 7) = 7
> + * StringUtils.getLevenshteinDistance("aaapppp", "", 6)) = -1
> + * StringUtils.getLevenshteinDistance("elephant", "hippo", 7) = 7
> + * StringUtils.getLevenshteinDistance("elephant", "hippo", 6) = -1
> + * StringUtils.getLevenshteinDistance("hippo", "elephant", 7) = 7
> + * StringUtils.getLevenshteinDistance("hippo", "elephant", 6) = -1
> + * </pre>
> + *
> + * @param left the first string, must not be null
> + * @param right the second string, must not be null
> + * @param threshold the target threshold, must not be negative
> + * @return result distance
> + * @throws IllegalArgumentException if either String input {@code
> null} or
> + * negative threshold
> + */
> + public Integer compare(CharSequence left, CharSequence right, int
> threshold) {
> + if (left == null || right == null) {
> + throw new IllegalArgumentException("Strings must not be
> null");
> + }
> + if (threshold < 0) {
> + throw new IllegalArgumentException("Threshold must not be
> negative");
> + }
> +
> + /*
> + * This implementation only computes the distance if it's less
> than or
> + * equal to the threshold value, returning -1 if it's greater. The
> + * advantage is performance: unbounded distance is O(nm), but a
> bound of
> + * k allows us to reduce it to O(km) time by only computing a
> diagonal
> + * stripe of width 2k + 1 of the cost table. It is also possible
> to use
> + * this to compute the unbounded Levenshtein distance by starting
> the
> + * threshold at 1 and doubling each time until the distance is
> found;
> + * this is O(dm), where d is the distance.
> + *
> + * One subtlety comes from needing to ignore entries on the
> border of
> + * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore
> the entry
> + * to the left of the leftmost member We must ignore the entry
> above the
> + * rightmost member
> + *
> + * Another subtlety comes from our stripe running off the matrix
> if the
> + * strings aren't of the same size. Since string s is always
> swapped to
> + * be the shorter of the two, the stripe will always run off to
> the
> + * upper right instead of the lower left of the matrix.
> + *
> + * As a concrete example, suppose s is of length 5, t is of
> length 7,
> + * and our threshold is 1. In this case we're going to walk a
> stripe of
> + * length 3. The matrix would look like so:
> + *
> + * 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | |
> |#|#|#| 5 |
> + * | | |#|#| 6 | | | | |#| 7 | | | | | |
> + *
> + * Note how the stripe leads off the table as there is no
> possible way
> + * to turn a string of length 5 into one of length 7 in edit
> distance of
> + * 1.
> + *
> + * Additionally, this implementation decreases memory usage by
> using two
> + * single-dimensional arrays and swapping them back and forth
> instead of
> + * allocating an entire n by m matrix. This requires a few minor
> + * changes, such as immediately returning when it's detected that
> the
> + * stripe has run off the matrix and initially filling the arrays
> with
> + * large values so that entries we don't compute are ignored.
> + *
> + * See Algorithms on Strings, Trees and Sequences by Dan Gusfield
> for
> + * some discussion.
> + */
> +
> + int n = left.length(); // length of s
> + int m = right.length(); // length of t
> +
> + // if one string is empty, the edit distance is necessarily the
> length
> + // of the other
> + if (n == 0) {
> + return m <= threshold ? m : -1;
> + } else if (m == 0) {
> + return n <= threshold ? n : -1;
> + }
> +
> + if (n > m) {
> + // swap the two strings to consume less memory
> + final CharSequence tmp = left;
> + left = right;
> + right = tmp;
> + n = m;
> + m = right.length();
> + }
> +
> + int[] p = new int[n + 1]; // 'previous' cost array, horizontally
> + int[] d = new int[n + 1]; // cost array, horizontally
> + int[] _d; // placeholder to assist in swapping p and d
> +
> + // fill in starting table values
> + final int boundary = Math.min(n, threshold) + 1;
> + for (int i = 0; i < boundary; i++) {
> + p[i] = i;
> + }
> + // these fills ensure that the value above the rightmost entry of
> our
> + // stripe will be ignored in following loop iterations
> + Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE);
> + Arrays.fill(d, Integer.MAX_VALUE);
> +
> + // iterates through t
> + for (int j = 1; j <= m; j++) {
> + final char t_j = right.charAt(j - 1); // jth character of t
> + d[0] = j;
> +
> + // compute stripe indices, constrain to array size
> + final int min = Math.max(1, j - threshold);
> + final int max = j > Integer.MAX_VALUE - threshold ? n :
> Math.min(
> + n, j + threshold);
> +
> + // the stripe may lead off of the table if s and t are of
> different
> + // sizes
> + if (min > max) {
> + return -1;
> + }
> +
> + // ignore entry left of leftmost
> + if (min > 1) {
> + d[min - 1] = Integer.MAX_VALUE;
> + }
> +
> + // iterates through [min, max] in s
> + for (int i = min; i <= max; i++) {
> + if (left.charAt(i - 1) == t_j) {
> + // diagonally left and up
> + d[i] = p[i - 1];
> + } else {
> + // 1 + minimum of cell to the left, to the top,
> diagonally
> + // left and up
> + d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i -
> 1]);
> + }
> + }
> +
> + // copy current distance counts to 'previous row' distance
> counts
> + _d = p;
> + p = d;
> + d = _d;
> + }
> +
> + // if p[n] is greater than the threshold, there's no guarantee on
> it
> + // being the correct
> + // distance
> + if (p[n] <= threshold) {
> + return p[n];
> + }
> + return -1;
> + }
> +
> +}
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/StringMetric.java
> ----------------------------------------------------------------------
> diff --git
> a/src/main/java/org/apache/commons/text/similarity/StringMetric.java
> b/src/main/java/org/apache/commons/text/similarity/StringMetric.java
> new file mode 100644
> index 0000000..6765bbd
> --- /dev/null
> +++ b/src/main/java/org/apache/commons/text/similarity/StringMetric.java
> @@ -0,0 +1,45 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.commons.text.similarity;
> +
> +/**
> + * <p>
> + * Marker interface for <a
> + * href='http://en.wikipedia.org/wiki/String_metric'>String Metrics</a>
> + * </p>
> + *
> + * <p>
> + * A string metric measures the distance between two text strings.
> + * </p>
> + *
> + * @param <R> return type of the edit distance
> + * @since 1.0
> + */
> +public interface StringMetric<R> {
> +
> + /**
> + * <p>
> + * Compares two strings.
> + * </p>
> + *
> + * @param left the first string
> + * @param right the second string
> + * @return result distance
> + */
> + R compare(CharSequence left, CharSequence right);
> +
> +}
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/package-info.java
> ----------------------------------------------------------------------
> diff --git
> a/src/main/java/org/apache/commons/text/similarity/package-info.java
> b/src/main/java/org/apache/commons/text/similarity/package-info.java
> new file mode 100644
> index 0000000..8e9d478
> --- /dev/null
> +++ b/src/main/java/org/apache/commons/text/similarity/package-info.java
> @@ -0,0 +1,22 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +/**
> + * <p>Provides algorithms for string similarity.</p>
> + *
> + * @since 1.0
> + */
> +package org.apache.commons.text.similarity;
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java
> ----------------------------------------------------------------------
> diff --git
> a/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java
> b/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java
> new file mode 100644
> index 0000000..30b1dfc
> --- /dev/null
> +++
> b/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java
> @@ -0,0 +1,75 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.commons.text.similarity;
> +
> +import static org.junit.Assert.assertEquals;
> +
> +import java.util.Locale;
> +
> +import org.junit.BeforeClass;
> +import org.junit.Test;
> +
> +/**
> + * Unit tests for {@link org.apache.commons.text.FuzzyDistance}.
> + */
> +public class TestFuzzyDistance {
> +
> + private static FuzzyDistance distance;
> +
> + @BeforeClass
> + public static void setUp() {
> + distance = new FuzzyDistance();
> + }
> +
> + @Test
> + public void testGetFuzzyDistance() throws Exception {
> + assertEquals(0, (int) distance.compare("", "", Locale.ENGLISH));
> + assertEquals(0,
> + (int) distance.compare("Workshop", "b", Locale.ENGLISH));
> + assertEquals(1,
> + (int) distance.compare("Room", "o", Locale.ENGLISH));
> + assertEquals(1,
> + (int) distance.compare("Workshop", "w", Locale.ENGLISH));
> + assertEquals(2,
> + (int) distance.compare("Workshop", "ws", Locale.ENGLISH));
> + assertEquals(4,
> + (int) distance.compare("Workshop", "wo", Locale.ENGLISH));
> + assertEquals(3, (int) distance.compare(
> + "Apache Software Foundation", "asf", Locale.ENGLISH));
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetFuzzyDistance_NullNullNull() throws Exception {
> + distance.compare(null, null, null);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetFuzzyDistance_StringNullLoclae() throws Exception {
> + distance.compare(" ", null, Locale.ENGLISH);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetFuzzyDistance_NullStringLocale() throws Exception {
> + distance.compare(null, "clear", Locale.ENGLISH);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetFuzzyDistance_StringStringNull() throws Exception {
> + distance.compare(" ", "clear", null);
> + }
> +
> +}
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java
> ----------------------------------------------------------------------
> diff --git
> a/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java
> b/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java
> new file mode 100644
> index 0000000..6477de0
> --- /dev/null
> +++
> b/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java
> @@ -0,0 +1,62 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.commons.text.similarity;
> +
> +import static org.junit.Assert.assertEquals;
> +
> +import org.junit.BeforeClass;
> +import org.junit.Test;
> +
> +/**
> + * Unit tests for {@link org.apache.commons.text.JaroWrinklerDistance}.
> + */
> +public class TestJaroWrinklerDistance {
> +
> + private static JaroWrinklerDistance distance;
> +
> + @BeforeClass
> + public static void setUp() {
> + distance = new JaroWrinklerDistance();
> + }
> +
> + @Test
> + public void testGetJaroWinklerDistance_StringString() {
> + assertEquals(0.93d, (double) distance.compare("frog", "fog"),
> 0.0d);
> + assertEquals(0.0d, (double) distance.compare("fly", "ant"), 0.0d);
> + assertEquals(0.44d, (double) distance.compare("elephant",
> "hippo"), 0.0d);
> + assertEquals(0.91d, (double) distance.compare("ABC Corporation",
> "ABC Corp"), 0.0d);
> + assertEquals(0.93d, (double) distance.compare("D N H Enterprises
> Inc", "D & H Enterprises, Inc."), 0.0d);
> + assertEquals(0.94d, (double) distance.compare("My Gym Children's
> Fitness Center", "My Gym. Childrens Fitness"), 0.0d);
> + assertEquals(0.9d, (double) distance.compare("PENNSYLVANIA",
> "PENNCISYLVNIA"), 0.0d);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetJaroWinklerDistance_NullNull() throws Exception {
> + distance.compare(null, null);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetJaroWinklerDistance_StringNull() throws Exception {
> + distance.compare(" ", null);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetJaroWinklerDistance_NullString() throws Exception {
> + distance.compare(null, "clear");
> + }
> +
> +}
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java
> ----------------------------------------------------------------------
> diff --git
> a/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java
> b/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java
> new file mode 100644
> index 0000000..7790b05
> --- /dev/null
> +++
> b/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java
> @@ -0,0 +1,146 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.commons.text.similarity;
> +
> +import static org.junit.Assert.assertEquals;
> +
> +import org.junit.BeforeClass;
> +import org.junit.Test;
> +
> +/**
> + * Unit tests for {@link org.apache.commons.text.LevenshteinDistance}.
> + */
> +public class TestLevenshteinDistance {
> +
> + private static LevenshteinDistance distance;
> +
> + @BeforeClass
> + public static void setUp() {
> + distance = new LevenshteinDistance();
> + }
> +
> + @Test
> + public void testGetLevenshteinDistance_StringString() {
> + assertEquals(0, (int) distance.compare("", ""));
> + assertEquals(1, (int) distance.compare("", "a"));
> + assertEquals(7, (int) distance.compare("aaapppp", ""));
> + assertEquals(1, (int) distance.compare("frog", "fog"));
> + assertEquals(3, (int) distance.compare("fly", "ant"));
> + assertEquals(7, (int) distance.compare("elephant", "hippo"));
> + assertEquals(7, (int) distance.compare("hippo", "elephant"));
> + assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz"));
> + assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo"));
> + assertEquals(1, (int) distance.compare("hello", "hallo"));
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetLevenshteinDistance_NullString() throws Exception {
> + distance.compare("a", null);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetLevenshteinDistance_StringNull() throws Exception {
> + distance.compare(null, "a");
> + }
> +
> + @Test
> + public void testGetLevenshteinDistance_StringStringInt() {
> + // empty strings
> + assertEquals(0, (int) distance.compare("", "", 0));
> + assertEquals(7, (int) distance.compare("aaapppp", "", 8));
> + assertEquals(7, (int) distance.compare("aaapppp", "", 7));
> + assertEquals(-1, (int) distance.compare("aaapppp", "", 6));
> +
> + // unequal strings, zero threshold
> + assertEquals(-1, (int) distance.compare("b", "a", 0));
> + assertEquals(-1, (int) distance.compare("a", "b", 0));
> +
> + // equal strings
> + assertEquals(0, (int) distance.compare("aa", "aa", 0));
> + assertEquals(0, (int) distance.compare("aa", "aa", 2));
> +
> + // same length
> + assertEquals(-1, (int) distance.compare("aaa", "bbb", 2));
> + assertEquals(3, (int) distance.compare("aaa", "bbb", 3));
> +
> + // big stripe
> + assertEquals(6, (int) distance.compare("aaaaaa", "b", 10));
> +
> + // distance less than threshold
> + assertEquals(7, (int) distance.compare("aaapppp", "b", 8));
> + assertEquals(3, (int) distance.compare("a", "bbb", 4));
> +
> + // distance equal to threshold
> + assertEquals(7, (int) distance.compare("aaapppp", "b", 7));
> + assertEquals(3, (int) distance.compare("a", "bbb", 3));
> +
> + // distance greater than threshold
> + assertEquals(-1, (int) distance.compare("a", "bbb", 2));
> + assertEquals(-1, (int) distance.compare("bbb", "a", 2));
> + assertEquals(-1, (int) distance.compare("aaapppp", "b", 6));
> +
> + // stripe runs off array, strings not similar
> + assertEquals(-1, (int) distance.compare("a", "bbb", 1));
> + assertEquals(-1, (int) distance.compare("bbb", "a", 1));
> +
> + // stripe runs off array, strings are similar
> + assertEquals(-1, (int) distance.compare("12345", "1234567", 1));
> + assertEquals(-1, (int) distance.compare("1234567", "12345", 1));
> +
> + // old getLevenshteinDistance test cases
> + assertEquals(1, (int) distance.compare("frog", "fog", 1));
> + assertEquals(3, (int) distance.compare("fly", "ant", 3));
> + assertEquals(7, (int) distance.compare("elephant", "hippo", 7));
> + assertEquals(-1, (int) distance.compare("elephant", "hippo", 6));
> + assertEquals(7, (int) distance.compare("hippo", "elephant", 7));
> + assertEquals(-1, (int) distance.compare("hippo", "elephant", 6));
> + assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz", 8));
> + assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo", 8));
> + assertEquals(1, (int) distance.compare("hello", "hallo", 1));
> +
> + assertEquals(1,
> + (int) distance.compare("frog", "fog", Integer.MAX_VALUE));
> + assertEquals(3, (int) distance.compare("fly", "ant",
> Integer.MAX_VALUE));
> + assertEquals(7,
> + (int) distance.compare("elephant", "hippo",
> Integer.MAX_VALUE));
> + assertEquals(7,
> + (int) distance.compare("hippo", "elephant",
> Integer.MAX_VALUE));
> + assertEquals(8,
> + (int) distance.compare("hippo", "zzzzzzzz",
> Integer.MAX_VALUE));
> + assertEquals(8,
> + (int) distance.compare("zzzzzzzz", "hippo",
> Integer.MAX_VALUE));
> + assertEquals(1,
> + (int) distance.compare("hello", "hallo",
> Integer.MAX_VALUE));
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetLevenshteinDistance_NullStringInt() throws
> Exception {
> + distance.compare(null, "a", 0);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetLevenshteinDistance_StringNullInt() throws
> Exception {
> + distance.compare("a", null, 0);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetLevenshteinDistance_StringStringNegativeInt()
> + throws Exception {
> + distance.compare("a", "a", -1);
> + }
> +
> +}
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
> For additional commands, e-mail: dev-help@commons.apache.org
>
>
--
http://people.apache.org/~britter/
http://www.systemoutprintln.de/
http://twitter.com/BenediktRitter
http://github.com/britter
Re: commons-text git commit: SANDBOX-483 Incorporate String
algorithms from Commons Lang
Posted by Benedikt Ritter <br...@apache.org>.
Hello Bruno,
when building commons-text I get:
testGetJaroWinklerDistance_StringString(org.apache.commons.text.similarity.TestJaroWrinklerDistance)
Time elapsed: 0.015 sec <<< FAILURE!
java.lang.AssertionError: expected:<0.93> but was:<0.02>
at org.junit.Assert.fail(Assert.java:88)
at org.junit.Assert.failNotEquals(Assert.java:743)
at org.junit.Assert.assertEquals(Assert.java:494)
at org.junit.Assert.assertEquals(Assert.java:592)
at
org.apache.commons.text.similarity.TestJaroWrinklerDistance.testGetJaroWinklerDistance_StringString(TestJaroWrinklerDistance.java:38)
Running org.apache.commons.text.similarity.TestLevenshteinDistance
Tests run: 7, Failures: 0, Errors: 0, Skipped: 0, Time elapsed: 0.006 sec -
in org.apache.commons.text.similarity.TestLevenshteinDistance
Results :
Failed tests:
TestJaroWrinklerDistance.testGetJaroWinklerDistance_StringString:38
expected:<0.93> but was:<0.02>
Tests run: 16, Failures: 1, Errors: 0, Skipped: 0
can you check?
thanks!
Benedikt
2014-11-20 5:54 GMT+01:00 <ki...@apache.org>:
> Repository: commons-text
> Updated Branches:
> refs/heads/master 71b59af90 -> c0e8c0210
>
>
> SANDBOX-483 Incorporate String algorithms from Commons Lang
>
>
> Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
> Commit:
> http://git-wip-us.apache.org/repos/asf/commons-text/commit/c0e8c021
> Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/c0e8c021
> Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/c0e8c021
>
> Branch: refs/heads/master
> Commit: c0e8c021059d8300f90cf2123528a35d7c468b57
> Parents: 71b59af
> Author: Bruno P. Kinoshita <br...@yahoo.com.br>
> Authored: Thu Nov 20 02:23:05 2014 -0200
> Committer: Bruno P. Kinoshita <br...@yahoo.com.br>
> Committed: Thu Nov 20 02:23:05 2014 -0200
>
> ----------------------------------------------------------------------
> .../commons/text/similarity/FuzzyDistance.java | 130 +++++++
> .../text/similarity/JaroWrinklerDistance.java | 373 +++++++++++++++++++
> .../text/similarity/LevenshteinDistance.java | 258 +++++++++++++
> .../commons/text/similarity/StringMetric.java | 45 +++
> .../commons/text/similarity/package-info.java | 22 ++
> .../text/similarity/TestFuzzyDistance.java | 75 ++++
> .../similarity/TestJaroWrinklerDistance.java | 62 +++
> .../similarity/TestLevenshteinDistance.java | 146 ++++++++
> 8 files changed, 1111 insertions(+)
> ----------------------------------------------------------------------
>
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java
> ----------------------------------------------------------------------
> diff --git
> a/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java
> b/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java
> new file mode 100644
> index 0000000..8e9228a
> --- /dev/null
> +++ b/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java
> @@ -0,0 +1,130 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.commons.text.similarity;
> +
> +import java.util.Locale;
> +
> +/**
> + * <p>
> + * This string matching algorithm is similar to the algorithms of editors
> such
> + * as Sublime Text, TextMate, Atom and others. One point is given for
> every
> + * matched character. Subsequent matches yield two bonus points. A higher
> score
> + * indicates a higher similarity.
> + * </p>
> + *
> + * @since 1.0
> + */
> +public class FuzzyDistance implements StringMetric<Integer> {
> +
> + /**
> + * <p>
> + * Find the Fuzzy Distance which indicates the similarity score
> between two
> + * Strings. This method uses the default locale.
> + * </p>
> + *
> + * @param term a full term that should be matched against, must not
> be null
> + * @param query the query that will be matched against a term, must
> not be
> + * null
> + * @return result score
> + * @throws IllegalArgumentException if either String input {@code
> null}
> + */
> + @Override
> + public Integer compare(CharSequence term, CharSequence query) {
> + return compare(term, query, Locale.getDefault());
> + }
> +
> + /**
> + * <p>
> + * Find the Fuzzy Distance which indicates the similarity score
> between two
> + * Strings.
> + * </p>
> + *
> + * <pre>
> + * StringUtils.getFuzzyDistance(null, null, null)
> = IllegalArgumentException
> + * StringUtils.getFuzzyDistance("", "", Locale.ENGLISH)
> = 0
> + * StringUtils.getFuzzyDistance("Workshop", "b", Locale.ENGLISH)
> = 0
> + * StringUtils.getFuzzyDistance("Room", "o", Locale.ENGLISH)
> = 1
> + * StringUtils.getFuzzyDistance("Workshop", "w", Locale.ENGLISH)
> = 1
> + * StringUtils.getFuzzyDistance("Workshop", "ws", Locale.ENGLISH)
> = 2
> + * StringUtils.getFuzzyDistance("Workshop", "wo", Locale.ENGLISH)
> = 4
> + * StringUtils.getFuzzyDistance("Apache Software Foundation", "asf",
> Locale.ENGLISH) = 3
> + * </pre>
> + *
> + * @param term a full term that should be matched against, must not
> be null
> + * @param query the query that will be matched against a term, must
> not be
> + * null
> + * @param locale This string matching logic is case insensitive. A
> locale is
> + * necessary to normalize both Strings to lower case.
> + * @return result score
> + * @throws IllegalArgumentException if either String input {@code
> null} or
> + * Locale input {@code null}
> + */
> + public Integer compare(CharSequence term, CharSequence query, Locale
> locale) {
> + if (term == null || query == null) {
> + throw new IllegalArgumentException("Strings must not be
> null");
> + } else if (locale == null) {
> + throw new IllegalArgumentException("Locale must not be null");
> + }
> +
> + // fuzzy logic is case insensitive. We normalize the Strings to
> lower
> + // case right from the start. Turning characters to lower case
> + // via Character.toLowerCase(char) is unfortunately insufficient
> + // as it does not accept a locale.
> + final String termLowerCase = term.toString().toLowerCase(locale);
> + final String queryLowerCase =
> query.toString().toLowerCase(locale);
> +
> + // the resulting score
> + int score = 0;
> +
> + // the position in the term which will be scanned next for
> potential
> + // query character matches
> + int termIndex = 0;
> +
> + // index of the previously matched character in the term
> + int previousMatchingCharacterIndex = Integer.MIN_VALUE;
> +
> + for (int queryIndex = 0; queryIndex < queryLowerCase.length();
> queryIndex++) {
> + final char queryChar = queryLowerCase.charAt(queryIndex);
> +
> + boolean termCharacterMatchFound = false;
> + for (; termIndex < termLowerCase.length()
> + && !termCharacterMatchFound; termIndex++) {
> + final char termChar = termLowerCase.charAt(termIndex);
> +
> + if (queryChar == termChar) {
> + // simple character matches result in one point
> + score++;
> +
> + // subsequent character matches further improve
> + // the score.
> + if (previousMatchingCharacterIndex + 1 == termIndex) {
> + score += 2;
> + }
> +
> + previousMatchingCharacterIndex = termIndex;
> +
> + // we can leave the nested loop. Every character in
> the
> + // query can match at most one character in the term.
> + termCharacterMatchFound = true;
> + }
> + }
> + }
> +
> + return score;
> + }
> +
> +}
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
> ----------------------------------------------------------------------
> diff --git
> a/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
> b/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
> new file mode 100644
> index 0000000..2dcd51c
> --- /dev/null
> +++
> b/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
> @@ -0,0 +1,373 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.commons.text.similarity;
> +
> +/**
> + * <p>
> + * The Jaro measure is the weighted sum of percentage of matched
> characters
> + * from each file and transposed characters. Winkler increased this
> measure
> + * for matching initial characters.
> + * </p>
> + *
> + * <p>
> + * This implementation is based on the Jaro Winkler similarity algorithm
> + * from <a href="
> http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
> + * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
> + * </p>
> + *
> + * <p>
> + * This code has been adapted from Apache Commons Lang 3.3.
> + * </p>
> + *
> + * @since 1.0
> + */
> +public class JaroWrinklerDistance implements StringMetric<Double> {
> +
> + /**
> + * Represents a failed index search.
> + */
> + public static final int INDEX_NOT_FOUND = -1;
> +
> + /**
> + * <p>
> + * Find the Jaro Winkler Distance which indicates the similarity score
> + * between two Strings.
> + * </p>
> + *
> + * <pre>
> + * StringUtils.getJaroWinklerDistance(null, null) =
> IllegalArgumentException
> + * StringUtils.getJaroWinklerDistance("","") = 0.0
> + * StringUtils.getJaroWinklerDistance("","a") = 0.0
> + * StringUtils.getJaroWinklerDistance("aaapppp", "") = 0.0
> + * StringUtils.getJaroWinklerDistance("frog", "fog") = 0.93
> + * StringUtils.getJaroWinklerDistance("fly", "ant") = 0.0
> + * StringUtils.getJaroWinklerDistance("elephant", "hippo") = 0.44
> + * StringUtils.getJaroWinklerDistance("hippo", "elephant") = 0.44
> + * StringUtils.getJaroWinklerDistance("hippo", "zzzzzzzz") = 0.0
> + * StringUtils.getJaroWinklerDistance("hello", "hallo") = 0.88
> + * StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp")
> = 0.91
> + * StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D
> & H Enterprises, Inc.") = 0.93
> + * StringUtils.getJaroWinklerDistance("My Gym Children's Fitness
> Center", "My Gym. Childrens Fitness") = 0.94
> + * StringUtils.getJaroWinklerDistance("PENNSYLVANIA",
> "PENNCISYLVNIA") = 0.9
> + * </pre>
> + *
> + * @param left the first String, must not be null
> + * @param right the second String, must not be null
> + * @return result distance
> + * @throws IllegalArgumentException if either String input {@code
> null}
> + */
> + @Override
> + public Double compare(CharSequence left, CharSequence right) {
> + final double DEFAULT_SCALING_FACTOR = 0.1;
> +
> + if (left == null || right == null) {
> + throw new IllegalArgumentException("Strings must not be
> null");
> + }
> +
> + final double jaro = score(left, right);
> + final int cl = commonPrefixLength(left, right);
> + final double matchScore = Math.round(jaro +
> (DEFAULT_SCALING_FACTOR
> + * cl * (1.0 - jaro)) * 100.0) / 100.0;
> +
> + return matchScore;
> + }
> +
> + // TODO: we can move these methods to a Util class, keep them here,
> + // create a common abstract class, shade lang-3.3...
> +
> + /**
> + * Calculates the number of characters from the beginning of the
> strings
> + * that match exactly one-to-one, up to a maximum of four (4)
> characters.
> + *
> + * @param first The first string.
> + * @param second The second string.
> + * @return A number between 0 and 4.
> + */
> + private static int commonPrefixLength(final CharSequence first,
> + final CharSequence second) {
> + final int result = getCommonPrefix(first.toString(),
> second.toString())
> + .length();
> +
> + // Limit the result to 4.
> + return result > 4 ? 4 : result;
> + }
> +
> + /**
> + * <p>
> + * Compares all Strings in an array and returns the initial sequence
> of
> + * characters that is common to all of them.
> + * </p>
> + *
> + * <p>
> + * For example,
> + * <code>getCommonPrefix(new String[] {"i am a machine", "i am a
> robot"}) -> "i am a "</code>
> + * </p>
> + *
> + * <pre>
> + * StringUtils.getCommonPrefix(null) = ""
> + * StringUtils.getCommonPrefix(new String[] {}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"abc"}) = "abc"
> + * StringUtils.getCommonPrefix(new String[] {null, null}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"", ""}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"", null}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"abc", null, null}) = ""
> + * StringUtils.getCommonPrefix(new String[] {null, null, "abc"}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"", "abc"}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"abc", ""}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"abc", "abc"}) = "abc"
> + * StringUtils.getCommonPrefix(new String[] {"abc", "a"}) = "a"
> + * StringUtils.getCommonPrefix(new String[] {"ab", "abxyz"}) = "ab"
> + * StringUtils.getCommonPrefix(new String[] {"abcde", "abxyz"}) = "ab"
> + * StringUtils.getCommonPrefix(new String[] {"abcde", "xyz"}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"xyz", "abcde"}) = ""
> + * StringUtils.getCommonPrefix(new String[] {"i am a machine", "i am
> a robot"}) = "i am a "
> + * </pre>
> + *
> + * @param strs array of String objects, entries may be null
> + * @return the initial sequence of characters that are common to all
> Strings
> + * in the array; empty String if the array is null, the
> elements are
> + * all null or if there is no common prefix.
> + * @since 2.4
> + */
> + public static String getCommonPrefix(final String... strs) {
> + if (strs == null || strs.length == 0) {
> + return "";
> + }
> + final int smallestIndexOfDiff = indexOfDifference(strs);
> + if (smallestIndexOfDiff == INDEX_NOT_FOUND) {
> + // all strings were identical
> + if (strs[0] == null) {
> + return "";
> + }
> + return strs[0];
> + } else if (smallestIndexOfDiff == 0) {
> + // there were no common initial characters
> + return "";
> + } else {
> + // we found a common initial character sequence
> + return strs[0].substring(0, smallestIndexOfDiff);
> + }
> + }
> +
> + /**
> + * This method returns the Jaro-Winkler score for string matching.
> + *
> + * @param first the first string to be matched
> + * @param second the second string to be machted
> + * @return matching score without scaling factor impact
> + */
> + protected static double score(final CharSequence first,
> + final CharSequence second) {
> + String shorter;
> + String longer;
> +
> + // Determine which String is longer.
> + if (first.length() > second.length()) {
> + longer = first.toString().toLowerCase();
> + shorter = second.toString().toLowerCase();
> + } else {
> + longer = second.toString().toLowerCase();
> + shorter = first.toString().toLowerCase();
> + }
> +
> + // Calculate the half length() distance of the shorter String.
> + final int halflength = shorter.length() / 2 + 1;
> +
> + // Find the set of matching characters between the shorter and
> longer
> + // strings. Note that
> + // the set of matching characters may be different depending on
> the
> + // order of the strings.
> + final String m1 = getSetOfMatchingCharacterWithin(shorter, longer,
> + halflength);
> + final String m2 = getSetOfMatchingCharacterWithin(longer, shorter,
> + halflength);
> +
> + // If one or both of the sets of common characters is empty, then
> + // there is no similarity between the two strings.
> + if (m1.length() == 0 || m2.length() == 0) {
> + return 0.0;
> + }
> +
> + // If the set of common characters is not the same size, then
> + // there is no similarity between the two strings, either.
> + if (m1.length() != m2.length()) {
> + return 0.0;
> + }
> +
> + // Calculate the number of transposition between the two sets
> + // of common characters.
> + final int transpositions = transpositions(m1, m2);
> +
> + // Calculate the distance.
> + final double dist = (m1.length() / ((double) shorter.length())
> + + m2.length() / ((double) longer.length()) + (m1.length()
> - transpositions)
> + / ((double) m1.length())) / 3.0;
> + return dist;
> + }
> +
> + /**
> + * Calculates the number of transposition between two strings.
> + *
> + * @param first The first string.
> + * @param second The second string.
> + * @return The number of transposition between the two strings.
> + */
> + protected static int transpositions(final CharSequence first,
> + final CharSequence second) {
> + int transpositions = 0;
> + for (int i = 0; i < first.length(); i++) {
> + if (first.charAt(i) != second.charAt(i)) {
> + transpositions++;
> + }
> + }
> + return transpositions / 2;
> + }
> +
> + /**
> + * <p>
> + * Compares all CharSequences in an array and returns the index at
> which the
> + * CharSequences begin to differ.
> + * </p>
> + *
> + * <p>
> + * For example,
> + * <code>indexOfDifference(new String[] {"i am a machine", "i am a
> robot"}) -> 7</code>
> + * </p>
> + *
> + * <pre>
> + * StringUtils.indexOfDifference(null) = -1
> + * StringUtils.indexOfDifference(new String[] {}) = -1
> + * StringUtils.indexOfDifference(new String[] {"abc"}) = -1
> + * StringUtils.indexOfDifference(new String[] {null, null}) = -1
> + * StringUtils.indexOfDifference(new String[] {"", ""}) = -1
> + * StringUtils.indexOfDifference(new String[] {"", null}) = 0
> + * StringUtils.indexOfDifference(new String[] {"abc", null, null}) = 0
> + * StringUtils.indexOfDifference(new String[] {null, null, "abc"}) = 0
> + * StringUtils.indexOfDifference(new String[] {"", "abc"}) = 0
> + * StringUtils.indexOfDifference(new String[] {"abc", ""}) = 0
> + * StringUtils.indexOfDifference(new String[] {"abc", "abc"}) = -1
> + * StringUtils.indexOfDifference(new String[] {"abc", "a"}) = 1
> + * StringUtils.indexOfDifference(new String[] {"ab", "abxyz"}) = 2
> + * StringUtils.indexOfDifference(new String[] {"abcde", "abxyz"}) = 2
> + * StringUtils.indexOfDifference(new String[] {"abcde", "xyz"}) = 0
> + * StringUtils.indexOfDifference(new String[] {"xyz", "abcde"}) = 0
> + * StringUtils.indexOfDifference(new String[] {"i am a machine", "i
> am a robot"}) = 7
> + * </pre>
> + *
> + * @param css array of CharSequences, entries may be null
> + * @return the index where the strings begin to differ; -1 if they
> are all
> + * equal
> + * @since 2.4
> + * @since 3.0 Changed signature from indexOfDifference(String...) to
> + * indexOfDifference(CharSequence...)
> + */
> + protected static int indexOfDifference(final CharSequence... css) {
> + if (css == null || css.length <= 1) {
> + return INDEX_NOT_FOUND;
> + }
> + boolean anyStringNull = false;
> + boolean allStringsNull = true;
> + final int arrayLen = css.length;
> + int shortestStrLen = Integer.MAX_VALUE;
> + int longestStrLen = 0;
> +
> + // find the min and max string lengths; this avoids checking to
> make
> + // sure we are not exceeding the length of the string each time
> through
> + // the bottom loop.
> + for (int i = 0; i < arrayLen; i++) {
> + if (css[i] == null) {
> + anyStringNull = true;
> + shortestStrLen = 0;
> + } else {
> + allStringsNull = false;
> + shortestStrLen = Math.min(css[i].length(),
> shortestStrLen);
> + longestStrLen = Math.max(css[i].length(), longestStrLen);
> + }
> + }
> +
> + // handle lists containing all nulls or all empty strings
> + if (allStringsNull || longestStrLen == 0 && !anyStringNull) {
> + return INDEX_NOT_FOUND;
> + }
> +
> + // handle lists containing some nulls or some empty strings
> + if (shortestStrLen == 0) {
> + return 0;
> + }
> +
> + // find the position with the first difference across all strings
> + int firstDiff = -1;
> + for (int stringPos = 0; stringPos < shortestStrLen; stringPos++) {
> + final char comparisonChar = css[0].charAt(stringPos);
> + for (int arrayPos = 1; arrayPos < arrayLen; arrayPos++) {
> + if (css[arrayPos].charAt(stringPos) != comparisonChar) {
> + firstDiff = stringPos;
> + break;
> + }
> + }
> + if (firstDiff != -1) {
> + break;
> + }
> + }
> +
> + if (firstDiff == -1 && shortestStrLen != longestStrLen) {
> + // we compared all of the characters up to the length of the
> + // shortest string and didn't find a match, but the string
> lengths
> + // vary, so return the length of the shortest string.
> + return shortestStrLen;
> + }
> + return firstDiff;
> + }
> +
> + /**
> + * Gets a set of matching characters between two strings.
> + *
> + * <p>
> + * Two characters from the first string and the second string are
> + * considered matching if the character's respective positions are no
> + * farther than the limit value.
> + * </p>
> + *
> + * @param first The first string.
> + * @param second The second string.
> + * @param limit The maximum distance to consider.
> + * @return A string contain the set of common characters.
> + */
> + protected static String getSetOfMatchingCharacterWithin(
> + final CharSequence first, final CharSequence second, final
> int limit) {
> + final StringBuilder common = new StringBuilder();
> + final StringBuilder copy = new StringBuilder(second);
> +
> + for (int i = 0; i < first.length(); i++) {
> + final char ch = first.charAt(i);
> + boolean found = false;
> +
> + // See if the character is within the limit positions away
> from the
> + // original position of that character.
> + for (int j = Math.max(0, i - limit); !found
> + && j < Math.min(i + limit, second.length()); j++) {
> + if (copy.charAt(j) == ch) {
> + found = true;
> + common.append(ch);
> + copy.setCharAt(j, '*');
> + }
> + }
> + }
> + return common.toString();
> + }
> +
> +}
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
> ----------------------------------------------------------------------
> diff --git
> a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
> b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
> new file mode 100644
> index 0000000..1793f1e
> --- /dev/null
> +++
> b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
> @@ -0,0 +1,258 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.commons.text.similarity;
> +
> +import java.util.Arrays;
> +
> +/**
> + * <p>
> + * A string metric for measuring the difference between two sequences.
> + * </p>
> + *
> + * <p>
> + * This code has been adapted from Apache Commons Lang 3.3.
> + * </p>
> + *
> + * @since 1.0
> + */
> +public class LevenshteinDistance implements StringMetric<Integer> {
> +
> + /**
> + * <p>
> + * Find the Levenshtein distance between two Strings.
> + * </p>
> + *
> + * <p>
> + * This is the number of changes needed to change one String into
> another,
> + * where each change is a single character modification (deletion,
> insertion
> + * or substitution).
> + * </p>
> + *
> + * <p>
> + * The previous implementation of the Levenshtein distance algorithm
> was
> + * from <a
> + * href="http://www.merriampark.com/ld.htm">
> http://www.merriampark.com
> + * /ld.htm</a>
> + * </p>
> + *
> + * <p>
> + * Chas Emerick has written an implementation in Java, which avoids an
> + * OutOfMemoryError which can occur when my Java implementation is
> used with
> + * very large strings.<br>
> + * This implementation of the Levenshtein distance algorithm is from
> <a
> + * href="http://www.merriampark.com/ldjava.htm">
> http://www.merriampark.com/
> + * ldjava.htm</a>
> + * </p>
> + *
> + * <pre>
> + * StringUtils.getLevenshteinDistance(null, *) =
> IllegalArgumentException
> + * StringUtils.getLevenshteinDistance(*, null) =
> IllegalArgumentException
> + * StringUtils.getLevenshteinDistance("","") = 0
> + * StringUtils.getLevenshteinDistance("","a") = 1
> + * StringUtils.getLevenshteinDistance("aaapppp", "") = 7
> + * StringUtils.getLevenshteinDistance("frog", "fog") = 1
> + * StringUtils.getLevenshteinDistance("fly", "ant") = 3
> + * StringUtils.getLevenshteinDistance("elephant", "hippo") = 7
> + * StringUtils.getLevenshteinDistance("hippo", "elephant") = 7
> + * StringUtils.getLevenshteinDistance("hippo", "zzzzzzzz") = 8
> + * StringUtils.getLevenshteinDistance("hello", "hallo") = 1
> + * </pre>
> + *
> + * @param left the first string, must not be null
> + * @param right the second string, must not be null
> + * @return result distance
> + * @throws IllegalArgumentException if either String input {@code
> null}
> + */
> + @Override
> + public Integer compare(CharSequence left, CharSequence right) {
> + return compare(left, right, Integer.MAX_VALUE);
> + }
> +
> + /**
> + * <p>
> + * Find the Levenshtein distance between two Strings if it's less
> than or
> + * equal to a given threshold.
> + * </p>
> + *
> + * <p>
> + * This is the number of changes needed to change one String into
> another,
> + * where each change is a single character modification (deletion,
> insertion
> + * or substitution).
> + * </p>
> + *
> + * <p>
> + * This implementation follows from Algorithms on Strings, Trees and
> + * Sequences by Dan Gusfield and Chas Emerick's implementation of the
> + * Levenshtein distance algorithm from <a
> + * href="http://www.merriampark.com/ld.htm"
> + * >http://www.merriampark.com/ld.htm</a>
> + * </p>
> + *
> + * <pre>
> + * StringUtils.getLevenshteinDistance(null, *, *) =
> IllegalArgumentException
> + * StringUtils.getLevenshteinDistance(*, null, *) =
> IllegalArgumentException
> + * StringUtils.getLevenshteinDistance(*, *, -1) =
> IllegalArgumentException
> + * StringUtils.getLevenshteinDistance("","", 0) = 0
> + * StringUtils.getLevenshteinDistance("aaapppp", "", 8) = 7
> + * StringUtils.getLevenshteinDistance("aaapppp", "", 7) = 7
> + * StringUtils.getLevenshteinDistance("aaapppp", "", 6)) = -1
> + * StringUtils.getLevenshteinDistance("elephant", "hippo", 7) = 7
> + * StringUtils.getLevenshteinDistance("elephant", "hippo", 6) = -1
> + * StringUtils.getLevenshteinDistance("hippo", "elephant", 7) = 7
> + * StringUtils.getLevenshteinDistance("hippo", "elephant", 6) = -1
> + * </pre>
> + *
> + * @param left the first string, must not be null
> + * @param right the second string, must not be null
> + * @param threshold the target threshold, must not be negative
> + * @return result distance
> + * @throws IllegalArgumentException if either String input {@code
> null} or
> + * negative threshold
> + */
> + public Integer compare(CharSequence left, CharSequence right, int
> threshold) {
> + if (left == null || right == null) {
> + throw new IllegalArgumentException("Strings must not be
> null");
> + }
> + if (threshold < 0) {
> + throw new IllegalArgumentException("Threshold must not be
> negative");
> + }
> +
> + /*
> + * This implementation only computes the distance if it's less
> than or
> + * equal to the threshold value, returning -1 if it's greater. The
> + * advantage is performance: unbounded distance is O(nm), but a
> bound of
> + * k allows us to reduce it to O(km) time by only computing a
> diagonal
> + * stripe of width 2k + 1 of the cost table. It is also possible
> to use
> + * this to compute the unbounded Levenshtein distance by starting
> the
> + * threshold at 1 and doubling each time until the distance is
> found;
> + * this is O(dm), where d is the distance.
> + *
> + * One subtlety comes from needing to ignore entries on the
> border of
> + * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore
> the entry
> + * to the left of the leftmost member We must ignore the entry
> above the
> + * rightmost member
> + *
> + * Another subtlety comes from our stripe running off the matrix
> if the
> + * strings aren't of the same size. Since string s is always
> swapped to
> + * be the shorter of the two, the stripe will always run off to
> the
> + * upper right instead of the lower left of the matrix.
> + *
> + * As a concrete example, suppose s is of length 5, t is of
> length 7,
> + * and our threshold is 1. In this case we're going to walk a
> stripe of
> + * length 3. The matrix would look like so:
> + *
> + * 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | |
> |#|#|#| 5 |
> + * | | |#|#| 6 | | | | |#| 7 | | | | | |
> + *
> + * Note how the stripe leads off the table as there is no
> possible way
> + * to turn a string of length 5 into one of length 7 in edit
> distance of
> + * 1.
> + *
> + * Additionally, this implementation decreases memory usage by
> using two
> + * single-dimensional arrays and swapping them back and forth
> instead of
> + * allocating an entire n by m matrix. This requires a few minor
> + * changes, such as immediately returning when it's detected that
> the
> + * stripe has run off the matrix and initially filling the arrays
> with
> + * large values so that entries we don't compute are ignored.
> + *
> + * See Algorithms on Strings, Trees and Sequences by Dan Gusfield
> for
> + * some discussion.
> + */
> +
> + int n = left.length(); // length of s
> + int m = right.length(); // length of t
> +
> + // if one string is empty, the edit distance is necessarily the
> length
> + // of the other
> + if (n == 0) {
> + return m <= threshold ? m : -1;
> + } else if (m == 0) {
> + return n <= threshold ? n : -1;
> + }
> +
> + if (n > m) {
> + // swap the two strings to consume less memory
> + final CharSequence tmp = left;
> + left = right;
> + right = tmp;
> + n = m;
> + m = right.length();
> + }
> +
> + int[] p = new int[n + 1]; // 'previous' cost array, horizontally
> + int[] d = new int[n + 1]; // cost array, horizontally
> + int[] _d; // placeholder to assist in swapping p and d
> +
> + // fill in starting table values
> + final int boundary = Math.min(n, threshold) + 1;
> + for (int i = 0; i < boundary; i++) {
> + p[i] = i;
> + }
> + // these fills ensure that the value above the rightmost entry of
> our
> + // stripe will be ignored in following loop iterations
> + Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE);
> + Arrays.fill(d, Integer.MAX_VALUE);
> +
> + // iterates through t
> + for (int j = 1; j <= m; j++) {
> + final char t_j = right.charAt(j - 1); // jth character of t
> + d[0] = j;
> +
> + // compute stripe indices, constrain to array size
> + final int min = Math.max(1, j - threshold);
> + final int max = j > Integer.MAX_VALUE - threshold ? n :
> Math.min(
> + n, j + threshold);
> +
> + // the stripe may lead off of the table if s and t are of
> different
> + // sizes
> + if (min > max) {
> + return -1;
> + }
> +
> + // ignore entry left of leftmost
> + if (min > 1) {
> + d[min - 1] = Integer.MAX_VALUE;
> + }
> +
> + // iterates through [min, max] in s
> + for (int i = min; i <= max; i++) {
> + if (left.charAt(i - 1) == t_j) {
> + // diagonally left and up
> + d[i] = p[i - 1];
> + } else {
> + // 1 + minimum of cell to the left, to the top,
> diagonally
> + // left and up
> + d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i -
> 1]);
> + }
> + }
> +
> + // copy current distance counts to 'previous row' distance
> counts
> + _d = p;
> + p = d;
> + d = _d;
> + }
> +
> + // if p[n] is greater than the threshold, there's no guarantee on
> it
> + // being the correct
> + // distance
> + if (p[n] <= threshold) {
> + return p[n];
> + }
> + return -1;
> + }
> +
> +}
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/StringMetric.java
> ----------------------------------------------------------------------
> diff --git
> a/src/main/java/org/apache/commons/text/similarity/StringMetric.java
> b/src/main/java/org/apache/commons/text/similarity/StringMetric.java
> new file mode 100644
> index 0000000..6765bbd
> --- /dev/null
> +++ b/src/main/java/org/apache/commons/text/similarity/StringMetric.java
> @@ -0,0 +1,45 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.commons.text.similarity;
> +
> +/**
> + * <p>
> + * Marker interface for <a
> + * href='http://en.wikipedia.org/wiki/String_metric'>String Metrics</a>
> + * </p>
> + *
> + * <p>
> + * A string metric measures the distance between two text strings.
> + * </p>
> + *
> + * @param <R> return type of the edit distance
> + * @since 1.0
> + */
> +public interface StringMetric<R> {
> +
> + /**
> + * <p>
> + * Compares two strings.
> + * </p>
> + *
> + * @param left the first string
> + * @param right the second string
> + * @return result distance
> + */
> + R compare(CharSequence left, CharSequence right);
> +
> +}
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/package-info.java
> ----------------------------------------------------------------------
> diff --git
> a/src/main/java/org/apache/commons/text/similarity/package-info.java
> b/src/main/java/org/apache/commons/text/similarity/package-info.java
> new file mode 100644
> index 0000000..8e9d478
> --- /dev/null
> +++ b/src/main/java/org/apache/commons/text/similarity/package-info.java
> @@ -0,0 +1,22 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +/**
> + * <p>Provides algorithms for string similarity.</p>
> + *
> + * @since 1.0
> + */
> +package org.apache.commons.text.similarity;
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java
> ----------------------------------------------------------------------
> diff --git
> a/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java
> b/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java
> new file mode 100644
> index 0000000..30b1dfc
> --- /dev/null
> +++
> b/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java
> @@ -0,0 +1,75 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.commons.text.similarity;
> +
> +import static org.junit.Assert.assertEquals;
> +
> +import java.util.Locale;
> +
> +import org.junit.BeforeClass;
> +import org.junit.Test;
> +
> +/**
> + * Unit tests for {@link org.apache.commons.text.FuzzyDistance}.
> + */
> +public class TestFuzzyDistance {
> +
> + private static FuzzyDistance distance;
> +
> + @BeforeClass
> + public static void setUp() {
> + distance = new FuzzyDistance();
> + }
> +
> + @Test
> + public void testGetFuzzyDistance() throws Exception {
> + assertEquals(0, (int) distance.compare("", "", Locale.ENGLISH));
> + assertEquals(0,
> + (int) distance.compare("Workshop", "b", Locale.ENGLISH));
> + assertEquals(1,
> + (int) distance.compare("Room", "o", Locale.ENGLISH));
> + assertEquals(1,
> + (int) distance.compare("Workshop", "w", Locale.ENGLISH));
> + assertEquals(2,
> + (int) distance.compare("Workshop", "ws", Locale.ENGLISH));
> + assertEquals(4,
> + (int) distance.compare("Workshop", "wo", Locale.ENGLISH));
> + assertEquals(3, (int) distance.compare(
> + "Apache Software Foundation", "asf", Locale.ENGLISH));
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetFuzzyDistance_NullNullNull() throws Exception {
> + distance.compare(null, null, null);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetFuzzyDistance_StringNullLoclae() throws Exception {
> + distance.compare(" ", null, Locale.ENGLISH);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetFuzzyDistance_NullStringLocale() throws Exception {
> + distance.compare(null, "clear", Locale.ENGLISH);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetFuzzyDistance_StringStringNull() throws Exception {
> + distance.compare(" ", "clear", null);
> + }
> +
> +}
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java
> ----------------------------------------------------------------------
> diff --git
> a/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java
> b/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java
> new file mode 100644
> index 0000000..6477de0
> --- /dev/null
> +++
> b/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java
> @@ -0,0 +1,62 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.commons.text.similarity;
> +
> +import static org.junit.Assert.assertEquals;
> +
> +import org.junit.BeforeClass;
> +import org.junit.Test;
> +
> +/**
> + * Unit tests for {@link org.apache.commons.text.JaroWrinklerDistance}.
> + */
> +public class TestJaroWrinklerDistance {
> +
> + private static JaroWrinklerDistance distance;
> +
> + @BeforeClass
> + public static void setUp() {
> + distance = new JaroWrinklerDistance();
> + }
> +
> + @Test
> + public void testGetJaroWinklerDistance_StringString() {
> + assertEquals(0.93d, (double) distance.compare("frog", "fog"),
> 0.0d);
> + assertEquals(0.0d, (double) distance.compare("fly", "ant"), 0.0d);
> + assertEquals(0.44d, (double) distance.compare("elephant",
> "hippo"), 0.0d);
> + assertEquals(0.91d, (double) distance.compare("ABC Corporation",
> "ABC Corp"), 0.0d);
> + assertEquals(0.93d, (double) distance.compare("D N H Enterprises
> Inc", "D & H Enterprises, Inc."), 0.0d);
> + assertEquals(0.94d, (double) distance.compare("My Gym Children's
> Fitness Center", "My Gym. Childrens Fitness"), 0.0d);
> + assertEquals(0.9d, (double) distance.compare("PENNSYLVANIA",
> "PENNCISYLVNIA"), 0.0d);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetJaroWinklerDistance_NullNull() throws Exception {
> + distance.compare(null, null);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetJaroWinklerDistance_StringNull() throws Exception {
> + distance.compare(" ", null);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetJaroWinklerDistance_NullString() throws Exception {
> + distance.compare(null, "clear");
> + }
> +
> +}
>
>
> http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java
> ----------------------------------------------------------------------
> diff --git
> a/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java
> b/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java
> new file mode 100644
> index 0000000..7790b05
> --- /dev/null
> +++
> b/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java
> @@ -0,0 +1,146 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.commons.text.similarity;
> +
> +import static org.junit.Assert.assertEquals;
> +
> +import org.junit.BeforeClass;
> +import org.junit.Test;
> +
> +/**
> + * Unit tests for {@link org.apache.commons.text.LevenshteinDistance}.
> + */
> +public class TestLevenshteinDistance {
> +
> + private static LevenshteinDistance distance;
> +
> + @BeforeClass
> + public static void setUp() {
> + distance = new LevenshteinDistance();
> + }
> +
> + @Test
> + public void testGetLevenshteinDistance_StringString() {
> + assertEquals(0, (int) distance.compare("", ""));
> + assertEquals(1, (int) distance.compare("", "a"));
> + assertEquals(7, (int) distance.compare("aaapppp", ""));
> + assertEquals(1, (int) distance.compare("frog", "fog"));
> + assertEquals(3, (int) distance.compare("fly", "ant"));
> + assertEquals(7, (int) distance.compare("elephant", "hippo"));
> + assertEquals(7, (int) distance.compare("hippo", "elephant"));
> + assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz"));
> + assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo"));
> + assertEquals(1, (int) distance.compare("hello", "hallo"));
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetLevenshteinDistance_NullString() throws Exception {
> + distance.compare("a", null);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetLevenshteinDistance_StringNull() throws Exception {
> + distance.compare(null, "a");
> + }
> +
> + @Test
> + public void testGetLevenshteinDistance_StringStringInt() {
> + // empty strings
> + assertEquals(0, (int) distance.compare("", "", 0));
> + assertEquals(7, (int) distance.compare("aaapppp", "", 8));
> + assertEquals(7, (int) distance.compare("aaapppp", "", 7));
> + assertEquals(-1, (int) distance.compare("aaapppp", "", 6));
> +
> + // unequal strings, zero threshold
> + assertEquals(-1, (int) distance.compare("b", "a", 0));
> + assertEquals(-1, (int) distance.compare("a", "b", 0));
> +
> + // equal strings
> + assertEquals(0, (int) distance.compare("aa", "aa", 0));
> + assertEquals(0, (int) distance.compare("aa", "aa", 2));
> +
> + // same length
> + assertEquals(-1, (int) distance.compare("aaa", "bbb", 2));
> + assertEquals(3, (int) distance.compare("aaa", "bbb", 3));
> +
> + // big stripe
> + assertEquals(6, (int) distance.compare("aaaaaa", "b", 10));
> +
> + // distance less than threshold
> + assertEquals(7, (int) distance.compare("aaapppp", "b", 8));
> + assertEquals(3, (int) distance.compare("a", "bbb", 4));
> +
> + // distance equal to threshold
> + assertEquals(7, (int) distance.compare("aaapppp", "b", 7));
> + assertEquals(3, (int) distance.compare("a", "bbb", 3));
> +
> + // distance greater than threshold
> + assertEquals(-1, (int) distance.compare("a", "bbb", 2));
> + assertEquals(-1, (int) distance.compare("bbb", "a", 2));
> + assertEquals(-1, (int) distance.compare("aaapppp", "b", 6));
> +
> + // stripe runs off array, strings not similar
> + assertEquals(-1, (int) distance.compare("a", "bbb", 1));
> + assertEquals(-1, (int) distance.compare("bbb", "a", 1));
> +
> + // stripe runs off array, strings are similar
> + assertEquals(-1, (int) distance.compare("12345", "1234567", 1));
> + assertEquals(-1, (int) distance.compare("1234567", "12345", 1));
> +
> + // old getLevenshteinDistance test cases
> + assertEquals(1, (int) distance.compare("frog", "fog", 1));
> + assertEquals(3, (int) distance.compare("fly", "ant", 3));
> + assertEquals(7, (int) distance.compare("elephant", "hippo", 7));
> + assertEquals(-1, (int) distance.compare("elephant", "hippo", 6));
> + assertEquals(7, (int) distance.compare("hippo", "elephant", 7));
> + assertEquals(-1, (int) distance.compare("hippo", "elephant", 6));
> + assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz", 8));
> + assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo", 8));
> + assertEquals(1, (int) distance.compare("hello", "hallo", 1));
> +
> + assertEquals(1,
> + (int) distance.compare("frog", "fog", Integer.MAX_VALUE));
> + assertEquals(3, (int) distance.compare("fly", "ant",
> Integer.MAX_VALUE));
> + assertEquals(7,
> + (int) distance.compare("elephant", "hippo",
> Integer.MAX_VALUE));
> + assertEquals(7,
> + (int) distance.compare("hippo", "elephant",
> Integer.MAX_VALUE));
> + assertEquals(8,
> + (int) distance.compare("hippo", "zzzzzzzz",
> Integer.MAX_VALUE));
> + assertEquals(8,
> + (int) distance.compare("zzzzzzzz", "hippo",
> Integer.MAX_VALUE));
> + assertEquals(1,
> + (int) distance.compare("hello", "hallo",
> Integer.MAX_VALUE));
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetLevenshteinDistance_NullStringInt() throws
> Exception {
> + distance.compare(null, "a", 0);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetLevenshteinDistance_StringNullInt() throws
> Exception {
> + distance.compare("a", null, 0);
> + }
> +
> + @Test(expected = IllegalArgumentException.class)
> + public void testGetLevenshteinDistance_StringStringNegativeInt()
> + throws Exception {
> + distance.compare("a", "a", -1);
> + }
> +
> +}
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
> For additional commands, e-mail: dev-help@commons.apache.org
>
>
--
http://people.apache.org/~britter/
http://www.systemoutprintln.de/
http://twitter.com/BenediktRitter
http://github.com/britter