You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ch...@apache.org on 2017/02/10 13:17:27 UTC
[14/25] [text] chore: update packages back to
org.apache.commons.text.*
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/diff/EditScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/EditScript.java b/src/main/java/org/apache/commons/text/diff/EditScript.java
new file mode 100644
index 0000000..bf4b185
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/EditScript.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * This class gathers all the {@link EditCommand commands} needed to transform
+ * one objects sequence into another objects sequence.
+ * <p>
+ * An edit script is the most general view of the differences between two
+ * sequences. It is built as the result of the comparison between two sequences
+ * by the {@link StringsComparator StringsComparator} class. The user can
+ * walk through it using the <em>visitor</em> design pattern.
+ * <p>
+ * It is guaranteed that the objects embedded in the {@link InsertCommand insert
+ * commands} come from the second sequence and that the objects embedded in
+ * either the {@link DeleteCommand delete commands} or {@link KeepCommand keep
+ * commands} come from the first sequence. This can be important if subclassing
+ * is used for some elements in the first sequence and the <code>equals</code>
+ * method is specialized.
+ *
+ * @see StringsComparator
+ * @see EditCommand
+ * @see CommandVisitor
+ * @see ReplacementsHandler
+ *
+ * @param <T> object type
+ * @since 1.0
+ */
+public class EditScript<T> {
+
+ /** Container for the commands. */
+ private final List<EditCommand<T>> commands;
+
+ /** Length of the longest common subsequence. */
+ private int lcsLength;
+
+ /** Number of modifications. */
+ private int modifications;
+
+ /**
+ * Simple constructor. Creates a new empty script.
+ */
+ public EditScript() {
+ commands = new ArrayList<>();
+ lcsLength = 0;
+ modifications = 0;
+ }
+
+ /**
+ * Add a keep command to the script.
+ *
+ * @param command command to add
+ */
+ public void append(final KeepCommand<T> command) {
+ commands.add(command);
+ ++lcsLength;
+ }
+
+ /**
+ * Add an insert command to the script.
+ *
+ * @param command command to add
+ */
+ public void append(final InsertCommand<T> command) {
+ commands.add(command);
+ ++modifications;
+ }
+
+ /**
+ * Add a delete command to the script.
+ *
+ * @param command command to add
+ */
+ public void append(final DeleteCommand<T> command) {
+ commands.add(command);
+ ++modifications;
+ }
+
+ /**
+ * Visit the script. The script implements the <em>visitor</em> design
+ * pattern, this method is the entry point to which the user supplies its
+ * own visitor, the script will be responsible to drive it through the
+ * commands in order and call the appropriate method as each command is
+ * encountered.
+ *
+ * @param visitor the visitor that will visit all commands in turn
+ */
+ public void visit(final CommandVisitor<T> visitor) {
+ for (final EditCommand<T> command : commands) {
+ command.accept(visitor);
+ }
+ }
+
+ /**
+ * Get the length of the Longest Common Subsequence (LCS). The length of the
+ * longest common subsequence is the number of {@link KeepCommand keep
+ * commands} in the script.
+ *
+ * @return length of the Longest Common Subsequence
+ */
+ public int getLCSLength() {
+ return lcsLength;
+ }
+
+ /**
+ * Get the number of effective modifications. The number of effective
+ * modification is the number of {@link DeleteCommand delete} and
+ * {@link InsertCommand insert} commands in the script.
+ *
+ * @return number of effective modifications
+ */
+ public int getModifications() {
+ return modifications;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/diff/InsertCommand.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/InsertCommand.java b/src/main/java/org/apache/commons/text/diff/InsertCommand.java
new file mode 100644
index 0000000..f0337dc
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/InsertCommand.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+
+/**
+ * Command representing the insertion of one object of the second sequence.
+ * <p>
+ * When one object of the second sequence has no corresponding object in the
+ * first sequence at the right place, the {@link EditScript edit script}
+ * transforming the first sequence into the second sequence uses an instance of
+ * this class to represent the insertion of this object. The objects embedded in
+ * these type of commands always come from the second sequence.
+ * </p>
+ *
+ * @see StringsComparator
+ * @see EditScript
+ *
+ * @param <T> object type
+ * @since 1.0
+ */
+public class InsertCommand<T> extends EditCommand<T> {
+
+ /**
+ * Simple constructor. Creates a new instance of InsertCommand
+ *
+ * @param object the object of the second sequence that should be inserted
+ */
+ public InsertCommand(final T object) {
+ super(object);
+ }
+
+ /**
+ * Accept a visitor. When an <code>InsertCommand</code> accepts a visitor,
+ * it calls its {@link CommandVisitor#visitInsertCommand visitInsertCommand}
+ * method.
+ *
+ * @param visitor the visitor to be accepted
+ */
+ @Override
+ public void accept(final CommandVisitor<T> visitor) {
+ visitor.visitInsertCommand(getObject());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/diff/KeepCommand.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/KeepCommand.java b/src/main/java/org/apache/commons/text/diff/KeepCommand.java
new file mode 100644
index 0000000..34c6fe7
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/KeepCommand.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+
+/**
+ * Command representing the keeping of one object present in both sequences.
+ * <p>
+ * When one object of the first sequence <code>equals</code> another objects in
+ * the second sequence at the right place, the {@link EditScript edit script}
+ * transforming the first sequence into the second sequence uses an instance of
+ * this class to represent the keeping of this object. The objects embedded in
+ * these type of commands always come from the first sequence.
+ * </p>
+ *
+ * @see StringsComparator
+ * @see EditScript
+ *
+ * @param <T> object type
+ * @since 1.0
+ */
+public class KeepCommand<T> extends EditCommand<T> {
+
+ /**
+ * Simple constructor. Creates a new instance of KeepCommand
+ *
+ * @param object the object belonging to both sequences (the object is a
+ * reference to the instance in the first sequence which is known
+ * to be equal to an instance in the second sequence)
+ */
+ public KeepCommand(final T object) {
+ super(object);
+ }
+
+ /**
+ * Accept a visitor. When a <code>KeepCommand</code> accepts a visitor, it
+ * calls its {@link CommandVisitor#visitKeepCommand visitKeepCommand} method.
+ *
+ * @param visitor the visitor to be accepted
+ */
+ @Override
+ public void accept(final CommandVisitor<T> visitor) {
+ visitor.visitKeepCommand(getObject());
+ }
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java b/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java
new file mode 100644
index 0000000..46f1b88
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * This class handles sequences of replacements resulting from a comparison.
+ * <p>
+ * The comparison of two objects sequences leads to the identification of common
+ * parts and parts which only belong to the first or to the second sequence. The
+ * common parts appear in the edit script in the form of <em>keep</em> commands,
+ * they can be considered as synchronization objects between the two sequences.
+ * These synchronization objects split the two sequences in synchronized
+ * sub-sequences. The first sequence can be transformed into the second one by
+ * replacing each synchronized sub-sequence of the first sequence by the
+ * corresponding sub-sequence of the second sequence. This is a synthetic way to
+ * see an {@link EditScript edit script}, replacing individual
+ * {@link DeleteCommand delete}, {@link KeepCommand keep} and
+ * {@link InsertCommand insert} commands by fewer replacements acting on
+ * complete sub-sequences.
+ * </p>
+ * <p>
+ * This class is devoted to perform this interpretation. It visits an
+ * {@link EditScript edit script} (because it implements the
+ * {@link CommandVisitor CommandVisitor} interface) and calls a user-supplied
+ * handler implementing the {@link ReplacementsHandler ReplacementsHandler}
+ * interface to process the sub-sequences.
+ * </p>
+ *
+ * @see ReplacementsHandler
+ * @see EditScript
+ * @see StringsComparator
+ *
+ * @param <T> object type
+ * @since 1.0
+ */
+public class ReplacementsFinder<T> implements CommandVisitor<T> {
+
+ /**
+ * List of pending insertions.
+ */
+ private final List<T> pendingInsertions;
+ /**
+ * List of pending deletions.
+ */
+ private final List<T> pendingDeletions;
+ /**
+ * Count of elements skipped.
+ */
+ private int skipped;
+
+ /** Handler to call when synchronized sequences are found. */
+ private final ReplacementsHandler<T> handler;
+
+ /**
+ * Simple constructor. Creates a new instance of {@link ReplacementsFinder}.
+ *
+ * @param handler handler to call when synchronized sequences are found
+ */
+ public ReplacementsFinder(final ReplacementsHandler<T> handler) {
+ pendingInsertions = new ArrayList<>();
+ pendingDeletions = new ArrayList<>();
+ skipped = 0;
+ this.handler = handler;
+ }
+
+ /**
+ * Add an object to the pending insertions set.
+ *
+ * @param object object to insert
+ */
+ @Override
+ public void visitInsertCommand(final T object) {
+ pendingInsertions.add(object);
+ }
+
+ /**
+ * Handle a synchronization object.
+ * <p>
+ * When a synchronization object is identified, the pending insertions and
+ * pending deletions sets are provided to the user handler as subsequences.
+ * </p>
+ *
+ * @param object synchronization object detected
+ */
+ @Override
+ public void visitKeepCommand(final T object) {
+ if (pendingDeletions.isEmpty() && pendingInsertions.isEmpty()) {
+ ++skipped;
+ } else {
+ handler.handleReplacement(skipped, pendingDeletions, pendingInsertions);
+ pendingDeletions.clear();
+ pendingInsertions.clear();
+ skipped = 1;
+ }
+ }
+
+ /**
+ * Add an object to the pending deletions set.
+ *
+ * @param object object to delete
+ */
+ @Override
+ public void visitDeleteCommand(final T object) {
+ pendingDeletions.add(object);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java b/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java
new file mode 100644
index 0000000..3beb716
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+
+import java.util.List;
+
+/**
+ * This interface is devoted to handle synchronized replacement sequences.
+ *
+ * @param <T> object type
+ * @see ReplacementsFinder
+ * @since 1.0
+ */
+public interface ReplacementsHandler<T> {
+
+ /**
+ * Handle two synchronized sequences.
+ * <p>
+ * This method is called by a {@link ReplacementsFinder ReplacementsFinder}
+ * instance when it has synchronized two sub-sequences of object arrays
+ * being compared, and at least one of the sequences is non-empty. Since the
+ * sequences are synchronized, the objects before the two sub-sequences are
+ * equals (if they exist). This property also holds for the objects after
+ * the two sub-sequences.
+ * <p>
+ * The replacement is defined as replacing the <code>from</code>
+ * sub-sequence into the <code>to</code> sub-sequence.
+ *
+ * @param skipped number of tokens skipped since the last call (i.e. number of
+ * tokens that were in both sequences), this number should be strictly positive
+ * except on the very first call where it can be zero (if the first object of
+ * the two sequences are different)
+ * @param from sub-sequence of objects coming from the first sequence
+ * @param to sub-sequence of objects coming from the second sequence
+ */
+ void handleReplacement(int skipped, List<T> from, List<T> to);
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/diff/StringsComparator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/StringsComparator.java b/src/main/java/org/apache/commons/text/diff/StringsComparator.java
new file mode 100644
index 0000000..a286c4a
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/StringsComparator.java
@@ -0,0 +1,331 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+
+/**
+ * <p>
+ * It is guaranteed that the comparisons will always be done as
+ * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
+ * sequence and <code>o2</code> belongs to the second sequence. This can
+ * be important if subclassing is used for some elements in the first
+ * sequence and the <code>equals</code> method is specialized.
+ * </p>
+ * <p>
+ * Comparison can be seen from two points of view: either as giving the smallest
+ * modification allowing to transform the first sequence into the second one, or
+ * as giving the longest sequence which is a subsequence of both initial
+ * sequences. The <code>equals</code> method is used to compare objects, so any
+ * object can be put into sequences. Modifications include deleting, inserting
+ * or keeping one object, starting from the beginning of the first sequence.
+ * </p>
+ * <p>
+ * This class implements the comparison algorithm, which is the very efficient
+ * algorithm from Eugene W. Myers
+ * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
+ * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
+ * the shortest possible {@link EditScript edit script} containing all the
+ * {@link EditCommand commands} needed to transform the first sequence into
+ * the second one.
+ *
+ * <p>
+ * This code has been adapted from Apache Commons Collections 4.0.
+ * </p>
+ *
+ * @see EditScript
+ * @see EditCommand
+ * @see CommandVisitor
+ * @since 1.0
+ */
+public class StringsComparator {
+
+ /**
+ * First character sequence.
+ */
+ private final String left;
+ /**
+ * Second character sequence.
+ */
+ private final String right;
+ /**
+ * Temporary array.
+ */
+ private final int[] vDown;
+ /**
+ * Temporary array.
+ */
+ private final int[] vUp;
+
+ /**
+ * Simple constructor.
+ * <p>
+ * Creates a new instance of StringsComparator.
+ * </p>
+ * <p>
+ * It is <em>guaranteed</em> that the comparisons will always be done as
+ * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
+ * sequence and <code>o2</code> belongs to the second sequence. This can be
+ * important if subclassing is used for some elements in the first sequence
+ * and the <code>equals</code> method is specialized.
+ * </p>
+ *
+ * @param left first character sequence to be compared
+ * @param right second character sequence to be compared
+ */
+ public StringsComparator(final String left, final String right) {
+ this.left = left;
+ this.right = right;
+
+ final int size = left.length() + right.length() + 2;
+ vDown = new int[size];
+ vUp = new int[size];
+ }
+
+ /**
+ * Get the {@link EditScript} object.
+ * <p>
+ * It is guaranteed that the objects embedded in the {@link InsertCommand
+ * insert commands} come from the second sequence and that the objects
+ * embedded in either the {@link DeleteCommand delete commands} or
+ * {@link KeepCommand keep commands} come from the first sequence. This can
+ * be important if subclassing is used for some elements in the first
+ * sequence and the <code>equals</code> method is specialized.
+ * </p>
+ *
+ * @return the edit script resulting from the comparison of the two
+ * sequences
+ */
+ public EditScript<Character> getScript() {
+ final EditScript<Character> script = new EditScript<>();
+ buildScript(0, left.length(), 0, right.length(), script);
+ return script;
+ }
+
+ /**
+ * Build an edit script.
+ *
+ * @param start1 the begin of the first sequence to be compared
+ * @param end1 the end of the first sequence to be compared
+ * @param start2 the begin of the second sequence to be compared
+ * @param end2 the end of the second sequence to be compared
+ * @param script the edited script
+ */
+ private void buildScript(final int start1, final int end1, final int start2, final int end2,
+ final EditScript<Character> script) {
+ final Snake middle = getMiddleSnake(start1, end1, start2, end2);
+
+ if (middle == null
+ || middle.getStart() == end1 && middle.getDiag() == end1 - end2
+ || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {
+
+ int i = start1;
+ int j = start2;
+ while (i < end1 || j < end2) {
+ if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) {
+ script.append(new KeepCommand<>(left.charAt(i)));
+ ++i;
+ ++j;
+ } else {
+ if (end1 - start1 > end2 - start2) {
+ script.append(new DeleteCommand<>(left.charAt(i)));
+ ++i;
+ } else {
+ script.append(new InsertCommand<>(right.charAt(j)));
+ ++j;
+ }
+ }
+ }
+
+ } else {
+
+ buildScript(start1, middle.getStart(),
+ start2, middle.getStart() - middle.getDiag(),
+ script);
+ for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
+ script.append(new KeepCommand<>(left.charAt(i)));
+ }
+ buildScript(middle.getEnd(), end1,
+ middle.getEnd() - middle.getDiag(), end2,
+ script);
+ }
+ }
+
+ /**
+ * Get the middle snake corresponding to two subsequences of the
+ * main sequences.
+ * <p>
+ * The snake is found using the MYERS Algorithm (this algorithms has
+ * also been implemented in the GNU diff program). This algorithm is
+ * explained in Eugene Myers article:
+ * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
+ * An O(ND) Difference Algorithm and Its Variations</a>.
+ * </p>
+ *
+ * @param start1 the begin of the first sequence to be compared
+ * @param end1 the end of the first sequence to be compared
+ * @param start2 the begin of the second sequence to be compared
+ * @param end2 the end of the second sequence to be compared
+ * @return the middle snake
+ */
+ private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
+ // Myers Algorithm
+ // Initialisations
+ final int m = end1 - start1;
+ final int n = end2 - start2;
+ if (m == 0 || n == 0) {
+ return null;
+ }
+
+ final int delta = m - n;
+ final int sum = n + m;
+ final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
+ vDown[1 + offset] = start1;
+ vUp[1 + offset] = end1 + 1;
+
+ for (int d = 0; d <= offset; ++d) {
+ // Down
+ for (int k = -d; k <= d; k += 2) {
+ // First step
+
+ final int i = k + offset;
+ if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
+ vDown[i] = vDown[i + 1];
+ } else {
+ vDown[i] = vDown[i - 1] + 1;
+ }
+
+ int x = vDown[i];
+ int y = x - start1 + start2 - k;
+
+ while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) {
+ vDown[i] = ++x;
+ ++y;
+ }
+ // Second step
+ if (delta % 2 != 0 && delta - d <= k && k <= delta + d) {
+ if (vUp[i - delta] <= vDown[i]) { // NOPMD
+ return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2);
+ }
+ }
+ }
+
+ // Up
+ for (int k = delta - d; k <= delta + d; k += 2) {
+ // First step
+ final int i = k + offset - delta;
+ if (k == delta - d
+ || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
+ vUp[i] = vUp[i + 1] - 1;
+ } else {
+ vUp[i] = vUp[i - 1];
+ }
+
+ int x = vUp[i] - 1;
+ int y = x - start1 + start2 - k;
+ while (x >= start1 && y >= start2
+ && left.charAt(x) == right.charAt(y)) {
+ vUp[i] = x--;
+ y--;
+ }
+ // Second step
+ if (delta % 2 == 0 && -d <= k && k <= d) {
+ if (vUp[i] <= vDown[i + delta]) { // NOPMD
+ return buildSnake(vUp[i], k + start1 - start2, end1, end2);
+ }
+ }
+ }
+ }
+
+ // this should not happen
+ throw new RuntimeException("Internal Error");
+ }
+
+ /**
+ * Build a snake.
+ *
+ * @param start the value of the start of the snake
+ * @param diag the value of the diagonal of the snake
+ * @param end1 the value of the end of the first sequence to be compared
+ * @param end2 the value of the end of the second sequence to be compared
+ * @return the snake built
+ */
+ private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
+ int end = start;
+ while (end - diag < end2
+ && end < end1
+ && left.charAt(end) == right.charAt(end - diag)) {
+ ++end;
+ }
+ return new Snake(start, end, diag);
+ }
+
+ /**
+ * This class is a simple placeholder to hold the end part of a path
+ * under construction in a {@link StringsComparator StringsComparator}.
+ */
+ private static class Snake {
+
+ /** Start index. */
+ private final int start;
+
+ /** End index. */
+ private final int end;
+
+ /** Diagonal number. */
+ private final int diag;
+
+ /**
+ * Simple constructor. Creates a new instance of Snake with specified indices.
+ *
+ * @param start start index of the snake
+ * @param end end index of the snake
+ * @param diag diagonal number
+ */
+ public Snake(final int start, final int end, final int diag) {
+ this.start = start;
+ this.end = end;
+ this.diag = diag;
+ }
+
+ /**
+ * Get the start index of the snake.
+ *
+ * @return start index of the snake
+ */
+ public int getStart() {
+ return start;
+ }
+
+ /**
+ * Get the end index of the snake.
+ *
+ * @return end index of the snake
+ */
+ public int getEnd() {
+ return end;
+ }
+
+ /**
+ * Get the diagonal number of the snake.
+ *
+ * @return diagonal number of the snake
+ */
+ public int getDiag() {
+ return diag;
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/diff/package-info.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/package-info.java b/src/main/java/org/apache/commons/text/diff/package-info.java
new file mode 100644
index 0000000..92fde86
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * <p>Provides algorithms for diff between strings.</p>
+ *
+ * <p>The initial implementation of the Myers algorithm was adapted from the
+ * commons-collections sequence package.</p>
+ *
+ * @since 1.0
+ */
+package org.apache.commons.text.diff;
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/package-info.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/package-info.java b/src/main/java/org/apache/commons/text/package-info.java
new file mode 100644
index 0000000..f36b6dc
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * <p>Basic classes for text handling.</p>
+ *
+ * @since 1.0
+ */
+package org.apache.commons.text;
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
new file mode 100644
index 0000000..516c215
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.Map;
+
+/**
+ * Measures the cosine distance between two character sequences.
+ *
+ * <p>It utilizes the {@link CosineSimilarity} to compute the distance. Character sequences
+ * are converted into vectors through a simple tokenizer that works with a regular expression
+ * to split words in a sentence.</p>
+ *
+ * <p>
+ * For further explanation about Cosine Similarity and Cosine Distance, refer to
+ * http://en.wikipedia.org/wiki/Cosine_similarity.
+ * </p>
+ *
+ * @since 1.0
+ * @see CosineSimilarity
+ */
+public class CosineDistance implements EditDistance<Double> {
+ /**
+ * Tokenizer used to convert the character sequence into a vector.
+ */
+ private final Tokenizer<CharSequence> tokenizer = new RegexTokenizer();
+ /**
+ * Cosine similarity.
+ */
+ private final CosineSimilarity cosineSimilarity = new CosineSimilarity();
+
+ @Override
+ public Double apply(final CharSequence left, final CharSequence right) {
+ final CharSequence[] leftTokens = tokenizer.tokenize(left);
+ final CharSequence[] rightTokens = tokenizer.tokenize(right);
+
+ final Map<CharSequence, Integer> leftVector = Counter.of(leftTokens);
+ final Map<CharSequence, Integer> rightVector = Counter.of(rightTokens);
+ final double similarity = cosineSimilarity.cosineSimilarity(leftVector, rightVector);
+ return 1.0 - similarity;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
new file mode 100644
index 0000000..50790c1
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Measures the Cosine similarity of two vectors of an inner product space and
+ * compares the angle between them.
+ *
+ * <p>
+ * For further explanation about the Cosine Similarity, refer to
+ * http://en.wikipedia.org/wiki/Cosine_similarity.
+ * </p>
+ *
+ * @since 1.0
+ */
+public class CosineSimilarity {
+
+ /**
+ * Calculates the cosine similarity for two given vectors.
+ *
+ * @param leftVector left vector
+ * @param rightVector right vector
+ * @return cosine similarity between the two vectors
+ */
+ public Double cosineSimilarity(final Map<CharSequence, Integer> leftVector,
+ final Map<CharSequence, Integer> rightVector) {
+ if (leftVector == null || rightVector == null) {
+ throw new IllegalArgumentException("Vectors must not be null");
+ }
+
+ final Set<CharSequence> intersection = getIntersection(leftVector, rightVector);
+
+ final double dotProduct = dot(leftVector, rightVector, intersection);
+ double d1 = 0.0d;
+ for (final Integer value : leftVector.values()) {
+ d1 += Math.pow(value, 2);
+ }
+ double d2 = 0.0d;
+ for (final Integer value : rightVector.values()) {
+ d2 += Math.pow(value, 2);
+ }
+ double cosineSimilarity;
+ if (d1 <= 0.0 || d2 <= 0.0) {
+ cosineSimilarity = 0.0;
+ } else {
+ cosineSimilarity = (double) (dotProduct / (double) (Math.sqrt(d1) * Math.sqrt(d2)));
+ }
+ return cosineSimilarity;
+ }
+
+ /**
+ * Returns a set with strings common to the two given maps.
+ *
+ * @param leftVector left vector map
+ * @param rightVector right vector map
+ * @return common strings
+ */
+ private Set<CharSequence> getIntersection(final Map<CharSequence, Integer> leftVector,
+ final Map<CharSequence, Integer> rightVector) {
+ final Set<CharSequence> intersection = new HashSet<>(leftVector.keySet());
+ intersection.retainAll(rightVector.keySet());
+ return intersection;
+ }
+
+ /**
+ * Computes the dot product of two vectors. It ignores remaining elements. It means
+ * that if a vector is longer than other, then a smaller part of it will be used to compute
+ * the dot product.
+ *
+ * @param leftVector left vector
+ * @param rightVector right vector
+ * @param intersection common elements
+ * @return the dot product
+ */
+ private double dot(final Map<CharSequence, Integer> leftVector, final Map<CharSequence, Integer> rightVector,
+ final Set<CharSequence> intersection) {
+ long dotProduct = 0;
+ for (final CharSequence key : intersection) {
+ dotProduct += leftVector.get(key) * rightVector.get(key);
+ }
+ return dotProduct;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/Counter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/Counter.java b/src/main/java/org/apache/commons/text/similarity/Counter.java
new file mode 100644
index 0000000..d259d6b
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/Counter.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Java implementation of Python's collections Counter module.
+ *
+ * <p>It counts how many times each element provided occurred in an array and
+ * returns a dict with the element as key and the count as value.</p>
+ *
+ * @see <a href="https://docs.python.org/dev/library/collections.html#collections.Counter">
+ * https://docs.python.org/dev/library/collections.html#collections.Counter</a>
+ *
+ * @since 1.0
+ */
+final class Counter {
+
+ /**
+ * Hidden constructor.
+ */
+ private Counter() {
+ super();
+ }
+
+ /**
+ * It counts how many times each element provided occurred in an array and
+ * returns a dict with the element as key and the count as value.
+ *
+ * @param tokens array of tokens
+ * @return dict, where the elements are key, and the count the value
+ */
+ public static Map<CharSequence, Integer> of(final CharSequence[] tokens) {
+ final Map<CharSequence, Integer> innerCounter = new HashMap<>();
+ for (final CharSequence token : tokens) {
+ if (innerCounter.containsKey(token)) {
+ int value = innerCounter.get(token);
+ innerCounter.put(token, ++value);
+ } else {
+ innerCounter.put(token, 1);
+ }
+ }
+ return innerCounter;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/EditDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/EditDistance.java b/src/main/java/org/apache/commons/text/similarity/EditDistance.java
new file mode 100644
index 0000000..cf4e2c0
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/EditDistance.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+/**
+ * Interface for <a href="http://en.wikipedia.org/wiki/Edit_distance">Edit Distances</a>.
+ *
+ * <p>
+ * An edit distance is a formal metric on the Kleene closure (<code>X<sup>*</sup></code>) over an
+ * alphabet (<code>X</code>). Note, that a <a href="https://en.wikipedia.org/wiki/Metric_(mathematics)">metric</a>
+ * on a set <code>S</code> is a function <code>d: [S * S] -> [0, INFINITY)</code> such
+ * that the following hold for <code>x,y,z</code> in
+ * the set <code>S</code>:
+ * </p>
+ * <ul>
+ * <li><code>d(x,y) >= 0</code>, non-negativity or separation axiom</li>
+ * <li><code>d(x,y) == 0</code>, if and only if, <code>x == y</code></li>
+ * <li><code>d(x,y) == d(y,x)</code>, symmetry, and</li>
+ * <li><code>d(x,z) <= d(x,y) + d(y,z)</code>, the triangle inequality</li>
+ * </ul>
+ *
+ *
+ * <p>
+ * This is a BiFunction<CharSequence, CharSequence, R>.
+ * The <code>apply</code> method
+ * accepts a pair of {@link CharSequence} parameters
+ * and returns an <code>R</code> type similarity score.
+ * </p>
+ *
+ * @param <R> The type of similarity score unit used by this EditDistance.
+ * @since 1.0
+ */
+public interface EditDistance<R> extends SimilarityScore<R> {
+
+ /**
+ * Compares two CharSequences.
+ *
+ * @param left the first CharSequence
+ * @param right the second CharSequence
+ * @return the similarity score between two CharSequences
+ */
+ @Override
+ R apply(CharSequence left, CharSequence right);
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/EditDistanceFrom.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/EditDistanceFrom.java b/src/main/java/org/apache/commons/text/similarity/EditDistanceFrom.java
new file mode 100644
index 0000000..b67a41f
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/EditDistanceFrom.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+/**
+ * <p>
+ * This stores a {@link EditDistance} implementation and a {@link CharSequence} "left" string.
+ * The {@link #apply(CharSequence right)} method accepts the "right" string and invokes the
+ * comparison function for the pair of strings.
+ * </p>
+ *
+ * <p>
+ * The following is an example which finds the most similar string:
+ * </p>
+ * <pre>
+ * EditDistance<Integer> editDistance = new LevenshteinDistance();
+ * String target = "Apache";
+ * EditDistanceFrom<Integer> editDistanceFrom =
+ * new EditDistanceFrom<Integer>(editDistance, target);
+ * String mostSimilar = null;
+ * Integer shortestDistance = null;
+ *
+ * for (String test : new String[] { "Appaloosa", "a patchy", "apple" }) {
+ * Integer distance = editDistanceFrom.apply(test);
+ * if (shortestDistance == null || distance < shortestDistance) {
+ * shortestDistance = distance;
+ * mostSimilar = test;
+ * }
+ * }
+ *
+ * System.out.println("The string most similar to \"" + target + "\" "
+ * + "is \"" + mostSimilar + "\" because "
+ * + "its distance is only " + shortestDistance + ".");
+ * </pre>
+ *
+ * @param <R> This is the type of similarity score used by the EditDistance function.
+ * @since 1.0
+ */
+public class EditDistanceFrom<R> {
+
+ /**
+ * Edit distance.
+ */
+ private final EditDistance<R> editDistance;
+ /**
+ * Left parameter used in distance function.
+ */
+ private final CharSequence left;
+
+ /**
+ * <p>This accepts the edit distance implementation and the "left" string.</p>
+ *
+ * @param editDistance This may not be null.
+ * @param left This may be null here,
+ * but the EditDistance#compare(CharSequence left, CharSequence right)
+ * implementation may not accept nulls.
+ */
+ public EditDistanceFrom(final EditDistance<R> editDistance, final CharSequence left) {
+ if (editDistance == null) {
+ throw new IllegalArgumentException("The edit distance may not be null.");
+ }
+
+ this.editDistance = editDistance;
+ this.left = left;
+ }
+
+ /**
+ * <p>
+ * This compares "left" field against the "right" parameter
+ * using the "edit distance" implementation.
+ * </p>
+ *
+ * @param right the second CharSequence
+ * @return the similarity score between two CharSequences
+ */
+ public R apply(final CharSequence right) {
+ return editDistance.apply(left, right);
+ }
+
+ /**
+ * Gets the left parameter.
+ *
+ * @return the left parameter
+ */
+ public CharSequence getLeft() {
+ return left;
+ }
+
+ /**
+ * Gets the edit distance.
+ *
+ * @return the edit distance
+ */
+ public EditDistance<R> getEditDistance() {
+ return editDistance;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
new file mode 100644
index 0000000..8356960
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.Locale;
+
+/**
+ * A matching algorithm that is similar to the searching algorithms implemented in editors such
+ * as Sublime Text, TextMate, Atom and others.
+ *
+ * <p>
+ * One point is given for every matched character. Subsequent matches yield two bonus points. A higher score
+ * indicates a higher similarity.
+ * </p>
+ *
+ * <p>
+ * This code has been adapted from Apache Commons Lang 3.3.
+ * </p>
+ *
+ * @since 1.0
+ */
+public class FuzzyScore {
+
+ /**
+ * Locale used to change the case of text.
+ */
+ private final Locale locale;
+
+
+ /**
+ * <p>This returns a {@link Locale}-specific {@link FuzzyScore}.</p>
+ *
+ * @param locale The string matching logic is case insensitive.
+ A {@link Locale} is necessary to normalize both Strings to lower case.
+ * @throws IllegalArgumentException
+ * This is thrown if the {@link Locale} parameter is {@code null}.
+ */
+ public FuzzyScore(final Locale locale) {
+ if (locale == null) {
+ throw new IllegalArgumentException("Locale must not be null");
+ }
+ this.locale = locale;
+ }
+
+ /**
+ * <p>
+ * Find the Fuzzy Score which indicates the similarity score between two
+ * Strings.
+ * </p>
+ *
+ * <pre>
+ * score.fuzzyScore(null, null, null) = IllegalArgumentException
+ * score.fuzzyScore("", "", Locale.ENGLISH) = 0
+ * score.fuzzyScore("Workshop", "b", Locale.ENGLISH) = 0
+ * score.fuzzyScore("Room", "o", Locale.ENGLISH) = 1
+ * score.fuzzyScore("Workshop", "w", Locale.ENGLISH) = 1
+ * score.fuzzyScore("Workshop", "ws", Locale.ENGLISH) = 2
+ * score.fuzzyScore("Workshop", "wo", Locale.ENGLISH) = 4
+ * score.fuzzyScore("Apache Software Foundation", "asf", Locale.ENGLISH) = 3
+ * </pre>
+ *
+ * @param term a full term that should be matched against, must not be null
+ * @param query the query that will be matched against a term, must not be
+ * null
+ * @return result score
+ * @throws IllegalArgumentException if either String input {@code null} or
+ * Locale input {@code null}
+ */
+ public Integer fuzzyScore(final CharSequence term, final CharSequence query) {
+ if (term == null || query == null) {
+ throw new IllegalArgumentException("Strings must not be null");
+ }
+
+ // fuzzy logic is case insensitive. We normalize the Strings to lower
+ // case right from the start. Turning characters to lower case
+ // via Character.toLowerCase(char) is unfortunately insufficient
+ // as it does not accept a locale.
+ final String termLowerCase = term.toString().toLowerCase(locale);
+ final String queryLowerCase = query.toString().toLowerCase(locale);
+
+ // the resulting score
+ int score = 0;
+
+ // the position in the term which will be scanned next for potential
+ // query character matches
+ int termIndex = 0;
+
+ // index of the previously matched character in the term
+ int previousMatchingCharacterIndex = Integer.MIN_VALUE;
+
+ for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) {
+ final char queryChar = queryLowerCase.charAt(queryIndex);
+
+ boolean termCharacterMatchFound = false;
+ for (; termIndex < termLowerCase.length()
+ && !termCharacterMatchFound; termIndex++) {
+ final char termChar = termLowerCase.charAt(termIndex);
+
+ if (queryChar == termChar) {
+ // simple character matches result in one point
+ score++;
+
+ // subsequent character matches further improve
+ // the score.
+ if (previousMatchingCharacterIndex + 1 == termIndex) {
+ score += 2;
+ }
+
+ previousMatchingCharacterIndex = termIndex;
+
+ // we can leave the nested loop. Every character in the
+ // query can match at most one character in the term.
+ termCharacterMatchFound = true;
+ }
+ }
+ }
+
+ return score;
+ }
+
+ /**
+ * Gets the locale.
+ *
+ * @return the locale
+ */
+ public Locale getLocale() {
+ return locale;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/HammingDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/HammingDistance.java b/src/main/java/org/apache/commons/text/similarity/HammingDistance.java
new file mode 100644
index 0000000..8d88fe8
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/HammingDistance.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+/**
+ * The hamming distance between two strings of equal length is the number of
+ * positions at which the corresponding symbols are different.
+ *
+ * <p>
+ * For further explanation about the Hamming Distance, take a look at its
+ * Wikipedia page at http://en.wikipedia.org/wiki/Hamming_distance.
+ * </p>
+ *
+ * @since 1.0
+ */
+public class HammingDistance implements EditDistance<Integer> {
+
+ /**
+ * Find the Hamming Distance between two strings with the same
+ * length.
+ *
+ * <p>The distance starts with zero, and for each occurrence of a
+ * different character in either String, it increments the distance
+ * by 1, and finally return its value.</p>
+ *
+ * <p>Since the Hamming Distance can only be calculated between strings of equal length, input of different lengths
+ * will throw IllegalArgumentException</p>
+ *
+ * <pre>
+ * distance.apply("", "") = 0
+ * distance.apply("pappa", "pappa") = 0
+ * distance.apply("1011101", "1011111") = 1
+ * distance.apply("ATCG", "ACCC") = 2
+ * distance.apply("karolin", "kerstin" = 3
+ * </pre>
+ *
+ * @param left the first CharSequence, must not be null
+ * @param right the second CharSequence, must not be null
+ * @return distance
+ * @throws IllegalArgumentException if either input is {@code null} or
+ * if they do not have the same length
+ */
+ @Override
+ public Integer apply(final CharSequence left, final CharSequence right) {
+ if (left == null || right == null) {
+ throw new IllegalArgumentException("Strings must not be null");
+ }
+
+ if (left.length() != right.length()) {
+ throw new IllegalArgumentException("Strings must have the same length");
+ }
+
+ int distance = 0;
+
+ for (int i = 0; i < left.length(); i++) {
+ if (left.charAt(i) != right.charAt(i)) {
+ distance++;
+ }
+ }
+
+ return distance;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java b/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java
new file mode 100644
index 0000000..1e30b07
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+/**
+ * Measures the Jaccard distance of two sets of character sequence. Jaccard
+ * distance is the dissimilarity between two sets. Its the complementary of
+ * Jaccard similarity.
+ *
+ * <p>
+ * For further explanation about Jaccard Distance, refer
+ * https://en.wikipedia.org/wiki/Jaccard_index
+ * </p>
+ *
+ * @since 1.0
+ */
+public class JaccardDistance implements EditDistance<Double> {
+
+ /**
+ * We normalize the jaccardSimilarity for the purpose of computing the distance.
+ */
+ private final JaccardSimilarity jaccardSimilarity = new JaccardSimilarity();
+
+ /**
+ * Calculates Jaccard distance of two set character sequence passed as
+ * input. Calculates Jaccard similarity and returns the complement of it.
+ *
+ * @param left first character sequence
+ * @param right second character sequence
+ * @return index
+ * @throws IllegalArgumentException
+ * if either String input {@code null}
+ */
+ @Override
+ public Double apply(CharSequence left, CharSequence right) {
+ if (left == null || right == null) {
+ throw new IllegalArgumentException("Input cannot be null");
+ }
+ return Math.round((1 - jaccardSimilarity.apply(left, right)) * 100d) / 100d;
+ }
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
new file mode 100644
index 0000000..af7ce71
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Measures the Jaccard similarity (aka Jaccard index) of two sets of character
+ * sequence. Jaccard similarity is the size of the intersection divided by the
+ * size of the union of the two sets.
+ *
+ * <p>
+ * For further explanation about Jaccard Similarity, refer
+ * https://en.wikipedia.org/wiki/Jaccard_index
+ * </p>
+ *
+ * @since 1.0
+ */
+public class JaccardSimilarity implements SimilarityScore<Double> {
+
+ /**
+ * Calculates Jaccard Similarity of two set character sequence passed as
+ * input.
+ *
+ * @param left first character sequence
+ * @param right second character sequence
+ * @return index
+ * @throws IllegalArgumentException
+ * if either String input {@code null}
+ */
+ @Override
+ public Double apply(CharSequence left, CharSequence right) {
+ if (left == null || right == null) {
+ throw new IllegalArgumentException("Input cannot be null");
+ }
+ return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 100d;
+ }
+
+ /**
+ * Calculates Jaccard Similarity of two character sequences passed as
+ * input. Does the calculation by identifying the union (characters in at
+ * least one of the two sets) of the two sets and intersection (characters
+ * which are present in set one which are present in set two)
+ *
+ * @param left first character sequence
+ * @param right second character sequence
+ * @return index
+ */
+ private Double calculateJaccardSimilarity(CharSequence left, CharSequence right) {
+ Set<String> intersectionSet = new HashSet<String>();
+ Set<String> unionSet = new HashSet<String>();
+ boolean unionFilled = false;
+ int leftLength = left.length();
+ int rightLength = right.length();
+ if (leftLength == 0 || rightLength == 0) {
+ return 0d;
+ }
+
+ for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) {
+ unionSet.add(String.valueOf(left.charAt(leftIndex)));
+ for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) {
+ if (!unionFilled) {
+ unionSet.add(String.valueOf(right.charAt(rightIndex)));
+ }
+ if (left.charAt(leftIndex) == right.charAt(rightIndex)) {
+ intersectionSet.add(String.valueOf(left.charAt(leftIndex)));
+ }
+ }
+ unionFilled = true;
+ }
+ return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size());
+ }
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
new file mode 100644
index 0000000..0190c7b
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.Arrays;
+
+/**
+ * A similarity algorithm indicating the percentage of matched characters between two character sequences.
+ *
+ * <p>
+ * The Jaro measure is the weighted sum of percentage of matched characters
+ * from each file and transposed characters. Winkler increased this measure
+ * for matching initial characters.
+ * </p>
+ *
+ * <p>
+ * This implementation is based on the Jaro Winkler similarity algorithm
+ * from <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
+ * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
+ * </p>
+ *
+ * <p>
+ * This code has been adapted from Apache Commons Lang 3.3.
+ * </p>
+ *
+ * @since 1.0
+ */
+public class JaroWinklerDistance implements SimilarityScore<Double> {
+
+ /**
+ * Represents a failed index search.
+ */
+ public static final int INDEX_NOT_FOUND = -1;
+
+ /**
+ * Find the Jaro Winkler Distance which indicates the similarity score
+ * between two CharSequences.
+ *
+ * <pre>
+ * distance.apply(null, null) = IllegalArgumentException
+ * distance.apply("","") = 0.0
+ * distance.apply("","a") = 0.0
+ * distance.apply("aaapppp", "") = 0.0
+ * distance.apply("frog", "fog") = 0.93
+ * distance.apply("fly", "ant") = 0.0
+ * distance.apply("elephant", "hippo") = 0.44
+ * distance.apply("hippo", "elephant") = 0.44
+ * distance.apply("hippo", "zzzzzzzz") = 0.0
+ * distance.apply("hello", "hallo") = 0.88
+ * distance.apply("ABC Corporation", "ABC Corp") = 0.93
+ * distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.95
+ * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
+ * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88
+ * </pre>
+ *
+ * @param left the first String, must not be null
+ * @param right the second String, must not be null
+ * @return result distance
+ * @throws IllegalArgumentException if either String input {@code null}
+ */
+ @Override
+ public Double apply(final CharSequence left, final CharSequence right) {
+ final double defaultScalingFactor = 0.1;
+ final double percentageRoundValue = 100.0;
+
+ if (left == null || right == null) {
+ throw new IllegalArgumentException("Strings must not be null");
+ }
+
+ int[] mtp = matches(left, right);
+ double m = mtp[0];
+ if (m == 0) {
+ return 0D;
+ }
+ double j = ((m / left.length() + m / right.length() + (m - mtp[1]) / m)) / 3;
+ double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j);
+ return Math.round(jw * percentageRoundValue) / percentageRoundValue;
+ }
+
+ /**
+ * This method returns the Jaro-Winkler string matches, transpositions, prefix, max array.
+ *
+ * @param first the first string to be matched
+ * @param second the second string to be machted
+ * @return mtp array containing: matches, transpositions, prefix, and max length
+ */
+ protected static int[] matches(final CharSequence first, final CharSequence second) {
+ CharSequence max, min;
+ if (first.length() > second.length()) {
+ max = first;
+ min = second;
+ } else {
+ max = second;
+ min = first;
+ }
+ int range = Math.max(max.length() / 2 - 1, 0);
+ int[] matchIndexes = new int[min.length()];
+ Arrays.fill(matchIndexes, -1);
+ boolean[] matchFlags = new boolean[max.length()];
+ int matches = 0;
+ for (int mi = 0; mi < min.length(); mi++) {
+ char c1 = min.charAt(mi);
+ for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
+ if (!matchFlags[xi] && c1 == max.charAt(xi)) {
+ matchIndexes[mi] = xi;
+ matchFlags[xi] = true;
+ matches++;
+ break;
+ }
+ }
+ }
+ char[] ms1 = new char[matches];
+ char[] ms2 = new char[matches];
+ for (int i = 0, si = 0; i < min.length(); i++) {
+ if (matchIndexes[i] != -1) {
+ ms1[si] = min.charAt(i);
+ si++;
+ }
+ }
+ for (int i = 0, si = 0; i < max.length(); i++) {
+ if (matchFlags[i]) {
+ ms2[si] = max.charAt(i);
+ si++;
+ }
+ }
+ int transpositions = 0;
+ for (int mi = 0; mi < ms1.length; mi++) {
+ if (ms1[mi] != ms2[mi]) {
+ transpositions++;
+ }
+ }
+ int prefix = 0;
+ for (int mi = 0; mi < min.length(); mi++) {
+ if (first.charAt(mi) == second.charAt(mi)) {
+ prefix++;
+ } else {
+ break;
+ }
+ }
+ return new int[] { matches, transpositions / 2, prefix, max.length() };
+ }
+
+}