You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ki...@apache.org on 2015/02/11 01:28:56 UTC
[text] Migrating Myers algorithm from [collections]
Repository: commons-text
Updated Branches:
refs/heads/master 1a236bada -> 93fb453cc
Migrating Myers algorithm from [collections]
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/93fb453c
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/93fb453c
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/93fb453c
Branch: refs/heads/master
Commit: 93fb453ccc2b4d048e416de55c28e33d0327daeb
Parents: 1a236ba
Author: Bruno P. Kinoshita <ki...@apache.org>
Authored: Sun Dec 14 23:41:43 2014 -0200
Committer: Bruno P. Kinoshita <ki...@apache.org>
Committed: Tue Feb 10 22:27:19 2015 -0200
----------------------------------------------------------------------
.../commons/text/diff/CommandVisitor.java | 145 ++++++++
.../apache/commons/text/diff/DeleteCommand.java | 55 ++++
.../apache/commons/text/diff/EditCommand.java | 87 +++++
.../apache/commons/text/diff/EditScript.java | 132 ++++++++
.../apache/commons/text/diff/InsertCommand.java | 57 ++++
.../apache/commons/text/diff/KeepCommand.java | 57 ++++
.../commons/text/diff/ReplacementsFinder.java | 111 +++++++
.../commons/text/diff/ReplacementsHandler.java | 51 +++
.../commons/text/diff/StringsComparator.java | 328 +++++++++++++++++++
.../apache/commons/text/diff/package-info.java | 25 ++
.../text/diff/ReplacementsFinderTest.java | 107 ++++++
.../text/diff/StringsComparatorTest.java | 122 +++++++
12 files changed, 1277 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/93fb453c/src/main/java/org/apache/commons/text/diff/CommandVisitor.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/CommandVisitor.java b/src/main/java/org/apache/commons/text/diff/CommandVisitor.java
new file mode 100644
index 0000000..d73dde8
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/CommandVisitor.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+
+/**
+ * This interface should be implemented by user object to walk
+ * through {@link EditScript EditScript} objects.
+ * <p>
+ * Users should implement this interface in order to walk through
+ * the {@link EditScript EditScript} object created by the comparison
+ * of two sequences. This is a direct application of the visitor
+ * design pattern. The {@link EditScript#visit EditScript.visit}
+ * method takes an object implementing this interface as an argument,
+ * it will perform the loop over all commands in the script and the
+ * proper methods of the user class will be called as the commands are
+ * encountered.
+ * </p>
+ * <p>
+ * The implementation of the user visitor class will depend on the
+ * need. Here are two examples.
+ * </p>
+ * <p>
+ * The first example is a visitor that build the longest common
+ * subsequence:
+ * </p>
+ * <pre>
+ * import org.apache.commons.text.diff.CommandVisitor;
+ *
+ * import java.util.ArrayList;
+ *
+ * public class LongestCommonSubSequence implements CommandVisitor {
+ *
+ * public LongestCommonSubSequence() {
+ * a = new ArrayList();
+ * }
+ *
+ * public void visitInsertCommand(Object object) {
+ * }
+ *
+ * public void visitKeepCommand(Object object) {
+ * a.add(object);
+ * }
+ *
+ * public void visitDeleteCommand(Object object) {
+ * }
+ *
+ * public Object[] getSubSequence() {
+ * return a.toArray();
+ * }
+ *
+ * private ArrayList a;
+ *
+ * }
+ * </pre>
+ * <p>
+ * The second example is a visitor that shows the commands and the way
+ * they transform the first sequence into the second one:
+ * <pre>
+ * import org.apache.commons.text.diff.CommandVisitor;
+ *
+ * import java.util.Arrays;
+ * import java.util.ArrayList;
+ * import java.util.Iterator;
+ *
+ * public class ShowVisitor implements CommandVisitor {
+ *
+ * public ShowVisitor(Object[] sequence1) {
+ * v = new ArrayList();
+ * v.addAll(Arrays.asList(sequence1));
+ * index = 0;
+ * }
+ *
+ * public void visitInsertCommand(Object object) {
+ * v.insertElementAt(object, index++);
+ * display("insert", object);
+ * }
+ *
+ * public void visitKeepCommand(Object object) {
+ * ++index;
+ * display("keep ", object);
+ * }
+ *
+ * public void visitDeleteCommand(Object object) {
+ * v.remove(index);
+ * display("delete", object);
+ * }
+ *
+ * private void display(String commandName, Object object) {
+ * System.out.println(commandName + " " + object + ": " + this);
+ * }
+ *
+ * public String toString() {
+ * StringBuffer buffer = new StringBuffer();
+ * for (Iterator iter = v.iterator(); iter.hasNext();) {
+ * buffer.append(' ').append(iter.next());
+ * }
+ * return buffer.toString();
+ * }
+ *
+ * private ArrayList v;
+ * private int index;
+ *
+ * }
+ * </pre>
+ *
+ * @since 1.0
+ */
+public interface CommandVisitor<T> {
+
+ /**
+ * Method called when an insert command is encountered.
+ *
+ * @param object object to insert (this object comes from the second sequence)
+ */
+ void visitInsertCommand(T object);
+
+ /**
+ * Method called when a keep command is encountered.
+ *
+ * @param object object to keep (this object comes from the first sequence)
+ */
+ void visitKeepCommand(T object);
+
+ /**
+ * Method called when a delete command is encountered.
+ *
+ * @param object object to delete (this object comes from the first sequence)
+ */
+ void visitDeleteCommand(T object);
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/93fb453c/src/main/java/org/apache/commons/text/diff/DeleteCommand.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/DeleteCommand.java b/src/main/java/org/apache/commons/text/diff/DeleteCommand.java
new file mode 100644
index 0000000..7494002
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/DeleteCommand.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+
+/**
+ * Command representing the deletion of one object of the first sequence.
+ * <p>
+ * When one object of the first sequence has no corresponding object in the
+ * second sequence at the right place, the {@link EditScript edit script}
+ * transforming the first sequence into the second sequence uses an instance of
+ * this class to represent the deletion of this object. The objects embedded in
+ * these type of commands always come from the first sequence.
+ * </p>
+ *
+ * @see StringsComparator
+ * @see EditScript
+ *
+ * @since 1.0
+ */
+public class DeleteCommand<T> extends EditCommand<T> {
+
+ /**
+ * Simple constructor. Creates a new instance of {@link DeleteCommand}.
+ *
+ * @param object the object of the first sequence that should be deleted
+ */
+ public DeleteCommand(final T object) {
+ super(object);
+ }
+
+ /**
+ * Accept a visitor. When a <code>DeleteCommand</code> accepts a visitor, it calls
+ * its {@link CommandVisitor#visitDeleteCommand visitDeleteCommand} method.
+ *
+ * @param visitor the visitor to be accepted
+ */
+ @Override
+ public void accept(final CommandVisitor<T> visitor) {
+ visitor.visitDeleteCommand(getObject());
+ }
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/93fb453c/src/main/java/org/apache/commons/text/diff/EditCommand.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/EditCommand.java b/src/main/java/org/apache/commons/text/diff/EditCommand.java
new file mode 100644
index 0000000..972cebb
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/EditCommand.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+
+/**
+ * Abstract base class for all commands used to transform an objects sequence
+ * into another one.
+ * <p>
+ * When two objects sequences are compared through the
+ * {@link StringsComparator#getScript StringsComparator.getScript} method,
+ * the result is provided has a {@link EditScript script} containing the commands
+ * that progressively transform the first sequence into the second one.
+ * </p>
+ * <p>
+ * There are only three types of commands, all of which are subclasses of this
+ * abstract class. Each command is associated with one object belonging to at
+ * least one of the sequences. These commands are {@link InsertCommand
+ * InsertCommand} which correspond to an object of the second sequence being
+ * inserted into the first sequence, {@link DeleteCommand DeleteCommand} which
+ * correspond to an object of the first sequence being removed and
+ * {@link KeepCommand KeepCommand} which correspond to an object of the first
+ * sequence which <code>equals</code> an object in the second sequence. It is
+ * guaranteed that comparison is always performed this way (i.e. the
+ * <code>equals</code> method of the object from the first sequence is used and
+ * the object passed as an argument comes from the second sequence) ; this can
+ * be important if subclassing is used for some elements in the first sequence
+ * and the <code>equals</code> method is specialized.
+ * </p>
+ *
+ * <p>
+ * This code has been adapted from Apache Commons Collections 4.0.
+ * </p>
+ *
+ * @see StringsComparator
+ * @see EditScript
+ *
+ * @since 1.0
+ */
+public abstract class EditCommand<T> {
+
+ /** Object on which the command should be applied. */
+ private final T object;
+
+ /**
+ * Simple constructor. Creates a new instance of EditCommand
+ *
+ * @param object reference to the object associated with this command, this
+ * refers to an element of one of the sequences being compared
+ */
+ protected EditCommand(final T object) {
+ this.object = object;
+ }
+
+ /**
+ * Returns the object associated with this command.
+ *
+ * @return the object on which the command is applied
+ */
+ protected T getObject() {
+ return object;
+ }
+
+ /**
+ * Accept a visitor.
+ * <p>
+ * This method is invoked for each commands belonging to
+ * an {@link EditScript EditScript}, in order to implement the visitor design pattern
+ *
+ * @param visitor the visitor to be accepted
+ */
+ public abstract void accept(CommandVisitor<T> visitor);
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/93fb453c/src/main/java/org/apache/commons/text/diff/EditScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/EditScript.java b/src/main/java/org/apache/commons/text/diff/EditScript.java
new file mode 100644
index 0000000..641d60b
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/EditScript.java
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * This class gathers all the {@link EditCommand commands} needed to transform
+ * one objects sequence into another objects sequence.
+ * <p>
+ * An edit script is the most general view of the differences between two
+ * sequences. It is built as the result of the comparison between two sequences
+ * by the {@link StringsComparator StringsComparator} class. The user can
+ * walk through it using the <em>visitor</em> design pattern.
+ * <p>
+ * It is guaranteed that the objects embedded in the {@link InsertCommand insert
+ * commands} come from the second sequence and that the objects embedded in
+ * either the {@link DeleteCommand delete commands} or {@link KeepCommand keep
+ * commands} come from the first sequence. This can be important if subclassing
+ * is used for some elements in the first sequence and the <code>equals</code>
+ * method is specialized.
+ *
+ * @see StringsComparator
+ * @see EditCommand
+ * @see CommandVisitor
+ * @see ReplacementsHandler
+ *
+ * @since 1.0
+ */
+public class EditScript<T> {
+
+ /** Container for the commands. */
+ private final List<EditCommand<T>> commands;
+
+ /** Length of the longest common subsequence. */
+ private int lcsLength;
+
+ /** Number of modifications. */
+ private int modifications;
+
+ /**
+ * Simple constructor. Creates a new empty script.
+ */
+ public EditScript() {
+ commands = new ArrayList<EditCommand<T>>();
+ lcsLength = 0;
+ modifications = 0;
+ }
+
+ /**
+ * Add a keep command to the script.
+ *
+ * @param command command to add
+ */
+ public void append(final KeepCommand<T> command) {
+ commands.add(command);
+ ++lcsLength;
+ }
+
+ /**
+ * Add an insert command to the script.
+ *
+ * @param command command to add
+ */
+ public void append(final InsertCommand<T> command) {
+ commands.add(command);
+ ++modifications;
+ }
+
+ /**
+ * Add a delete command to the script.
+ *
+ * @param command command to add
+ */
+ public void append(final DeleteCommand<T> command) {
+ commands.add(command);
+ ++modifications;
+ }
+
+ /**
+ * Visit the script. The script implements the <em>visitor</em> design
+ * pattern, this method is the entry point to which the user supplies its
+ * own visitor, the script will be responsible to drive it through the
+ * commands in order and call the appropriate method as each command is
+ * encountered.
+ *
+ * @param visitor the visitor that will visit all commands in turn
+ */
+ public void visit(final CommandVisitor<T> visitor) {
+ for (final EditCommand<T> command : commands) {
+ command.accept(visitor);
+ }
+ }
+
+ /**
+ * Get the length of the Longest Common Subsequence (LCS). The length of the
+ * longest common subsequence is the number of {@link KeepCommand keep
+ * commands} in the script.
+ *
+ * @return length of the Longest Common Subsequence
+ */
+ public int getLCSLength() {
+ return lcsLength;
+ }
+
+ /**
+ * Get the number of effective modifications. The number of effective
+ * modification is the number of {@link DeleteCommand delete} and
+ * {@link InsertCommand insert} commands in the script.
+ *
+ * @return number of effective modifications
+ */
+ public int getModifications() {
+ return modifications;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/93fb453c/src/main/java/org/apache/commons/text/diff/InsertCommand.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/InsertCommand.java b/src/main/java/org/apache/commons/text/diff/InsertCommand.java
new file mode 100644
index 0000000..9a365d0
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/InsertCommand.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+
+/**
+ * Command representing the insertion of one object of the second sequence.
+ * <p>
+ * When one object of the second sequence has no corresponding object in the
+ * first sequence at the right place, the {@link EditScript edit script}
+ * transforming the first sequence into the second sequence uses an instance of
+ * this class to represent the insertion of this object. The objects embedded in
+ * these type of commands always come from the second sequence.
+ * </p>
+ *
+ * @see StringsComparator
+ * @see EditScript
+ *
+ * @since 1.0
+ */
+public class InsertCommand<T> extends EditCommand<T> {
+
+ /**
+ * Simple constructor. Creates a new instance of InsertCommand
+ *
+ * @param object the object of the second sequence that should be inserted
+ */
+ public InsertCommand(final T object) {
+ super(object);
+ }
+
+ /**
+ * Accept a visitor. When an <code>InsertCommand</code> accepts a visitor,
+ * it calls its {@link CommandVisitor#visitInsertCommand visitInsertCommand}
+ * method.
+ *
+ * @param visitor the visitor to be accepted
+ */
+ @Override
+ public void accept(final CommandVisitor<T> visitor) {
+ visitor.visitInsertCommand(getObject());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/93fb453c/src/main/java/org/apache/commons/text/diff/KeepCommand.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/KeepCommand.java b/src/main/java/org/apache/commons/text/diff/KeepCommand.java
new file mode 100644
index 0000000..687f7e7
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/KeepCommand.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+
+/**
+ * Command representing the keeping of one object present in both sequences.
+ * <p>
+ * When one object of the first sequence <code>equals</code> another objects in
+ * the second sequence at the right place, the {@link EditScript edit script}
+ * transforming the first sequence into the second sequence uses an instance of
+ * this class to represent the keeping of this object. The objects embedded in
+ * these type of commands always come from the first sequence.
+ * </p>
+ *
+ * @see StringsComparator
+ * @see EditScript
+ *
+ * @since 1.0
+ */
+public class KeepCommand<T> extends EditCommand<T> {
+
+ /**
+ * Simple constructor. Creates a new instance of KeepCommand
+ *
+ * @param object the object belonging to both sequences (the object is a
+ * reference to the instance in the first sequence which is known
+ * to be equal to an instance in the second sequence)
+ */
+ public KeepCommand(final T object) {
+ super(object);
+ }
+
+ /**
+ * Accept a visitor. When a <code>KeepCommand</code> accepts a visitor, it
+ * calls its {@link CommandVisitor#visitKeepCommand visitKeepCommand} method.
+ *
+ * @param visitor the visitor to be accepted
+ */
+ @Override
+ public void accept(final CommandVisitor<T> visitor) {
+ visitor.visitKeepCommand(getObject());
+ }
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/93fb453c/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java b/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java
new file mode 100644
index 0000000..52e4112
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * This class handles sequences of replacements resulting from a comparison.
+ * <p>
+ * The comparison of two objects sequences leads to the identification of common
+ * parts and parts which only belong to the first or to the second sequence. The
+ * common parts appear in the edit script in the form of <em>keep</em> commands,
+ * they can be considered as synchronization objects between the two sequences.
+ * These synchronization objects split the two sequences in synchronized
+ * sub-sequences. The first sequence can be transformed into the second one by
+ * replacing each synchronized sub-sequence of the first sequence by the
+ * corresponding sub-sequence of the second sequence. This is a synthetic way to
+ * see an {@link EditScript edit script}, replacing individual
+ * {@link DeleteCommand delete}, {@link KeepCommand keep} and
+ * {@link InsertCommand insert} commands by fewer replacements acting on
+ * complete sub-sequences.
+ * </p>
+ * <p>
+ * This class is devoted to perform this interpretation. It visits an
+ * {@link EditScript edit script} (because it implements the
+ * {@link CommandVisitor CommandVisitor} interface) and calls a user-supplied
+ * handler implementing the {@link ReplacementsHandler ReplacementsHandler}
+ * interface to process the sub-sequences.
+ * </p>
+ *
+ * @see ReplacementsHandler
+ * @see EditScript
+ * @see StringsComparator
+ *
+ * @since 1.0
+ */
+public class ReplacementsFinder<T> implements CommandVisitor<T> {
+
+ private final List<T> pendingInsertions;
+ private final List<T> pendingDeletions;
+ private int skipped;
+
+ /** Handler to call when synchronized sequences are found. */
+ private final ReplacementsHandler<T> handler;
+
+ /**
+ * Simple constructor. Creates a new instance of {@link ReplacementsFinder}.
+ *
+ * @param handler handler to call when synchronized sequences are found
+ */
+ public ReplacementsFinder(final ReplacementsHandler<T> handler) {
+ pendingInsertions = new ArrayList<T>();
+ pendingDeletions = new ArrayList<T>();
+ skipped = 0;
+ this.handler = handler;
+ }
+
+ /**
+ * Add an object to the pending insertions set.
+ *
+ * @param object object to insert
+ */
+ public void visitInsertCommand(final T object) {
+ pendingInsertions.add(object);
+ }
+
+ /**
+ * Handle a synchronization object.
+ * <p>
+ * When a synchronization object is identified, the pending insertions and
+ * pending deletions sets are provided to the user handler as subsequences.
+ * </p>
+ *
+ * @param object synchronization object detected
+ */
+ public void visitKeepCommand(final T object) {
+ if (pendingDeletions.isEmpty() && pendingInsertions.isEmpty()) {
+ ++skipped;
+ } else {
+ handler.handleReplacement(skipped, pendingDeletions, pendingInsertions);
+ pendingDeletions.clear();
+ pendingInsertions.clear();
+ skipped = 1;
+ }
+ }
+
+ /**
+ * Add an object to the pending deletions set.
+ *
+ * @param object object to delete
+ */
+ public void visitDeleteCommand(final T object) {
+ pendingDeletions.add(object);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/93fb453c/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java b/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java
new file mode 100644
index 0000000..d5d61a4
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+
+import java.util.List;
+
+/**
+ * This interface is devoted to handle synchronized replacement sequences.
+ *
+ * @see ReplacementsFinder
+ * @since 1.0
+ */
+public interface ReplacementsHandler<T> {
+
+ /**
+ * Handle two synchronized sequences.
+ * <p>
+ * This method is called by a {@link ReplacementsFinder ReplacementsFinder}
+ * instance when it has synchronized two sub-sequences of object arrays
+ * being compared, and at least one of the sequences is non-empty. Since the
+ * sequences are synchronized, the objects before the two sub-sequences are
+ * equals (if they exist). This property also holds for the objects after
+ * the two sub-sequences.
+ * <p>
+ * The replacement is defined as replacing the <code>from</code>
+ * sub-sequence into the <code>to</code> sub-sequence.
+ *
+ * @param skipped number of tokens skipped since the last call (i.e. number of
+ * tokens that were in both sequences), this number should be strictly positive
+ * except on the very first call where it can be zero (if the first object of
+ * the two sequences are different)
+ * @param from sub-sequence of objects coming from the first sequence
+ * @param to sub-sequence of objects coming from the second sequence
+ */
+ void handleReplacement(int skipped, List<T> from, List<T> to);
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/93fb453c/src/main/java/org/apache/commons/text/diff/StringsComparator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/StringsComparator.java b/src/main/java/org/apache/commons/text/diff/StringsComparator.java
new file mode 100644
index 0000000..b2940fa
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/StringsComparator.java
@@ -0,0 +1,328 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+
+/**
+ * <p>
+ * It is guaranteed that the comparisons will always be done as
+ * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
+ * sequence and <code>o2</code> belongs to the second sequence. This can
+ * be important if subclassing is used for some elements in the first
+ * sequence and the <code>equals</code> method is specialized.
+ * </p>
+ * <p>
+ * Comparison can be seen from two points of view: either as giving the smallest
+ * modification allowing to transform the first sequence into the second one, or
+ * as giving the longest sequence which is a subsequence of both initial
+ * sequences. The <code>equals</code> method is used to compare objects, so any
+ * object can be put into sequences. Modifications include deleting, inserting
+ * or keeping one object, starting from the beginning of the first sequence.
+ * </p>
+ * <p>
+ * This class implements the comparison algorithm, which is the very efficient
+ * algorithm from Eugene W. Myers
+ * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
+ * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
+ * the shortest possible {@link EditScript edit script} containing all the
+ * {@link EditCommand commands} needed to transform the first sequence into
+ * the second one.
+ *
+ * <p>
+ * This code has been adapted from Apache Commons Collections 4.0.
+ * </p>
+ *
+ * @see EditScript
+ * @see EditCommand
+ * @see CommandVisitor
+ *
+ * @since 1.0
+ */
+public class StringsComparator {
+
+ /**
+ * First character sequence.
+ */
+ private final String left;
+ /**
+ * Second character sequence.
+ */
+ private final String right;
+
+ /** Temporary variables. */
+ private final int[] vDown;
+ private final int[] vUp;
+
+ /**
+ * Simple constructor.
+ * <p>
+ * Creates a new instance of StringsComparator.
+ * </p>
+ * <p>
+ * It is <em>guaranteed</em> that the comparisons will always be done as
+ * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
+ * sequence and <code>o2</code> belongs to the second sequence. This can be
+ * important if subclassing is used for some elements in the first sequence
+ * and the <code>equals</code> method is specialized.
+ * </p>
+ *
+ * @param left first character sequence to be compared
+ * @param right second character sequence to be compared
+ */
+ public StringsComparator(String left, String right) {
+ this.left = left;
+ this.right = right;
+
+ final int size = left.length() + right.length() + 2;
+ vDown = new int[size];
+ vUp = new int[size];
+ }
+
+ /**
+ * Get the {@link EditScript} object.
+ * <p>
+ * It is guaranteed that the objects embedded in the {@link InsertCommand
+ * insert commands} come from the second sequence and that the objects
+ * embedded in either the {@link DeleteCommand delete commands} or
+ * {@link KeepCommand keep commands} come from the first sequence. This can
+ * be important if subclassing is used for some elements in the first
+ * sequence and the <code>equals</code> method is specialized.
+ * </p>
+ *
+ * @return the edit script resulting from the comparison of the two
+ * sequences
+ */
+ public EditScript<Character> getScript() {
+ final EditScript<Character> script = new EditScript<Character>();
+ buildScript(0, left.length(), 0, right.length(), script);
+ return script;
+ }
+
+ /**
+ * Build an edit script.
+ *
+ * @param start1 the begin of the first sequence to be compared
+ * @param end1 the end of the first sequence to be compared
+ * @param start2 the begin of the second sequence to be compared
+ * @param end2 the end of the second sequence to be compared
+ * @param script the edited script
+ */
+ private void buildScript(final int start1, final int end1, final int start2, final int end2,
+ final EditScript<Character> script) {
+ final Snake middle = getMiddleSnake(start1, end1, start2, end2);
+
+ if (middle == null
+ || middle.getStart() == end1 && middle.getDiag() == end1 - end2
+ || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {
+
+ int i = start1;
+ int j = start2;
+ while (i < end1 || j < end2) {
+ if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) {
+ script.append(new KeepCommand<Character>(left.charAt(i)));
+ ++i;
+ ++j;
+ } else {
+ if (end1 - start1 > end2 - start2) {
+ script.append(new DeleteCommand<Character>(left.charAt(i)));
+ ++i;
+ } else {
+ script.append(new InsertCommand<Character>(right.charAt(j)));
+ ++j;
+ }
+ }
+ }
+
+ } else {
+
+ buildScript(start1, middle.getStart(),
+ start2, middle.getStart() - middle.getDiag(),
+ script);
+ for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
+ script.append(new KeepCommand<Character>(left.charAt(i)));
+ }
+ buildScript(middle.getEnd(), end1,
+ middle.getEnd() - middle.getDiag(), end2,
+ script);
+ }
+ }
+
+ /**
+ * Get the middle snake corresponding to two subsequences of the
+ * main sequences.
+ * <p>
+ * The snake is found using the MYERS Algorithm (this algorithms has
+ * also been implemented in the GNU diff program). This algorithm is
+ * explained in Eugene Myers article:
+ * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
+ * An O(ND) Difference Algorithm and Its Variations</a>.
+ * </p>
+ *
+ * @param start1 the begin of the first sequence to be compared
+ * @param end1 the end of the first sequence to be compared
+ * @param start2 the begin of the second sequence to be compared
+ * @param end2 the end of the second sequence to be compared
+ * @return the middle snake
+ */
+ private Snake getMiddleSnake(int start1, int end1, int start2, int end2) {
+ // Myers Algorithm
+ // Initialisations
+ final int m = end1 - start1;
+ final int n = end2 - start2;
+ if (m == 0 || n == 0) {
+ return null;
+ }
+
+ final int delta = m - n;
+ final int sum = n + m;
+ final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
+ vDown[1+offset] = start1;
+ vUp[1+offset] = end1 + 1;
+
+ for (int d = 0; d <= offset ; ++d) {
+ // Down
+ for (int k = -d; k <= d; k += 2) {
+ // First step
+
+ final int i = k + offset;
+ if (k == -d || k != d && vDown[i-1] < vDown[i+1]) {
+ vDown[i] = vDown[i+1];
+ } else {
+ vDown[i] = vDown[i-1] + 1;
+ }
+
+ int x = vDown[i];
+ int y = x - start1 + start2 - k;
+
+ while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) {
+ vDown[i] = ++x;
+ ++y;
+ }
+ // Second step
+ if (delta % 2 != 0 && delta - d <= k && k <= delta + d) {
+ if (vUp[i-delta] <= vDown[i]) {
+ return buildSnake(vUp[i-delta], k + start1 - start2, end1, end2);
+ }
+ }
+ }
+
+ // Up
+ for (int k = delta - d; k <= delta + d; k += 2) {
+ // First step
+ final int i = k + offset - delta;
+ if (k == delta - d
+ || k != delta + d && vUp[i+1] <= vUp[i-1]) {
+ vUp[i] = vUp[i+1] - 1;
+ } else {
+ vUp[i] = vUp[i-1];
+ }
+
+ int x = vUp[i] - 1;
+ int y = x - start1 + start2 - k;
+ while (x >= start1 && y >= start2
+ && left.charAt(x) == right.charAt(y)) {
+ vUp[i] = x--;
+ y--;
+ }
+ // Second step
+ if (delta % 2 == 0 && -d <= k && k <= d ) {
+ if (vUp[i] <= vDown[i + delta]) {
+ return buildSnake(vUp[i], k + start1 - start2, end1, end2);
+ }
+ }
+ }
+ }
+
+ // this should not happen
+ throw new RuntimeException("Internal Error");
+ }
+
+ /**
+ * Build a snake.
+ *
+ * @param start the value of the start of the snake
+ * @param diag the value of the diagonal of the snake
+ * @param end1 the value of the end of the first sequence to be compared
+ * @param end2 the value of the end of the second sequence to be compared
+ * @return the snake built
+ */
+ private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
+ int end = start;
+ while (end - diag < end2
+ && end < end1
+ && left.charAt(end) == right.charAt(end - diag)) {
+ ++end;
+ }
+ return new Snake(start, end, diag);
+ }
+
+ /**
+ * This class is a simple placeholder to hold the end part of a path
+ * under construction in a {@link StringsComparator StringsComparator}.
+ */
+ private static class Snake {
+
+ /** Start index. */
+ private final int start;
+
+ /** End index. */
+ private final int end;
+
+ /** Diagonal number. */
+ private final int diag;
+
+ /**
+ * Simple constructor. Creates a new instance of Snake with specified indices.
+ *
+ * @param start start index of the snake
+ * @param end end index of the snake
+ * @param diag diagonal number
+ */
+ public Snake(final int start, final int end, final int diag) {
+ this.start = start;
+ this.end = end;
+ this.diag = diag;
+ }
+
+ /**
+ * Get the start index of the snake.
+ *
+ * @return start index of the snake
+ */
+ public int getStart() {
+ return start;
+ }
+
+ /**
+ * Get the end index of the snake.
+ *
+ * @return end index of the snake
+ */
+ public int getEnd() {
+ return end;
+ }
+
+ /**
+ * Get the diagonal number of the snake.
+ *
+ * @return diagonal number of the snake
+ */
+ public int getDiag() {
+ return diag;
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/93fb453c/src/main/java/org/apache/commons/text/diff/package-info.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/diff/package-info.java b/src/main/java/org/apache/commons/text/diff/package-info.java
new file mode 100644
index 0000000..92fde86
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/diff/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * <p>Provides algorithms for diff between strings.</p>
+ *
+ * <p>The initial implementation of the Myers algorithm was adapted from the
+ * commons-collections sequence package.</p>
+ *
+ * @since 1.0
+ */
+package org.apache.commons.text.diff;
http://git-wip-us.apache.org/repos/asf/commons-text/blob/93fb453c/src/test/java/org/apache/commons/text/diff/ReplacementsFinderTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/diff/ReplacementsFinderTest.java b/src/test/java/org/apache/commons/text/diff/ReplacementsFinderTest.java
new file mode 100644
index 0000000..d611475
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/diff/ReplacementsFinderTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+/**
+ * Tests for the ReplacementsFinder.
+ */
+@RunWith(Parameterized.class)
+public class ReplacementsFinderTest {
+ private SimpleHandler handler = null;
+ private String left;
+ private String right;
+ private int skipped;
+ private Character[] from;
+ private Character[] to;
+ @Before
+ public void setUp() {
+ handler = new SimpleHandler();
+ }
+ @Parameters
+ public static Collection<Object[]> data() {
+ return Arrays.asList(new Object[][] {
+ {
+ "branco",
+ "blanco",
+ 1,
+ new Character[] {'r'},
+ new Character[] {'l'}},
+ {
+ "test the blocks before you use it",
+ "try the blocks before you put it",
+ 25,
+ new Character[] {'e', 's', 't', 's', 'e'},
+ new Character[] {'r', 'y', 'p', 't'}
+ }
+ });
+ }
+ public ReplacementsFinderTest(String left, String right, int skipped,
+ Character[] from, Character[] to) {
+ this.left = left;
+ this.right = right;
+ this.skipped = skipped;
+ this.from = from;
+ this.to = to;
+ }
+ @Test
+ public void testReplacementsHandler() {
+ final StringsComparator sc = new StringsComparator(left, right);
+ final ReplacementsFinder<Character> replacementFinder = new ReplacementsFinder<>(handler);
+ sc.getScript().visit(replacementFinder);
+ assertEquals("Skipped characters do not match", skipped, handler.getSkipped());
+ assertArrayEquals("From characters do not match", from,
+ handler.getFrom().toArray(new Character[0]));
+ assertArrayEquals("To characters do not match", to,
+ handler.getTo().toArray(new Character[0]));
+ }
+ // Helper RecplacementsHandler implementation for testing
+ private class SimpleHandler implements ReplacementsHandler<Character> {
+ private int skipped;
+ private List<Character> from;
+ private List<Character> to;
+ public SimpleHandler() {
+ skipped = 0;
+ from = new ArrayList<>();
+ to = new ArrayList<>();
+ }
+ public int getSkipped() {
+ return skipped;
+ }
+ public List<Character> getFrom() {
+ return from;
+ }
+ public List<Character> getTo() {
+ return to;
+ }
+ @Override
+ public void handleReplacement(int skipped, List<Character> from, List<Character> to) {
+ this.skipped += skipped;
+ this.from.addAll(from);
+ this.to.addAll(to);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/93fb453c/src/test/java/org/apache/commons/text/diff/StringsComparatorTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/diff/StringsComparatorTest.java b/src/test/java/org/apache/commons/text/diff/StringsComparatorTest.java
new file mode 100644
index 0000000..d5d7690
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/diff/StringsComparatorTest.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.diff;
+import java.util.Arrays;
+import java.util.List;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+/**
+ * Tests for the StringsComparator.
+ */
+public class StringsComparatorTest {
+ private List<String> before;
+ private List<String> after;
+ private int[] length;
+ private int[] lcs;
+ @Test
+ public void testLength() {
+ for (int i = 0; i < before.size(); ++i) {
+ final StringsComparator comparator = new StringsComparator(before.get(i), after.get(i));
+ Assert.assertEquals(length[i], comparator.getScript().getModifications());
+ }
+ }
+ @Test
+ public void testLongestCommonSubsequence() {
+ for (int i = 0; i < before.size(); ++i) {
+ final StringsComparator comparator = new StringsComparator(before.get(i), after.get(i));
+ Assert.assertEquals(lcs[i], comparator.getScript().getLCSLength());
+ }
+ }
+ @Test
+ public void testExecution() {
+ for (int i = 0; i < before.size(); ++i) {
+ final ExecutionVisitor<Character> ev = new ExecutionVisitor<Character>();
+ new StringsComparator(before.get(i), after.get(i)).getScript().visit(ev);
+ Assert.assertEquals(after.get(i), ev.getString());
+ }
+ }
+ private class ExecutionVisitor<T> implements CommandVisitor<T> {
+ private StringBuilder v;
+ public ExecutionVisitor() {
+ v = new StringBuilder();
+ }
+ public void visitInsertCommand(final T object) {
+ v.append(object);
+ }
+ public void visitKeepCommand(final T object) {
+ v.append(object);
+ }
+ public void visitDeleteCommand(final T object) {
+ }
+ public String getString() {
+ return v.toString();
+ }
+ }
+ @Before
+ public void setUp() {
+ before = Arrays.asList(
+ "bottle",
+ "nematode knowledge",
+ "",
+ "aa",
+ "prefixed string",
+ "ABCABBA",
+ "glop glop",
+ "coq",
+ "spider-man");
+ after = Arrays.asList(
+ "noodle",
+ "empty bottle",
+ "",
+ "C",
+ "prefix",
+ "CBABAC",
+ "pas glop pas glop",
+ "ane",
+ "klingon");
+ length = new int[] {
+ 6,
+ 16,
+ 0,
+ 3,
+ 9,
+ 5,
+ 8,
+ 6,
+ 13
+ };
+ lcs = new int[] {
+ 3,
+ 7,
+ 0,
+ 0,
+ 6,
+ 4,
+ 9,
+ 0,
+ 2
+ };
+ }
+ @After
+ public void tearDown() {
+ before = null;
+ after = null;
+ length = null;
+ }
+}