You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ch...@apache.org on 2017/02/10 13:17:30 UTC
[17/25] [text] chore: update packages back to
org.apache.commons.text.*
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/diff/EditCommand.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/diff/EditCommand.java b/src/main/java/org/apache/commons/text/beta/diff/EditCommand.java
deleted file mode 100644
index a98a719..0000000
--- a/src/main/java/org/apache/commons/text/beta/diff/EditCommand.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.diff;
-
-/**
- * Abstract base class for all commands used to transform an objects sequence
- * into another one.
- * <p>
- * When two objects sequences are compared through the
- * {@link StringsComparator#getScript StringsComparator.getScript} method,
- * the result is provided has a {@link EditScript script} containing the commands
- * that progressively transform the first sequence into the second one.
- * </p>
- * <p>
- * There are only three types of commands, all of which are subclasses of this
- * abstract class. Each command is associated with one object belonging to at
- * least one of the sequences. These commands are {@link InsertCommand
- * InsertCommand} which correspond to an object of the second sequence being
- * inserted into the first sequence, {@link DeleteCommand DeleteCommand} which
- * correspond to an object of the first sequence being removed and
- * {@link KeepCommand KeepCommand} which correspond to an object of the first
- * sequence which <code>equals</code> an object in the second sequence. It is
- * guaranteed that comparison is always performed this way (i.e. the
- * <code>equals</code> method of the object from the first sequence is used and
- * the object passed as an argument comes from the second sequence) ; this can
- * be important if subclassing is used for some elements in the first sequence
- * and the <code>equals</code> method is specialized.
- * </p>
- *
- * <p>
- * This code has been adapted from Apache Commons Collections 4.0.
- * </p>
- *
- * @see StringsComparator
- * @see EditScript
- *
- * @param <T> object type
- * @since 1.0
- */
-public abstract class EditCommand<T> {
-
- /** Object on which the command should be applied. */
- private final T object;
-
- /**
- * Simple constructor. Creates a new instance of EditCommand
- *
- * @param object reference to the object associated with this command, this
- * refers to an element of one of the sequences being compared
- */
- protected EditCommand(final T object) {
- this.object = object;
- }
-
- /**
- * Returns the object associated with this command.
- *
- * @return the object on which the command is applied
- */
- protected T getObject() {
- return object;
- }
-
- /**
- * Accept a visitor.
- * <p>
- * This method is invoked for each commands belonging to
- * an {@link EditScript EditScript}, in order to implement the visitor design pattern
- *
- * @param visitor the visitor to be accepted
- */
- public abstract void accept(CommandVisitor<T> visitor);
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/diff/EditScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/diff/EditScript.java b/src/main/java/org/apache/commons/text/beta/diff/EditScript.java
deleted file mode 100644
index ed18438..0000000
--- a/src/main/java/org/apache/commons/text/beta/diff/EditScript.java
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.diff;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * This class gathers all the {@link EditCommand commands} needed to transform
- * one objects sequence into another objects sequence.
- * <p>
- * An edit script is the most general view of the differences between two
- * sequences. It is built as the result of the comparison between two sequences
- * by the {@link StringsComparator StringsComparator} class. The user can
- * walk through it using the <em>visitor</em> design pattern.
- * <p>
- * It is guaranteed that the objects embedded in the {@link InsertCommand insert
- * commands} come from the second sequence and that the objects embedded in
- * either the {@link DeleteCommand delete commands} or {@link KeepCommand keep
- * commands} come from the first sequence. This can be important if subclassing
- * is used for some elements in the first sequence and the <code>equals</code>
- * method is specialized.
- *
- * @see StringsComparator
- * @see EditCommand
- * @see CommandVisitor
- * @see ReplacementsHandler
- *
- * @param <T> object type
- * @since 1.0
- */
-public class EditScript<T> {
-
- /** Container for the commands. */
- private final List<EditCommand<T>> commands;
-
- /** Length of the longest common subsequence. */
- private int lcsLength;
-
- /** Number of modifications. */
- private int modifications;
-
- /**
- * Simple constructor. Creates a new empty script.
- */
- public EditScript() {
- commands = new ArrayList<>();
- lcsLength = 0;
- modifications = 0;
- }
-
- /**
- * Add a keep command to the script.
- *
- * @param command command to add
- */
- public void append(final KeepCommand<T> command) {
- commands.add(command);
- ++lcsLength;
- }
-
- /**
- * Add an insert command to the script.
- *
- * @param command command to add
- */
- public void append(final InsertCommand<T> command) {
- commands.add(command);
- ++modifications;
- }
-
- /**
- * Add a delete command to the script.
- *
- * @param command command to add
- */
- public void append(final DeleteCommand<T> command) {
- commands.add(command);
- ++modifications;
- }
-
- /**
- * Visit the script. The script implements the <em>visitor</em> design
- * pattern, this method is the entry point to which the user supplies its
- * own visitor, the script will be responsible to drive it through the
- * commands in order and call the appropriate method as each command is
- * encountered.
- *
- * @param visitor the visitor that will visit all commands in turn
- */
- public void visit(final CommandVisitor<T> visitor) {
- for (final EditCommand<T> command : commands) {
- command.accept(visitor);
- }
- }
-
- /**
- * Get the length of the Longest Common Subsequence (LCS). The length of the
- * longest common subsequence is the number of {@link KeepCommand keep
- * commands} in the script.
- *
- * @return length of the Longest Common Subsequence
- */
- public int getLCSLength() {
- return lcsLength;
- }
-
- /**
- * Get the number of effective modifications. The number of effective
- * modification is the number of {@link DeleteCommand delete} and
- * {@link InsertCommand insert} commands in the script.
- *
- * @return number of effective modifications
- */
- public int getModifications() {
- return modifications;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/diff/InsertCommand.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/diff/InsertCommand.java b/src/main/java/org/apache/commons/text/beta/diff/InsertCommand.java
deleted file mode 100644
index ecdde02..0000000
--- a/src/main/java/org/apache/commons/text/beta/diff/InsertCommand.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.diff;
-
-/**
- * Command representing the insertion of one object of the second sequence.
- * <p>
- * When one object of the second sequence has no corresponding object in the
- * first sequence at the right place, the {@link EditScript edit script}
- * transforming the first sequence into the second sequence uses an instance of
- * this class to represent the insertion of this object. The objects embedded in
- * these type of commands always come from the second sequence.
- * </p>
- *
- * @see StringsComparator
- * @see EditScript
- *
- * @param <T> object type
- * @since 1.0
- */
-public class InsertCommand<T> extends EditCommand<T> {
-
- /**
- * Simple constructor. Creates a new instance of InsertCommand
- *
- * @param object the object of the second sequence that should be inserted
- */
- public InsertCommand(final T object) {
- super(object);
- }
-
- /**
- * Accept a visitor. When an <code>InsertCommand</code> accepts a visitor,
- * it calls its {@link CommandVisitor#visitInsertCommand visitInsertCommand}
- * method.
- *
- * @param visitor the visitor to be accepted
- */
- @Override
- public void accept(final CommandVisitor<T> visitor) {
- visitor.visitInsertCommand(getObject());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/diff/KeepCommand.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/diff/KeepCommand.java b/src/main/java/org/apache/commons/text/beta/diff/KeepCommand.java
deleted file mode 100644
index 7f03ade..0000000
--- a/src/main/java/org/apache/commons/text/beta/diff/KeepCommand.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.diff;
-
-/**
- * Command representing the keeping of one object present in both sequences.
- * <p>
- * When one object of the first sequence <code>equals</code> another objects in
- * the second sequence at the right place, the {@link EditScript edit script}
- * transforming the first sequence into the second sequence uses an instance of
- * this class to represent the keeping of this object. The objects embedded in
- * these type of commands always come from the first sequence.
- * </p>
- *
- * @see StringsComparator
- * @see EditScript
- *
- * @param <T> object type
- * @since 1.0
- */
-public class KeepCommand<T> extends EditCommand<T> {
-
- /**
- * Simple constructor. Creates a new instance of KeepCommand
- *
- * @param object the object belonging to both sequences (the object is a
- * reference to the instance in the first sequence which is known
- * to be equal to an instance in the second sequence)
- */
- public KeepCommand(final T object) {
- super(object);
- }
-
- /**
- * Accept a visitor. When a <code>KeepCommand</code> accepts a visitor, it
- * calls its {@link CommandVisitor#visitKeepCommand visitKeepCommand} method.
- *
- * @param visitor the visitor to be accepted
- */
- @Override
- public void accept(final CommandVisitor<T> visitor) {
- visitor.visitKeepCommand(getObject());
- }
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/diff/ReplacementsFinder.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/diff/ReplacementsFinder.java b/src/main/java/org/apache/commons/text/beta/diff/ReplacementsFinder.java
deleted file mode 100644
index a09c015..0000000
--- a/src/main/java/org/apache/commons/text/beta/diff/ReplacementsFinder.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.diff;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * This class handles sequences of replacements resulting from a comparison.
- * <p>
- * The comparison of two objects sequences leads to the identification of common
- * parts and parts which only belong to the first or to the second sequence. The
- * common parts appear in the edit script in the form of <em>keep</em> commands,
- * they can be considered as synchronization objects between the two sequences.
- * These synchronization objects split the two sequences in synchronized
- * sub-sequences. The first sequence can be transformed into the second one by
- * replacing each synchronized sub-sequence of the first sequence by the
- * corresponding sub-sequence of the second sequence. This is a synthetic way to
- * see an {@link EditScript edit script}, replacing individual
- * {@link DeleteCommand delete}, {@link KeepCommand keep} and
- * {@link InsertCommand insert} commands by fewer replacements acting on
- * complete sub-sequences.
- * </p>
- * <p>
- * This class is devoted to perform this interpretation. It visits an
- * {@link EditScript edit script} (because it implements the
- * {@link CommandVisitor CommandVisitor} interface) and calls a user-supplied
- * handler implementing the {@link ReplacementsHandler ReplacementsHandler}
- * interface to process the sub-sequences.
- * </p>
- *
- * @see ReplacementsHandler
- * @see EditScript
- * @see StringsComparator
- *
- * @param <T> object type
- * @since 1.0
- */
-public class ReplacementsFinder<T> implements CommandVisitor<T> {
-
- /**
- * List of pending insertions.
- */
- private final List<T> pendingInsertions;
- /**
- * List of pending deletions.
- */
- private final List<T> pendingDeletions;
- /**
- * Count of elements skipped.
- */
- private int skipped;
-
- /** Handler to call when synchronized sequences are found. */
- private final ReplacementsHandler<T> handler;
-
- /**
- * Simple constructor. Creates a new instance of {@link ReplacementsFinder}.
- *
- * @param handler handler to call when synchronized sequences are found
- */
- public ReplacementsFinder(final ReplacementsHandler<T> handler) {
- pendingInsertions = new ArrayList<>();
- pendingDeletions = new ArrayList<>();
- skipped = 0;
- this.handler = handler;
- }
-
- /**
- * Add an object to the pending insertions set.
- *
- * @param object object to insert
- */
- @Override
- public void visitInsertCommand(final T object) {
- pendingInsertions.add(object);
- }
-
- /**
- * Handle a synchronization object.
- * <p>
- * When a synchronization object is identified, the pending insertions and
- * pending deletions sets are provided to the user handler as subsequences.
- * </p>
- *
- * @param object synchronization object detected
- */
- @Override
- public void visitKeepCommand(final T object) {
- if (pendingDeletions.isEmpty() && pendingInsertions.isEmpty()) {
- ++skipped;
- } else {
- handler.handleReplacement(skipped, pendingDeletions, pendingInsertions);
- pendingDeletions.clear();
- pendingInsertions.clear();
- skipped = 1;
- }
- }
-
- /**
- * Add an object to the pending deletions set.
- *
- * @param object object to delete
- */
- @Override
- public void visitDeleteCommand(final T object) {
- pendingDeletions.add(object);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/diff/ReplacementsHandler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/diff/ReplacementsHandler.java b/src/main/java/org/apache/commons/text/beta/diff/ReplacementsHandler.java
deleted file mode 100644
index 8744081..0000000
--- a/src/main/java/org/apache/commons/text/beta/diff/ReplacementsHandler.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.diff;
-
-import java.util.List;
-
-/**
- * This interface is devoted to handle synchronized replacement sequences.
- *
- * @param <T> object type
- * @see ReplacementsFinder
- * @since 1.0
- */
-public interface ReplacementsHandler<T> {
-
- /**
- * Handle two synchronized sequences.
- * <p>
- * This method is called by a {@link ReplacementsFinder ReplacementsFinder}
- * instance when it has synchronized two sub-sequences of object arrays
- * being compared, and at least one of the sequences is non-empty. Since the
- * sequences are synchronized, the objects before the two sub-sequences are
- * equals (if they exist). This property also holds for the objects after
- * the two sub-sequences.
- * <p>
- * The replacement is defined as replacing the <code>from</code>
- * sub-sequence into the <code>to</code> sub-sequence.
- *
- * @param skipped number of tokens skipped since the last call (i.e. number of
- * tokens that were in both sequences), this number should be strictly positive
- * except on the very first call where it can be zero (if the first object of
- * the two sequences are different)
- * @param from sub-sequence of objects coming from the first sequence
- * @param to sub-sequence of objects coming from the second sequence
- */
- void handleReplacement(int skipped, List<T> from, List<T> to);
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/diff/StringsComparator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/diff/StringsComparator.java b/src/main/java/org/apache/commons/text/beta/diff/StringsComparator.java
deleted file mode 100644
index 3d39433..0000000
--- a/src/main/java/org/apache/commons/text/beta/diff/StringsComparator.java
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.diff;
-
-/**
- * <p>
- * It is guaranteed that the comparisons will always be done as
- * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
- * sequence and <code>o2</code> belongs to the second sequence. This can
- * be important if subclassing is used for some elements in the first
- * sequence and the <code>equals</code> method is specialized.
- * </p>
- * <p>
- * Comparison can be seen from two points of view: either as giving the smallest
- * modification allowing to transform the first sequence into the second one, or
- * as giving the longest sequence which is a subsequence of both initial
- * sequences. The <code>equals</code> method is used to compare objects, so any
- * object can be put into sequences. Modifications include deleting, inserting
- * or keeping one object, starting from the beginning of the first sequence.
- * </p>
- * <p>
- * This class implements the comparison algorithm, which is the very efficient
- * algorithm from Eugene W. Myers
- * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
- * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
- * the shortest possible {@link EditScript edit script} containing all the
- * {@link EditCommand commands} needed to transform the first sequence into
- * the second one.
- *
- * <p>
- * This code has been adapted from Apache Commons Collections 4.0.
- * </p>
- *
- * @see EditScript
- * @see EditCommand
- * @see CommandVisitor
- * @since 1.0
- */
-public class StringsComparator {
-
- /**
- * First character sequence.
- */
- private final String left;
- /**
- * Second character sequence.
- */
- private final String right;
- /**
- * Temporary array.
- */
- private final int[] vDown;
- /**
- * Temporary array.
- */
- private final int[] vUp;
-
- /**
- * Simple constructor.
- * <p>
- * Creates a new instance of StringsComparator.
- * </p>
- * <p>
- * It is <em>guaranteed</em> that the comparisons will always be done as
- * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
- * sequence and <code>o2</code> belongs to the second sequence. This can be
- * important if subclassing is used for some elements in the first sequence
- * and the <code>equals</code> method is specialized.
- * </p>
- *
- * @param left first character sequence to be compared
- * @param right second character sequence to be compared
- */
- public StringsComparator(final String left, final String right) {
- this.left = left;
- this.right = right;
-
- final int size = left.length() + right.length() + 2;
- vDown = new int[size];
- vUp = new int[size];
- }
-
- /**
- * Get the {@link EditScript} object.
- * <p>
- * It is guaranteed that the objects embedded in the {@link InsertCommand
- * insert commands} come from the second sequence and that the objects
- * embedded in either the {@link DeleteCommand delete commands} or
- * {@link KeepCommand keep commands} come from the first sequence. This can
- * be important if subclassing is used for some elements in the first
- * sequence and the <code>equals</code> method is specialized.
- * </p>
- *
- * @return the edit script resulting from the comparison of the two
- * sequences
- */
- public EditScript<Character> getScript() {
- final EditScript<Character> script = new EditScript<>();
- buildScript(0, left.length(), 0, right.length(), script);
- return script;
- }
-
- /**
- * Build an edit script.
- *
- * @param start1 the begin of the first sequence to be compared
- * @param end1 the end of the first sequence to be compared
- * @param start2 the begin of the second sequence to be compared
- * @param end2 the end of the second sequence to be compared
- * @param script the edited script
- */
- private void buildScript(final int start1, final int end1, final int start2, final int end2,
- final EditScript<Character> script) {
- final Snake middle = getMiddleSnake(start1, end1, start2, end2);
-
- if (middle == null
- || middle.getStart() == end1 && middle.getDiag() == end1 - end2
- || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {
-
- int i = start1;
- int j = start2;
- while (i < end1 || j < end2) {
- if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) {
- script.append(new KeepCommand<>(left.charAt(i)));
- ++i;
- ++j;
- } else {
- if (end1 - start1 > end2 - start2) {
- script.append(new DeleteCommand<>(left.charAt(i)));
- ++i;
- } else {
- script.append(new InsertCommand<>(right.charAt(j)));
- ++j;
- }
- }
- }
-
- } else {
-
- buildScript(start1, middle.getStart(),
- start2, middle.getStart() - middle.getDiag(),
- script);
- for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
- script.append(new KeepCommand<>(left.charAt(i)));
- }
- buildScript(middle.getEnd(), end1,
- middle.getEnd() - middle.getDiag(), end2,
- script);
- }
- }
-
- /**
- * Get the middle snake corresponding to two subsequences of the
- * main sequences.
- * <p>
- * The snake is found using the MYERS Algorithm (this algorithms has
- * also been implemented in the GNU diff program). This algorithm is
- * explained in Eugene Myers article:
- * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
- * An O(ND) Difference Algorithm and Its Variations</a>.
- * </p>
- *
- * @param start1 the begin of the first sequence to be compared
- * @param end1 the end of the first sequence to be compared
- * @param start2 the begin of the second sequence to be compared
- * @param end2 the end of the second sequence to be compared
- * @return the middle snake
- */
- private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
- // Myers Algorithm
- // Initialisations
- final int m = end1 - start1;
- final int n = end2 - start2;
- if (m == 0 || n == 0) {
- return null;
- }
-
- final int delta = m - n;
- final int sum = n + m;
- final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
- vDown[1 + offset] = start1;
- vUp[1 + offset] = end1 + 1;
-
- for (int d = 0; d <= offset; ++d) {
- // Down
- for (int k = -d; k <= d; k += 2) {
- // First step
-
- final int i = k + offset;
- if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
- vDown[i] = vDown[i + 1];
- } else {
- vDown[i] = vDown[i - 1] + 1;
- }
-
- int x = vDown[i];
- int y = x - start1 + start2 - k;
-
- while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) {
- vDown[i] = ++x;
- ++y;
- }
- // Second step
- if (delta % 2 != 0 && delta - d <= k && k <= delta + d) {
- if (vUp[i - delta] <= vDown[i]) { // NOPMD
- return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2);
- }
- }
- }
-
- // Up
- for (int k = delta - d; k <= delta + d; k += 2) {
- // First step
- final int i = k + offset - delta;
- if (k == delta - d
- || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
- vUp[i] = vUp[i + 1] - 1;
- } else {
- vUp[i] = vUp[i - 1];
- }
-
- int x = vUp[i] - 1;
- int y = x - start1 + start2 - k;
- while (x >= start1 && y >= start2
- && left.charAt(x) == right.charAt(y)) {
- vUp[i] = x--;
- y--;
- }
- // Second step
- if (delta % 2 == 0 && -d <= k && k <= d) {
- if (vUp[i] <= vDown[i + delta]) { // NOPMD
- return buildSnake(vUp[i], k + start1 - start2, end1, end2);
- }
- }
- }
- }
-
- // this should not happen
- throw new RuntimeException("Internal Error");
- }
-
- /**
- * Build a snake.
- *
- * @param start the value of the start of the snake
- * @param diag the value of the diagonal of the snake
- * @param end1 the value of the end of the first sequence to be compared
- * @param end2 the value of the end of the second sequence to be compared
- * @return the snake built
- */
- private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
- int end = start;
- while (end - diag < end2
- && end < end1
- && left.charAt(end) == right.charAt(end - diag)) {
- ++end;
- }
- return new Snake(start, end, diag);
- }
-
- /**
- * This class is a simple placeholder to hold the end part of a path
- * under construction in a {@link StringsComparator StringsComparator}.
- */
- private static class Snake {
-
- /** Start index. */
- private final int start;
-
- /** End index. */
- private final int end;
-
- /** Diagonal number. */
- private final int diag;
-
- /**
- * Simple constructor. Creates a new instance of Snake with specified indices.
- *
- * @param start start index of the snake
- * @param end end index of the snake
- * @param diag diagonal number
- */
- public Snake(final int start, final int end, final int diag) {
- this.start = start;
- this.end = end;
- this.diag = diag;
- }
-
- /**
- * Get the start index of the snake.
- *
- * @return start index of the snake
- */
- public int getStart() {
- return start;
- }
-
- /**
- * Get the end index of the snake.
- *
- * @return end index of the snake
- */
- public int getEnd() {
- return end;
- }
-
- /**
- * Get the diagonal number of the snake.
- *
- * @return diagonal number of the snake
- */
- public int getDiag() {
- return diag;
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/diff/package-info.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/diff/package-info.java b/src/main/java/org/apache/commons/text/beta/diff/package-info.java
deleted file mode 100644
index 0d1323f..0000000
--- a/src/main/java/org/apache/commons/text/beta/diff/package-info.java
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * <p>Provides algorithms for diff between strings.</p>
- *
- * <p>The initial implementation of the Myers algorithm was adapted from the
- * commons-collections sequence package.</p>
- *
- * @since 1.0
- */
-package org.apache.commons.text.beta.diff;
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/package-info.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/package-info.java b/src/main/java/org/apache/commons/text/beta/package-info.java
deleted file mode 100644
index dc58111..0000000
--- a/src/main/java/org/apache/commons/text/beta/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * <p>Basic classes for text handling.</p>
- *
- * @since 1.0
- */
-package org.apache.commons.text.beta;
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/CosineDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/similarity/CosineDistance.java b/src/main/java/org/apache/commons/text/beta/similarity/CosineDistance.java
deleted file mode 100644
index a51fbee..0000000
--- a/src/main/java/org/apache/commons/text/beta/similarity/CosineDistance.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.similarity;
-
-import java.util.Map;
-
-/**
- * Measures the cosine distance between two character sequences.
- *
- * <p>It utilizes the {@link CosineSimilarity} to compute the distance. Character sequences
- * are converted into vectors through a simple tokenizer that works with a regular expression
- * to split words in a sentence.</p>
- *
- * <p>
- * For further explanation about Cosine Similarity and Cosine Distance, refer to
- * http://en.wikipedia.org/wiki/Cosine_similarity.
- * </p>
- *
- * @since 1.0
- * @see CosineSimilarity
- */
-public class CosineDistance implements EditDistance<Double> {
- /**
- * Tokenizer used to convert the character sequence into a vector.
- */
- private final Tokenizer<CharSequence> tokenizer = new RegexTokenizer();
- /**
- * Cosine similarity.
- */
- private final CosineSimilarity cosineSimilarity = new CosineSimilarity();
-
- @Override
- public Double apply(final CharSequence left, final CharSequence right) {
- final CharSequence[] leftTokens = tokenizer.tokenize(left);
- final CharSequence[] rightTokens = tokenizer.tokenize(right);
-
- final Map<CharSequence, Integer> leftVector = Counter.of(leftTokens);
- final Map<CharSequence, Integer> rightVector = Counter.of(rightTokens);
- final double similarity = cosineSimilarity.cosineSimilarity(leftVector, rightVector);
- return 1.0 - similarity;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/similarity/CosineSimilarity.java b/src/main/java/org/apache/commons/text/beta/similarity/CosineSimilarity.java
deleted file mode 100644
index d318dc3..0000000
--- a/src/main/java/org/apache/commons/text/beta/similarity/CosineSimilarity.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.similarity;
-
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-
-/**
- * Measures the Cosine similarity of two vectors of an inner product space and
- * compares the angle between them.
- *
- * <p>
- * For further explanation about the Cosine Similarity, refer to
- * http://en.wikipedia.org/wiki/Cosine_similarity.
- * </p>
- *
- * @since 1.0
- */
-public class CosineSimilarity {
-
- /**
- * Calculates the cosine similarity for two given vectors.
- *
- * @param leftVector left vector
- * @param rightVector right vector
- * @return cosine similarity between the two vectors
- */
- public Double cosineSimilarity(final Map<CharSequence, Integer> leftVector,
- final Map<CharSequence, Integer> rightVector) {
- if (leftVector == null || rightVector == null) {
- throw new IllegalArgumentException("Vectors must not be null");
- }
-
- final Set<CharSequence> intersection = getIntersection(leftVector, rightVector);
-
- final double dotProduct = dot(leftVector, rightVector, intersection);
- double d1 = 0.0d;
- for (final Integer value : leftVector.values()) {
- d1 += Math.pow(value, 2);
- }
- double d2 = 0.0d;
- for (final Integer value : rightVector.values()) {
- d2 += Math.pow(value, 2);
- }
- double cosineSimilarity;
- if (d1 <= 0.0 || d2 <= 0.0) {
- cosineSimilarity = 0.0;
- } else {
- cosineSimilarity = (double) (dotProduct / (double) (Math.sqrt(d1) * Math.sqrt(d2)));
- }
- return cosineSimilarity;
- }
-
- /**
- * Returns a set with strings common to the two given maps.
- *
- * @param leftVector left vector map
- * @param rightVector right vector map
- * @return common strings
- */
- private Set<CharSequence> getIntersection(final Map<CharSequence, Integer> leftVector,
- final Map<CharSequence, Integer> rightVector) {
- final Set<CharSequence> intersection = new HashSet<>(leftVector.keySet());
- intersection.retainAll(rightVector.keySet());
- return intersection;
- }
-
- /**
- * Computes the dot product of two vectors. It ignores remaining elements. It means
- * that if a vector is longer than other, then a smaller part of it will be used to compute
- * the dot product.
- *
- * @param leftVector left vector
- * @param rightVector right vector
- * @param intersection common elements
- * @return the dot product
- */
- private double dot(final Map<CharSequence, Integer> leftVector, final Map<CharSequence, Integer> rightVector,
- final Set<CharSequence> intersection) {
- long dotProduct = 0;
- for (final CharSequence key : intersection) {
- dotProduct += leftVector.get(key) * rightVector.get(key);
- }
- return dotProduct;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/Counter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/similarity/Counter.java b/src/main/java/org/apache/commons/text/beta/similarity/Counter.java
deleted file mode 100644
index 5093cbf..0000000
--- a/src/main/java/org/apache/commons/text/beta/similarity/Counter.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.similarity;
-
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * Java implementation of Python's collections Counter module.
- *
- * <p>It counts how many times each element provided occurred in an array and
- * returns a dict with the element as key and the count as value.</p>
- *
- * @see <a href="https://docs.python.org/dev/library/collections.html#collections.Counter">
- * https://docs.python.org/dev/library/collections.html#collections.Counter</a>
- *
- * @since 1.0
- */
-final class Counter {
-
- /**
- * Hidden constructor.
- */
- private Counter() {
- super();
- }
-
- /**
- * It counts how many times each element provided occurred in an array and
- * returns a dict with the element as key and the count as value.
- *
- * @param tokens array of tokens
- * @return dict, where the elements are key, and the count the value
- */
- public static Map<CharSequence, Integer> of(final CharSequence[] tokens) {
- final Map<CharSequence, Integer> innerCounter = new HashMap<>();
- for (final CharSequence token : tokens) {
- if (innerCounter.containsKey(token)) {
- int value = innerCounter.get(token);
- innerCounter.put(token, ++value);
- } else {
- innerCounter.put(token, 1);
- }
- }
- return innerCounter;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/EditDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/similarity/EditDistance.java b/src/main/java/org/apache/commons/text/beta/similarity/EditDistance.java
deleted file mode 100644
index 4de96fb..0000000
--- a/src/main/java/org/apache/commons/text/beta/similarity/EditDistance.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.similarity;
-
-/**
- * Interface for <a href="http://en.wikipedia.org/wiki/Edit_distance">Edit Distances</a>.
- *
- * <p>
- * An edit distance is a formal metric on the Kleene closure (<code>X<sup>*</sup></code>) over an
- * alphabet (<code>X</code>). Note, that a <a href="https://en.wikipedia.org/wiki/Metric_(mathematics)">metric</a>
- * on a set <code>S</code> is a function <code>d: [S * S] -> [0, INFINITY)</code> such
- * that the following hold for <code>x,y,z</code> in
- * the set <code>S</code>:
- * </p>
- * <ul>
- * <li><code>d(x,y) >= 0</code>, non-negativity or separation axiom</li>
- * <li><code>d(x,y) == 0</code>, if and only if, <code>x == y</code></li>
- * <li><code>d(x,y) == d(y,x)</code>, symmetry, and</li>
- * <li><code>d(x,z) <= d(x,y) + d(y,z)</code>, the triangle inequality</li>
- * </ul>
- *
- *
- * <p>
- * This is a BiFunction<CharSequence, CharSequence, R>.
- * The <code>apply</code> method
- * accepts a pair of {@link CharSequence} parameters
- * and returns an <code>R</code> type similarity score.
- * </p>
- *
- * @param <R> The type of similarity score unit used by this EditDistance.
- * @since 1.0
- */
-public interface EditDistance<R> extends SimilarityScore<R> {
-
- /**
- * Compares two CharSequences.
- *
- * @param left the first CharSequence
- * @param right the second CharSequence
- * @return the similarity score between two CharSequences
- */
- @Override
- R apply(CharSequence left, CharSequence right);
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/EditDistanceFrom.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/similarity/EditDistanceFrom.java b/src/main/java/org/apache/commons/text/beta/similarity/EditDistanceFrom.java
deleted file mode 100644
index 34452fb..0000000
--- a/src/main/java/org/apache/commons/text/beta/similarity/EditDistanceFrom.java
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.similarity;
-
-/**
- * <p>
- * This stores a {@link EditDistance} implementation and a {@link CharSequence} "left" string.
- * The {@link #apply(CharSequence right)} method accepts the "right" string and invokes the
- * comparison function for the pair of strings.
- * </p>
- *
- * <p>
- * The following is an example which finds the most similar string:
- * </p>
- * <pre>
- * EditDistance<Integer> editDistance = new LevenshteinDistance();
- * String target = "Apache";
- * EditDistanceFrom<Integer> editDistanceFrom =
- * new EditDistanceFrom<Integer>(editDistance, target);
- * String mostSimilar = null;
- * Integer shortestDistance = null;
- *
- * for (String test : new String[] { "Appaloosa", "a patchy", "apple" }) {
- * Integer distance = editDistanceFrom.apply(test);
- * if (shortestDistance == null || distance < shortestDistance) {
- * shortestDistance = distance;
- * mostSimilar = test;
- * }
- * }
- *
- * System.out.println("The string most similar to \"" + target + "\" "
- * + "is \"" + mostSimilar + "\" because "
- * + "its distance is only " + shortestDistance + ".");
- * </pre>
- *
- * @param <R> This is the type of similarity score used by the EditDistance function.
- * @since 1.0
- */
-public class EditDistanceFrom<R> {
-
- /**
- * Edit distance.
- */
- private final EditDistance<R> editDistance;
- /**
- * Left parameter used in distance function.
- */
- private final CharSequence left;
-
- /**
- * <p>This accepts the edit distance implementation and the "left" string.</p>
- *
- * @param editDistance This may not be null.
- * @param left This may be null here,
- * but the EditDistance#compare(CharSequence left, CharSequence right)
- * implementation may not accept nulls.
- */
- public EditDistanceFrom(final EditDistance<R> editDistance, final CharSequence left) {
- if (editDistance == null) {
- throw new IllegalArgumentException("The edit distance may not be null.");
- }
-
- this.editDistance = editDistance;
- this.left = left;
- }
-
- /**
- * <p>
- * This compares "left" field against the "right" parameter
- * using the "edit distance" implementation.
- * </p>
- *
- * @param right the second CharSequence
- * @return the similarity score between two CharSequences
- */
- public R apply(final CharSequence right) {
- return editDistance.apply(left, right);
- }
-
- /**
- * Gets the left parameter.
- *
- * @return the left parameter
- */
- public CharSequence getLeft() {
- return left;
- }
-
- /**
- * Gets the edit distance.
- *
- * @return the edit distance
- */
- public EditDistance<R> getEditDistance() {
- return editDistance;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/FuzzyScore.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/similarity/FuzzyScore.java b/src/main/java/org/apache/commons/text/beta/similarity/FuzzyScore.java
deleted file mode 100644
index 3647a49..0000000
--- a/src/main/java/org/apache/commons/text/beta/similarity/FuzzyScore.java
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.similarity;
-
-import java.util.Locale;
-
-/**
- * A matching algorithm that is similar to the searching algorithms implemented in editors such
- * as Sublime Text, TextMate, Atom and others.
- *
- * <p>
- * One point is given for every matched character. Subsequent matches yield two bonus points. A higher score
- * indicates a higher similarity.
- * </p>
- *
- * <p>
- * This code has been adapted from Apache Commons Lang 3.3.
- * </p>
- *
- * @since 1.0
- */
-public class FuzzyScore {
-
- /**
- * Locale used to change the case of text.
- */
- private final Locale locale;
-
-
- /**
- * <p>This returns a {@link Locale}-specific {@link FuzzyScore}.</p>
- *
- * @param locale The string matching logic is case insensitive.
- A {@link Locale} is necessary to normalize both Strings to lower case.
- * @throws IllegalArgumentException
- * This is thrown if the {@link Locale} parameter is {@code null}.
- */
- public FuzzyScore(final Locale locale) {
- if (locale == null) {
- throw new IllegalArgumentException("Locale must not be null");
- }
- this.locale = locale;
- }
-
- /**
- * <p>
- * Find the Fuzzy Score which indicates the similarity score between two
- * Strings.
- * </p>
- *
- * <pre>
- * score.fuzzyScore(null, null, null) = IllegalArgumentException
- * score.fuzzyScore("", "", Locale.ENGLISH) = 0
- * score.fuzzyScore("Workshop", "b", Locale.ENGLISH) = 0
- * score.fuzzyScore("Room", "o", Locale.ENGLISH) = 1
- * score.fuzzyScore("Workshop", "w", Locale.ENGLISH) = 1
- * score.fuzzyScore("Workshop", "ws", Locale.ENGLISH) = 2
- * score.fuzzyScore("Workshop", "wo", Locale.ENGLISH) = 4
- * score.fuzzyScore("Apache Software Foundation", "asf", Locale.ENGLISH) = 3
- * </pre>
- *
- * @param term a full term that should be matched against, must not be null
- * @param query the query that will be matched against a term, must not be
- * null
- * @return result score
- * @throws IllegalArgumentException if either String input {@code null} or
- * Locale input {@code null}
- */
- public Integer fuzzyScore(final CharSequence term, final CharSequence query) {
- if (term == null || query == null) {
- throw new IllegalArgumentException("Strings must not be null");
- }
-
- // fuzzy logic is case insensitive. We normalize the Strings to lower
- // case right from the start. Turning characters to lower case
- // via Character.toLowerCase(char) is unfortunately insufficient
- // as it does not accept a locale.
- final String termLowerCase = term.toString().toLowerCase(locale);
- final String queryLowerCase = query.toString().toLowerCase(locale);
-
- // the resulting score
- int score = 0;
-
- // the position in the term which will be scanned next for potential
- // query character matches
- int termIndex = 0;
-
- // index of the previously matched character in the term
- int previousMatchingCharacterIndex = Integer.MIN_VALUE;
-
- for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) {
- final char queryChar = queryLowerCase.charAt(queryIndex);
-
- boolean termCharacterMatchFound = false;
- for (; termIndex < termLowerCase.length()
- && !termCharacterMatchFound; termIndex++) {
- final char termChar = termLowerCase.charAt(termIndex);
-
- if (queryChar == termChar) {
- // simple character matches result in one point
- score++;
-
- // subsequent character matches further improve
- // the score.
- if (previousMatchingCharacterIndex + 1 == termIndex) {
- score += 2;
- }
-
- previousMatchingCharacterIndex = termIndex;
-
- // we can leave the nested loop. Every character in the
- // query can match at most one character in the term.
- termCharacterMatchFound = true;
- }
- }
- }
-
- return score;
- }
-
- /**
- * Gets the locale.
- *
- * @return the locale
- */
- public Locale getLocale() {
- return locale;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/HammingDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/similarity/HammingDistance.java b/src/main/java/org/apache/commons/text/beta/similarity/HammingDistance.java
deleted file mode 100644
index 2ab73ea..0000000
--- a/src/main/java/org/apache/commons/text/beta/similarity/HammingDistance.java
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.similarity;
-
-/**
- * The hamming distance between two strings of equal length is the number of
- * positions at which the corresponding symbols are different.
- *
- * <p>
- * For further explanation about the Hamming Distance, take a look at its
- * Wikipedia page at http://en.wikipedia.org/wiki/Hamming_distance.
- * </p>
- *
- * @since 1.0
- */
-public class HammingDistance implements EditDistance<Integer> {
-
- /**
- * Find the Hamming Distance between two strings with the same
- * length.
- *
- * <p>The distance starts with zero, and for each occurrence of a
- * different character in either String, it increments the distance
- * by 1, and finally return its value.</p>
- *
- * <p>Since the Hamming Distance can only be calculated between strings of equal length, input of different lengths
- * will throw IllegalArgumentException</p>
- *
- * <pre>
- * distance.apply("", "") = 0
- * distance.apply("pappa", "pappa") = 0
- * distance.apply("1011101", "1011111") = 1
- * distance.apply("ATCG", "ACCC") = 2
- * distance.apply("karolin", "kerstin" = 3
- * </pre>
- *
- * @param left the first CharSequence, must not be null
- * @param right the second CharSequence, must not be null
- * @return distance
- * @throws IllegalArgumentException if either input is {@code null} or
- * if they do not have the same length
- */
- @Override
- public Integer apply(final CharSequence left, final CharSequence right) {
- if (left == null || right == null) {
- throw new IllegalArgumentException("Strings must not be null");
- }
-
- if (left.length() != right.length()) {
- throw new IllegalArgumentException("Strings must have the same length");
- }
-
- int distance = 0;
-
- for (int i = 0; i < left.length(); i++) {
- if (left.charAt(i) != right.charAt(i)) {
- distance++;
- }
- }
-
- return distance;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/JaccardDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/similarity/JaccardDistance.java b/src/main/java/org/apache/commons/text/beta/similarity/JaccardDistance.java
deleted file mode 100644
index 6dcfbd4..0000000
--- a/src/main/java/org/apache/commons/text/beta/similarity/JaccardDistance.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.similarity;
-
-/**
- * Measures the Jaccard distance of two sets of character sequence. Jaccard
- * distance is the dissimilarity between two sets. Its the complementary of
- * Jaccard similarity.
- *
- * <p>
- * For further explanation about Jaccard Distance, refer
- * https://en.wikipedia.org/wiki/Jaccard_index
- * </p>
- *
- * @since 1.0
- */
-public class JaccardDistance implements EditDistance<Double> {
-
- /**
- * We normalize the jaccardSimilarity for the purpose of computing the distance.
- */
- private final JaccardSimilarity jaccardSimilarity = new JaccardSimilarity();
-
- /**
- * Calculates Jaccard distance of two set character sequence passed as
- * input. Calculates Jaccard similarity and returns the complement of it.
- *
- * @param left first character sequence
- * @param right second character sequence
- * @return index
- * @throws IllegalArgumentException
- * if either String input {@code null}
- */
- @Override
- public Double apply(CharSequence left, CharSequence right) {
- if (left == null || right == null) {
- throw new IllegalArgumentException("Input cannot be null");
- }
- return Math.round((1 - jaccardSimilarity.apply(left, right)) * 100d) / 100d;
- }
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/JaccardSimilarity.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/similarity/JaccardSimilarity.java b/src/main/java/org/apache/commons/text/beta/similarity/JaccardSimilarity.java
deleted file mode 100644
index 42da85d..0000000
--- a/src/main/java/org/apache/commons/text/beta/similarity/JaccardSimilarity.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.similarity;
-
-import java.util.HashSet;
-import java.util.Set;
-
-/**
- * Measures the Jaccard similarity (aka Jaccard index) of two sets of character
- * sequence. Jaccard similarity is the size of the intersection divided by the
- * size of the union of the two sets.
- *
- * <p>
- * For further explanation about Jaccard Similarity, refer
- * https://en.wikipedia.org/wiki/Jaccard_index
- * </p>
- *
- * @since 1.0
- */
-public class JaccardSimilarity implements SimilarityScore<Double> {
-
- /**
- * Calculates Jaccard Similarity of two set character sequence passed as
- * input.
- *
- * @param left first character sequence
- * @param right second character sequence
- * @return index
- * @throws IllegalArgumentException
- * if either String input {@code null}
- */
- @Override
- public Double apply(CharSequence left, CharSequence right) {
- if (left == null || right == null) {
- throw new IllegalArgumentException("Input cannot be null");
- }
- return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 100d;
- }
-
- /**
- * Calculates Jaccard Similarity of two character sequences passed as
- * input. Does the calculation by identifying the union (characters in at
- * least one of the two sets) of the two sets and intersection (characters
- * which are present in set one which are present in set two)
- *
- * @param left first character sequence
- * @param right second character sequence
- * @return index
- */
- private Double calculateJaccardSimilarity(CharSequence left, CharSequence right) {
- Set<String> intersectionSet = new HashSet<String>();
- Set<String> unionSet = new HashSet<String>();
- boolean unionFilled = false;
- int leftLength = left.length();
- int rightLength = right.length();
- if (leftLength == 0 || rightLength == 0) {
- return 0d;
- }
-
- for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) {
- unionSet.add(String.valueOf(left.charAt(leftIndex)));
- for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) {
- if (!unionFilled) {
- unionSet.add(String.valueOf(right.charAt(rightIndex)));
- }
- if (left.charAt(leftIndex) == right.charAt(rightIndex)) {
- intersectionSet.add(String.valueOf(left.charAt(leftIndex)));
- }
- }
- unionFilled = true;
- }
- return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size());
- }
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/JaroWinklerDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/similarity/JaroWinklerDistance.java b/src/main/java/org/apache/commons/text/beta/similarity/JaroWinklerDistance.java
deleted file mode 100644
index 7d1718b..0000000
--- a/src/main/java/org/apache/commons/text/beta/similarity/JaroWinklerDistance.java
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.beta.similarity;
-
-import java.util.Arrays;
-
-/**
- * A similarity algorithm indicating the percentage of matched characters between two character sequences.
- *
- * <p>
- * The Jaro measure is the weighted sum of percentage of matched characters
- * from each file and transposed characters. Winkler increased this measure
- * for matching initial characters.
- * </p>
- *
- * <p>
- * This implementation is based on the Jaro Winkler similarity algorithm
- * from <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
- * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
- * </p>
- *
- * <p>
- * This code has been adapted from Apache Commons Lang 3.3.
- * </p>
- *
- * @since 1.0
- */
-public class JaroWinklerDistance implements SimilarityScore<Double> {
-
- /**
- * Represents a failed index search.
- */
- public static final int INDEX_NOT_FOUND = -1;
-
- /**
- * Find the Jaro Winkler Distance which indicates the similarity score
- * between two CharSequences.
- *
- * <pre>
- * distance.apply(null, null) = IllegalArgumentException
- * distance.apply("","") = 0.0
- * distance.apply("","a") = 0.0
- * distance.apply("aaapppp", "") = 0.0
- * distance.apply("frog", "fog") = 0.93
- * distance.apply("fly", "ant") = 0.0
- * distance.apply("elephant", "hippo") = 0.44
- * distance.apply("hippo", "elephant") = 0.44
- * distance.apply("hippo", "zzzzzzzz") = 0.0
- * distance.apply("hello", "hallo") = 0.88
- * distance.apply("ABC Corporation", "ABC Corp") = 0.93
- * distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.95
- * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
- * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88
- * </pre>
- *
- * @param left the first String, must not be null
- * @param right the second String, must not be null
- * @return result distance
- * @throws IllegalArgumentException if either String input {@code null}
- */
- @Override
- public Double apply(final CharSequence left, final CharSequence right) {
- final double defaultScalingFactor = 0.1;
- final double percentageRoundValue = 100.0;
-
- if (left == null || right == null) {
- throw new IllegalArgumentException("Strings must not be null");
- }
-
- int[] mtp = matches(left, right);
- double m = mtp[0];
- if (m == 0) {
- return 0D;
- }
- double j = ((m / left.length() + m / right.length() + (m - mtp[1]) / m)) / 3;
- double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j);
- return Math.round(jw * percentageRoundValue) / percentageRoundValue;
- }
-
- /**
- * This method returns the Jaro-Winkler string matches, transpositions, prefix, max array.
- *
- * @param first the first string to be matched
- * @param second the second string to be machted
- * @return mtp array containing: matches, transpositions, prefix, and max length
- */
- protected static int[] matches(final CharSequence first, final CharSequence second) {
- CharSequence max, min;
- if (first.length() > second.length()) {
- max = first;
- min = second;
- } else {
- max = second;
- min = first;
- }
- int range = Math.max(max.length() / 2 - 1, 0);
- int[] matchIndexes = new int[min.length()];
- Arrays.fill(matchIndexes, -1);
- boolean[] matchFlags = new boolean[max.length()];
- int matches = 0;
- for (int mi = 0; mi < min.length(); mi++) {
- char c1 = min.charAt(mi);
- for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
- if (!matchFlags[xi] && c1 == max.charAt(xi)) {
- matchIndexes[mi] = xi;
- matchFlags[xi] = true;
- matches++;
- break;
- }
- }
- }
- char[] ms1 = new char[matches];
- char[] ms2 = new char[matches];
- for (int i = 0, si = 0; i < min.length(); i++) {
- if (matchIndexes[i] != -1) {
- ms1[si] = min.charAt(i);
- si++;
- }
- }
- for (int i = 0, si = 0; i < max.length(); i++) {
- if (matchFlags[i]) {
- ms2[si] = max.charAt(i);
- si++;
- }
- }
- int transpositions = 0;
- for (int mi = 0; mi < ms1.length; mi++) {
- if (ms1[mi] != ms2[mi]) {
- transpositions++;
- }
- }
- int prefix = 0;
- for (int mi = 0; mi < min.length(); mi++) {
- if (first.charAt(mi) == second.charAt(mi)) {
- prefix++;
- } else {
- break;
- }
- }
- return new int[] { matches, transpositions / 2, prefix, max.length() };
- }
-
-}