You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ki...@apache.org on 2015/04/12 13:25:58 UTC

[text] [SANDBOX-490] Add cosine similarity and cosine distance. Strings are converted into vectors using a simple regex tokenizer. Vectors are used to get the cosine similarity and, finally, the distance is equal to 1.0 - the similarity.

Repository: commons-text
Updated Branches:
  refs/heads/master 980791b69 -> 3ccdabb1e


[SANDBOX-490] Add cosine similarity and cosine distance. Strings are converted into vectors using
a simple regex tokenizer. Vectors are used to get the cosine similarity and, finally, the
distance is equal to 1.0 - the similarity.

New classes used by the cosine distance and the cosine similarity had to be created (including
the tokenizers). These classes were put in a new internal package, annotated accordingly.


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/3ccdabb1
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/3ccdabb1
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/3ccdabb1

Branch: refs/heads/master
Commit: 3ccdabb1e27bd41bf88e5b623136481d989b2b0e
Parents: 980791b
Author: Bruno P. Kinoshita <ki...@apache.org>
Authored: Sat Feb 14 14:23:42 2015 -0200
Committer: Bruno P. Kinoshita <br...@yahoo.com.br>
Committed: Sun Apr 12 23:25:06 2015 +1200

----------------------------------------------------------------------
 pom.xml                                         |   4 +
 .../commons/text/similarity/CosineDistance.java |  54 +++++
 .../text/similarity/CosineSimilarity.java       | 101 +++++++++
 .../text/similarity/JaccardDistance.java        |  21 ++
 .../commons/text/similarity/JaccardIndex.java   |  21 ++
 .../text/similarity/internal/Counter.java       |  61 ++++++
 .../similarity/internal/RegexTokenizer.java     |  52 +++++
 .../text/similarity/internal/Tokenizer.java     |  35 ++++
 .../text/similarity/internal/package-info.java  |  23 +++
 .../commons/text/checkstyle-suppressions.xml    |  28 +++
 .../org/apache/commons/text/checkstyle.xml      | 206 +++++++++++++++++++
 .../org/apache/commons/text/fb-excludes.xml     |  19 ++
 .../org/apache/commons/text/license-header.txt  |  16 ++
 .../text/similarity/CosineDistanceTest.java     |  72 +++++++
 14 files changed, 713 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 2f46e9a..6b2a50f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -150,6 +150,10 @@
         <version>2.13</version>
         <configuration>
           <enableRulesSummary>false</enableRulesSummary>
+          <configLocation>${basedir}/src/main/resources/org/apache/commons/text/checkstyle.xml</configLocation>
+          <headerLocation>${basedir}/src/main/resources/org/apache/commons/text/license-header.txt</headerLocation>
+          <suppressionsLocation>${basedir}/src/main/resources/org/apache/commons/text/checkstyle-suppressions.xml</suppressionsLocation>
+          <suppressionsFileExpression>${basedir}/src/main/resources/org/apache/commons/text/checkstyle-suppressions.xml</suppressionsFileExpression>
         </configuration>
         <reportSets>
           <reportSet>

http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
new file mode 100644
index 0000000..69cfad7
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.Map;
+
+import org.apache.commons.text.similarity.internal.Counter;
+import org.apache.commons.text.similarity.internal.RegexTokenizer;
+import org.apache.commons.text.similarity.internal.Tokenizer;
+
+/**
+ * Measures the cosine distance between two character sequences.
+ *
+ * <p>Each sequence is split into tokens by a {@link RegexTokenizer}, the tokens are counted into a
+ * vector, and the result is {@code 1.0} minus the {@link CosineSimilarity} of the two vectors.</p>
+ *
+ * @since 0.1
+ */
+public class CosineDistance implements StringMetric<Double> {
+    /**
+     * Tokenizer that splits each character sequence into the tokens of its vector.
+     */
+    private final Tokenizer<CharSequence> tokenizer = new RegexTokenizer();
+    /**
+     * Similarity measure whose result is subtracted from 1.0 to obtain the distance.
+     */
+    private final CosineSimilarity cosineSimilarity = new CosineSimilarity();
+
+    @Override
+    public Double compare(CharSequence left, CharSequence right) {
+        CharSequence[] leftTokens = tokenizer.tokenize(left);
+        CharSequence[] rightTokens = tokenizer.tokenize(right);
+
+        Map<CharSequence, Integer> leftVector = Counter.of(leftTokens);
+        Map<CharSequence, Integer> rightVector = Counter.of(rightTokens);
+        double similarity = cosineSimilarity.cosineSimilarity(leftVector, rightVector);
+        return 1.0 - similarity;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
new file mode 100644
index 0000000..4b29a04
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Measures the cosine similarity of two vectors, that is, the cosine of the
+ * angle between them. Vectors are given as maps from a key to an integer component.
+ *
+ * <p>
+ * For further explanation about the Cosine Similarity, refer to
+ * http://en.wikipedia.org/wiki/Cosine_similarity.
+ * </p>
+ *
+ * @since 0.1
+ */
+public class CosineSimilarity {
+
+    /**
+     * Calculates the cosine similarity for two given vectors.
+     *
+     * @param leftVector left vector, must not be {@code null}
+     * @param rightVector right vector, must not be {@code null}
+     * @return the cosine similarity, or {@code 0.0} when either vector's sum of squares is not positive
+     */
+    public Double cosineSimilarity(Map<CharSequence, Integer> leftVector, Map<CharSequence, Integer> rightVector) {
+        if (leftVector == null || rightVector == null) {
+            throw new IllegalArgumentException("Vectors must not be null");
+        }
+
+        Set<CharSequence> intersection = getIntersection(leftVector, rightVector);
+
+        double dotProduct = dot(leftVector, rightVector, intersection);
+        double d1 = 0.0d;
+        for (Integer value : leftVector.values()) {
+            d1 += Math.pow(value, 2);
+        }
+        double d2 = 0.0d;
+        for (Integer value : rightVector.values()) {
+            d2 += Math.pow(value, 2);
+        }
+        double cosineSimilarity;
+        if (d1 <= 0.0 || d2 <= 0.0) {
+            cosineSimilarity = 0.0;
+        } else {
+            cosineSimilarity = (double) (dotProduct / (double) (Math.sqrt(d1) * Math.sqrt(d2)));
+        }
+        return cosineSimilarity;
+    }
+
+    /**
+     * Returns the keys that are present in both of the given maps.
+     *
+     * @param leftVector left vector map
+     * @param rightVector right vector map
+     * @return a new set holding the keys common to both maps
+     */
+    private Set<CharSequence> getIntersection(Map<CharSequence, Integer> leftVector,
+            Map<CharSequence, Integer> rightVector) {
+        Set<CharSequence> intersection = new HashSet<CharSequence>(leftVector.keySet());
+        intersection.retainAll(rightVector.keySet());
+        return intersection;
+    }
+
+    /**
+     * Computes the dot product of two vectors by summing, for every key in the intersection,
+     * the product of the two values stored under that key. NOTE(review): each product is an
+     * int multiplication and may overflow before it is added to the long accumulator.
+     *
+     * @param leftVector left vector
+     * @param rightVector right vector
+     * @param intersection common elements
+     * @return the dot product
+     */
+    private double dot(Map<CharSequence, Integer> leftVector, Map<CharSequence, Integer> rightVector,
+            Set<CharSequence> intersection) {
+        long dotProduct = 0;
+        for (CharSequence key : intersection) {
+            dotProduct += leftVector.get(key) * rightVector.get(key);
+        }
+        return dotProduct;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java b/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java
new file mode 100644
index 0000000..7237244
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+public class JaccardDistance {
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/JaccardIndex.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardIndex.java b/src/main/java/org/apache/commons/text/similarity/JaccardIndex.java
new file mode 100644
index 0000000..ceaa468
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/JaccardIndex.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+public class JaccardIndex {
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/internal/Counter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/Counter.java b/src/main/java/org/apache/commons/text/similarity/internal/Counter.java
new file mode 100644
index 0000000..1f2f0d9
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/internal/Counter.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity.internal;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Java counterpart of the {@code Counter} class in Python's collections module.
+ *
+ * <p>It counts how many times each element provided occurred in an array and
+ * returns a map with the element as key and the count as value.</p>
+ *
+ * @see <a href="https://docs.python.org/dev/library/collections.html#collections.Counter">
+ * https://docs.python.org/dev/library/collections.html#collections.Counter</a>
+ * @since 0.1
+ */
+public final class Counter {
+
+    /**
+     * Hidden constructor; the class only exposes the static {@link #of(CharSequence[])} method.
+     */
+    private Counter() {
+        super();
+    }
+
+    /**
+     * Counts how many times each token occurs in the given array and returns
+     * a new map with the token as key and its number of occurrences as value.
+     *
+     * @param tokens array of tokens
+     * @return map where each distinct token is a key and its count the value
+     */
+    public static Map<CharSequence, Integer> of(CharSequence[] tokens) {
+        final Map<CharSequence, Integer> innerCounter = new HashMap<CharSequence, Integer>();
+        for (CharSequence token : tokens) {
+            if (innerCounter.containsKey(token)) {
+                int value = innerCounter.get(token);
+                innerCounter.put(token, ++value);
+            } else {
+                innerCounter.put(token, 1);
+            }
+        }
+        return innerCounter;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java
new file mode 100644
index 0000000..cf49536
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity.internal;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A simple word tokenizer that utilizes regex to find words. It applies the regex
+ * {@code (\w)+} over the input text and returns every match, in order of appearance,
+ * as one token.
+ *
+ * @since 0.1
+ */
+public class RegexTokenizer implements Tokenizer<CharSequence> {
+
+    /**
+     * {@inheritDoc}
+     *
+     * @throws IllegalArgumentException if the input text is {@code null} or trims to an empty string
+     */
+    @Override
+    public CharSequence[] tokenize(CharSequence text) {
+        if (text == null || text.toString().trim().equals("")) {
+            throw new IllegalArgumentException("Invalid text");
+        }
+        Pattern pattern = Pattern.compile("(\\w)+");
+        Matcher matcher = pattern.matcher(text.toString());
+        List<String> tokens = new ArrayList<String>();
+        while (matcher.find()) {
+            tokens.add(matcher.group(0));
+        }
+        return tokens.toArray(new String[0]);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java
new file mode 100644
index 0000000..0c85503
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity.internal;
+
+/**
+ * A tokenizer. Produces an array of tokens of a given type from a character sequence.
+ *
+ * @param <T> type of the tokens produced
+ * @since 0.1
+ */
+public interface Tokenizer<T> {
+
+    /**
+     * Splits the given text into an array of tokens.
+     *
+     * @param text input text
+     * @return array of tokens
+     */
+    T[] tokenize(CharSequence text);
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/internal/package-info.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/package-info.java b/src/main/java/org/apache/commons/text/similarity/internal/package-info.java
new file mode 100644
index 0000000..548e2b7
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/internal/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * Classes used internally by similarity algorithms. Internal use only, backward compatibility
+ * not guaranteed.
+ *
+ * @since 0.1
+ */
+package org.apache.commons.text.similarity.internal;

http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/resources/org/apache/commons/text/checkstyle-suppressions.xml
----------------------------------------------------------------------
diff --git a/src/main/resources/org/apache/commons/text/checkstyle-suppressions.xml b/src/main/resources/org/apache/commons/text/checkstyle-suppressions.xml
new file mode 100644
index 0000000..7f310bd
--- /dev/null
+++ b/src/main/resources/org/apache/commons/text/checkstyle-suppressions.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!DOCTYPE suppressions PUBLIC
+     "-//Puppy Crawl//DTD Suppressions 1.0//EN"
+     "http://www.puppycrawl.com/dtds/suppressions_1_0.dtd">
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<suppressions>
+  <suppress checks="AvoidInlineConditionals" files=".+\.java" lines="0-9999" />
+  <suppress checks="FinalParameters" files=".+\.java" lines="0-9999" />
+  <suppress checks="HiddenField" files=".+\.java" lines="0-9999" />
+  <suppress checks="DesignForExtension" files=".+\.java" lines="0-9999" />
+  <suppress checks="HideUtilityClassConstructor" files=".+\.java" lines="0-9999" />
+  <suppress checks="VisibilityModifier" files=".+\.java" lines="0-9999" />
+</suppressions>

http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/resources/org/apache/commons/text/checkstyle.xml
----------------------------------------------------------------------
diff --git a/src/main/resources/org/apache/commons/text/checkstyle.xml b/src/main/resources/org/apache/commons/text/checkstyle.xml
new file mode 100644
index 0000000..6afc1dd
--- /dev/null
+++ b/src/main/resources/org/apache/commons/text/checkstyle.xml
@@ -0,0 +1,206 @@
+<?xml version="1.0"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<!DOCTYPE module PUBLIC
+    "-//Puppy Crawl//DTD Check Configuration 1.2//EN"
+    "http://www.puppycrawl.com/dtds/configuration_1_2.dtd">
+
+<!--
+
+  Checkstyle configuration that checks the sun coding conventions from:
+
+    - the Java Language Specification at
+      http://java.sun.com/docs/books/jls/second_edition/html/index.html
+
+    - the Sun Code Conventions at http://java.sun.com/docs/codeconv/
+
+    - the Javadoc guidelines at
+      http://java.sun.com/j2se/javadoc/writingdoccomments/index.html
+
+    - the JDK Api documentation http://java.sun.com/j2se/docs/api/index.html
+
+    - some best practices
+
+  Checkstyle is very configurable. Be sure to read the documentation at
+  http://checkstyle.sf.net (or in your downloaded distribution).
+
+  Most Checks are configurable, be sure to consult the documentation.
+
+  To completely disable a check, just comment it out or delete it from the file.
+
+  Finally, it is worth reading the documentation.
+
+-->
+
+<module name="Checker">
+    <!--
+        If you set the basedir property below, then all reported file
+        names will be relative to the specified directory. See
+        http://checkstyle.sourceforge.net/5.x/config.html#Checker
+
+        <property name="basedir" value="${basedir}"/>
+    -->
+
+    <!-- Checks that each Java package has a Javadoc file used for commenting. -->
+    <!-- See http://checkstyle.sf.net/config_javadoc.html#JavadocPackage       -->
+    <module name="JavadocPackage">
+      <property name="allowLegacy" value="true"/>
+    </module>
+
+    <!-- Checks whether files end with a new line.                        -->
+    <!-- See http://checkstyle.sf.net/config_misc.html#NewlineAtEndOfFile -->
+    <module name="NewlineAtEndOfFile"/>
+
+    <!-- Checks that property files contain the same keys.         -->
+    <!-- See http://checkstyle.sf.net/config_misc.html#Translation -->
+    <module name="Translation"/>
+
+    <module name="FileLength"/>
+
+    <!-- Following interprets the header file as regular expressions. -->
+    <!-- <module name="RegexpHeader"/>                                -->
+
+    <module name="FileTabCharacter">
+        <property name="eachLine" value="true"/>
+    </module>
+
+    <module name="RegexpSingleline">
+        <!-- \s matches whitespace character, $ matches end of line. -->
+        <property name="format" value="\s+$"/>
+        <property name="message" value="Line has trailing spaces."/>
+    </module>
+
+    <module name="TreeWalker">
+
+        <property name="cacheFile" value="${checkstyle.cache.file}"/>
+
+        <!-- Checks for Javadoc comments.                     -->
+        <!-- See http://checkstyle.sf.net/config_javadoc.html -->
+        <module name="JavadocMethod">
+            <property name="allowUndeclaredRTE" value="true"/>
+        </module>
+        <module name="JavadocType"/>
+        <module name="JavadocVariable"/>
+        <module name="JavadocStyle"/>
+
+
+        <!-- Checks for Naming Conventions.                  -->
+        <!-- See http://checkstyle.sf.net/config_naming.html -->
+        <module name="ConstantName"/>
+        <module name="LocalFinalVariableName"/>
+        <module name="LocalVariableName"/>
+        <module name="MemberName"/>
+        <module name="MethodName"/>
+        <module name="PackageName"/>
+        <module name="ParameterName"/>
+        <module name="StaticVariableName"/>
+        <module name="TypeName"/>
+
+
+        <!-- Checks for Headers                                -->
+        <!-- See http://checkstyle.sf.net/config_header.html   -->
+        <!-- <module name="Header">                            -->
+            <!-- The following property value demonstrates the ability  -->
+            <!-- to have access to ANT properties. In this case it uses -->
+            <!-- the ${basedir} property to allow Checkstyle to be run  -->
+            <!-- from any directory within a project. See property      -->
+            <!-- expansion,                                             -->
+            <!-- http://checkstyle.sf.net/config.html#properties        -->
+            <!-- <property                                              -->
+            <!--     name="headerFile"                                  -->
+            <!--     value="${basedir}/java.header"/>                   -->
+        <!-- </module> -->
+
+
+        <!-- Checks for imports                              -->
+        <!-- See http://checkstyle.sf.net/config_import.html -->
+        <module name="AvoidStarImport"/>
+        <module name="IllegalImport"/> <!-- defaults to sun.* packages -->
+        <module name="RedundantImport"/>
+        <module name="UnusedImports"/>
+
+
+        <!-- Checks for Size Violations.                    -->
+        <!-- See http://checkstyle.sf.net/config_sizes.html -->
+        <module name="LineLength">
+            <property name="max" value="120"/>
+        </module>
+        <module name="MethodLength"/>
+        <module name="ParameterNumber"/>
+
+
+        <!-- Checks for whitespace                               -->
+        <!-- See http://checkstyle.sf.net/config_whitespace.html -->
+        <module name="EmptyForIteratorPad"/>
+        <module name="MethodParamPad"/>
+        <module name="NoWhitespaceAfter"/>
+        <module name="NoWhitespaceBefore"/>
+        <module name="OperatorWrap"/>
+        <module name="ParenPad"/>
+        <module name="TypecastParenPad"/>
+        <module name="WhitespaceAfter"/>
+        <module name="WhitespaceAround"/>
+
+
+        <!-- Modifier Checks                                    -->
+        <!-- See http://checkstyle.sf.net/config_modifiers.html -->
+        <module name="ModifierOrder"/>
+        <module name="RedundantModifier"/>
+
+
+        <!-- Checks for blocks. You know, those {}'s         -->
+        <!-- See http://checkstyle.sf.net/config_blocks.html -->
+        <module name="AvoidNestedBlocks"/>
+        <module name="EmptyBlock"/>
+        <module name="LeftCurly"/>
+        <module name="NeedBraces"/>
+        <module name="RightCurly"/>
+
+
+        <!-- Checks for common coding problems               -->
+        <!-- See http://checkstyle.sf.net/config_coding.html -->
+        <module name="AvoidInlineConditionals"/>
+        <module name="EmptyStatement"/>
+        <module name="EqualsHashCode"/>
+        <module name="HiddenField"/>
+        <module name="IllegalInstantiation"/>
+        <module name="InnerAssignment"/>
+        <module name="MagicNumber"/>
+        <module name="MissingSwitchDefault"/>
+        <module name="RedundantThrows"/>
+        <module name="SimplifyBooleanExpression"/>
+        <module name="SimplifyBooleanReturn"/>
+
+        <!-- Checks for class design                         -->
+        <!-- See http://checkstyle.sf.net/config_design.html -->
+        <module name="DesignForExtension"/>
+        <module name="FinalClass"/>
+        <module name="HideUtilityClassConstructor"/>
+        <module name="InterfaceIsType"/>
+        <module name="VisibilityModifier"/>
+
+
+        <!-- Miscellaneous other checks.                   -->
+        <!-- See http://checkstyle.sf.net/config_misc.html -->
+        <module name="ArrayTypeStyle"/>
+        <module name="FinalParameters"/>
+        <module name="TodoComment"/>
+        <module name="UpperEll"/>
+
+    </module>
+
+</module>

http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/resources/org/apache/commons/text/fb-excludes.xml
----------------------------------------------------------------------
diff --git a/src/main/resources/org/apache/commons/text/fb-excludes.xml b/src/main/resources/org/apache/commons/text/fb-excludes.xml
new file mode 100644
index 0000000..55abc26
--- /dev/null
+++ b/src/main/resources/org/apache/commons/text/fb-excludes.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<FindBugsFilter>
+</FindBugsFilter>

http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/resources/org/apache/commons/text/license-header.txt
----------------------------------------------------------------------
diff --git a/src/main/resources/org/apache/commons/text/license-header.txt b/src/main/resources/org/apache/commons/text/license-header.txt
new file mode 100644
index 0000000..f974c9a
--- /dev/null
+++ b/src/main/resources/org/apache/commons/text/license-header.txt
@@ -0,0 +1,16 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java
new file mode 100644
index 0000000..0d6879a
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.junit.Assert.assertEquals;
+
+import java.math.BigDecimal;
+import java.math.RoundingMode;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link org.apache.commons.text.similarity.CosineDistance}.
+ *
+ * @since 0.1
+ */
+public class CosineDistanceTest {
+
+    /**
+     * Cosine distance under test, shared by all test methods.
+     */
+    private static CosineDistance cosineDistance;
+
+    /**
+     * Creates the cosine distance object used throughout the tests.
+     */
+    @BeforeClass
+    public static void setUp() {
+        cosineDistance = new CosineDistance();
+    }
+
+    /**
+     * Tests the cosine distance with several inputs, comparing results rounded to two decimals.
+     */
+    @Test
+    public void testCosineDistance() {
+        assertEquals(Double.valueOf(0.5d), roundValue(cosineDistance.compare("the house", "da house")));
+        assertEquals(Double.valueOf(0.0d), roundValue(cosineDistance.compare("AB", "AB")));
+        assertEquals(Double.valueOf(1.0d), roundValue(cosineDistance.compare("AB", "BA")));
+        assertEquals(Double.valueOf(0.08d), roundValue(cosineDistance.compare(
+                "the boy was from tamana shi, kumamoto ken, and the girl was from rio de janeiro, rio",
+                "the boy was from tamana shi, kumamoto, and the boy was from rio de janeiro, rio de janeiro")));
+    }
+
+    // --- Utility methods
+
+    /**
+     * Rounds a value to two decimal places using {@link RoundingMode#HALF_UP}.
+     *
+     * @param value the value to round; its exact binary value is what gets rounded
+     * @return the value rounded to two decimal places
+     */
+    private Double roundValue(Double value) {
+        return new BigDecimal(value).setScale(2, RoundingMode.HALF_UP).doubleValue();
+    }
+
+}