You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ki...@apache.org on 2015/04/12 13:25:58 UTC
[text] [SANDBOX-490] Add cosine similarity and cosine distance.
Strings are converted into vectors using a simple regex tokenizer. Vectors
are used to get the cosine similarity and, finally,
the distance is equal to 1.0 - the similarity.
Repository: commons-text
Updated Branches:
refs/heads/master 980791b69 -> 3ccdabb1e
[SANDBOX-490] Add cosine similarity and cosine distance. Strings are converted into vectors using
a simple regex tokenizer. Vectors are used to get the cosine similarity and, finally, the
distance is equal to 1.0 - the similarity.
New classes used by the cosine distance and the cosine similarity had to be created (including
the tokenizers). These classes were put in a new internal package, annotated accordingly.
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/3ccdabb1
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/3ccdabb1
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/3ccdabb1
Branch: refs/heads/master
Commit: 3ccdabb1e27bd41bf88e5b623136481d989b2b0e
Parents: 980791b
Author: Bruno P. Kinoshita <ki...@apache.org>
Authored: Sat Feb 14 14:23:42 2015 -0200
Committer: Bruno P. Kinoshita <br...@yahoo.com.br>
Committed: Sun Apr 12 23:25:06 2015 +1200
----------------------------------------------------------------------
pom.xml | 4 +
.../commons/text/similarity/CosineDistance.java | 54 +++++
.../text/similarity/CosineSimilarity.java | 101 +++++++++
.../text/similarity/JaccardDistance.java | 21 ++
.../commons/text/similarity/JaccardIndex.java | 21 ++
.../text/similarity/internal/Counter.java | 61 ++++++
.../similarity/internal/RegexTokenizer.java | 52 +++++
.../text/similarity/internal/Tokenizer.java | 35 ++++
.../text/similarity/internal/package-info.java | 23 +++
.../commons/text/checkstyle-suppressions.xml | 28 +++
.../org/apache/commons/text/checkstyle.xml | 206 +++++++++++++++++++
.../org/apache/commons/text/fb-excludes.xml | 19 ++
.../org/apache/commons/text/license-header.txt | 16 ++
.../text/similarity/CosineDistanceTest.java | 72 +++++++
14 files changed, 713 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 2f46e9a..6b2a50f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -150,6 +150,10 @@
<version>2.13</version>
<configuration>
<enableRulesSummary>false</enableRulesSummary>
+ <configLocation>${basedir}/src/main/resources/org/apache/commons/text/checkstyle.xml</configLocation>
+ <headerLocation>${basedir}/src/main/resources/org/apache/commons/text/license-header.txt</headerLocation>
+ <suppressionsLocation>${basedir}/src/main/resources/org/apache/commons/text/checkstyle-suppressions.xml</suppressionsLocation>
+ <suppressionsFileExpression>${basedir}/src/main/resources/org/apache/commons/text/checkstyle-suppressions.xml</suppressionsFileExpression>
</configuration>
<reportSets>
<reportSet>
http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
new file mode 100644
index 0000000..69cfad7
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.Map;
+
+import org.apache.commons.text.similarity.internal.Counter;
+import org.apache.commons.text.similarity.internal.RegexTokenizer;
+import org.apache.commons.text.similarity.internal.Tokenizer;
+
+/**
+ * Measures the cosine distance between two character sequences.
+ *
+ * <p>It utilizes the CosineSimilarity to compute the distance. Character sequences
+ * are converted into vectors through a simple tokenizer that uses a regular expression to extract words.</p>
+ *
+ * @since 0.1
+ */
+public class CosineDistance implements StringMetric<Double> {
+ /**
+ * Tokenizer used to convert the character sequence into a vector.
+ */
+ private final Tokenizer<CharSequence> tokenizer = new RegexTokenizer();
+ /**
+ * Cosine similarity.
+ */
+ private final CosineSimilarity cosineSimilarity = new CosineSimilarity();
+
+ @Override
+ public Double compare(CharSequence left, CharSequence right) {
+ CharSequence[] leftTokens = tokenizer.tokenize(left);
+ CharSequence[] rightTokens = tokenizer.tokenize(right);
+
+ Map<CharSequence, Integer> leftVector = Counter.of(leftTokens);
+ Map<CharSequence, Integer> rightVector = Counter.of(rightTokens);
+ double similarity = cosineSimilarity.cosineSimilarity(leftVector, rightVector);
+ return 1.0 - similarity;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
new file mode 100644
index 0000000..4b29a04
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Measures the Cosine similarity of two vectors of an inner product space and
+ * compares the angle between them.
+ *
+ * <p>
+ * For further explanation about the Cosine Similarity, refer to
+ * http://en.wikipedia.org/wiki/Cosine_similarity.
+ * </p>
+ *
+ * @since 0.1
+ */
+public class CosineSimilarity {
+
+ /**
+ * Calculates the cosine similarity for two given vectors.
+ *
+ * @param leftVector left vector
+ * @param rightVector right vector
+ * @return cosine similarity between the two vectors
+ */
+ public Double cosineSimilarity(Map<CharSequence, Integer> leftVector, Map<CharSequence, Integer> rightVector) {
+ if (leftVector == null || rightVector == null) {
+ throw new IllegalArgumentException("Vectors must not be null");
+ }
+
+ Set<CharSequence> intersection = getIntersection(leftVector, rightVector);
+
+ double dotProduct = dot(leftVector, rightVector, intersection);
+ double d1 = 0.0d;
+ for (Integer value : leftVector.values()) {
+ d1 += Math.pow(value, 2);
+ }
+ double d2 = 0.0d;
+ for (Integer value : rightVector.values()) {
+ d2 += Math.pow(value, 2);
+ }
+ double cosineSimilarity;
+ if (d1 <= 0.0 || d2 <= 0.0) {
+ cosineSimilarity = 0.0;
+ } else {
+ cosineSimilarity = (double) (dotProduct / (double) (Math.sqrt(d1) * Math.sqrt(d2)));
+ }
+ return cosineSimilarity;
+ }
+
+ /**
+ * Returns a set with strings common to the two given maps.
+ *
+ * @param leftVector left vector map
+ * @param rightVector right vector map
+ * @return common strings
+ */
+ private Set<CharSequence> getIntersection(Map<CharSequence, Integer> leftVector,
+ Map<CharSequence, Integer> rightVector) {
+ Set<CharSequence> intersection = new HashSet<CharSequence>(leftVector.keySet());
+ intersection.retainAll(rightVector.keySet());
+ return intersection;
+ }
+
+ /**
+ * Computes the dot product of two vectors over the keys they have in common. Keys that
+ * appear in only one of the vectors are ignored, because the missing component counts as
+ * zero and therefore contributes nothing to the dot product.
+ *
+ * @param leftVector left vector
+ * @param rightVector right vector
+ * @param intersection common elements
+ * @return the dot product
+ */
+ private double dot(Map<CharSequence, Integer> leftVector, Map<CharSequence, Integer> rightVector,
+ Set<CharSequence> intersection) {
+ long dotProduct = 0;
+ for (CharSequence key : intersection) {
+ dotProduct += leftVector.get(key) * rightVector.get(key);
+ }
+ return dotProduct;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java b/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java
new file mode 100644
index 0000000..7237244
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+public class JaccardDistance {
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/JaccardIndex.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardIndex.java b/src/main/java/org/apache/commons/text/similarity/JaccardIndex.java
new file mode 100644
index 0000000..ceaa468
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/JaccardIndex.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+public class JaccardIndex {
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/internal/Counter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/Counter.java b/src/main/java/org/apache/commons/text/similarity/internal/Counter.java
new file mode 100644
index 0000000..1f2f0d9
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/internal/Counter.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity.internal;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Java implementation of Python's {@code collections.Counter} class.
+ *
+ * <p>It counts how many times each element occurs in the given array and
+ * returns a map with the element as key and the count as value.</p>
+ *
+ * @see <a href="https://docs.python.org/dev/library/collections.html#collections.Counter">
+ * https://docs.python.org/dev/library/collections.html#collections.Counter</a>
+ * @since 0.1
+ */
+public final class Counter {
+
+ /**
+ * Hidden constructor.
+ */
+ private Counter() {
+ super();
+ }
+
+ /**
+ * Counts how many times each element occurs in the given array and
+ * returns a map with the element as key and the count as value.
+ *
+ * @param tokens array of tokens
+ * @return map where each distinct token is a key and its number of occurrences is the value
+ */
+ public static Map<CharSequence, Integer> of(CharSequence[] tokens) {
+ final Map<CharSequence, Integer> innerCounter = new HashMap<CharSequence, Integer>();
+ for (CharSequence token : tokens) {
+ if (innerCounter.containsKey(token)) {
+ int value = innerCounter.get(token);
+ innerCounter.put(token, ++value);
+ } else {
+ innerCounter.put(token, 1);
+ }
+ }
+ return innerCounter;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java
new file mode 100644
index 0000000..cf49536
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity.internal;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A simple word tokenizer that utilizes regex to find words. It applies a regex
+ * {@code (\w)+} over the input text to extract words from a given character
+ * sequence.
+ *
+ * @since 0.1
+ */
+public class RegexTokenizer implements Tokenizer<CharSequence> {
+
+ /**
+ * {@inheritDoc}
+ *
+ * @throws IllegalArgumentException if the input text is {@code null} or blank
+ */
+ @Override
+ public CharSequence[] tokenize(CharSequence text) {
+ if (text == null || text.toString().trim().equals("")) {
+ throw new IllegalArgumentException("Invalid text");
+ }
+ Pattern pattern = Pattern.compile("(\\w)+");
+ Matcher matcher = pattern.matcher(text.toString());
+ List<String> tokens = new ArrayList<String>();
+ while (matcher.find()) {
+ tokens.add(matcher.group(0));
+ }
+ return tokens.toArray(new String[0]);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java
new file mode 100644
index 0000000..0c85503
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity.internal;
+
+/**
+ * A tokenizer. Produces arrays of tokens of a given type from an input text.
+ *
+ * @param <T> given type
+ * @since 0.1
+ */
+public interface Tokenizer<T> {
+
+ /**
+ * Returns an array of tokens.
+ *
+ * @param text input text
+ * @return array of tokens
+ */
+ T[] tokenize(CharSequence text);
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/java/org/apache/commons/text/similarity/internal/package-info.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/package-info.java b/src/main/java/org/apache/commons/text/similarity/internal/package-info.java
new file mode 100644
index 0000000..548e2b7
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/internal/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * Classes used internally by similarity algorithms. Internal use only, backward compatibility
+ * not guaranteed.
+ *
+ * @since 0.1
+ */
+package org.apache.commons.text.similarity.internal;
http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/resources/org/apache/commons/text/checkstyle-suppressions.xml
----------------------------------------------------------------------
diff --git a/src/main/resources/org/apache/commons/text/checkstyle-suppressions.xml b/src/main/resources/org/apache/commons/text/checkstyle-suppressions.xml
new file mode 100644
index 0000000..7f310bd
--- /dev/null
+++ b/src/main/resources/org/apache/commons/text/checkstyle-suppressions.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!DOCTYPE suppressions PUBLIC
+ "-//Puppy Crawl//DTD Suppressions 1.0//EN"
+ "http://www.puppycrawl.com/dtds/suppressions_1_0.dtd">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<suppressions>
+ <suppress checks="AvoidInlineConditionals" files=".+\.java" lines="0-9999" />
+ <suppress checks="FinalParameters" files=".+\.java" lines="0-9999" />
+ <suppress checks="HiddenField" files=".+\.java" lines="0-9999" />
+ <suppress checks="DesignForExtension" files=".+\.java" lines="0-9999" />
+ <suppress checks="HideUtilityClassConstructor" files=".+\.java" lines="0-9999" />
+ <suppress checks="VisibilityModifier" files=".+\.java" lines="0-9999" />
+</suppressions>
http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/resources/org/apache/commons/text/checkstyle.xml
----------------------------------------------------------------------
diff --git a/src/main/resources/org/apache/commons/text/checkstyle.xml b/src/main/resources/org/apache/commons/text/checkstyle.xml
new file mode 100644
index 0000000..6afc1dd
--- /dev/null
+++ b/src/main/resources/org/apache/commons/text/checkstyle.xml
@@ -0,0 +1,206 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!DOCTYPE module PUBLIC
+ "-//Puppy Crawl//DTD Check Configuration 1.2//EN"
+ "http://www.puppycrawl.com/dtds/configuration_1_2.dtd">
+
+<!--
+
+ Checkstyle configuration that checks the sun coding conventions from:
+
+ - the Java Language Specification at
+ http://java.sun.com/docs/books/jls/second_edition/html/index.html
+
+ - the Sun Code Conventions at http://java.sun.com/docs/codeconv/
+
+ - the Javadoc guidelines at
+ http://java.sun.com/j2se/javadoc/writingdoccomments/index.html
+
+ - the JDK Api documentation http://java.sun.com/j2se/docs/api/index.html
+
+ - some best practices
+
+ Checkstyle is very configurable. Be sure to read the documentation at
+ http://checkstyle.sf.net (or in your downloaded distribution).
+
+ Most Checks are configurable, be sure to consult the documentation.
+
+ To completely disable a check, just comment it out or delete it from the file.
+
+ Finally, it is worth reading the documentation.
+
+-->
+
+<module name="Checker">
+ <!--
+ If you set the basedir property below, then all reported file
+ names will be relative to the specified directory. See
+ http://checkstyle.sourceforge.net/5.x/config.html#Checker
+
+ <property name="basedir" value="${basedir}"/>
+ -->
+
+ <!-- Checks that each Java package has a Javadoc file used for commenting. -->
+ <!-- See http://checkstyle.sf.net/config_javadoc.html#JavadocPackage -->
+ <module name="JavadocPackage">
+ <property name="allowLegacy" value="true"/>
+ </module>
+
+ <!-- Checks whether files end with a new line. -->
+ <!-- See http://checkstyle.sf.net/config_misc.html#NewlineAtEndOfFile -->
+ <module name="NewlineAtEndOfFile"/>
+
+ <!-- Checks that property files contain the same keys. -->
+ <!-- See http://checkstyle.sf.net/config_misc.html#Translation -->
+ <module name="Translation"/>
+
+ <module name="FileLength"/>
+
+ <!-- Following interprets the header file as regular expressions. -->
+ <!-- <module name="RegexpHeader"/> -->
+
+ <module name="FileTabCharacter">
+ <property name="eachLine" value="true"/>
+ </module>
+
+ <module name="RegexpSingleline">
+ <!-- \s matches whitespace character, $ matches end of line. -->
+ <property name="format" value="\s+$"/>
+ <property name="message" value="Line has trailing spaces."/>
+ </module>
+
+ <module name="TreeWalker">
+
+ <property name="cacheFile" value="${checkstyle.cache.file}"/>
+
+ <!-- Checks for Javadoc comments. -->
+ <!-- See http://checkstyle.sf.net/config_javadoc.html -->
+ <module name="JavadocMethod">
+ <property name="allowUndeclaredRTE" value="true"/>
+ </module>
+ <module name="JavadocType"/>
+ <module name="JavadocVariable"/>
+ <module name="JavadocStyle"/>
+
+
+ <!-- Checks for Naming Conventions. -->
+ <!-- See http://checkstyle.sf.net/config_naming.html -->
+ <module name="ConstantName"/>
+ <module name="LocalFinalVariableName"/>
+ <module name="LocalVariableName"/>
+ <module name="MemberName"/>
+ <module name="MethodName"/>
+ <module name="PackageName"/>
+ <module name="ParameterName"/>
+ <module name="StaticVariableName"/>
+ <module name="TypeName"/>
+
+
+ <!-- Checks for Headers -->
+ <!-- See http://checkstyle.sf.net/config_header.html -->
+ <!-- <module name="Header"> -->
+ <!-- The follow property value demonstrates the ability -->
+ <!-- to have access to ANT properties. In this case it uses -->
+ <!-- the ${basedir} property to allow Checkstyle to be run -->
+ <!-- from any directory within a project. See property -->
+ <!-- expansion, -->
+ <!-- http://checkstyle.sf.net/config.html#properties -->
+ <!-- <property -->
+ <!-- name="headerFile" -->
+ <!-- value="${basedir}/java.header"/> -->
+ <!-- </module> -->
+
+
+ <!-- Checks for imports -->
+ <!-- See http://checkstyle.sf.net/config_import.html -->
+ <module name="AvoidStarImport"/>
+ <module name="IllegalImport"/> <!-- defaults to sun.* packages -->
+ <module name="RedundantImport"/>
+ <module name="UnusedImports"/>
+
+
+ <!-- Checks for Size Violations. -->
+ <!-- See http://checkstyle.sf.net/config_sizes.html -->
+ <module name="LineLength">
+ <property name="max" value="120"/>
+ </module>
+ <module name="MethodLength"/>
+ <module name="ParameterNumber"/>
+
+
+ <!-- Checks for whitespace -->
+ <!-- See http://checkstyle.sf.net/config_whitespace.html -->
+ <module name="EmptyForIteratorPad"/>
+ <module name="MethodParamPad"/>
+ <module name="NoWhitespaceAfter"/>
+ <module name="NoWhitespaceBefore"/>
+ <module name="OperatorWrap"/>
+ <module name="ParenPad"/>
+ <module name="TypecastParenPad"/>
+ <module name="WhitespaceAfter"/>
+ <module name="WhitespaceAround"/>
+
+
+ <!-- Modifier Checks -->
+ <!-- See http://checkstyle.sf.net/config_modifiers.html -->
+ <module name="ModifierOrder"/>
+ <module name="RedundantModifier"/>
+
+
+ <!-- Checks for blocks. You know, those {}'s -->
+ <!-- See http://checkstyle.sf.net/config_blocks.html -->
+ <module name="AvoidNestedBlocks"/>
+ <module name="EmptyBlock"/>
+ <module name="LeftCurly"/>
+ <module name="NeedBraces"/>
+ <module name="RightCurly"/>
+
+
+ <!-- Checks for common coding problems -->
+ <!-- See http://checkstyle.sf.net/config_coding.html -->
+ <module name="AvoidInlineConditionals"/>
+ <module name="EmptyStatement"/>
+ <module name="EqualsHashCode"/>
+ <module name="HiddenField"/>
+ <module name="IllegalInstantiation"/>
+ <module name="InnerAssignment"/>
+ <module name="MagicNumber"/>
+ <module name="MissingSwitchDefault"/>
+ <module name="RedundantThrows"/>
+ <module name="SimplifyBooleanExpression"/>
+ <module name="SimplifyBooleanReturn"/>
+
+ <!-- Checks for class design -->
+ <!-- See http://checkstyle.sf.net/config_design.html -->
+ <module name="DesignForExtension"/>
+ <module name="FinalClass"/>
+ <module name="HideUtilityClassConstructor"/>
+ <module name="InterfaceIsType"/>
+ <module name="VisibilityModifier"/>
+
+
+ <!-- Miscellaneous other checks. -->
+ <!-- See http://checkstyle.sf.net/config_misc.html -->
+ <module name="ArrayTypeStyle"/>
+ <module name="FinalParameters"/>
+ <module name="TodoComment"/>
+ <module name="UpperEll"/>
+
+ </module>
+
+</module>
http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/resources/org/apache/commons/text/fb-excludes.xml
----------------------------------------------------------------------
diff --git a/src/main/resources/org/apache/commons/text/fb-excludes.xml b/src/main/resources/org/apache/commons/text/fb-excludes.xml
new file mode 100644
index 0000000..55abc26
--- /dev/null
+++ b/src/main/resources/org/apache/commons/text/fb-excludes.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<FindBugsFilter>
+</FindBugsFilter>
http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/main/resources/org/apache/commons/text/license-header.txt
----------------------------------------------------------------------
diff --git a/src/main/resources/org/apache/commons/text/license-header.txt b/src/main/resources/org/apache/commons/text/license-header.txt
new file mode 100644
index 0000000..f974c9a
--- /dev/null
+++ b/src/main/resources/org/apache/commons/text/license-header.txt
@@ -0,0 +1,16 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/commons-text/blob/3ccdabb1/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java
new file mode 100644
index 0000000..0d6879a
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.junit.Assert.assertEquals;
+
+import java.math.BigDecimal;
+import java.math.RoundingMode;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link org.apache.commons.text.similarity.CosineDistance}.
+ *
+ * @since 0.1
+ */
+public class CosineDistanceTest {
+
+ /**
+ * Cosine distance under test.
+ */
+ private static CosineDistance cosineDistance;
+
+ /**
+ * Creates the cosine distance object used throughout the tests.
+ */
+ @BeforeClass
+ public static void setUp() {
+ cosineDistance = new CosineDistance();
+ }
+
+ /**
+ * Tests the cosine distance with several inputs.
+ */
+ @Test
+ public void testCosineDistance() {
+ assertEquals(Double.valueOf(0.5d), roundValue(cosineDistance.compare("the house", "da house")));
+ assertEquals(Double.valueOf(0.0d), roundValue(cosineDistance.compare("AB", "AB")));
+ assertEquals(Double.valueOf(1.0d), roundValue(cosineDistance.compare("AB", "BA")));
+ assertEquals(Double.valueOf(0.08d), roundValue(cosineDistance.compare(
+ "the boy was from tamana shi, kumamoto ken, and the girl was from rio de janeiro, rio",
+ "the boy was from tamana shi, kumamoto, and the boy was from rio de janeiro, rio de janeiro")));
+ }
+
+ // --- Utility methods
+
+ /**
+ * Rounds a value to two decimal places using half-up rounding.
+ *
+ * @param value a value
+ * @return the value rounded to two decimal places
+ */
+ private Double roundValue(Double value) {
+ return (Double) new BigDecimal(value).setScale(2, RoundingMode.HALF_UP).doubleValue();
+ }
+
+}