You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ki...@apache.org on 2015/04/12 11:54:22 UTC
[1/2] [text] Code style, formatting and removing unused classes
Repository: commons-text
Updated Branches:
refs/heads/NEW-METRICS 81f679dea -> e99b839ad
Code style, formatting and removing unused classes
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/e00d85b3
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/e00d85b3
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/e00d85b3
Branch: refs/heads/NEW-METRICS
Commit: e00d85b38632dce4758f32b3e4af6afcecd66d3c
Parents: 81f679d
Author: Bruno P. Kinoshita <br...@yahoo.com.br>
Authored: Sun Apr 12 21:52:33 2015 +1200
Committer: Bruno P. Kinoshita <br...@yahoo.com.br>
Committed: Sun Apr 12 21:52:33 2015 +1200
----------------------------------------------------------------------
.../commons/text/similarity/CosineDistance.java | 33 ++++++++-----
.../text/similarity/CosineSimilarity.java | 19 +++----
.../similarity/internal/CharacterTokenizer.java | 43 ----------------
.../text/similarity/internal/Counter.java | 24 +++++++--
.../similarity/internal/RegexTokenizer.java | 52 ++++++++++++++++++++
.../similarity/internal/SimpleTokenizer.java | 50 -------------------
.../similarity/internal/StringTokenizer.java | 36 --------------
.../text/similarity/internal/Tokenizer.java | 5 +-
.../internal/WhiteSpaceTokenizer.java | 46 -----------------
.../text/similarity/internal/package-info.java | 6 ++-
10 files changed, 105 insertions(+), 209 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/e00d85b3/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
index 54970e4..6ed2d69 100644
--- a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
@@ -19,27 +19,34 @@ package org.apache.commons.text.similarity;
import java.util.Map;
import org.apache.commons.text.similarity.internal.Counter;
-import org.apache.commons.text.similarity.internal.SimpleTokenizer;
+import org.apache.commons.text.similarity.internal.RegexTokenizer;
import org.apache.commons.text.similarity.internal.Tokenizer;
/**
- * <p>
- * Calcules the cosine distance.
- * </p>
+ * Measures the cosine distance between two character sequences.
+ *
+ * <p>It utilizes the CosineSimilarity to compute the distance. Character sequences
+ * are converted into vectors through a simple regex-based word tokenizer.</p>
+ *
+ * @since 0.1
*/
public class CosineDistance implements StringMetric<Double> {
-
- private final Tokenizer<String> tokenizer = new SimpleTokenizer();
-
+ /**
+ * Tokenizer used to convert the character sequence into a vector.
+ */
+ private final Tokenizer<CharSequence> tokenizer = new RegexTokenizer();
+ /**
+ * Cosine similarity.
+ */
private final CosineSimilarity cosineSimilarity = new CosineSimilarity();
-
+
@Override
public Double compare(CharSequence left, CharSequence right) {
- String[] leftTokens = tokenizer.tokenize(left);
- String[] rightTokens = tokenizer.tokenize(right);
-
- Map<String, Integer> leftVector = Counter.of(leftTokens);
- Map<String, Integer> rightVector = Counter.of(rightTokens);
+ CharSequence[] leftTokens = tokenizer.tokenize(left);
+ CharSequence[] rightTokens = tokenizer.tokenize(right);
+
+ Map<CharSequence, Integer> leftVector = Counter.of(leftTokens);
+ Map<CharSequence, Integer> rightVector = Counter.of(rightTokens);
double similarity = cosineSimilarity.cosineSimilarity(leftVector, rightVector);
return 1.0 - similarity;
}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/e00d85b3/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
index 992e5e9..096f2aa 100644
--- a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
+++ b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
@@ -20,16 +20,13 @@ import java.util.HashSet;
import java.util.Map;
import java.util.Set;
-
/**
- * <p>
* Measures the Cosine similarity of two vectors of an inner product space and
* compares the angle between them.
- * </p>
*
* <p>
- * For further explanation about the Cosine Similarity, take a look at its
- * Wikipedia page at http://en.wikipedia.org/wiki/Cosine_similarity.
+ * For further explanation about the Cosine Similarity, refer to
+ * http://en.wikipedia.org/wiki/Cosine_similarity.
* </p>
*
* @since 0.1
@@ -43,12 +40,12 @@ public class CosineSimilarity {
* @param rightVector right vector
* @return cosine similarity between the two vectors
*/
- public Double cosineSimilarity(Map<String, Integer> leftVector, Map<String, Integer> rightVector) {
+ public Double cosineSimilarity(Map<CharSequence, Integer> leftVector, Map<CharSequence, Integer> rightVector) {
if (leftVector == null || rightVector == null) {
throw new IllegalArgumentException("Vectors must not be null");
}
- Set<String> intersection = getIntersection(leftVector, rightVector);
+ Set<CharSequence> intersection = getIntersection(leftVector, rightVector);
double dotProduct = dot(leftVector, rightVector, intersection);
double d1 = 0.0d;
@@ -75,8 +72,8 @@ public class CosineSimilarity {
* @param rightVector right vector map
* @return common strings
*/
- private Set<String> getIntersection(Map<String, Integer> leftVector, Map<String, Integer> rightVector) {
- Set<String> intersection = new HashSet<String>(leftVector.keySet());
+ private Set<CharSequence> getIntersection(Map<CharSequence, Integer> leftVector, Map<CharSequence, Integer> rightVector) {
+ Set<CharSequence> intersection = new HashSet<CharSequence>(leftVector.keySet());
intersection.retainAll(rightVector.keySet());
return intersection;
}
@@ -91,9 +88,9 @@ public class CosineSimilarity {
* @param intersection common elements
* @return the dot product
*/
- private double dot(Map<String, Integer> leftVector, Map<String, Integer> rightVector, Set<String> intersection) {
+ private double dot(Map<CharSequence, Integer> leftVector, Map<CharSequence, Integer> rightVector, Set<CharSequence> intersection) {
long dotProduct = 0;
- for (String key : intersection) {
+ for (CharSequence key : intersection) {
dotProduct += leftVector.get(key) * rightVector.get(key);
}
return dotProduct;
http://git-wip-us.apache.org/repos/asf/commons-text/blob/e00d85b3/src/main/java/org/apache/commons/text/similarity/internal/CharacterTokenizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/CharacterTokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/CharacterTokenizer.java
deleted file mode 100644
index 5bab5a9..0000000
--- a/src/main/java/org/apache/commons/text/similarity/internal/CharacterTokenizer.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.similarity.internal;
-
-/**
- * <p>
- * Returns each character in a string as a token.
- * </p>
- */
-public class CharacterTokenizer implements Tokenizer<Character> {
-
- /**
- * {@inheritDoc}
- *
- * @throws IllegalArgumentException if the input text is blank
- */
- @Override
- public Character[] tokenize(CharSequence text) {
- if (text == null || text.toString().trim().equals("")) {
- throw new IllegalArgumentException("Invalid text");
- }
- Character[] tokens = new Character[text.length()];
- for (int i = 0; i < text.length(); ++i) {
- tokens[i] = text.charAt(i);
- }
- return tokens;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/e00d85b3/src/main/java/org/apache/commons/text/similarity/internal/Counter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/Counter.java b/src/main/java/org/apache/commons/text/similarity/internal/Counter.java
index 6d2a4c4..3eb730b 100644
--- a/src/main/java/org/apache/commons/text/similarity/internal/Counter.java
+++ b/src/main/java/org/apache/commons/text/similarity/internal/Counter.java
@@ -22,15 +22,29 @@ import java.util.Map;
/**
* Java implementation of Python's collections Counter module.
*
+ * <p>It counts how many times each element provided occurred in an array and
+ * returns a map with the element as key and the count as value.</p>
+ *
* @see https://docs.python.org/dev/library/collections.html#collections.Counter
+ * @since 0.1
*/
public class Counter {
+ /**
+ * Hidden constructor.
+ */
private Counter() {}
-
- public static Map<String, Integer> of(String[] tokens) {
- Map<String, Integer> innerCounter = new HashMap<String, Integer>();
- for (String token : tokens) {
+
+ /**
+ * Counts how many times each element provided occurred in an array and
+ * returns a map with the element as key and the count as value.
+ *
+ * @param tokens array of tokens
+ * @return map where each element is a key and its count is the value
+ */
+ public static Map<CharSequence, Integer> of(CharSequence[] tokens) {
+ final Map<CharSequence, Integer> innerCounter = new HashMap<CharSequence, Integer>();
+ for (CharSequence token : tokens) {
if (innerCounter.containsKey(token)) {
int value = innerCounter.get(token);
innerCounter.put(token, ++value);
@@ -40,5 +54,5 @@ public class Counter {
}
return innerCounter;
}
-
+
}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/e00d85b3/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java
new file mode 100644
index 0000000..fa585d8
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity.internal;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A simple word tokenizer that utilizes regex to find words. It applies a regex
+ * {@code (\w)+} over the input text to extract words from a given character
+ * sequence.
+ *
+ * @since 0.1
+ */
+public class RegexTokenizer implements Tokenizer<CharSequence> {
+
+ /**
+ * {@inheritDoc}
+ *
+ * @throws IllegalArgumentException if the input text is blank
+ */
+ @Override
+ public CharSequence[] tokenize(CharSequence text) {
+ if (text == null || text.toString().trim().equals("")) {
+ throw new IllegalArgumentException("Invalid text");
+ }
+ Pattern pattern = Pattern.compile("(\\w)+");
+ Matcher matcher = pattern.matcher(text.toString());
+ List<String> tokens = new ArrayList<String>();
+ while (matcher.find()) {
+ tokens.add(matcher.group(0));
+ }
+ return tokens.toArray(new String[0]);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/e00d85b3/src/main/java/org/apache/commons/text/similarity/internal/SimpleTokenizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/SimpleTokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/SimpleTokenizer.java
deleted file mode 100644
index 9df5299..0000000
--- a/src/main/java/org/apache/commons/text/similarity/internal/SimpleTokenizer.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.similarity.internal;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * <p>
- * A simple word tokenizer that utilizes regex.
- * </p>
- */
-public class SimpleTokenizer implements Tokenizer<String> {
-
- /**
- * {@inheritDoc}
- *
- * @throws IllegalArgumentException if the input text is blank
- */
- @Override
- public String[] tokenize(CharSequence text) {
- if (text == null || text.toString().trim().equals("")) {
- throw new IllegalArgumentException("Invalid text");
- }
- Pattern pattern = Pattern.compile("(\\w)+");
- Matcher matcher = pattern.matcher(text.toString());
- List<String> tokens = new ArrayList<String>();
- while (matcher.find()) {
- tokens.add(matcher.group(0));
- }
- return tokens.toArray(new String[0]);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/e00d85b3/src/main/java/org/apache/commons/text/similarity/internal/StringTokenizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/StringTokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/StringTokenizer.java
deleted file mode 100644
index 137f338..0000000
--- a/src/main/java/org/apache/commons/text/similarity/internal/StringTokenizer.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.similarity.internal;
-
-/**
- * <p>
- * A string tokenizer. Can produce arrays of tokens from a given type.
- * </p>
- *
- * @param <T> given type
- */
-public interface StringTokenizer<T> {
-
- /**
- * Returns an array of tokens.
- *
- * @param text input text
- * @return array of tokens
- */
- T[] tokenize(CharSequence text);
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/e00d85b3/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java
index b5b877b..0c85503 100644
--- a/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java
+++ b/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java
@@ -17,11 +17,10 @@
package org.apache.commons.text.similarity.internal;
/**
- * <p>
* A tokenizer. Can produce arrays of tokens from a given type.
- * </p>
*
* @param <T> given type
+ * @since 0.1
*/
public interface Tokenizer<T> {
@@ -32,5 +31,5 @@ public interface Tokenizer<T> {
* @return array of tokens
*/
T[] tokenize(CharSequence text);
-
+
}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/e00d85b3/src/main/java/org/apache/commons/text/similarity/internal/WhiteSpaceTokenizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/WhiteSpaceTokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/WhiteSpaceTokenizer.java
deleted file mode 100644
index a827fdf..0000000
--- a/src/main/java/org/apache/commons/text/similarity/internal/WhiteSpaceTokenizer.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.similarity.internal;
-
-/**
- * <p>
- * A simple white space tokenizer.
- * </p>
- */
-public class WhiteSpaceTokenizer implements Tokenizer<String> {
-
- private static final String SPACE_DELIMITER = " ";
-
- /**
- * {@inheritDoc}
- *
- * @throws IllegalArgumentException if the input text is blank
- */
- @Override
- public String[] tokenize(CharSequence text) {
- if (text == null || text.toString().trim().equals("")) {
- throw new IllegalArgumentException("Invalid text");
- }
- java.util.StringTokenizer stringTokenizer = new java.util.StringTokenizer(text.toString(), SPACE_DELIMITER);
- String[] tokens = new String[stringTokenizer.countTokens()];
- for (int i = 0; i < tokens.length; ++i) {
- tokens[i] = stringTokenizer.nextToken();
- }
- return tokens;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/e00d85b3/src/main/java/org/apache/commons/text/similarity/internal/package-info.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/package-info.java b/src/main/java/org/apache/commons/text/similarity/internal/package-info.java
index a346c07..ce7b02c 100644
--- a/src/main/java/org/apache/commons/text/similarity/internal/package-info.java
+++ b/src/main/java/org/apache/commons/text/similarity/internal/package-info.java
@@ -15,7 +15,9 @@
* limitations under the License.
*/
/**
- * Classes used internally by similarity algorithms. Internal use, backward compatibility
- * is not guaranteed.
+ * Classes used internally by similarity algorithms. Internal use only; backward compatibility
+ * is not guaranteed.
+ *
+ * @since 0.1
*/
package org.apache.commons.text.similarity.internal;
\ No newline at end of file
[2/2] [text] Formatting, code style and comments
Posted by ki...@apache.org.
Formatting, code style and comments
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/e99b839a
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/e99b839a
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/e99b839a
Branch: refs/heads/NEW-METRICS
Commit: e99b839ad6fc023dd57519e07b8a2f36e2b58b39
Parents: e00d85b
Author: Bruno P. Kinoshita <br...@yahoo.com.br>
Authored: Sun Apr 12 21:54:11 2015 +1200
Committer: Bruno P. Kinoshita <br...@yahoo.com.br>
Committed: Sun Apr 12 21:54:11 2015 +1200
----------------------------------------------------------------------
.../text/similarity/CosineDistanceTest.java | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/e99b839a/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java
index ce33572..0d6879a 100644
--- a/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java
@@ -26,18 +26,29 @@ import org.junit.Test;
/**
* Unit tests for {@link org.apache.commons.text.similarity.CosineSimilarity}.
+ *
+ * @since 0.1
*/
public class CosineDistanceTest {
+ /**
+ * Cosine distance under test.
+ */
private static CosineDistance cosineDistance;
+ /**
+ * Creates the cosine distance object used throughout the tests.
+ */
@BeforeClass
public static void setUp() {
cosineDistance = new CosineDistance();
}
+ /**
+ * Tests the cosine distance with several inputs.
+ */
@Test
- public void testCosineSimilarity() {
+ public void testCosineDistance() {
assertEquals(Double.valueOf(0.5d), roundValue(cosineDistance.compare("the house", "da house")));
assertEquals(Double.valueOf(0.0d), roundValue(cosineDistance.compare("AB", "AB")));
assertEquals(Double.valueOf(1.0d), roundValue(cosineDistance.compare("AB", "BA")));
@@ -48,6 +59,12 @@ public class CosineDistanceTest {
// --- Utility methods
+ /**
+ * Rounds a value to two decimal places using half-up rounding.
+ *
+ * @param value a value
+ * @return value rounded to two decimal places
+ */
private Double roundValue(Double value) {
return (Double) new BigDecimal(value).setScale(2, RoundingMode.HALF_UP).doubleValue();
}