You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by br...@apache.org on 2015/04/19 11:02:33 UTC

[2/6] [text] Move classes from the internal package into the package where they are used and make them package private.

Move classes from the internal package into the package where they
are used and make them package private.


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/df681238
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/df681238
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/df681238

Branch: refs/heads/master
Commit: df681238bf5bcb2fece950b644a7d00a712d0cc8
Parents: 75db6de
Author: Benedikt Ritter <br...@apache.org>
Authored: Sun Apr 19 10:32:13 2015 +0200
Committer: Benedikt Ritter <br...@apache.org>
Committed: Sun Apr 19 10:37:50 2015 +0200

----------------------------------------------------------------------
 .../commons/text/similarity/CosineDistance.java |  6 --
 .../apache/commons/text/similarity/Counter.java | 60 ++++++++++++++++++++
 .../commons/text/similarity/RegexTokenizer.java | 50 ++++++++++++++++
 .../commons/text/similarity/Tokenizer.java      | 34 +++++++++++
 .../text/similarity/internal/Counter.java       | 60 --------------------
 .../similarity/internal/RegexTokenizer.java     | 50 ----------------
 .../text/similarity/internal/Tokenizer.java     | 34 -----------
 .../text/similarity/internal/package-info.java  | 23 --------
 .../commons/text/similarity/package-info.java   |  2 +-
 9 files changed, 145 insertions(+), 174 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
index 2fa4515..c5e8853 100644
--- a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
@@ -18,17 +18,11 @@ package org.apache.commons.text.similarity;
 
 import java.util.Map;
 
-import org.apache.commons.text.similarity.internal.Counter;
-import org.apache.commons.text.similarity.internal.RegexTokenizer;
-import org.apache.commons.text.similarity.internal.Tokenizer;
-
 /**
  * Measures the cosine distance between two character sequences.
  *
  * <p>It utilizes the CosineSimilarity to compute the distance. Character sequences
  * are converted into vectors through a simple tokenizer that works with </p>
- *
- * @see org.apache.commons.text.similarity.internal.RegexTokenizer
  */
 public class CosineDistance implements EditDistance<Double> {
     /**

http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/Counter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/Counter.java b/src/main/java/org/apache/commons/text/similarity/Counter.java
new file mode 100644
index 0000000..5eefc51
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/Counter.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Java implementation of Python's {@code collections.Counter} class.
+ *
+ * <p>It counts how many times each element provided occurred in an array and
+ * returns a map with the element as key and the count as value.</p>
+ *
+ * @see <a href="https://docs.python.org/dev/library/collections.html#collections.Counter">
+ * https://docs.python.org/dev/library/collections.html#collections.Counter</a>
+ */
+final class Counter {
+
+    /**
+     * Hidden constructor.
+     */
+    private Counter() {
+        super();
+    }
+
+    /**
+     * Counts how many times each element provided occurred in an array and
+     * returns a map with the element as key and the count as value.
+     *
+     * @param tokens array of tokens
+     * @return map, where the elements are the keys, and the counts the values
+     */
+    public static Map<CharSequence, Integer> of(CharSequence[] tokens) {
+        final Map<CharSequence, Integer> innerCounter = new HashMap<CharSequence, Integer>();
+        for (CharSequence token : tokens) {
+            if (innerCounter.containsKey(token)) {
+                int value = innerCounter.get(token);
+                innerCounter.put(token, ++value);
+            } else {
+                innerCounter.put(token, 1);
+            }
+        }
+        return innerCounter;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java b/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java
new file mode 100644
index 0000000..5a6c5d3
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A simple word tokenizer that utilizes regex to find words. It applies a regex
+ * {@code (\w)+} over the input text to extract words from a given character
+ * sequence.
+ */
+class RegexTokenizer implements Tokenizer<CharSequence> {
+
+    /**
+     * {@inheritDoc}
+     *
+     * @throws IllegalArgumentException if the input text is null or blank
+     */
+    @Override
+    public CharSequence[] tokenize(CharSequence text) {
+        if (text == null || text.toString().trim().equals("")) {
+            throw new IllegalArgumentException("Invalid text");
+        }
+        Pattern pattern = Pattern.compile("(\\w)+");
+        Matcher matcher = pattern.matcher(text.toString());
+        List<String> tokens = new ArrayList<String>();
+        while (matcher.find()) {
+            tokens.add(matcher.group(0));
+        }
+        return tokens.toArray(new String[0]);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/Tokenizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/Tokenizer.java b/src/main/java/org/apache/commons/text/similarity/Tokenizer.java
new file mode 100644
index 0000000..0a69d24
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/Tokenizer.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+/**
+ * A tokenizer. Can produce arrays of tokens from a given type.
+ *
+ * @param <T> given type
+ */
+interface Tokenizer<T> {
+
+    /**
+     * Returns an array of tokens.
+     *
+     * @param text input text
+     * @return array of tokens
+     */
+    T[] tokenize(CharSequence text);
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/internal/Counter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/Counter.java b/src/main/java/org/apache/commons/text/similarity/internal/Counter.java
deleted file mode 100644
index c0dd2e6..0000000
--- a/src/main/java/org/apache/commons/text/similarity/internal/Counter.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.similarity.internal;
-
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * Java implementation of Python's collections Counter module.
- *
- * <p>It counts how many times each element provided occurred in an array and
- * returns a dict with the element as key and the count as value.</p>
- *
- * @see <a href="https://docs.python.org/dev/library/collections.html#collections.Counter">
- * https://docs.python.org/dev/library/collections.html#collections.Counter</a>
- */
-public final class Counter {
-
-    /**
-     * Hidden constructor.
-     */
-    private Counter() {
-        super();
-    }
-
-    /**
-     * It counts how many times each element provided occurred in an array and
-     * returns a dict with the element as key and the count as value.
-     *
-     * @param tokens array of tokens
-     * @return dict, where the elements are key, and the count the value
-     */
-    public static Map<CharSequence, Integer> of(CharSequence[] tokens) {
-        final Map<CharSequence, Integer> innerCounter = new HashMap<CharSequence, Integer>();
-        for (CharSequence token : tokens) {
-            if (innerCounter.containsKey(token)) {
-                int value = innerCounter.get(token);
-                innerCounter.put(token, ++value);
-            } else {
-                innerCounter.put(token, 1);
-            }
-        }
-        return innerCounter;
-    }
-
-}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java
deleted file mode 100644
index 082ac05..0000000
--- a/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.similarity.internal;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * A simple word tokenizer that utilizes regex to find words. It applies a regex
- * {@code}(\w)+{@code} over the input text to extract words from a given character
- * sequence.
- */
-public class RegexTokenizer implements Tokenizer<CharSequence> {
-
-    /**
-     * {@inheritDoc}
-     *
-     * @throws IllegalArgumentException if the input text is blank
-     */
-    @Override
-    public CharSequence[] tokenize(CharSequence text) {
-        if (text == null || text.toString().trim().equals("")) {
-            throw new IllegalArgumentException("Invalid text");
-        }
-        Pattern pattern = Pattern.compile("(\\w)+");
-        Matcher matcher = pattern.matcher(text.toString());
-        List<String> tokens = new ArrayList<String>();
-        while (matcher.find()) {
-            tokens.add(matcher.group(0));
-        }
-        return tokens.toArray(new String[0]);
-    }
-
-}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java
deleted file mode 100644
index 9dc63e4..0000000
--- a/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.similarity.internal;
-
-/**
- * A tokenizer. Can produce arrays of tokens from a given type.
- *
- * @param <T> given type
- */
-public interface Tokenizer<T> {
-
-    /**
-     * Returns an array of tokens.
-     *
-     * @param text input text
-     * @return array of tokens
-     */
-    T[] tokenize(CharSequence text);
-
-}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/internal/package-info.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/internal/package-info.java b/src/main/java/org/apache/commons/text/similarity/internal/package-info.java
deleted file mode 100644
index 548e2b7..0000000
--- a/src/main/java/org/apache/commons/text/similarity/internal/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * Classes used internally by similarity algorithms. Internal use only, backward compatibility
- * not guaranteed.
- *
- * @since 0.1
- */
-package org.apache.commons.text.similarity.internal;

http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/package-info.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/package-info.java b/src/main/java/org/apache/commons/text/similarity/package-info.java
index bd1e400..d2a19a6 100644
--- a/src/main/java/org/apache/commons/text/similarity/package-info.java
+++ b/src/main/java/org/apache/commons/text/similarity/package-info.java
@@ -33,7 +33,7 @@
  * </ul>
  *
  * <p>The {@link org.apache.commons.text.similarity.CosineDistance Cosine Distance}
- * utilises a {@link org.apache.commons.text.similarity.internal.RegexTokenizer regular expression tokenizer (\w+)}.
+ * utilises a {@link org.apache.commons.text.similarity.RegexTokenizer regular expression tokenizer (\w+)}.
  * And the {@link org.apache.commons.text.similarity.LevenshteinDistance Levenshtein Distance}'s
  * behaviour can be changed to take into consideration a maximum throughput.</p>
  *