You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ch...@apache.org on 2018/09/28 16:47:28 UTC

[1/4] [text] [TEXT-139] Improve JaccardSimilarity computational cost.

Repository: commons-text
Updated Branches:
  refs/heads/master 8ae4ff075 -> 6872117ae


[TEXT-139] Improve JaccardSimilarity computational cost.


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/0d4c9c45
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/0d4c9c45
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/0d4c9c45

Branch: refs/heads/master
Commit: 0d4c9c4593fa98909c603fc701c8116975d8d8a8
Parents: 85465e2
Author: nickwongwong <a4...@yeah.net>
Authored: Mon Sep 10 22:01:46 2018 +0800
Committer: nickwongwong <a4...@yeah.net>
Committed: Mon Sep 10 22:01:46 2018 +0800

----------------------------------------------------------------------
 .../text/similarity/JaccardSimilarity.java      | 27 ++++++++------------
 1 file changed, 11 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/0d4c9c45/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
index 1dc2b85..2e88dd2 100644
--- a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
+++ b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
@@ -62,27 +62,22 @@ public class JaccardSimilarity implements SimilarityScore<Double> {
      * @return index
      */
     private Double calculateJaccardSimilarity(final CharSequence left, final CharSequence right) {
-        final Set<String> intersectionSet = new HashSet<>();
-        final Set<String> unionSet = new HashSet<>();
-        boolean unionFilled = false;
         final int leftLength = left.length();
         final int rightLength = right.length();
         if (leftLength == 0 || rightLength == 0) {
             return 0d;
         }
-
-        for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) {
-            unionSet.add(String.valueOf(left.charAt(leftIndex)));
-            for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) {
-                if (!unionFilled) {
-                    unionSet.add(String.valueOf(right.charAt(rightIndex)));
-                }
-                if (left.charAt(leftIndex) == right.charAt(rightIndex)) {
-                    intersectionSet.add(String.valueOf(left.charAt(leftIndex)));
-                }
-            }
-            unionFilled = true;
+        final Set<Character> leftSet = new HashSet<>();
+        for (int i = 0; i < leftLength; i++) {
+            leftSet.add(left.charAt(i));
+        }
+        final Set<Character> rightSet = new HashSet<>();
+        for (int i = 0; i < rightLength; i++) {
+            rightSet.add(right.charAt(i));
         }
-        return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size());
+        final Set<Character> unionSet = new HashSet<>(leftSet);
+        unionSet.addAll(rightSet);
+        final int intersectionSize = leftSet.size() + rightSet.size() - unionSet.size();
+        return 1.0d * intersectionSize / unionSet.size();
     }
 }


[3/4] [text] Merge branch 'master' of https://github.com/nickwongwong/commons-text

Posted by ch...@apache.org.
Merge branch 'master' of https://github.com/nickwongwong/commons-text


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/9c7dff19
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/9c7dff19
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/9c7dff19

Branch: refs/heads/master
Commit: 9c7dff199a3ccd84f817b0ad8a76659cb26fd16a
Parents: 8ae4ff0 189f421
Author: Rob Tompkins <ch...@apache.org>
Authored: Fri Sep 28 12:43:59 2018 -0400
Committer: Rob Tompkins <ch...@apache.org>
Committed: Fri Sep 28 12:43:59 2018 -0400

----------------------------------------------------------------------
 .../text/lookup/AbstractStringLookup.java       | 12 +++++++++
 .../commons/text/lookup/ScriptStringLookup.java |  4 +--
 .../text/lookup/StringLookupFactory.java        |  1 +
 .../commons/text/lookup/UrlStringLookup.java    |  5 ++--
 .../text/similarity/JaccardSimilarity.java      | 27 ++++++++------------
 src/test/resources/document.properties          |  2 +-
 6 files changed, 30 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/9c7dff19/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
index e3c44e9,404bbdc..4dbdb65
--- a/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
+++ b/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
@@@ -24,8 -24,14 +24,20 @@@ package org.apache.commons.text.lookup
   */
  abstract class AbstractStringLookup implements StringLookup {
  
++
++    /**
++     * The empty string.
++     */
 +    private static final String EMPTY = "";
++
+     /**
+      * The default split char.
+      */
      protected static final char SPLIT_CH = ':';
+ 
+     /**
+      * The default split string.
+      */
      protected static final String SPLIT_STR = String.valueOf(SPLIT_CH);
  
      /**

http://git-wip-us.apache.org/repos/asf/commons-text/blob/9c7dff19/src/main/java/org/apache/commons/text/lookup/ScriptStringLookup.java
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/commons-text/blob/9c7dff19/src/main/java/org/apache/commons/text/lookup/StringLookupFactory.java
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/commons-text/blob/9c7dff19/src/main/java/org/apache/commons/text/lookup/UrlStringLookup.java
----------------------------------------------------------------------


[2/4] [text] Fixed code formatting

Posted by ch...@apache.org.
Fixed code formatting


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/189f4210
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/189f4210
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/189f4210

Branch: refs/heads/master
Commit: 189f421051f0e29789a8cdbac9dfa7d07f66ac2d
Parents: 0d4c9c4
Author: nickwongwong <a4...@yeah.net>
Authored: Fri Sep 14 20:34:11 2018 +0800
Committer: nickwongwong <a4...@yeah.net>
Committed: Fri Sep 14 20:34:11 2018 +0800

----------------------------------------------------------------------
 .../org/apache/commons/text/lookup/AbstractStringLookup.java  | 7 +++++++
 .../org/apache/commons/text/lookup/ScriptStringLookup.java    | 4 ++--
 .../org/apache/commons/text/lookup/StringLookupFactory.java   | 1 +
 .../java/org/apache/commons/text/lookup/UrlStringLookup.java  | 5 +++--
 src/test/resources/document.properties                        | 2 +-
 5 files changed, 14 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/189f4210/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java b/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
index f265bf7..404bbdc 100644
--- a/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
+++ b/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
@@ -24,7 +24,14 @@ package org.apache.commons.text.lookup;
  */
 abstract class AbstractStringLookup implements StringLookup {
 
+    /**
+     * The default split char.
+     */
     protected static final char SPLIT_CH = ':';
+
+    /**
+     * The default split string.
+     */
     protected static final String SPLIT_STR = String.valueOf(SPLIT_CH);
 
     /**

http://git-wip-us.apache.org/repos/asf/commons-text/blob/189f4210/src/main/java/org/apache/commons/text/lookup/ScriptStringLookup.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/lookup/ScriptStringLookup.java b/src/main/java/org/apache/commons/text/lookup/ScriptStringLookup.java
index affb1a3..99f1be9 100644
--- a/src/main/java/org/apache/commons/text/lookup/ScriptStringLookup.java
+++ b/src/main/java/org/apache/commons/text/lookup/ScriptStringLookup.java
@@ -78,8 +78,8 @@ final class ScriptStringLookup extends AbstractStringLookup {
             final Object eval = scriptEngine.eval(script);
             return Objects.toString(eval, null);
         } catch (final Exception e) {
-            throw IllegalArgumentExceptions.format(e, "Error looking up script engine [%s] for script [%s].", engineName,
-                    script);
+            throw IllegalArgumentExceptions.format(e, "Error looking up script engine [%s] for script [%s].",
+                engineName, script);
         }
     }
 

http://git-wip-us.apache.org/repos/asf/commons-text/blob/189f4210/src/main/java/org/apache/commons/text/lookup/StringLookupFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/lookup/StringLookupFactory.java b/src/main/java/org/apache/commons/text/lookup/StringLookupFactory.java
index 73a4398..25f6596 100644
--- a/src/main/java/org/apache/commons/text/lookup/StringLookupFactory.java
+++ b/src/main/java/org/apache/commons/text/lookup/StringLookupFactory.java
@@ -54,6 +54,7 @@ public final class StringLookupFactory {
      * </ul>
      *
      * @param stringLookupMap
+     *            the map of string lookups.
      * @since 1.5
      */
     public void addDefaultStringLookups(final Map<String, StringLookup> stringLookupMap) {

http://git-wip-us.apache.org/repos/asf/commons-text/blob/189f4210/src/main/java/org/apache/commons/text/lookup/UrlStringLookup.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/lookup/UrlStringLookup.java b/src/main/java/org/apache/commons/text/lookup/UrlStringLookup.java
index e17e668..6e0d4d4 100644
--- a/src/main/java/org/apache/commons/text/lookup/UrlStringLookup.java
+++ b/src/main/java/org/apache/commons/text/lookup/UrlStringLookup.java
@@ -72,8 +72,9 @@ final class UrlStringLookup extends AbstractStringLookup {
         final String urlStr = substringAfter(key, SPLIT_CH);
         try {
             final URL url = new URL(urlStr);
-            final StringWriter writer = new StringWriter(8192);
-            final char[] buffer = new char[8192];
+            final int size = 8192;
+            final StringWriter writer = new StringWriter(size);
+            final char[] buffer = new char[size];
             try (InputStreamReader reader = new InputStreamReader(new BufferedInputStream(url.openStream()),
                     charsetName)) {
                 int n;

http://git-wip-us.apache.org/repos/asf/commons-text/blob/189f4210/src/test/resources/document.properties
----------------------------------------------------------------------
diff --git a/src/test/resources/document.properties b/src/test/resources/document.properties
index f411d5e..2f3bc8c 100644
--- a/src/test/resources/document.properties
+++ b/src/test/resources/document.properties
@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-mykey = Hello World!
\ No newline at end of file
+mykey = Hello World!


[4/4] [text] TEXT-139: Thanks @nickwongwong

Posted by ch...@apache.org.
TEXT-139: Thanks @nickwongwong


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/6872117a
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/6872117a
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/6872117a

Branch: refs/heads/master
Commit: 6872117ae1d33450bf4be694b3a6af6772e006f2
Parents: 9c7dff1
Author: Rob Tompkins <ch...@apache.org>
Authored: Fri Sep 28 12:45:58 2018 -0400
Committer: Rob Tompkins <ch...@apache.org>
Committed: Fri Sep 28 12:45:58 2018 -0400

----------------------------------------------------------------------
 pom.xml                 | 3 +++
 src/changes/changes.xml | 1 +
 2 files changed, 4 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/6872117a/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index f79cfd3..2b83a03 100644
--- a/pom.xml
+++ b/pom.xml
@@ -360,6 +360,9 @@
     <contributor>
       <name>Nandor Kollar</name>
     </contributor>
+    <contributor>
+      <name>Nick Wong</name>
+    </contributor>
   </contributors>
 
   <scm>

http://git-wip-us.apache.org/repos/asf/commons-text/blob/6872117a/src/changes/changes.xml
----------------------------------------------------------------------
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 91d507d..43ead66 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -46,6 +46,7 @@ The <action> type attribute can be add,update,fix,remove.
   <body>
 
   <release version="1.5" date="2018-MM-DD" description="Release 1.5">
+    <action issue="TEXT-139" type="fix" dev="chtompki" due-to="Nick Wong">Improve JaccardSimilarity computational cost</action>
     <action issue="TEXT-118" type="fix" dev="chtompki" due-to="Nandor Kollar">JSON escaping incorrect for the delete control character</action>
     <action issue="TEXT-130" type="fix" dev="chtompki" due-to="Jan Martin Keil">Fixes JaroWinklerDistance: Wrong results due to precision of transpositions</action>
     <action issue="TEXT-131" type="fix" dev="chtompki" due-to="Jan Martin Keil">JaroWinklerDistance: Calculation deviates from definition</action>