You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ch...@apache.org on 2018/09/28 16:47:28 UTC
[1/4] [text] [TEXT-139] Improve JaccardSimilarity computational cost.
Repository: commons-text
Updated Branches:
refs/heads/master 8ae4ff075 -> 6872117ae
[TEXT-139] Improve JaccardSimilarity computational cost.
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/0d4c9c45
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/0d4c9c45
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/0d4c9c45
Branch: refs/heads/master
Commit: 0d4c9c4593fa98909c603fc701c8116975d8d8a8
Parents: 85465e2
Author: nickwongwong <a4...@yeah.net>
Authored: Mon Sep 10 22:01:46 2018 +0800
Committer: nickwongwong <a4...@yeah.net>
Committed: Mon Sep 10 22:01:46 2018 +0800
----------------------------------------------------------------------
.../text/similarity/JaccardSimilarity.java | 27 ++++++++------------
1 file changed, 11 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/0d4c9c45/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
index 1dc2b85..2e88dd2 100644
--- a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
+++ b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
@@ -62,27 +62,22 @@ public class JaccardSimilarity implements SimilarityScore<Double> {
* @return index
*/
private Double calculateJaccardSimilarity(final CharSequence left, final CharSequence right) {
- final Set<String> intersectionSet = new HashSet<>();
- final Set<String> unionSet = new HashSet<>();
- boolean unionFilled = false;
final int leftLength = left.length();
final int rightLength = right.length();
if (leftLength == 0 || rightLength == 0) {
return 0d;
}
-
- for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) {
- unionSet.add(String.valueOf(left.charAt(leftIndex)));
- for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) {
- if (!unionFilled) {
- unionSet.add(String.valueOf(right.charAt(rightIndex)));
- }
- if (left.charAt(leftIndex) == right.charAt(rightIndex)) {
- intersectionSet.add(String.valueOf(left.charAt(leftIndex)));
- }
- }
- unionFilled = true;
+ final Set<Character> leftSet = new HashSet<>();
+ for (int i = 0; i < leftLength; i++) {
+ leftSet.add(left.charAt(i));
+ }
+ final Set<Character> rightSet = new HashSet<>();
+ for (int i = 0; i < rightLength; i++) {
+ rightSet.add(right.charAt(i));
}
- return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size());
+ final Set<Character> unionSet = new HashSet<>(leftSet);
+ unionSet.addAll(rightSet);
+ final int intersectionSize = leftSet.size() + rightSet.size() - unionSet.size();
+ return 1.0d * intersectionSize / unionSet.size();
}
}
[3/4] [text] Merge branch 'master' of https://github.com/nickwongwong/commons-text
Posted by ch...@apache.org.
Merge branch 'master' of https://github.com/nickwongwong/commons-text
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/9c7dff19
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/9c7dff19
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/9c7dff19
Branch: refs/heads/master
Commit: 9c7dff199a3ccd84f817b0ad8a76659cb26fd16a
Parents: 8ae4ff0 189f421
Author: Rob Tompkins <ch...@apache.org>
Authored: Fri Sep 28 12:43:59 2018 -0400
Committer: Rob Tompkins <ch...@apache.org>
Committed: Fri Sep 28 12:43:59 2018 -0400
----------------------------------------------------------------------
.../text/lookup/AbstractStringLookup.java | 12 +++++++++
.../commons/text/lookup/ScriptStringLookup.java | 4 +--
.../text/lookup/StringLookupFactory.java | 1 +
.../commons/text/lookup/UrlStringLookup.java | 5 ++--
.../text/similarity/JaccardSimilarity.java | 27 ++++++++------------
src/test/resources/document.properties | 2 +-
6 files changed, 30 insertions(+), 21 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/9c7dff19/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
index e3c44e9,404bbdc..4dbdb65
--- a/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
+++ b/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
@@@ -24,8 -24,14 +24,20 @@@ package org.apache.commons.text.lookup
*/
abstract class AbstractStringLookup implements StringLookup {
++
++ /**
++ * The empty string.
++ */
+ private static final String EMPTY = "";
++
+ /**
+ * The default split char.
+ */
protected static final char SPLIT_CH = ':';
+
+ /**
+ * The default split string.
+ */
protected static final String SPLIT_STR = String.valueOf(SPLIT_CH);
/**
http://git-wip-us.apache.org/repos/asf/commons-text/blob/9c7dff19/src/main/java/org/apache/commons/text/lookup/ScriptStringLookup.java
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/9c7dff19/src/main/java/org/apache/commons/text/lookup/StringLookupFactory.java
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/9c7dff19/src/main/java/org/apache/commons/text/lookup/UrlStringLookup.java
----------------------------------------------------------------------
[2/4] [text] Fixed code formatting
Posted by ch...@apache.org.
Fixed code formatting
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/189f4210
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/189f4210
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/189f4210
Branch: refs/heads/master
Commit: 189f421051f0e29789a8cdbac9dfa7d07f66ac2d
Parents: 0d4c9c4
Author: nickwongwong <a4...@yeah.net>
Authored: Fri Sep 14 20:34:11 2018 +0800
Committer: nickwongwong <a4...@yeah.net>
Committed: Fri Sep 14 20:34:11 2018 +0800
----------------------------------------------------------------------
.../org/apache/commons/text/lookup/AbstractStringLookup.java | 7 +++++++
.../org/apache/commons/text/lookup/ScriptStringLookup.java | 4 ++--
.../org/apache/commons/text/lookup/StringLookupFactory.java | 1 +
.../java/org/apache/commons/text/lookup/UrlStringLookup.java | 5 +++--
src/test/resources/document.properties | 2 +-
5 files changed, 14 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/189f4210/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java b/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
index f265bf7..404bbdc 100644
--- a/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
+++ b/src/main/java/org/apache/commons/text/lookup/AbstractStringLookup.java
@@ -24,7 +24,14 @@ package org.apache.commons.text.lookup;
*/
abstract class AbstractStringLookup implements StringLookup {
+ /**
+ * The default split char.
+ */
protected static final char SPLIT_CH = ':';
+
+ /**
+ * The default split string.
+ */
protected static final String SPLIT_STR = String.valueOf(SPLIT_CH);
/**
http://git-wip-us.apache.org/repos/asf/commons-text/blob/189f4210/src/main/java/org/apache/commons/text/lookup/ScriptStringLookup.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/lookup/ScriptStringLookup.java b/src/main/java/org/apache/commons/text/lookup/ScriptStringLookup.java
index affb1a3..99f1be9 100644
--- a/src/main/java/org/apache/commons/text/lookup/ScriptStringLookup.java
+++ b/src/main/java/org/apache/commons/text/lookup/ScriptStringLookup.java
@@ -78,8 +78,8 @@ final class ScriptStringLookup extends AbstractStringLookup {
final Object eval = scriptEngine.eval(script);
return Objects.toString(eval, null);
} catch (final Exception e) {
- throw IllegalArgumentExceptions.format(e, "Error looking up script engine [%s] for script [%s].", engineName,
- script);
+ throw IllegalArgumentExceptions.format(e, "Error looking up script engine [%s] for script [%s].",
+ engineName, script);
}
}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/189f4210/src/main/java/org/apache/commons/text/lookup/StringLookupFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/lookup/StringLookupFactory.java b/src/main/java/org/apache/commons/text/lookup/StringLookupFactory.java
index 73a4398..25f6596 100644
--- a/src/main/java/org/apache/commons/text/lookup/StringLookupFactory.java
+++ b/src/main/java/org/apache/commons/text/lookup/StringLookupFactory.java
@@ -54,6 +54,7 @@ public final class StringLookupFactory {
* </ul>
*
* @param stringLookupMap
+ * the map of string lookups.
* @since 1.5
*/
public void addDefaultStringLookups(final Map<String, StringLookup> stringLookupMap) {
http://git-wip-us.apache.org/repos/asf/commons-text/blob/189f4210/src/main/java/org/apache/commons/text/lookup/UrlStringLookup.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/lookup/UrlStringLookup.java b/src/main/java/org/apache/commons/text/lookup/UrlStringLookup.java
index e17e668..6e0d4d4 100644
--- a/src/main/java/org/apache/commons/text/lookup/UrlStringLookup.java
+++ b/src/main/java/org/apache/commons/text/lookup/UrlStringLookup.java
@@ -72,8 +72,9 @@ final class UrlStringLookup extends AbstractStringLookup {
final String urlStr = substringAfter(key, SPLIT_CH);
try {
final URL url = new URL(urlStr);
- final StringWriter writer = new StringWriter(8192);
- final char[] buffer = new char[8192];
+ final int size = 8192;
+ final StringWriter writer = new StringWriter(size);
+ final char[] buffer = new char[size];
try (InputStreamReader reader = new InputStreamReader(new BufferedInputStream(url.openStream()),
charsetName)) {
int n;
http://git-wip-us.apache.org/repos/asf/commons-text/blob/189f4210/src/test/resources/document.properties
----------------------------------------------------------------------
diff --git a/src/test/resources/document.properties b/src/test/resources/document.properties
index f411d5e..2f3bc8c 100644
--- a/src/test/resources/document.properties
+++ b/src/test/resources/document.properties
@@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-mykey = Hello World!
\ No newline at end of file
+mykey = Hello World!
[4/4] [text] TEXT-139: Thanks @nickwongwong
Posted by ch...@apache.org.
TEXT-139: Thanks @nickwongwong
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/6872117a
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/6872117a
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/6872117a
Branch: refs/heads/master
Commit: 6872117ae1d33450bf4be694b3a6af6772e006f2
Parents: 9c7dff1
Author: Rob Tompkins <ch...@apache.org>
Authored: Fri Sep 28 12:45:58 2018 -0400
Committer: Rob Tompkins <ch...@apache.org>
Committed: Fri Sep 28 12:45:58 2018 -0400
----------------------------------------------------------------------
pom.xml | 3 +++
src/changes/changes.xml | 1 +
2 files changed, 4 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/6872117a/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index f79cfd3..2b83a03 100644
--- a/pom.xml
+++ b/pom.xml
@@ -360,6 +360,9 @@
<contributor>
<name>Nandor Kollar</name>
</contributor>
+ <contributor>
+ <name>Nick Wong</name>
+ </contributor>
</contributors>
<scm>
http://git-wip-us.apache.org/repos/asf/commons-text/blob/6872117a/src/changes/changes.xml
----------------------------------------------------------------------
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 91d507d..43ead66 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -46,6 +46,7 @@ The <action> type attribute can be add,update,fix,remove.
<body>
<release version="1.5" date="2018-MM-DD" description="Release 1.5">
+ <action issue="TEXT-139" type="fix" dev="chtompki" due-to="Nick Wong">Improve JaccardSimilarity computational cost</action>
<action issue="TEXT-118" type="fix" dev="chtompki" due-to="Nandor Kollar">JSON escaping incorrect for the delete control character</action>
<action issue="TEXT-130" type="fix" dev="chtompki" due-to="Jan Martin Keil">Fixes JaroWinklerDistance: Wrong results due to precision of transpositions</action>
<action issue="TEXT-131" type="fix" dev="chtompki" due-to="Jan Martin Keil">JaroWinklerDistance: Calculation deviates from definition</action>