You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by my...@apache.org on 2018/11/03 07:39:09 UTC

[1/2] incubator-hivemall git commit: Fixed bm25() UDF for help message

Repository: incubator-hivemall
Updated Branches:
  refs/heads/master ce70aa482 -> 62a97798b


Fixed bm25() UDF for help message


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/b97af4fe
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/b97af4fe
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/b97af4fe

Branch: refs/heads/master
Commit: b97af4fe0faebcbc78875b4e88fb292eaa576b42
Parents: ce70aa4
Author: Makoto Yui <my...@apache.org>
Authored: Sat Nov 3 16:38:13 2018 +0900
Committer: Makoto Yui <my...@apache.org>
Committed: Sat Nov 3 16:38:13 2018 +0900

----------------------------------------------------------------------
 core/src/main/java/hivemall/UDFWithOptions.java | 53 +++++++++++++-------
 .../java/hivemall/ftvec/text/OkapiBM25UDF.java  |  2 +-
 2 files changed, 36 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/b97af4fe/core/src/main/java/hivemall/UDFWithOptions.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/UDFWithOptions.java b/core/src/main/java/hivemall/UDFWithOptions.java
index 04d6fdc..89e7662 100644
--- a/core/src/main/java/hivemall/UDFWithOptions.java
+++ b/core/src/main/java/hivemall/UDFWithOptions.java
@@ -89,29 +89,46 @@ public abstract class UDFWithOptions extends GenericUDF {
         CommandLine cl = CommandLineUtils.parseOptions(args, opts);
 
         if (cl.hasOption("help")) {
-            Description funcDesc = getClass().getAnnotation(Description.class);
-            final String cmdLineSyntax;
-            if (funcDesc == null) {
-                cmdLineSyntax = getClass().getSimpleName();
-            } else {
-                String funcName = funcDesc.name();
-                cmdLineSyntax = funcName == null ? getClass().getSimpleName()
-                        : funcDesc.value().replace("_FUNC_", funcDesc.name());
-            }
-            StringWriter sw = new StringWriter();
-            sw.write('\n');
-            PrintWriter pw = new PrintWriter(sw);
-            HelpFormatter formatter = new HelpFormatter();
-            formatter.printHelp(pw, HelpFormatter.DEFAULT_WIDTH, cmdLineSyntax, null, opts,
-                HelpFormatter.DEFAULT_LEFT_PAD, HelpFormatter.DEFAULT_DESC_PAD, null, true);
-            pw.flush();
-            String helpMsg = sw.toString();
-            throw new UDFArgumentException(helpMsg);
+            showHelp(opts);
         }
 
         return cl;
     }
 
+    protected void showHelp(@Nullable String errMsg) throws UDFArgumentException {
+        showHelp(getOptions(), errMsg);
+    }
+
+    private void showHelp(@Nonnull Options opts) throws UDFArgumentException {
+        showHelp(getOptions(), null);
+    }
+
+    private void showHelp(@Nonnull Options opts, @Nullable String errMsg)
+            throws UDFArgumentException {
+        Description funcDesc = getClass().getAnnotation(Description.class);
+        final String cmdLineSyntax;
+        if (funcDesc == null) {
+            cmdLineSyntax = getClass().getSimpleName();
+        } else {
+            String funcName = funcDesc.name();
+            cmdLineSyntax = funcName == null ? getClass().getSimpleName()
+                    : funcDesc.value().replace("_FUNC_", funcDesc.name());
+        }
+        StringWriter sw = new StringWriter();
+        sw.write('\n');
+        if (errMsg != null) {
+            sw.write(errMsg);
+            sw.write("\n\n");
+        }
+        PrintWriter pw = new PrintWriter(sw);
+        HelpFormatter formatter = new HelpFormatter();
+        formatter.printHelp(pw, HelpFormatter.DEFAULT_WIDTH, cmdLineSyntax, null, opts,
+            HelpFormatter.DEFAULT_LEFT_PAD, HelpFormatter.DEFAULT_DESC_PAD, null, true);
+        pw.flush();
+        String helpMsg = sw.toString();
+        throw new UDFArgumentException(helpMsg);
+    }
+
     /**
      * Raise {@link UDFArgumentException} if the given condition is false.
      *

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/b97af4fe/core/src/main/java/hivemall/ftvec/text/OkapiBM25UDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/ftvec/text/OkapiBM25UDF.java b/core/src/main/java/hivemall/ftvec/text/OkapiBM25UDF.java
index cd36d6f..acd80bf 100644
--- a/core/src/main/java/hivemall/ftvec/text/OkapiBM25UDF.java
+++ b/core/src/main/java/hivemall/ftvec/text/OkapiBM25UDF.java
@@ -108,7 +108,7 @@ public final class OkapiBM25UDF extends UDFWithOptions {
             throws UDFArgumentException {
         final int numArgOIs = argOIs.length;
         if (numArgOIs < 5) {
-            throw new UDFArgumentException("argOIs.length must be greater than or equal to 5");
+            showHelp("#arguments must be greater than or equal to 5: " + numArgOIs);
         } else if (numArgOIs == 6) {
             String opts = HiveUtils.getConstString(argOIs[5]);
             processOptions(opts);


[2/2] incubator-hivemall git commit: Fixed term vector space tutorial

Posted by my...@apache.org.
Fixed term vector space tutorial


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/62a97798
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/62a97798
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/62a97798

Branch: refs/heads/master
Commit: 62a97798bbab688d0f24f5126c755c67209f31af
Parents: b97af4f
Author: Makoto Yui <my...@apache.org>
Authored: Sat Nov 3 16:38:47 2018 +0900
Committer: Makoto Yui <my...@apache.org>
Committed: Sat Nov 3 16:38:47 2018 +0900

----------------------------------------------------------------------
 docs/gitbook/SUMMARY.md                    |  5 +++--
 docs/gitbook/ft_engineering/bm25.md        | 24 +++++++++++++++++++++++-
 docs/gitbook/ft_engineering/term_vector.md |  3 +++
 3 files changed, 29 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/62a97798/docs/gitbook/SUMMARY.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md
index 3484bfb..31a0311 100644
--- a/docs/gitbook/SUMMARY.md
+++ b/docs/gitbook/SUMMARY.md
@@ -65,8 +65,9 @@
 * [Feature Transformation](ft_engineering/ft_trans.md)
     * [Feature vectorization](ft_engineering/vectorization.md)
     * [Quantify non-number features](ft_engineering/quantify.md)
-* [TF-IDF Calculation](ft_engineering/tfidf.md)
-* [BM25](ft_engineering/bm25.md)
+* [Term Vector Model](ft_engineering/term_vector.md)
+    * [TF-IDF Term Weighting](ft_engineering/tfidf.md)
+    * [Okapi BM25 Term Weighting](ft_engineering/bm25.md)
 
 ## Part IV - Evaluation
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/62a97798/docs/gitbook/ft_engineering/bm25.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/ft_engineering/bm25.md b/docs/gitbook/ft_engineering/bm25.md
index 4ca029f..b70ecfe 100644
--- a/docs/gitbook/ft_engineering/bm25.md
+++ b/docs/gitbook/ft_engineering/bm25.md
@@ -139,7 +139,29 @@ from
 ;
 ```
 
-## Show important terms
+## Hyperparameters
+
+`bm25()`'s function signature and hyperparameters are as follows:
+
+```sql
+hive> select bm25();
+FAILED: SemanticException Line 1:7 Wrong arguments 'bm25':
+
+#arguments must be greater than or equal to 5: 0
+
+usage: bm25(double termFrequency, int docLength, double avgDocLength, int
+       numDocs, int numDocsWithTerm [, const string options]) - Return an
+       Okapi BM25 score in double [-b <arg>] [-d <arg>] [-k1 <arg>]
+       [-min_idf <arg>]
+ -b <arg>                   Hyperparameter with type double in range 0.0
+                            and 1.0 [default: 0.75]
+ -d,--delta <arg>           Hyperparameter delta of BM25+ [default: 0.0]
+ -k1 <arg>                  Hyperparameter with type double, usually in
+                            range 1.2 and 2.0 [default: 1.2]
+ -min_idf,--epsilon <arg>   Hyperparameter delta of BM25+ [default: 1e-8]
+```
+
+## Show important terms for each document
 
 ```sql
 select

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/62a97798/docs/gitbook/ft_engineering/term_vector.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/ft_engineering/term_vector.md b/docs/gitbook/ft_engineering/term_vector.md
new file mode 100644
index 0000000..ff8c61f
--- /dev/null
+++ b/docs/gitbook/ft_engineering/term_vector.md
@@ -0,0 +1,3 @@
+Term vector model or [Vector space model](https://en.wikipedia.org/wiki/Vector_space_model) is an algebraic model for representing text documents (and any objects, in general) as vectors of identifiers.
+
+It is used in information filtering, information retrieval, relevancy rankings, and machine learning.