You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by my...@apache.org on 2018/11/03 07:39:09 UTC
[1/2] incubator-hivemall git commit: Fixed bm25() UDF for help message
Repository: incubator-hivemall
Updated Branches:
refs/heads/master ce70aa482 -> 62a97798b
Fixed bm25() UDF for help message
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/b97af4fe
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/b97af4fe
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/b97af4fe
Branch: refs/heads/master
Commit: b97af4fe0faebcbc78875b4e88fb292eaa576b42
Parents: ce70aa4
Author: Makoto Yui <my...@apache.org>
Authored: Sat Nov 3 16:38:13 2018 +0900
Committer: Makoto Yui <my...@apache.org>
Committed: Sat Nov 3 16:38:13 2018 +0900
----------------------------------------------------------------------
core/src/main/java/hivemall/UDFWithOptions.java | 53 +++++++++++++-------
.../java/hivemall/ftvec/text/OkapiBM25UDF.java | 2 +-
2 files changed, 36 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/b97af4fe/core/src/main/java/hivemall/UDFWithOptions.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/UDFWithOptions.java b/core/src/main/java/hivemall/UDFWithOptions.java
index 04d6fdc..89e7662 100644
--- a/core/src/main/java/hivemall/UDFWithOptions.java
+++ b/core/src/main/java/hivemall/UDFWithOptions.java
@@ -89,29 +89,46 @@ public abstract class UDFWithOptions extends GenericUDF {
CommandLine cl = CommandLineUtils.parseOptions(args, opts);
if (cl.hasOption("help")) {
- Description funcDesc = getClass().getAnnotation(Description.class);
- final String cmdLineSyntax;
- if (funcDesc == null) {
- cmdLineSyntax = getClass().getSimpleName();
- } else {
- String funcName = funcDesc.name();
- cmdLineSyntax = funcName == null ? getClass().getSimpleName()
- : funcDesc.value().replace("_FUNC_", funcDesc.name());
- }
- StringWriter sw = new StringWriter();
- sw.write('\n');
- PrintWriter pw = new PrintWriter(sw);
- HelpFormatter formatter = new HelpFormatter();
- formatter.printHelp(pw, HelpFormatter.DEFAULT_WIDTH, cmdLineSyntax, null, opts,
- HelpFormatter.DEFAULT_LEFT_PAD, HelpFormatter.DEFAULT_DESC_PAD, null, true);
- pw.flush();
- String helpMsg = sw.toString();
- throw new UDFArgumentException(helpMsg);
+ showHelp(opts);
}
return cl;
}
+ protected void showHelp(@Nullable String errMsg) throws UDFArgumentException {
+ showHelp(getOptions(), errMsg); // delegate to the (Options, String) overload with the Options returned by getOptions()
+ }
+
+ private void showHelp(@Nonnull Options opts) throws UDFArgumentException {
+ showHelp(opts, null); // forward the caller-supplied Options (no error message) instead of ignoring them and rebuilding via getOptions()
+ }
+
+ private void showHelp(@Nonnull Options opts, @Nullable String errMsg)
+ throws UDFArgumentException { // builds the usage text (optionally preceded by errMsg) and always throws it
+ Description funcDesc = getClass().getAnnotation(Description.class); // @Description on the runtime class, if present
+ final String cmdLineSyntax;
+ if (funcDesc == null) {
+ cmdLineSyntax = getClass().getSimpleName(); // no annotation: use the simple class name as the usage syntax
+ } else {
+ String funcName = funcDesc.name();
+ cmdLineSyntax = funcName == null ? getClass().getSimpleName() // NOTE(review): annotation elements cannot be null, so this branch looks unreachable -- confirm
+ : funcDesc.value().replace("_FUNC_", funcDesc.name()); // substitute the function name for the _FUNC_ placeholder in the description
+ }
+ StringWriter sw = new StringWriter();
+ sw.write('\n'); // the message starts with a newline
+ if (errMsg != null) { // optional error message goes first, followed by a blank line
+ sw.write(errMsg);
+ sw.write("\n\n");
+ }
+ PrintWriter pw = new PrintWriter(sw); // HelpFormatter output is written through pw into sw
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp(pw, HelpFormatter.DEFAULT_WIDTH, cmdLineSyntax, null, opts,
+ HelpFormatter.DEFAULT_LEFT_PAD, HelpFormatter.DEFAULT_DESC_PAD, null, true); // null header and footer; the trailing true is presumably commons-cli's autoUsage flag -- TODO confirm
+ pw.flush(); // flush before reading the accumulated text from sw
+ String helpMsg = sw.toString();
+ throw new UDFArgumentException(helpMsg); // the help text is delivered as the exception message
+ }
+
/**
* Raise {@link UDFArgumentException} if the given condition is false.
*
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/b97af4fe/core/src/main/java/hivemall/ftvec/text/OkapiBM25UDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/ftvec/text/OkapiBM25UDF.java b/core/src/main/java/hivemall/ftvec/text/OkapiBM25UDF.java
index cd36d6f..acd80bf 100644
--- a/core/src/main/java/hivemall/ftvec/text/OkapiBM25UDF.java
+++ b/core/src/main/java/hivemall/ftvec/text/OkapiBM25UDF.java
@@ -108,7 +108,7 @@ public final class OkapiBM25UDF extends UDFWithOptions {
throws UDFArgumentException {
final int numArgOIs = argOIs.length;
if (numArgOIs < 5) {
- throw new UDFArgumentException("argOIs.length must be greater than or equal to 5");
+ showHelp("#arguments must be greater than or equal to 5: " + numArgOIs);
} else if (numArgOIs == 6) {
String opts = HiveUtils.getConstString(argOIs[5]);
processOptions(opts);
[2/2] incubator-hivemall git commit: Fixed term vector space tutorial
Posted by my...@apache.org.
Fixed term vector space tutorial
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/62a97798
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/62a97798
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/62a97798
Branch: refs/heads/master
Commit: 62a97798bbab688d0f24f5126c755c67209f31af
Parents: b97af4f
Author: Makoto Yui <my...@apache.org>
Authored: Sat Nov 3 16:38:47 2018 +0900
Committer: Makoto Yui <my...@apache.org>
Committed: Sat Nov 3 16:38:47 2018 +0900
----------------------------------------------------------------------
docs/gitbook/SUMMARY.md | 5 +++--
docs/gitbook/ft_engineering/bm25.md | 24 +++++++++++++++++++++++-
docs/gitbook/ft_engineering/term_vector.md | 3 +++
3 files changed, 29 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/62a97798/docs/gitbook/SUMMARY.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md
index 3484bfb..31a0311 100644
--- a/docs/gitbook/SUMMARY.md
+++ b/docs/gitbook/SUMMARY.md
@@ -65,8 +65,9 @@
* [Feature Transformation](ft_engineering/ft_trans.md)
* [Feature vectorization](ft_engineering/vectorization.md)
* [Quantify non-number features](ft_engineering/quantify.md)
-* [TF-IDF Calculation](ft_engineering/tfidf.md)
-* [BM25](ft_engineering/bm25.md)
+* [Term Vector Model](ft_engineering/term_vector.md)
+ * [TF-IDF Term Weighting](ft_engineering/tfidf.md)
+ * [Okapi BM25 Term Weighting](ft_engineering/bm25.md)
## Part IV - Evaluation
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/62a97798/docs/gitbook/ft_engineering/bm25.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/ft_engineering/bm25.md b/docs/gitbook/ft_engineering/bm25.md
index 4ca029f..b70ecfe 100644
--- a/docs/gitbook/ft_engineering/bm25.md
+++ b/docs/gitbook/ft_engineering/bm25.md
@@ -139,7 +139,29 @@ from
;
```
-## Show important terms
+## Hyperparameters
+
+`bm25()`'s function signature and hyperparameters are as follows:
+
+```sql
+hive> select bm25();
+FAILED: SemanticException Line 1:7 Wrong arguments 'bm25':
+
+#arguments must be greater than or equal to 5: 0
+
+usage: bm25(double termFrequency, int docLength, double avgDocLength, int
+ numDocs, int numDocsWithTerm [, const string options]) - Return an
+ Okapi BM25 score in double [-b <arg>] [-d <arg>] [-k1 <arg>]
+ [-min_idf <arg>]
+ -b <arg> Hyperparameter with type double in range 0.0
+ and 1.0 [default: 0.75]
+ -d,--delta <arg> Hyperparameter delta of BM25+ [default: 0.0]
+ -k1 <arg> Hyperparameter with type double, usually in
+ range 1.2 and 2.0 [default: 1.2]
+ -min_idf,--epsilon <arg> Hyperparameter delta of BM25+ [default: 1e-8]
+```
+
+## Show important terms for each document
```sql
select
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/62a97798/docs/gitbook/ft_engineering/term_vector.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/ft_engineering/term_vector.md b/docs/gitbook/ft_engineering/term_vector.md
new file mode 100644
index 0000000..ff8c61f
--- /dev/null
+++ b/docs/gitbook/ft_engineering/term_vector.md
@@ -0,0 +1,3 @@
+Term vector model or [Vector space model](https://en.wikipedia.org/wiki/Vector_space_model) is an algebraic model for representing text documents (and any objects, in general) as vectors of identifiers.
+
+It is used in information filtering, information retrieval, relevancy rankings, and machine learning.