Posted to commits@hivemall.apache.org by my...@apache.org on 2017/10/16 12:06:27 UTC
[1/4] incubator-hivemall git commit: Close #122: [HIVEMALL-147][Spark] Support all Hivemall functions of v0.5-rc.1 in Spark Dataframe
Repository: incubator-hivemall
Updated Branches:
refs/heads/master fdf702143 -> e8abae257
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1680c42c/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala
----------------------------------------------------------------------
diff --git a/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala b/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala
index 6b5d4cd..de2481c 100644
--- a/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala
+++ b/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala
@@ -31,7 +31,8 @@ import org.apache.spark.sql.types._
import org.apache.spark.test.TestFPWrapper._
import org.apache.spark.test.TestUtils
-final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
+
+class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
test("anomaly") {
import hiveContext.implicits._
@@ -42,61 +43,113 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
}
test("knn.similarity") {
- val df1 = DummyInputData.select(cosine_sim(lit2(Seq(1, 2, 3, 4)), lit2(Seq(3, 4, 5, 6))))
- assert(df1.collect.apply(0).getFloat(0) ~== 0.500f)
+ import hiveContext.implicits._
+
+ val df1 = DummyInputData.select(
+ cosine_similarity(typedLit(Seq(1, 2, 3, 4)), typedLit(Seq(3, 4, 5, 6))))
+ val rows1 = df1.collect
+ assert(rows1.length == 1)
+ assert(rows1(0).getFloat(0) ~== 0.500f)
- val df2 = DummyInputData.select(jaccard(lit(5), lit(6)))
- assert(df2.collect.apply(0).getFloat(0) ~== 0.96875f)
+ val df2 = DummyInputData.select(jaccard_similarity(lit(5), lit(6)))
+ val rows2 = df2.collect
+ assert(rows2.length == 1)
+ assert(rows2(0).getFloat(0) ~== 0.96875f)
- val df3 = DummyInputData.select(angular_similarity(lit2(Seq(1, 2, 3)), lit2(Seq(4, 5, 6))))
- assert(df3.collect.apply(0).getFloat(0) ~== 0.500f)
+ val df3 = DummyInputData.select(
+ angular_similarity(typedLit(Seq(1, 2, 3)), typedLit(Seq(4, 5, 6))))
+ val rows3 = df3.collect
+ assert(rows3.length == 1)
+ assert(rows3(0).getFloat(0) ~== 0.500f)
- val df4 = DummyInputData.select(euclid_similarity(lit2(Seq(5, 3, 1)), lit2(Seq(2, 8, 3))))
- assert(df4.collect.apply(0).getFloat(0) ~== 0.33333334f)
+ val df4 = DummyInputData.select(
+ euclid_similarity(typedLit(Seq(5, 3, 1)), typedLit(Seq(2, 8, 3))))
+ val rows4 = df4.collect
+ assert(rows4.length == 1)
+ assert(rows4(0).getFloat(0) ~== 0.33333334f)
val df5 = DummyInputData.select(distance2similarity(lit(1.0)))
- assert(df5.collect.apply(0).getFloat(0) ~== 0.5f)
+ val rows5 = df5.collect
+ assert(rows5.length == 1)
+ assert(rows5(0).getFloat(0) ~== 0.5f)
+
+ val df6 = Seq((Seq("1:0.3", "4:0.1"), Map(0 -> 0.5))).toDF("a", "b")
+ // TODO: Currently, just check if no exception thrown
+ assert(df6.dimsum_mapper(df6("a"), df6("b")).collect.isEmpty)
}
test("knn.distance") {
val df1 = DummyInputData.select(hamming_distance(lit(1), lit(3)))
- checkAnswer(df1, Row(1) :: Nil)
+ checkAnswer(df1, Row(1))
val df2 = DummyInputData.select(popcnt(lit(1)))
- checkAnswer(df2, Row(1) :: Nil)
-
- val df3 = DummyInputData.select(kld(lit(0.1), lit(0.5), lit(0.2), lit(0.5)))
- assert(df3.collect.apply(0).getDouble(0) ~== 0.01)
-
- val df4 = DummyInputData.select(
- euclid_distance(lit2(Seq("0.1", "0.5")), lit2(Seq("0.2", "0.5"))))
- assert(df4.collect.apply(0).getFloat(0) ~== 1.4142135f)
-
- val df5 = DummyInputData.select(
- cosine_distance(lit2(Seq("0.8", "0.3")), lit2(Seq("0.4", "0.6"))))
- assert(df5.collect.apply(0).getFloat(0) ~== 1.0f)
-
- val df6 = DummyInputData.select(
- angular_distance(lit2(Seq("0.1", "0.1")), lit2(Seq("0.3", "0.8"))))
- assert(df6.collect.apply(0).getFloat(0) ~== 0.50f)
-
- val df7 = DummyInputData.select(
- manhattan_distance(lit2(Seq("0.7", "0.8")), lit2(Seq("0.5", "0.6"))))
- assert(df7.collect.apply(0).getFloat(0) ~== 4.0f)
-
- val df8 = DummyInputData.select(
- minkowski_distance(lit2(Seq("0.1", "0.2")), lit2(Seq("0.2", "0.2")), lit2(1.0)))
- assert(df8.collect.apply(0).getFloat(0) ~== 2.0f)
+ checkAnswer(df2, Row(1))
+
+ val rows3 = DummyInputData.select(kld(lit(0.1), lit(0.5), lit(0.2), lit(0.5))).collect
+ assert(rows3.length === 1)
+ assert(rows3(0).getDouble(0) ~== 0.01)
+
+ val rows4 = DummyInputData.select(
+ euclid_distance(typedLit(Seq("0.1", "0.5")), typedLit(Seq("0.2", "0.5")))).collect
+ assert(rows4.length === 1)
+ assert(rows4(0).getFloat(0) ~== 1.4142135f)
+
+ val rows5 = DummyInputData.select(
+ cosine_distance(typedLit(Seq("0.8", "0.3")), typedLit(Seq("0.4", "0.6")))).collect
+ assert(rows5.length === 1)
+ assert(rows5(0).getFloat(0) ~== 1.0f)
+
+ val rows6 = DummyInputData.select(
+ angular_distance(typedLit(Seq("0.1", "0.1")), typedLit(Seq("0.3", "0.8")))).collect
+ assert(rows6.length === 1)
+ assert(rows6(0).getFloat(0) ~== 0.50f)
+
+ val rows7 = DummyInputData.select(
+ manhattan_distance(typedLit(Seq("0.7", "0.8")), typedLit(Seq("0.5", "0.6")))).collect
+ assert(rows7.length === 1)
+ assert(rows7(0).getFloat(0) ~== 4.0f)
+
+ val rows8 = DummyInputData.select(
+ minkowski_distance(typedLit(Seq("0.1", "0.2")), typedLit(Seq("0.2", "0.2")), typedLit(1.0))
+ ).collect
+ assert(rows8.length === 1)
+ assert(rows8(0).getFloat(0) ~== 2.0f)
+
+ val rows9 = DummyInputData.select(
+ jaccard_distance(typedLit(Seq("0.3", "0.8")), typedLit(Seq("0.1", "0.2")))).collect
+ assert(rows9.length === 1)
+ assert(rows9(0).getFloat(0) ~== 1.0f)
}
test("knn.lsh") {
import hiveContext.implicits._
- assert(IntList2Data.minhash(lit(1), $"target").count() > 0)
-
- assert(DummyInputData.select(bbit_minhash(lit2(Seq("1:0.1", "2:0.5")), lit(false))).count
- == DummyInputData.count)
- assert(DummyInputData.select(minhashes(lit2(Seq("1:0.1", "2:0.5")), lit(false))).count
- == DummyInputData.count)
+ checkAnswer(
+ IntList2Data.minhash(lit(1), $"target"),
+ Row(1016022700, 1) ::
+ Row(1264890450, 1) ::
+ Row(1304330069, 1) ::
+ Row(1321870696, 1) ::
+ Row(1492709716, 1) ::
+ Row(1511363108, 1) ::
+ Row(1601347428, 1) ::
+ Row(1974434012, 1) ::
+ Row(2022223284, 1) ::
+ Row(326269457, 1) ::
+ Row(50559334, 1) ::
+ Row(716040854, 1) ::
+ Row(759249519, 1) ::
+ Row(809187771, 1) ::
+ Row(900899651, 1) ::
+ Nil
+ )
+ checkAnswer(
+ DummyInputData.select(bbit_minhash(typedLit(Seq("1:0.1", "2:0.5")), lit(false))),
+ Row("31175986876675838064867796245644543067")
+ )
+ checkAnswer(
+ DummyInputData.select(minhashes(typedLit(Seq("1:0.1", "2:0.5")), lit(false))),
+ Row(Seq(1571683640, 987207869, 370931990, 988455638, 846963275))
+ )
}
test("ftvec - add_bias") {
@@ -111,12 +164,13 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
test("ftvec - extract_feature") {
val df = DummyInputData.select(extract_feature(lit("1:0.8")))
- checkAnswer(df, Row("1") :: Nil)
+ checkAnswer(df, Row("1"))
}
test("ftvec - extract_weight") {
- val df = DummyInputData.select(extract_weight(lit("3:0.1")))
- assert(df.collect.apply(0).getDouble(0) ~== 0.1)
+ val rows = DummyInputData.select(extract_weight(lit("3:0.1"))).collect
+ assert(rows.length === 1)
+ assert(rows(0).getDouble(0) ~== 0.1)
}
test("ftvec - explode_array") {
@@ -162,26 +216,36 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
}
test("ftvec.hash") {
- assert(DummyInputData.select(mhash(lit("test"))).count == DummyInputData.count)
- assert(DummyInputData.select(org.apache.spark.sql.hive.HivemallOps.sha1(lit("test"))).count ==
- DummyInputData.count)
- // TODO: The tests below failed because:
- // org.apache.spark.sql.AnalysisException: List type in java is unsupported because JVM type
- // erasure makes spark fail to catch a component type in List<>;
- //
- // assert(DummyInputData.select(array_hash_values(lit2(Seq("aaa", "bbb")))).count
- // == DummyInputData.count)
- // assert(DummyInputData.select(
- // prefixed_hash_values(lit2(Seq("ccc", "ddd")), lit("prefix"))).count
- // == DummyInputData.count)
+ checkAnswer(DummyInputData.select(mhash(lit("test"))), Row(4948445))
+ checkAnswer(DummyInputData.select(HivemallOps.sha1(lit("test"))), Row(12184508))
+ checkAnswer(DummyInputData.select(feature_hashing(typedLit(Seq("1:0.1", "3:0.5")))),
+ Row(Seq("11293631:0.1", "4331412:0.5")))
+ checkAnswer(DummyInputData.select(array_hash_values(typedLit(Seq("aaa", "bbb")))),
+ Row(Seq(4063537, 8459207)))
+ checkAnswer(DummyInputData.select(
+ prefixed_hash_values(typedLit(Seq("ccc", "ddd")), lit("prefix"))),
+ Row(Seq("prefix7873825", "prefix8965544")))
+ }
+
+ test("ftvec.parting") {
+ checkAnswer(DummyInputData.select(polynomial_features(typedLit(Seq("2:0.4", "6:0.1")), lit(2))),
+ Row(Seq("2:0.4", "2^2:0.16000001", "2^6:0.040000003", "6:0.1", "6^6:0.010000001")))
+ checkAnswer(DummyInputData.select(powered_features(typedLit(Seq("4:0.8", "5:0.2")), lit(2))),
+ Row(Seq("4:0.8", "4^2:0.64000005", "5:0.2", "5^2:0.040000003")))
}
test("ftvec.scaling") {
- val df1 = TinyTrainData.select(rescale(lit(2.0f), lit(1.0), lit(5.0)))
- assert(df1.collect.apply(0).getFloat(0) === 0.25f)
- val df2 = TinyTrainData.select(zscore(lit(1.0f), lit(0.5), lit(0.5)))
- assert(df2.collect.apply(0).getFloat(0) === 1.0f)
- val df3 = TinyTrainData.select(normalize(TinyTrainData.col("features")))
+ val rows1 = TinyTrainData.select(rescale(lit(2.0f), lit(1.0), lit(5.0))).collect
+ assert(rows1.length === 3)
+ assert(rows1(0).getFloat(0) ~== 0.25f)
+ assert(rows1(1).getFloat(0) ~== 0.25f)
+ assert(rows1(2).getFloat(0) ~== 0.25f)
+ val rows2 = TinyTrainData.select(zscore(lit(1.0f), lit(0.5), lit(0.5))).collect
+ assert(rows2.length === 3)
+ assert(rows2(0).getFloat(0) ~== 1.0f)
+ assert(rows2(1).getFloat(0) ~== 1.0f)
+ assert(rows2(2).getFloat(0) ~== 1.0f)
+ val df3 = TinyTrainData.select(l2_normalize(TinyTrainData.col("features")))
checkAnswer(
df3,
Row(Seq("1:0.9701425", "2:0.24253562")) ::
@@ -205,10 +269,10 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
Seq(292.1666753739119, 152.70000455081467, 187.93333893418327, 59.93333511948589)))
.toDF("arg0", "arg1")
- val result = df.select(chi2(df("arg0"), df("arg1"))).collect
- assert(result.length == 1)
- val chi2Val = result.head.getAs[Row](0).getAs[Seq[Double]](0)
- val pVal = result.head.getAs[Row](0).getAs[Seq[Double]](1)
+ val rows = df.select(chi2(df("arg0"), df("arg1"))).collect
+ assert(rows.length == 1)
+ val chi2Val = rows.head.getAs[Row](0).getAs[Seq[Double]](0)
+ val pVal = rows.head.getAs[Row](0).getAs[Seq[Double]](1)
(chi2Val, Seq(10.81782088, 3.59449902, 116.16984746, 67.24482759))
.zipped
@@ -240,58 +304,260 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
// assert(TinyTrainData.rand_amplify(lit(3), lit("-buf 8", $"label", $"features")).count() == 9)
}
- ignore("ftvec.conv") {
+ test("ftvec.conv") {
import hiveContext.implicits._
- val df1 = Seq((0.0, "1:0.1" :: "3:0.3" :: Nil), (1, 0, "2:0.2" :: Nil)).toDF("a", "b")
checkAnswer(
- df1.select(to_dense_features(df1("b"), lit(3))),
- Row(Array(0.1f, 0.0f, 0.3f)) :: Row(Array(0.0f, 0.2f, 0.0f)) :: Nil
+ DummyInputData.select(to_dense_features(typedLit(Seq("0:0.1", "1:0.3")), lit(1))),
+ Row(Array(0.1f, 0.3f))
+ )
+ checkAnswer(
+ DummyInputData.select(to_sparse_features(typedLit(Seq(0.1f, 0.2f, 0.3f)))),
+ Row(Seq("0:0.1", "1:0.2", "2:0.3"))
)
- val df2 = Seq((0.1, 0.2, 0.3), (0.2, 0.5, 0.4)).toDF("a", "b", "c")
checkAnswer(
- df2.select(to_sparse_features(df2("a"), df2("b"), df2("c"))),
- Row(Seq("1:0.1", "2:0.2", "3:0.3")) :: Row(Seq("1:0.2", "2:0.5", "3:0.4")) :: Nil
+ DummyInputData.select(feature_binning(typedLit(Seq("1")), typedLit(Map("1" -> Seq(0, 3))))),
+ Row(Seq("1"))
)
}
test("ftvec.trans") {
import hiveContext.implicits._
+ checkAnswer(
+ DummyInputData.select(vectorize_features(typedLit(Seq("a", "b")), lit(0.1f), lit(0.2f))),
+ Row(Seq("a:0.1", "b:0.2"))
+ )
+ checkAnswer(
+ DummyInputData.select(categorical_features(typedLit(Seq("a", "b")), lit("c11"), lit("c12"))),
+ Row(Seq("a#c11", "b#c12"))
+ )
+ checkAnswer(
+ DummyInputData.select(indexed_features(lit(0.1), lit(0.2), lit(0.3))),
+ Row(Seq("1:0.1", "2:0.2", "3:0.3"))
+ )
+ checkAnswer(
+ DummyInputData.select(quantitative_features(typedLit(Seq("a", "b")), lit(0.1), lit(0.2))),
+ Row(Seq("a:0.1", "b:0.2"))
+ )
+ checkAnswer(
+ DummyInputData.select(ffm_features(typedLit(Seq("1", "2")), lit(0.5), lit(0.2))),
+ Row(Seq("190:140405:1", "111:1058718:1"))
+ )
+ checkAnswer(
+ DummyInputData.select(add_field_indicies(typedLit(Seq("0.5", "0.1")))),
+ Row(Seq("1:0.5", "2:0.1"))
+ )
+
val df1 = Seq((1, -3, 1), (2, -2, 1)).toDF("a", "b", "c")
checkAnswer(
df1.binarize_label($"a", $"b", $"c"),
Row(1, 1) :: Row(1, 1) :: Row(1, 1) :: Nil
)
+ val df2 = Seq(("xxx", "yyy", 0), ("zzz", "yyy", 1)).toDF("a", "b", "c").coalesce(1)
+ checkAnswer(
+ df2.quantified_features(lit(true), df2("a"), df2("b"), df2("c")),
+ Row(Seq(0.0, 0.0, 0.0)) :: Row(Seq(1.0, 0.0, 1.0)) :: Nil
+ )
+ }
+
+ test("ftvec.ranking") {
+ import hiveContext.implicits._
+
+ val df1 = Seq((1, 0 :: 3 :: 4 :: Nil), (2, 8 :: 9 :: Nil)).toDF("a", "b").coalesce(1)
+ checkAnswer(
+ df1.bpr_sampling($"a", $"b"),
+ Row(1, 0, 7) ::
+ Row(1, 3, 6) ::
+ Row(2, 8, 0) ::
+ Row(2, 8, 4) ::
+ Row(2, 9, 7) ::
+ Nil
+ )
+ val df2 = Seq(1 :: 8 :: 9 :: Nil, 0 :: 3 :: Nil).toDF("a").coalesce(1)
+ checkAnswer(
+ df2.item_pairs_sampling($"a", lit(3)),
+ Row(0, 1) ::
+ Row(1, 0) ::
+ Row(3, 2) ::
+ Nil
+ )
+ val df3 = Seq(3 :: 5 :: Nil, 0 :: Nil).toDF("a").coalesce(1)
+ checkAnswer(
+ df3.populate_not_in($"a", lit(1)),
+ Row(0) ::
+ Row(1) ::
+ Row(1) ::
+ Nil
+ )
+ }
- val df2 = Seq((0.1f, 0.2f), (0.5f, 0.3f)).toDF("a", "b")
+ test("tools") {
+ // checkAnswer(
+ // DummyInputData.select(convert_label(lit(5))),
+ // Nil
+ // )
checkAnswer(
- df2.select(vectorize_features(lit2(Seq("a", "b")), df2("a"), df2("b"))),
- Row(Seq("a:0.1", "b:0.2")) :: Row(Seq("a:0.5", "b:0.3")) :: Nil
+ DummyInputData.select(x_rank(lit("abc"))),
+ Row(1)
)
+ }
- val df3 = Seq(("c11", "c12"), ("c21", "c22")).toDF("a", "b")
+ test("tools.array") {
+ checkAnswer(
+ DummyInputData.select(float_array(lit(3))),
+ Row(Seq())
+ )
+ checkAnswer(
+ DummyInputData.select(array_remove(typedLit(Seq(1, 2, 3)), lit(2))),
+ Row(Seq(1, 3))
+ )
+ checkAnswer(
+ DummyInputData.select(sort_and_uniq_array(typedLit(Seq(2, 1, 3, 1)))),
+ Row(Seq(1, 2, 3))
+ )
+ checkAnswer(
+ DummyInputData.select(subarray_endwith(typedLit(Seq(1, 2, 3, 4, 5)), lit(4))),
+ Row(Seq(1, 2, 3, 4))
+ )
+ checkAnswer(
+ DummyInputData.select(
+ array_concat(typedLit(Seq(1, 2)), typedLit(Seq(3)), typedLit(Seq(4, 5)))),
+ Row(Seq(1, 2, 3, 4, 5))
+ )
checkAnswer(
- df3.select(categorical_features(lit2(Seq("a", "b")), df3("a"), df3("b"))),
- Row(Seq("a#c11", "b#c12")) :: Row(Seq("a#c21", "b#c22")) :: Nil
+ DummyInputData.select(subarray(typedLit(Seq(1, 2, 3, 4, 5)), lit(2), lit(4))),
+ Row(Seq(3, 4))
)
+ checkAnswer(
+ DummyInputData.select(to_string_array(typedLit(Seq(1, 2, 3, 4, 5)))),
+ Row(Seq("1", "2", "3", "4", "5"))
+ )
+ checkAnswer(
+ DummyInputData.select(array_intersect(typedLit(Seq(1, 2, 3)), typedLit(Seq(2, 3, 4)))),
+ Row(Seq(2, 3))
+ )
+ }
+
+ test("tools.array - select_k_best") {
+ import hiveContext.implicits._
+
+ val data = Seq(Seq(0, 1, 3), Seq(2, 4, 1), Seq(5, 4, 9))
+ val df = data.map(d => (d, Seq(3, 1, 2))).toDF("features", "importance_list")
+ val k = 2
- val df4 = Seq((0.1, 0.2, 0.3), (0.2, 0.5, 0.4)).toDF("a", "b", "c")
checkAnswer(
- df4.select(indexed_features(df4("a"), df4("b"), df4("c"))),
- Row(Seq("1:0.1", "2:0.2", "3:0.3")) :: Row(Seq("1:0.2", "2:0.5", "3:0.4")) :: Nil
+ df.select(select_k_best(df("features"), df("importance_list"), lit(k))),
+ Row(Seq(0.0, 3.0)) :: Row(Seq(2.0, 1.0)) :: Row(Seq(5.0, 9.0)) :: Nil
)
+ }
- val df5 = Seq(("xxx", "yyy", 0), ("zzz", "yyy", 1)).toDF("a", "b", "c").coalesce(1)
+ test("tools.bits") {
checkAnswer(
- df5.quantified_features(lit(true), df5("a"), df5("b"), df5("c")),
- Row(Seq(0.0, 0.0, 0.0)) :: Row(Seq(1.0, 0.0, 1.0)) :: Nil
+ DummyInputData.select(to_bits(typedLit(Seq(1, 3, 9)))),
+ Row(Seq(522L))
)
+ checkAnswer(
+ DummyInputData.select(unbits(typedLit(Seq(1L, 3L)))),
+ Row(Seq(0L, 64L, 65L))
+ )
+ checkAnswer(
+ DummyInputData.select(bits_or(typedLit(Seq(1L, 3L)), typedLit(Seq(8L, 23L)))),
+ Row(Seq(9L, 23L))
+ )
+ }
- val df6 = Seq((0.1, 0.2), (0.5, 0.3)).toDF("a", "b")
+ test("tools.compress") {
checkAnswer(
- df6.select(quantitative_features(lit2(Seq("a", "b")), df6("a"), df6("b"))),
- Row(Seq("a:0.1", "b:0.2")) :: Row(Seq("a:0.5", "b:0.3")) :: Nil
+ DummyInputData.select(inflate(deflate(lit("input text")))),
+ Row("input text")
+ )
+ }
+
+ test("tools.map") {
+ val rows = DummyInputData.select(
+ map_get_sum(typedLit(Map(1 -> 0.2f, 2 -> 0.5f, 4 -> 0.8f)), typedLit(Seq(1, 4)))
+ ).collect
+ assert(rows.length === 1)
+ assert(rows(0).getDouble(0) ~== 1.0f)
+
+ checkAnswer(
+ DummyInputData.select(map_tail_n(typedLit(Map(1 -> 2, 2 -> 5)), lit(1))),
+ Row(Map(2 -> 5))
+ )
+ }
+
+ test("tools.text") {
+ checkAnswer(
+ DummyInputData.select(tokenize(lit("This is a pen"))),
+ Row("This" :: "is" :: "a" :: "pen" :: Nil)
+ )
+ checkAnswer(
+ DummyInputData.select(is_stopword(lit("because"))),
+ Row(true)
+ )
+ checkAnswer(
+ DummyInputData.select(singularize(lit("between"))),
+ Row("between")
+ )
+ checkAnswer(
+ DummyInputData.select(split_words(lit("Hello, world"))),
+ Row("Hello," :: "world" :: Nil)
+ )
+ checkAnswer(
+ DummyInputData.select(normalize_unicode(lit("abcdefg"))),
+ Row("abcdefg")
+ )
+ checkAnswer(
+ DummyInputData.select(base91(typedLit("input text".getBytes))),
+ Row("xojg[@TX;R..B")
+ )
+ checkAnswer(
+ DummyInputData.select(unbase91(lit("XXXX"))),
+ Row(68 :: -120 :: 8 :: Nil)
+ )
+ checkAnswer(
+ DummyInputData.select(word_ngrams(typedLit("abcd" :: "efg" :: "hij" :: Nil), lit(2), lit(2))),
+ Row("abcd efg" :: "efg hij" :: Nil)
+ )
+ }
+
+ test("tools - generated_series") {
+ checkAnswer(
+ DummyInputData.generate_series(lit(0), lit(3)),
+ Row(0) :: Row(1) :: Row(2) :: Row(3) :: Nil
+ )
+ }
+
+ test("geospatial") {
+ val rows1 = DummyInputData.select(tilex2lon(lit(1), lit(6))).collect
+ assert(rows1.length === 1)
+ assert(rows1(0).getDouble(0) ~== -174.375)
+
+ val rows2 = DummyInputData.select(tiley2lat(lit(1), lit(3))).collect
+ assert(rows2.length === 1)
+ assert(rows2(0).getDouble(0) ~== 79.17133464081945)
+
+ val rows3 = DummyInputData.select(
+ haversine_distance(lit(0.3), lit(0.1), lit(0.4), lit(0.1))).collect
+ assert(rows3.length === 1)
+ assert(rows3(0).getDouble(0) ~== 11.119492664455878)
+
+ checkAnswer(
+ DummyInputData.select(tile(lit(0.1), lit(0.8), lit(3))),
+ Row(28)
+ )
+ checkAnswer(
+ DummyInputData.select(map_url(lit(0.1), lit(0.8), lit(3))),
+ Row("http://tile.openstreetmap.org/3/4/3.png")
+ )
+ checkAnswer(
+ DummyInputData.select(lat2tiley(lit(0.3), lit(3))),
+ Row(3)
+ )
+ checkAnswer(
+ DummyInputData.select(lon2tilex(lit(0.4), lit(2))),
+ Row(2)
)
}
@@ -494,7 +760,7 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
val schema = new StructType().add("a", IntegerType).add("b", StringType)
checkAnswer(
df.select(from_csv($"value", schema)),
- Row(Row(1, "abc")) :: Nil)
+ Row(Row(1, "abc")))
}
test("misc - to_csv") {
@@ -522,7 +788,7 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
val model = Seq((0.0, 0.1 :: 0.1 :: Nil), (1.0, 0.2 :: 0.3 :: 0.2 :: Nil))
.toDF("label", "features")
- .train_randomforest_regr($"features", $"label")
+ .train_randomforest_regressor($"features", $"label")
val testData = Seq((0.0, 0.1 :: 0.0 :: Nil), (1.0, 0.3 :: 0.5 :: 0.4 :: Nil))
.toDF("label", "features")
@@ -542,22 +808,11 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
checkAnswer(predicted, Seq(Row(0), Row(1)))
}
- test("tools.array - select_k_best") {
- import hiveContext.implicits._
-
- val data = Seq(Seq(0, 1, 3), Seq(2, 4, 1), Seq(5, 4, 9))
- val df = data.map(d => (d, Seq(3, 1, 2))).toDF("features", "importance_list")
- val k = 2
-
- checkAnswer(
- df.select(select_k_best(df("features"), df("importance_list"), lit(k))),
- Row(Seq(0.0, 3.0)) :: Row(Seq(2.0, 1.0)) :: Row(Seq(5.0, 9.0)) :: Nil
- )
- }
-
test("misc - sigmoid") {
import hiveContext.implicits._
- assert(DummyInputData.select(sigmoid($"c0")).collect.apply(0).getDouble(0) ~== 0.500)
+ val rows = DummyInputData.select(sigmoid($"c0")).collect
+ assert(rows.length === 1)
+ assert(rows(0).getDouble(0) ~== 0.500)
}
test("misc - lr_datagen") {
@@ -567,16 +822,18 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
test("invoke regression functions") {
import hiveContext.implicits._
Seq(
- "train_adadelta",
- "train_adagrad",
+ "train_regressor",
+ "train_adadelta_regr",
+ "train_adagrad_regr",
"train_arow_regr",
"train_arowe_regr",
"train_arowe2_regr",
- "train_logregr",
+ "train_logistic_regr",
"train_pa1_regr",
"train_pa1a_regr",
"train_pa2_regr",
"train_pa2a_regr"
+ // "train_randomforest_regressor"
).map { func =>
TestUtils.invokeFunc(new HivemallOps(TinyTrainData), func, Seq($"features", $"label"))
.foreach(_ => {}) // Just call it
@@ -586,6 +843,7 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
test("invoke classifier functions") {
import hiveContext.implicits._
Seq(
+ "train_classifier",
"train_perceptron",
"train_pa",
"train_pa1",
@@ -596,6 +854,7 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
"train_scw",
"train_scw2",
"train_adagrad_rda"
+ // "train_randomforest_classifier"
).map { func =>
TestUtils.invokeFunc(new HivemallOps(TinyTrainData), func, Seq($"features", $"label"))
.foreach(_ => {}) // Just call it
@@ -611,6 +870,7 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
"train_multiclass_pa2",
"train_multiclass_cw",
"train_multiclass_arow",
+ "train_multiclass_arowh",
"train_multiclass_scw",
"train_multiclass_scw2"
).map { func =>
@@ -628,7 +888,7 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
(Array(0.3, 0.1, 0.2), 0),
(Array(0.3, 0.1, 0.2), 0)).toDF("features", "label")
Seq(
- "train_randomforest_regr",
+ "train_randomforest_regressor",
"train_randomforest_classifier"
).map { func =>
TestUtils.invokeFunc(new HivemallOps(testDf.coalesce(1)), func, Seq($"features", $"label"))
@@ -636,6 +896,27 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
}
}
+ test("invoke recommend functions") {
+ import hiveContext.implicits._
+ val df = Seq((1, Map(1 -> 0.3), Map(2 -> Map(4 -> 0.1)), 0, Map(3 -> 0.5)))
+ .toDF("i", "r_i", "topKRatesOfI", "j", "r_j")
+ // Just call it
+ df.train_slim($"i", $"r_i", $"topKRatesOfI", $"j", $"r_j").collect
+
+ }
+
+ ignore("invoke topicmodel functions") {
+ import hiveContext.implicits._
+ val testDf = Seq(Seq("abcd", "'efghij", "klmn")).toDF("words")
+ Seq(
+ "train_lda",
+ "train_plsa"
+ ).map { func =>
+ TestUtils.invokeFunc(new HivemallOps(testDf.coalesce(1)), func, Seq($"words"))
+ .foreach(_ => {}) // Just call it
+ }
+ }
+
protected def checkRegrPrecision(func: String): Unit = {
import hiveContext.implicits._
@@ -730,12 +1011,12 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
ignore("check regression precision") {
Seq(
- "train_adadelta",
- "train_adagrad",
+ "train_adadelta_regr",
+ "train_adagrad_regr",
"train_arow_regr",
"train_arowe_regr",
"train_arowe2_regr",
- "train_logregr",
+ "train_logistic_regr",
"train_pa1_regr",
"train_pa1a_regr",
"train_pa2_regr",
@@ -762,61 +1043,212 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
}
}
- test("user-defined aggregators for ensembles") {
+ test("aggregations for classifiers") {
+ import hiveContext.implicits._
+ val df1 = Seq((1, 0.1, 0.1, 0.2f, 0.2f, 0.2f, 0.2f))
+ .toDF("key", "xh", "xk", "w0", "w1", "w2", "w3")
+ val row1 = df1.groupBy($"key").kpa_predict("xh", "xk", "w0", "w1", "w2", "w3").collect
+ assert(row1.length === 1)
+ assert(row1(0).getDouble(1) ~== 0.002000000029802)
+ }
+
+ test("aggregations for ensembles") {
import hiveContext.implicits._
- val df1 = Seq((1, 0.1f), (1, 0.2f), (2, 0.1f)).toDF("c0", "c1")
- val row1 = df1.groupBy($"c0").voted_avg("c1").collect
- assert(row1(0).getDouble(1) ~== 0.15)
- assert(row1(1).getDouble(1) ~== 0.10)
+ val df1 = Seq((1, 0.1), (1, 0.2), (2, 0.1)).toDF("c0", "c1")
+ val rows1 = df1.groupBy($"c0").voted_avg("c1").collect
+ assert(rows1.length === 2)
+ assert(rows1(0).getDouble(1) ~== 0.15)
+ assert(rows1(1).getDouble(1) ~== 0.10)
- val df3 = Seq((1, 0.2f), (1, 0.8f), (2, 0.3f)).toDF("c0", "c1")
- val row3 = df3.groupBy($"c0").weight_voted_avg("c1").collect
- assert(row3(0).getDouble(1) ~== 0.50)
- assert(row3(1).getDouble(1) ~== 0.30)
+ val df3 = Seq((1, 0.2), (1, 0.8), (2, 0.3)).toDF("c0", "c1")
+ val rows3 = df3.groupBy($"c0").weight_voted_avg("c1").collect
+ assert(rows3.length === 2)
+ assert(rows3(0).getDouble(1) ~== 0.50)
+ assert(rows3(1).getDouble(1) ~== 0.30)
val df5 = Seq((1, 0.2f, 0.1f), (1, 0.4f, 0.2f), (2, 0.8f, 0.9f)).toDF("c0", "c1", "c2")
- val row5 = df5.groupBy($"c0").argmin_kld("c1", "c2").collect
- assert(row5(0).getFloat(1) ~== 0.266666666)
- assert(row5(1).getFloat(1) ~== 0.80)
+ val rows5 = df5.groupBy($"c0").argmin_kld("c1", "c2").collect
+ assert(rows5.length === 2)
+ assert(rows5(0).getFloat(1) ~== 0.266666666)
+ assert(rows5(1).getFloat(1) ~== 0.80)
+
+ val df6 = Seq((1, "id-0", 0.2), (1, "id-1", 0.4), (1, "id-2", 0.1)).toDF("c0", "c1", "c2")
+ val rows6 = df6.groupBy($"c0").max_label("c2", "c1").collect
+ assert(rows6.length === 1)
+ assert(rows6(0).getString(1) == "id-1")
+
+ val df7 = Seq((1, "id-0", 0.5), (1, "id-1", 0.1), (1, "id-2", 0.2)).toDF("c0", "c1", "c2")
+ val rows7 = df7.groupBy($"c0").maxrow("c2", "c1").toDF("c0", "c1").select($"c1.col1").collect
+ assert(rows7.length === 1)
+ assert(rows7(0).getString(0) == "id-0")
+
+ val df8 = Seq((1, 1), (1, 2), (2, 1), (1, 5)).toDF("c0", "c1")
+ val rows8 = df8.groupBy($"c0").rf_ensemble("c1").toDF("c0", "c1")
+ .select("c1.probability").collect
+ assert(rows8.length === 2)
+ assert(rows8(0).getDouble(0) ~== 0.3333333333)
+ assert(rows8(1).getDouble(0) ~== 1.0)
+ }
+
+ test("aggregations for evaluation") {
+ import hiveContext.implicits._
- val df6 = Seq((1, "id-0", 0.2f), (1, "id-1", 0.4f), (1, "id-2", 0.1f)).toDF("c0", "c1", "c2")
- val row6 = df6.groupBy($"c0").max_label("c2", "c1").collect
- assert(row6(0).getString(1) == "id-1")
+ val testDf1 = Seq((1, 1.0, 0.5), (1, 0.3, 0.5), (1, 0.1, 0.2)).toDF("c0", "c1", "c2")
+ val rows1 = testDf1.groupBy($"c0").mae("c1", "c2").collect
+ assert(rows1.length === 1)
+ assert(rows1(0).getDouble(1) ~== 0.26666666)
+ val rows2 = testDf1.groupBy($"c0").mse("c1", "c2").collect
+ assert(rows2.length === 1)
+ assert(rows2(0).getDouble(1) ~== 0.1)
+ val rows3 = testDf1.groupBy($"c0").rmse("c1", "c2").collect
+ assert(rows3.length === 1)
+ assert(rows3(0).getDouble(1) ~== 0.31622776601683794)
+ val rows4 = testDf1.groupBy($"c0").r2("c1", "c2").collect
+ assert(rows4.length === 1)
+ assert(rows4(0).getDouble(1) ~== -4.0)
+ val rows5 = testDf1.groupBy($"c0").logloss("c1", "c2").collect
+ assert(rows5.length === 1)
+ assert(rows5(0).getDouble(1) ~== 6.198305767142615)
+
+ val testDf2 = Seq((1, Array(1, 2), Array(2, 3)), (1, Array(3, 8), Array(5, 4)))
+ .toDF("c0", "c1", "c2")
+ val rows6 = testDf2.groupBy($"c0").ndcg("c1", "c2").collect
+ assert(rows6.length === 1)
+ assert(rows6(0).getDouble(1) ~== 0.19342640361727081)
+ val rows7 = testDf2.groupBy($"c0").precision_at("c1", "c2").collect
+ assert(rows7.length === 1)
+ assert(rows7(0).getDouble(1) ~== 0.25)
+ val rows8 = testDf2.groupBy($"c0").recall_at("c1", "c2").collect
+ assert(rows8.length === 1)
+ assert(rows8(0).getDouble(1) ~== 0.25)
+ val rows9 = testDf2.groupBy($"c0").hitrate("c1", "c2").collect
+ assert(rows9.length === 1)
+ assert(rows9(0).getDouble(1) ~== 0.50)
+ val rows10 = testDf2.groupBy($"c0").mrr("c1", "c2").collect
+ assert(rows10.length === 1)
+ assert(rows10(0).getDouble(1) ~== 0.25)
+ val rows11 = testDf2.groupBy($"c0").average_precision("c1", "c2").collect
+ assert(rows11.length === 1)
+ assert(rows11(0).getDouble(1) ~== 0.25)
+ val rows12 = testDf2.groupBy($"c0").auc("c1", "c2").collect
+ assert(rows12.length === 1)
+ assert(rows12(0).getDouble(1) ~== 0.25)
+ }
- val df7 = Seq((1, "id-0", 0.5f), (1, "id-1", 0.1f), (1, "id-2", 0.2f)).toDF("c0", "c1", "c2")
- val row7 = df7.groupBy($"c0").maxrow("c2", "c1").toDF("c0", "c1").select($"c1.col1").collect
- assert(row7(0).getString(0) == "id-0")
+ test("aggregations for topicmodel") {
+ import hiveContext.implicits._
- // val df8 = Seq((1, 1), (1, 2), (2, 1), (1, 5)).toDF("c0", "c1")
- // val row8 = df8.groupBy($"c0").rf_ensemble("c1").toDF("c0", "c1")
- // .select("c1.probability").collect
- // assert(row8(0).getDouble(0) ~== 0.3333333333)
- // assert(row8(1).getDouble(0) ~== 1.0)
+ val testDf = Seq((1, "abcd", 0.1, 0, 0.1), (1, "efgh", 0.2, 0, 0.1))
+ .toDF("key", "word", "value", "label", "lambda")
+ val rows1 = testDf.groupBy($"key").lda_predict("word", "value", "label", "lambda").collect
+ assert(rows1.length === 1)
+ val result1 = rows1(0).getSeq[Row](1).map { case Row(label: Int, prob: Float) => label -> prob }
+ .toMap[Int, Float]
+ assert(result1.size === 10)
+ assert(result1(0) ~== 0.07692449)
+ assert(result1(1) ~== 0.07701121)
+ assert(result1(2) ~== 0.07701129)
+ assert(result1(3) ~== 0.07705542)
+ assert(result1(4) ~== 0.07701511)
+ assert(result1(5) ~== 0.07701234)
+ assert(result1(6) ~== 0.07701384)
+ assert(result1(7) ~== 0.30693996)
+ assert(result1(8) ~== 0.07700701)
+ assert(result1(9) ~== 0.07700934)
+
+ val rows2 = testDf.groupBy($"key").plsa_predict("word", "value", "label", "lambda").collect
+ assert(rows2.length === 1)
+ val result2 = rows2(0).getSeq[Row](1).map { case Row(label: Int, prob: Float) => label -> prob }
+ .toMap[Int, Float]
+ assert(result2.size === 10)
+ assert(result2(0) ~== 0.062156882)
+ assert(result2(1) ~== 0.05088547)
+ assert(result2(2) ~== 0.12434204)
+ assert(result2(3) ~== 0.31869823)
+ assert(result2(4) ~== 0.01584355)
+ assert(result2(5) ~== 0.0057667173)
+ assert(result2(6) ~== 0.10864779)
+ assert(result2(7) ~== 0.09346106)
+ assert(result2(8) ~== 0.13905199)
+ assert(result2(9) ~== 0.081146255)
}
- test("user-defined aggregators for evaluation") {
+ test("aggregations for ftvec.text") {
import hiveContext.implicits._
+ val testDf = Seq((1, "abc def hi jk l"), (1, "def jk")).toDF("key", "text")
+ val rows = testDf.groupBy($"key").tf("text").collect
+ assert(rows.length === 1)
+ val result = rows(0).getAs[Map[String, Float]](1)
+ assert(result.size === 2)
+ assert(result("def jk") ~== 0.5f)
+ assert(result("abc def hi jk l") ~== 0.5f)
+ }
- val df1 = Seq((1, 1.0f, 0.5f), (1, 0.3f, 0.5f), (1, 0.1f, 0.2f)).toDF("c0", "c1", "c2")
- val row1 = df1.groupBy($"c0").mae("c1", "c2").collect
- assert(row1(0).getDouble(1) ~== 0.26666666)
+ test("aggregations for tools.array") {
+ import hiveContext.implicits._
- val df2 = Seq((1, 0.3f, 0.8f), (1, 1.2f, 2.0f), (1, 0.2f, 0.3f)).toDF("c0", "c1", "c2")
- val row2 = df2.groupBy($"c0").mse("c1", "c2").collect
- assert(row2(0).getDouble(1) ~== 0.29999999)
+ val testDf = Seq((1, 1 :: 3 :: Nil), (1, 3 :: 5 :: Nil)).toDF("key", "ar")
+ val rows1 = testDf.groupBy($"key").array_avg("ar").collect
+ assert(rows1.length === 1)
+ val result1 = rows1(0).getSeq[Float](1)
+ assert(result1.length === 2)
+ assert(result1(0) ~== 2.0f)
+ assert(result1(1) ~== 4.0f)
+
+ val rows2 = testDf.groupBy($"key").array_sum("ar").collect
+ assert(rows2.length === 1)
+ val result2 = rows2(0).getSeq[Double](1)
+ assert(result2.length === 2)
+ assert(result2(0) ~== 4.0)
+ assert(result2(1) ~== 8.0)
+ }
- val df3 = Seq((1, 0.3f, 0.8f), (1, 1.2f, 2.0f), (1, 0.2f, 0.3f)).toDF("c0", "c1", "c2")
- val row3 = df3.groupBy($"c0").rmse("c1", "c2").collect
- assert(row3(0).getDouble(1) ~== 0.54772253)
+ test("aggregations for tools.bits") {
+ import hiveContext.implicits._
+ val testDf = Seq((1, 1), (1, 7)).toDF("key", "x")
+ val rows = testDf.groupBy($"key").bits_collect("x").collect
+ assert(rows.length === 1)
+ val result = rows(0).getSeq[Int](1)
+ assert(result === Seq(130))
+ }
- val df4 = Seq((1, Array(1, 2), Array(2, 3)), (1, Array(3, 8), Array(5, 4))).toDF
- .toDF("c0", "c1", "c2")
- val row4 = df4.groupBy($"c0").f1score("c1", "c2").collect
- assert(row4(0).getDouble(1) ~== 0.25)
+ test("aggregations for tools.list") {
+ import hiveContext.implicits._
+ val testDf = Seq((1, 3), (1, 1), (1, 2)).toDF("key", "x")
+ val rows = testDf.groupBy($"key").to_ordered_list("x").collect
+ assert(rows.length === 1)
+ val result = rows(0).getSeq[Int](1)
+ assert(result === Seq(1, 2, 3))
+ }
+
+ test("aggregations for tools.map") {
+ import hiveContext.implicits._
+ val testDf = Seq((1, 1, "a"), (1, 2, "b"), (1, 3, "c")).toDF("key", "k", "v")
+ val rows = testDf.groupBy($"key").to_map("k", "v").collect
+ assert(rows.length === 1)
+ val result = rows(0).getMap[Int, String](1)
+ assert(result === Map(1 -> "a", 2 -> "b", 3 -> "c"))
+ }
+
+ test("aggregations for tools.math") {
+ import hiveContext.implicits._
+ val testDf = Seq(
+ (1, Seq(1, 2, 3, 4), Seq(5, 6, 7, 8)),
+ (1, Seq(9, 10, 11, 12), Seq(13, 14, 15, 16))
+ ).toDF("key", "mtx1", "mtx2")
+ val rows = testDf.groupBy($"key").transpose_and_dot("mtx1", "mtx2").collect
+ assert(rows.length === 1)
+ val result = rows(0).getSeq[Int](1)
+ assert(result === Seq(
+ Seq(122.0, 132.0, 142.0, 152.0),
+ Seq(140.0, 152.0, 164.0, 176.0),
+ Seq(158.0, 172.0, 186.0, 200.0),
+ Seq(176.0, 192.0, 208.0, 224.0))
+ )
}
- test("user-defined aggregators for ftvec.trans") {
+ test("aggregations for ftvec.trans") {
import hiveContext.implicits._
val df0 = Seq((1, "cat", "mammal", 9), (1, "dog", "mammal", 10), (1, "human", "mammal", 10),
@@ -842,7 +1274,7 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
assert(result012.values.toSet === Set(9, 10, 11, 12, 13))
}
- test("user-defined aggregators for ftvec.selection") {
+ test("aggregations for ftvec.selection") {
import hiveContext.implicits._
// see also hivemall.ftvec.selection.SignalNoiseRatioUDAFTest
@@ -889,7 +1321,7 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
.foreach((actual, expected) => assert(actual ~== expected))
}
- test("user-defined aggregators for tools.matrix") {
+ test("aggregations for tools.matrix") {
import hiveContext.implicits._
// | 1 2 3 |T | 5 6 7 |
@@ -950,9 +1382,9 @@ final class HivemallOpsWithVectorSuite extends VectorQueryTest {
)
}
- test("train_logregr") {
+ test("train_logistic_regr") {
checkAnswer(
- mllibTrainDf.train_logregr($"features", $"label")
+ mllibTrainDf.train_logistic_regr($"features", $"label")
.groupBy("feature").agg("weight" -> "avg")
.select($"feature"),
Seq(0, 1, 2, 3, 4, 5, 6).map(v => Row(s"$v"))
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1680c42c/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/test/HivemallFeatureQueryTest.scala
----------------------------------------------------------------------
diff --git a/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/test/HivemallFeatureQueryTest.scala b/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/test/HivemallFeatureQueryTest.scala
index 3ca9bbf..bc656d1 100644
--- a/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/test/HivemallFeatureQueryTest.scala
+++ b/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/test/HivemallFeatureQueryTest.scala
@@ -36,17 +36,6 @@ abstract class HivemallFeatureQueryTest extends QueryTest with SQLTestUtils with
import hiveContext.implicits._
- /**
- * TODO: spark-2.0 does not support literals for some types (e.g., Seq[_] and Array[_]).
- * So, it provides that functionality here.
- * This helper function will be removed in future releases.
- */
- protected def lit2[T : TypeTag](v: T): Column = {
- val ScalaReflection.Schema(dataType, _) = ScalaReflection.schemaFor[T]
- val convert = CatalystTypeConverters.createToCatalystConverter(dataType)
- Column(Literal(convert(v), dataType))
- }
-
protected val DummyInputData = Seq((0, 0)).toDF("c0", "c1")
protected val IntList2Data =
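The `lit2` helper removed above worked around Spark 2.0's lack of literal support for types such as Seq[_] and Array[_]; Spark 2.2 ships `org.apache.spark.sql.functions.typedLit`, which the rewritten suite uses instead and which also sidesteps the type-erasure AnalysisException noted in the old ftvec.hash TODO. A minimal sketch of the replacement (the local SparkSession and column names here are illustrative, not from the patch):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{lit, typedLit}

object TypedLitSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("typedLit").getOrCreate()
    import spark.implicits._
    val df = Seq((0, 0)).toDF("c0", "c1")
    // lit() covers scalar literals only; typedLit() also handles Seq, Map, and
    // case classes via a TypeTag, which is what lit2 emulated with ScalaReflection.
    df.select(lit(1), typedLit(Seq("1:0.1", "2:0.5")), typedLit(Map(1 -> 0.5))).show()
    spark.stop()
  }
}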
[3/4] incubator-hivemall git commit: Fixed typos
Posted by my...@apache.org.
Fixed typos
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/c58eaeaf
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/c58eaeaf
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/c58eaeaf
Branch: refs/heads/master
Commit: c58eaeaf8d5be966967c1db169031d8fa4ee30c8
Parents: 1680c42
Author: Makoto Yui <my...@apache.org>
Authored: Mon Oct 16 20:53:53 2017 +0900
Committer: Makoto Yui <my...@apache.org>
Committed: Mon Oct 16 21:05:00 2017 +0900
----------------------------------------------------------------------
.../src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala | 4 ++--
.../test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/c58eaeaf/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala
----------------------------------------------------------------------
diff --git a/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala b/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala
index 45f7b4d..90a21d7 100644
--- a/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala
+++ b/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala
@@ -1843,10 +1843,10 @@ object HivemallOps {
* @see [[hivemall.ftvec.trans.AddFieldIndicesUDF]]
* @group ftvec.trans
*/
- def add_field_indicies(features: Column): Column = withExpr {
+ def add_field_indices(features: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.trans.AddFieldIndicesUDF",
- "add_field_indicies",
+ "add_field_indices",
features :: Nil
)
}
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/c58eaeaf/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala
----------------------------------------------------------------------
diff --git a/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala b/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala
index de2481c..5d0f075 100644
--- a/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala
+++ b/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala
@@ -345,7 +345,7 @@ class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
Row(Seq("190:140405:1", "111:1058718:1"))
)
checkAnswer(
- DummyInputData.select(add_field_indicies(typedLit(Seq("0.5", "0.1")))),
+ DummyInputData.select(add_field_indices(typedLit(Seq("0.5", "0.1")))),
Row(Seq("1:0.5", "2:0.1"))
)
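For reference, the renamed function prefixes each feature value with its 1-based field index, exactly as the updated assertion shows. A sketch of the call (assumes a Hive-enabled SparkSession named `spark` with the Hivemall spark-2.2 module on the classpath):

import spark.implicits._
import org.apache.spark.sql.functions.typedLit
import org.apache.spark.sql.hive.HivemallOps.add_field_indices

val df = Seq((0, 0)).toDF("c0", "c1")
// "0.5" and "0.1" come back as "1:0.5" and "2:0.1".
df.select(add_field_indices(typedLit(Seq("0.5", "0.1")))).show()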
[2/4] incubator-hivemall git commit: Close #122: [HIVEMALL-147][Spark] Support all Hivemall functions of v0.5-rc.1 in Spark Dataframe
Posted by my...@apache.org.
Close #122: [HIVEMALL-147][Spark] Support all Hivemall functions of v0.5-rc.1 in Spark Dataframe
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/1680c42c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/1680c42c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/1680c42c
Branch: refs/heads/master
Commit: 1680c42cf762a52183f76613fb02411f8f3a671a
Parents: fdf7021
Author: Takeshi Yamamuro <ya...@apache.org>
Authored: Mon Oct 16 20:52:56 2017 +0900
Committer: Makoto Yui <my...@apache.org>
Committed: Mon Oct 16 21:04:41 2017 +0900
----------------------------------------------------------------------
.../main/java/hivemall/evaluation/AUCUDAF.java | 4 +-
.../java/hivemall/evaluation/HitRateUDAF.java | 5 +-
.../main/java/hivemall/evaluation/MAPUDAF.java | 5 +-
.../main/java/hivemall/evaluation/MRRUDAF.java | 5 +-
.../main/java/hivemall/evaluation/NDCGUDAF.java | 5 +-
.../java/hivemall/evaluation/PrecisionUDAF.java | 5 +-
.../java/hivemall/evaluation/RecallUDAF.java | 5 +-
.../tools/array/ArrayAvgGenericUDAF.java | 2 -
.../hivemall/topicmodel/LDAPredictUDAF.java | 2 +-
.../hivemall/topicmodel/PLSAPredictUDAF.java | 2 +-
.../spark/sql/hive/HivemallGroupedDataset.scala | 474 +++++++++--
.../org/apache/spark/sql/hive/HivemallOps.scala | 824 +++++++++++++++++--
.../spark/sql/hive/HivemallOpsSuite.scala | 736 +++++++++++++----
.../hive/test/HivemallFeatureQueryTest.scala | 11 -
14 files changed, 1764 insertions(+), 321 deletions(-)
----------------------------------------------------------------------
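The net effect of this patch, as the suite above exercises, is that Hivemall UDFs and UDAFs become directly callable on DataFrames. A sketch of the two call styles (column functions via HivemallOps, grouped aggregates via HivemallGroupedDataset; assumes a Hive-enabled SparkSession named `spark` and the Hivemall jar on the classpath):

import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.hive.HivemallOps._
import spark.implicits._

// Column-level UDF, planned as a Hive generic UDF under the hood:
val d1 = Seq((0, 0)).toDF("c0", "c1")
d1.select(hamming_distance(lit(1), lit(3))).show()   // -> 1

// Grouped UDAF; groupBy(...) is wrapped into HivemallGroupedDataset by an
// implicit conversion that ships with the module:
val d2 = Seq((1, 0.1), (1, 0.2), (2, 0.1)).toDF("c0", "c1")
d2.groupBy($"c0").voted_avg("c1").show()             // -> 0.15 for key 1, 0.10 for key 2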
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1680c42c/core/src/main/java/hivemall/evaluation/AUCUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/evaluation/AUCUDAF.java b/core/src/main/java/hivemall/evaluation/AUCUDAF.java
index 508e36a..6dba174 100644
--- a/core/src/main/java/hivemall/evaluation/AUCUDAF.java
+++ b/core/src/main/java/hivemall/evaluation/AUCUDAF.java
@@ -110,7 +110,7 @@ public final class AUCUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (parameters.length == 2 || parameters.length == 3) : parameters.length;
+ assert (0 < parameters.length && parameters.length <= 3) : parameters.length;
super.init(mode, parameters);
// initialize input
@@ -439,7 +439,7 @@ public final class AUCUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (parameters.length == 2 || parameters.length == 3) : parameters.length;
+ assert (0 < parameters.length && parameters.length <= 3) : parameters.length;
super.init(mode, parameters);
// initialize input
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1680c42c/core/src/main/java/hivemall/evaluation/HitRateUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/evaluation/HitRateUDAF.java b/core/src/main/java/hivemall/evaluation/HitRateUDAF.java
index 6df6087..6a2d0da 100644
--- a/core/src/main/java/hivemall/evaluation/HitRateUDAF.java
+++ b/core/src/main/java/hivemall/evaluation/HitRateUDAF.java
@@ -71,9 +71,6 @@ import org.apache.hadoop.io.LongWritable;
+ " - Returns HitRate")
public final class HitRateUDAF extends AbstractGenericUDAFResolver {
- // prevent instantiation
- private HitRateUDAF() {}
-
@Override
public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) throws SemanticException {
if (typeInfo.length != 2 && typeInfo.length != 3) {
@@ -109,7 +106,7 @@ public final class HitRateUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (parameters.length == 2 || parameters.length == 3) : parameters.length;
+ assert (0 < parameters.length && parameters.length <= 3) : parameters.length;
super.init(mode, parameters);
// initialize input
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1680c42c/core/src/main/java/hivemall/evaluation/MAPUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/evaluation/MAPUDAF.java b/core/src/main/java/hivemall/evaluation/MAPUDAF.java
index 45e64cb..fef1f43 100644
--- a/core/src/main/java/hivemall/evaluation/MAPUDAF.java
+++ b/core/src/main/java/hivemall/evaluation/MAPUDAF.java
@@ -53,9 +53,6 @@ import org.apache.hadoop.io.LongWritable;
+ " - Returns MAP")
public final class MAPUDAF extends AbstractGenericUDAFResolver {
- // prevent instantiation
- private MAPUDAF() {}
-
@Override
public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) throws SemanticException {
if (typeInfo.length != 2 && typeInfo.length != 3) {
@@ -91,7 +88,7 @@ public final class MAPUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (parameters.length == 2 || parameters.length == 3) : parameters.length;
+ assert (0 < parameters.length && parameters.length <= 3) : parameters.length;
super.init(mode, parameters);
// initialize input
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1680c42c/core/src/main/java/hivemall/evaluation/MRRUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/evaluation/MRRUDAF.java b/core/src/main/java/hivemall/evaluation/MRRUDAF.java
index 98b8c3d..fcd9d51 100644
--- a/core/src/main/java/hivemall/evaluation/MRRUDAF.java
+++ b/core/src/main/java/hivemall/evaluation/MRRUDAF.java
@@ -53,9 +53,6 @@ import org.apache.hadoop.io.LongWritable;
+ " - Returns MRR")
public final class MRRUDAF extends AbstractGenericUDAFResolver {
- // prevent instantiation
- private MRRUDAF() {}
-
@Override
public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) throws SemanticException {
if (typeInfo.length != 2 && typeInfo.length != 3) {
@@ -91,7 +88,7 @@ public final class MRRUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (parameters.length == 2 || parameters.length == 3) : parameters.length;
+ assert (0 < parameters.length && parameters.length <= 3) : parameters.length;
super.init(mode, parameters);
// initialize input
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1680c42c/core/src/main/java/hivemall/evaluation/NDCGUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/evaluation/NDCGUDAF.java b/core/src/main/java/hivemall/evaluation/NDCGUDAF.java
index 4e4fde6..00aa16a 100644
--- a/core/src/main/java/hivemall/evaluation/NDCGUDAF.java
+++ b/core/src/main/java/hivemall/evaluation/NDCGUDAF.java
@@ -55,9 +55,6 @@ import org.apache.hadoop.io.LongWritable;
+ " - Returns nDCG")
public final class NDCGUDAF extends AbstractGenericUDAFResolver {
- // prevent instantiation
- private NDCGUDAF() {}
-
@Override
public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) throws SemanticException {
if (typeInfo.length != 2 && typeInfo.length != 3) {
@@ -94,7 +91,7 @@ public final class NDCGUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (parameters.length == 2 || parameters.length == 3) : parameters.length;
+ assert (0 < parameters.length && parameters.length <= 3) : parameters.length;
super.init(mode, parameters);
// initialize input
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1680c42c/core/src/main/java/hivemall/evaluation/PrecisionUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/evaluation/PrecisionUDAF.java b/core/src/main/java/hivemall/evaluation/PrecisionUDAF.java
index de8a876..2e09a71 100644
--- a/core/src/main/java/hivemall/evaluation/PrecisionUDAF.java
+++ b/core/src/main/java/hivemall/evaluation/PrecisionUDAF.java
@@ -53,9 +53,6 @@ import org.apache.hadoop.io.LongWritable;
+ " - Returns Precision")
public final class PrecisionUDAF extends AbstractGenericUDAFResolver {
- // prevent instantiation
- private PrecisionUDAF() {}
-
@Override
public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) throws SemanticException {
if (typeInfo.length != 2 && typeInfo.length != 3) {
@@ -91,7 +88,7 @@ public final class PrecisionUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (parameters.length == 2 || parameters.length == 3) : parameters.length;
+ assert (0 < parameters.length && parameters.length <= 3) : parameters.length;
super.init(mode, parameters);
// initialize input
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1680c42c/core/src/main/java/hivemall/evaluation/RecallUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/evaluation/RecallUDAF.java b/core/src/main/java/hivemall/evaluation/RecallUDAF.java
index 30b1712..d084c94 100644
--- a/core/src/main/java/hivemall/evaluation/RecallUDAF.java
+++ b/core/src/main/java/hivemall/evaluation/RecallUDAF.java
@@ -53,9 +53,6 @@ import org.apache.hadoop.io.LongWritable;
+ " - Returns Recall")
public final class RecallUDAF extends AbstractGenericUDAFResolver {
- // prevent instantiation
- private RecallUDAF() {}
-
@Override
public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) throws SemanticException {
if (typeInfo.length != 2 && typeInfo.length != 3) {
@@ -91,7 +88,7 @@ public final class RecallUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (parameters.length == 2 || parameters.length == 3) : parameters.length;
+ assert (0 < parameters.length && parameters.length <= 3) : parameters.length;
super.init(mode, parameters);
// initialize input
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1680c42c/core/src/main/java/hivemall/tools/array/ArrayAvgGenericUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/array/ArrayAvgGenericUDAF.java b/core/src/main/java/hivemall/tools/array/ArrayAvgGenericUDAF.java
index 6dbb7d5..090a50c 100644
--- a/core/src/main/java/hivemall/tools/array/ArrayAvgGenericUDAF.java
+++ b/core/src/main/java/hivemall/tools/array/ArrayAvgGenericUDAF.java
@@ -61,8 +61,6 @@ import org.apache.hadoop.io.IntWritable;
+ " in which each element is the mean of a set of numbers")
public final class ArrayAvgGenericUDAF extends AbstractGenericUDAFResolver {
- private ArrayAvgGenericUDAF() {}// prevent instantiation
-
@Override
public GenericUDAFEvaluator getEvaluator(TypeInfo[] typeInfo) throws SemanticException {
if (typeInfo.length != 1) {
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1680c42c/core/src/main/java/hivemall/topicmodel/LDAPredictUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/topicmodel/LDAPredictUDAF.java b/core/src/main/java/hivemall/topicmodel/LDAPredictUDAF.java
index 94d510a..68c802f 100644
--- a/core/src/main/java/hivemall/topicmodel/LDAPredictUDAF.java
+++ b/core/src/main/java/hivemall/topicmodel/LDAPredictUDAF.java
@@ -200,7 +200,7 @@ public final class LDAPredictUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (parameters.length == 4 || parameters.length == 5);
+ assert (parameters.length == 1 || parameters.length == 4 || parameters.length == 5);
super.init(mode, parameters);
// initialize input
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1680c42c/core/src/main/java/hivemall/topicmodel/PLSAPredictUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/topicmodel/PLSAPredictUDAF.java b/core/src/main/java/hivemall/topicmodel/PLSAPredictUDAF.java
index 7702945..6210359 100644
--- a/core/src/main/java/hivemall/topicmodel/PLSAPredictUDAF.java
+++ b/core/src/main/java/hivemall/topicmodel/PLSAPredictUDAF.java
@@ -202,7 +202,7 @@ public final class PLSAPredictUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (parameters.length == 4 || parameters.length == 5);
+ assert (parameters.length == 1 || parameters.length == 4 || parameters.length == 5);
super.init(mode, parameters);
// initialize input
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1680c42c/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala
----------------------------------------------------------------------
diff --git a/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala b/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala
index a012efd..00617b7 100644
--- a/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala
+++ b/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala
@@ -34,25 +34,61 @@ import org.apache.spark.sql.types._
/**
* Groups the [[DataFrame]] using the specified columns, so we can run aggregation on them.
*
+ * @groupname classifier
* @groupname ensemble
- * @groupname ftvec.trans
* @groupname evaluation
+ * @groupname topicmodel
+ * @groupname ftvec.selection
+ * @groupname ftvec.text
+ * @groupname ftvec.trans
+ * @groupname tools.array
+ * @groupname tools.bits
+ * @groupname tools.list
+ * @groupname tools.map
+ * @groupname tools.matrix
+ * @groupname tools.math
+ *
+ * A list of unsupported functions is as follows:
+ * * ftvec.conv
+ * - conv2dense
+ * - build_bins
*/
final class HivemallGroupedDataset(groupBy: RelationalGroupedDataset) {
/**
+ * @see hivemall.classifier.KPAPredictUDAF
+ * @group classifier
+ */
+ def kpa_predict(xh: String, xk: String, w0: String, w1: String, w2: String, w3: String)
+ : DataFrame = {
+ checkType(xh, DoubleType)
+ checkType(xk, DoubleType)
+ checkType(w0, FloatType)
+ checkType(w1, FloatType)
+ checkType(w2, FloatType)
+ checkType(w3, FloatType)
+ val udaf = HiveUDAFFunction(
+ "kpa_predict",
+ new HiveFunctionWrapper("hivemall.classifier.KPAPredictUDAF"),
+ Seq(xh, xk, w0, w1, w2, w3).map(df(_).expr),
+ isUDAFBridgeRequired = false)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
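// Sketch (not part of the patch): a call mirroring the "aggregations for classifiers"
// test in HivemallOpsSuite above. The checkType calls require Double columns for
// xh/xk and Float columns for w0..w3; the module's implicit conversion from
// RelationalGroupedDataset to HivemallGroupedDataset is assumed to be in scope.
import spark.implicits._
val kpaDf = Seq((1, 0.1, 0.1, 0.2f, 0.2f, 0.2f, 0.2f))
  .toDF("key", "xh", "xk", "w0", "w1", "w2", "w3")
kpaDf.groupBy($"key").kpa_predict("xh", "xk", "w0", "w1", "w2", "w3").show()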
+
+ /**
* @see hivemall.ensemble.bagging.VotedAvgUDAF
* @group ensemble
*/
def voted_avg(weight: String): DataFrame = {
- // checkType(weight, NumericType)
+ checkType(weight, DoubleType)
val udaf = HiveUDAFFunction(
"voted_avg",
new HiveFunctionWrapper("hivemall.ensemble.bagging.WeightVotedAvgUDAF"),
- Seq(weight).map(df.col(_).expr),
+ Seq(weight).map(df(_).expr),
isUDAFBridgeRequired = true)
.toAggregateExpression()
- toDF((Alias(udaf, udaf.prettyName)() :: Nil).toSeq)
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
}
/**
@@ -60,14 +96,14 @@ final class HivemallGroupedDataset(groupBy: RelationalGroupedDataset) {
* @group ensemble
*/
def weight_voted_avg(weight: String): DataFrame = {
- // checkType(weight, NumericType)
+ checkType(weight, DoubleType)
val udaf = HiveUDAFFunction(
"weight_voted_avg",
new HiveFunctionWrapper("hivemall.ensemble.bagging.WeightVotedAvgUDAF"),
- Seq(weight).map(df.col(_).expr),
+ Seq(weight).map(df(_).expr),
isUDAFBridgeRequired = true)
.toAggregateExpression()
- toDF((Alias(udaf, udaf.prettyName)() :: Nil).toSeq)
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
}
/**
@@ -75,15 +111,15 @@ final class HivemallGroupedDataset(groupBy: RelationalGroupedDataset) {
* @group ensemble
*/
def argmin_kld(weight: String, conv: String): DataFrame = {
- // checkType(weight, NumericType)
- // checkType(conv, NumericType)
+ checkType(weight, FloatType)
+ checkType(conv, FloatType)
val udaf = HiveUDAFFunction(
"argmin_kld",
new HiveFunctionWrapper("hivemall.ensemble.ArgminKLDistanceUDAF"),
- Seq(weight, conv).map(df.col(_).expr),
+ Seq(weight, conv).map(df(_).expr),
isUDAFBridgeRequired = true)
.toAggregateExpression()
- toDF((Alias(udaf, udaf.prettyName)() :: Nil).toSeq)
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
}
/**
@@ -91,15 +127,15 @@ final class HivemallGroupedDataset(groupBy: RelationalGroupedDataset) {
* @group ensemble
*/
def max_label(score: String, label: String): DataFrame = {
- // checkType(score, NumericType)
+ checkType(score, DoubleType)
checkType(label, StringType)
val udaf = HiveUDAFFunction(
"max_label",
new HiveFunctionWrapper("hivemall.ensemble.MaxValueLabelUDAF"),
- Seq(score, label).map(df.col(_).expr),
+ Seq(score, label).map(df(_).expr),
isUDAFBridgeRequired = true)
.toAggregateExpression()
- toDF((Alias(udaf, udaf.prettyName)() :: Nil).toSeq)
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
}
/**
@@ -107,134 +143,430 @@ final class HivemallGroupedDataset(groupBy: RelationalGroupedDataset) {
* @group ensemble
*/
def maxrow(score: String, label: String): DataFrame = {
- // checkType(score, NumericType)
+ checkType(score, DoubleType)
checkType(label, StringType)
val udaf = HiveUDAFFunction(
"maxrow",
new HiveFunctionWrapper("hivemall.ensemble.MaxRowUDAF"),
- Seq(score, label).map(df.col(_).expr),
+ Seq(score, label).map(df(_).expr),
isUDAFBridgeRequired = false)
.toAggregateExpression()
- toDF((Alias(udaf, udaf.prettyName)() :: Nil).toSeq)
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
}
/**
* @see hivemall.smile.tools.RandomForestEnsembleUDAF
* @group ensemble
*/
- def rf_ensemble(predict: String): DataFrame = {
- // checkType(predict, NumericType)
+ @scala.annotation.varargs
+ def rf_ensemble(yhat: String, others: String*): DataFrame = {
+ checkType(yhat, IntegerType)
val udaf = HiveUDAFFunction(
"rf_ensemble",
new HiveFunctionWrapper("hivemall.smile.tools.RandomForestEnsembleUDAF"),
- Seq(predict).map(df.col(_).expr),
+ (yhat +: others).map(df(_).expr),
isUDAFBridgeRequired = false)
.toAggregateExpression()
- toDF((Alias(udaf, udaf.prettyName)() :: Nil).toSeq)
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
}
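// Sketch (assumed schema): rf_ensemble now takes the predicted label column
// (an Int column) plus optional extras; grouping by a hypothetical "rowid"
// combines the per-tree votes for each instance.
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HivemallGroupedDataset._
def ensembleVotes(perTree: DataFrame): DataFrame =
  perTree.groupBy("rowid").rf_ensemble("predicted")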
/**
- * @see hivemall.tools.matrix.TransposeAndDotUDAF
+ * @see hivemall.evaluation.MeanAbsoluteErrorUDAF
+ * @group evaluation
*/
- def transpose_and_dot(X: String, Y: String): DataFrame = {
+ def mae(predict: String, target: String): DataFrame = {
+ checkType(predict, DoubleType)
+ checkType(target, DoubleType)
val udaf = HiveUDAFFunction(
- "transpose_and_dot",
- new HiveFunctionWrapper("hivemall.tools.matrix.TransposeAndDotUDAF"),
- Seq(X, Y).map(df.col(_).expr),
+ "mae",
+ new HiveFunctionWrapper("hivemall.evaluation.MeanAbsoluteErrorUDAF"),
+ Seq(predict, target).map(df(_).expr),
+ isUDAFBridgeRequired = true)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.evaluation.MeanSquareErrorUDAF
+ * @group evaluation
+ */
+ def mse(predict: String, target: String): DataFrame = {
+ checkType(predict, DoubleType)
+ checkType(target, DoubleType)
+ val udaf = HiveUDAFFunction(
+ "mse",
+ new HiveFunctionWrapper("hivemall.evaluation.MeanSquaredErrorUDAF"),
+ Seq(predict, target).map(df(_).expr),
+ isUDAFBridgeRequired = true)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.evaluation.RootMeanSquareErrorUDAF
+ * @group evaluation
+ */
+ def rmse(predict: String, target: String): DataFrame = {
+ checkType(predict, DoubleType)
+ checkType(target, DoubleType)
+ val udaf = HiveUDAFFunction(
+ "rmse",
+ new HiveFunctionWrapper("hivemall.evaluation.RootMeanSquaredErrorUDAF"),
+ Seq(predict, target).map(df(_).expr),
+ isUDAFBridgeRequired = true)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
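// Evaluation sketch (hypothetical Double columns "predict" and "target"; an
// empty groupBy yields a single global score, and mae/mse follow the same
// pattern as rmse):
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HivemallGroupedDataset._
def globalRmse(scored: DataFrame): DataFrame =
  scored.groupBy().rmse("predict", "target")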
+
+ /**
+ * @see hivemall.evaluation.R2UDAF
+ * @group evaluation
+ */
+ def r2(predict: String, target: String): DataFrame = {
+ checkType(predict, DoubleType)
+ checkType(target, DoubleType)
+ val udaf = HiveUDAFFunction(
+ "r2",
+ new HiveFunctionWrapper("hivemall.evaluation.R2UDAF"),
+ Seq(predict, target).map(df(_).expr),
+ isUDAFBridgeRequired = true)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.evaluation.LogarithmicLossUDAF
+ * @group evaluation
+ */
+ def logloss(predict: String, target: String): DataFrame = {
+ checkType(predict, DoubleType)
+ checkType(target, DoubleType)
+ val udaf = HiveUDAFFunction(
+ "logloss",
+ new HiveFunctionWrapper("hivemall.evaluation.LogarithmicLossUDAF"),
+ Seq(predict, target).map(df(_).expr),
+ isUDAFBridgeRequired = true)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.evaluation.F1ScoreUDAF
+ * @group evaluation
+ */
+ def f1score(predict: String, target: String): DataFrame = {
+ // checkType(target, ArrayType(IntegerType, false))
+ // checkType(predict, ArrayType(IntegerType, false))
+ val udaf = HiveUDAFFunction(
+ "f1score",
+ new HiveFunctionWrapper("hivemall.evaluation.F1ScoreUDAF"),
+ Seq(predict, target).map(df(_).expr),
+ isUDAFBridgeRequired = true)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.evaluation.NDCGUDAF
+ * @group evaluation
+ */
+ @scala.annotation.varargs
+ def ndcg(rankItems: String, correctItems: String, others: String*): DataFrame = {
+ val udaf = HiveUDAFFunction(
+ "ndcg",
+ new HiveFunctionWrapper("hivemall.evaluation.NDCGUDAF"),
+ (rankItems +: correctItems +: others).map(df(_).expr),
+ isUDAFBridgeRequired = false)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.evaluation.PrecisionUDAF
+ * @group evaluation
+ */
+ @scala.annotation.varargs
+ def precision_at(rankItems: String, correctItems: String, others: String*): DataFrame = {
+ val udaf = HiveUDAFFunction(
+ "precision_at",
+ new HiveFunctionWrapper("hivemall.evaluation.PrecisionUDAF"),
+ (rankItems +: correctItems +: others).map(df(_).expr),
+ isUDAFBridgeRequired = false)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.evaluation.RecallUDAF
+ * @group evaluation
+ */
+ @scala.annotation.varargs
+ def recall_at(rankItems: String, correctItems: String, others: String*): DataFrame = {
+ val udaf = HiveUDAFFunction(
+ "recall_at",
+ new HiveFunctionWrapper("hivemall.evaluation.RecallUDAF"),
+ (rankItems +: correctItems +: others).map(df(_).expr),
+ isUDAFBridgeRequired = false)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.evaluation.HitRateUDAF
+ * @group evaluation
+ */
+ @scala.annotation.varargs
+ def hitrate(rankItems: String, correctItems: String, others: String*): DataFrame = {
+ val udaf = HiveUDAFFunction(
+ "hitrate",
+ new HiveFunctionWrapper("hivemall.evaluation.HitRateUDAF"),
+ (rankItems +: correctItems +: others).map(df(_).expr),
+ isUDAFBridgeRequired = false)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.evaluation.MRRUDAF
+ * @group evaluation
+ */
+ @scala.annotation.varargs
+ def mrr(rankItems: String, correctItems: String, others: String*): DataFrame = {
+ val udaf = HiveUDAFFunction(
+ "mrr",
+ new HiveFunctionWrapper("hivemall.evaluation.MRRUDAF"),
+ (rankItems +: correctItems +: others).map(df(_).expr),
+ isUDAFBridgeRequired = false)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.evaluation.MAPUDAF
+ * @group evaluation
+ */
+ @scala.annotation.varargs
+ def average_precision(rankItems: String, correctItems: String, others: String*): DataFrame = {
+ val udaf = HiveUDAFFunction(
+ "average_precision",
+ new HiveFunctionWrapper("hivemall.evaluation.MAPUDAF"),
+ (rankItems +: correctItems +: others).map(df(_).expr),
+ isUDAFBridgeRequired = false)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.evaluation.AUCUDAF
+ * @group evaluation
+ */
+ @scala.annotation.varargs
+ def auc(args: String*): DataFrame = {
+ val udaf = HiveUDAFFunction(
+ "auc",
+ new HiveFunctionWrapper("hivemall.evaluation.AUCUDAF"),
+ args.map(df(_).expr),
isUDAFBridgeRequired = false)
.toAggregateExpression()
- toDF(Seq(Alias(udaf, udaf.prettyName)()))
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
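// Ranking-metric sketch (hypothetical schema: "rec_items" and "truth_items"
// are per-user array columns; precision_at, recall_at, hitrate, mrr and
// average_precision take the same argument shape):
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HivemallGroupedDataset._
def ndcgPerUser(ranked: DataFrame): DataFrame =
  ranked.groupBy("userid").ndcg("rec_items", "truth_items")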
+
+ /**
+ * @see hivemall.topicmodel.LDAPredictUDAF
+ * @group topicmodel
+ */
+ @scala.annotation.varargs
+ def lda_predict(word: String, value: String, label: String, lambda: String, others: String*)
+ : DataFrame = {
+ checkType(word, StringType)
+ checkType(value, DoubleType)
+ checkType(label, IntegerType)
+ checkType(lambda, DoubleType)
+ val udaf = HiveUDAFFunction(
+ "lda_predict",
+ new HiveFunctionWrapper("hivemall.topicmodel.LDAPredictUDAF"),
+ (word +: value +: label +: lambda +: others).map(df(_).expr),
+ isUDAFBridgeRequired = false)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.topicmodel.PLSAPredictUDAF
+ * @group topicmodel
+ */
+ @scala.annotation.varargs
+ def plsa_predict(word: String, value: String, label: String, prob: String, others: String*)
+ : DataFrame = {
+ checkType(word, StringType)
+ checkType(value, DoubleType)
+ checkType(label, IntegerType)
+ checkType(prob, DoubleType)
+ val udaf = HiveUDAFFunction(
+ "plsa_predict",
+ new HiveFunctionWrapper("hivemall.topicmodel.PLSAPredictUDAF"),
+ (word +: value +: label +: prob +: others).map(df(_).expr),
+ isUDAFBridgeRequired = false)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.ftvec.text.TermFrequencyUDAF
+ * @group ftvec.text
+ */
+ def tf(text: String): DataFrame = {
+ checkType(text, StringType)
+ val udaf = HiveUDAFFunction(
+ "tf",
+ new HiveFunctionWrapper("hivemall.ftvec.text.TermFrequencyUDAF"),
+ Seq(text).map(df(_).expr),
+ isUDAFBridgeRequired = true)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
}
/**
* @see hivemall.ftvec.trans.OnehotEncodingUDAF
* @group ftvec.trans
*/
- def onehot_encoding(cols: String*): DataFrame = {
+ @scala.annotation.varargs
+ def onehot_encoding(feature: String, others: String*): DataFrame = {
val udaf = HiveUDAFFunction(
"onehot_encoding",
new HiveFunctionWrapper("hivemall.ftvec.trans.OnehotEncodingUDAF"),
- cols.map(df.col(_).expr),
+ (feature +: others).map(df(_).expr),
isUDAFBridgeRequired = false)
.toAggregateExpression()
- toDF(Seq(Alias(udaf, udaf.prettyName)()))
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
}
/**
* @see hivemall.ftvec.selection.SignalNoiseRatioUDAF
+ * @group ftvec.selection
*/
- def snr(X: String, Y: String): DataFrame = {
+ def snr(feature: String, label: String): DataFrame = {
val udaf = HiveUDAFFunction(
"snr",
new HiveFunctionWrapper("hivemall.ftvec.selection.SignalNoiseRatioUDAF"),
- Seq(X, Y).map(df.col(_).expr),
+ Seq(feature, label).map(df(_).expr),
isUDAFBridgeRequired = false)
.toAggregateExpression()
- toDF(Seq(Alias(udaf, udaf.prettyName)()))
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
}
/**
- * @see hivemall.evaluation.MeanAbsoluteErrorUDAF
- * @group evaluation
+ * @see hivemall.tools.array.ArrayAvgGenericUDAF
+ * @group tools.array
*/
- def mae(predict: String, target: String): DataFrame = {
- checkType(predict, FloatType)
- checkType(target, FloatType)
+ def array_avg(ar: String): DataFrame = {
val udaf = HiveUDAFFunction(
- "mae",
- new HiveFunctionWrapper("hivemall.evaluation.MeanAbsoluteErrorUDAF"),
- Seq(predict, target).map(df.col(_).expr),
- isUDAFBridgeRequired = true)
+ "array_avg",
+ new HiveFunctionWrapper("hivemall.tools.array.ArrayAvgGenericUDAF"),
+ Seq(ar).map(df(_).expr),
+ isUDAFBridgeRequired = false)
.toAggregateExpression()
- toDF((Alias(udaf, udaf.prettyName)() :: Nil).toSeq)
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
}
/**
- * @see hivemall.evaluation.MeanSquareErrorUDAF
- * @group evaluation
+ * @see hivemall.tools.array.ArraySumUDAF
+ * @group tools.array
*/
- def mse(predict: String, target: String): DataFrame = {
- checkType(predict, FloatType)
- checkType(target, FloatType)
+ def array_sum(ar: String): DataFrame = {
val udaf = HiveUDAFFunction(
- "mse",
- new HiveFunctionWrapper("hivemall.evaluation.MeanSquaredErrorUDAF"),
- Seq(predict, target).map(df.col(_).expr),
+ "array_sum",
+ new HiveFunctionWrapper("hivemall.tools.array.ArraySumUDAF"),
+ Seq(ar).map(df(_).expr),
isUDAFBridgeRequired = true)
.toAggregateExpression()
- toDF((Alias(udaf, udaf.prettyName)() :: Nil).toSeq)
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
}
/**
- * @see hivemall.evaluation.RootMeanSquareErrorUDAF
- * @group evaluation
+ * @see hivemall.tools.bits.BitsCollectUDAF
+ * @group tools.bits
*/
- def rmse(predict: String, target: String): DataFrame = {
- checkType(predict, FloatType)
- checkType(target, FloatType)
+ def bits_collect(x: String): DataFrame = {
val udaf = HiveUDAFFunction(
- "rmse",
- new HiveFunctionWrapper("hivemall.evaluation.RootMeanSquaredErrorUDAF"),
- Seq(predict, target).map(df.col(_).expr),
- isUDAFBridgeRequired = true)
+ "bits_collect",
+ new HiveFunctionWrapper("hivemall.tools.bits.BitsCollectUDAF"),
+ Seq(x).map(df(_).expr),
+ isUDAFBridgeRequired = false)
.toAggregateExpression()
- toDF((Alias(udaf, udaf.prettyName)() :: Nil).toSeq)
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
}
/**
- * @see hivemall.evaluation.F1ScoreUDAF
- * @group evaluation
+ * @see hivemall.tools.list.UDAFToOrderedList
+ * @group tools.list
*/
- def f1score(predict: String, target: String): DataFrame = {
- // checkType(target, ArrayType(IntegerType))
- // checkType(predict, ArrayType(IntegerType))
+ @scala.annotation.varargs
+ def to_ordered_list(value: String, others: String*): DataFrame = {
val udaf = HiveUDAFFunction(
- "f1score",
- new HiveFunctionWrapper("hivemall.evaluation.F1ScoreUDAF"),
- Seq(predict, target).map(df.col(_).expr),
+ "to_ordered_list",
+ new HiveFunctionWrapper("hivemall.tools.list.UDAFToOrderedList"),
+ (value +: others).map(df(_).expr),
+ isUDAFBridgeRequired = false)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.tools.map.UDAFToMap
+ * @group tools.map
+ */
+ def to_map(key: String, value: String): DataFrame = {
+ val udaf = HiveUDAFFunction(
+ "to_map",
+ new HiveFunctionWrapper("hivemall.tools.map.UDAFToMap"),
+ Seq(key, value).map(df(_).expr),
+ isUDAFBridgeRequired = false)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.tools.map.UDAFToOrderedMap
+ * @group tools.map
+ */
+ @scala.annotation.varargs
+ def to_ordered_map(key: String, value: String, others: String*): DataFrame = {
+ val udaf = HiveUDAFFunction(
+ "to_ordered_map",
+ new HiveFunctionWrapper("hivemall.tools.map.UDAFToOrderedMap"),
+ (key +: value +: others).map(df(_).expr),
+ isUDAFBridgeRequired = false)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
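// Sketch (hypothetical columns): to_map collects key/value pairs per group
// into a map, and to_ordered_map is the key-sorted variant used below.
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HivemallGroupedDataset._
def collectAsMap(kv: DataFrame): DataFrame =
  kv.groupBy("gid").to_ordered_map("k", "v")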
+
+ /**
+ * @see hivemall.tools.matrix.TransposeAndDotUDAF
+ * @group tools.matrix
+ */
+ def transpose_and_dot(matrix0_row: String, matrix1_row: String): DataFrame = {
+ val udaf = HiveUDAFFunction(
+ "transpose_and_dot",
+ new HiveFunctionWrapper("hivemall.tools.matrix.TransposeAndDotUDAF"),
+ Seq(matrix0_row, matrix1_row).map(df(_).expr),
+ isUDAFBridgeRequired = false)
+ .toAggregateExpression()
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
+ }
+
+ /**
+ * @see hivemall.tools.math.L2NormUDAF
+ * @group tools.math
+ */
+ def l2_norm(xi: String): DataFrame = {
+ val udaf = HiveUDAFFunction(
+ "l2_norm",
+ new HiveFunctionWrapper("hivemall.tools.math.L2NormUDAF"),
+ Seq(xi).map(df(_).expr),
isUDAFBridgeRequired = true)
.toAggregateExpression()
- toDF((Alias(udaf, udaf.prettyName)() :: Nil).toSeq)
+ toDF(Alias(udaf, udaf.prettyName)() :: Nil)
}
/**
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1680c42c/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala
----------------------------------------------------------------------
diff --git a/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala b/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala
index 22d3153..45f7b4d 100644
--- a/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala
+++ b/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala
@@ -38,12 +38,18 @@ import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
+
/**
- * Hivemall wrapper and some utility functions for DataFrame.
+ * Hivemall wrapper and some utility functions for DataFrame. The functions below derive
+ * from `resources/ddl/define-all-as-permanent.hive`.
*
* @groupname regression
* @groupname classifier
* @groupname classifier.multiclass
+ * @groupname recommend
+ * @groupname topicmodel
+ * @groupname geospatial
+ * @groupname smile
* @groupname xgboost
* @groupname anomaly
* @groupname knn.similarity
@@ -52,28 +58,72 @@ import org.apache.spark.unsafe.types.UTF8String
* @groupname ftvec
* @groupname ftvec.amplify
* @groupname ftvec.hashing
+ * @groupname ftvec.pairing
* @groupname ftvec.scaling
+ * @groupname ftvec.selection
* @groupname ftvec.conv
* @groupname ftvec.trans
+ * @groupname ftvec.ranking
+ * @groupname tools
+ * @groupname tools.array
+ * @groupname tools.bits
+ * @groupname tools.compress
+ * @groupname tools.map
+ * @groupname tools.text
* @groupname misc
+ *
+ * The following functions are not supported:
+ * * smile
+ * - guess_attribute_types
+ * * mapred functions
+ * - taskid
+ * - jobid
+ * - rownum
+ * - distcache_gets
+ * - jobconf_gets
+ * * matrix factorization
+ * - mf_predict
+ * - train_mf_sgd
+ * - train_mf_adagrad
+ * - train_bprmf
+ * - bprmf_predict
+ * * Factorization Machine
+ * - fm_predict
+ * - train_fm
+ * - train_ffm
+ * - ffm_predict
*/
final class HivemallOps(df: DataFrame) extends Logging {
import internal.HivemallOpsImpl._
- private[this] lazy val _sparkSession = df.sparkSession
- private[this] lazy val _analyzer = _sparkSession.sessionState.analyzer
- private[this] lazy val _strategy = new UserProvidedPlanner(_sparkSession.sqlContext.conf)
+ private lazy val _sparkSession = df.sparkSession
+ private lazy val _strategy = new UserProvidedPlanner(_sparkSession.sqlContext.conf)
+
+ /**
+ * @see [[hivemall.regression.GeneralRegressorUDTF]]
+ * @group regression
+ */
+ @scala.annotation.varargs
+ def train_regressor(exprs: Column*): DataFrame = withTypedPlan {
+ planHiveGenericUDTF(
+ df,
+ "hivemall.regression.GeneralRegressorUDTF",
+ "train_regressor",
+ setMixServs(toHivemallFeatures(exprs)),
+ Seq("feature", "weight")
+ )
+ }
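// Usage sketch for the new generic trainer (column names are illustrative;
// the implicit DataFrame-to-HivemallOps conversion is assumed to be in
// scope). Averaging weights per feature is the usual way to fold the UDTF
// output into a model table.
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HivemallOps._
def fitRegressor(train: DataFrame): DataFrame =
  train.train_regressor(train("features"), train("label"))
    .groupBy("feature").agg("weight" -> "avg")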
/**
* @see [[hivemall.regression.AdaDeltaUDTF]]
* @group regression
*/
@scala.annotation.varargs
- def train_adadelta(exprs: Column*): DataFrame = withTypedPlan {
+ def train_adadelta_regr(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.regression.AdaDeltaUDTF",
- "train_adadelta",
+ "train_adadelta_regr",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
@@ -84,11 +134,11 @@ final class HivemallOps(df: DataFrame) extends Logging {
* @group regression
*/
@scala.annotation.varargs
- def train_adagrad(exprs: Column*): DataFrame = withTypedPlan {
+ def train_adagrad_regr(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.regression.AdaGradUDTF",
- "train_adagrad",
+ "train_adagrad_regr",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
@@ -144,11 +194,11 @@ final class HivemallOps(df: DataFrame) extends Logging {
* @group regression
*/
@scala.annotation.varargs
- def train_logregr(exprs: Column*): DataFrame = withTypedPlan {
+ def train_logistic_regr(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.regression.LogressUDTF",
- "train_logregr",
+ "train_logistic_regr",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
@@ -215,17 +265,17 @@ final class HivemallOps(df: DataFrame) extends Logging {
}
/**
- * @see [[hivemall.smile.regression.RandomForestRegressionUDTF]]
- * @group regression
+ * @see [[hivemall.classifier.GeneralClassifierUDTF]]
+ * @group classifier
*/
@scala.annotation.varargs
- def train_randomforest_regr(exprs: Column*): DataFrame = withTypedPlan {
+ def train_classifier(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
- "hivemall.smile.regression.RandomForestRegressionUDTF",
- "train_randomforest_regr",
+ "hivemall.classifier.GeneralClassifierUDTF",
+ "train_classifier",
setMixServs(toHivemallFeatures(exprs)),
- Seq("model_id", "model_type", "pred_model", "var_importance", "oob_errors", "oob_tests")
+ Seq("feature", "weight")
)
}
@@ -380,17 +430,17 @@ final class HivemallOps(df: DataFrame) extends Logging {
}
/**
- * @see [[hivemall.smile.classification.RandomForestClassifierUDTF]]
+ * @see [[hivemall.classifier.KernelExpansionPassiveAggressiveUDTF]]
* @group classifier
*/
@scala.annotation.varargs
- def train_randomforest_classifier(exprs: Column*): DataFrame = withTypedPlan {
+ def train_kpa(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
- "hivemall.smile.classification.RandomForestClassifierUDTF",
- "train_randomforest_classifier",
+ "hivemall.classifier.KernelExpansionPassiveAggressiveUDTF",
+ "train_kpa",
setMixServs(toHivemallFeatures(exprs)),
- Seq("model_id", "model_type", "pred_model", "var_importance", "oob_errors", "oob_tests")
+ Seq("h", "hk", "w0", "w1", "w2", "w3")
)
}
@@ -485,6 +535,21 @@ final class HivemallOps(df: DataFrame) extends Logging {
}
/**
+ * @see [[hivemall.classifier.multiclass.MulticlassAROWClassifierUDTF.AROWh]]
+ * @group classifier.multiclass
+ */
+ @scala.annotation.varargs
+ def train_multiclass_arowh(exprs: Column*): DataFrame = withTypedPlan {
+ planHiveGenericUDTF(
+ df,
+ "hivemall.classifier.multiclass.MulticlassAROWClassifierUDTF$AROWh",
+ "train_multiclass_arowh",
+ setMixServs(toHivemallFeatures(exprs)),
+ Seq("label", "feature", "weight", "conv")
+ )
+ }
+
+ /**
* @see [[hivemall.classifier.multiclass.MulticlassSoftConfidenceWeightedUDTF.SCW1]]
* @group classifier.multiclass
*/
@@ -515,6 +580,81 @@ final class HivemallOps(df: DataFrame) extends Logging {
}
/**
+ * @see [[hivemall.recommend.SlimUDTF]]
+ * @group recommend
+ */
+ @scala.annotation.varargs
+ def train_slim(exprs: Column*): DataFrame = withTypedPlan {
+ planHiveGenericUDTF(
+ df,
+ "hivemall.recommend.SlimUDTF",
+ "train_slim",
+ setMixServs(toHivemallFeatures(exprs)),
+ Seq("j", "nn", "w")
+ )
+ }
+
+ /**
+ * @see [[hivemall.topicmodel.LDAUDTF]]
+ * @group topicmodel
+ */
+ @scala.annotation.varargs
+ def train_lda(exprs: Column*): DataFrame = withTypedPlan {
+ planHiveGenericUDTF(
+ df,
+ "hivemall.topicmodel.LDAUDTF",
+ "train_lda",
+ setMixServs(toHivemallFeatures(exprs)),
+ Seq("topic", "word", "score")
+ )
+ }
+
+ /**
+ * @see [[hivemall.topicmodel.PLSAUDTF]]
+ * @group topicmodel
+ */
+ @scala.annotation.varargs
+ def train_plsa(exprs: Column*): DataFrame = withTypedPlan {
+ planHiveGenericUDTF(
+ df,
+ "hivemall.topicmodel.PLSAUDTF",
+ "train_plsa",
+ setMixServs(toHivemallFeatures(exprs)),
+ Seq("topic", "word", "score")
+ )
+ }
+
+ /**
+ * @see [[hivemall.smile.regression.RandomForestRegressionUDTF]]
+ * @group smile
+ */
+ @scala.annotation.varargs
+ def train_randomforest_regressor(exprs: Column*): DataFrame = withTypedPlan {
+ planHiveGenericUDTF(
+ df,
+ "hivemall.smile.regression.RandomForestRegressionUDTF",
+ "train_randomforest_regressor",
+ setMixServs(toHivemallFeatures(exprs)),
+ Seq("model_id", "model_type", "pred_model", "var_importance", "oob_errors", "oob_tests")
+ )
+ }
+
+ /**
+ * @see [[hivemall.smile.classification.RandomForestClassifierUDTF]]
+ * @group smile
+ */
+ @scala.annotation.varargs
+ def train_randomforest_classifier(exprs: Column*): DataFrame = withTypedPlan {
+ planHiveGenericUDTF(
+ df,
+ "hivemall.smile.classification.RandomForestClassifierUDTF",
+ "train_randomforest_classifier",
+ setMixServs(toHivemallFeatures(exprs)),
+ Seq("model_id", "model_type", "pred_model", "var_importance", "oob_errors", "oob_tests")
+ )
+ }
+
+ /**
* :: Experimental ::
* @see [[hivemall.xgboost.regression.XGBoostRegressionUDTF]]
* @group xgboost
@@ -600,6 +740,21 @@ final class HivemallOps(df: DataFrame) extends Logging {
}
/**
+ * @see [[hivemall.knn.similarity.DIMSUMMapperUDTF]]
+ * @group knn.similarity
+ */
+ @scala.annotation.varargs
+ def dimsum_mapper(exprs: Column*): DataFrame = withTypedPlan {
+ planHiveGenericUDTF(
+ df,
+ "hivemall.knn.similarity.DIMSUMMapperUDTF",
+ "dimsum_mapper",
+ exprs,
+ Seq("j", "k", "b_jk")
+ )
+ }
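// Sketch (assumed schema): dimsum_mapper expands each feature row and a map
// of pre-computed column norms into partial similarity terms (j, k, b_jk).
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HivemallOps._
def dimsumTerms(rows: DataFrame): DataFrame =
  rows.dimsum_mapper(rows("features"), rows("col_norms"))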
+
+ /**
* @see [[hivemall.knn.lsh.MinHashUDTF]]
* @group knn.lsh
*/
@@ -609,7 +764,7 @@ final class HivemallOps(df: DataFrame) extends Logging {
df,
"hivemall.knn.lsh.MinHashUDTF",
"minhash",
- setMixServs(toHivemallFeatures(exprs)),
+ exprs,
Seq("clusterid", "item")
)
}
@@ -624,7 +779,7 @@ final class HivemallOps(df: DataFrame) extends Logging {
df,
"hivemall.ftvec.amplify.AmplifierUDTF",
"amplify",
- setMixServs(toHivemallFeatures(exprs)),
+ exprs,
Seq("clusterid", "item")
)
}
@@ -668,7 +823,7 @@ final class HivemallOps(df: DataFrame) extends Logging {
df,
"hivemall.ftvec.conv.QuantifyColumnsUDTF",
"quantify",
- setMixServs(toHivemallFeatures(exprs)),
+ exprs,
(0 until exprs.size - 1).map(i => s"c$i")
)
}
@@ -683,7 +838,7 @@ final class HivemallOps(df: DataFrame) extends Logging {
df,
"hivemall.ftvec.trans.BinarizeLabelUDTF",
"binarize_label",
- setMixServs(toHivemallFeatures(exprs)),
+ exprs,
(0 until exprs.size - 1).map(i => s"c$i")
)
}
@@ -698,17 +853,62 @@ final class HivemallOps(df: DataFrame) extends Logging {
df,
"hivemall.ftvec.trans.QuantifiedFeaturesUDTF",
"quantified_features",
- setMixServs(toHivemallFeatures(exprs)),
+ exprs,
Seq("features")
)
}
/**
+ * @see [[hivemall.ftvec.ranking.BprSamplingUDTF]]
+ * @group ftvec.ranking
+ */
+ @scala.annotation.varargs
+ def bpr_sampling(exprs: Column*): DataFrame = withTypedPlan {
+ planHiveGenericUDTF(
+ df,
+ "hivemall.ftvec.ranking.BprSamplingUDTF",
+ "bpr_sampling",
+ exprs,
+ Seq("user", "pos_item", "neg_item")
+ )
+ }
+
+ /**
+ * @see [[hivemall.ftvec.ranking.ItemPairsSamplingUDTF]]
+ * @group ftvec.ranking
+ */
+ @scala.annotation.varargs
+ def item_pairs_sampling(exprs: Column*): DataFrame = withTypedPlan {
+ planHiveGenericUDTF(
+ df,
+ "hivemall.ftvec.ranking.ItemPairsSamplingUDTF",
+ "item_pairs_sampling",
+ exprs,
+ Seq("pos_item_id", "neg_item_id")
+ )
+ }
+
+ /**
+ * @see [[hivemall.ftvec.ranking.PopulateNotInUDTF]]
+ * @group ftvec.ranking
+ */
+ @scala.annotation.varargs
+ def populate_not_in(exprs: Column*): DataFrame = withTypedPlan {
+ planHiveGenericUDTF(
+ df,
+ "hivemall.ftvec.ranking.PopulateNotInUDTF",
+ "populate_not_in",
+ exprs,
+ Seq("item")
+ )
+ }
+
+ /**
* Splits Seq[String] into pieces.
* @group ftvec
*/
- def explode_array(expr: Column): DataFrame = {
- df.explode(expr) { case Row(v: Seq[_]) =>
+ def explode_array(features: Column): DataFrame = {
+ df.explode(features) { case Row(v: Seq[_]) =>
// Type erasure removes the component type in Seq
v.map(s => HivemallFeature(s.asInstanceOf[String]))
}
@@ -718,7 +918,7 @@ final class HivemallOps(df: DataFrame) extends Logging {
* Splits [[Vector]] into pieces.
* @group ftvec
*/
- def explode_vector(expr: Column): DataFrame = {
+ def explode_vector(features: Column): DataFrame = {
val elementSchema = StructType(
StructField("feature", StringType) :: StructField("weight", DoubleType) :: Nil)
val explodeFunc: Row => TraversableOnce[InternalRow] = (row: Row) => {
@@ -737,7 +937,7 @@ final class HivemallOps(df: DataFrame) extends Logging {
}
withTypedPlan {
Generate(
- UserDefinedGenerator(elementSchema, explodeFunc, expr.expr :: Nil),
+ UserDefinedGenerator(elementSchema, explodeFunc, features.expr :: Nil),
join = true, outer = false, None,
generatorOutput = Nil,
df.logicalPlan)
@@ -745,6 +945,20 @@ final class HivemallOps(df: DataFrame) extends Logging {
}
/**
+ * @see [[hivemall.tools.GenerateSeriesUDTF]]
+ * @group tools
+ */
+ def generate_series(start: Column, end: Column): DataFrame = withTypedPlan {
+ planHiveGenericUDTF(
+ df,
+ "hivemall.tools.GenerateSeriesUDTF",
+ "generate_series",
+ start :: end :: Nil,
+ Seq("generate_series")
+ )
+ }
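// Sketch: analogous to PostgreSQL's generate_series, one row is emitted per
// value between the two bounds (the literals below are illustrative).
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.hive.HivemallOps._
def tenRows(df: DataFrame): DataFrame = df.generate_series(lit(0), lit(9))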
+
+ /**
* Returns `top-k` records for each `group`.
* @group misc
*/
@@ -877,7 +1091,7 @@ final class HivemallOps(df: DataFrame) extends Logging {
df,
"hivemall.dataset.LogisticRegressionDataGeneratorUDTFWrapper",
"lr_datagen",
- setMixServs(toHivemallFeatures(exprs)),
+ exprs,
Seq("label", "features")
)
}
@@ -900,7 +1114,7 @@ final class HivemallOps(df: DataFrame) extends Logging {
* in all possible spark workers.
*/
@Experimental
- private[this] def setMixServs(exprs: Seq[Column]): Seq[Column] = {
+ private def setMixServs(exprs: Seq[Column]): Seq[Column] = {
val mixes = System.getenv("HIVEMALL_MIX_SERVERS")
if (mixes != null && !mixes.isEmpty()) {
val groupId = df.sqlContext.sparkContext.applicationId + "-" + UUID.randomUUID
@@ -919,7 +1133,7 @@ final class HivemallOps(df: DataFrame) extends Logging {
/**
* If the input is a [[Vector]], transform it into Hivemall features.
*/
- @inline private[this] def toHivemallFeatures(exprs: Seq[Column]): Seq[Column] = {
+ @inline private def toHivemallFeatures(exprs: Seq[Column]): Seq[Column] = {
df.select(exprs: _*).queryExecution.analyzed.schema.zip(exprs).map {
case (StructField(_, _: VectorUDT, _, _), c) => HivemallUtils.to_hivemall_features(c)
case (_, c) => c
@@ -929,13 +1143,13 @@ final class HivemallOps(df: DataFrame) extends Logging {
/**
* A convenient function to wrap a logical plan and produce a DataFrame.
*/
- @inline private[this] def withTypedPlan(logicalPlan: => LogicalPlan): DataFrame = {
+ @inline private def withTypedPlan(logicalPlan: => LogicalPlan): DataFrame = {
val queryExecution = _sparkSession.sessionState.executePlan(logicalPlan)
val outputSchema = queryExecution.sparkPlan.schema
new Dataset[Row](df.sparkSession, queryExecution, RowEncoder(outputSchema))
}
- @inline private[this] def withTypedPlanInCustomStrategy(logicalPlan: => LogicalPlan)
+ @inline private def withTypedPlanInCustomStrategy(logicalPlan: => LogicalPlan)
: DataFrame = {
// Inject custom strategies
if (!_sparkSession.experimental.extraStrategies.contains(_strategy)) {
@@ -967,6 +1181,118 @@ object HivemallOps {
}
/**
+ * @see [[hivemall.geospatial.TileUDF]]
+ * @group geospatial
+ */
+ def tile(lat: Column, lon: Column, zoom: Column): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.geospatial.TileUDF",
+ "tile",
+ lat :: lon :: zoom :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.geospatial.MapURLUDF]]
+ * @group geospatial
+ */
+ @scala.annotation.varargs
+ def map_url(exprs: Column*): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.geospatial.MapURLUDF",
+ "map_url",
+ exprs
+ )
+ }
+
+ /**
+ * @see [[hivemall.geospatial.Lat2TileYUDF]]
+ * @group geospatial
+ */
+ def lat2tiley(lat: Column, zoom: Column): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.geospatial.Lat2TileYUDF",
+ "lat2tiley",
+ lat :: zoom :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.geospatial.Lon2TileXUDF]]
+ * @group geospatial
+ */
+ def lon2tilex(lon: Column, zoom: Column): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.geospatial.Lon2TileXUDF",
+ "lon2tilex",
+ lon :: zoom :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.geospatial.TileX2LonUDF]]
+ * @group geospatial
+ */
+ def tilex2lon(x: Column, zoom: Column): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.geospatial.TileX2LonUDF",
+ "tilex2lon",
+ x :: zoom :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.geospatial.TileY2LatUDF]]
+ * @group geospatial
+ */
+ def tiley2lat(y: Column, zoom: Column): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.geospatial.TileY2LatUDF",
+ "tiley2lat",
+ y :: zoom :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.geospatial.HaversineDistanceUDF]]
+ * @group geospatial
+ */
+ @scala.annotation.varargs
+ def haversine_distance(exprs: Column*): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.geospatial.HaversineDistanceUDF",
+ "haversine_distance",
+ exprs
+ )
+ }
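// Geospatial sketch (hypothetical "lat"/"lon" columns, fixed zoom level):
// tile() combines the lat2tiley and lon2tilex coordinates into a single
// tile number.
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.hive.HivemallOps._
def toTileIds(points: DataFrame): DataFrame =
  points.select(tile(points("lat"), points("lon"), lit(12)))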
+
+ /**
+ * @see [[hivemall.smile.tools.TreePredictUDF]]
+ * @group smile
+ */
+ @scala.annotation.varargs
+ def tree_predict(exprs: Column*): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.smile.tools.TreePredictUDF",
+ "tree_predict",
+ exprs
+ )
+ }
+
+ /**
+ * @see [[hivemall.smile.tools.TreeExportUDF]]
+ * @group smile
+ */
+ @scala.annotation.varargs
+ def tree_export(exprs: Column*): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.smile.tools.TreeExportUDF",
+ "tree_export",
+ exprs
+ )
+ }
+
+ /**
* @see [[hivemall.anomaly.ChangeFinderUDF]]
* @group anomaly
*/
@@ -997,10 +1323,10 @@ object HivemallOps {
* @group knn.similarity
*/
@scala.annotation.varargs
- def cosine_sim(exprs: Column*): Column = withExpr {
+ def cosine_similarity(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.knn.similarity.CosineSimilarityUDF",
- "cosine_sim",
+ "cosine_similarity",
exprs
)
}
@@ -1010,10 +1336,10 @@ object HivemallOps {
* @group knn.similarity
*/
@scala.annotation.varargs
- def jaccard(exprs: Column*): Column = withExpr {
+ def jaccard_similarity(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.knn.similarity.JaccardIndexUDF",
- "jaccard",
+ "jaccard_similarity",
exprs
)
}
@@ -1137,6 +1463,19 @@ object HivemallOps {
}
/**
+ * @see [[hivemall.knn.distance.JaccardDistanceUDF]]
+ * @group knn.distance
+ */
+ @scala.annotation.varargs
+ def jaccard_distance(exprs: Column*): Column = withExpr {
+ planHiveUDF(
+ "hivemall.knn.distance.JaccardDistanceUDF",
+ "jaccard_distance",
+ exprs
+ )
+ }
+
+ /**
* @see [[hivemall.knn.distance.ManhattanDistanceUDF]]
* @group knn.distance
*/
@@ -1154,7 +1493,7 @@ object HivemallOps {
* @group knn.distance
*/
@scala.annotation.varargs
- def minkowski_distance (exprs: Column*): Column = withExpr {
+ def minkowski_distance(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.knn.distance.MinkowskiDistanceUDF",
"minkowski_distance",
@@ -1237,11 +1576,11 @@ object HivemallOps {
* @see [[hivemall.ftvec.AddFeatureIndexUDFWrapper]]
* @group ftvec
*/
- def add_feature_index(expr: Column): Column = withExpr {
+ def add_feature_index(features: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.AddFeatureIndexUDFWrapper",
"add_feature_index",
- expr :: Nil
+ features :: Nil
)
}
@@ -1273,11 +1612,12 @@ object HivemallOps {
* @see [[hivemall.ftvec.hashing.Sha1UDF]]
* @group ftvec.hashing
*/
- def sha1(expr: Column): Column = withExpr {
+ @scala.annotation.varargs
+ def sha1(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.ftvec.hashing.Sha1UDF",
"sha1",
- expr :: Nil
+ exprs
)
}
@@ -1287,7 +1627,6 @@ object HivemallOps {
*/
@scala.annotation.varargs
def array_hash_values(exprs: Column*): Column = withExpr {
- // TODO: Need a wrapper class because of using unsupported types
planHiveUDF(
"hivemall.ftvec.hashing.ArrayHashValuesUDF",
"array_hash_values",
@@ -1301,7 +1640,6 @@ object HivemallOps {
*/
@scala.annotation.varargs
def prefixed_hash_values(exprs: Column*): Column = withExpr {
- // TODO: Need a wrapper class because of using unsupported types
planHiveUDF(
"hivemall.ftvec.hashing.ArrayPrefixedHashValuesUDF",
"prefixed_hash_values",
@@ -1310,6 +1648,45 @@ object HivemallOps {
}
/**
+ * @see [[hivemall.ftvec.hashing.FeatureHashingUDF]]
+ * @group ftvec.hashing
+ */
+ @scala.annotation.varargs
+ def feature_hashing(exprs: Column*): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.ftvec.hashing.FeatureHashingUDF",
+ "feature_hashing",
+ exprs
+ )
+ }
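// Sketch (column name hypothetical): feature_hashing rewrites string
// features into hashed integer indices to bound the feature space.
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HivemallOps._
def hashFeatures(df: DataFrame): DataFrame =
  df.select(feature_hashing(df("features")))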
+
+ /**
+ * @see [[hivemall.ftvec.pairing.PolynomialFeaturesUDF]]
+ * @group ftvec.pairing
+ */
+ @scala.annotation.varargs
+ def polynomial_features(exprs: Column*): Column = withExpr {
+ planHiveUDF(
+ "hivemall.ftvec.pairing.PolynomialFeaturesUDF",
+ "polynomial_features",
+ exprs
+ )
+ }
+
+ /**
+ * @see [[hivemall.ftvec.pairing.PoweredFeaturesUDF]]
+ * @group ftvec.pairing
+ */
+ @scala.annotation.varargs
+ def powered_features(exprs: Column*): Column = withExpr {
+ planHiveUDF(
+ "hivemall.ftvec.pairing.PoweredFeaturesUDF",
+ "powered_features",
+ exprs
+ )
+ }
+
+ /**
* @see [[hivemall.ftvec.scaling.RescaleUDF]]
* @group ftvec.scaling
*/
@@ -1338,7 +1715,7 @@ object HivemallOps {
* @see [[hivemall.ftvec.scaling.L2NormalizationUDFWrapper]]
* @group ftvec.scaling
*/
- def normalize(expr: Column): Column = withExpr {
+ def l2_normalize(expr: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.scaling.L2NormalizationUDFWrapper",
"normalize",
@@ -1364,8 +1741,7 @@ object HivemallOps {
*/
@scala.annotation.varargs
def to_dense_features(exprs: Column*): Column = withExpr {
- // TODO: Need a wrapper class because of using unsupported types
- planHiveGenericUDF(
+ planHiveUDF(
"hivemall.ftvec.conv.ToDenseFeaturesUDF",
"to_dense_features",
exprs
@@ -1378,8 +1754,7 @@ object HivemallOps {
*/
@scala.annotation.varargs
def to_sparse_features(exprs: Column*): Column = withExpr {
- // TODO: Need a wrapper class because of using unsupported types
- planHiveGenericUDF(
+ planHiveUDF(
"hivemall.ftvec.conv.ToSparseFeaturesUDF",
"to_sparse_features",
exprs
@@ -1387,6 +1762,19 @@ object HivemallOps {
}
/**
+ * @see [[hivemall.ftvec.binning.FeatureBinningUDF]]
+ * @group ftvec.conv
+ */
+ @scala.annotation.varargs
+ def feature_binning(exprs: Column*): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.ftvec.binning.FeatureBinningUDF",
+ "feature_binning",
+ exprs
+ )
+ }
+
+ /**
* @see [[hivemall.ftvec.trans.VectorizeFeaturesUDF]]
* @group ftvec.trans
*/
@@ -1413,6 +1801,19 @@ object HivemallOps {
}
/**
+ * @see [[hivemall.ftvec.trans.FFMFeaturesUDF]]
+ * @group ftvec.trans
+ */
+ @scala.annotation.varargs
+ def ffm_features(exprs: Column*): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.ftvec.trans.FFMFeaturesUDF",
+ "ffm_features",
+ exprs
+ )
+ }
+
+ /**
* @see [[hivemall.ftvec.trans.IndexedFeatures]]
* @group ftvec.trans
*/
@@ -1439,15 +1840,136 @@ object HivemallOps {
}
/**
- * @see [[hivemall.smile.tools.TreePredictUDF]]
- * @group misc
+ * @see [[hivemall.ftvec.trans.AddFieldIndicesUDF]]
+ * @group ftvec.trans
+ */
+ def add_field_indicies(features: Column): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.ftvec.trans.AddFieldIndicesUDF",
+ "add_field_indicies",
+ features :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.ConvertLabelUDF]]
+ * @group tools
+ */
+ def convert_label(label: Column): Column = withExpr {
+ planHiveUDF(
+ "hivemall.tools.ConvertLabelUDF",
+ "convert_label",
+ label :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.RankSequenceUDF]]
+ * @group tools
+ */
+ def x_rank(key: Column): Column = withExpr {
+ planHiveUDF(
+ "hivemall.tools.RankSequenceUDF",
+ "x_rank",
+ key :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.array.AllocFloatArrayUDF]]
+ * @group tools.array
+ */
+ def float_array(nDims: Column): Column = withExpr {
+ planHiveUDF(
+ "hivemall.tools.array.AllocFloatArrayUDF",
+ "float_array",
+ nDims :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.array.ArrayRemoveUDF]]
+ * @group tools.array
+ */
+ def array_remove(original: Column, target: Column): Column = withExpr {
+ planHiveUDF(
+ "hivemall.tools.array.ArrayRemoveUDF",
+ "array_remove",
+ original :: target :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.array.SortAndUniqArrayUDF]]
+ * @group tools.array
+ */
+ def sort_and_uniq_array(ar: Column): Column = withExpr {
+ planHiveUDF(
+ "hivemall.tools.array.SortAndUniqArrayUDF",
+ "sort_and_uniq_array",
+ ar :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.array.SubarrayEndWithUDF]]
+ * @group tools.array
+ */
+ def subarray_endwith(original: Column, key: Column): Column = withExpr {
+ planHiveUDF(
+ "hivemall.tools.array.SubarrayEndWithUDF",
+ "subarray_endwith",
+ original :: key :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.array.ArrayConcatUDF]]
+ * @group tools.array
*/
@scala.annotation.varargs
- def tree_predict(exprs: Column*): Column = withExpr {
+ def array_concat(arrays: Column*): Column = withExpr {
planHiveGenericUDF(
- "hivemall.smile.tools.TreePredictUDF",
- "tree_predict",
- exprs
+ "hivemall.tools.array.ArrayConcatUDF",
+ "array_concat",
+ arrays
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.array.SubarrayUDF]]
+ * @group tools.array
+ */
+ def subarray(original: Column, fromIndex: Column, toIndex: Column): Column = withExpr {
+ planHiveUDF(
+ "hivemall.tools.array.SubarrayUDF",
+ "subarray",
+ original :: fromIndex :: toIndex :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.array.ToStringArrayUDF]]
+ * @group tools.array
+ */
+ def to_string_array(ar: Column): Column = withExpr {
+ planHiveUDF(
+ "hivemall.tools.array.ToStringArrayUDF",
+ "to_string_array",
+ ar :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.array.ArrayIntersectUDF]]
+ * @group tools.array
+ */
+ @scala.annotation.varargs
+ def array_intersect(arrays: Column*): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.tools.array.ArrayIntersectUDF",
+ "array_intersect",
+ arrays
)
}
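// Array-tool sketch (all literals illustrative): subarray slices the given
// array between the two indices and array_concat concatenates its arguments.
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{array, lit}
import org.apache.spark.sql.hive.HivemallOps._
def arrayExamples(df: DataFrame): DataFrame =
  df.select(
    subarray(array(lit(1), lit(2), lit(3)), lit(0), lit(2)),
    array_concat(array(lit(1)), array(lit(2))))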
@@ -1464,6 +1986,194 @@ object HivemallOps {
}
/**
+ * @see [[hivemall.tools.bits.ToBitsUDF]]
+ * @group tools.bits
+ */
+ def to_bits(indexes: Column): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.tools.bits.ToBitsUDF",
+ "to_bits",
+ indexes :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.bits.UnBitsUDF]]
+ * @group tools.bits
+ */
+ def unbits(bitset: Column): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.tools.bits.UnBitsUDF",
+ "unbits",
+ bitset :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.bits.BitsORUDF]]
+ * @group tools.bits
+ */
+ @scala.annotation.varargs
+ def bits_or(bits: Column*): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.tools.bits.BitsORUDF",
+ "bits_or",
+ bits
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.compress.InflateUDF]]
+ * @group tools.compress
+ */
+ @scala.annotation.varargs
+ def inflate(exprs: Column*): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.tools.compress.InflateUDF",
+ "inflate",
+ exprs
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.compress.DeflateUDF]]
+ * @group tools.compress
+ */
+ @scala.annotation.varargs
+ def deflate(exprs: Column*): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.tools.compress.DeflateUDF",
+ "deflate",
+ exprs
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.map.MapGetSumUDF]]
+ * @group tools.map
+ */
+ @scala.annotation.varargs
+ def map_get_sum(exprs: Column*): Column = withExpr {
+ planHiveUDF(
+ "hivemall.tools.map.MapGetSumUDF",
+ "map_get_sum",
+ exprs
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.map.MapTailNUDF]]
+ * @group tools.map
+ */
+ @scala.annotation.varargs
+ def map_tail_n(exprs: Column*): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.tools.map.MapTailNUDF",
+ "map_tail_n",
+ exprs
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.text.TokenizeUDF]]
+ * @group tools.text
+ */
+ @scala.annotation.varargs
+ def tokenize(exprs: Column*): Column = withExpr {
+ planHiveUDF(
+ "hivemall.tools.text.TokenizeUDF",
+ "tokenize",
+ exprs
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.text.StopwordUDF]]
+ * @group tools.text
+ */
+ def is_stopword(word: Column): Column = withExpr {
+ planHiveUDF(
+ "hivemall.tools.text.StopwordUDF",
+ "is_stopword",
+ word :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.text.SingularizeUDF]]
+ * @group tools.text
+ */
+ def singularize(word: Column): Column = withExpr {
+ planHiveUDF(
+ "hivemall.tools.text.SingularizeUDF",
+ "singularize",
+ word :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.text.SplitWordsUDF]]
+ * @group tools.text
+ */
+ @scala.annotation.varargs
+ def split_words(exprs: Column*): Column = withExpr {
+ planHiveUDF(
+ "hivemall.tools.text.SplitWordsUDF",
+ "split_words",
+ exprs
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.text.NormalizeUnicodeUDF]]
+ * @group tools.text
+ */
+ @scala.annotation.varargs
+ def normalize_unicode(exprs: Column*): Column = withExpr {
+ planHiveUDF(
+ "hivemall.tools.text.NormalizeUnicodeUDF",
+ "normalize_unicode",
+ exprs
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.text.Base91UDF]]
+ * @group tools.text
+ */
+ def base91(bin: Column): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.tools.text.Base91UDF",
+ "base91",
+ bin :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.text.Unbase91UDF]]
+ * @group tools.text
+ */
+ def unbase91(base91String: Column): Column = withExpr {
+ planHiveGenericUDF(
+ "hivemall.tools.text.Unbase91UDF",
+ "unbase91",
+ base91String :: Nil
+ )
+ }
+
+ /**
+ * @see [[hivemall.tools.text.WordNgramsUDF]]
+ * @group tools.text
+ */
+ def word_ngrams(words: Column, minSize: Column, maxSize: Column): Column = withExpr {
+ planHiveUDF(
+ "hivemall.tools.text.WordNgramsUDF",
+ "word_ngrams",
+ words :: minSize :: maxSize :: Nil
+ )
+ }
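// Text-tool sketch (literals illustrative): tokenize splits raw text into a
// word array and word_ngrams expands it into n-grams of the given size range.
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.hive.HivemallOps._
def bigrams(docs: DataFrame): DataFrame =
  docs.select(word_ngrams(tokenize(docs("text")), lit(1), lit(2)))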
+
+ /**
* @see [[hivemall.tools.math.SigmoidGenericUDF]]
* @group misc
*/
[4/4] incubator-hivemall git commit: Fixed assertion conditions
Posted by my...@apache.org.
Fixed the assertion conditions in the evaluation UDAFs: init() now checks for one to three parameters (one or two for AUC).
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/e8abae25
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/e8abae25
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/e8abae25
Branch: refs/heads/master
Commit: e8abae257f45bab419654ebe2d5387755dc7f105
Parents: c58eaea
Author: Makoto Yui <my...@apache.org>
Authored: Mon Oct 16 20:54:53 2017 +0900
Committer: Makoto Yui <my...@apache.org>
Committed: Mon Oct 16 21:05:00 2017 +0900
----------------------------------------------------------------------
core/src/main/java/hivemall/evaluation/AUCUDAF.java | 2 +-
core/src/main/java/hivemall/evaluation/HitRateUDAF.java | 2 +-
core/src/main/java/hivemall/evaluation/MAPUDAF.java | 2 +-
core/src/main/java/hivemall/evaluation/MRRUDAF.java | 2 +-
core/src/main/java/hivemall/evaluation/NDCGUDAF.java | 2 +-
core/src/main/java/hivemall/evaluation/PrecisionUDAF.java | 2 +-
core/src/main/java/hivemall/evaluation/RecallUDAF.java | 2 +-
7 files changed, 7 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e8abae25/core/src/main/java/hivemall/evaluation/AUCUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/evaluation/AUCUDAF.java b/core/src/main/java/hivemall/evaluation/AUCUDAF.java
index 6dba174..3c7faa7 100644
--- a/core/src/main/java/hivemall/evaluation/AUCUDAF.java
+++ b/core/src/main/java/hivemall/evaluation/AUCUDAF.java
@@ -110,7 +110,7 @@ public final class AUCUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (0 < parameters.length && parameters.length <= 3) : parameters.length;
+ assert (parameters.length == 1 || parameters.length == 2) : parameters.length;
super.init(mode, parameters);
// initialize input
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e8abae25/core/src/main/java/hivemall/evaluation/HitRateUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/evaluation/HitRateUDAF.java b/core/src/main/java/hivemall/evaluation/HitRateUDAF.java
index 6a2d0da..5464654 100644
--- a/core/src/main/java/hivemall/evaluation/HitRateUDAF.java
+++ b/core/src/main/java/hivemall/evaluation/HitRateUDAF.java
@@ -106,7 +106,7 @@ public final class HitRateUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (0 < parameters.length && parameters.length <= 3) : parameters.length;
+ assert (parameters.length >= 1 && parameters.length <= 3) : parameters.length;
super.init(mode, parameters);
// initialize input
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e8abae25/core/src/main/java/hivemall/evaluation/MAPUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/evaluation/MAPUDAF.java b/core/src/main/java/hivemall/evaluation/MAPUDAF.java
index fef1f43..61341a3 100644
--- a/core/src/main/java/hivemall/evaluation/MAPUDAF.java
+++ b/core/src/main/java/hivemall/evaluation/MAPUDAF.java
@@ -88,7 +88,7 @@ public final class MAPUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (0 < parameters.length && parameters.length <= 3) : parameters.length;
+ assert (parameters.length >= 1 && parameters.length <= 3) : parameters.length;
super.init(mode, parameters);
// initialize input
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e8abae25/core/src/main/java/hivemall/evaluation/MRRUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/evaluation/MRRUDAF.java b/core/src/main/java/hivemall/evaluation/MRRUDAF.java
index fcd9d51..e92df67 100644
--- a/core/src/main/java/hivemall/evaluation/MRRUDAF.java
+++ b/core/src/main/java/hivemall/evaluation/MRRUDAF.java
@@ -88,7 +88,7 @@ public final class MRRUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (0 < parameters.length && parameters.length <= 3) : parameters.length;
+ assert (parameters.length >= 1 && parameters.length <= 3) : parameters.length;
super.init(mode, parameters);
// initialize input
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e8abae25/core/src/main/java/hivemall/evaluation/NDCGUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/evaluation/NDCGUDAF.java b/core/src/main/java/hivemall/evaluation/NDCGUDAF.java
index 00aa16a..6b92afe 100644
--- a/core/src/main/java/hivemall/evaluation/NDCGUDAF.java
+++ b/core/src/main/java/hivemall/evaluation/NDCGUDAF.java
@@ -91,7 +91,7 @@ public final class NDCGUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (0 < parameters.length && parameters.length <= 3) : parameters.length;
+ assert (parameters.length >= 1 && parameters.length <= 3) : parameters.length;
super.init(mode, parameters);
// initialize input
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e8abae25/core/src/main/java/hivemall/evaluation/PrecisionUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/evaluation/PrecisionUDAF.java b/core/src/main/java/hivemall/evaluation/PrecisionUDAF.java
index 2e09a71..ca6c4f0 100644
--- a/core/src/main/java/hivemall/evaluation/PrecisionUDAF.java
+++ b/core/src/main/java/hivemall/evaluation/PrecisionUDAF.java
@@ -88,7 +88,7 @@ public final class PrecisionUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (0 < parameters.length && parameters.length <= 3) : parameters.length;
+ assert (parameters.length >= 1 && parameters.length <= 3) : parameters.length;
super.init(mode, parameters);
// initialize input
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e8abae25/core/src/main/java/hivemall/evaluation/RecallUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/evaluation/RecallUDAF.java b/core/src/main/java/hivemall/evaluation/RecallUDAF.java
index d084c94..d0722d0 100644
--- a/core/src/main/java/hivemall/evaluation/RecallUDAF.java
+++ b/core/src/main/java/hivemall/evaluation/RecallUDAF.java
@@ -88,7 +88,7 @@ public final class RecallUDAF extends AbstractGenericUDAFResolver {
@Override
public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
- assert (0 < parameters.length && parameters.length <= 3) : parameters.length;
+ assert (parameters.length >= 1 && parameters.length <= 3) : parameters.length;
super.init(mode, parameters);
// initialize input