You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@flink.apache.org by zh...@apache.org on 2023/01/30 09:19:52 UTC

[flink-ml] branch master updated: [FLINK-30568] Add benchmark for PolynomialExpansion, Normalizer, Binarizer, Interaction, MaxAbsScaler, VectorSlicer, ElementwiseProduct and FeatureHasher

This is an automated email from the ASF dual-hosted git repository.

zhangzp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/flink-ml.git


The following commit(s) were added to refs/heads/master by this push:
     new 169aca4  [FLINK-30568] Add benchmark for PolynomialExpansion, Normalizer, Binarizer, Interaction, MaxAbsScaler, VectorSlicer, ElementwiseProduct and FeatureHasher
169aca4 is described below

commit 169aca44fa6541d9b230e7e8a4c5b146a35cda67
Author: weibo <wb...@pku.edu.cn>
AuthorDate: Mon Jan 30 17:19:46 2023 +0800

    [FLINK-30568] Add benchmark for PolynomialExpansion, Normalizer, Binarizer, Interaction, MaxAbsScaler, VectorSlicer, ElementwiseProduct and FeatureHasher
    
    This closes #198.
---
 .../src/main/resources/binarizer-benchmark.json    | 62 ++++++++++++++++++++++
 .../resources/elementwiseproduct-benchmark.json    | 39 ++++++++++++++
 .../main/resources/featurehasher-benchmark.json    | 54 +++++++++++++++++++
 .../src/main/resources/interaction-benchmark.json  | 48 +++++++++++++++++
 .../src/main/resources/maxabsscaler-benchmark.json | 36 +++++++++++++
 .../src/main/resources/normalizer-benchmark.json   | 39 ++++++++++++++
 .../resources/polynoimalexpansion-benchmark.json   | 39 ++++++++++++++
 .../src/main/resources/vectorslicer-benchmark.json | 39 ++++++++++++++
 8 files changed, 356 insertions(+)

diff --git a/flink-ml-benchmark/src/main/resources/binarizer-benchmark.json b/flink-ml-benchmark/src/main/resources/binarizer-benchmark.json
new file mode 100644
index 0000000..fc0a5fd
--- /dev/null
+++ b/flink-ml-benchmark/src/main/resources/binarizer-benchmark.json
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+{
+  "version": 1,
+  "binarizer10000000": {
+    "inputData": {
+      "className": "org.apache.flink.ml.benchmark.datagenerator.common.DoubleGenerator",
+      "paramMap": {
+        "colNames": [
+          [
+            "f0",
+            "f1",
+            "f2",
+            "f3",
+            "f4"
+          ]
+        ],
+        "seed": 2,
+        "numValues": 10000000
+      }
+    },
+    "stage": {
+      "className": "org.apache.flink.ml.feature.binarizer.Binarizer",
+      "paramMap": {
+        "inputCols": [
+          "f0",
+          "f1",
+          "f2",
+          "f3",
+          "f4"
+        ],
+        "outputCols": [
+          "outputCol0",
+          "outputCol1",
+          "outputCol2",
+          "outputCol3",
+          "outputCol4"
+        ],
+        "thresholds": [
+          0.5,
+          0.3,
+          0.3,
+          0.6,
+          0.8
+        ]
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/flink-ml-benchmark/src/main/resources/elementwiseproduct-benchmark.json b/flink-ml-benchmark/src/main/resources/elementwiseproduct-benchmark.json
new file mode 100644
index 0000000..740799a
--- /dev/null
+++ b/flink-ml-benchmark/src/main/resources/elementwiseproduct-benchmark.json
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+{
+  "version": 1,
+  "elementwiseproduct10000000": {
+    "inputData": {
+      "className": "org.apache.flink.ml.benchmark.datagenerator.common.DenseVectorGenerator",
+      "paramMap": {
+        "vectorDim": 5,
+        "colNames": [
+          [
+            "featuresCol"
+          ]
+        ],
+        "seed": 2,
+        "numValues": 10000000
+      }
+    },
+    "stage": {
+      "className": "org.apache.flink.ml.feature.elementwiseproduct.ElementwiseProduct",
+      "paramMap": {
+        "scalingVec": {"values": [1.0, 2.0, 3.0, 4.0, 5.0]}
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/flink-ml-benchmark/src/main/resources/featurehasher-benchmark.json b/flink-ml-benchmark/src/main/resources/featurehasher-benchmark.json
new file mode 100644
index 0000000..5c3826f
--- /dev/null
+++ b/flink-ml-benchmark/src/main/resources/featurehasher-benchmark.json
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+{
+  "version": 1,
+  "featurehasher10000000": {
+    "inputData": {
+      "className": "org.apache.flink.ml.benchmark.datagenerator.common.DoubleGenerator",
+      "paramMap": {
+        "colNames": [
+          [
+            "f0",
+            "f1",
+            "f2",
+            "f3",
+            "f4"
+          ]
+        ],
+        "seed": 2,
+        "numValues": 10000000
+      }
+    },
+    "stage": {
+      "className": "org.apache.flink.ml.feature.featurehasher.FeatureHasher",
+      "paramMap": {
+        "inputCols": [
+          "f0",
+          "f1",
+          "f2",
+          "f3",
+          "f4"
+        ],
+        "categoricalCols": [
+          "f0",
+          "f1",
+          "f2"
+        ],
+        "numFeatures": 1000
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/flink-ml-benchmark/src/main/resources/interaction-benchmark.json b/flink-ml-benchmark/src/main/resources/interaction-benchmark.json
new file mode 100644
index 0000000..01bb8bb
--- /dev/null
+++ b/flink-ml-benchmark/src/main/resources/interaction-benchmark.json
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+{
+  "version": 1,
+  "interaction10000000": {
+    "inputData": {
+      "className": "org.apache.flink.ml.benchmark.datagenerator.common.DoubleGenerator",
+      "paramMap": {
+        "colNames": [
+          [
+            "f0",
+            "f1",
+            "f2",
+            "f3",
+            "f4"
+          ]
+        ],
+        "seed": 2,
+        "numValues": 10000000
+      }
+    },
+    "stage": {
+      "className": "org.apache.flink.ml.feature.interaction.Interaction",
+      "paramMap": {
+        "inputCols": [
+          "f0",
+          "f1",
+          "f2",
+          "f3",
+          "f4"
+        ]
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/flink-ml-benchmark/src/main/resources/maxabsscaler-benchmark.json b/flink-ml-benchmark/src/main/resources/maxabsscaler-benchmark.json
new file mode 100644
index 0000000..9cbe218
--- /dev/null
+++ b/flink-ml-benchmark/src/main/resources/maxabsscaler-benchmark.json
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+{
+  "version": 1,
+  "maxabsscaler10000000": {
+    "inputData": {
+      "className": "org.apache.flink.ml.benchmark.datagenerator.common.DenseVectorGenerator",
+      "paramMap": {
+        "vectorDim": 100,
+        "colNames": [
+          [
+            "featuresCol"
+          ]
+        ],
+        "seed": 2,
+        "numValues": 10000000
+      }
+    },
+    "stage": {
+      "className": "org.apache.flink.ml.feature.maxabsscaler.MaxAbsScaler"
+    }
+  }
+}
\ No newline at end of file
diff --git a/flink-ml-benchmark/src/main/resources/normalizer-benchmark.json b/flink-ml-benchmark/src/main/resources/normalizer-benchmark.json
new file mode 100644
index 0000000..77d1f9e
--- /dev/null
+++ b/flink-ml-benchmark/src/main/resources/normalizer-benchmark.json
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+{
+  "version": 1,
+  "normalizer10000000": {
+    "inputData": {
+      "className": "org.apache.flink.ml.benchmark.datagenerator.common.DenseVectorGenerator",
+      "paramMap": {
+        "vectorDim": 5,
+        "colNames": [
+          [
+            "featuresCol"
+          ]
+        ],
+        "seed": 2,
+        "numValues": 10000000
+      }
+    },
+    "stage": {
+      "className": "org.apache.flink.ml.feature.normalizer.Normalizer",
+      "paramMap": {
+        "p": 2.0
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/flink-ml-benchmark/src/main/resources/polynoimalexpansion-benchmark.json b/flink-ml-benchmark/src/main/resources/polynoimalexpansion-benchmark.json
new file mode 100644
index 0000000..d35a45b
--- /dev/null
+++ b/flink-ml-benchmark/src/main/resources/polynoimalexpansion-benchmark.json
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+{
+  "version": 1,
+  "polynoimalexpansion10000000": {
+    "inputData": {
+      "className": "org.apache.flink.ml.benchmark.datagenerator.common.DenseVectorGenerator",
+      "paramMap": {
+        "vectorDim": 5,
+        "colNames": [
+          [
+            "featuresCol"
+          ]
+        ],
+        "seed": 2,
+        "numValues": 10000000
+      }
+    },
+    "stage": {
+      "className": "org.apache.flink.ml.feature.polynomialexpansion.PolynomialExpansion",
+      "paramMap": {
+        "degree": 2
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/flink-ml-benchmark/src/main/resources/vectorslicer-benchmark.json b/flink-ml-benchmark/src/main/resources/vectorslicer-benchmark.json
new file mode 100644
index 0000000..7f77166
--- /dev/null
+++ b/flink-ml-benchmark/src/main/resources/vectorslicer-benchmark.json
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+{
+  "version": 1,
+  "vectorslicer10000000": {
+    "inputData": {
+      "className": "org.apache.flink.ml.benchmark.datagenerator.common.DenseVectorGenerator",
+      "paramMap": {
+        "vectorDim": 10,
+        "colNames": [
+          [
+            "featuresCol"
+          ]
+        ],
+        "seed": 2,
+        "numValues": 10000000
+      }
+    },
+    "stage": {
+      "className": "org.apache.flink.ml.feature.vectorslicer.VectorSlicer",
+      "paramMap": {
+        "indices": [1, 3, 5, 7]
+      }
+    }
+  }
+}
\ No newline at end of file