You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by me...@apache.org on 2015/12/08 08:26:53 UTC
[2/3] spark git commit: [SPARK-11551][DOC][EXAMPLE] Replace example
code in ml-features.md using include_example
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
new file mode 100644
index 0000000..668f71e
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.PolynomialExpansion;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaPolynomialExpansionExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaPolynomialExpansionExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ // $example on$
+ PolynomialExpansion polyExpansion = new PolynomialExpansion()
+ .setInputCol("features")
+ .setOutputCol("polyFeatures")
+ .setDegree(3);
+
+ JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Vectors.dense(-2.0, 2.3)),
+ RowFactory.create(Vectors.dense(0.0, 0.0)),
+ RowFactory.create(Vectors.dense(0.6, -1.1))
+ ));
+
+ StructType schema = new StructType(new StructField[]{
+ new StructField("features", new VectorUDT(), false, Metadata.empty()),
+ });
+
+ DataFrame df = jsql.createDataFrame(data, schema);
+ DataFrame polyDF = polyExpansion.transform(df);
+
+ Row[] row = polyDF.select("polyFeatures").take(3);
+ for (Row r : row) {
+ System.out.println(r.get(0));
+ }
+ // $example off$
+ jsc.stop();
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
new file mode 100644
index 0000000..1e1062b
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RFormula;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+import static org.apache.spark.sql.types.DataTypes.*;
+// $example off$
+
+public class JavaRFormulaExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaRFormulaExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ // $example on$
+ StructType schema = createStructType(new StructField[]{
+ createStructField("id", IntegerType, false),
+ createStructField("country", StringType, false),
+ createStructField("hour", IntegerType, false),
+ createStructField("clicked", DoubleType, false)
+ });
+
+ JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(7, "US", 18, 1.0),
+ RowFactory.create(8, "CA", 12, 0.0),
+ RowFactory.create(9, "NZ", 15, 0.0)
+ ));
+
+ DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
+ RFormula formula = new RFormula()
+ .setFormula("clicked ~ country + hour")
+ .setFeaturesCol("features")
+ .setLabelCol("label");
+ DataFrame output = formula.fit(dataset).transform(dataset);
+ output.select("features", "label").show();
+ // $example off$
+ jsc.stop();
+ }
+}
+
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java
new file mode 100644
index 0000000..0cbdc97
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import org.apache.spark.ml.feature.StandardScaler;
+import org.apache.spark.ml.feature.StandardScalerModel;
+import org.apache.spark.sql.DataFrame;
+// $example off$
+
+public class JavaStandardScalerExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaStandardScalerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ // $example on$
+ DataFrame dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+
+ StandardScaler scaler = new StandardScaler()
+ .setInputCol("features")
+ .setOutputCol("scaledFeatures")
+ .setWithStd(true)
+ .setWithMean(false);
+
+ // Compute summary statistics by fitting the StandardScaler
+ StandardScalerModel scalerModel = scaler.fit(dataFrame);
+
+ // Normalize each feature to have unit standard deviation.
+ DataFrame scaledData = scalerModel.transform(dataFrame);
+ // $example off$
+ jsc.stop();
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
new file mode 100644
index 0000000..b6b201c
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.StopWordsRemover;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaStopWordsRemoverExample {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaStopWordsRemoverExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ // $example on$
+ StopWordsRemover remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered");
+
+ JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
+ RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
+ ));
+
+ StructType schema = new StructType(new StructField[]{
+ new StructField(
+ "raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
+ });
+
+ DataFrame dataset = jsql.createDataFrame(rdd, schema);
+ remover.transform(dataset).show();
+ // $example off$
+ jsc.stop();
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
new file mode 100644
index 0000000..05d12c1
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.StringIndexer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+import static org.apache.spark.sql.types.DataTypes.*;
+// $example off$
+
+public class JavaStringIndexerExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaStringIndexerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ // $example on$
+ JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0, "a"),
+ RowFactory.create(1, "b"),
+ RowFactory.create(2, "c"),
+ RowFactory.create(3, "a"),
+ RowFactory.create(4, "a"),
+ RowFactory.create(5, "c")
+ ));
+ StructType schema = new StructType(new StructField[]{
+ createStructField("id", IntegerType, false),
+ createStructField("category", StringType, false)
+ });
+ DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+ StringIndexer indexer = new StringIndexer()
+ .setInputCol("category")
+ .setOutputCol("categoryIndex");
+ DataFrame indexed = indexer.fit(df).transform(df);
+ indexed.show();
+ // $example off$
+ jsc.stop();
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
new file mode 100644
index 0000000..617dc3f
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RegexTokenizer;
+import org.apache.spark.ml.feature.Tokenizer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaTokenizerExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaTokenizerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ // $example on$
+ JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0, "Hi I heard about Spark"),
+ RowFactory.create(1, "I wish Java could use case classes"),
+ RowFactory.create(2, "Logistic,regression,models,are,neat")
+ ));
+
+ StructType schema = new StructType(new StructField[]{
+ new StructField("label", DataTypes.IntegerType, false, Metadata.empty()),
+ new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
+ });
+
+ DataFrame sentenceDataFrame = sqlContext.createDataFrame(jrdd, schema);
+
+ Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
+
+ DataFrame wordsDataFrame = tokenizer.transform(sentenceDataFrame);
+ for (Row r : wordsDataFrame.select("words", "label"). take(3)) {
+ java.util.List<String> words = r.getList(0);
+ for (String word : words) System.out.print(word + " ");
+ System.out.println();
+ }
+
+ RegexTokenizer regexTokenizer = new RegexTokenizer()
+ .setInputCol("sentence")
+ .setOutputCol("words")
+ .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
+ // $example off$
+ jsc.stop();
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
new file mode 100644
index 0000000..7e230b5
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.VectorAssembler;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.*;
+
+import static org.apache.spark.sql.types.DataTypes.*;
+// $example off$
+
+public class JavaVectorAssemblerExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaVectorAssemblerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ // $example on$
+ StructType schema = createStructType(new StructField[]{
+ createStructField("id", IntegerType, false),
+ createStructField("hour", IntegerType, false),
+ createStructField("mobile", DoubleType, false),
+ createStructField("userFeatures", new VectorUDT(), false),
+ createStructField("clicked", DoubleType, false)
+ });
+ Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
+ JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row));
+ DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
+
+ VectorAssembler assembler = new VectorAssembler()
+ .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
+ .setOutputCol("features");
+
+ DataFrame output = assembler.transform(dataset);
+ System.out.println(output.select("features", "clicked").first());
+ // $example off$
+ jsc.stop();
+ }
+}
+
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java
new file mode 100644
index 0000000..06b4bf6
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Map;
+
+import org.apache.spark.ml.feature.VectorIndexer;
+import org.apache.spark.ml.feature.VectorIndexerModel;
+import org.apache.spark.sql.DataFrame;
+// $example off$
+
+public class JavaVectorIndexerExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaVectorIndexerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ // $example on$
+ DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+
+ VectorIndexer indexer = new VectorIndexer()
+ .setInputCol("features")
+ .setOutputCol("indexed")
+ .setMaxCategories(10);
+ VectorIndexerModel indexerModel = indexer.fit(data);
+
+ Map<Integer, Map<Double, Integer>> categoryMaps = indexerModel.javaCategoryMaps();
+ System.out.print("Chose " + categoryMaps.size() + " categorical features:");
+
+ for (Integer feature : categoryMaps.keySet()) {
+ System.out.print(" " + feature);
+ }
+ System.out.println();
+
+ // Create new column "indexed" with categorical values transformed to indices
+ DataFrame indexedData = indexerModel.transform(data);
+ // $example off$
+ jsc.stop();
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
new file mode 100644
index 0000000..4d5cb04
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.attribute.Attribute;
+import org.apache.spark.ml.attribute.AttributeGroup;
+import org.apache.spark.ml.attribute.NumericAttribute;
+import org.apache.spark.ml.feature.VectorSlicer;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.*;
+// $example off$
+
public class JavaVectorSlicerExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaVectorSlicerExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    // $example on$
    // Name the three components of the input vector so they can be sliced by name.
    Attribute[] attrs = new Attribute[]{
      NumericAttribute.defaultAttr().withName("f1"),
      NumericAttribute.defaultAttr().withName("f2"),
      NumericAttribute.defaultAttr().withName("f3")
    };
    AttributeGroup group = new AttributeGroup("userFeatures", attrs);

    JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
      RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
      RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
    ));

    // Schema is derived from the attribute group so column metadata carries the names.
    DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField()));

    VectorSlicer vectorSlicer = new VectorSlicer()
      .setInputCol("userFeatures").setOutputCol("features");

    // Select by position (index 1 = "f2") and by name ("f3"); the union is sliced out.
    vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"});
    // or vectorSlicer.setIndices(new int[]{1, 2}), or vectorSlicer.setNames(new String[]{"f2", "f3"})

    DataFrame output = vectorSlicer.transform(dataset);

    System.out.println(output.select("userFeatures", "features").first());
    // $example off$
    jsc.stop();
  }
}
+
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/python/ml/binarizer_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/binarizer_example.py b/examples/src/main/python/ml/binarizer_example.py
new file mode 100644
index 0000000..960ad20
--- /dev/null
+++ b/examples/src/main/python/ml/binarizer_example.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import Binarizer
+# $example off$
+
if __name__ == "__main__":
    sc = SparkContext(appName="BinarizerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    # Rows of (label, continuous feature value).
    rows = [
        (0, 0.1),
        (1, 0.8),
        (2, 0.2)
    ]
    continuousDataFrame = sqlContext.createDataFrame(rows, ["label", "feature"])

    # Values above 0.5 become 1.0, the rest become 0.0.
    binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
    binarizedDataFrame = binarizer.transform(continuousDataFrame)

    for (value,) in binarizedDataFrame.select("binarized_feature").collect():
        print(value)
    # $example off$

    sc.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/python/ml/bucketizer_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/bucketizer_example.py b/examples/src/main/python/ml/bucketizer_example.py
new file mode 100644
index 0000000..a12750a
--- /dev/null
+++ b/examples/src/main/python/ml/bucketizer_example.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import Bucketizer
+# $example off$
+
if __name__ == "__main__":
    sc = SparkContext(appName="BucketizerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    # Bucket boundaries: (-inf, -0.5), [-0.5, 0.0), [0.0, 0.5), [0.5, inf).
    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
    dataFrame = sqlContext.createDataFrame(data, ["features"])

    bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)
    # Display the result (previously computed but never shown, unlike sibling examples).
    bucketedData.show()
    # $example off$

    sc.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/python/ml/elementwise_product_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/elementwise_product_example.py b/examples/src/main/python/ml/elementwise_product_example.py
new file mode 100644
index 0000000..c85cb0d
--- /dev/null
+++ b/examples/src/main/python/ml/elementwise_product_example.py
@@ -0,0 +1,39 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import ElementwiseProduct
+from pyspark.mllib.linalg import Vectors
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="ElementwiseProductExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
+ df = sqlContext.createDataFrame(data, ["vector"])
+ transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
+ inputCol="vector", outputCol="transformedVector")
+ transformer.transform(df).show()
+ # $example off$
+
+ sc.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/python/ml/n_gram_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py
new file mode 100644
index 0000000..f2d85f5
--- /dev/null
+++ b/examples/src/main/python/ml/n_gram_example.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import NGram
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="NGramExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ wordDataFrame = sqlContext.createDataFrame([
+ (0, ["Hi", "I", "heard", "about", "Spark"]),
+ (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
+ (2, ["Logistic", "regression", "models", "are", "neat"])
+ ], ["label", "words"])
+ ngram = NGram(inputCol="words", outputCol="ngrams")
+ ngramDataFrame = ngram.transform(wordDataFrame)
+ for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
+ print(ngrams_label)
+ # $example off$
+
+ sc.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/python/ml/normalizer_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py
new file mode 100644
index 0000000..833d93e
--- /dev/null
+++ b/examples/src/main/python/ml/normalizer_example.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import Normalizer
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="NormalizerExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+
+ # Normalize each Vector using $L^1$ norm.
+ normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
+ l1NormData = normalizer.transform(dataFrame)
+
+ # Normalize each Vector using $L^\infty$ norm.
+ lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
+ # $example off$
+
+ sc.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/python/ml/onehot_encoder_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py
new file mode 100644
index 0000000..7529dfd
--- /dev/null
+++ b/examples/src/main/python/ml/onehot_encoder_example.py
@@ -0,0 +1,47 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import OneHotEncoder, StringIndexer
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="OneHotEncoderExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ df = sqlContext.createDataFrame([
+ (0, "a"),
+ (1, "b"),
+ (2, "c"),
+ (3, "a"),
+ (4, "a"),
+ (5, "c")
+ ], ["id", "category"])
+
+ stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+ model = stringIndexer.fit(df)
+ indexed = model.transform(df)
+ encoder = OneHotEncoder(dropLast=False, inputCol="categoryIndex", outputCol="categoryVec")
+ encoded = encoder.transform(indexed)
+ # $example off$
+
+ sc.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/python/ml/pca_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/pca_example.py b/examples/src/main/python/ml/pca_example.py
new file mode 100644
index 0000000..8b66140
--- /dev/null
+++ b/examples/src/main/python/ml/pca_example.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import PCA
+from pyspark.mllib.linalg import Vectors
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="PCAExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
+ (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
+ (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
+ df = sqlContext.createDataFrame(data,["features"])
+ pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
+ model = pca.fit(df)
+ result = model.transform(df).select("pcaFeatures")
+ result.show(truncate=False)
+ # $example off$
+
+ sc.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/python/ml/polynomial_expansion_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py
new file mode 100644
index 0000000..030a613
--- /dev/null
+++ b/examples/src/main/python/ml/polynomial_expansion_example.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import PolynomialExpansion
+from pyspark.mllib.linalg import Vectors
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="PolynomialExpansionExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ df = sqlContext.createDataFrame(
+ [(Vectors.dense([-2.0, 2.3]), ),
+ (Vectors.dense([0.0, 0.0]), ),
+ (Vectors.dense([0.6, -1.1]), )],
+ ["features"])
+ px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
+ polyDF = px.transform(df)
+ for expanded in polyDF.select("polyFeatures").take(3):
+ print(expanded)
+ # $example off$
+
+ sc.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/python/ml/rformula_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/rformula_example.py b/examples/src/main/python/ml/rformula_example.py
new file mode 100644
index 0000000..b544a14
--- /dev/null
+++ b/examples/src/main/python/ml/rformula_example.py
@@ -0,0 +1,44 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import RFormula
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="RFormulaExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ dataset = sqlContext.createDataFrame(
+ [(7, "US", 18, 1.0),
+ (8, "CA", 12, 0.0),
+ (9, "NZ", 15, 0.0)],
+ ["id", "country", "hour", "clicked"])
+ formula = RFormula(
+ formula="clicked ~ country + hour",
+ featuresCol="features",
+ labelCol="label")
+ output = formula.fit(dataset).transform(dataset)
+ output.select("features", "label").show()
+ # $example off$
+
+ sc.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/python/ml/standard_scaler_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/standard_scaler_example.py b/examples/src/main/python/ml/standard_scaler_example.py
new file mode 100644
index 0000000..139acec
--- /dev/null
+++ b/examples/src/main/python/ml/standard_scaler_example.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import StandardScaler
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="StandardScalerExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
+ withStd=True, withMean=False)
+
+ # Compute summary statistics by fitting the StandardScaler
+ scalerModel = scaler.fit(dataFrame)
+
+ # Normalize each feature to have unit standard deviation.
+ scaledData = scalerModel.transform(dataFrame)
+ # $example off$
+
+ sc.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/python/ml/stopwords_remover_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py
new file mode 100644
index 0000000..01f94af
--- /dev/null
+++ b/examples/src/main/python/ml/stopwords_remover_example.py
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import StopWordsRemover
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="StopWordsRemoverExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ sentenceData = sqlContext.createDataFrame([
+ (0, ["I", "saw", "the", "red", "baloon"]),
+ (1, ["Mary", "had", "a", "little", "lamb"])
+ ], ["label", "raw"])
+
+ remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
+ remover.transform(sentenceData).show(truncate=False)
+ # $example off$
+
+ sc.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/python/ml/string_indexer_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/string_indexer_example.py b/examples/src/main/python/ml/string_indexer_example.py
new file mode 100644
index 0000000..58a8cb5
--- /dev/null
+++ b/examples/src/main/python/ml/string_indexer_example.py
@@ -0,0 +1,39 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import StringIndexer
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="StringIndexerExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ df = sqlContext.createDataFrame(
+ [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
+ ["id", "category"])
+ indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+ indexed = indexer.fit(df).transform(df)
+ indexed.show()
+ # $example off$
+
+ sc.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/python/ml/tokenizer_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py
new file mode 100644
index 0000000..ce9b225
--- /dev/null
+++ b/examples/src/main/python/ml/tokenizer_example.py
@@ -0,0 +1,44 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import Tokenizer, RegexTokenizer
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="TokenizerExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ sentenceDataFrame = sqlContext.createDataFrame([
+ (0, "Hi I heard about Spark"),
+ (1, "I wish Java could use case classes"),
+ (2, "Logistic,regression,models,are,neat")
+ ], ["label", "sentence"])
+ tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
+ wordsDataFrame = tokenizer.transform(sentenceDataFrame)
+ for words_label in wordsDataFrame.select("words", "label").take(3):
+ print(words_label)
+ regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
+ # alternatively, pattern="\\w+", gaps(False)
+ # $example off$
+
+ sc.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/python/ml/vector_assembler_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py
new file mode 100644
index 0000000..04f6483
--- /dev/null
+++ b/examples/src/main/python/ml/vector_assembler_example.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.mllib.linalg import Vectors
+from pyspark.ml.feature import VectorAssembler
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="VectorAssemblerExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ dataset = sqlContext.createDataFrame(
+ [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
+ ["id", "hour", "mobile", "userFeatures", "clicked"])
+ assembler = VectorAssembler(
+ inputCols=["hour", "mobile", "userFeatures"],
+ outputCol="features")
+ output = assembler.transform(dataset)
+ print(output.select("features", "clicked").first())
+ # $example off$
+
+ sc.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/python/ml/vector_indexer_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py
new file mode 100644
index 0000000..cc00d14
--- /dev/null
+++ b/examples/src/main/python/ml/vector_indexer_example.py
@@ -0,0 +1,39 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import VectorIndexer
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="VectorIndexerExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
+ indexerModel = indexer.fit(data)
+
+ # Create new column "indexed" with categorical values transformed to indices
+ indexedData = indexerModel.transform(data)
+ # $example off$
+
+ sc.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
new file mode 100644
index 0000000..e724aa5
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.Binarizer
+// $example off$
+import org.apache.spark.sql.{DataFrame, SQLContext}
+import org.apache.spark.{SparkConf, SparkContext}
+
+object BinarizerExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("BinarizerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+ // $example on$
+ // (label, continuous feature) pairs to be thresholded.
+ val data = Array((0, 0.1), (1, 0.8), (2, 0.2))
+ val dataFrame: DataFrame = sqlContext.createDataFrame(data).toDF("label", "feature")
+
+ // Binarize "feature" against threshold 0.5 into "binarized_feature".
+ val binarizer: Binarizer = new Binarizer()
+ .setInputCol("feature")
+ .setOutputCol("binarized_feature")
+ .setThreshold(0.5)
+
+ val binarizedDataFrame = binarizer.transform(dataFrame)
+ // Print one binarized row per input.
+ val binarizedFeatures = binarizedDataFrame.select("binarized_feature")
+ binarizedFeatures.collect().foreach(println)
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
new file mode 100644
index 0000000..30c2776
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.Bucketizer
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object BucketizerExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("BucketizerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ // Bucket boundaries; +/-Infinity endpoints cover the whole real line.
+ val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
+
+ val data = Array(-0.5, -0.3, 0.0, 0.2)
+ val dataFrame = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+
+ val bucketizer = new Bucketizer()
+ .setInputCol("features")
+ .setOutputCol("bucketedFeatures")
+ .setSplits(splits)
+
+ // Transform original data into its bucket index.
+ val bucketedData = bucketizer.transform(dataFrame)
+ // Display the bucketed output so the example actually demonstrates the result.
+ bucketedData.show()
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
+
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
new file mode 100644
index 0000000..314c2c2
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.DCT
+import org.apache.spark.mllib.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object DCTExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("DCTExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ // Three 4-dimensional input vectors.
+ val data = Seq(
+ Vectors.dense(0.0, 1.0, -2.0, 3.0),
+ Vectors.dense(-1.0, 2.0, 4.0, -7.0),
+ Vectors.dense(14.0, -2.0, -5.0, 1.0))
+
+ val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+
+ // setInverse(false) requests the forward discrete cosine transform.
+ val dct = new DCT()
+ .setInputCol("features")
+ .setOutputCol("featuresDCT")
+ .setInverse(false)
+
+ val dctDf = dct.transform(df)
+ dctDf.select("featuresDCT").show(3)
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
+
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/ElementWiseProductExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ElementWiseProductExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ElementWiseProductExample.scala
new file mode 100644
index 0000000..ac50bb7
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ElementWiseProductExample.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.ElementwiseProduct
+import org.apache.spark.mllib.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
/**
 * Example demonstrating ElementwiseProduct: multiplies each input vector
 * element-wise by a fixed scaling ("weight") vector.
 */
object ElementwiseProductExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ElementwiseProductExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    // Create some vector data; also works for sparse vectors
    val rows = Seq(
      ("a", Vectors.dense(1.0, 2.0, 3.0)),
      ("b", Vectors.dense(4.0, 5.0, 6.0)))
    val dataFrame = sqlContext.createDataFrame(rows).toDF("id", "vector")

    val weightVector = Vectors.dense(0.0, 1.0, 2.0)
    val transformer = new ElementwiseProduct()
      .setScalingVec(weightVector)
      .setInputCol("vector")
      .setOutputCol("transformedVector")

    // Batch transform the vectors to create new column:
    transformer.transform(dataFrame).show()
    // $example off$
    sc.stop()
  }
}
+// scalastyle:on println
+
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
new file mode 100644
index 0000000..dac3679
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.MinMaxScaler
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
/**
 * Example demonstrating MinMaxScaler: rescales each feature of a vector
 * column to the range [min, max] (default [0, 1]) using per-feature
 * summary statistics computed by fit().
 */
object MinMaxScalerExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("MinMaxScalerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // Compute summary statistics and generate MinMaxScalerModel
    val scalerModel = scaler.fit(dataFrame)

    // rescale each feature to range [min, max].
    val scaledData = scalerModel.transform(dataFrame)
    // Display the result — without this the example computes the scaled
    // data but never shows anything.
    scaledData.show()
    // $example off$
    sc.stop()
  }
}
+// scalastyle:on println
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
new file mode 100644
index 0000000..8a85f71
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.NGram
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
/**
 * Example demonstrating the NGram transformer, which converts an array of
 * tokens into an array of space-joined n-grams (bigrams by default).
 */
object NGramExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("NGramExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val wordDataFrame = sqlContext.createDataFrame(Seq(
      (0, Array("Hi", "I", "heard", "about", "Spark")),
      (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
      (2, Array("Logistic", "regression", "models", "are", "neat"))
    )).toDF("label", "words")

    val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
    val ngramDataFrame = ngram.transform(wordDataFrame)
    // Array columns come back from Row.getAs as Seq (a WrappedArray); the
    // original cast to Stream[String] fails with a ClassCastException at
    // runtime, so read the column as Seq[String] instead.
    ngramDataFrame.take(3).map(_.getAs[Seq[String]]("ngrams").toList).foreach(println)
    // $example off$
    sc.stop()
  }
}
+// scalastyle:on println
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
new file mode 100644
index 0000000..17571f0
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.Normalizer
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
/**
 * Example demonstrating the Normalizer transformer, which scales each
 * vector row to unit p-norm.
 */
object NormalizerExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("NormalizerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    // Normalize each Vector using $L^1$ norm.
    val normalizer = new Normalizer()
      .setInputCol("features")
      .setOutputCol("normFeatures")
      .setP(1.0)

    val l1NormData = normalizer.transform(dataFrame)
    // Display the result — without this the example computes the
    // normalized data but never shows anything.
    l1NormData.show()

    // Normalize each Vector using $L^\infty$ norm, overriding the
    // transformer's p parameter for this single call.
    val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
    lInfNormData.show()
    // $example off$
    sc.stop()
  }
}
+// scalastyle:on println
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
new file mode 100644
index 0000000..4512736
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
/**
 * Example demonstrating OneHotEncoder: indexes a string category column
 * with StringIndexer, then maps each category index to a sparse binary
 * vector.
 */
object OneHotEncoderExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("OneHotEncoderExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val df = sqlContext.createDataFrame(Seq(
      (0, "a"),
      (1, "b"),
      (2, "c"),
      (3, "a"),
      (4, "a"),
      (5, "c")
    )).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    val encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec")
    val encoded = encoder.transform(indexed)
    // Use show() so output appears on the driver; foreach(println) runs on
    // the executors and prints nothing to the driver console in cluster mode.
    encoded.select("id", "categoryVec").show()
    // $example off$
    sc.stop()
  }
}
+// scalastyle:on println
+
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
new file mode 100644
index 0000000..a18d4f3
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.PCA
+import org.apache.spark.mllib.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
/**
 * Example demonstrating PCA: projects 5-dimensional feature vectors onto
 * their top 3 principal components.
 */
object PCAExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PCAExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val vectors = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )
    val df = sqlContext.createDataFrame(vectors.map(Tuple1.apply)).toDF("features")

    // Fit a PCA model with k = 3 output dimensions.
    val pcaModel = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(3)
      .fit(df)

    pcaModel.transform(df).select("pcaFeatures").show()
    // $example off$
    sc.stop()
  }
}
+// scalastyle:on println
+
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
new file mode 100644
index 0000000..b8e9e69
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.PolynomialExpansion
+import org.apache.spark.mllib.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
/**
 * Example demonstrating PolynomialExpansion: expands each 2-dimensional
 * feature vector into the full polynomial space of degree 3.
 */
object PolynomialExpansionExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PolynomialExpansionExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val inputVectors = Array(
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(0.6, -1.1)
    )
    val df = sqlContext.createDataFrame(inputVectors.map(Tuple1.apply)).toDF("features")

    val expander = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(3)

    val polyDF = expander.transform(df)
    // Collect the first three expanded rows to the driver and print them.
    for (row <- polyDF.select("polyFeatures").take(3)) {
      println(row)
    }
    // $example off$
    sc.stop()
  }
}
+// scalastyle:on println
+
+
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala
new file mode 100644
index 0000000..286866e
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.RFormula
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
/**
 * Example demonstrating RFormula: builds feature and label columns from an
 * R-style model formula.
 */
object RFormulaExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RFormulaExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val clickData = Seq(
      (7, "US", 18, 1.0),
      (8, "CA", 12, 0.0),
      (9, "NZ", 15, 0.0))
    val dataset = sqlContext.createDataFrame(clickData).toDF("id", "country", "hour", "clicked")

    // Predict "clicked" from "country" and "hour".
    val formula = new RFormula()
      .setFormula("clicked ~ country + hour")
      .setFeaturesCol("features")
      .setLabelCol("label")

    val output = formula.fit(dataset).transform(dataset)
    output.select("features", "label").show()
    // $example off$
    sc.stop()
  }
}
+// scalastyle:on println
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala
new file mode 100644
index 0000000..646ce0f
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.StandardScaler
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
/**
 * Example demonstrating StandardScaler: normalizes each feature to unit
 * standard deviation (and optionally zero mean) using statistics computed
 * by fit().
 */
object StandardScalerExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StandardScalerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true)
      .setWithMean(false)

    // Compute summary statistics by fitting the StandardScaler.
    val scalerModel = scaler.fit(dataFrame)

    // Normalize each feature to have unit standard deviation.
    val scaledData = scalerModel.transform(dataFrame)
    // Display the result — without this the example computes the scaled
    // data but never shows anything.
    scaledData.show()
    // $example off$
    sc.stop()
  }
}
+// scalastyle:on println
http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
new file mode 100644
index 0000000..655ffce
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.StopWordsRemover
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
/**
 * Example demonstrating StopWordsRemover: filters default English stop
 * words out of a column of token sequences.
 */
object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StopWordsRemoverExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val dataSet = sqlContext.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "baloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )).toDF("id", "raw")

    // Remove stop words from the "raw" token column into "filtered".
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    remover.transform(dataSet).show()
    // $example off$
    sc.stop()
  }
}
+// scalastyle:on println
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org