You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2015/01/28 01:08:46 UTC
[5/5] spark git commit: [SPARK-5097][SQL] DataFrame
[SPARK-5097][SQL] DataFrame
This pull request redesigns the existing Spark SQL dsl, which already provides data frame like functionalities.
TODOs:
With the exception of Python support, other tasks can be done in separate, follow-up PRs.
- [ ] Audit of the API
- [ ] Documentation
- [ ] More test cases to cover the new API
- [x] Python support
- [ ] Type alias SchemaRDD
Author: Reynold Xin <rx...@databricks.com>
Author: Davies Liu <da...@databricks.com>
Closes #4173 from rxin/df1 and squashes the following commits:
0a1a73b [Reynold Xin] Merge branch 'df1' of github.com:rxin/spark into df1
23b4427 [Reynold Xin] Mima.
828f70d [Reynold Xin] Merge pull request #7 from davies/df
257b9e6 [Davies Liu] add repartition
6bf2b73 [Davies Liu] fix collect with UDT and tests
e971078 [Reynold Xin] Missing quotes.
b9306b4 [Reynold Xin] Remove removeColumn/updateColumn for now.
a728bf2 [Reynold Xin] Example rename.
e8aa3d3 [Reynold Xin] groupby -> groupBy.
9662c9e [Davies Liu] improve DataFrame Python API
4ae51ea [Davies Liu] python API for dataframe
1e5e454 [Reynold Xin] Fixed a bug with symbol conversion.
2ca74db [Reynold Xin] Couple minor fixes.
ea98ea1 [Reynold Xin] Documentation & literal expressions.
2b22684 [Reynold Xin] Got rid of IntelliJ problems.
02bbfbc [Reynold Xin] Tightening imports.
ffbce66 [Reynold Xin] Fixed compilation error.
59b6d8b [Reynold Xin] Style violation.
b85edfb [Reynold Xin] ALS.
8c37f0a [Reynold Xin] Made MLlib and examples compile
6d53134 [Reynold Xin] Hive module.
d35efd5 [Reynold Xin] Fixed compilation error.
ce4a5d2 [Reynold Xin] Fixed test cases in SQL except ParquetIOSuite.
66d5ef1 [Reynold Xin] SQLContext minor patch.
c9bcdc0 [Reynold Xin] Checkpoint: SQL module compiles!
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/119f45d6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/119f45d6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/119f45d6
Branch: refs/heads/master
Commit: 119f45d61d7b48d376cca05e1b4f0c7fcf65bfa8
Parents: b1b35ca
Author: Reynold Xin <rx...@databricks.com>
Authored: Tue Jan 27 16:08:24 2015 -0800
Committer: Reynold Xin <rx...@databricks.com>
Committed: Tue Jan 27 16:08:24 2015 -0800
----------------------------------------------------------------------
.../examples/ml/JavaCrossValidatorExample.java | 10 +-
.../examples/ml/JavaSimpleParamsExample.java | 12 +-
.../JavaSimpleTextClassificationPipeline.java | 10 +-
.../apache/spark/examples/sql/JavaSparkSQL.java | 36 +-
.../src/main/python/mllib/dataset_example.py | 2 +-
examples/src/main/python/sql.py | 16 +-
.../examples/ml/CrossValidatorExample.scala | 3 +-
.../apache/spark/examples/ml/MovieLensALS.scala | 2 +-
.../spark/examples/ml/SimpleParamsExample.scala | 5 +-
.../ml/SimpleTextClassificationPipeline.scala | 3 +-
.../spark/examples/mllib/DatasetExample.scala | 28 +-
.../apache/spark/examples/sql/RDDRelation.scala | 6 +-
.../scala/org/apache/spark/ml/Estimator.scala | 8 +-
.../scala/org/apache/spark/ml/Evaluator.scala | 4 +-
.../scala/org/apache/spark/ml/Pipeline.scala | 6 +-
.../scala/org/apache/spark/ml/Transformer.scala | 17 +-
.../ml/classification/LogisticRegression.scala | 14 +-
.../BinaryClassificationEvaluator.scala | 7 +-
.../spark/ml/feature/StandardScaler.scala | 15 +-
.../apache/spark/ml/recommendation/ALS.scala | 37 +-
.../apache/spark/ml/tuning/CrossValidator.scala | 8 +-
.../org/apache/spark/ml/JavaPipelineSuite.java | 6 +-
.../JavaLogisticRegressionSuite.java | 8 +-
.../ml/tuning/JavaCrossValidatorSuite.java | 4 +-
.../org/apache/spark/ml/PipelineSuite.scala | 14 +-
.../LogisticRegressionSuite.scala | 16 +-
.../spark/ml/recommendation/ALSSuite.scala | 4 +-
.../spark/ml/tuning/CrossValidatorSuite.scala | 4 +-
project/MimaExcludes.scala | 15 +-
python/pyspark/java_gateway.py | 7 +-
python/pyspark/sql.py | 967 ++++++++++++++-----
python/pyspark/tests.py | 155 +--
.../analysis/MultiInstanceRelation.scala | 2 +-
.../catalyst/expressions/namedExpressions.scala | 3 +
.../spark/sql/catalyst/plans/joinTypes.scala | 15 +
.../catalyst/plans/logical/TestRelation.scala | 8 +-
.../org/apache/spark/sql/CacheManager.scala | 8 +-
.../scala/org/apache/spark/sql/Column.scala | 528 ++++++++++
.../scala/org/apache/spark/sql/DataFrame.scala | 596 ++++++++++++
.../org/apache/spark/sql/GroupedDataFrame.scala | 139 +++
.../scala/org/apache/spark/sql/Literal.scala | 98 ++
.../scala/org/apache/spark/sql/SQLContext.scala | 85 +-
.../scala/org/apache/spark/sql/SchemaRDD.scala | 511 ----------
.../org/apache/spark/sql/SchemaRDDLike.scala | 139 ---
.../main/scala/org/apache/spark/sql/api.scala | 289 ++++++
.../org/apache/spark/sql/dsl/package.scala | 495 ++++++++++
.../apache/spark/sql/execution/commands.scala | 8 +-
.../spark/sql/execution/debug/package.scala | 4 +-
.../scala/org/apache/spark/sql/package.scala | 2 +-
.../apache/spark/sql/parquet/ParquetTest.scala | 6 +-
.../spark/sql/sources/DataSourceStrategy.scala | 32 +-
.../org/apache/spark/sql/sources/ddl.scala | 5 +-
.../apache/spark/sql/test/TestSQLContext.scala | 6 +-
.../apache/spark/sql/api/java/JavaAPISuite.java | 4 +-
.../sql/api/java/JavaApplySchemaSuite.java | 16 +-
.../org/apache/spark/sql/CachedTableSuite.scala | 1 +
.../org/apache/spark/sql/DslQuerySuite.scala | 119 +--
.../scala/org/apache/spark/sql/JoinSuite.scala | 67 +-
.../scala/org/apache/spark/sql/QueryTest.scala | 12 +-
.../org/apache/spark/sql/SQLQuerySuite.scala | 18 +-
.../scala/org/apache/spark/sql/TestData.scala | 23 +-
.../scala/org/apache/spark/sql/UDFSuite.scala | 11 +-
.../apache/spark/sql/UserDefinedTypeSuite.scala | 6 +-
.../columnar/InMemoryColumnarQuerySuite.scala | 1 +
.../spark/sql/execution/PlannerSuite.scala | 14 +-
.../apache/spark/sql/execution/TgfSuite.scala | 65 --
.../org/apache/spark/sql/json/JsonSuite.scala | 11 +-
.../spark/sql/parquet/ParquetFilterSuite.scala | 126 +--
.../spark/sql/parquet/ParquetIOSuite.scala | 7 +-
.../spark/sql/sources/PrunedScanSuite.scala | 2 +
.../hive/thriftserver/SparkSQLCLIDriver.scala | 2 +-
.../spark/sql/hive/thriftserver/Shim12.scala | 6 +-
.../spark/sql/hive/thriftserver/Shim13.scala | 6 +-
.../org/apache/spark/sql/hive/HiveContext.scala | 9 +-
.../apache/spark/sql/hive/HiveStrategies.scala | 17 +-
.../org/apache/spark/sql/hive/TestHive.scala | 9 +-
.../scala/org/apache/spark/sql/QueryTest.scala | 10 +-
.../spark/sql/hive/CachedTableSuite.scala | 4 +-
.../sql/hive/InsertIntoHiveTableSuite.scala | 2 +-
.../sql/hive/execution/HiveQuerySuite.scala | 7 +-
.../sql/hive/execution/HiveTableScanSuite.scala | 11 +-
.../spark/sql/hive/execution/HiveUdfSuite.scala | 2 +-
82 files changed, 3444 insertions(+), 1572 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java
index 247d2a5..0fbee6e 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java
@@ -33,7 +33,7 @@ import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.tuning.CrossValidator;
import org.apache.spark.ml.tuning.CrossValidatorModel;
import org.apache.spark.ml.tuning.ParamGridBuilder;
-import org.apache.spark.sql.SchemaRDD;
+import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.Row;
@@ -71,7 +71,7 @@ public class JavaCrossValidatorExample {
new LabeledDocument(9L, "a e c l", 0.0),
new LabeledDocument(10L, "spark compile", 1.0),
new LabeledDocument(11L, "hadoop software", 0.0));
- SchemaRDD training = jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class);
+ DataFrame training = jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class);
// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
Tokenizer tokenizer = new Tokenizer()
@@ -112,11 +112,11 @@ public class JavaCrossValidatorExample {
new Document(5L, "l m n"),
new Document(6L, "mapreduce spark"),
new Document(7L, "apache hadoop"));
- SchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), Document.class);
+ DataFrame test = jsql.applySchema(jsc.parallelize(localTest), Document.class);
// Make predictions on test documents. cvModel uses the best model found (lrModel).
- cvModel.transform(test).registerAsTable("prediction");
- SchemaRDD predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction");
+ cvModel.transform(test).registerTempTable("prediction");
+ DataFrame predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction");
for (Row r: predictions.collect()) {
System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> score=" + r.get(2)
+ ", prediction=" + r.get(3));
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
index 5b92655..eaaa344 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
@@ -28,7 +28,7 @@ import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.sql.SchemaRDD;
+import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.Row;
@@ -48,13 +48,13 @@ public class JavaSimpleParamsExample {
// Prepare training data.
// We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans
- // into SchemaRDDs, where it uses the bean metadata to infer the schema.
+ // into DataFrames, where it uses the bean metadata to infer the schema.
List<LabeledPoint> localTraining = Lists.newArrayList(
new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)));
- SchemaRDD training = jsql.applySchema(jsc.parallelize(localTraining), LabeledPoint.class);
+ DataFrame training = jsql.applySchema(jsc.parallelize(localTraining), LabeledPoint.class);
// Create a LogisticRegression instance. This instance is an Estimator.
LogisticRegression lr = new LogisticRegression();
@@ -94,14 +94,14 @@ public class JavaSimpleParamsExample {
new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)));
- SchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), LabeledPoint.class);
+ DataFrame test = jsql.applySchema(jsc.parallelize(localTest), LabeledPoint.class);
// Make predictions on test documents using the Transformer.transform() method.
// LogisticRegression.transform will only use the 'features' column.
// Note that model2.transform() outputs a 'probability' column instead of the usual 'score'
// column since we renamed the lr.scoreCol parameter previously.
- model2.transform(test).registerAsTable("results");
- SchemaRDD results =
+ model2.transform(test).registerTempTable("results");
+ DataFrame results =
jsql.sql("SELECT features, label, probability, prediction FROM results");
for (Row r: results.collect()) {
System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
index 74db449..82d665a 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
@@ -29,7 +29,7 @@ import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.Tokenizer;
-import org.apache.spark.sql.SchemaRDD;
+import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.Row;
@@ -54,7 +54,7 @@ public class JavaSimpleTextClassificationPipeline {
new LabeledDocument(1L, "b d", 0.0),
new LabeledDocument(2L, "spark f g h", 1.0),
new LabeledDocument(3L, "hadoop mapreduce", 0.0));
- SchemaRDD training = jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class);
+ DataFrame training = jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class);
// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
Tokenizer tokenizer = new Tokenizer()
@@ -79,11 +79,11 @@ public class JavaSimpleTextClassificationPipeline {
new Document(5L, "l m n"),
new Document(6L, "mapreduce spark"),
new Document(7L, "apache hadoop"));
- SchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), Document.class);
+ DataFrame test = jsql.applySchema(jsc.parallelize(localTest), Document.class);
// Make predictions on test documents.
- model.transform(test).registerAsTable("prediction");
- SchemaRDD predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction");
+ model.transform(test).registerTempTable("prediction");
+ DataFrame predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction");
for (Row r: predictions.collect()) {
System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> score=" + r.get(2)
+ ", prediction=" + r.get(3));
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
index b708046..8defb76 100644
--- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
+++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
@@ -26,9 +26,9 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
-import org.apache.spark.sql.SQLContext;
-import org.apache.spark.sql.SchemaRDD;
+import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SQLContext;
public class JavaSparkSQL {
public static class Person implements Serializable {
@@ -74,13 +74,13 @@ public class JavaSparkSQL {
});
// Apply a schema to an RDD of Java Beans and register it as a table.
- SchemaRDD schemaPeople = sqlCtx.applySchema(people, Person.class);
+ DataFrame schemaPeople = sqlCtx.applySchema(people, Person.class);
schemaPeople.registerTempTable("people");
// SQL can be run over RDDs that have been registered as tables.
- SchemaRDD teenagers = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
+ DataFrame teenagers = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
- // The results of SQL queries are SchemaRDDs and support all the normal RDD operations.
+ // The results of SQL queries are DataFrames and support all the normal RDD operations.
// The columns of a row in the result can be accessed by ordinal.
List<String> teenagerNames = teenagers.toJavaRDD().map(new Function<Row, String>() {
@Override
@@ -93,17 +93,17 @@ public class JavaSparkSQL {
}
System.out.println("=== Data source: Parquet File ===");
- // JavaSchemaRDDs can be saved as parquet files, maintaining the schema information.
+ // DataFrames can be saved as parquet files, maintaining the schema information.
schemaPeople.saveAsParquetFile("people.parquet");
// Read in the parquet file created above.
// Parquet files are self-describing so the schema is preserved.
- // The result of loading a parquet file is also a JavaSchemaRDD.
- SchemaRDD parquetFile = sqlCtx.parquetFile("people.parquet");
+ // The result of loading a parquet file is also a DataFrame.
+ DataFrame parquetFile = sqlCtx.parquetFile("people.parquet");
// Parquet files can also be registered as tables and then used in SQL statements.
parquetFile.registerTempTable("parquetFile");
- SchemaRDD teenagers2 =
+ DataFrame teenagers2 =
sqlCtx.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
teenagerNames = teenagers2.toJavaRDD().map(new Function<Row, String>() {
@Override
@@ -119,8 +119,8 @@ public class JavaSparkSQL {
// A JSON dataset is pointed by path.
// The path can be either a single text file or a directory storing text files.
String path = "examples/src/main/resources/people.json";
- // Create a JavaSchemaRDD from the file(s) pointed by path
- SchemaRDD peopleFromJsonFile = sqlCtx.jsonFile(path);
+ // Create a DataFrame from the file(s) pointed by path
+ DataFrame peopleFromJsonFile = sqlCtx.jsonFile(path);
// Because the schema of a JSON dataset is automatically inferred, to write queries,
// it is better to take a look at what is the schema.
@@ -130,13 +130,13 @@ public class JavaSparkSQL {
// |-- age: IntegerType
// |-- name: StringType
- // Register this JavaSchemaRDD as a table.
+ // Register this DataFrame as a table.
peopleFromJsonFile.registerTempTable("people");
// SQL statements can be run by using the sql methods provided by sqlCtx.
- SchemaRDD teenagers3 = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
+ DataFrame teenagers3 = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
- // The results of SQL queries are JavaSchemaRDDs and support all the normal RDD operations.
+ // The results of SQL queries are DataFrames and support all the normal RDD operations.
// The columns of a row in the result can be accessed by ordinal.
teenagerNames = teenagers3.toJavaRDD().map(new Function<Row, String>() {
@Override
@@ -146,14 +146,14 @@ public class JavaSparkSQL {
System.out.println(name);
}
- // Alternatively, a JavaSchemaRDD can be created for a JSON dataset represented by
+ // Alternatively, a DataFrame can be created for a JSON dataset represented by
// a RDD[String] storing one JSON object per string.
List<String> jsonData = Arrays.asList(
"{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
- SchemaRDD peopleFromJsonRDD = sqlCtx.jsonRDD(anotherPeopleRDD.rdd());
+ DataFrame peopleFromJsonRDD = sqlCtx.jsonRDD(anotherPeopleRDD.rdd());
- // Take a look at the schema of this new JavaSchemaRDD.
+ // Take a look at the schema of this new DataFrame.
peopleFromJsonRDD.printSchema();
// The schema of anotherPeople is ...
// root
@@ -164,7 +164,7 @@ public class JavaSparkSQL {
peopleFromJsonRDD.registerTempTable("people2");
- SchemaRDD peopleWithCity = sqlCtx.sql("SELECT name, address.city FROM people2");
+ DataFrame peopleWithCity = sqlCtx.sql("SELECT name, address.city FROM people2");
List<String> nameAndCity = peopleWithCity.toJavaRDD().map(new Function<Row, String>() {
@Override
public String call(Row row) {
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/examples/src/main/python/mllib/dataset_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/mllib/dataset_example.py b/examples/src/main/python/mllib/dataset_example.py
index 540dae7..b5a70db 100644
--- a/examples/src/main/python/mllib/dataset_example.py
+++ b/examples/src/main/python/mllib/dataset_example.py
@@ -16,7 +16,7 @@
#
"""
-An example of how to use SchemaRDD as a dataset for ML. Run with::
+An example of how to use DataFrame as a dataset for ML. Run with::
bin/spark-submit examples/src/main/python/mllib/dataset_example.py
"""
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/examples/src/main/python/sql.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/sql.py b/examples/src/main/python/sql.py
index d2c5ca4..7f5c68e 100644
--- a/examples/src/main/python/sql.py
+++ b/examples/src/main/python/sql.py
@@ -30,18 +30,18 @@ if __name__ == "__main__":
some_rdd = sc.parallelize([Row(name="John", age=19),
Row(name="Smith", age=23),
Row(name="Sarah", age=18)])
- # Infer schema from the first row, create a SchemaRDD and print the schema
- some_schemardd = sqlContext.inferSchema(some_rdd)
- some_schemardd.printSchema()
+ # Infer schema from the first row, create a DataFrame and print the schema
+ some_df = sqlContext.inferSchema(some_rdd)
+ some_df.printSchema()
# Another RDD is created from a list of tuples
another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)])
# Schema with two fields - person_name and person_age
schema = StructType([StructField("person_name", StringType(), False),
StructField("person_age", IntegerType(), False)])
- # Create a SchemaRDD by applying the schema to the RDD and print the schema
- another_schemardd = sqlContext.applySchema(another_rdd, schema)
- another_schemardd.printSchema()
+ # Create a DataFrame by applying the schema to the RDD and print the schema
+ another_df = sqlContext.applySchema(another_rdd, schema)
+ another_df.printSchema()
# root
# |-- age: integer (nullable = true)
# |-- name: string (nullable = true)
@@ -49,7 +49,7 @@ if __name__ == "__main__":
# A JSON dataset is pointed to by path.
# The path can be either a single text file or a directory storing text files.
path = os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json")
- # Create a SchemaRDD from the file(s) pointed to by path
+ # Create a DataFrame from the file(s) pointed to by path
people = sqlContext.jsonFile(path)
# root
# |-- person_name: string (nullable = false)
@@ -61,7 +61,7 @@ if __name__ == "__main__":
# |-- age: IntegerType
# |-- name: StringType
- # Register this SchemaRDD as a table.
+ # Register this DataFrame as a table.
people.registerAsTable("people")
# SQL statements can be run by using the sql methods provided by sqlContext
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala
index d8c7ef3..283bb80 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala
@@ -18,7 +18,6 @@
package org.apache.spark.examples.ml
import org.apache.spark.{SparkConf, SparkContext}
-import org.apache.spark.SparkContext._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
@@ -101,7 +100,7 @@ object CrossValidatorExample {
// Make predictions on test documents. cvModel uses the best model found (lrModel).
cvModel.transform(test)
- .select('id, 'text, 'score, 'prediction)
+ .select("id", "text", "score", "prediction")
.collect()
.foreach { case Row(id: Long, text: String, score: Double, prediction: Double) =>
println("(" + id + ", " + text + ") --> score=" + score + ", prediction=" + prediction)
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala
index cf62772..b788582 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala
@@ -143,7 +143,7 @@ object MovieLensALS {
// Evaluate the model.
// TODO: Create an evaluator to compute RMSE.
- val mse = predictions.select('rating, 'prediction)
+ val mse = predictions.select("rating", "prediction").rdd
.flatMap { case Row(rating: Float, prediction: Float) =>
val err = rating.toDouble - prediction
val err2 = err * err
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
index e8a2adf..95cc980 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
@@ -18,7 +18,6 @@
package org.apache.spark.examples.ml
import org.apache.spark.{SparkConf, SparkContext}
-import org.apache.spark.SparkContext._
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.mllib.linalg.{Vector, Vectors}
@@ -42,7 +41,7 @@ object SimpleParamsExample {
// Prepare training data.
// We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of Java Beans
- // into SchemaRDDs, where it uses the bean metadata to infer the schema.
+ // into DataFrames, where it uses the bean metadata to infer the schema.
val training = sparkContext.parallelize(Seq(
LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
@@ -92,7 +91,7 @@ object SimpleParamsExample {
// Note that model2.transform() outputs a 'probability' column instead of the usual 'score'
// column since we renamed the lr.scoreCol parameter previously.
model2.transform(test)
- .select('features, 'label, 'probability, 'prediction)
+ .select("features", "label", "probability", "prediction")
.collect()
.foreach { case Row(features: Vector, label: Double, prob: Double, prediction: Double) =>
println("(" + features + ", " + label + ") -> prob=" + prob + ", prediction=" + prediction)
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
index b9a6ef0..065db62 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
@@ -20,7 +20,6 @@ package org.apache.spark.examples.ml
import scala.beans.BeanInfo
import org.apache.spark.{SparkConf, SparkContext}
-import org.apache.spark.SparkContext._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
@@ -80,7 +79,7 @@ object SimpleTextClassificationPipeline {
// Make predictions on test documents.
model.transform(test)
- .select('id, 'text, 'score, 'prediction)
+ .select("id", "text", "score", "prediction")
.collect()
.foreach { case Row(id: Long, text: String, score: Double, prediction: Double) =>
println("(" + id + ", " + text + ") --> score=" + score + ", prediction=" + prediction)
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala
index f8d83f4..f229a58 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala
@@ -28,10 +28,10 @@ import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Row, SQLContext, SchemaRDD}
+import org.apache.spark.sql.{Row, SQLContext, DataFrame}
/**
- * An example of how to use [[org.apache.spark.sql.SchemaRDD]] as a Dataset for ML. Run with
+ * An example of how to use [[org.apache.spark.sql.DataFrame]] as a Dataset for ML. Run with
* {{{
* ./bin/run-example org.apache.spark.examples.mllib.DatasetExample [options]
* }}}
@@ -47,7 +47,7 @@ object DatasetExample {
val defaultParams = Params()
val parser = new OptionParser[Params]("DatasetExample") {
- head("Dataset: an example app using SchemaRDD as a Dataset for ML.")
+ head("Dataset: an example app using DataFrame as a Dataset for ML.")
opt[String]("input")
.text(s"input path to dataset")
.action((x, c) => c.copy(input = x))
@@ -80,20 +80,20 @@ object DatasetExample {
}
println(s"Loaded ${origData.count()} instances from file: ${params.input}")
- // Convert input data to SchemaRDD explicitly.
- val schemaRDD: SchemaRDD = origData
- println(s"Inferred schema:\n${schemaRDD.schema.prettyJson}")
- println(s"Converted to SchemaRDD with ${schemaRDD.count()} records")
+ // Convert input data to DataFrame explicitly.
+ val df: DataFrame = origData.toDF
+ println(s"Inferred schema:\n${df.schema.prettyJson}")
+ println(s"Converted to DataFrame with ${df.count()} records")
- // Select columns, using implicit conversion to SchemaRDD.
- val labelsSchemaRDD: SchemaRDD = origData.select('label)
- val labels: RDD[Double] = labelsSchemaRDD.map { case Row(v: Double) => v }
+ // Select columns, using implicit conversion to DataFrames.
+ val labelsDf: DataFrame = origData.select("label")
+ val labels: RDD[Double] = labelsDf.map { case Row(v: Double) => v }
val numLabels = labels.count()
val meanLabel = labels.fold(0.0)(_ + _) / numLabels
println(s"Selected label column with average value $meanLabel")
- val featuresSchemaRDD: SchemaRDD = origData.select('features)
- val features: RDD[Vector] = featuresSchemaRDD.map { case Row(v: Vector) => v }
+ val featuresDf: DataFrame = origData.select("features")
+ val features: RDD[Vector] = featuresDf.map { case Row(v: Vector) => v }
val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
(summary, feat) => summary.add(feat),
(sum1, sum2) => sum1.merge(sum2))
@@ -103,13 +103,13 @@ object DatasetExample {
tmpDir.deleteOnExit()
val outputDir = new File(tmpDir, "dataset").toString
println(s"Saving to $outputDir as Parquet file.")
- schemaRDD.saveAsParquetFile(outputDir)
+ df.saveAsParquetFile(outputDir)
println(s"Loading Parquet file with UDT from $outputDir.")
val newDataset = sqlContext.parquetFile(outputDir)
println(s"Schema from Parquet: ${newDataset.schema.prettyJson}")
- val newFeatures = newDataset.select('features).map { case Row(v: Vector) => v }
+ val newFeatures = newDataset.select("features").map { case Row(v: Vector) => v }
val newFeaturesSummary = newFeatures.aggregate(new MultivariateOnlineSummarizer())(
(summary, feat) => summary.add(feat),
(sum1, sum2) => sum1.merge(sum2))
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala
index 2e98b2d..a5d7f26 100644
--- a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala
@@ -19,6 +19,8 @@ package org.apache.spark.examples.sql
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.dsl._
+import org.apache.spark.sql.dsl.literals._
// One method for defining the schema of an RDD is to make a case class with the desired column
// names and types.
@@ -54,7 +56,7 @@ object RDDRelation {
rddFromSql.map(row => s"Key: ${row(0)}, Value: ${row(1)}").collect().foreach(println)
// Queries can also be written using a LINQ-like Scala DSL.
- rdd.where('key === 1).orderBy('value.asc).select('key).collect().foreach(println)
+ rdd.where($"key" === 1).orderBy($"value".asc).select($"key").collect().foreach(println)
// Write out an RDD as a parquet file.
rdd.saveAsParquetFile("pair.parquet")
@@ -63,7 +65,7 @@ object RDDRelation {
val parquetFile = sqlContext.parquetFile("pair.parquet")
// Queries can be run using the DSL on parquet files just like the original RDD.
- parquetFile.where('key === 1).select('value as 'a).collect().foreach(println)
+ parquetFile.where($"key" === 1).select($"value".as("a")).collect().foreach(println)
// These files can also be registered as tables.
parquetFile.registerTempTable("parquetFile")
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala b/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
index 77d230e..bc3defe 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
@@ -21,7 +21,7 @@ import scala.annotation.varargs
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.param.{ParamMap, ParamPair, Params}
-import org.apache.spark.sql.SchemaRDD
+import org.apache.spark.sql.DataFrame
/**
* :: AlphaComponent ::
@@ -38,7 +38,7 @@ abstract class Estimator[M <: Model[M]] extends PipelineStage with Params {
* @return fitted model
*/
@varargs
- def fit(dataset: SchemaRDD, paramPairs: ParamPair[_]*): M = {
+ def fit(dataset: DataFrame, paramPairs: ParamPair[_]*): M = {
val map = new ParamMap().put(paramPairs: _*)
fit(dataset, map)
}
@@ -50,7 +50,7 @@ abstract class Estimator[M <: Model[M]] extends PipelineStage with Params {
* @param paramMap parameter map
* @return fitted model
*/
- def fit(dataset: SchemaRDD, paramMap: ParamMap): M
+ def fit(dataset: DataFrame, paramMap: ParamMap): M
/**
* Fits multiple models to the input data with multiple sets of parameters.
@@ -61,7 +61,7 @@ abstract class Estimator[M <: Model[M]] extends PipelineStage with Params {
* @param paramMaps an array of parameter maps
* @return fitted models, matching the input parameter maps
*/
- def fit(dataset: SchemaRDD, paramMaps: Array[ParamMap]): Seq[M] = {
+ def fit(dataset: DataFrame, paramMaps: Array[ParamMap]): Seq[M] = {
paramMaps.map(fit(dataset, _))
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala
index db563dd..d2ca2e6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala
@@ -19,7 +19,7 @@ package org.apache.spark.ml
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.sql.SchemaRDD
+import org.apache.spark.sql.DataFrame
/**
* :: AlphaComponent ::
@@ -35,5 +35,5 @@ abstract class Evaluator extends Identifiable {
* @param paramMap parameter map that specifies the input columns and output metrics
* @return metric
*/
- def evaluate(dataset: SchemaRDD, paramMap: ParamMap): Double
+ def evaluate(dataset: DataFrame, paramMap: ParamMap): Double
}
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
index ad6fed1..fe39cd1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -22,7 +22,7 @@ import scala.collection.mutable.ListBuffer
import org.apache.spark.Logging
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.param.{Param, ParamMap}
-import org.apache.spark.sql.SchemaRDD
+import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType
/**
@@ -88,7 +88,7 @@ class Pipeline extends Estimator[PipelineModel] {
* @param paramMap parameter map
* @return fitted pipeline
*/
- override def fit(dataset: SchemaRDD, paramMap: ParamMap): PipelineModel = {
+ override def fit(dataset: DataFrame, paramMap: ParamMap): PipelineModel = {
transformSchema(dataset.schema, paramMap, logging = true)
val map = this.paramMap ++ paramMap
val theStages = map(stages)
@@ -162,7 +162,7 @@ class PipelineModel private[ml] (
}
}
- override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = {
+ override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = {
// Precedence of ParamMaps: paramMap > this.paramMap > fittingParamMap
val map = (fittingParamMap ++ this.paramMap) ++ paramMap
transformSchema(dataset.schema, map, logging = true)
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
index af56f9c..b233bff 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
@@ -22,9 +22,9 @@ import scala.annotation.varargs
import org.apache.spark.Logging
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.param._
-import org.apache.spark.sql.SchemaRDD
-import org.apache.spark.sql.catalyst.analysis.Star
-import org.apache.spark.sql.catalyst.expressions.ScalaUdf
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql._
+import org.apache.spark.sql.dsl._
import org.apache.spark.sql.types._
/**
@@ -41,7 +41,7 @@ abstract class Transformer extends PipelineStage with Params {
* @return transformed dataset
*/
@varargs
- def transform(dataset: SchemaRDD, paramPairs: ParamPair[_]*): SchemaRDD = {
+ def transform(dataset: DataFrame, paramPairs: ParamPair[_]*): DataFrame = {
val map = new ParamMap()
paramPairs.foreach(map.put(_))
transform(dataset, map)
@@ -53,7 +53,7 @@ abstract class Transformer extends PipelineStage with Params {
* @param paramMap additional parameters, overwrite embedded params
* @return transformed dataset
*/
- def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD
+ def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame
}
/**
@@ -95,11 +95,10 @@ private[ml] abstract class UnaryTransformer[IN, OUT, T <: UnaryTransformer[IN, O
StructType(outputFields)
}
- override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = {
+ override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = {
transformSchema(dataset.schema, paramMap, logging = true)
- import dataset.sqlContext._
val map = this.paramMap ++ paramMap
- val udf = ScalaUdf(this.createTransformFunc(map), outputDataType, Seq(map(inputCol).attr))
- dataset.select(Star(None), udf as map(outputCol))
+ dataset.select($"*", callUDF(
+ this.createTransformFunc(map), outputDataType, Column(map(inputCol))).as(map(outputCol)))
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 8c57081..eeb6301 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -24,7 +24,7 @@ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.linalg.{BLAS, Vector, VectorUDT}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql._
-import org.apache.spark.sql.catalyst.analysis.Star
+import org.apache.spark.sql.dsl._
import org.apache.spark.sql.catalyst.dsl._
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.storage.StorageLevel
@@ -87,11 +87,10 @@ class LogisticRegression extends Estimator[LogisticRegressionModel] with Logisti
def setScoreCol(value: String): this.type = set(scoreCol, value)
def setPredictionCol(value: String): this.type = set(predictionCol, value)
- override def fit(dataset: SchemaRDD, paramMap: ParamMap): LogisticRegressionModel = {
+ override def fit(dataset: DataFrame, paramMap: ParamMap): LogisticRegressionModel = {
transformSchema(dataset.schema, paramMap, logging = true)
- import dataset.sqlContext._
val map = this.paramMap ++ paramMap
- val instances = dataset.select(map(labelCol).attr, map(featuresCol).attr)
+ val instances = dataset.select(map(labelCol), map(featuresCol))
.map { case Row(label: Double, features: Vector) =>
LabeledPoint(label, features)
}.persist(StorageLevel.MEMORY_AND_DISK)
@@ -131,9 +130,8 @@ class LogisticRegressionModel private[ml] (
validateAndTransformSchema(schema, paramMap, fitting = false)
}
- override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = {
+ override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = {
transformSchema(dataset.schema, paramMap, logging = true)
- import dataset.sqlContext._
val map = this.paramMap ++ paramMap
val score: Vector => Double = (v) => {
val margin = BLAS.dot(v, weights)
@@ -143,7 +141,7 @@ class LogisticRegressionModel private[ml] (
val predict: Double => Double = (score) => {
if (score > t) 1.0 else 0.0
}
- dataset.select(Star(None), score.call(map(featuresCol).attr) as map(scoreCol))
- .select(Star(None), predict.call(map(scoreCol).attr) as map(predictionCol))
+ dataset.select($"*", callUDF(score, Column(map(featuresCol))).as(map(scoreCol)))
+ .select($"*", callUDF(predict, Column(map(scoreCol))).as(map(predictionCol)))
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
index 12473cb..1979ab9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
@@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml._
import org.apache.spark.ml.param._
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
-import org.apache.spark.sql.{Row, SchemaRDD}
+import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType
/**
@@ -41,7 +41,7 @@ class BinaryClassificationEvaluator extends Evaluator with Params
def setScoreCol(value: String): this.type = set(scoreCol, value)
def setLabelCol(value: String): this.type = set(labelCol, value)
- override def evaluate(dataset: SchemaRDD, paramMap: ParamMap): Double = {
+ override def evaluate(dataset: DataFrame, paramMap: ParamMap): Double = {
val map = this.paramMap ++ paramMap
val schema = dataset.schema
@@ -52,8 +52,7 @@ class BinaryClassificationEvaluator extends Evaluator with Params
require(labelType == DoubleType,
s"Label column ${map(labelCol)} must be double type but found $labelType")
- import dataset.sqlContext._
- val scoreAndLabels = dataset.select(map(scoreCol).attr, map(labelCol).attr)
+ val scoreAndLabels = dataset.select(map(scoreCol), map(labelCol))
.map { case Row(score: Double, label: Double) =>
(score, label)
}
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
index 72825f6..e7bdb07 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
@@ -23,7 +23,7 @@ import org.apache.spark.ml.param._
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql._
-import org.apache.spark.sql.catalyst.analysis.Star
+import org.apache.spark.sql.dsl._
import org.apache.spark.sql.catalyst.dsl._
import org.apache.spark.sql.types.{StructField, StructType}
@@ -43,14 +43,10 @@ class StandardScaler extends Estimator[StandardScalerModel] with StandardScalerP
def setInputCol(value: String): this.type = set(inputCol, value)
def setOutputCol(value: String): this.type = set(outputCol, value)
- override def fit(dataset: SchemaRDD, paramMap: ParamMap): StandardScalerModel = {
+ override def fit(dataset: DataFrame, paramMap: ParamMap): StandardScalerModel = {
transformSchema(dataset.schema, paramMap, logging = true)
- import dataset.sqlContext._
val map = this.paramMap ++ paramMap
- val input = dataset.select(map(inputCol).attr)
- .map { case Row(v: Vector) =>
- v
- }
+ val input = dataset.select(map(inputCol)).map { case Row(v: Vector) => v }
val scaler = new feature.StandardScaler().fit(input)
val model = new StandardScalerModel(this, map, scaler)
Params.inheritValues(map, this, model)
@@ -83,14 +79,13 @@ class StandardScalerModel private[ml] (
def setInputCol(value: String): this.type = set(inputCol, value)
def setOutputCol(value: String): this.type = set(outputCol, value)
- override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = {
+ override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = {
transformSchema(dataset.schema, paramMap, logging = true)
- import dataset.sqlContext._
val map = this.paramMap ++ paramMap
val scale: (Vector) => Vector = (v) => {
scaler.transform(v)
}
- dataset.select(Star(None), scale.call(map(inputCol).attr) as map(outputCol))
+ dataset.select($"*", callUDF(scale, Column(map(inputCol))).as(map(outputCol)))
}
private[ml] override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = {
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
index 2d89e76..f6437c7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -29,10 +29,8 @@ import org.apache.spark.{HashPartitioner, Logging, Partitioner}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.param._
import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.SchemaRDD
-import org.apache.spark.sql.catalyst.dsl._
-import org.apache.spark.sql.catalyst.expressions.Cast
-import org.apache.spark.sql.catalyst.plans.LeftOuter
+import org.apache.spark.sql.{Column, DataFrame}
+import org.apache.spark.sql.dsl._
import org.apache.spark.sql.types.{DoubleType, FloatType, IntegerType, StructField, StructType}
import org.apache.spark.util.Utils
import org.apache.spark.util.collection.{OpenHashMap, OpenHashSet, SortDataFormat, Sorter}
@@ -112,7 +110,7 @@ class ALSModel private[ml] (
def setPredictionCol(value: String): this.type = set(predictionCol, value)
- override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = {
+ override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = {
import dataset.sqlContext._
import org.apache.spark.ml.recommendation.ALSModel.Factor
val map = this.paramMap ++ paramMap
@@ -120,13 +118,13 @@ class ALSModel private[ml] (
val instanceTable = s"instance_$uid"
val userTable = s"user_$uid"
val itemTable = s"item_$uid"
- val instances = dataset.as(Symbol(instanceTable))
+ val instances = dataset.as(instanceTable)
val users = userFactors.map { case (id, features) =>
Factor(id, features)
- }.as(Symbol(userTable))
+ }.as(userTable)
val items = itemFactors.map { case (id, features) =>
Factor(id, features)
- }.as(Symbol(itemTable))
+ }.as(itemTable)
val predict: (Seq[Float], Seq[Float]) => Float = (userFeatures, itemFeatures) => {
if (userFeatures != null && itemFeatures != null) {
blas.sdot(k, userFeatures.toArray, 1, itemFeatures.toArray, 1)
@@ -135,12 +133,12 @@ class ALSModel private[ml] (
}
}
val inputColumns = dataset.schema.fieldNames
- val prediction =
- predict.call(s"$userTable.features".attr, s"$itemTable.features".attr) as map(predictionCol)
- val outputColumns = inputColumns.map(f => s"$instanceTable.$f".attr as f) :+ prediction
+ val prediction = callUDF(predict, $"$userTable.features", $"$itemTable.features")
+ .as(map(predictionCol))
+ val outputColumns = inputColumns.map(f => $"$instanceTable.$f".as(f)) :+ prediction
instances
- .join(users, LeftOuter, Some(map(userCol).attr === s"$userTable.id".attr))
- .join(items, LeftOuter, Some(map(itemCol).attr === s"$itemTable.id".attr))
+ .join(users, Column(map(userCol)) === $"$userTable.id", "left")
+ .join(items, Column(map(itemCol)) === $"$itemTable.id", "left")
.select(outputColumns: _*)
}
@@ -209,14 +207,13 @@ class ALS extends Estimator[ALSModel] with ALSParams {
setMaxIter(20)
setRegParam(1.0)
- override def fit(dataset: SchemaRDD, paramMap: ParamMap): ALSModel = {
- import dataset.sqlContext._
+ override def fit(dataset: DataFrame, paramMap: ParamMap): ALSModel = {
val map = this.paramMap ++ paramMap
- val ratings =
- dataset.select(map(userCol).attr, map(itemCol).attr, Cast(map(ratingCol).attr, FloatType))
- .map { row =>
- new Rating(row.getInt(0), row.getInt(1), row.getFloat(2))
- }
+ val ratings = dataset
+ .select(Column(map(userCol)), Column(map(itemCol)), Column(map(ratingCol)).cast(FloatType))
+ .map { row =>
+ new Rating(row.getInt(0), row.getInt(1), row.getFloat(2))
+ }
val (userFactors, itemFactors) = ALS.train(ratings, rank = map(rank),
numUserBlocks = map(numUserBlocks), numItemBlocks = map(numItemBlocks),
maxIter = map(maxIter), regParam = map(regParam), implicitPrefs = map(implicitPrefs),
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
index 08fe991..5d51c51 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -24,7 +24,7 @@ import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml._
import org.apache.spark.ml.param.{IntParam, Param, ParamMap, Params}
import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.sql.SchemaRDD
+import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType
/**
@@ -64,7 +64,7 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP
def setEvaluator(value: Evaluator): this.type = set(evaluator, value)
def setNumFolds(value: Int): this.type = set(numFolds, value)
- override def fit(dataset: SchemaRDD, paramMap: ParamMap): CrossValidatorModel = {
+ override def fit(dataset: DataFrame, paramMap: ParamMap): CrossValidatorModel = {
val map = this.paramMap ++ paramMap
val schema = dataset.schema
transformSchema(dataset.schema, paramMap, logging = true)
@@ -74,7 +74,7 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP
val epm = map(estimatorParamMaps)
val numModels = epm.size
val metrics = new Array[Double](epm.size)
- val splits = MLUtils.kFold(dataset, map(numFolds), 0)
+ val splits = MLUtils.kFold(dataset.rdd, map(numFolds), 0)
splits.zipWithIndex.foreach { case ((training, validation), splitIndex) =>
val trainingDataset = sqlCtx.applySchema(training, schema).cache()
val validationDataset = sqlCtx.applySchema(validation, schema).cache()
@@ -117,7 +117,7 @@ class CrossValidatorModel private[ml] (
val bestModel: Model[_])
extends Model[CrossValidatorModel] with CrossValidatorParams {
- override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = {
+ override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = {
bestModel.transform(dataset, paramMap)
}
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/test/java/org/apache/spark/ml/JavaPipelineSuite.java
----------------------------------------------------------------------
diff --git a/mllib/src/test/java/org/apache/spark/ml/JavaPipelineSuite.java b/mllib/src/test/java/org/apache/spark/ml/JavaPipelineSuite.java
index 47f1f46..56a9dbd 100644
--- a/mllib/src/test/java/org/apache/spark/ml/JavaPipelineSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/JavaPipelineSuite.java
@@ -26,7 +26,7 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.feature.StandardScaler;
-import org.apache.spark.sql.SchemaRDD;
+import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import static org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInputAsList;
@@ -37,7 +37,7 @@ public class JavaPipelineSuite {
private transient JavaSparkContext jsc;
private transient SQLContext jsql;
- private transient SchemaRDD dataset;
+ private transient DataFrame dataset;
@Before
public void setUp() {
@@ -65,7 +65,7 @@ public class JavaPipelineSuite {
.setStages(new PipelineStage[] {scaler, lr});
PipelineModel model = pipeline.fit(dataset);
model.transform(dataset).registerTempTable("prediction");
- SchemaRDD predictions = jsql.sql("SELECT label, score, prediction FROM prediction");
+ DataFrame predictions = jsql.sql("SELECT label, score, prediction FROM prediction");
predictions.collectAsList();
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java
----------------------------------------------------------------------
diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java
index 2eba833..f4ba23c 100644
--- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java
@@ -26,7 +26,7 @@ import org.junit.Test;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.sql.SchemaRDD;
+import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import static org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInputAsList;
@@ -34,7 +34,7 @@ public class JavaLogisticRegressionSuite implements Serializable {
private transient JavaSparkContext jsc;
private transient SQLContext jsql;
- private transient SchemaRDD dataset;
+ private transient DataFrame dataset;
@Before
public void setUp() {
@@ -55,7 +55,7 @@ public class JavaLogisticRegressionSuite implements Serializable {
LogisticRegression lr = new LogisticRegression();
LogisticRegressionModel model = lr.fit(dataset);
model.transform(dataset).registerTempTable("prediction");
- SchemaRDD predictions = jsql.sql("SELECT label, score, prediction FROM prediction");
+ DataFrame predictions = jsql.sql("SELECT label, score, prediction FROM prediction");
predictions.collectAsList();
}
@@ -67,7 +67,7 @@ public class JavaLogisticRegressionSuite implements Serializable {
LogisticRegressionModel model = lr.fit(dataset);
model.transform(dataset, model.threshold().w(0.8)) // overwrite threshold
.registerTempTable("prediction");
- SchemaRDD predictions = jsql.sql("SELECT label, score, prediction FROM prediction");
+ DataFrame predictions = jsql.sql("SELECT label, score, prediction FROM prediction");
predictions.collectAsList();
}
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/test/java/org/apache/spark/ml/tuning/JavaCrossValidatorSuite.java
----------------------------------------------------------------------
diff --git a/mllib/src/test/java/org/apache/spark/ml/tuning/JavaCrossValidatorSuite.java b/mllib/src/test/java/org/apache/spark/ml/tuning/JavaCrossValidatorSuite.java
index a9f1c4a..074b58c 100644
--- a/mllib/src/test/java/org/apache/spark/ml/tuning/JavaCrossValidatorSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/tuning/JavaCrossValidatorSuite.java
@@ -30,7 +30,7 @@ import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.sql.SchemaRDD;
+import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import static org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInputAsList;
@@ -38,7 +38,7 @@ public class JavaCrossValidatorSuite implements Serializable {
private transient JavaSparkContext jsc;
private transient SQLContext jsql;
- private transient SchemaRDD dataset;
+ private transient DataFrame dataset;
@Before
public void setUp() {
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala
index 4515084..2f175fb 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala
@@ -23,7 +23,7 @@ import org.scalatest.FunSuite
import org.scalatest.mock.MockitoSugar.mock
import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.sql.SchemaRDD
+import org.apache.spark.sql.DataFrame
class PipelineSuite extends FunSuite {
@@ -36,11 +36,11 @@ class PipelineSuite extends FunSuite {
val estimator2 = mock[Estimator[MyModel]]
val model2 = mock[MyModel]
val transformer3 = mock[Transformer]
- val dataset0 = mock[SchemaRDD]
- val dataset1 = mock[SchemaRDD]
- val dataset2 = mock[SchemaRDD]
- val dataset3 = mock[SchemaRDD]
- val dataset4 = mock[SchemaRDD]
+ val dataset0 = mock[DataFrame]
+ val dataset1 = mock[DataFrame]
+ val dataset2 = mock[DataFrame]
+ val dataset3 = mock[DataFrame]
+ val dataset4 = mock[DataFrame]
when(estimator0.fit(meq(dataset0), any[ParamMap]())).thenReturn(model0)
when(model0.transform(meq(dataset0), any[ParamMap]())).thenReturn(dataset1)
@@ -74,7 +74,7 @@ class PipelineSuite extends FunSuite {
val estimator = mock[Estimator[MyModel]]
val pipeline = new Pipeline()
.setStages(Array(estimator, estimator))
- val dataset = mock[SchemaRDD]
+ val dataset = mock[DataFrame]
intercept[IllegalArgumentException] {
pipeline.fit(dataset)
}
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index e8030fe..1912afc 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -21,12 +21,12 @@ import org.scalatest.FunSuite
import org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInput
import org.apache.spark.mllib.util.MLlibTestSparkContext
-import org.apache.spark.sql.{SQLContext, SchemaRDD}
+import org.apache.spark.sql.{SQLContext, DataFrame}
class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext {
@transient var sqlContext: SQLContext = _
- @transient var dataset: SchemaRDD = _
+ @transient var dataset: DataFrame = _
override def beforeAll(): Unit = {
super.beforeAll()
@@ -36,34 +36,28 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext {
}
test("logistic regression") {
- val sqlContext = this.sqlContext
- import sqlContext._
val lr = new LogisticRegression
val model = lr.fit(dataset)
model.transform(dataset)
- .select('label, 'prediction)
+ .select("label", "prediction")
.collect()
}
test("logistic regression with setters") {
- val sqlContext = this.sqlContext
- import sqlContext._
val lr = new LogisticRegression()
.setMaxIter(10)
.setRegParam(1.0)
val model = lr.fit(dataset)
model.transform(dataset, model.threshold -> 0.8) // overwrite threshold
- .select('label, 'score, 'prediction)
+ .select("label", "score", "prediction")
.collect()
}
test("logistic regression fit and transform with varargs") {
- val sqlContext = this.sqlContext
- import sqlContext._
val lr = new LogisticRegression
val model = lr.fit(dataset, lr.maxIter -> 10, lr.regParam -> 1.0)
model.transform(dataset, model.threshold -> 0.8, model.scoreCol -> "probability")
- .select('label, 'probability, 'prediction)
+ .select("label", "probability", "prediction")
.collect()
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
index cdd4db1..58289ac 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
@@ -350,7 +350,7 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
numItemBlocks: Int = 3,
targetRMSE: Double = 0.05): Unit = {
val sqlContext = this.sqlContext
- import sqlContext.{createSchemaRDD, symbolToUnresolvedAttribute}
+ import sqlContext.createSchemaRDD
val als = new ALS()
.setRank(rank)
.setRegParam(regParam)
@@ -360,7 +360,7 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
val alpha = als.getAlpha
val model = als.fit(training)
val predictions = model.transform(test)
- .select('rating, 'prediction)
+ .select("rating", "prediction")
.map { case Row(rating: Float, prediction: Float) =>
(rating.toDouble, prediction.toDouble)
}
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
index 41cc13d..74104fa 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -23,11 +23,11 @@ import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInput
import org.apache.spark.mllib.util.MLlibTestSparkContext
-import org.apache.spark.sql.{SQLContext, SchemaRDD}
+import org.apache.spark.sql.{SQLContext, DataFrame}
class CrossValidatorSuite extends FunSuite with MLlibTestSparkContext {
- @transient var dataset: SchemaRDD = _
+ @transient var dataset: DataFrame = _
override def beforeAll(): Unit = {
super.beforeAll()
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/project/MimaExcludes.scala
----------------------------------------------------------------------
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index af0b0eb..e750fed 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -95,7 +95,20 @@ object MimaExcludes {
) ++ Seq(
// SPARK-5166 Spark SQL API stabilization
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Transformer.transform"),
- ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Estimator.fit")
+ ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Estimator.fit"),
+ ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.ml.Transformer.transform"),
+ ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Pipeline.fit"),
+ ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.PipelineModel.transform"),
+ ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.ml.Estimator.fit"),
+ ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Evaluator.evaluate"),
+ ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.ml.Evaluator.evaluate"),
+ ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.tuning.CrossValidator.fit"),
+ ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.tuning.CrossValidatorModel.transform"),
+ ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.feature.StandardScaler.fit"),
+ ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.feature.StandardScalerModel.transform"),
+ ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.classification.LogisticRegressionModel.transform"),
+ ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.classification.LogisticRegression.fit"),
+ ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.evaluation.BinaryClassificationEvaluator.evaluate")
) ++ Seq(
// SPARK-5270
ProblemFilters.exclude[MissingMethodProblem](
http://git-wip-us.apache.org/repos/asf/spark/blob/119f45d6/python/pyspark/java_gateway.py
----------------------------------------------------------------------
diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py
index a975dc1..a0a0284 100644
--- a/python/pyspark/java_gateway.py
+++ b/python/pyspark/java_gateway.py
@@ -111,10 +111,9 @@ def launch_gateway():
java_import(gateway.jvm, "org.apache.spark.api.java.*")
java_import(gateway.jvm, "org.apache.spark.api.python.*")
java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
- java_import(gateway.jvm, "org.apache.spark.sql.SQLContext")
- java_import(gateway.jvm, "org.apache.spark.sql.hive.HiveContext")
- java_import(gateway.jvm, "org.apache.spark.sql.hive.LocalHiveContext")
- java_import(gateway.jvm, "org.apache.spark.sql.hive.TestHiveContext")
+ # TODO(davies): move into sql
+ java_import(gateway.jvm, "org.apache.spark.sql.*")
+ java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
java_import(gateway.jvm, "scala.Tuple2")
return gateway
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org