You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@phoenix.apache.org by ma...@apache.org on 2015/04/16 16:32:02 UTC
[21/50] [abbrv] phoenix git commit: PHOENIX-1071 Add phoenix-spark for Spark integration

PHOENIX-1071 Add phoenix-spark for Spark integration


Project: http://git-wip-us.apache.org/repos/asf/phoenix/repo
Commit: http://git-wip-us.apache.org/repos/asf/phoenix/commit/f2d9080d
Tree: http://git-wip-us.apache.org/repos/asf/phoenix/tree/f2d9080d
Diff: http://git-wip-us.apache.org/repos/asf/phoenix/diff/f2d9080d

Branch: refs/heads/calcite
Commit: f2d9080d25006d7737286919498fa1999c9bf78c
Parents: 742ca13
Author: ravimagham <ra...@apache.org>
Authored: Sun Apr 5 01:05:12 2015 -0700
Committer: ravimagham <ra...@apache.org>
Committed: Sun Apr 5 01:05:12 2015 -0700

----------------------------------------------------------------------
 NOTICE                                          |   5 +
 phoenix-spark/README.md                         |  89 ++++
 phoenix-spark/pom.xml                           | 523 +++++++++++++++++++
 .../org/apache/phoenix/spark/PhoenixRDD.scala   | 166 ++++++
 .../phoenix/spark/PhoenixRecordWritable.scala   |  89 ++++
 .../phoenix/spark/ProductRDDFunctions.scala     |  68 +++
 .../phoenix/spark/SparkContextFunctions.scala   |  41 ++
 .../spark/SparkSqlContextFunctions.scala        |  39 ++
 .../org/apache/phoenix/spark/package.scala      |  32 ++
 phoenix-spark/src/test/resources/log4j.xml      |  41 ++
 phoenix-spark/src/test/resources/setup.sql      |  18 +
 .../apache/phoenix/spark/PhoenixRDDTest.scala   | 333 ++++++++++++
 pom.xml                                         |   1 +
 13 files changed, 1445 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/phoenix/blob/f2d9080d/NOTICE
----------------------------------------------------------------------
diff --git a/NOTICE b/NOTICE
index 0836479..0bd2251 100644
--- a/NOTICE
+++ b/NOTICE
@@ -23,3 +23,8 @@ Copyright (c) 2003-2008, Terrence Parr.
 
 JUnit (http://www.junit.org/) included under the Common Public License v1.0.
 See the full text here: http://junit.sourceforge.net/cpl-v10.html
+
+The phoenix-spark module has been adapted from the phoenix-spark library
+distributed under the terms of the Apache 2 license. Original source copyright:
+Copyright 2014 Simply Measured, Inc.
+Copyright 2015 Interset Software Inc.

http://git-wip-us.apache.org/repos/asf/phoenix/blob/f2d9080d/phoenix-spark/README.md
----------------------------------------------------------------------
diff --git a/phoenix-spark/README.md b/phoenix-spark/README.md
new file mode 100644
index 0000000..1c030f8
--- /dev/null
+++ b/phoenix-spark/README.md
@@ -0,0 +1,89 @@
+phoenix-spark extends Phoenix's MapReduce support to allow Spark to load Phoenix tables as RDDs or
+DataFrames, and enables persisting RDDs of Tuples back to Phoenix.
+
+## Reading Phoenix Tables
+
+Given a Phoenix table with the following DDL
+
+```sql
+CREATE TABLE TABLE1 (ID BIGINT NOT NULL PRIMARY KEY, COL1 VARCHAR);
+UPSERT INTO TABLE1 (ID, COL1) VALUES (1, 'test_row_1');
+UPSERT INTO TABLE1 (ID, COL1) VALUES (2, 'test_row_2');
+```
+
+### Load as a DataFrame
+```scala
+import org.apache.spark.SparkContext
+import org.apache.spark.sql.SQLContext
+import org.apache.phoenix.spark._
+
+val sc = new SparkContext("local", "phoenix-test")
+val sqlContext = new SQLContext(sc)
+
+// Load the columns 'ID' and 'COL1' from TABLE1 as a DataFrame
+val df = sqlContext.phoenixTableAsDataFrame(
+  "TABLE1", Array("ID", "COL1"), zkUrl = Some("phoenix-server:2181")
+)
+
+df.show
+```
+
+### Load as an RDD
+```scala
+import org.apache.spark.SparkContext
+import org.apache.spark.sql.SQLContext
+import org.apache.phoenix.spark._
+
+val sc = new SparkContext("local", "phoenix-test")
+
+// Load the columns 'ID' and 'COL1' from TABLE1 as an RDD
+val rdd: RDD[Map[String, AnyRef]] = sc.phoenixTableAsRDD(
+  "TABLE1", Seq("ID", "COL1"), zkUrl = Some("phoenix-server:2181")
+)
+
+rdd.count()
+
+val firstId = rdd1.first()("ID").asInstanceOf[Long]
+val firstCol = rdd1.first()("COL1").asInstanceOf[String]
+```
+
+## Saving RDDs to Phoenix
+
+Given a Phoenix table with the following DDL
+
+```sql
+CREATE TABLE OUTPUT_TEST_TABLE (id BIGINT NOT NULL PRIMARY KEY, col1 VARCHAR, col2 INTEGER);
+```
+
+`saveToPhoenix` is an implicit method on RDD[Product], or an RDD of Tuples. The data types must
+correspond to the Java types Phoenix supports (http://phoenix.apache.org/language/datatypes.html)
+
+```scala
+import org.apache.spark.SparkContext
+import org.apache.phoenix.spark._
+
+val sc = new SparkContext("local", "phoenix-test")
+val dataSet = List((1L, "1", 1), (2L, "2", 2), (3L, "3", 3))
+
+sc
+  .parallelize(dataSet)
+  .saveToPhoenix(
+    "OUTPUT_TEST_TABLE",
+    Seq("ID","COL1","COL2"),
+    zkUrl = Some("phoenix-server:2181")
+  )
+```
+
+## Notes
+
+The functions `phoenixTableAsDataFrame`, `phoenixTableAsRDD` and `saveToPhoenix` all support
+optionally specifying a `conf` Hadoop configuration parameter with custom Phoenix client settings,
+as well as an optional `zkUrl` parameter for the Phoenix connection URL.
+
+If `zkUrl` isn't specified, it's assumed that the "hbase.zookeeper.quorum" property has been set
+in the `conf` parameter. Similarly, if no configuration is passed in, `zkUrl` must be specified.
+
+## Limitations
+
+- No pushdown predicate support from Spark SQL (yet)
+- No support for aggregate or distinct functions (http://phoenix.apache.org/phoenix_mr.html)
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/phoenix/blob/f2d9080d/phoenix-spark/pom.xml
----------------------------------------------------------------------
diff --git a/phoenix-spark/pom.xml b/phoenix-spark/pom.xml
new file mode 100644
index 0000000..3312b09
--- /dev/null
+++ b/phoenix-spark/pom.xml
@@ -0,0 +1,523 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.phoenix</groupId>
+    <artifactId>phoenix</artifactId>
+    <version>4.4.0-SNAPSHOT</version>
+  </parent>
+  <artifactId>phoenix-spark</artifactId>
+  <name>Phoenix - Spark</name>
+
+  <properties>
+    <spark.version>1.3.0</spark.version>
+    <scala.version>2.10.4</scala.version>
+    <scala.binary.version>2.10</scala.binary.version>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.phoenix</groupId>
+      <artifactId>phoenix-core</artifactId>
+    </dependency>
+
+    <!-- Force import of Spark's servlet API for unit tests -->
+    <dependency>
+      <groupId>javax.servlet</groupId>
+      <artifactId>javax.servlet-api</artifactId>
+      <version>3.0.1</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-library</artifactId>
+      <version>${scala.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.scalatest</groupId>
+      <artifactId>scalatest_${scala.binary.version}</artifactId>
+      <version>2.2.2</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.scalamock</groupId>
+      <artifactId>scalamock-scalatest-support_${scala.binary.version}</artifactId>
+      <version>3.1.4</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${spark.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_${scala.binary.version}</artifactId>
+      <version>${spark.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.xerial.snappy</groupId>
+      <artifactId>snappy-java</artifactId>
+      <version>1.1.1.6</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-client</artifactId>
+      <version>${hadoop-two.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.servlet</groupId>
+          <artifactId>servlet-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.servlet.jsp</groupId>
+          <artifactId>jsp-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jruby</groupId>
+          <artifactId>jruby-complete</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jboss.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>io.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+      <version>${hadoop-two.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.servlet</groupId>
+          <artifactId>servlet-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.servlet.jsp</groupId>
+          <artifactId>jsp-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jruby</groupId>
+          <artifactId>jruby-complete</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jboss.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>io.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+      <version>${hadoop-two.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.servlet</groupId>
+          <artifactId>servlet-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.servlet.jsp</groupId>
+          <artifactId>jsp-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jruby</groupId>
+          <artifactId>jruby-complete</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jboss.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>io.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+      <version>${hadoop-two.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.servlet</groupId>
+          <artifactId>servlet-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.servlet.jsp</groupId>
+          <artifactId>jsp-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jruby</groupId>
+          <artifactId>jruby-complete</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jboss.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>io.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hbase</groupId>
+      <artifactId>hbase-client</artifactId>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.thrift</groupId>
+          <artifactId>thrift</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jruby</groupId>
+          <artifactId>jruby-complete</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jsp-2.1</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jsp-api-2.1</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>servlet-api-2.5</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.jersey</groupId>
+          <artifactId>jersey-core</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.jersey</groupId>
+          <artifactId>jersey-json</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.jersey</groupId>
+          <artifactId>jersey-server</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jetty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jetty-util</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>tomcat</groupId>
+          <artifactId>jasper-runtime</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>tomcat</groupId>
+          <artifactId>jasper-compiler</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jruby</groupId>
+          <artifactId>jruby-complete</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jboss.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>io.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hbase</groupId>
+      <artifactId>hbase-hadoop-compat</artifactId>
+      <version>${hbase.version}</version>
+      <scope>test</scope>
+      <type>test-jar</type>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.thrift</groupId>
+          <artifactId>thrift</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jruby</groupId>
+          <artifactId>jruby-complete</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jsp-2.1</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jsp-api-2.1</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>servlet-api-2.5</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.jersey</groupId>
+          <artifactId>jersey-core</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.jersey</groupId>
+          <artifactId>jersey-json</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.jersey</groupId>
+          <artifactId>jersey-server</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jetty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jetty-util</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>tomcat</groupId>
+          <artifactId>jasper-runtime</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>tomcat</groupId>
+          <artifactId>jasper-compiler</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jruby</groupId>
+          <artifactId>jruby-complete</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jboss.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>io.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hbase</groupId>
+      <artifactId>hbase-hadoop2-compat</artifactId>
+      <version>${hbase.version}</version>
+      <scope>test</scope>
+      <type>test-jar</type>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.thrift</groupId>
+          <artifactId>thrift</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jruby</groupId>
+          <artifactId>jruby-complete</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jsp-2.1</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jsp-api-2.1</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>servlet-api-2.5</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.jersey</groupId>
+          <artifactId>jersey-core</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.jersey</groupId>
+          <artifactId>jersey-json</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.jersey</groupId>
+          <artifactId>jersey-server</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jetty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jetty-util</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>tomcat</groupId>
+          <artifactId>jasper-runtime</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>tomcat</groupId>
+          <artifactId>jasper-compiler</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jruby</groupId>
+          <artifactId>jruby-complete</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jboss.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>io.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hbase</groupId>
+      <artifactId>hbase-it</artifactId>
+      <version>${hbase.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+      </plugin>
+
+      <plugin>
+        <groupId>net.alchim31.maven</groupId>
+        <artifactId>scala-maven-plugin</artifactId>
+        <version>3.2.0</version>
+        <configuration>
+          <charset>${project.build.sourceEncoding}</charset>
+          <jvmArgs>
+            <jvmArg>-Xmx1024m</jvmArg>
+          </jvmArgs>
+          <scalaVersion>${scala.version}</scalaVersion>
+        </configuration>
+        <executions>
+          <execution>
+            <id>scala-compile-first</id>
+            <phase>process-resources</phase>
+            <goals>
+              <goal>add-source</goal>
+              <goal>compile</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>scala-test-compile</id>
+            <phase>process-test-resources</phase>
+            <goals>
+              <goal>testCompile</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+
+      <plugin>
+        <groupId>org.scalatest</groupId>
+        <artifactId>scalatest-maven-plugin</artifactId>
+        <version>1.0</version>
+        <configuration>
+          <reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
+          <junitxml>.</junitxml>
+          <filereports>WDF TestSuite.txt</filereports>
+        </configuration>
+        <executions>
+          <execution>
+            <id>test</id>
+            <phase>test</phase>
+            <goals>
+              <goal>test</goal>
+            </goals>
+            <configuration>
+              <parallel>true</parallel>
+              <tagsToExclude>Integration-Test</tagsToExclude>
+            </configuration>
+          </execution>
+          <execution>
+            <id>integration-test</id>
+            <phase>integration-test</phase>
+            <goals>
+              <goal>test</goal>
+            </goals>
+            <configuration>
+              <parallel>false</parallel>
+              <tagsToInclude>Integration-Test</tagsToInclude>
+              <argLine>-Xmx3g -XX:MaxPermSize=512m -XX:ReservedCodeCacheSize=512m</argLine>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+</project>

http://git-wip-us.apache.org/repos/asf/phoenix/blob/f2d9080d/phoenix-spark/src/main/scala/org/apache/phoenix/spark/PhoenixRDD.scala
----------------------------------------------------------------------
diff --git a/phoenix-spark/src/main/scala/org/apache/phoenix/spark/PhoenixRDD.scala b/phoenix-spark/src/main/scala/org/apache/phoenix/spark/PhoenixRDD.scala
new file mode 100644
index 0000000..b27f9f9
--- /dev/null
+++ b/phoenix-spark/src/main/scala/org/apache/phoenix/spark/PhoenixRDD.scala
@@ -0,0 +1,166 @@
+/*
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+ */
+package org.apache.phoenix.spark
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.hbase.HConstants
+import org.apache.hadoop.io.NullWritable
+import org.apache.phoenix.mapreduce.PhoenixInputFormat
+import org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil
+import org.apache.phoenix.schema.types._
+import org.apache.phoenix.util.ColumnInfo
+import org.apache.spark._
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
+import org.apache.spark.sql.types.{DataType, StructField, StructType}
+import org.apache.spark.sql.{Row, DataFrame, SQLContext}
+import org.apache.spark.sql.types._
+import scala.collection.JavaConverters._
+
+class PhoenixRDD(sc: SparkContext, table: String, columns: Seq[String],
+                 predicate: Option[String] = None, zkUrl: Option[String] = None,
+                 @transient conf: Configuration)
+  extends RDD[PhoenixRecordWritable](sc, Nil) with Logging {
+
+  @transient lazy val phoenixConf = {
+    getPhoenixConfiguration
+  }
+
+  val phoenixRDD = sc.newAPIHadoopRDD(phoenixConf,
+    classOf[PhoenixInputFormat[PhoenixRecordWritable]],
+    classOf[NullWritable],
+    classOf[PhoenixRecordWritable])
+
+  override protected def getPartitions: Array[Partition] = {
+    phoenixRDD.partitions
+  }
+
+  @DeveloperApi
+  override def compute(split: Partition, context: TaskContext) = {
+    phoenixRDD.compute(split, context).map(r => r._2)
+  }
+
+  def printPhoenixConfig(conf: Configuration): Unit = {
+    for (mapEntry <- conf.iterator().asScala) {
+      val k = mapEntry.getKey
+      val v = mapEntry.getValue
+
+      if (k.startsWith("phoenix")) {
+        println(s"$k = $v")
+      }
+    }
+  }
+
+  def buildSql(table: String, columns: Seq[String], predicate: Option[String]): String = {
+    val query = "SELECT %s FROM \"%s\"".format(
+      columns.map(f => "\"" + f + "\"").mkString(", "),
+      table
+    )
+
+    query + (predicate match {
+      case Some(p: String) => " WHERE " + p
+      case _ => ""
+    })
+  }
+
+  def getPhoenixConfiguration: Configuration = {
+
+    // This is just simply not serializable, so don't try, but clone it because
+    // PhoenixConfigurationUtil mutates it.
+    val config = new Configuration(conf)
+
+    PhoenixConfigurationUtil.setInputQuery(config, buildSql(table, columns, predicate))
+    PhoenixConfigurationUtil.setSelectColumnNames(config, columns.mkString(","))
+    PhoenixConfigurationUtil.setInputTableName(config, "\"" + table + "\"")
+    PhoenixConfigurationUtil.setInputClass(config, classOf[PhoenixRecordWritable])
+
+    // Override the Zookeeper URL if present. Throw exception if no address given.
+    zkUrl match {
+      case Some(url) => config.set(HConstants.ZOOKEEPER_QUORUM, url )
+      case _ => {
+        if(config.get(HConstants.ZOOKEEPER_QUORUM) == null) {
+          throw new UnsupportedOperationException(
+            s"One of zkUrl or '${HConstants.ZOOKEEPER_QUORUM}' config property must be provided"
+          )
+        }
+      }
+    }
+
+    config
+  }
+
+  // Convert our PhoenixRDD to a DataFrame
+  def toDataFrame(sqlContext: SQLContext): DataFrame = {
+    val columnList = PhoenixConfigurationUtil
+      .getSelectColumnMetadataList(new Configuration(phoenixConf))
+      .asScala
+
+    val columnNames: Seq[String] = columnList.map(ci => {
+      ci.getDisplayName
+    })
+
+    // Lookup the Spark catalyst types from the Phoenix schema
+    val structFields = phoenixSchemaToCatalystSchema(columnList).toArray
+
+    // Create the data frame from the converted Spark schema
+    sqlContext.createDataFrame(map(pr => {
+      val values = pr.resultMap
+      val row = new GenericMutableRow(values.size)
+
+      columnNames.zipWithIndex.foreach {
+        case (columnName, i) => {
+          row.update(i, values(columnName))
+        }
+      }
+
+      row.asInstanceOf[Row]
+    }), new StructType(structFields))
+  }
+
+  def phoenixSchemaToCatalystSchema(columnList: Seq[ColumnInfo]) = {
+    columnList.map(ci => {
+      val structType = phoenixTypeToCatalystType(ci.getPDataType)
+      StructField(ci.getDisplayName, structType)
+    })
+  }
+
+  // Lookup table for Phoenix types to Spark catalyst types
+  def phoenixTypeToCatalystType(phoenixType: PDataType[_]): DataType = phoenixType match {
+    case t if t.isInstanceOf[PVarchar] || t.isInstanceOf[PChar] => StringType
+    case t if t.isInstanceOf[PLong] || t.isInstanceOf[PUnsignedLong] => LongType
+    case t if t.isInstanceOf[PInteger] || t.isInstanceOf[PUnsignedInt] => IntegerType
+    case t if t.isInstanceOf[PFloat] || t.isInstanceOf[PUnsignedFloat] => FloatType
+    case t if t.isInstanceOf[PDouble] || t.isInstanceOf[PUnsignedDouble] => DoubleType
+    case t if t.isInstanceOf[PDecimal] => DecimalType(None)
+    case t if t.isInstanceOf[PTimestamp] || t.isInstanceOf[PUnsignedTimestamp] => TimestampType
+    case t if t.isInstanceOf[PTime] || t.isInstanceOf[PUnsignedTime] => TimestampType
+    case t if t.isInstanceOf[PDate] || t.isInstanceOf[PUnsignedDate] => TimestampType
+    case t if t.isInstanceOf[PBoolean] => BooleanType
+    case t if t.isInstanceOf[PVarbinary] || t.isInstanceOf[PBinary] => BinaryType
+    case t if t.isInstanceOf[PIntegerArray] || t.isInstanceOf[PUnsignedIntArray] => ArrayType(IntegerType, containsNull = true)
+    case t if t.isInstanceOf[PBooleanArray] => ArrayType(BooleanType, containsNull = true)
+    case t if t.isInstanceOf[PVarcharArray] || t.isInstanceOf[PCharArray] => ArrayType(StringType, containsNull = true)
+    case t if t.isInstanceOf[PVarbinaryArray] || t.isInstanceOf[PBinaryArray] => ArrayType(BinaryType, containsNull = true)
+    case t if t.isInstanceOf[PLongArray] || t.isInstanceOf[PUnsignedLongArray] => ArrayType(LongType, containsNull = true)
+    case t if t.isInstanceOf[PSmallintArray] || t.isInstanceOf[PUnsignedSmallintArray] => ArrayType(IntegerType, containsNull = true)
+    case t if t.isInstanceOf[PTinyintArray] || t.isInstanceOf[PUnsignedTinyintArray] => ArrayType(ByteType, containsNull = true)
+    case t if t.isInstanceOf[PFloatArray] || t.isInstanceOf[PUnsignedFloatArray] => ArrayType(FloatType, containsNull = true)
+    case t if t.isInstanceOf[PDoubleArray] || t.isInstanceOf[PUnsignedDoubleArray] => ArrayType(DoubleType, containsNull = true)
+    case t if t.isInstanceOf[PDecimalArray] => ArrayType(DecimalType(None), containsNull = true)
+    case t if t.isInstanceOf[PTimestampArray] || t.isInstanceOf[PUnsignedTimestampArray] => ArrayType(TimestampType, containsNull = true)
+    case t if t.isInstanceOf[PDateArray] || t.isInstanceOf[PUnsignedDateArray] => ArrayType(TimestampType, containsNull = true)
+    case t if t.isInstanceOf[PTimeArray] || t.isInstanceOf[PUnsignedTimeArray] => ArrayType(TimestampType, containsNull = true)
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/phoenix/blob/f2d9080d/phoenix-spark/src/main/scala/org/apache/phoenix/spark/PhoenixRecordWritable.scala
----------------------------------------------------------------------
diff --git a/phoenix-spark/src/main/scala/org/apache/phoenix/spark/PhoenixRecordWritable.scala b/phoenix-spark/src/main/scala/org/apache/phoenix/spark/PhoenixRecordWritable.scala
new file mode 100644
index 0000000..48a70ec
--- /dev/null
+++ b/phoenix-spark/src/main/scala/org/apache/phoenix/spark/PhoenixRecordWritable.scala
@@ -0,0 +1,89 @@
+/*
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+ */
+package org.apache.phoenix.spark
+
+import java.sql.{PreparedStatement, ResultSet}
+import org.apache.hadoop.mapreduce.lib.db.DBWritable
+import org.apache.phoenix.mapreduce.util.ColumnInfoToStringEncoderDecoder
+import org.apache.phoenix.schema.types.{PDate, PhoenixArray}
+import org.joda.time.DateTime
+import scala.collection.{immutable, mutable}
+import scala.collection.JavaConversions._
+
+class PhoenixRecordWritable(var encodedColumns: String) extends DBWritable {
+  val upsertValues = mutable.ArrayBuffer[Any]()
+  val resultMap = mutable.Map[String, AnyRef]()
+
+  def result : immutable.Map[String, AnyRef] = {
+    resultMap.toMap
+  }
+
+  override def write(statement: PreparedStatement): Unit = {
+    // Decode the ColumnInfo list
+    val columns = ColumnInfoToStringEncoderDecoder.decode(encodedColumns).toList
+
+    // Make sure we at least line up in size
+    if(upsertValues.length != columns.length) {
+      throw new UnsupportedOperationException(
+        s"Upsert values ($upsertValues) do not match the specified columns ($columns)"
+      )
+    }
+
+    // Correlate each value (v) to a column type (c) and an index (i)
+    upsertValues.zip(columns).zipWithIndex.foreach {
+      case ((v, c), i) => {
+        if (v != null) {
+          // Both Java and Joda dates used to work in 4.2.3, but now they must be java.sql.Date
+          val (finalObj, finalType) = v match {
+            case dt: DateTime => (new java.sql.Date(dt.getMillis), PDate.INSTANCE.getSqlType)
+            case d: java.util.Date => (new java.sql.Date(d.getTime), PDate.INSTANCE.getSqlType)
+            case _ => (v, c.getSqlType)
+          }
+          statement.setObject(i + 1, finalObj, finalType)
+        } else {
+          statement.setNull(i + 1, c.getSqlType)
+        }
+      }
+    }
+  }
+
+  override def readFields(resultSet: ResultSet): Unit = {
+    val metadata = resultSet.getMetaData
+    for(i <- 1 to metadata.getColumnCount) {
+
+      // Return the contents of a PhoenixArray, if necessary
+      val value = resultSet.getObject(i) match {
+        case x: PhoenixArray => x.getArray
+        case y => y
+      }
+
+      // Put a (ColumnLabel -> value) entry in the result map
+      resultMap(metadata.getColumnLabel(i)) = value
+    }
+  }
+
+  def add(value: Any): Unit = {
+    upsertValues.append(value)
+  }
+
+  // Empty constructor for MapReduce
+  def this() = {
+    this("")
+  }
+
+  // Encoded columns are a Phoenix-serialized representation of the column meta data
+  def setEncodedColumns(encodedColumns: String) {
+    this.encodedColumns = encodedColumns
+  }
+}

http://git-wip-us.apache.org/repos/asf/phoenix/blob/f2d9080d/phoenix-spark/src/main/scala/org/apache/phoenix/spark/ProductRDDFunctions.scala
----------------------------------------------------------------------
diff --git a/phoenix-spark/src/main/scala/org/apache/phoenix/spark/ProductRDDFunctions.scala b/phoenix-spark/src/main/scala/org/apache/phoenix/spark/ProductRDDFunctions.scala
new file mode 100644
index 0000000..2926569
--- /dev/null
+++ b/phoenix-spark/src/main/scala/org/apache/phoenix/spark/ProductRDDFunctions.scala
@@ -0,0 +1,68 @@
+/*
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+ */
+package org.apache.phoenix.spark
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.hbase.HConstants
+import org.apache.hadoop.io.NullWritable
+import org.apache.phoenix.mapreduce.PhoenixOutputFormat
+import org.apache.phoenix.mapreduce.util.{ColumnInfoToStringEncoderDecoder, PhoenixConfigurationUtil}
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+
+class ProductRDDFunctions[A <: Product](data: RDD[A]) extends Logging with Serializable {
+
+  def saveToPhoenix(tableName: String, cols: Seq[String],
+                    conf: Configuration = new Configuration, zkUrl: Option[String] = None)
+                    : Unit = {
+
+    // Setup Phoenix output configuration, make a local copy
+    val config = new Configuration(conf)
+    PhoenixConfigurationUtil.setOutputTableName(config, tableName)
+    PhoenixConfigurationUtil.setUpsertColumnNames(config, cols.mkString(","))
+
+    // Override the Zookeeper URL if present. Throw exception if no address given.
+    zkUrl match {
+      case Some(url) => config.set(HConstants.ZOOKEEPER_QUORUM, url )
+      case _ => {
+        if(config.get(HConstants.ZOOKEEPER_QUORUM) == null) {
+          throw new UnsupportedOperationException(
+            s"One of zkUrl or '${HConstants.ZOOKEEPER_QUORUM}' config property must be provided"
+          )
+        }
+      }
+    }
+
+    // Encode the column info to a serializable type
+    val encodedColumns = ColumnInfoToStringEncoderDecoder.encode(
+      PhoenixConfigurationUtil.getUpsertColumnMetadataList(config)
+    )
+
+    // Map each element of the product to a new (NullWritable, PhoenixRecordWritable)
+    val phxRDD: RDD[(NullWritable, PhoenixRecordWritable)] = data.map { e =>
+      val rec = new PhoenixRecordWritable(encodedColumns)
+      e.productIterator.foreach { rec.add(_) }
+      (null, rec)
+    }
+
+    // Save it
+    phxRDD.saveAsNewAPIHadoopFile(
+      "",
+      classOf[NullWritable],
+      classOf[PhoenixRecordWritable],
+      classOf[PhoenixOutputFormat[PhoenixRecordWritable]],
+      config
+    )
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/phoenix/blob/f2d9080d/phoenix-spark/src/main/scala/org/apache/phoenix/spark/SparkContextFunctions.scala
----------------------------------------------------------------------
diff --git a/phoenix-spark/src/main/scala/org/apache/phoenix/spark/SparkContextFunctions.scala b/phoenix-spark/src/main/scala/org/apache/phoenix/spark/SparkContextFunctions.scala
new file mode 100644
index 0000000..a3cd8f0
--- /dev/null
+++ b/phoenix-spark/src/main/scala/org/apache/phoenix/spark/SparkContextFunctions.scala
@@ -0,0 +1,41 @@
+/*
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+ */
+package org.apache.phoenix.spark
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.RDD
+
+class SparkContextFunctions(@transient val sc: SparkContext) extends Serializable {
+
+  /*
+    This will return an RDD of Map[String, AnyRef], where the String key corresponds to the column
+    name and the AnyRef value will be a java.sql type as returned by Phoenix
+
+    'table' is the corresponding Phoenix table
+    'columns' is a sequence of of columns to query
+    'predicate' is a set of statements to go after a WHERE clause, e.g. "TID = 123"
+    'zkUrl' is an optional Zookeeper URL to use to connect to Phoenix
+    'conf' is a Hadoop Configuration object. If zkUrl is not set, the "hbase.zookeeper.quorum"
+      property will be used
+   */
+
+  def phoenixTableAsRDD(table: String, columns: Seq[String], predicate: Option[String] = None,
+                        zkUrl: Option[String] = None, conf: Configuration = new Configuration())
+                        : RDD[Map[String, AnyRef]] = {
+
+    // Create a PhoenixRDD, but only return the serializable 'result' map
+    new PhoenixRDD(sc, table, columns, predicate, zkUrl, conf).map(_.result)
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/phoenix/blob/f2d9080d/phoenix-spark/src/main/scala/org/apache/phoenix/spark/SparkSqlContextFunctions.scala
----------------------------------------------------------------------
diff --git a/phoenix-spark/src/main/scala/org/apache/phoenix/spark/SparkSqlContextFunctions.scala b/phoenix-spark/src/main/scala/org/apache/phoenix/spark/SparkSqlContextFunctions.scala
new file mode 100644
index 0000000..cc3f378
--- /dev/null
+++ b/phoenix-spark/src/main/scala/org/apache/phoenix/spark/SparkSqlContextFunctions.scala
@@ -0,0 +1,39 @@
+/*
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+ */
+package org.apache.phoenix.spark
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.spark.sql.{DataFrame, SQLContext}
+
+class SparkSqlContextFunctions(@transient val sqlContext: SQLContext) extends Serializable {
+
+  /*
+  This will return a Spark DataFrame, with Phoenix types converted Spark SQL catalyst types
+
+  'table' is the corresponding Phoenix table
+  'columns' is a sequence of of columns to query
+  'predicate' is a set of statements to go after a WHERE clause, e.g. "TID = 123"
+  'zkUrl' is an optional Zookeeper URL to use to connect to Phoenix
+  'conf' is a Hadoop Configuration object. If zkUrl is not set, the "hbase.zookeeper.quorum"
+    property will be used
+ */
+  def phoenixTableAsDataFrame(table: String, columns: Seq[String],
+                               predicate: Option[String] = None, zkUrl: Option[String] = None,
+                               conf: Configuration = new Configuration): DataFrame = {
+
+    // Create the PhoenixRDD and convert it to a DataFrame
+    new PhoenixRDD(sqlContext.sparkContext, table, columns, predicate, zkUrl, conf)
+      .toDataFrame(sqlContext)
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/phoenix/blob/f2d9080d/phoenix-spark/src/main/scala/org/apache/phoenix/spark/package.scala
----------------------------------------------------------------------
diff --git a/phoenix-spark/src/main/scala/org/apache/phoenix/spark/package.scala b/phoenix-spark/src/main/scala/org/apache/phoenix/spark/package.scala
new file mode 100644
index 0000000..c19ec16
--- /dev/null
+++ b/phoenix-spark/src/main/scala/org/apache/phoenix/spark/package.scala
@@ -0,0 +1,32 @@
+/*
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+ */
+package org.apache.phoenix
+
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SQLContext
+
+package object spark {
+  implicit def toProductRDDFunctions[A <: Product](rdd: RDD[A]): ProductRDDFunctions[A] = {
+    new ProductRDDFunctions[A](rdd)
+  }
+
+  implicit def toSparkContextFunctions(sc: SparkContext): SparkContextFunctions = {
+    new SparkContextFunctions(sc)
+  }
+
+  implicit def toSparkSqlContextFunctions(sqlContext: SQLContext): SparkSqlContextFunctions = {
+    new SparkSqlContextFunctions(sqlContext)
+  }
+}

http://git-wip-us.apache.org/repos/asf/phoenix/blob/f2d9080d/phoenix-spark/src/test/resources/log4j.xml
----------------------------------------------------------------------
diff --git a/phoenix-spark/src/test/resources/log4j.xml b/phoenix-spark/src/test/resources/log4j.xml
new file mode 100644
index 0000000..d4799da
--- /dev/null
+++ b/phoenix-spark/src/test/resources/log4j.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
+
+<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
+  <appender name="console" class="org.apache.log4j.ConsoleAppender">
+    <param name="Target" value="System.out"/>
+
+    <layout class="org.apache.log4j.PatternLayout">
+      <param name="ConversionPattern" value="%-4r [%t] %-5p %c %x - %m%n"/>
+    </layout>
+  </appender>
+
+  <logger name="org.eclipse">
+    <level value="ERROR"/>
+  </logger>
+
+  <logger name="org.apache">
+    <level value="ERROR"/>
+  </logger>
+
+  <logger name = "org.apache.phoenix.mapreduce">
+    <level value="FATAL"/>
+  </logger>
+
+  <logger name="org.mortbay">
+    <level value="ERROR"/>
+  </logger>
+
+  <logger name="BlockStateChange">
+    <level value="ERROR"/>
+  </logger>
+
+  <logger name="io.netty">
+    <level value="ERROR"/>
+  </logger>
+
+  <root>
+    <priority value="INFO"/>
+    <appender-ref ref="console"/>
+  </root>
+</log4j:configuration>

http://git-wip-us.apache.org/repos/asf/phoenix/blob/f2d9080d/phoenix-spark/src/test/resources/setup.sql
----------------------------------------------------------------------
diff --git a/phoenix-spark/src/test/resources/setup.sql b/phoenix-spark/src/test/resources/setup.sql
new file mode 100644
index 0000000..14a7e7e
--- /dev/null
+++ b/phoenix-spark/src/test/resources/setup.sql
@@ -0,0 +1,18 @@
+CREATE TABLE table1 (id BIGINT NOT NULL PRIMARY KEY, col1 VARCHAR)
+CREATE TABLE table2 (id BIGINT NOT NULL PRIMARY KEY, table1_id BIGINT, "t2col1" VARCHAR)
+UPSERT INTO table1 (id, col1) VALUES (1, 'test_row_1')
+UPSERT INTO table2 (id, table1_id, "t2col1") VALUES (1, 1, 'test_child_1')
+UPSERT INTO table2 (id, table1_id, "t2col1") VALUES (2, 1, 'test_child_2')
+UPSERT INTO table1 (id, col1) VALUES (2, 'test_row_2')
+UPSERT INTO table2 (id, table1_id, "t2col1") VALUES (3, 2, 'test_child_1')
+UPSERT INTO table2 (id, table1_id, "t2col1") VALUES (4, 2, 'test_child_2')
+UPSERT INTO table2 (id, table1_id, "t2col1") VALUES (5, 2, 'test_child_3')
+UPSERT INTO table2 (id, table1_id, "t2col1") VALUES (6, 2, 'test_child_4')
+CREATE TABLE "table3" ("id" BIGINT NOT NULL PRIMARY KEY, "col1" VARCHAR)
+UPSERT INTO "table3" ("id", "col1") VALUES (1, 'foo')
+UPSERT INTO "table3" ("id", "col1") VALUES (2, 'bar')
+CREATE TABLE ARRAY_TEST_TABLE (ID BIGINT NOT NULL PRIMARY KEY, VCARRAY VARCHAR[])
+UPSERT INTO ARRAY_TEST_TABLE (ID, VCARRAY) VALUES (1, ARRAY['String1', 'String2', 'String3'])
+CREATE TABLE DATE_PREDICATE_TEST_TABLE (ID BIGINT NOT NULL, TIMESERIES_KEY TIMESTAMP NOT NULL CONSTRAINT pk PRIMARY KEY (ID, TIMESERIES_KEY))
+UPSERT INTO DATE_PREDICATE_TEST_TABLE (ID, TIMESERIES_KEY) VALUES (1, CAST(CURRENT_TIME() AS TIMESTAMP))
+CREATE TABLE OUTPUT_TEST_TABLE (id BIGINT NOT NULL PRIMARY KEY, col1 VARCHAR, col2 INTEGER, col3 DATE)
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/phoenix/blob/f2d9080d/phoenix-spark/src/test/scala/org/apache/phoenix/spark/PhoenixRDDTest.scala
----------------------------------------------------------------------
diff --git a/phoenix-spark/src/test/scala/org/apache/phoenix/spark/PhoenixRDDTest.scala b/phoenix-spark/src/test/scala/org/apache/phoenix/spark/PhoenixRDDTest.scala
new file mode 100644
index 0000000..63cb6e4
--- /dev/null
+++ b/phoenix-spark/src/test/scala/org/apache/phoenix/spark/PhoenixRDDTest.scala
@@ -0,0 +1,333 @@
+/*
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+ */
+package org.apache.phoenix.spark
+
+import java.sql.{Connection, DriverManager}
+import java.util.Date
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.hbase.{HConstants, HBaseTestingUtility}
+import org.apache.phoenix.schema.ColumnNotFoundException
+import org.apache.phoenix.schema.types.PVarchar
+import org.apache.phoenix.util.ColumnInfo
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.types.{StringType, StructField}
+import org.apache.spark.{SparkConf, SparkContext}
+import org.joda.time.DateTime
+import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers}
+import org.apache.phoenix.spark._
+
+import scala.collection.mutable.ListBuffer
+
+class PhoenixRDDTest extends FunSuite with Matchers with BeforeAndAfterAll {
+  lazy val hbaseTestingUtility = {
+    new HBaseTestingUtility()
+  }
+
+  lazy val hbaseConfiguration = {
+    val conf = hbaseTestingUtility.getConfiguration
+
+    val quorum = conf.get("hbase.zookeeper.quorum")
+    val clientPort = conf.get("hbase.zookeeper.property.clientPort")
+    val znodeParent = conf.get("zookeeper.znode.parent")
+
+    // This is an odd one - the Zookeeper Quorum entry in the config is totally wrong. It's
+    // just reporting localhost.
+    conf.set(org.apache.hadoop.hbase.HConstants.ZOOKEEPER_QUORUM, s"$quorum:$clientPort:$znodeParent")
+
+    conf
+  }
+
+  lazy val quorumAddress = {
+    hbaseConfiguration.get("hbase.zookeeper.quorum")
+  }
+
+  lazy val zookeeperClientPort = {
+    hbaseConfiguration.get("hbase.zookeeper.property.clientPort")
+  }
+
+  lazy val zookeeperZnodeParent = {
+    hbaseConfiguration.get("zookeeper.znode.parent")
+  }
+
+  lazy val hbaseConnectionString = {
+    s"$quorumAddress:$zookeeperClientPort:$zookeeperZnodeParent"
+  }
+
+  var conn: Connection = _
+
+  override def beforeAll() {
+    hbaseTestingUtility.startMiniCluster()
+
+    conn = DriverManager.getConnection(s"jdbc:phoenix:$hbaseConnectionString")
+
+    conn.setAutoCommit(true)
+
+    // each SQL statement used to set up Phoenix must be on a single line. Yes, that
+    // can potentially make large lines.
+    val setupSqlSource = getClass.getClassLoader.getResourceAsStream("setup.sql")
+
+    val setupSql = scala.io.Source.fromInputStream(setupSqlSource).getLines()
+
+    for (sql <- setupSql) {
+      val stmt = conn.createStatement()
+
+      stmt.execute(sql)
+
+      stmt.close()
+    }
+
+    conn.commit()
+  }
+
+  override def afterAll() {
+    conn.close()
+    hbaseTestingUtility.shutdownMiniCluster()
+  }
+
+  val conf = new SparkConf().set("spark.ui.showConsoleProgress", "false")
+
+  val sc = new SparkContext("local[1]", "PhoenixSparkTest", conf)
+
+  def buildSql(table: String, columns: Seq[String], predicate: Option[String]): String = {
+    val query = "SELECT %s FROM \"%s\"" format(columns.map(f => "\"" + f + "\"").mkString(", "), table)
+
+    query + (predicate match {
+      case Some(p: String) => " WHERE " + p
+      case _ => ""
+    })
+  }
+
+  test("Can create valid SQL") {
+    val rdd = new PhoenixRDD(sc, "MyTable", Array("Foo", "Bar"),
+      conf = hbaseConfiguration)
+
+    rdd.buildSql("MyTable", Array("Foo", "Bar"), None) should
+      equal("SELECT \"Foo\", \"Bar\" FROM \"MyTable\"")
+  }
+
+  test("Can convert Phoenix schema") {
+    val phoenixSchema = List(
+      new ColumnInfo("varcharColumn", PVarchar.INSTANCE.getSqlType)
+    )
+
+    val rdd = new PhoenixRDD(sc, "MyTable", Array("Foo", "Bar"),
+      conf = hbaseConfiguration)
+
+    val catalystSchema = rdd.phoenixSchemaToCatalystSchema(phoenixSchema)
+
+    val expected = List(StructField("varcharColumn", StringType, nullable = true))
+
+    catalystSchema shouldEqual expected
+  }
+
+  test("Can create schema RDD and execute query") {
+    val sqlContext = new SQLContext(sc)
+
+    val df1 = sqlContext.phoenixTableAsDataFrame("TABLE1", Array("ID", "COL1"), conf = hbaseConfiguration)
+
+    df1.registerTempTable("sql_table_1")
+
+    val df2 = sqlContext.phoenixTableAsDataFrame("TABLE2", Array("ID", "TABLE1_ID"),
+      conf = hbaseConfiguration)
+
+    df2.registerTempTable("sql_table_2")
+
+    val sqlRdd = sqlContext.sql("SELECT t1.ID, t1.COL1, t2.ID, t2.TABLE1_ID FROM sql_table_1 AS t1 INNER JOIN sql_table_2 AS t2 ON (t2.TABLE1_ID = t1.ID)")
+
+    val count = sqlRdd.count()
+
+    count shouldEqual 6L
+  }
+
+  test("Can create schema RDD and execute query on case sensitive table (no config)") {
+    val sqlContext = new SQLContext(sc)
+
+    val df1 = sqlContext.phoenixTableAsDataFrame("table3", Array("id", "col1"), zkUrl = Some(hbaseConnectionString))
+
+    df1.registerTempTable("table3")
+
+    val sqlRdd = sqlContext.sql("SELECT * FROM table3")
+
+    val count = sqlRdd.count()
+
+    count shouldEqual 2L
+  }
+
+  test("Can create schema RDD and execute constrained query") {
+    val sqlContext = new SQLContext(sc)
+
+    val df1 = sqlContext.phoenixTableAsDataFrame("TABLE1", Array("ID", "COL1"), conf = hbaseConfiguration)
+
+    df1.registerTempTable("sql_table_1")
+
+    val df2 = sqlContext.phoenixTableAsDataFrame("TABLE2", Array("ID", "TABLE1_ID"),
+      predicate = Some("\"ID\" = 1"),
+      conf = hbaseConfiguration)
+
+    df2.registerTempTable("sql_table_2")
+
+    val sqlRdd = sqlContext.sql("SELECT t1.ID, t1.COL1, t2.ID, t2.TABLE1_ID FROM sql_table_1 AS t1 INNER JOIN sql_table_2 AS t2 ON (t2.TABLE1_ID = t1.ID)")
+
+    val count = sqlRdd.count()
+
+    count shouldEqual 1L
+  }
+
+  test("Using a predicate referring to a non-existent column should fail") {
+    intercept[RuntimeException] {
+      val sqlContext = new SQLContext(sc)
+
+      val df1 = sqlContext.phoenixTableAsDataFrame("table3", Array("id", "col1"),
+        predicate = Some("foo = bar"),
+        conf = hbaseConfiguration)
+
+      df1.registerTempTable("table3")
+
+      val sqlRdd = sqlContext.sql("SELECT * FROM table3")
+
+      // we have to execute an action before the predicate failure can occur
+      val count = sqlRdd.count()
+    }.getCause shouldBe a [ColumnNotFoundException]
+  }
+
+  test("Can create schema RDD with predicate that will never match") {
+    val sqlContext = new SQLContext(sc)
+
+    val df1 = sqlContext.phoenixTableAsDataFrame("table3", Array("id", "col1"),
+      predicate = Some("\"id\" = -1"),
+      conf = hbaseConfiguration)
+
+    df1.registerTempTable("table3")
+
+    val sqlRdd = sqlContext.sql("SELECT * FROM table3")
+
+    val count = sqlRdd.count()
+
+    count shouldEqual 0L
+  }
+
+  test("Can create schema RDD with complex predicate") {
+    val sqlContext = new SQLContext(sc)
+
+    val df1 = sqlContext.phoenixTableAsDataFrame("DATE_PREDICATE_TEST_TABLE", Array("ID", "TIMESERIES_KEY"),
+      predicate = Some("ID > 0 AND TIMESERIES_KEY BETWEEN CAST(TO_DATE('1990-01-01 00:00:01', 'yyyy-MM-dd HH:mm:ss') AS TIMESTAMP) AND CAST(TO_DATE('1990-01-30 00:00:01', 'yyyy-MM-dd HH:mm:ss') AS TIMESTAMP)"),
+      conf = hbaseConfiguration)
+    
+    df1.registerTempTable("date_predicate_test_table")
+
+    val sqlRdd = df1.sqlContext.sql("SELECT * FROM date_predicate_test_table")
+
+    val count = sqlRdd.count()
+
+    count shouldEqual 0L
+  }
+
+  test("Can query an array table") {
+    val sqlContext = new SQLContext(sc)
+
+    val df1 = sqlContext.phoenixTableAsDataFrame("ARRAY_TEST_TABLE", Array("ID", "VCARRAY"),
+      conf = hbaseConfiguration)
+
+    df1.registerTempTable("ARRAY_TEST_TABLE")
+
+    val sqlRdd = sqlContext.sql("SELECT * FROM ARRAY_TEST_TABLE")
+
+    val count = sqlRdd.count()
+
+    // get row 0, column 1, which should be "VCARRAY"
+    val arrayValues = sqlRdd.collect().apply(0).apply(1)
+
+    arrayValues should equal(Array("String1", "String2", "String3"))
+
+    count shouldEqual 1L
+  }
+  
+  test("Can read a table as an RDD") {
+    val rdd1 = sc.phoenixTableAsRDD("ARRAY_TEST_TABLE", Seq("ID", "VCARRAY"),
+      conf = hbaseConfiguration)
+
+    val count = rdd1.count()
+
+    val arrayValues = rdd1.take(1)(0)("VCARRAY")
+
+    arrayValues should equal(Array("String1", "String2", "String3"))
+
+    count shouldEqual 1L
+  }
+
+  test("Can save to phoenix table") {
+    val sqlContext = new SQLContext(sc)
+
+    val dataSet = List((1L, "1", 1), (2L, "2", 2), (3L, "3", 3))
+
+    sc
+      .parallelize(dataSet)
+      .saveToPhoenix(
+        "OUTPUT_TEST_TABLE",
+        Seq("ID","COL1","COL2"),
+        hbaseConfiguration
+      )
+
+    // Load the results back
+    val stmt = conn.createStatement()
+    val rs = stmt.executeQuery("SELECT ID, COL1, COL2 FROM OUTPUT_TEST_TABLE")
+    val results = ListBuffer[(Long, String, Int)]()
+    while(rs.next()) {
+      results.append((rs.getLong(1), rs.getString(2), rs.getInt(3)))
+    }
+    stmt.close()
+
+    // Verify they match
+    (0 to results.size - 1).foreach { i =>
+      dataSet(i) shouldEqual results(i)
+    }
+  }
+
+  test("Can save Java and Joda dates to Phoenix (no config)") {
+    val dt = new DateTime()
+    val date = new Date()
+
+    val dataSet = List((1L, "1", 1, dt), (2L, "2", 2, date))
+    sc
+      .parallelize(dataSet)
+      .saveToPhoenix(
+        "OUTPUT_TEST_TABLE",
+        Seq("ID","COL1","COL2","COL3"),
+        zkUrl = Some(hbaseConnectionString)
+      )
+
+    // Load the results back
+    val stmt = conn.createStatement()
+    val rs = stmt.executeQuery("SELECT COL3 FROM OUTPUT_TEST_TABLE WHERE ID = 1 OR ID = 2 ORDER BY ID ASC")
+    val results = ListBuffer[java.sql.Date]()
+    while(rs.next()) {
+      results.append(rs.getDate(1))
+    }
+    stmt.close()
+
+    // Verify the epochs are equal
+    results(0).getTime shouldEqual dt.getMillis
+    results(1).getTime shouldEqual date.getTime
+  }
+
+  test("Not specifying a zkUrl or a config quorum URL should fail") {
+    intercept[UnsupportedOperationException] {
+      val sqlContext = new SQLContext(sc)
+      val badConf = new Configuration(hbaseConfiguration)
+      badConf.unset(HConstants.ZOOKEEPER_QUORUM)
+      sqlContext.phoenixTableAsDataFrame("TABLE1", Array("ID", "COL1"), conf = badConf)
+    }
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/phoenix/blob/f2d9080d/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 861c868..bfafe78 100644
--- a/pom.xml
+++ b/pom.xml
@@ -28,6 +28,7 @@
     <module>phoenix-pig</module>
     <module>phoenix-assembly</module>
     <module>phoenix-pherf</module>
+    <module>phoenix-spark</module>
   </modules>
 
   <repositories>