You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by hi...@apache.org on 2016/09/20 23:00:39 UTC
[10/14] incubator-geode git commit: GEODE-37 change package name from
io.pivotal.geode (for ./geode-spark-connector/src/main/scala/io/pivotal)to
org.apache.geode for(to ./geode-spark-connector/src/main/scala/org/apache)
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/DefaultGeodeConnectionManager.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/DefaultGeodeConnectionManager.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/DefaultGeodeConnectionManager.scala
new file mode 100644
index 0000000..eb67cda
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/DefaultGeodeConnectionManager.scala
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal
+
+import io.pivotal.geode.spark.connector.{GeodeConnection, GeodeConnectionConf, GeodeConnectionManager}
+
+import scala.collection.mutable
+
+/**
+ * Default [[GeodeConnectionManager]] implementation. It keeps no state of its
+ * own: both operations are forwarded to the `DefaultGeodeConnectionManager`
+ * companion object.
+ */
+class DefaultGeodeConnectionManager extends GeodeConnectionManager {
+
+  /** Returns whatever the companion object's `getConnection` yields for `connConf`. */
+  def getConnection(connConf: GeodeConnectionConf): GeodeConnection = {
+    DefaultGeodeConnectionManager.getConnection(connConf)
+  }
+
+  /** Forwards the close request for `connConf` to the companion object. */
+  def closeConnection(connConf: GeodeConnectionConf): Unit = {
+    DefaultGeodeConnectionManager.closeConnection(connConf)
+  }
+
+}
+
+object DefaultGeodeConnectionManager {
+
+  /**
+   * Connection cache, keyed by locator (host, port) pair. All accesses made by
+   * this object happen while holding the map's own monitor.
+   */
+  private[connector] val connections = mutable.Map[(String, Int), GeodeConnection]()
+
+  /**
+   * Returns the first cached connection registered under any of `locators`.
+   * Must be called with the `connections` monitor held: a `mutable.Map` gives
+   * no guarantees when it is read while another thread is updating it.
+   */
+  private def cachedConnection(locators: Seq[(String, Int)]): Option[GeodeConnection] =
+    locators.flatMap(loc => connections.get(loc)).find(_ != null)
+
+  /**
+   * Use the locator host:port pairs to look up a cached connection. If none is
+   * cached, a new connection is created through `factory` and registered in
+   * `connections` under every locator of `connConf`.
+   *
+   * The lookup and the creation both run inside `connections.synchronized`, so
+   * concurrent callers with the same locators share one connection and no
+   * caller reads the map while it is being modified.
+   *
+   * @param connConf configuration providing the locators and Geode properties
+   * @param factory  factory used when no cached connection exists
+   * @return the cached or newly created connection
+   */
+  def getConnection(connConf: GeodeConnectionConf)
+    (implicit factory: DefaultGeodeConnectionFactory = new DefaultGeodeConnectionFactory): GeodeConnection =
+    connections.synchronized {
+      cachedConnection(connConf.locators).getOrElse {
+        val conn = factory.newConnection(connConf.locators, connConf.geodeProps)
+        connConf.locators.foreach(loc => connections += (loc -> conn))
+        conn
+      }
+    }
+
+  /**
+   * Close the connection and remove it from connection cache.
+   * Note: multiple entries may share the same connection, all those entries are removed.
+   *
+   * The lookup is done under the lock as well, so two concurrent calls cannot
+   * both find and close the same connection. The cache entries are dropped
+   * even when `close()` throws, so a connection that failed to close is not
+   * handed out again.
+   */
+  def closeConnection(connConf: GeodeConnectionConf): Unit = connections.synchronized {
+    cachedConnection(connConf.locators).foreach { conn =>
+      try conn.close()
+      finally connections.retain((_, cached) => cached != conn)
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/LocatorHelper.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/LocatorHelper.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/LocatorHelper.scala
new file mode 100644
index 0000000..4baa936
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/LocatorHelper.scala
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal
+
+import java.net.InetSocketAddress
+import java.util.{ArrayList => JArrayList}
+
+import org.apache.geode.cache.client.internal.locator.{GetAllServersResponse, GetAllServersRequest}
+import org.apache.geode.distributed.internal.ServerLocation
+import org.apache.geode.distributed.internal.tcpserver.TcpClient
+import org.apache.spark.Logging
+
+import scala.util.{Failure, Success, Try}
+
+
+object LocatorHelper extends Logging {
+
+  /** valid locator strings are: host[port] and host:port */
+  final val LocatorPattern1 = """([\w-_]+(\.[\w-_]+)*)\[([0-9]{2,5})\]""".r
+  final val LocatorPattern2 = """([\w-_]+(\.[\w-_]+)*):([0-9]{2,5})""".r
+
+  /**
+   * Convert single locator string to Try[(host, port)].
+   *
+   * @param locatorStr locator in `host[port]` or `host:port` form
+   * @return `Success((host, port))`, or a `Failure` carrying an `Exception`
+   *         with message `invalid locator: ...` when neither pattern matches
+   */
+  def locatorStr2HostPortPair(locatorStr: String): Try[(String, Int)] =
+    locatorStr match {
+      // the middle capture group (last domain segment) is not needed
+      case LocatorPattern1(host, _, port) => Success((host, port.toInt))
+      case LocatorPattern2(host, _, port) => Success((host, port.toInt))
+      case _ => Failure(new Exception(s"invalid locator: $locatorStr"))
+    }
+
+  /**
+   * Parse locator strings and returns Seq of (hostname, port) pair.
+   * Valid locator string are one or more "host[port]" and/or "host:port"
+   * separated by `,`. For example:
+   *    host1.mydomain.com[8888],host2.mydomain.com[8889]
+   *    host1.mydomain.com:8888,host2.mydomain.com:8889
+   *
+   * Whitespace around each entry is ignored, so "a:1, b:2" is accepted.
+   * An invalid entry makes this method throw the exception produced by
+   * `locatorStr2HostPortPair`.
+   */
+  def parseLocatorsString(locatorsStr: String): Seq[(String, Int)] =
+    locatorsStr.split(",").map(entry => locatorStr2HostPortPair(entry.trim).get)
+
+
+  /**
+   * Return the list of live Geode servers for the given locators.
+   * Locators are asked one after another; the answer of the first locator
+   * that reports at least one server is returned. A locator that fails is
+   * logged as a warning and skipped.
+   *
+   * @param locators locators for the given Geode cluster
+   * @param serverGroup optional server group name, default is "" (empty string)
+   * @return the servers as (hostname, port) pairs, or None if no locator
+   *         reported any server
+   */
+  def getAllGeodeServers(locators: Seq[(String, Int)], serverGroup: String = ""): Option[Seq[(String, Int)]] = {
+    import scala.collection.JavaConverters._
+
+    // ask a single locator; None when it fails, gives no response or reports no servers
+    def queryLocator(host: String, port: Int): Option[Seq[(String, Int)]] =
+      try {
+        val addr = new InetSocketAddress(host, port)
+        val req = new GetAllServersRequest(serverGroup)
+        val res = TcpClient.requestToServer(addr.getAddress, addr.getPort, req, 2000)
+        if (res == null) None
+        else {
+          val servers = res.asInstanceOf[GetAllServersResponse].getServers.asInstanceOf[JArrayList[ServerLocation]]
+          if (servers.size > 0) Some(servers.asScala.map(e => (e.getHostName, e.getPort))) else None
+        }
+      } catch {
+        case e: Exception =>
+          logWarning("getAllGeodeServers error", e)
+          None
+      }
+
+    // the iterator is lazy, so locators after the first successful one are never contacted
+    locators.iterator
+      .map { case (host, port) => queryLocator(host, port) }
+      .collectFirst { case Some(servers) => servers }
+  }
+
+  /**
+   * Pick up at most 3 preferred servers from all available servers based on
+   * host name and Spark executor id.
+   *
+   * This method is used by DefaultGeodeConnection to create ClientCache. Usually
+   * one server is enough to initialize ClientCacheFactory, but this provides two
+   * backup servers in case of the 1st server can't be connected.
+   *
+   * @param servers all available servers in the form of (hostname, port) pairs
+   * @param hostName the host name of the Spark executor
+   * @param executorId the Spark executor Id, such as "<driver>", "0", "1", ...
+   * @return Seq[(hostname, port)] of preferred servers
+   */
+  def pickPreferredGeodeServers(
+    servers: Seq[(String, Int)], hostName: String, executorId: String): Seq[(String, Int)] = {
+
+    // pick up `length` items from the Seq starting at the `start` position.
+    // The Seq is treated as a ring, so at most `Seq.size` items can be picked.
+    // `start` is first reduced to [0, size): a negative start would otherwise
+    // produce a negative index, and a start close to Int.MaxValue would
+    // overflow the range bound and silently select nothing.
+    def circularTake[T](seq: Seq[T], start: Int, length: Int): Seq[T] =
+      if (seq.isEmpty) Seq.empty
+      else {
+        val n = seq.size
+        val count = math.min(n, length)
+        val first = ((start % n) + n) % n
+        (first until first + count).map(x => seq(x % n))
+      }
+
+    // map executor id to int: "<driver>" (or non-number string) to 0, and "n" to n + 1
+    val id = try { executorId.toInt + 1 } catch { case e: NumberFormatException => 0 }
+
+    // algorithm:
+    // 1. sort server list
+    // 2. split sorted server list into 3 sub-lists a, b, and c:
+    //      list-a: servers on the given host
+    //      list-b: servers that are in front of list-a on the sorted server list
+    //      list-c: servers that are behind list-a on the sorted server list
+    //    then rotate list-a based on executor id, then create new server list:
+    //      modified list-a ++ list-c ++ list-b
+    // 3. if there's no server on the given host, then create new server list
+    //    by rotating sorted server list based on executor id.
+    // 4. take up to 3 servers from the new server list
+    val sortedServers = servers.sorted
+    val firstIdx = sortedServers.indexWhere(p => p._1 == hostName)
+
+    if (firstIdx < 0) { // no local server
+      circularTake(sortedServers, id, 3)
+    } else {
+      val lastIdx = sortedServers.lastIndexWhere(p => p._1 == hostName)
+      val (before, fromLocal) = sortedServers.splitAt(firstIdx)
+      val ordered =
+        if (firstIdx == lastIdx) { // one local server
+          fromLocal ++ before
+        } else { // multiple local servers
+          val (local, after) = fromLocal.splitAt(lastIdx - firstIdx + 1)
+          val rotatedLocal = if (id % local.size == 0) local else circularTake(local, id, local.size)
+          rotatedLocal ++ after ++ before
+        }
+      circularTake(ordered, 0, 3)
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/geodefunctions/StructStreamingResultCollector.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/geodefunctions/StructStreamingResultCollector.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/geodefunctions/StructStreamingResultCollector.scala
new file mode 100644
index 0000000..5139be4
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/geodefunctions/StructStreamingResultCollector.scala
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal.geodefunctions
+
+import java.util.concurrent.{TimeUnit, LinkedBlockingQueue, BlockingQueue}
+import org.apache.geode.DataSerializer
+import org.apache.geode.cache.execute.ResultCollector
+import org.apache.geode.cache.query.internal.types.StructTypeImpl
+import org.apache.geode.cache.query.types.StructType
+import org.apache.geode.distributed.DistributedMember
+import org.apache.geode.internal.{Version, ByteArrayDataInput}
+import io.pivotal.geode.spark.connector.internal.geodefunctions.StructStreamingResultSender.
+ {TYPE_CHUNK, DATA_CHUNK, ERROR_CHUNK, SER_DATA, UNSER_DATA, BYTEARR_DATA}
+
+/**
+ * StructStreamingResultCollector and StructStreamingResultSender are paired
+ * to transfer result of list of `org.apache.geode.cache.query.Struct`
+ * from Geode server to Spark Connector (the client of Geode server)
+ * in streaming, i.e., while sender sending the result, the collector can
+ * start processing the arrived result without waiting for full result to
+ * become available.
+ */
+class StructStreamingResultCollector(desc: String) extends ResultCollector[Array[Byte], Iterator[Array[Object]]] {
+
+  /** Auxiliary constructor that supplies the default `desc` (description). */
+  def this() = this("StructStreamingResultCollector")
+
+  // Chunks handed over by addResult/endResults and consumed by nextIterator().
+  private val queue: BlockingQueue[Array[Byte]] = new LinkedBlockingQueue[Array[Byte]]()
+
+  // Assigned while chunks are decoded; null until a type or data chunk was processed.
+  var structType: StructType = null
+
+  // ---------------------------------------------
+  //  ResultCollector interface implementations
+  // ---------------------------------------------
+
+  /** Returns the (lazily created) iterator over all received rows. */
+  override def getResult: Iterator[Array[Object]] = resultIterator
+
+  /** The timed variant is not supported and always throws. */
+  override def getResult(timeout: Long, unit: TimeUnit): Iterator[Array[Object]] =
+    throw new UnsupportedOperationException()
+
+  /** Enqueues `chunk` unless it is null or holds fewer than two bytes. */
+  override def addResult(memberID: DistributedMember, chunk: Array[Byte]): Unit = {
+    val hasPayload = chunk != null && chunk.length > 1
+    if (hasPayload) queue.add(chunk)
+  }
+
+  /** Enqueues an empty array, which the consumer treats as the end-of-data marker. */
+  override def endResults(): Unit = queue.add(Array.empty[Byte])
+
+  /** Drops every chunk that is still waiting in the queue. */
+  override def clearResults(): Unit = queue.clear()
+
+  // ---------------------------------------------
+  //  Internal methods
+  // ---------------------------------------------
+
+  /**
+   * Returns the struct type of the result. When it is not known yet, the
+   * result iterator is poked once (`hasNext`) so that chunks get decoded.
+   */
+  def getResultType: StructType = {
+    if (structType == null) resultIterator.hasNext
+    structType
+  }
+
+  /**
+   * Rows arrive grouped in chunks, so this iterator walks an outer sequence
+   * of chunks and, inside each chunk, an inner sequence of rows.
+   */
+  private lazy val resultIterator = new Iterator[Array[Object]] {
+
+    private var currentIterator: Iterator[Array[Object]] = nextIterator()
+
+    override def hasNext: Boolean =
+      if (currentIterator.hasNext) true
+      else if (currentIterator == Iterator.empty) false // end marker was reached earlier
+      else {
+        // current chunk is used up: move on to the next one
+        currentIterator = nextIterator()
+        currentIterator.hasNext
+      }
+
+    /** Note: call `hasNext` first so that `currentIterator` is positioned correctly. */
+    override def next(): Array[Object] = currentIterator.next()
+  }
+
+  /** Takes the next chunk from the queue (blocking) and returns an iterator over it. */
+  private def nextIterator(): Iterator[Array[Object]] = {
+    val chunk: Array[Byte] = queue.take
+    if (chunk.nonEmpty) decodeChunk(chunk) else Iterator.empty
+  }
+
+  /** Dispatches on the leading type byte of a non-empty chunk. */
+  private def decodeChunk(chunk: Array[Byte]): Iterator[Array[Object]] = {
+    val input = new ByteArrayDataInput()
+    input.initialize(chunk, Version.CURRENT)
+    val chunkType = input.readByte()
+    chunkType match {
+      case TYPE_CHUNK =>
+        // remember the struct type (first one wins), then continue with the following chunk
+        if (structType == null)
+          structType = DataSerializer.readObject(input).asInstanceOf[StructTypeImpl]
+        nextIterator()
+      case DATA_CHUNK =>
+        // no type chunk seen so far: fall back to the sender's KeyValueType
+        if (structType == null) structType = StructStreamingResultSender.KeyValueType
+        chunkToIterator(input, structType.getFieldNames.length)
+      case ERROR_CHUNK =>
+        val error = DataSerializer.readObject(input).asInstanceOf[Exception]
+        errorPropagationIterator(error)
+      case _ => throw new RuntimeException(s"unknown chunk type: $chunkType")
+    }
+  }
+
+  /** Builds an iterator whose every access rethrows the sender's exception, wrapped once. */
+  private def errorPropagationIterator(ex: Exception) = new Iterator[Array[Object]] {
+    val re = new RuntimeException(ex)
+    override def hasNext: Boolean = throw re
+    override def next(): Array[Object] = throw re
+  }
+
+  /** Turns the remaining bytes of a data chunk into an iterator of rows with `rowSize` fields. */
+  private def chunkToIterator(input: ByteArrayDataInput, rowSize: Int) = new Iterator[Array[Object]] {
+    val tmpInput = new ByteArrayDataInput()
+
+    override def hasNext: Boolean = input.available() > 0
+
+    // fields are read strictly in order, one tag byte followed by its payload
+    override def next(): Array[Object] = Array.fill[Object](rowSize)(readField())
+
+    private def readField(): Object = {
+      val tag = input.readByte()
+      tag match {
+        case SER_DATA =>
+          // payload is a byte array that itself contains a serialized object
+          val arr: Array[Byte] = DataSerializer.readByteArray(input)
+          tmpInput.initialize(arr, Version.CURRENT)
+          DataSerializer.readObject(tmpInput).asInstanceOf[Object]
+        case UNSER_DATA =>
+          DataSerializer.readObject(input).asInstanceOf[Object]
+        case BYTEARR_DATA =>
+          DataSerializer.readByteArray(input).asInstanceOf[Object]
+        case _ =>
+          throw new RuntimeException(s"unknown data type $tag")
+      }
+    }
+  }
+
+}
+
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/QueryParser.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/QueryParser.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/QueryParser.scala
new file mode 100644
index 0000000..3f6dfad
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/QueryParser.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal.oql
+
+import scala.util.parsing.combinator.RegexParsers
+
+/**
+ * Parser-combinator grammar for the subset of OQL handled here:
+ * optional IMPORT clauses, SELECT [DISTINCT] projection FROM regions [WHERE filter].
+ * The only thing kept from a successful parse is the text of the FROM regions.
+ */
+class QueryParser extends RegexParsers {
+
+ // Top-level rule. `~>` drops what is on its left and `<~` drops what is on its
+ // right, so the value of the whole rule is the String produced by `regions`
+ // (the `toString` of the list of region paths).
+ def query: Parser[String] = opt(rep(IMPORT ~ PACKAGE)) ~> select ~> opt(distinct) ~> projection ~> from ~> regions <~ opt(where ~ filter) ^^ {
+ _.toString
+ }
+
+ // Keywords, matched letter by letter in either case.
+ val IMPORT: Parser[String] = "[Ii][Mm][Pp][Oo][Rr][Tt]".r
+
+ val select: Parser[String] = "[Ss][Ee][Ll][Ee][Cc][Tt]".r
+
+ val distinct: Parser[String] = "[Dd][Ii][Ss][Tt][Ii][Nn][Cc][Tt]".r
+
+ val from: Parser[String] = "[Ff][Rr][Oo][Mm]".r
+
+ val where: Parser[String] = "[Ww][Hh][Ee][Rr][Ee]".r
+
+ // Name following IMPORT: word characters and dots.
+ def PACKAGE: Parser[String] = """[\w.]+""".r
+
+ // Either a literal "*" or a comma separated list of projection terms. `^^` binds
+ // tighter than `|`, so the `toString` conversion applies to the list alternative only.
+ def projection: Parser[String] = "*" | repsep("""["\w]+[.\w"]*""".r, ",") ^^ {
+ _.toString
+ }
+
+ // Comma separated regions, each optionally followed by an alias that is discarded.
+ // The result is the `toString` of the resulting list of region strings.
+ def regions: Parser[String] = repsep(region <~ opt(alias), ",") ^^ {
+ _.toString
+ }
+
+ // A region is either a path starting with "/" or a plain dotted identifier.
+ // NOTE(review): in java.util.regex the trailing `[/[\w.]+]*` is a character class
+ // (union of '/', word characters, '.' and '+'), not a repeated group - confirm
+ // that this is the intended grammar.
+ def region: Parser[String] = """/[\w.]+[/[\w.]+]*""".r | """[\w]+[.\w]*""".r
+
+ // An alias is a word that is not the WHERE keyword.
+ def alias: Parser[String] = not(where) ~> """[\w]+""".r
+
+ // Text of the WHERE condition.
+ // NOTE(review): as in `region`, the bracketed tail `[[\s]+[<>=.'\w]+]*` is a
+ // character class rather than a group - confirm which conditions must be accepted.
+ def filter: Parser[String] = """[\w.]+[[\s]+[<>=.'\w]+]*""".r
+}
+
+object QueryParser extends QueryParser {
+
+  /**
+   * Applies the `query` rule to the complete `expression` through `parseAll`
+   * and hands back the resulting parse result.
+   */
+  def parseOQL(expression: String) = {
+    parseAll(query, expression)
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/QueryRDD.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/QueryRDD.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/QueryRDD.scala
new file mode 100644
index 0000000..474aa6a
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/QueryRDD.scala
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal.oql
+
+import io.pivotal.geode.spark.connector.GeodeConnectionConf
+import io.pivotal.geode.spark.connector.internal.rdd.{GeodeRDDPartition, ServerSplitsPartitioner}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.{TaskContext, SparkContext, Partition}
+import scala.reflect.ClassTag
+
+/**
+ * An RDD that provides the functionality that read the OQL query result
+ *
+ * @param sc The SparkContext this RDD is associated with
+ * @param queryString The OQL query string
+ * @param connConf The GeodeConnectionConf that provide the GeodeConnection
+ */
+class QueryRDD[T](@transient sc: SparkContext,
+                  queryString: String,
+                  connConf: GeodeConnectionConf)
+                 (implicit ct: ClassTag[T])
+  extends RDD[T](sc, Seq.empty) {
+
+  /**
+   * Computes the partitions from the metadata of the region named in the query:
+   * the server splits for a partitioned region, otherwise a single partition
+   * with an empty bucket set.
+   *
+   * @throws RuntimeException if the query cannot be parsed or no metadata is
+   *                          found for the region
+   */
+  override def getPartitions: Array[Partition] = {
+    val conn = connConf.getConnection
+    val regionPath = getRegionPathFromQuery(queryString)
+    val md = conn.getRegionMetadata(regionPath)
+    md match {
+      case Some(metadata) if metadata.isPartitioned =>
+        val splits = ServerSplitsPartitioner.partitions(conn, metadata, Map.empty)
+        logInfo(s"QueryRDD.getPartitions():isPartitioned=true, partitions=${splits.mkString(",")}")
+        splits
+      case Some(_) =>
+        logInfo(s"QueryRDD.getPartitions():isPartitioned=false")
+        Array[Partition](new GeodeRDDPartition(0, Set.empty))
+      case None => throw new RuntimeException(s"Region $regionPath metadata was not found.")
+    }
+  }
+
+  /**
+   * Executes the query against the buckets of `split` and returns the result iterator.
+   *
+   * @throws RuntimeException if the query result is not an iterator
+   */
+  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
+    val buckets = split.asInstanceOf[GeodeRDDPartition].bucketSet
+    val regionPath = getRegionPathFromQuery(queryString)
+    val result = connConf.getConnection.executeQuery(regionPath, buckets, queryString)
+    result match {
+      // the element type is erased at runtime, so only "is an Iterator" can be checked
+      case it: Iterator[_] =>
+        logInfo(s"QueryRDD.compute():query=$queryString, partition=$split")
+        it.asInstanceOf[Iterator[T]]
+      case _ =>
+        // interpolation copes with a null result, unlike result.toString
+        throw new RuntimeException(s"Unexpected OQL result: $result")
+    }
+  }
+
+  /**
+   * Extracts the first region path from the query. The parser returns the
+   * regions as the text of a list, e.g. `List(/a, /b)`; the path is the text
+   * after the first "/" up to the first ".", "," or ")".
+   *
+   * NOTE(review): for a region written without a leading "/" no slash is
+   * found and the extracted text starts at the beginning of the list text -
+   * confirm whether such queries have to be supported.
+   *
+   * @throws RuntimeException with the parser's diagnostic if the query does
+   *                          not match the grammar
+   */
+  private def getRegionPathFromQuery(queryString: String): String = {
+    val parsed = QueryParser.parseOQL(queryString)
+    if (!parsed.successful)
+      throw new RuntimeException(s"Unable to parse OQL query '$queryString': $parsed")
+    val regions = parsed.get
+    val start = regions.indexOf("/") + 1
+    // a "." or "," only counts as terminator when it occurs at an index > 0
+    val separators = Seq(".", ",").map(sep => regions.indexOf(sep)).filter(_ > 0)
+    val end = (regions.indexOf(")") +: separators).min
+    regions.substring(start, end)
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/QueryResultCollector.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/QueryResultCollector.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/QueryResultCollector.scala
new file mode 100644
index 0000000..718d816
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/QueryResultCollector.scala
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal.oql
+
+import java.util.concurrent.{TimeUnit, LinkedBlockingDeque}
+
+import org.apache.geode.DataSerializer
+import org.apache.geode.cache.execute.ResultCollector
+import org.apache.geode.distributed.DistributedMember
+import org.apache.geode.internal.{Version, ByteArrayDataInput}
+
+/**
+ * Result collector for OQL query results: chunks of serialized objects are
+ * queued as they arrive and exposed as one iterator of deserialized objects.
+ */
+class QueryResultCollector extends ResultCollector[Array[Byte], Iterator[Object]] {
+
+  // Received chunks; an empty array marks the end of the data.
+  private val queue = new LinkedBlockingDeque[Array[Byte]]()
+
+  /** Returns the (lazily created) iterator over all received objects. */
+  override def getResult: Iterator[Object] = resultIterator
+
+  /** The timed variant is not supported and always throws. */
+  override def getResult(timeout: Long, unit: TimeUnit): Iterator[Object] =
+    throw new UnsupportedOperationException
+
+  /** Enqueues `chunk` unless it is null or empty. */
+  override def addResult(memberID: DistributedMember, chunk: Array[Byte]): Unit =
+    if (chunk != null && chunk.length > 0) {
+      queue.add(chunk)
+    }
+
+  /** Enqueues an empty array as the end-of-data marker. */
+  override def endResults(): Unit = queue.add(Array.empty[Byte])
+
+  /** Drops every chunk that is still waiting in the queue. */
+  override def clearResults(): Unit = queue.clear()
+
+  // Walks the chunks one after another and, inside each chunk, its objects.
+  private lazy val resultIterator: Iterator[Object] = new Iterator[Object] {
+    private var currentIterator = nextIterator()
+
+    override def hasNext: Boolean = {
+      // Iterator.empty stands for the end marker: never take from the queue again after it
+      if (!currentIterator.hasNext && currentIterator != Iterator.empty)
+        currentIterator = nextIterator()
+      currentIterator.hasNext
+    }
+
+    /** Note: call `hasNext` first so that `currentIterator` is positioned correctly. */
+    override def next(): Object = currentIterator.next()
+  }
+
+  /** Takes the next chunk from the queue (blocking) and returns an iterator over its objects. */
+  private def nextIterator(): Iterator[Object] = {
+    val chunk = queue.take
+    if (chunk.isEmpty) {
+      Iterator.empty
+    }
+    else {
+      val input = new ByteArrayDataInput
+      input.initialize(chunk, Version.CURRENT)
+      new Iterator[Object] {
+        override def hasNext: Boolean = input.available() > 0
+        override def next(): Object = DataSerializer.readObject(input).asInstanceOf[Object]
+      }
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/RDDConverter.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/RDDConverter.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/RDDConverter.scala
new file mode 100644
index 0000000..6a1611c
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/RDDConverter.scala
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal.oql
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.{DataFrame, SQLContext}
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.sources.{BaseRelation, TableScan}
+
+import scala.tools.nsc.backend.icode.analysis.DataFlowAnalysis
+
+/**
+ * Relation over a [[QueryRDD]]: the schema comes from `SchemaBuilder` and the
+ * rows from `RowBuilder`, both constructed from the wrapped RDD on each call.
+ */
+case class OQLRelation[T](queryRDD: QueryRDD[T])(@transient val sqlContext: SQLContext) extends BaseRelation with TableScan {
+
+  /** Schema as produced by a new `SchemaBuilder` for `queryRDD`. */
+  override def schema: StructType = {
+    val schemaBuilder = new SchemaBuilder(queryRDD)
+    schemaBuilder.toSparkSchema()
+  }
+
+  /** Rows as produced by a new `RowBuilder` for `queryRDD`. */
+  override def buildScan(): RDD[Row] = {
+    val rowBuilder = new RowBuilder(queryRDD)
+    rowBuilder.toRowRDD()
+  }
+
+}
+
+object RDDConverter {
+
+  /**
+   * Wraps `queryRDD` in an `OQLRelation` bound to `sqlContext` and converts
+   * that relation to a DataFrame through `baseRelationToDataFrame`.
+   */
+  def queryRDDToDataFrame[T](queryRDD: QueryRDD[T], sqlContext: SQLContext): DataFrame = {
+    val relation = OQLRelation(queryRDD)(sqlContext)
+    sqlContext.baseRelationToDataFrame(relation)
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/RowBuilder.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/RowBuilder.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/RowBuilder.scala
new file mode 100644
index 0000000..acbabc1
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/RowBuilder.scala
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal.oql
+
+import org.apache.geode.cache.query.internal.StructImpl
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql._
+
+class RowBuilder[T](queryRDD: QueryRDD[T]) {
+
+  /**
+   * Convert QueryRDD to RDD of Row.
+   *
+   * A `StructImpl` element becomes a Row of its field values; any other
+   * element becomes a single-column Row holding the element itself. This
+   * includes a `null` element, which previously matched no case and raised a
+   * `MatchError`, although SchemaBuilder describes a null row as one column.
+   *
+   * @return RDD of Rows
+   */
+  def toRowRDD(): RDD[Row] =
+    queryRDD.map {
+      case si: StructImpl => Row.fromSeq(si.getFieldValues)
+      case other => Row.fromSeq(Seq(other))
+    }
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/SchemaBuilder.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/SchemaBuilder.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/SchemaBuilder.scala
new file mode 100644
index 0000000..44972839
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/SchemaBuilder.scala
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal.oql
+
+import org.apache.geode.cache.query.internal.StructImpl
+import org.apache.spark.sql.types._
+import scala.collection.mutable.ListBuffer
+import org.apache.spark.Logging
+
+/**
+ * Derives a Spark SQL schema from the first element of a `QueryRDD`.
+ *
+ * @param queryRDD the query result RDD to inspect
+ */
+class SchemaBuilder[T](queryRDD: QueryRDD[T]) extends Logging {
+
+  /** Empty struct type, used as the data type of every class missing from `typeMap`. */
+  val nullStructType = StructType(Nil)
+
+  /** Lookup table from the exact runtime class of a value to its Spark SQL data type. */
+  val typeMap:Map[Class[_], DataType] = Map(
+    classOf[java.lang.String] -> StringType,
+    classOf[java.lang.Integer] -> IntegerType,
+    classOf[java.lang.Short] -> ShortType,
+    classOf[java.lang.Long] -> LongType,
+    classOf[java.lang.Double] -> DoubleType,
+    classOf[java.lang.Float] -> FloatType,
+    classOf[java.lang.Boolean] -> BooleanType,
+    classOf[java.lang.Byte] -> ByteType,
+    classOf[java.util.Date] -> DateType,
+    classOf[java.lang.Object] -> nullStructType
+  )
+
+  /**
+   * Analyse QueryRDD to get the Spark schema. Only the first element of the RDD
+   * is examined: a `StructImpl` yields one field per struct field, anything else
+   * yields a single field named "col1".
+   * @return The schema represented by Spark StructType
+   */
+  def toSparkSchema(): StructType = {
+    // Wraps the data type of a non-struct element into a one-column schema.
+    def singleColumn(dataType: DataType): StructType =
+      StructType(StructField("col1", dataType) :: Nil)
+
+    val schema = queryRDD.first() match {
+      case struct: StructImpl => constructFromStruct(struct)
+      case null => singleColumn(NullType)
+      case other => singleColumn(typeMap.getOrElse(other.getClass(), nullStructType))
+    }
+    logInfo(s"Schema: $schema")
+    schema
+  }
+
+  /**
+   * Build a struct schema with one field per struct field name; the data type of
+   * each field is looked up from the runtime class of the matching field value
+   * (`NullType` for a null value, `nullStructType` for an unmapped class).
+   */
+  def constructFromStruct(r:StructImpl) = {
+    val names = r.getFieldNames
+    val values = r.getFieldValues
+    val fields = names.indices.map { i =>
+      val dataType = values(i) match {
+        case null => NullType
+        case other => typeMap.getOrElse(other.getClass, nullStructType)
+      }
+      StructField(names(i), dataType)
+    }
+    StructType(fields)
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/UndefinedSerializer.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/UndefinedSerializer.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/UndefinedSerializer.scala
new file mode 100644
index 0000000..2809a73
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/oql/UndefinedSerializer.scala
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal.oql
+
+import com.esotericsoftware.kryo.{Kryo, Serializer}
+import com.esotericsoftware.kryo.io.{Output, Input}
+import org.apache.geode.cache.query.QueryService
+import org.apache.geode.cache.query.internal.Undefined
+
+/**
+ * Customized Kryo serializer for QueryService.UNDEFINED, i.e.
+ * org.apache.geode.cache.query.internal.Undefined. It exists so that
+ * deserialization within Spark hands back the singleton Undefined
+ * rather than a fresh instance.
+ */
+class UndefinedSerializer extends Serializer[Undefined] {
+
+  /** Writes a single byte, the DSFID of the given Undefined. */
+  def write(kryo: Kryo, output: Output, u: Undefined): Unit = {
+    output.writeByte(u.getDSFID)
+  }
+
+  /**
+   * Consumes the DSFID byte and returns QueryService.UNDEFINED; a new Undefined
+   * is constructed only when QueryService.UNDEFINED is null.
+   */
+  def read (kryo: Kryo, input: Input, tpe: Class[Undefined]): Undefined = {
+    // The byte carries no information beyond marking the value; just skip it.
+    input.readByte()
+    val shared = QueryService.UNDEFINED
+    if (shared != null) shared.asInstanceOf[Undefined] // avoid calling the constructor again
+    else new Undefined
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeJoinRDD.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeJoinRDD.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeJoinRDD.scala
new file mode 100644
index 0000000..f971a3e
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeJoinRDD.scala
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal.rdd
+
+import org.apache.geode.cache.Region
+import io.pivotal.geode.spark.connector.GeodeConnectionConf
+import org.apache.spark.{TaskContext, Partition}
+import org.apache.spark.rdd.RDD
+import scala.collection.JavaConversions._
+
+/**
+ * An `RDD[(T, V)]` holding the result of a join between the `left` RDD[T]
+ * and the Geode Region[K, V] found at `regionPath`. Left elements whose key
+ * has no (non-null) value in the region are dropped.
+ */
+class GeodeJoinRDD[T, K, V] private[connector]
+  ( left: RDD[T],
+    func: T => K,
+    val regionPath: String,
+    val connConf: GeodeConnectionConf
+  ) extends RDD[(T, V)](left.context, left.dependencies) {
+
+  /** validate region existence when this RDD object is created */
+  validate()
+
+  /** Asks the connection to validate the region at `regionPath`. */
+  private def validate(): Unit = connConf.getConnection.validateRegion[K, V](regionPath)
+
+  /** The partitions are exactly those of the `left` RDD. */
+  override protected def getPartitions: Array[Partition] = left.partitions
+
+  /** Joins one partition of `left` with the region, with or without the key function. */
+  override def compute(split: Partition, context: TaskContext): Iterator[(T, V)] = {
+    val region = connConf.getConnection.getRegionProxy[K, V](regionPath)
+    if (func != null) computeWithFunc(split, context, region)
+    else computeWithoutFunc(split, context, region)
+  }
+
+  /** T is (K, V1) since there's no map function `func`; the pair's first element is the key */
+  private def computeWithoutFunc(split: Partition, context: TaskContext, region: Region[K, V]): Iterator[(T, V)] = {
+    val leftPairs = left.iterator(split, context).toList.asInstanceOf[List[(K, _)]]
+    val keys = leftPairs.map { case (k, _) => k }.toSet
+    // getAll returns (key, null) for entries that do not exist, so those are removed
+    val found = region.getAll(keys).filter { case (_, v) => v != null }
+    leftPairs.flatMap { case (k, v) =>
+      found.get(k).map(r => ((k, v).asInstanceOf[T], r))
+    }.toIterator
+  }
+
+  /** The join key of each left element is computed with `func`. */
+  private def computeWithFunc(split: Partition, context: TaskContext, region: Region[K, V]): Iterator[(T, V)] = {
+    val keyed = left.iterator(split, context).toList.map(t => (t, func(t)))
+    val keys = keyed.map { case (_, k) => k }.toSet
+    // getAll returns (key, null) for entries that do not exist, so those are removed
+    val found = region.getAll(keys).filter { case (_, v) => v != null }
+    keyed.flatMap { case (t, k) => found.get(k).map(r => (t, r)) }.toIterator
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeOuterJoinRDD.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeOuterJoinRDD.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeOuterJoinRDD.scala
new file mode 100644
index 0000000..04855c1
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeOuterJoinRDD.scala
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal.rdd
+
+import org.apache.geode.cache.Region
+import io.pivotal.geode.spark.connector.GeodeConnectionConf
+import org.apache.spark.{TaskContext, Partition}
+import org.apache.spark.rdd.RDD
+import scala.collection.JavaConversions._
+
+/**
+ * An `RDD[(T, Option[V])]` holding the result of a left outer join between the
+ * `left` RDD[T] and the Geode Region[K, V] found at `regionPath`. Every left
+ * element is kept; the right side is `None` when the region returns null for its key.
+ */
+class GeodeOuterJoinRDD[T, K, V] private[connector]
+  ( left: RDD[T],
+    func: T => K,
+    val regionPath: String,
+    val connConf: GeodeConnectionConf
+  ) extends RDD[(T, Option[V])](left.context, left.dependencies) {
+
+  /** validate region existence when this RDD object is created */
+  validate()
+
+  /** Asks the connection to validate the region at `regionPath`. */
+  private def validate(): Unit = connConf.getConnection.validateRegion[K, V](regionPath)
+
+  /** The partitions are exactly those of the `left` RDD. */
+  override protected def getPartitions: Array[Partition] = left.partitions
+
+  /** Joins one partition of `left` with the region, with or without the key function. */
+  override def compute(split: Partition, context: TaskContext): Iterator[(T, Option[V])] = {
+    val region = connConf.getConnection.getRegionProxy[K, V](regionPath)
+    if (func != null) computeWithFunc(split, context, region)
+    else computeWithoutFunc(split, context, region)
+  }
+
+  /** T is (K1, V1), and K1 and K are the same type since there's no map function `func` */
+  private def computeWithoutFunc(split: Partition, context: TaskContext, region: Region[K, V]): Iterator[(T, Option[V])] = {
+    val leftPairs = left.iterator(split, context).toList.asInstanceOf[List[(K, _)]]
+    val keys = leftPairs.map { case (k, _) => k }.toSet
+    // getAll returns (key, null) for entries that do not exist
+    val fetched = region.getAll(keys)
+    leftPairs.map { case (k, v) =>
+      // fetched is a java.util.Map, so its get() result is wrapped into an Option
+      val joined = Option(fetched.get(k))
+      ((k, v).asInstanceOf[T], joined)
+    }.toIterator
+  }
+
+  /** The join key of each left element is computed with `func`. */
+  private def computeWithFunc(split: Partition, context: TaskContext, region: Region[K, V]): Iterator[(T, Option[V])] = {
+    val keyed = left.iterator(split, context).toList.map(t => (t, func(t)))
+    val keys = keyed.map { case (_, k) => k }.toSet
+    // getAll returns (key, null) for entries that do not exist
+    val fetched = region.getAll(keys)
+    keyed.map { case (t, k) =>
+      // fetched is a java.util.Map, so its get() result is wrapped into an Option
+      val joined = Option(fetched.get(k))
+      (t, joined)
+    }.toIterator
+  }
+}
+
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDPartition.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDPartition.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDPartition.scala
new file mode 100644
index 0000000..24fe72e
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDPartition.scala
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal.rdd
+
+import org.apache.spark.Partition
+
+/**
+ * Serializable description of a single GeodeRDD partition. A partition is mapped
+ * to one or more buckets of a region, and carries everything the GeodeRDD needs
+ * to materialize the data of that partition.
+ * @param partitionId partition id, a 0 based number.
+ * @param bucketSet region bucket id set for this partition. Set.empty means whole
+ * region (used for replicated region)
+ * @param locations preferred location for this partition
+ */
+case class GeodeRDDPartition(
+    partitionId: Int,
+    bucketSet: Set[Int],
+    locations: Seq[String] = Nil
+  ) extends Partition {
+
+  /** The Spark partition index is the partition id. */
+  override def index: Int = partitionId
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDPartitioner.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDPartitioner.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDPartitioner.scala
new file mode 100644
index 0000000..d960cab
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDPartitioner.scala
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal.rdd
+
+import io.pivotal.geode.spark.connector.GeodeConnection
+import io.pivotal.geode.spark.connector.internal.RegionMetadata
+import org.apache.spark.{Logging, Partition}
+
+import scala.reflect.ClassTag
+
+/**
+ * A GeodeRDD partitioner is used to partition the region into multiple RDD partitions.
+ * Implementations are looked up by their `name` through the companion object.
+ */
+trait GeodeRDDPartitioner extends Serializable {
+
+ /** The name under which this partitioner is registered and selected. */
+ def name: String
+
+ /**
+ * The function that generates partitions.
+ * @param conn connection to Geode
+ * @param md metadata of the region to partition
+ * @param env string key/value settings that an implementation may consult
+ * @return the generated RDD partitions
+ */
+ def partitions[K: ClassTag, V: ClassTag]
+ (conn: GeodeConnection, md: RegionMetadata, env: Map[String, String]): Array[Partition]
+}
+
+/** Registry of the available partitioners plus the defaults for each kind of region. */
+object GeodeRDDPartitioner extends Logging {
+
+  /** All known partitioners keyed by name; a new partitioner only has to be added to this list */
+  final val partitioners: Map[String, GeodeRDDPartitioner] =
+    List(OnePartitionPartitioner, ServerSplitsPartitioner).map(p => p.name -> p).toMap
+
+  /**
+   * Look up a partitioner by name. When no partitioner is registered under the
+   * given name, a warning is logged and the default partitioner for partitioned
+   * regions is returned instead.
+   */
+  def apply(name: String = defaultPartitionedRegionPartitioner.name): GeodeRDDPartitioner =
+    partitioners.getOrElse(name, {
+      logWarning(s"Invalid preferred partitioner name $name.")
+      defaultPartitionedRegionPartitioner
+    })
+
+  /** Default partitioner for replicated regions. */
+  val defaultReplicatedRegionPartitioner = OnePartitionPartitioner
+
+  /** Default partitioner for partitioned regions. */
+  val defaultPartitionedRegionPartitioner = ServerSplitsPartitioner
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDPartitionerImpl.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDPartitionerImpl.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDPartitionerImpl.scala
new file mode 100644
index 0000000..4606114
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDPartitionerImpl.scala
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal.rdd
+
+import io.pivotal.geode.spark.connector.GeodeConnection
+import io.pivotal.geode.spark.connector.internal.RegionMetadata
+import io.pivotal.geode.spark.connector.NumberPartitionsPerServerPropKey
+import org.apache.spark.Partition
+import scala.collection.JavaConversions._
+import scala.collection.immutable.SortedSet
+import scala.collection.mutable
+import scala.reflect.ClassTag
+
+/** Partitioner that represents the whole region as one single GeodeRDDPartition */
+object OnePartitionPartitioner extends GeodeRDDPartitioner {
+
+  override val name = "OnePartition"
+
+  /** Always returns one partition with index 0 and an empty bucket set. */
+  override def partitions[K: ClassTag, V: ClassTag]
+    (conn: GeodeConnection, md: RegionMetadata, env: Map[String, String]): Array[Partition] = {
+    val wholeRegion: Partition = GeodeRDDPartition(0, Set.empty)
+    Array(wholeRegion)
+  }
+}
+
+/**
+ * This partitioner maps whole region to N * M Geode RDD partitions, where M is the number of
+ * Geode servers that contain the data for the given region. N is read from the `env` entry
+ * keyed by `NumberPartitionsPerServerPropKey`; the default value of N is 2.
+ */
+object ServerSplitsPartitioner extends GeodeRDDPartitioner {
+
+  override val name = "ServerSplits"
+
+  /**
+   * Generates the partitions for the region described by `md`.
+   *
+   * A single partition with an empty bucket set is returned when the region is not
+   * partitioned or has no server-to-bucket mapping; otherwise the mapping is handed
+   * to `doPartitions`.
+   *
+   * @throws RuntimeException if `md` is null
+   */
+  override def partitions[K: ClassTag, V: ClassTag]
+    (conn: GeodeConnection, md: RegionMetadata, env: Map[String, String]): Array[Partition] = {
+    if (md == null) throw new RuntimeException("RegionMetadata is null")
+    // A setting that is not a valid integer falls back to the default of 2.
+    val n = try { env.getOrElse(NumberPartitionsPerServerPropKey, "2").toInt } catch { case e: NumberFormatException => 2 }
+    if (!md.isPartitioned || md.getServerBucketMap == null || md.getServerBucketMap.isEmpty)
+      Array[Partition](new GeodeRDDPartition(0, Set.empty))
+    else {
+      // Host names replace the server objects only after toList, so two servers on
+      // the same host stay separate entries instead of colliding as map keys.
+      val map = mapAsScalaMap(md.getServerBucketMap)
+        .map { case (srv, set) => (srv, asScalaSet(set).map(_.toInt)) }.toList
+        .map { case (srv, set) => (srv.getHostName, set) }
+      doPartitions(map, md.getTotalBuckets, n)
+    }
+  }
+
+  /**
+   * Converts server to bucket ID set list to array of RDD partitions.
+   *
+   * @param serverBucketMap pairs of server host name and the bucket IDs held by that server
+   * @param totalBuckets total number of buckets of the region; IDs run from 0 until this value
+   * @param n requested number of splits per server; values below 1 are treated as 1
+   * @return the partitions, indexed from 0
+   */
+  def doPartitions(serverBucketMap: List[(String, mutable.Set[Int])], totalBuckets: Int, n: Int)
+    : Array[Partition] = {
+
+    // A negative split count would produce a non-positive group size, which
+    // `grouped` rejects with an IllegalArgumentException; use at least one split.
+    val splitsPerServer = math.max(n, 1)
+
+    // method that calculates the group size for splitting "k" items into "g" groups
+    def groupSize(k: Int, g: Int): Int = scala.math.ceil(k / g.toDouble).toInt
+
+    // 1. convert list of server and bucket set pairs to a list of server and sorted bucket set pairs
+    val srvToSortedBucketSet = serverBucketMap.map { case (srv, set) => (srv, SortedSet[Int]() ++ set) }
+
+    // 2. split bucket set of each server into n splits if possible, and server to Seq(server)
+    val srvToSplitBucketSet = srvToSortedBucketSet.flatMap { case (host, set) =>
+      if (set.isEmpty) Nil
+      else set.grouped(groupSize(set.size, splitsPerServer)).toList.map(s => (Seq(host), s))
+    }
+
+    // 3. calculate empty bucket IDs by removing all bucket sets of all servers from the full bucket sets
+    val emptyIDs = SortedSet[Int]() ++
+      srvToSortedBucketSet.foldLeft((0 until totalBuckets).toSet) { case (s1, (k, s2)) => s1 &~ s2 }
+
+    // 4. distribute empty bucket IDs to all partitions evenly.
+    //    The empty buckets do not contain data when partitions are created, but they may contain data
+    //    when RDD is materialized, so need to include those bucket IDs in the partitions.
+    val srvToFinalBucketSet = if (emptyIDs.isEmpty) srvToSplitBucketSet
+      else srvToSplitBucketSet.zipAll(
+        emptyIDs.grouped(groupSize(emptyIDs.size, srvToSplitBucketSet.size)).toList, (Nil, Set.empty), Set.empty).map {
+          case ((server, set1), set2) => (server, SortedSet[Int]() ++ set1 ++ set2) }
+
+    // 5. create array of partitions w/ 0-based index
+    (0 until srvToFinalBucketSet.size).toList.zip(srvToFinalBucketSet).map {
+      case (i, (srv, set)) => new GeodeRDDPartition(i, set, srv) }.toArray
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDWriter.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDWriter.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDWriter.scala
new file mode 100644
index 0000000..27559f3
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRDDWriter.scala
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal.rdd
+
+import org.apache.geode.cache.Region
+import io.pivotal.geode.spark.connector._
+import org.apache.spark.{Logging, TaskContext}
+
+import scala.collection.Iterator
+import java.util.{HashMap => JMap}
+
+/** Base class holding the code shared by the pair and the non-pair RDD writer */
+private[rdd] abstract class GeodeRDDWriterBase (opConf: Map[String, String]) extends Serializable {
+
+  /**
+   * Batch size read from `opConf` under `RDDSaveBatchSizePropKey`; falls back to
+   * `RDDSaveBatchSizeDefault` when the entry is missing or is not a valid integer.
+   */
+  val batchSize = {
+    val configured = opConf.getOrElse(RDDSaveBatchSizePropKey, RDDSaveBatchSizeDefault.toString)
+    try configured.toInt
+    catch { case _: NumberFormatException => RDDSaveBatchSizeDefault }
+  }
+
+  /**
+   * Renders at most `num` entries of the map as a string, followed by " ..."
+   * when the map holds more than `num` entries.
+   */
+  def mapDump(map: Map[_, _], num: Int): String = {
+    // One extra entry is taken so that "more than num" can be detected.
+    val head = map.take(num + 1)
+    val suffix = if (head.size > num) " ..." else ""
+    s"$head$suffix"
+  }
+}
+
+/**
+ * Writer whose `write` function saves the partitions of a non-pair RDD to Geode.
+ * The function is meant to be executed on Spark executors.
+ * @param regionPath the full path of the region where the data is written to
+ * @param connConf the connection configuration used to obtain the region proxy
+ * @param opConf operation settings, passed on to the base class
+ */
+class GeodeRDDWriter[T, K, V]
+  (regionPath: String, connConf: GeodeConnectionConf, opConf: Map[String, String] = Map.empty)
+  extends GeodeRDDWriterBase(opConf) with Serializable with Logging {
+
+  /**
+   * Converts every element of `data` into a key/value pair with `func` and puts
+   * the pairs into the region in batches of `batchSize` elements.
+   */
+  def write(func: T => (K, V))(taskContext: TaskContext, data: Iterator[T]): Unit = {
+    val region: Region[K, V] = connConf.getConnection.getRegionProxy[K, V](regionPath)
+    var count = 0
+    for (chunk <- data.grouped(batchSize)) {
+      // One putAll call per batch.
+      val batch = new JMap[K, V]()
+      chunk.foreach { t =>
+        val (k, v) = func(t)
+        batch.put(k, v)
+      }
+      region.putAll(batch)
+      count += chunk.length
+    }
+    logDebug(s"$count entries (batch.size = $batchSize) are saved to region $regionPath")
+  }
+}
+
+
+/**
+ * Writer object that provides write function that saves pair RDD partitions to Geode.
+ * Those functions will be executed on Spark executors.
+ * @param regionPath the full path of the region where the data is written to
+ * @param connConf the connection configuration used to obtain the region proxy
+ * @param opConf operation settings, passed on to the base class
+ */
+class GeodePairRDDWriter[K, V]
+  (regionPath: String, connConf: GeodeConnectionConf, opConf: Map[String, String] = Map.empty)
+  extends GeodeRDDWriterBase(opConf) with Serializable with Logging {
+
+  /** Puts the key/value pairs of `data` into the region in batches of `batchSize` pairs. */
+  def write(taskContext: TaskContext, data: Iterator[(K, V)]): Unit = {
+    val region: Region[K, V] = connConf.getConnection.getRegionProxy[K, V](regionPath)
+    var count = 0
+    val chunks = data.grouped(batchSize)
+    chunks.foreach { chunk =>
+      // One putAll call per batch.
+      val map = chunk.foldLeft(new JMap[K, V]()){case (m, (k,v)) => m.put(k,v); m}
+      region.putAll(map)
+      count += chunk.length
+    }
+    // Same "batch.size" label as the message logged by GeodeRDDWriter.
+    logDebug(s"$count entries (batch.size = $batchSize) are saved to region $regionPath")
+  }
+}
+
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRegionRDD.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRegionRDD.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRegionRDD.scala
new file mode 100644
index 0000000..6980c0f
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/internal/rdd/GeodeRegionRDD.scala
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.internal.rdd
+
+import scala.collection.Seq
+import scala.reflect.ClassTag
+import org.apache.spark.rdd.RDD
+import org.apache.spark.{TaskContext, Partition, SparkContext}
+import io.pivotal.geode.spark.connector.{GeodeConnectionConf, PreferredPartitionerPropKey}
+import io.pivotal.geode.spark.connector.internal.rdd.GeodeRDDPartitioner._
+
+/**
+ * This class exposes Geode region as a RDD.
+ * @param sc the Spark Context
+ * @param regionPath the full path of the region
+ * @param connConf the GeodeConnectionConf to access the region
+ * @param opConf the parameters for this operation, such as preferred partitioner.
+ * @param whereClause optional where clause used to filter the region data
+ */
+class GeodeRegionRDD[K, V] private[connector]
+  (@transient sc: SparkContext,
+   val regionPath: String,
+   val connConf: GeodeConnectionConf,
+   val opConf: Map[String, String] = Map.empty,
+   val whereClause: Option[String] = None
+  ) (implicit ctk: ClassTag[K], ctv: ClassTag[V])
+  extends RDD[(K, V)](sc, Seq.empty) {
+
+  /** validate region existence when GeodeRDD object is created */
+  validate()
+
+  /** Asks the connection to validate the region at `regionPath`. */
+  private def validate(): Unit = connConf.getConnection.validateRegion[K, V](regionPath)
+
+  /** The class tag of the key type. */
+  def kClassTag: ClassTag[K] = ctk
+
+  /** The class tag of the value type. */
+  def vClassTag: ClassTag[V] = ctv
+
+  /**
+   * method `copy` is used by method `where` that creates new immutable
+   * GeodeRDD instance based this instance. Every parameter defaults to the
+   * value held by this instance, so an omitted argument never drops a setting
+   * (in particular the where clause).
+   */
+  private def copy(
+    regionPath: String = regionPath,
+    connConf: GeodeConnectionConf = connConf,
+    opConf: Map[String, String] = opConf,
+    whereClause: Option[String] = whereClause
+  ): GeodeRegionRDD[K, V] = {
+
+    require(sc != null,
+      """RDD transformation requires a non-null SparkContext. Unfortunately
+        |SparkContext in this GeodeRDD is null. This can happen after
+        |GeodeRDD has been deserialized. SparkContext is not Serializable,
+        |therefore it deserializes to null. RDD transformations are not allowed
+        |inside lambdas used in other RDD transformations.""".stripMargin )
+
+    new GeodeRegionRDD[K, V](sc, regionPath, connConf, opConf, whereClause)
+  }
+
+  /** When where clause is specified, OQL query
+   * `select key, value from /<region-path>.entries where <where clause> `
+   * is used to filter the dataset. Returns this instance when no clause is given.
+   */
+  def where(whereClause: Option[String]): GeodeRegionRDD[K, V] = {
+    if (whereClause.isDefined) copy(whereClause = whereClause)
+    else this
+  }
+
+  /**
+   * this version is for Java API that doesn't use scala.Option. A null or blank
+   * clause returns this instance; otherwise the trimmed clause is applied.
+   */
+  def where(whereClause: String): GeodeRegionRDD[K, V] = {
+    if (whereClause == null || whereClause.trim.isEmpty) this
+    else copy(whereClause = Option(whereClause.trim))
+  }
+
+  /**
+   * Use preferred partitioner generate partitions. `defaultReplicatedRegionPartitioner`
+   * will be used if it's a replicated region.
+   * @throws RuntimeException if no metadata is found for the region
+   */
+  override def getPartitions: Array[Partition] = {
+    val conn = connConf.getConnection
+    val md = conn.getRegionMetadata[K, V](regionPath)
+    md match {
+      case None => throw new RuntimeException(s"region $regionPath was not found.")
+      case Some(data) =>
+        logInfo(s"""RDD id=${this.id} region=$regionPath conn=${connConf.locators.mkString(",")}, env=$opConf""")
+        val p = if (data.isPartitioned) preferredPartitioner else defaultReplicatedRegionPartitioner
+        val splits = p.partitions[K, V](conn, data, opConf)
+        logDebug(s"""RDD id=${this.id} region=$regionPath partitions=\n ${splits.mkString("\n ")}""")
+        splits
+    }
+  }
+
+  /**
+   * provide preferred location(s) (host name(s)) of the given partition.
+   * Only some partitioner implementation(s) provides this info, which is
+   * useful when Spark cluster and Geode cluster share some hosts.
+   */
+  override def getPreferredLocations(split: Partition): Seq[String] =
+    split.asInstanceOf[GeodeRDDPartition].locations
+
+  /**
+   * Get preferred partitioner. return `defaultPartitionedRegionPartitioner` if none
+   * preference is specified.
+   */
+  private def preferredPartitioner =
+    GeodeRDDPartitioner(opConf.getOrElse(
+      PreferredPartitionerPropKey, GeodeRDDPartitioner.defaultPartitionedRegionPartitioner.name))
+
+  /** materialize a RDD partition by fetching its region data through the connection */
+  override def compute(split: Partition, context: TaskContext): Iterator[(K, V)] = {
+    val partition = split.asInstanceOf[GeodeRDDPartition]
+    logDebug(s"compute RDD id=${this.id} partition $partition")
+    connConf.getConnection.getRegionData[K,V](regionPath, whereClause, partition)
+    // new InterruptibleIterator(context, split.asInstanceOf[GeodeRDDPartition[K, V]].iterator)
+  }
+}
+
+/** Companion object offering a factory method for GeodeRegionRDD. */
+object GeodeRegionRDD {
+
+  /** Creates a GeodeRegionRDD for the given region that has no where clause. */
+  def apply[K: ClassTag, V: ClassTag](sc: SparkContext, regionPath: String,
+    connConf: GeodeConnectionConf, opConf: Map[String, String] = Map.empty)
+    : GeodeRegionRDD[K, V] = {
+    val unfiltered = new GeodeRegionRDD[K, V](sc, regionPath, connConf, opConf, None)
+    unfiltered
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/javaapi/GeodeJavaRegionRDD.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/javaapi/GeodeJavaRegionRDD.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/javaapi/GeodeJavaRegionRDD.scala
new file mode 100644
index 0000000..f859173
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/javaapi/GeodeJavaRegionRDD.scala
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.javaapi
+
+import io.pivotal.geode.spark.connector.internal.rdd.GeodeRegionRDD
+import org.apache.spark.api.java.JavaPairRDD
+
+/**
+ * Exposes a [[GeodeRegionRDD]] as a `JavaPairRDD`, reusing the class tags
+ * (`kClassTag`, `vClassTag`) carried by the wrapped RDD.
+ */
+class GeodeJavaRegionRDD[K, V](rdd: GeodeRegionRDD[K, V])
+  extends JavaPairRDD[K, V](rdd)(rdd.kClassTag, rdd.vClassTag) {
+
+  /**
+   * Delegates to `where` of the wrapped RDD and wraps the result again.
+   * @param whereClause the clause handed to the underlying `GeodeRegionRDD.where`
+   */
+  def where(whereClause: String): GeodeJavaRegionRDD[K, V] = {
+    val filtered = rdd.where(whereClause)
+    new GeodeJavaRegionRDD(filtered)
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/javaapi/JavaAPIHelper.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/javaapi/JavaAPIHelper.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/javaapi/JavaAPIHelper.scala
new file mode 100644
index 0000000..ffa6195
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/javaapi/JavaAPIHelper.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.javaapi
+
+import org.apache.spark.api.java.{JavaPairRDD, JavaRDD}
+import org.apache.spark.streaming.api.java.{JavaPairDStream, JavaDStream}
+
+import scala.reflect.ClassTag
+import scala.collection.JavaConversions._
+
+/**
+ * A helper class to make it possible to access components written in Scala from Java code.
+ */
+private[connector] object JavaAPIHelper {
+
+  /** Returns a `ClassTag` of a given runtime class. */
+  def getClassTag[T](clazz: Class[T]): ClassTag[T] = ClassTag(clazz)
+
+  /**
+   * Produces a ClassTag[T], which is actually just a casted ClassTag[AnyRef].
+   * see JavaSparkContext.fakeClassTag in Spark for more info.
+   */
+  def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]]
+
+  /**
+   * Converts a Java `Properties` to a Scala immutable `Map[String, String]`.
+   *
+   * The conversion is spelled out with `JavaConverters` (`asScala`) instead of
+   * relying on an implicit `JavaConversions` view, so it is visible at the call site.
+   * The type parameters `K` and `V` are unused; they are kept only so that existing
+   * callers keep compiling.
+   *
+   * @param props the properties to copy
+   * @return an immutable snapshot of the entries of `props`
+   */
+  def propertiesToScalaMap[K, V](props: java.util.Properties): Map[String, String] = {
+    import scala.collection.JavaConverters._
+    props.asScala.toMap
+  }
+
+  /** convert a JavaRDD[(K,V)] to JavaPairRDD[K,V] */
+  def toJavaPairRDD[K, V](rdd: JavaRDD[(K, V)]): JavaPairRDD[K, V] =
+    JavaPairRDD.fromJavaRDD(rdd)
+
+  /** convert a JavaDStream[(K,V)] to JavaPairDStream[K,V] */
+  def toJavaPairDStream[K, V](ds: JavaDStream[(K, V)]): JavaPairDStream[K, V] =
+    JavaPairDStream.fromJavaDStream(ds)
+
+  /** an empty Map[String, String] for default opConf */
+  val emptyStrStrMap: Map[String, String] = Map.empty
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/package.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/package.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/package.scala
new file mode 100644
index 0000000..6f9a780
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/package.scala
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark
+
+import io.pivotal.geode.spark.connector.internal.rdd.{ServerSplitsPartitioner, OnePartitionPartitioner}
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SQLContext
+
+import scala.reflect.ClassTag
+
+/**
+ * The root package of Geode connector for Apache Spark.
+ * Hosts the connector-wide property keys and the implicit conversions that add
+ * geode-specific methods to `SparkContext`, `SQLContext` and `RDD`.
+ */
+package object connector {
+
+  /** constants */
+  final val GeodeLocatorPropKey = "spark.geode.locators"
+  // partitioner related keys and values
+  final val PreferredPartitionerPropKey = "preferred.partitioner"
+  final val NumberPartitionsPerServerPropKey = "number.partitions.per.server"
+  final val OnePartitionPartitionerName = OnePartitionPartitioner.name
+  final val ServerSplitsPartitionerName = ServerSplitsPartitioner.name
+
+  // RDD save batch size: property key and default value
+  final val RDDSaveBatchSizePropKey = "rdd.save.batch.size"
+  final val RDDSaveBatchSizeDefault = 10000
+
+  /** implicits */
+
+  /** wrap a `SparkContext` to expose the Geode functions */
+  implicit def toSparkContextFunctions(sc: SparkContext): GeodeSparkContextFunctions = {
+    new GeodeSparkContextFunctions(sc)
+  }
+
+  /** wrap a `SQLContext` to expose the Geode functions */
+  implicit def toSQLContextFunctions(sqlContext: SQLContext): GeodeSQLContextFunctions = {
+    new GeodeSQLContextFunctions(sqlContext)
+  }
+
+  /** wrap an RDD of pairs to expose the Geode pair-RDD functions */
+  implicit def toGeodePairRDDFunctions[K: ClassTag, V: ClassTag](
+      self: RDD[(K, V)]): GeodePairRDDFunctions[K, V] = {
+    new GeodePairRDDFunctions(self)
+  }
+
+  /** wrap any RDD to expose the Geode RDD functions */
+  implicit def toGeodeRDDFunctions[T: ClassTag](self: RDD[T]): GeodeRDDFunctions[T] = {
+    new GeodeRDDFunctions(self)
+  }
+
+  /** utility implicits */
+
+  /** convert Map[String, String] to java.util.Properties */
+  implicit def map2Properties(map: Map[String,String]): java.util.Properties = {
+    val props = new java.util.Properties
+    // copy every entry into the freshly created Properties instance
+    map.foreach { case (k, v) => props.put(k, v) }
+    props
+  }
+
+  /** internal util methods */
+
+  /**
+   * Render one entry per partition of `rdd` as "index: partition loc=preferred locations",
+   * joined with `sep`.
+   */
+  private[connector] def getRddPartitionsInfo(rdd: RDD[_], sep: String = "\n "): String = {
+    val indexed = rdd.partitions.zipWithIndex
+    val described = indexed.map { case (partition, idx) =>
+      s"$idx: $partition loc=${rdd.preferredLocations(partition)}"
+    }
+    described.mkString(sep)
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/streaming/GeodeDStreamFunctions.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/streaming/GeodeDStreamFunctions.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/streaming/GeodeDStreamFunctions.scala
new file mode 100644
index 0000000..4d46429
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/streaming/GeodeDStreamFunctions.scala
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector.streaming
+
+import io.pivotal.geode.spark.connector.GeodeConnectionConf
+import io.pivotal.geode.spark.connector.internal.rdd.{GeodePairRDDWriter, GeodeRDDWriter}
+import org.apache.spark.Logging
+import org.apache.spark.api.java.function.PairFunction
+import org.apache.spark.streaming.dstream.DStream
+
+/**
+ * Extra geode functions on DStream of non-pair elements through an implicit conversion.
+ * Import `io.pivotal.geode.spark.connector.streaming._` at the top of your program to
+ * use these functions.
+ */
+class GeodeDStreamFunctions[T](val dstream: DStream[T]) extends Serializable with Logging {
+
+  /**
+   * Save the DStream of non-pair elements to Geode key-value store.
+   * The region is validated once up front; afterwards a Spark job that writes the
+   * converted elements is run for every RDD of the stream.
+   * @param regionPath the full path of region that the DStream is stored
+   * @param func the function that converts elements of the DStream to key/value pairs
+   * @param connConf the GeodeConnectionConf object that provides connection to Geode cluster
+   * @param opConf the optional parameters for this operation
+   */
+  def saveToGeode[K, V](
+      regionPath: String,
+      func: T => (K, V),
+      connConf: GeodeConnectionConf = defaultConnectionConf,
+      opConf: Map[String, String] = Map.empty): Unit = {
+    connConf.getConnection.validateRegion[K, V](regionPath)
+    val geodeWriter = new GeodeRDDWriter[T, K, V](regionPath, connConf, opConf)
+    logInfo(s"""Save DStream region=$regionPath conn=${connConf.locators.mkString(",")}""")
+    dstream.foreachRDD { batch =>
+      batch.sparkContext.runJob(batch, geodeWriter.write(func) _)
+    }
+  }
+
+  /** this version of saveToGeode is just for Java API */
+  def saveToGeode[K, V](
+      regionPath: String,
+      func: PairFunction[T, K, V],
+      connConf: GeodeConnectionConf,
+      opConf: Map[String, String] ): Unit = {
+    // adapt the Java PairFunction to a Scala function and reuse the Scala variant
+    val toPair: T => (K, V) = func.call _
+    saveToGeode[K, V](regionPath, toPair, connConf, opConf)
+  }
+
+  /** connection configuration built from the SparkConf of the stream's SparkContext */
+  private[connector] def defaultConnectionConf: GeodeConnectionConf = {
+    val sparkConf = dstream.context.sparkContext.getConf
+    GeodeConnectionConf(sparkConf)
+  }
+}
+
+
+/**
+ * Extra geode functions on DStream of (key, value) pairs through an implicit conversion.
+ * Import `io.pivotal.geode.spark.connector.streaming._` at the top of your program to
+ * use these functions.
+ */
+class GeodePairDStreamFunctions[K, V](val dstream: DStream[(K,V)]) extends Serializable with Logging {
+
+  /**
+   * Save the DStream of pairs to Geode key-value store without any conversion.
+   * The region is validated once up front; afterwards a Spark job that writes the
+   * pairs is run for every RDD of the stream.
+   * @param regionPath the full path of region that the DStream is stored
+   * @param connConf the GeodeConnectionConf object that provides connection to Geode cluster
+   * @param opConf the optional parameters for this operation
+   */
+  def saveToGeode(
+      regionPath: String,
+      connConf: GeodeConnectionConf = defaultConnectionConf,
+      opConf: Map[String, String] = Map.empty): Unit = {
+    connConf.getConnection.validateRegion[K, V](regionPath)
+    val pairWriter = new GeodePairRDDWriter[K, V](regionPath, connConf, opConf)
+    logInfo(s"""Save DStream region=$regionPath conn=${connConf.locators.mkString(",")}""")
+    dstream.foreachRDD { batch =>
+      batch.sparkContext.runJob(batch, pairWriter.write _)
+    }
+  }
+
+  /** connection configuration built from the SparkConf of the stream's SparkContext */
+  private[connector] def defaultConnectionConf: GeodeConnectionConf = {
+    val sparkConf = dstream.context.sparkContext.getConf
+    GeodeConnectionConf(sparkConf)
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/106cc60f/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/streaming/package.scala
----------------------------------------------------------------------
diff --git a/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/streaming/package.scala b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/streaming/package.scala
new file mode 100644
index 0000000..0d1f1eb
--- /dev/null
+++ b/geode-spark-connector/geode-spark-connector/src/main/scala/org/apache/geode/spark/connector/streaming/package.scala
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.pivotal.geode.spark.connector
+
+import org.apache.spark.streaming.dstream.DStream
+
+/**
+ * Provides handy implicit conversions that add geode-specific methods to `DStream`.
+ */
+package object streaming {
+
+  /** wrap a DStream of arbitrary elements in [[GeodeDStreamFunctions]] */
+  implicit def toGeodeDStreamFunctions[T](ds: DStream[T]): GeodeDStreamFunctions[T] = {
+    new GeodeDStreamFunctions[T](ds)
+  }
+
+  /** wrap a DStream of (key, value) pairs in [[GeodePairDStreamFunctions]] */
+  implicit def toGeodePairDStreamFunctions[K, V](
+      ds: DStream[(K, V)]): GeodePairDStreamFunctions[K, V] = {
+    new GeodePairDStreamFunctions[K, V](ds)
+  }
+
+}