You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sedona.apache.org by ji...@apache.org on 2021/03/18 19:40:50 UTC

[incubator-sedona] branch master updated: Add SQL Join in Python Binder examples

This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-sedona.git


The following commit(s) were added to refs/heads/master by this push:
     new e0fa15d  Add SQL Join in Python Binder examples
e0fa15d is described below

commit e0fa15d7d8337a3a14d6296a0ddc320ad9bd7152
Author: Jia Yu <ji...@apache.org>
AuthorDate: Thu Mar 18 12:40:37 2021 -0700

    Add SQL Join in Python Binder examples
---
 binder/.gitignore                                  |   1 +
 binder/ApacheSedonaCore.ipynb                      |   2 +-
 binder/ApacheSedonaSQL.ipynb                       |  59 ++-
 ...eSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb | 485 +++++++++++++++++++++
 binder/Pipfile                                     |   2 +
 .../ne_50m_admin_0_countries_lakes.cpg             |   1 +
 .../ne_50m_admin_0_countries_lakes.dbf             | Bin 0 -> 546979 bytes
 .../ne_50m_admin_0_countries_lakes.prj             |   1 +
 .../ne_50m_admin_0_countries_lakes.shp             | Bin 0 -> 1652184 bytes
 .../ne_50m_admin_0_countries_lakes.shx             | Bin 0 -> 2028 bytes
 binder/data/ne_50m_airports/ne_50m_airports.dbf    | Bin 0 -> 326032 bytes
 binder/data/ne_50m_airports/ne_50m_airports.prj    |   1 +
 binder/data/ne_50m_airports/ne_50m_airports.shp    | Bin 0 -> 7968 bytes
 binder/data/ne_50m_airports/ne_50m_airports.shx    | Bin 0 -> 2348 bytes
 14 files changed, 518 insertions(+), 34 deletions(-)

diff --git a/binder/.gitignore b/binder/.gitignore
new file mode 100644
index 0000000..d5e992d
--- /dev/null
+++ b/binder/.gitignore
@@ -0,0 +1 @@
+/.ipynb_checkpoints/
diff --git a/binder/ApacheSedonaCore.ipynb b/binder/ApacheSedonaCore.ipynb
index ec417e5..e311b99 100644
--- a/binder/ApacheSedonaCore.ipynb
+++ b/binder/ApacheSedonaCore.ipynb
@@ -2138,7 +2138,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.9"
+   "version": "3.9.1"
   }
  },
  "nbformat": 4,
diff --git a/binder/ApacheSedonaSQL.ipynb b/binder/ApacheSedonaSQL.ipynb
index f3b3f80..5643176 100644
--- a/binder/ApacheSedonaSQL.ipynb
+++ b/binder/ApacheSedonaSQL.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -17,7 +17,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -33,7 +33,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -42,7 +42,7 @@
        "True"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -67,7 +67,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -109,7 +109,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -150,7 +150,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -191,7 +191,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -239,7 +239,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -247,22 +247,22 @@
      "output_type": "stream",
      "text": [
       "== Physical Plan ==\n",
-      "DistanceJoin pointshape1#261: geometry, pointshape2#285: geometry, 2.0, false\n",
-      ":- Project [st_point(cast(_c0#255 as decimal(24,20)), cast(_c1#256 as decimal(24,20))) AS pointshape1#261]\n",
-      ":  +- FileScan csv [_c0#255,_c1#256] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/home/ubuntu/code/incubator-sedona/python/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>\n",
-      "+- Project [st_point(cast(_c0#279 as decimal(24,20)), cast(_c1#280 as decimal(24,20))) AS pointshape2#285]\n",
-      "   +- FileScan csv [_c0#279,_c1#280] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/home/ubuntu/code/incubator-sedona/python/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>\n",
+      "DistanceJoin pointshape1#261: geometry, pointshape2#287: geometry, 2.0, false\n",
+      ":- Project [st_point(cast(_c0#255 as decimal(24,20)), cast(_c1#256 as decimal(24,20))) AS pointshape1#261, abc AS name1#262]\n",
+      ":  +- FileScan csv [_c0#255,_c1#256] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/Users/jiayu/GitHub/GeoSpark-datasys-repo/binder/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>\n",
+      "+- Project [st_point(cast(_c0#281 as decimal(24,20)), cast(_c1#282 as decimal(24,20))) AS pointshape2#287, def AS name2#288]\n",
+      "   +- FileScan csv [_c0#281,_c1#282] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/Users/jiayu/GitHub/GeoSpark-datasys-repo/binder/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>\n",
       "\n",
       "\n",
-      "+-----------------+-----------------+\n",
-      "|      pointshape1|      pointshape2|\n",
-      "+-----------------+-----------------+\n",
-      "|POINT (1.1 101.1)|POINT (1.1 101.1)|\n",
-      "|POINT (1.1 101.1)|POINT (2.1 102.1)|\n",
-      "|POINT (2.1 102.1)|POINT (1.1 101.1)|\n",
-      "|POINT (2.1 102.1)|POINT (2.1 102.1)|\n",
-      "|POINT (2.1 102.1)|POINT (3.1 103.1)|\n",
-      "+-----------------+-----------------+\n",
+      "+-----------------+-----+-----------------+-----+\n",
+      "|      pointshape1|name1|      pointshape2|name2|\n",
+      "+-----------------+-----+-----------------+-----+\n",
+      "|POINT (1.1 101.1)|  abc|POINT (1.1 101.1)|  def|\n",
+      "|POINT (1.1 101.1)|  abc|POINT (2.1 102.1)|  def|\n",
+      "|POINT (2.1 102.1)|  abc|POINT (1.1 101.1)|  def|\n",
+      "|POINT (2.1 102.1)|  abc|POINT (2.1 102.1)|  def|\n",
+      "|POINT (2.1 102.1)|  abc|POINT (3.1 103.1)|  def|\n",
+      "+-----------------+-----+-----------------+-----+\n",
       "only showing top 5 rows\n",
       "\n"
      ]
@@ -275,7 +275,7 @@
     "\n",
     "point_csv_df_1.createOrReplaceTempView(\"pointtable\")\n",
     "\n",
-    "point_df1 = spark.sql(\"select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape1 from pointtable\")\n",
+    "point_df1 = spark.sql(\"SELECT ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape1, \\'abc\\' as name1 from pointtable\")\n",
     "point_df1.createOrReplaceTempView(\"pointdf1\")\n",
     "\n",
     "point_csv_df2 = spark.read.format(\"csv\").\\\n",
@@ -283,7 +283,7 @@
     "    option(\"header\", \"false\").load(\"data/testpoint.csv\")\n",
     "\n",
     "point_csv_df2.createOrReplaceTempView(\"pointtable\")\n",
-    "point_df2 = spark.sql(\"select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape2 from pointtable\")\n",
+    "point_df2 = spark.sql(\"select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape2, \\'def\\' as name2 from pointtable\")\n",
     "point_df2.createOrReplaceTempView(\"pointdf2\")\n",
     "\n",
     "distance_join_df = spark.sql(\"select * from pointdf1, pointdf2 where ST_Distance(pointdf1.pointshape1,pointdf2.pointshape2) < 2\")\n",
@@ -295,13 +295,6 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "For more examples please refer to https://sedona.apache.org/"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
     "### Converting GeoPandas to Apache Sedona"
    ]
   },
@@ -665,7 +658,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.9"
+   "version": "3.9.1"
   }
  },
  "nbformat": 4,
diff --git a/binder/ApacheSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb b/binder/ApacheSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb
new file mode 100644
index 0000000..b055d9f
--- /dev/null
+++ b/binder/ApacheSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb
@@ -0,0 +1,485 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "import geopandas as gpd\n",
+    "from pyspark.sql import SparkSession\n",
+    "from pyspark.sql.functions import col, expr, when\n",
+    "\n",
+    "from sedona.register import SedonaRegistrator\n",
+    "from sedona.utils import SedonaKryoRegistrator, KryoSerializer\n",
+    "from sedona.core.formatMapper.shapefileParser import ShapefileReader\n",
+    "from sedona.utils.adapter import Adapter\n",
+    "from sedona.core.enums import GridType\n",
+    "from sedona.core.enums import IndexType\n",
+    "from sedona.core.spatialOperator import JoinQuery"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup Sedona environment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark = SparkSession.builder.\\\n",
+    "        master(\"local[*]\").\\\n",
+    "        appName(\"SedonaSQL-Example\").\\\n",
+    "        config(\"spark.serializer\", KryoSerializer.getName).\\\n",
+    "        config(\"spark.kryo.registrator\", SedonaKryoRegistrator.getName) .\\\n",
+    "        config('spark.jars.packages',\n",
+    "               'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.0-incubating,'\n",
+    "               'org.datasyslab:geotools-wrapper:geotools-24.0'). \\\n",
+    "        getOrCreate()\n",
+    "SedonaRegistrator.registerAll(spark)\n",
+    "sc = spark.sparkContext\n",
+    "sc.setSystemProperty(\"sedona.global.charset\", \"utf8\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Read countries shapefile into a Sedona DataFrame \n",
+    "Data link: https://www.naturalearthdata.com/downloads/50m-cultural-vectors/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "root\n",
+      " |-- geometry: geometry (nullable = true)\n",
+      " |-- featurecla: string (nullable = true)\n",
+      " |-- scalerank: string (nullable = true)\n",
+      " |-- LABELRANK: string (nullable = true)\n",
+      " |-- SOVEREIGNT: string (nullable = true)\n",
+      " |-- SOV_A3: string (nullable = true)\n",
+      " |-- ADM0_DIF: string (nullable = true)\n",
+      " |-- LEVEL: string (nullable = true)\n",
+      " |-- TYPE: string (nullable = true)\n",
+      " |-- ADMIN: string (nullable = true)\n",
+      " |-- ADM0_A3: string (nullable = true)\n",
+      " |-- GEOU_DIF: string (nullable = true)\n",
+      " |-- GEOUNIT: string (nullable = true)\n",
+      " |-- GU_A3: string (nullable = true)\n",
+      " |-- SU_DIF: string (nullable = true)\n",
+      " |-- SUBUNIT: string (nullable = true)\n",
+      " |-- SU_A3: string (nullable = true)\n",
+      " |-- BRK_DIFF: string (nullable = true)\n",
+      " |-- NAME: string (nullable = true)\n",
+      " |-- NAME_LONG: string (nullable = true)\n",
+      " |-- BRK_A3: string (nullable = true)\n",
+      " |-- BRK_NAME: string (nullable = true)\n",
+      " |-- BRK_GROUP: string (nullable = true)\n",
+      " |-- ABBREV: string (nullable = true)\n",
+      " |-- POSTAL: string (nullable = true)\n",
+      " |-- FORMAL_EN: string (nullable = true)\n",
+      " |-- FORMAL_FR: string (nullable = true)\n",
+      " |-- NAME_CIAWF: string (nullable = true)\n",
+      " |-- NOTE_ADM0: string (nullable = true)\n",
+      " |-- NOTE_BRK: string (nullable = true)\n",
+      " |-- NAME_SORT: string (nullable = true)\n",
+      " |-- NAME_ALT: string (nullable = true)\n",
+      " |-- MAPCOLOR7: string (nullable = true)\n",
+      " |-- MAPCOLOR8: string (nullable = true)\n",
+      " |-- MAPCOLOR9: string (nullable = true)\n",
+      " |-- MAPCOLOR13: string (nullable = true)\n",
+      " |-- POP_EST: string (nullable = true)\n",
+      " |-- POP_RANK: string (nullable = true)\n",
+      " |-- GDP_MD_EST: string (nullable = true)\n",
+      " |-- POP_YEAR: string (nullable = true)\n",
+      " |-- LASTCENSUS: string (nullable = true)\n",
+      " |-- GDP_YEAR: string (nullable = true)\n",
+      " |-- ECONOMY: string (nullable = true)\n",
+      " |-- INCOME_GRP: string (nullable = true)\n",
+      " |-- WIKIPEDIA: string (nullable = true)\n",
+      " |-- FIPS_10_: string (nullable = true)\n",
+      " |-- ISO_A2: string (nullable = true)\n",
+      " |-- ISO_A3: string (nullable = true)\n",
+      " |-- ISO_A3_EH: string (nullable = true)\n",
+      " |-- ISO_N3: string (nullable = true)\n",
+      " |-- UN_A3: string (nullable = true)\n",
+      " |-- WB_A2: string (nullable = true)\n",
+      " |-- WB_A3: string (nullable = true)\n",
+      " |-- WOE_ID: string (nullable = true)\n",
+      " |-- WOE_ID_EH: string (nullable = true)\n",
+      " |-- WOE_NOTE: string (nullable = true)\n",
+      " |-- ADM0_A3_IS: string (nullable = true)\n",
+      " |-- ADM0_A3_US: string (nullable = true)\n",
+      " |-- ADM0_A3_UN: string (nullable = true)\n",
+      " |-- ADM0_A3_WB: string (nullable = true)\n",
+      " |-- CONTINENT: string (nullable = true)\n",
+      " |-- REGION_UN: string (nullable = true)\n",
+      " |-- SUBREGION: string (nullable = true)\n",
+      " |-- REGION_WB: string (nullable = true)\n",
+      " |-- NAME_LEN: string (nullable = true)\n",
+      " |-- LONG_LEN: string (nullable = true)\n",
+      " |-- ABBREV_LEN: string (nullable = true)\n",
+      " |-- TINY: string (nullable = true)\n",
+      " |-- HOMEPART: string (nullable = true)\n",
+      " |-- MIN_ZOOM: string (nullable = true)\n",
+      " |-- MIN_LABEL: string (nullable = true)\n",
+      " |-- MAX_LABEL: string (nullable = true)\n",
+      " |-- NE_ID: string (nullable = true)\n",
+      " |-- WIKIDATAID: string (nullable = true)\n",
+      " |-- NAME_AR: string (nullable = true)\n",
+      " |-- NAME_BN: string (nullable = true)\n",
+      " |-- NAME_DE: string (nullable = true)\n",
+      " |-- NAME_EN: string (nullable = true)\n",
+      " |-- NAME_ES: string (nullable = true)\n",
+      " |-- NAME_FR: string (nullable = true)\n",
+      " |-- NAME_EL: string (nullable = true)\n",
+      " |-- NAME_HI: string (nullable = true)\n",
+      " |-- NAME_HU: string (nullable = true)\n",
+      " |-- NAME_ID: string (nullable = true)\n",
+      " |-- NAME_IT: string (nullable = true)\n",
+      " |-- NAME_JA: string (nullable = true)\n",
+      " |-- NAME_KO: string (nullable = true)\n",
+      " |-- NAME_NL: string (nullable = true)\n",
+      " |-- NAME_PL: string (nullable = true)\n",
+      " |-- NAME_PT: string (nullable = true)\n",
+      " |-- NAME_RU: string (nullable = true)\n",
+      " |-- NAME_SV: string (nullable = true)\n",
+      " |-- NAME_TR: string (nullable = true)\n",
+      " |-- NAME_VI: string (nullable = true)\n",
+      " |-- NAME_ZH: string (nullable = true)\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "countries = ShapefileReader.readToGeometryRDD(sc, \"data/ne_50m_admin_0_countries_lakes/\")\n",
+    "countries_df = Adapter.toDf(countries, spark)\n",
+    "countries_df.createOrReplaceTempView(\"country\")\n",
+    "countries_df.printSchema()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Read airports shapefile into a Sedona DataFrame \n",
+    "Data link: https://www.naturalearthdata.com/downloads/50m-cultural-vectors/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "root\n",
+      " |-- geometry: geometry (nullable = true)\n",
+      " |-- scalerank: string (nullable = true)\n",
+      " |-- featurecla: string (nullable = true)\n",
+      " |-- type: string (nullable = true)\n",
+      " |-- name: string (nullable = true)\n",
+      " |-- abbrev: string (nullable = true)\n",
+      " |-- location: string (nullable = true)\n",
+      " |-- gps_code: string (nullable = true)\n",
+      " |-- iata_code: string (nullable = true)\n",
+      " |-- wikipedia: string (nullable = true)\n",
+      " |-- natlscale: string (nullable = true)\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "airports = ShapefileReader.readToGeometryRDD(sc, \"data/ne_50m_airports/\")\n",
+    "airports_df = Adapter.toDf(airports, spark)\n",
+    "airports_df.createOrReplaceTempView(\"airport\")\n",
+    "airports_df.printSchema()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Run Spatial Join using SQL API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = spark.sql(\"SELECT c.geometry as country_geom, c.NAME_EN, a.geometry as airport_geom, a.name FROM country c, airport a WHERE ST_Contains(c.geometry, a.geometry)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Run Spatial Join using RDD API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "airports_rdd = Adapter.toSpatialRdd(airports_df, \"geometry\")\n",
+    "# Drop the duplicate name column in countries_df\n",
+    "countries_df = countries_df.drop(\"NAME\")\n",
+    "countries_rdd = Adapter.toSpatialRdd(countries_df, \"geometry\")\n",
+    "\n",
+    "airports_rdd.analyze()\n",
+    "countries_rdd.analyze()\n",
+    "\n",
+    "# 4 is the num partitions used in spatial partitioning. This is an optional parameter\n",
+    "airports_rdd.spatialPartitioning(GridType.KDBTREE, 4)\n",
+    "countries_rdd.spatialPartitioning(airports_rdd.getPartitioner())\n",
+    "\n",
+    "buildOnSpatialPartitionedRDD = True\n",
+    "usingIndex = True\n",
+    "considerBoundaryIntersection = True\n",
+    "airports_rdd.buildIndex(IndexType.QUADTREE, buildOnSpatialPartitionedRDD)\n",
+    "\n",
+    "result_pair_rdd = JoinQuery.SpatialJoinQueryFlat(airports_rdd, countries_rdd, usingIndex, considerBoundaryIntersection)\n",
+    "\n",
+    "result2 = Adapter.toDf(result_pair_rdd, countries_rdd.fieldNames, airports.fieldNames, spark)\n",
+    "\n",
+    "result2.createOrReplaceTempView(\"join_result_with_all_cols\")\n",
+    "# Select the columns needed in the join\n",
+    "result2 = spark.sql(\"SELECT geom_1 as country_geom, NAME_EN, geom_2 as airport_geom, name FROM join_result_with_all_cols\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Print spatial join results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+--------------------+--------------------+--------------------+\n",
+      "|        country_geom|             NAME_EN|        airport_geom|                name|\n",
+      "+--------------------+--------------------+--------------------+--------------------+\n",
+      "|MULTIPOLYGON (((3...|Zimbabwe         ...|POINT (31.1014 -1...|Harare Int'l     ...|\n",
+      "|MULTIPOLYGON (((3...|Zimbabwe         ...|POINT (28.6225520...|Bulawayo         ...|\n",
+      "|MULTIPOLYGON (((3...|Zambia           ...|POINT (28.4455443...|Lusaka Int'l     ...|\n",
+      "|MULTIPOLYGON (((5...|Yemen            ...|POINT (45.030602 ...|Aden Int'l       ...|\n",
+      "|MULTIPOLYGON (((5...|Yemen            ...|POINT (44.2246467...|Sanaa Int'l      ...|\n",
+      "|MULTIPOLYGON (((1...|Vietnam          ...|POINT (105.803759...|Noi Bai          ...|\n",
+      "|MULTIPOLYGON (((1...|Vietnam          ...|POINT (106.664246...|Tan Son Nhat     ...|\n",
+      "|MULTIPOLYGON (((-...|Venezuela        ...|POINT (-67.005748...|Simon Bolivar Int...|\n",
+      "|MULTIPOLYGON (((-...|Venezuela        ...|POINT (-71.723768...|La Chinita Int'l ...|\n",
+      "|MULTIPOLYGON (((-...|Venezuela        ...|POINT (-67.922361...|Zim Valencia     ...|\n",
+      "|MULTIPOLYGON (((7...|Uzbekistan       ...|POINT (69.2666137...|Tashkent Int'l   ...|\n",
+      "|POLYGON ((-53.370...|Uruguay          ...|POINT (-56.026636...|Carrasco Int'l   ...|\n",
+      "|MULTIPOLYGON (((-...|Puerto Rico      ...|POINT (-66.004229...|Luis Munoz Marin ...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-149.98172...|Anchorage Int'l  ...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-84.425397...|Hartsfield-Jackso...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-71.016406...|Gen E L Logan Int...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-76.668642...|Baltimore-Washing...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-104.67379...|Denver Int'l     ...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-97.040371...|Dallas-Ft. Worth ...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-74.177147...|Newark Int'l     ...|\n",
+      "+--------------------+--------------------+--------------------+--------------------+\n",
+      "only showing top 20 rows\n",
+      "\n",
+      "+--------------------+--------------------+--------------------+--------------------+\n",
+      "|        country_geom|             NAME_EN|        airport_geom|                name|\n",
+      "+--------------------+--------------------+--------------------+--------------------+\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-80.145258...|Fort Lauderdale H...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-80.278971...|Miami Int'l      ...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-95.333704...|George Bush Inter...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-90.256693...|New Orleans Int'l...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-81.307371...|Orlando Int'l    ...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-82.534824...|Tampa Int'l      ...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-112.01363...|Sky Harbor Int'l ...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-118.40246...|Los Angeles Int'l...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-116.97547...|General Abelardo ...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-97.040371...|Dallas-Ft. Worth ...|\n",
+      "|MULTIPOLYGON (((-...|United States of ...|POINT (-84.425397...|Hartsfield-Jackso...|\n",
+      "|POLYGON ((-69.965...|Peru             ...|POINT (-77.107565...|Jorge Chavez     ...|\n",
+      "|MULTIPOLYGON (((-...|Panama           ...|POINT (-79.387134...|Tocumen Int'l    ...|\n",
+      "|POLYGON ((-83.157...|Nicaragua        ...|POINT (-86.171284...|Augusto Cesar San...|\n",
+      "|MULTIPOLYGON (((-...|Mexico           ...|POINT (-96.183570...|Gen. Heriberto Ja...|\n",
+      "|MULTIPOLYGON (((-...|Mexico           ...|POINT (-106.27001...|General Rafael Bu...|\n",
+      "|MULTIPOLYGON (((-...|Mexico           ...|POINT (-99.754508...|General Juan N Al...|\n",
+      "|MULTIPOLYGON (((-...|Mexico           ...|POINT (-99.570649...|Jose Maria Morelo...|\n",
+      "|MULTIPOLYGON (((-...|Mexico           ...|POINT (-98.375759...|Puebla           ...|\n",
+      "|MULTIPOLYGON (((-...|Mexico           ...|POINT (-99.082607...|Lic Benito Juarez...|\n",
+      "+--------------------+--------------------+--------------------+--------------------+\n",
+      "only showing top 20 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# The result of SQL API\n",
+    "result.show()\n",
+    "# The result of RDD API\n",
+    "result2.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Group airports by country"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+--------------------+------------+\n",
+      "|             NAME_EN|        country_geom|AirportCount|\n",
+      "+--------------------+--------------------+------------+\n",
+      "|Denmark          ...|MULTIPOLYGON (((1...|           1|\n",
+      "|Algeria          ...|POLYGON ((8.57656...|           1|\n",
+      "|Lesotho          ...|POLYGON ((28.7369...|           1|\n",
+      "|Pakistan         ...|POLYGON ((76.7668...|           3|\n",
+      "|Paraguay         ...|POLYGON ((-58.159...|           1|\n",
+      "|Madagascar       ...|MULTIPOLYGON (((4...|           1|\n",
+      "|United Kingdom   ...|MULTIPOLYGON (((-...|           2|\n",
+      "|Haiti            ...|MULTIPOLYGON (((-...|           1|\n",
+      "|Tanzania         ...|MULTIPOLYGON (((3...|           1|\n",
+      "|United Arab Emira...|MULTIPOLYGON (((5...|           1|\n",
+      "|Nigeria          ...|MULTIPOLYGON (((7...|           3|\n",
+      "|Guinea           ...|POLYGON ((-10.283...|           1|\n",
+      "|Malta            ...|MULTIPOLYGON (((1...|           1|\n",
+      "|Mali             ...|POLYGON ((-11.389...|           1|\n",
+      "|Croatia          ...|MULTIPOLYGON (((1...|           1|\n",
+      "|Saudi Arabia     ...|MULTIPOLYGON (((3...|           2|\n",
+      "|Kenya            ...|MULTIPOLYGON (((4...|           2|\n",
+      "|Yemen            ...|MULTIPOLYGON (((5...|           2|\n",
+      "|Niger            ...|POLYGON ((13.6063...|           1|\n",
+      "|Lithuania        ...|MULTIPOLYGON (((2...|           1|\n",
+      "+--------------------+--------------------+------------+\n",
+      "only showing top 20 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# result.createOrReplaceTempView(\"result\")\n",
+    "result2.createOrReplaceTempView(\"result\")\n",
+    "groupedresult = spark.sql(\"SELECT c.NAME_EN, c.country_geom, count(*) as AirportCount FROM result c GROUP BY c.NAME_EN, c.country_geom\")\n",
+    "groupedresult.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Visualize the number of airports in each country"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<AxesSubplot:>"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYwAAACXCAYAAAAGX2ymAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAABIMUlEQVR4nO29eZQkZ3Xg+7sRkUvtVV1Vve+t1i4kpEZiNWDMarMNGLAZrHmPGcYe+9kev/E8sGeefWyPh/EZjx/zxscYPDzLDIMMGDDGDKtZLGzQgnYkpJbU6kW9d1d1bblF3PfH/SKXqsysrDUzq+N3TnZnRmZEfBkV+d3v7qKqJCQkJCQkLIbX7gEkJCQkJHQHicBISEhISGiJRGAkJCQkJLREIjASEhISEloiERgJCQkJCS0RtHsA1YyNjenevXvbPYyEhISEVee+++47p6rj9d67QkRnV3Dsk/AVVX3dCg7R [...]
+      "text/plain": [
+       "<Figure size 432x288 with 2 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df = groupedresult.toPandas()\n",
+    "gdf = gpd.GeoDataFrame(df, geometry=\"country_geom\")\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
+    "fig, ax = plt.subplots(1, 1)\n",
+    "divider = make_axes_locatable(ax)\n",
+    "cax = divider.append_axes(\"right\", size=\"5%\", pad=0.1)\n",
+    "\n",
+    "gdf.plot(\n",
+    "    column=\"AirportCount\",\n",
+    "    legend=True,\n",
+    "    cmap='OrRd',\n",
+    "    cax=cax,\n",
+    "    ax=ax\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "apache-sedona",
+   "language": "python",
+   "name": "apache-sedona"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/binder/Pipfile b/binder/Pipfile
index 8c6ba1c..da15fc1 100644
--- a/binder/Pipfile
+++ b/binder/Pipfile
@@ -17,6 +17,8 @@ pyspark="==3.0.1"
 attrs="*"
 ipykernel = "*"
 apache-sedona="==1.0.0"
+matplotlib = "*"
+descartes = "*"
 
 [requires]
 python_version = "3.7"
diff --git a/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.cpg b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.cpg
new file mode 100644
index 0000000..3ad133c
--- /dev/null
+++ b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.cpg
@@ -0,0 +1 @@
+UTF-8
\ No newline at end of file
diff --git a/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.dbf b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.dbf
new file mode 100644
index 0000000..03caa76
Binary files /dev/null and b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.dbf differ
diff --git a/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.prj b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.prj
new file mode 100644
index 0000000..a30c00a
--- /dev/null
+++ b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.prj
@@ -0,0 +1 @@
+GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]
\ No newline at end of file
diff --git a/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.shp b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.shp
new file mode 100644
index 0000000..e630c6f
Binary files /dev/null and b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.shp differ
diff --git a/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.shx b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.shx
new file mode 100644
index 0000000..76a0434
Binary files /dev/null and b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.shx differ
diff --git a/binder/data/ne_50m_airports/ne_50m_airports.dbf b/binder/data/ne_50m_airports/ne_50m_airports.dbf
new file mode 100644
index 0000000..bdf3005
Binary files /dev/null and b/binder/data/ne_50m_airports/ne_50m_airports.dbf differ
diff --git a/binder/data/ne_50m_airports/ne_50m_airports.prj b/binder/data/ne_50m_airports/ne_50m_airports.prj
new file mode 100644
index 0000000..b13a717
--- /dev/null
+++ b/binder/data/ne_50m_airports/ne_50m_airports.prj
@@ -0,0 +1 @@
+GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.017453292519943295]]
\ No newline at end of file
diff --git a/binder/data/ne_50m_airports/ne_50m_airports.shp b/binder/data/ne_50m_airports/ne_50m_airports.shp
new file mode 100644
index 0000000..d1417b2
Binary files /dev/null and b/binder/data/ne_50m_airports/ne_50m_airports.shp differ
diff --git a/binder/data/ne_50m_airports/ne_50m_airports.shx b/binder/data/ne_50m_airports/ne_50m_airports.shx
new file mode 100644
index 0000000..b99c0fc
Binary files /dev/null and b/binder/data/ne_50m_airports/ne_50m_airports.shx differ