You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sedona.apache.org by ji...@apache.org on 2021/03/18 19:40:50 UTC
[incubator-sedona] branch master updated: Add SQL Join in Python
Binder examples
This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-sedona.git
The following commit(s) were added to refs/heads/master by this push:
new e0fa15d Add SQL Join in Python Binder examples
e0fa15d is described below
commit e0fa15d7d8337a3a14d6296a0ddc320ad9bd7152
Author: Jia Yu <ji...@apache.org>
AuthorDate: Thu Mar 18 12:40:37 2021 -0700
Add SQL Join in Python Binder examples
---
binder/.gitignore | 1 +
binder/ApacheSedonaCore.ipynb | 2 +-
binder/ApacheSedonaSQL.ipynb | 59 ++-
...eSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb | 485 +++++++++++++++++++++
binder/Pipfile | 2 +
.../ne_50m_admin_0_countries_lakes.cpg | 1 +
.../ne_50m_admin_0_countries_lakes.dbf | Bin 0 -> 546979 bytes
.../ne_50m_admin_0_countries_lakes.prj | 1 +
.../ne_50m_admin_0_countries_lakes.shp | Bin 0 -> 1652184 bytes
.../ne_50m_admin_0_countries_lakes.shx | Bin 0 -> 2028 bytes
binder/data/ne_50m_airports/ne_50m_airports.dbf | Bin 0 -> 326032 bytes
binder/data/ne_50m_airports/ne_50m_airports.prj | 1 +
binder/data/ne_50m_airports/ne_50m_airports.shp | Bin 0 -> 7968 bytes
binder/data/ne_50m_airports/ne_50m_airports.shx | Bin 0 -> 2348 bytes
14 files changed, 518 insertions(+), 34 deletions(-)
diff --git a/binder/.gitignore b/binder/.gitignore
new file mode 100644
index 0000000..d5e992d
--- /dev/null
+++ b/binder/.gitignore
@@ -0,0 +1 @@
+/.ipynb_checkpoints/
diff --git a/binder/ApacheSedonaCore.ipynb b/binder/ApacheSedonaCore.ipynb
index ec417e5..e311b99 100644
--- a/binder/ApacheSedonaCore.ipynb
+++ b/binder/ApacheSedonaCore.ipynb
@@ -2138,7 +2138,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.9"
+ "version": "3.9.1"
}
},
"nbformat": 4,
diff --git a/binder/ApacheSedonaSQL.ipynb b/binder/ApacheSedonaSQL.ipynb
index f3b3f80..5643176 100644
--- a/binder/ApacheSedonaSQL.ipynb
+++ b/binder/ApacheSedonaSQL.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -17,7 +17,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -33,7 +33,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -42,7 +42,7 @@
"True"
]
},
- "execution_count": 3,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -67,7 +67,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -109,7 +109,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -150,7 +150,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -191,7 +191,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -239,7 +239,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -247,22 +247,22 @@
"output_type": "stream",
"text": [
"== Physical Plan ==\n",
- "DistanceJoin pointshape1#261: geometry, pointshape2#285: geometry, 2.0, false\n",
- ":- Project [st_point(cast(_c0#255 as decimal(24,20)), cast(_c1#256 as decimal(24,20))) AS pointshape1#261]\n",
- ": +- FileScan csv [_c0#255,_c1#256] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/home/ubuntu/code/incubator-sedona/python/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>\n",
- "+- Project [st_point(cast(_c0#279 as decimal(24,20)), cast(_c1#280 as decimal(24,20))) AS pointshape2#285]\n",
- " +- FileScan csv [_c0#279,_c1#280] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/home/ubuntu/code/incubator-sedona/python/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>\n",
+ "DistanceJoin pointshape1#261: geometry, pointshape2#287: geometry, 2.0, false\n",
+ ":- Project [st_point(cast(_c0#255 as decimal(24,20)), cast(_c1#256 as decimal(24,20))) AS pointshape1#261, abc AS name1#262]\n",
+ ": +- FileScan csv [_c0#255,_c1#256] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/Users/jiayu/GitHub/GeoSpark-datasys-repo/binder/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>\n",
+ "+- Project [st_point(cast(_c0#281 as decimal(24,20)), cast(_c1#282 as decimal(24,20))) AS pointshape2#287, def AS name2#288]\n",
+ " +- FileScan csv [_c0#281,_c1#282] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/Users/jiayu/GitHub/GeoSpark-datasys-repo/binder/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>\n",
"\n",
"\n",
- "+-----------------+-----------------+\n",
- "| pointshape1| pointshape2|\n",
- "+-----------------+-----------------+\n",
- "|POINT (1.1 101.1)|POINT (1.1 101.1)|\n",
- "|POINT (1.1 101.1)|POINT (2.1 102.1)|\n",
- "|POINT (2.1 102.1)|POINT (1.1 101.1)|\n",
- "|POINT (2.1 102.1)|POINT (2.1 102.1)|\n",
- "|POINT (2.1 102.1)|POINT (3.1 103.1)|\n",
- "+-----------------+-----------------+\n",
+ "+-----------------+-----+-----------------+-----+\n",
+ "| pointshape1|name1| pointshape2|name2|\n",
+ "+-----------------+-----+-----------------+-----+\n",
+ "|POINT (1.1 101.1)| abc|POINT (1.1 101.1)| def|\n",
+ "|POINT (1.1 101.1)| abc|POINT (2.1 102.1)| def|\n",
+ "|POINT (2.1 102.1)| abc|POINT (1.1 101.1)| def|\n",
+ "|POINT (2.1 102.1)| abc|POINT (2.1 102.1)| def|\n",
+ "|POINT (2.1 102.1)| abc|POINT (3.1 103.1)| def|\n",
+ "+-----------------+-----+-----------------+-----+\n",
"only showing top 5 rows\n",
"\n"
]
@@ -275,7 +275,7 @@
"\n",
"point_csv_df_1.createOrReplaceTempView(\"pointtable\")\n",
"\n",
- "point_df1 = spark.sql(\"select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape1 from pointtable\")\n",
+ "point_df1 = spark.sql(\"SELECT ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape1, \\'abc\\' as name1 from pointtable\")\n",
"point_df1.createOrReplaceTempView(\"pointdf1\")\n",
"\n",
"point_csv_df2 = spark.read.format(\"csv\").\\\n",
@@ -283,7 +283,7 @@
" option(\"header\", \"false\").load(\"data/testpoint.csv\")\n",
"\n",
"point_csv_df2.createOrReplaceTempView(\"pointtable\")\n",
- "point_df2 = spark.sql(\"select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape2 from pointtable\")\n",
+ "point_df2 = spark.sql(\"select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape2, \\'def\\' as name2 from pointtable\")\n",
"point_df2.createOrReplaceTempView(\"pointdf2\")\n",
"\n",
"distance_join_df = spark.sql(\"select * from pointdf1, pointdf2 where ST_Distance(pointdf1.pointshape1,pointdf2.pointshape2) < 2\")\n",
@@ -295,13 +295,6 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "For more examples please refer to https://sedona.apache.org/"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
"### Converting GeoPandas to Apache Sedona"
]
},
@@ -665,7 +658,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.9"
+ "version": "3.9.1"
}
},
"nbformat": 4,
diff --git a/binder/ApacheSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb b/binder/ApacheSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb
new file mode 100644
index 0000000..b055d9f
--- /dev/null
+++ b/binder/ApacheSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb
@@ -0,0 +1,485 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "import geopandas as gpd\n",
+ "from pyspark.sql import SparkSession\n",
+ "from pyspark.sql.functions import col, expr, when\n",
+ "\n",
+ "from sedona.register import SedonaRegistrator\n",
+ "from sedona.utils import SedonaKryoRegistrator, KryoSerializer\n",
+ "from sedona.core.formatMapper.shapefileParser import ShapefileReader\n",
+ "from sedona.utils.adapter import Adapter\n",
+ "from sedona.core.enums import GridType\n",
+ "from sedona.core.enums import IndexType\n",
+ "from sedona.core.spatialOperator import JoinQuery"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Setup Sedona environment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark = SparkSession.builder.\\\n",
+ " master(\"local[*]\").\\\n",
+ " appName(\"SedonaSQL-Example\").\\\n",
+ " config(\"spark.serializer\", KryoSerializer.getName).\\\n",
+ " config(\"spark.kryo.registrator\", SedonaKryoRegistrator.getName) .\\\n",
+ " config('spark.jars.packages',\n",
+ " 'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.0-incubating,'\n",
+ " 'org.datasyslab:geotools-wrapper:geotools-24.0'). \\\n",
+ " getOrCreate()\n",
+ "SedonaRegistrator.registerAll(spark)\n",
+ "sc = spark.sparkContext\n",
+ "sc.setSystemProperty(\"sedona.global.charset\", \"utf8\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Read countries shapefile into a Sedona DataFrame \n",
+ "Data link: https://www.naturalearthdata.com/downloads/50m-cultural-vectors/"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "root\n",
+ " |-- geometry: geometry (nullable = true)\n",
+ " |-- featurecla: string (nullable = true)\n",
+ " |-- scalerank: string (nullable = true)\n",
+ " |-- LABELRANK: string (nullable = true)\n",
+ " |-- SOVEREIGNT: string (nullable = true)\n",
+ " |-- SOV_A3: string (nullable = true)\n",
+ " |-- ADM0_DIF: string (nullable = true)\n",
+ " |-- LEVEL: string (nullable = true)\n",
+ " |-- TYPE: string (nullable = true)\n",
+ " |-- ADMIN: string (nullable = true)\n",
+ " |-- ADM0_A3: string (nullable = true)\n",
+ " |-- GEOU_DIF: string (nullable = true)\n",
+ " |-- GEOUNIT: string (nullable = true)\n",
+ " |-- GU_A3: string (nullable = true)\n",
+ " |-- SU_DIF: string (nullable = true)\n",
+ " |-- SUBUNIT: string (nullable = true)\n",
+ " |-- SU_A3: string (nullable = true)\n",
+ " |-- BRK_DIFF: string (nullable = true)\n",
+ " |-- NAME: string (nullable = true)\n",
+ " |-- NAME_LONG: string (nullable = true)\n",
+ " |-- BRK_A3: string (nullable = true)\n",
+ " |-- BRK_NAME: string (nullable = true)\n",
+ " |-- BRK_GROUP: string (nullable = true)\n",
+ " |-- ABBREV: string (nullable = true)\n",
+ " |-- POSTAL: string (nullable = true)\n",
+ " |-- FORMAL_EN: string (nullable = true)\n",
+ " |-- FORMAL_FR: string (nullable = true)\n",
+ " |-- NAME_CIAWF: string (nullable = true)\n",
+ " |-- NOTE_ADM0: string (nullable = true)\n",
+ " |-- NOTE_BRK: string (nullable = true)\n",
+ " |-- NAME_SORT: string (nullable = true)\n",
+ " |-- NAME_ALT: string (nullable = true)\n",
+ " |-- MAPCOLOR7: string (nullable = true)\n",
+ " |-- MAPCOLOR8: string (nullable = true)\n",
+ " |-- MAPCOLOR9: string (nullable = true)\n",
+ " |-- MAPCOLOR13: string (nullable = true)\n",
+ " |-- POP_EST: string (nullable = true)\n",
+ " |-- POP_RANK: string (nullable = true)\n",
+ " |-- GDP_MD_EST: string (nullable = true)\n",
+ " |-- POP_YEAR: string (nullable = true)\n",
+ " |-- LASTCENSUS: string (nullable = true)\n",
+ " |-- GDP_YEAR: string (nullable = true)\n",
+ " |-- ECONOMY: string (nullable = true)\n",
+ " |-- INCOME_GRP: string (nullable = true)\n",
+ " |-- WIKIPEDIA: string (nullable = true)\n",
+ " |-- FIPS_10_: string (nullable = true)\n",
+ " |-- ISO_A2: string (nullable = true)\n",
+ " |-- ISO_A3: string (nullable = true)\n",
+ " |-- ISO_A3_EH: string (nullable = true)\n",
+ " |-- ISO_N3: string (nullable = true)\n",
+ " |-- UN_A3: string (nullable = true)\n",
+ " |-- WB_A2: string (nullable = true)\n",
+ " |-- WB_A3: string (nullable = true)\n",
+ " |-- WOE_ID: string (nullable = true)\n",
+ " |-- WOE_ID_EH: string (nullable = true)\n",
+ " |-- WOE_NOTE: string (nullable = true)\n",
+ " |-- ADM0_A3_IS: string (nullable = true)\n",
+ " |-- ADM0_A3_US: string (nullable = true)\n",
+ " |-- ADM0_A3_UN: string (nullable = true)\n",
+ " |-- ADM0_A3_WB: string (nullable = true)\n",
+ " |-- CONTINENT: string (nullable = true)\n",
+ " |-- REGION_UN: string (nullable = true)\n",
+ " |-- SUBREGION: string (nullable = true)\n",
+ " |-- REGION_WB: string (nullable = true)\n",
+ " |-- NAME_LEN: string (nullable = true)\n",
+ " |-- LONG_LEN: string (nullable = true)\n",
+ " |-- ABBREV_LEN: string (nullable = true)\n",
+ " |-- TINY: string (nullable = true)\n",
+ " |-- HOMEPART: string (nullable = true)\n",
+ " |-- MIN_ZOOM: string (nullable = true)\n",
+ " |-- MIN_LABEL: string (nullable = true)\n",
+ " |-- MAX_LABEL: string (nullable = true)\n",
+ " |-- NE_ID: string (nullable = true)\n",
+ " |-- WIKIDATAID: string (nullable = true)\n",
+ " |-- NAME_AR: string (nullable = true)\n",
+ " |-- NAME_BN: string (nullable = true)\n",
+ " |-- NAME_DE: string (nullable = true)\n",
+ " |-- NAME_EN: string (nullable = true)\n",
+ " |-- NAME_ES: string (nullable = true)\n",
+ " |-- NAME_FR: string (nullable = true)\n",
+ " |-- NAME_EL: string (nullable = true)\n",
+ " |-- NAME_HI: string (nullable = true)\n",
+ " |-- NAME_HU: string (nullable = true)\n",
+ " |-- NAME_ID: string (nullable = true)\n",
+ " |-- NAME_IT: string (nullable = true)\n",
+ " |-- NAME_JA: string (nullable = true)\n",
+ " |-- NAME_KO: string (nullable = true)\n",
+ " |-- NAME_NL: string (nullable = true)\n",
+ " |-- NAME_PL: string (nullable = true)\n",
+ " |-- NAME_PT: string (nullable = true)\n",
+ " |-- NAME_RU: string (nullable = true)\n",
+ " |-- NAME_SV: string (nullable = true)\n",
+ " |-- NAME_TR: string (nullable = true)\n",
+ " |-- NAME_VI: string (nullable = true)\n",
+ " |-- NAME_ZH: string (nullable = true)\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "countries = ShapefileReader.readToGeometryRDD(sc, \"data/ne_50m_admin_0_countries_lakes/\")\n",
+ "countries_df = Adapter.toDf(countries, spark)\n",
+ "countries_df.createOrReplaceTempView(\"country\")\n",
+ "countries_df.printSchema()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Read airports shapefile into a Sedona DataFrame \n",
+ "Data link: https://www.naturalearthdata.com/downloads/50m-cultural-vectors/"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "root\n",
+ " |-- geometry: geometry (nullable = true)\n",
+ " |-- scalerank: string (nullable = true)\n",
+ " |-- featurecla: string (nullable = true)\n",
+ " |-- type: string (nullable = true)\n",
+ " |-- name: string (nullable = true)\n",
+ " |-- abbrev: string (nullable = true)\n",
+ " |-- location: string (nullable = true)\n",
+ " |-- gps_code: string (nullable = true)\n",
+ " |-- iata_code: string (nullable = true)\n",
+ " |-- wikipedia: string (nullable = true)\n",
+ " |-- natlscale: string (nullable = true)\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "airports = ShapefileReader.readToGeometryRDD(sc, \"data/ne_50m_airports/\")\n",
+ "airports_df = Adapter.toDf(airports, spark)\n",
+ "airports_df.createOrReplaceTempView(\"airport\")\n",
+ "airports_df.printSchema()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run Spatial Join using SQL API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "result = spark.sql(\"SELECT c.geometry as country_geom, c.NAME_EN, a.geometry as airport_geom, a.name FROM country c, airport a WHERE ST_Contains(c.geometry, a.geometry)\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run Spatial Join using RDD API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "airports_rdd = Adapter.toSpatialRdd(airports_df, \"geometry\")\n",
+ "# Drop the duplicate name column in countries_df\n",
+ "countries_df = countries_df.drop(\"NAME\")\n",
+ "countries_rdd = Adapter.toSpatialRdd(countries_df, \"geometry\")\n",
+ "\n",
+ "airports_rdd.analyze()\n",
+ "countries_rdd.analyze()\n",
+ "\n",
+ "# 4 is the num partitions used in spatial partitioning. This is an optional parameter\n",
+ "airports_rdd.spatialPartitioning(GridType.KDBTREE, 4)\n",
+ "countries_rdd.spatialPartitioning(airports_rdd.getPartitioner())\n",
+ "\n",
+ "buildOnSpatialPartitionedRDD = True\n",
+ "usingIndex = True\n",
+ "considerBoundaryIntersection = True\n",
+ "airports_rdd.buildIndex(IndexType.QUADTREE, buildOnSpatialPartitionedRDD)\n",
+ "\n",
+ "result_pair_rdd = JoinQuery.SpatialJoinQueryFlat(airports_rdd, countries_rdd, usingIndex, considerBoundaryIntersection)\n",
+ "\n",
+ "result2 = Adapter.toDf(result_pair_rdd, countries_rdd.fieldNames, airports.fieldNames, spark)\n",
+ "\n",
+ "result2.createOrReplaceTempView(\"join_result_with_all_cols\")\n",
+ "# Select the columns needed in the join\n",
+ "result2 = spark.sql(\"SELECT geom_1 as country_geom, NAME_EN, geom_2 as airport_geom, name FROM join_result_with_all_cols\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Print spatial join results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+--------------------+--------------------+--------------------+--------------------+\n",
+ "| country_geom| NAME_EN| airport_geom| name|\n",
+ "+--------------------+--------------------+--------------------+--------------------+\n",
+ "|MULTIPOLYGON (((3...|Zimbabwe ...|POINT (31.1014 -1...|Harare Int'l ...|\n",
+ "|MULTIPOLYGON (((3...|Zimbabwe ...|POINT (28.6225520...|Bulawayo ...|\n",
+ "|MULTIPOLYGON (((3...|Zambia ...|POINT (28.4455443...|Lusaka Int'l ...|\n",
+ "|MULTIPOLYGON (((5...|Yemen ...|POINT (45.030602 ...|Aden Int'l ...|\n",
+ "|MULTIPOLYGON (((5...|Yemen ...|POINT (44.2246467...|Sanaa Int'l ...|\n",
+ "|MULTIPOLYGON (((1...|Vietnam ...|POINT (105.803759...|Noi Bai ...|\n",
+ "|MULTIPOLYGON (((1...|Vietnam ...|POINT (106.664246...|Tan Son Nhat ...|\n",
+ "|MULTIPOLYGON (((-...|Venezuela ...|POINT (-67.005748...|Simon Bolivar Int...|\n",
+ "|MULTIPOLYGON (((-...|Venezuela ...|POINT (-71.723768...|La Chinita Int'l ...|\n",
+ "|MULTIPOLYGON (((-...|Venezuela ...|POINT (-67.922361...|Zim Valencia ...|\n",
+ "|MULTIPOLYGON (((7...|Uzbekistan ...|POINT (69.2666137...|Tashkent Int'l ...|\n",
+ "|POLYGON ((-53.370...|Uruguay ...|POINT (-56.026636...|Carrasco Int'l ...|\n",
+ "|MULTIPOLYGON (((-...|Puerto Rico ...|POINT (-66.004229...|Luis Munoz Marin ...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-149.98172...|Anchorage Int'l ...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-84.425397...|Hartsfield-Jackso...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-71.016406...|Gen E L Logan Int...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-76.668642...|Baltimore-Washing...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-104.67379...|Denver Int'l ...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-97.040371...|Dallas-Ft. Worth ...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-74.177147...|Newark Int'l ...|\n",
+ "+--------------------+--------------------+--------------------+--------------------+\n",
+ "only showing top 20 rows\n",
+ "\n",
+ "+--------------------+--------------------+--------------------+--------------------+\n",
+ "| country_geom| NAME_EN| airport_geom| name|\n",
+ "+--------------------+--------------------+--------------------+--------------------+\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-80.145258...|Fort Lauderdale H...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-80.278971...|Miami Int'l ...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-95.333704...|George Bush Inter...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-90.256693...|New Orleans Int'l...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-81.307371...|Orlando Int'l ...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-82.534824...|Tampa Int'l ...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-112.01363...|Sky Harbor Int'l ...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-118.40246...|Los Angeles Int'l...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-116.97547...|General Abelardo ...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-97.040371...|Dallas-Ft. Worth ...|\n",
+ "|MULTIPOLYGON (((-...|United States of ...|POINT (-84.425397...|Hartsfield-Jackso...|\n",
+ "|POLYGON ((-69.965...|Peru ...|POINT (-77.107565...|Jorge Chavez ...|\n",
+ "|MULTIPOLYGON (((-...|Panama ...|POINT (-79.387134...|Tocumen Int'l ...|\n",
+ "|POLYGON ((-83.157...|Nicaragua ...|POINT (-86.171284...|Augusto Cesar San...|\n",
+ "|MULTIPOLYGON (((-...|Mexico ...|POINT (-96.183570...|Gen. Heriberto Ja...|\n",
+ "|MULTIPOLYGON (((-...|Mexico ...|POINT (-106.27001...|General Rafael Bu...|\n",
+ "|MULTIPOLYGON (((-...|Mexico ...|POINT (-99.754508...|General Juan N Al...|\n",
+ "|MULTIPOLYGON (((-...|Mexico ...|POINT (-99.570649...|Jose Maria Morelo...|\n",
+ "|MULTIPOLYGON (((-...|Mexico ...|POINT (-98.375759...|Puebla ...|\n",
+ "|MULTIPOLYGON (((-...|Mexico ...|POINT (-99.082607...|Lic Benito Juarez...|\n",
+ "+--------------------+--------------------+--------------------+--------------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# The result of SQL API\n",
+ "result.show()\n",
+ "# The result of RDD API\n",
+ "result2.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Group airports by country"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+--------------------+--------------------+------------+\n",
+ "| NAME_EN| country_geom|AirportCount|\n",
+ "+--------------------+--------------------+------------+\n",
+ "|Denmark ...|MULTIPOLYGON (((1...| 1|\n",
+ "|Algeria ...|POLYGON ((8.57656...| 1|\n",
+ "|Lesotho ...|POLYGON ((28.7369...| 1|\n",
+ "|Pakistan ...|POLYGON ((76.7668...| 3|\n",
+ "|Paraguay ...|POLYGON ((-58.159...| 1|\n",
+ "|Madagascar ...|MULTIPOLYGON (((4...| 1|\n",
+ "|United Kingdom ...|MULTIPOLYGON (((-...| 2|\n",
+ "|Haiti ...|MULTIPOLYGON (((-...| 1|\n",
+ "|Tanzania ...|MULTIPOLYGON (((3...| 1|\n",
+ "|United Arab Emira...|MULTIPOLYGON (((5...| 1|\n",
+ "|Nigeria ...|MULTIPOLYGON (((7...| 3|\n",
+ "|Guinea ...|POLYGON ((-10.283...| 1|\n",
+ "|Malta ...|MULTIPOLYGON (((1...| 1|\n",
+ "|Mali ...|POLYGON ((-11.389...| 1|\n",
+ "|Croatia ...|MULTIPOLYGON (((1...| 1|\n",
+ "|Saudi Arabia ...|MULTIPOLYGON (((3...| 2|\n",
+ "|Kenya ...|MULTIPOLYGON (((4...| 2|\n",
+ "|Yemen ...|MULTIPOLYGON (((5...| 2|\n",
+ "|Niger ...|POLYGON ((13.6063...| 1|\n",
+ "|Lithuania ...|MULTIPOLYGON (((2...| 1|\n",
+ "+--------------------+--------------------+------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# result.createOrReplaceTempView(\"result\")\n",
+ "result2.createOrReplaceTempView(\"result\")\n",
+ "groupedresult = spark.sql(\"SELECT c.NAME_EN, c.country_geom, count(*) as AirportCount FROM result c GROUP BY c.NAME_EN, c.country_geom\")\n",
+ "groupedresult.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Visualize the number of airports in each country"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "<AxesSubplot:>"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYwAAACXCAYAAAAGX2ymAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAABIMUlEQVR4nO29eZQkZ3Xg+7sRkUvtVV1Vve+t1i4kpEZiNWDMarMNGLAZrHmPGcYe+9kev/E8sGeefWyPh/EZjx/zxscYPDzLDIMMGDDGDKtZLGzQgnYkpJbU6kW9d1d1bblF3PfH/SKXqsysrDUzq+N3TnZnRmZEfBkV+d3v7qKqJCQkJCQkLIbX7gEkJCQkJHQHicBISEhISGiJRGAkJCQkJLREIjASEhISEloiERgJCQkJCS0RtHsA1YyNjenevXvbPYyEhISEVee+++47p6rj9d67QkRnV3Dsk/AVVX3dCg7R [...]
+ "text/plain": [
+ "<Figure size 432x288 with 2 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "df = groupedresult.toPandas()\n",
+ "gdf = gpd.GeoDataFrame(df, geometry=\"country_geom\")\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
+ "fig, ax = plt.subplots(1, 1)\n",
+ "divider = make_axes_locatable(ax)\n",
+ "cax = divider.append_axes(\"right\", size=\"5%\", pad=0.1)\n",
+ "\n",
+ "gdf.plot(\n",
+ " column=\"AirportCount\",\n",
+ " legend=True,\n",
+ " cmap='OrRd',\n",
+ " cax=cax,\n",
+ " ax=ax\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "apache-sedona",
+ "language": "python",
+ "name": "apache-sedona"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/binder/Pipfile b/binder/Pipfile
index 8c6ba1c..da15fc1 100644
--- a/binder/Pipfile
+++ b/binder/Pipfile
@@ -17,6 +17,8 @@ pyspark="==3.0.1"
attrs="*"
ipykernel = "*"
apache-sedona="==1.0.0"
+matplotlib = "*"
+descartes = "*"
[requires]
python_version = "3.7"
diff --git a/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.cpg b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.cpg
new file mode 100644
index 0000000..3ad133c
--- /dev/null
+++ b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.cpg
@@ -0,0 +1 @@
+UTF-8
\ No newline at end of file
diff --git a/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.dbf b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.dbf
new file mode 100644
index 0000000..03caa76
Binary files /dev/null and b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.dbf differ
diff --git a/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.prj b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.prj
new file mode 100644
index 0000000..a30c00a
--- /dev/null
+++ b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.prj
@@ -0,0 +1 @@
+GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]
\ No newline at end of file
diff --git a/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.shp b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.shp
new file mode 100644
index 0000000..e630c6f
Binary files /dev/null and b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.shp differ
diff --git a/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.shx b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.shx
new file mode 100644
index 0000000..76a0434
Binary files /dev/null and b/binder/data/ne_50m_admin_0_countries_lakes/ne_50m_admin_0_countries_lakes.shx differ
diff --git a/binder/data/ne_50m_airports/ne_50m_airports.dbf b/binder/data/ne_50m_airports/ne_50m_airports.dbf
new file mode 100644
index 0000000..bdf3005
Binary files /dev/null and b/binder/data/ne_50m_airports/ne_50m_airports.dbf differ
diff --git a/binder/data/ne_50m_airports/ne_50m_airports.prj b/binder/data/ne_50m_airports/ne_50m_airports.prj
new file mode 100644
index 0000000..b13a717
--- /dev/null
+++ b/binder/data/ne_50m_airports/ne_50m_airports.prj
@@ -0,0 +1 @@
+GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.017453292519943295]]
\ No newline at end of file
diff --git a/binder/data/ne_50m_airports/ne_50m_airports.shp b/binder/data/ne_50m_airports/ne_50m_airports.shp
new file mode 100644
index 0000000..d1417b2
Binary files /dev/null and b/binder/data/ne_50m_airports/ne_50m_airports.shp differ
diff --git a/binder/data/ne_50m_airports/ne_50m_airports.shx b/binder/data/ne_50m_airports/ne_50m_airports.shx
new file mode 100644
index 0000000..b99c0fc
Binary files /dev/null and b/binder/data/ne_50m_airports/ne_50m_airports.shx differ