You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sedona.apache.org by ji...@apache.org on 2023/05/21 22:02:34 UTC
[sedona] branch master updated: [SEDONA-282] R raster write function (#838)
This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new cd74c0df [SEDONA-282] R raster write function (#838)
cd74c0df is described below
commit cd74c0dfac7ddaf05c9f38c84e55021bcfb4eb1f
Author: gregleleu <33...@users.noreply.github.com>
AuthorDate: Sun May 21 18:02:29 2023 -0400
[SEDONA-282] R raster write function (#838)
---
R/DESCRIPTION | 2 +-
R/NAMESPACE | 1 +
R/R/data_interface.R | 28 ++++-
R/man/spark_write_geojson.Rd | 13 +-
R/tests/testthat/helper-initialize.R | 2 +-
R/tests/testthat/test-data-interface-raster.R | 172 +++++++++++++++++++++++++-
R/vignettes/articles/raster.Rmd | 53 ++++++--
7 files changed, 258 insertions(+), 13 deletions(-)
diff --git a/R/DESCRIPTION b/R/DESCRIPTION
index f75b659b..8fd8b9a0 100644
--- a/R/DESCRIPTION
+++ b/R/DESCRIPTION
@@ -1,7 +1,7 @@
Type: Package
Package: apache.sedona
Title: R Interface for Apache Sedona
-Version: 1.4.0
+Version: 1.4.0.9000
Authors@R:
c(person(family = "Apache Sedona",
role = c("aut", "cre"),
diff --git a/R/NAMESPACE b/R/NAMESPACE
index 6d06c31f..16194ad4 100644
--- a/R/NAMESPACE
+++ b/R/NAMESPACE
@@ -33,6 +33,7 @@ export(spark_read_shapefile)
export(spark_write_geojson)
export(spark_write_geoparquet)
export(spark_write_geotiff)
+export(spark_write_raster)
export(to_spatial_rdd)
import(sparklyr)
importFrom(cli,cli_alert_info)
diff --git a/R/R/data_interface.R b/R/R/data_interface.R
index 4d68a514..a7cbfdfa 100644
--- a/R/R/data_interface.R
+++ b/R/R/data_interface.R
@@ -630,7 +630,8 @@ sedona_save_spatial_rdd <- function(x,
#'
#' * `spark_write_geojson`: to GeoJSON
#' * `spark_write_geoparquet`: to GeoParquet
-#' * `spark_write_geotiff`: to GeoTiff
+#' * `spark_write_geotiff`: to GeoTiff from Array\[Double\] rasters
+#' * `spark_write_raster`: to raster tiles after using RS output functions (`RS_AsXXX`)
#'
#'
#' @param path The path to the file. Needs to be accessible from the cluster.
@@ -737,6 +738,31 @@ spark_write_geotiff <- function(x,
}
+
+#' @export
+#' @rdname spark_write_geojson
+#' @importFrom sparklyr spark_write_source
+spark_write_raster <- function(x,
+ path,
+ mode = NULL,
+ options = list(),
+ partition_by = NULL,
+ ...) {
+
+ spark_write_source(
+ x = x,
+ source = "raster",
+ mode = mode,
+ options = options,
+ partition_by = partition_by,
+ save_args = list(path),
+ ...
+ )
+
+}
+
+
+
# ------- Utilities ------------
rdd_cls_from_type <- function(type = c("point", "polygon", "linestring")) {
type <- match.arg(type)
diff --git a/R/man/spark_write_geojson.Rd b/R/man/spark_write_geojson.Rd
index 04c13eb0..aff5116f 100644
--- a/R/man/spark_write_geojson.Rd
+++ b/R/man/spark_write_geojson.Rd
@@ -4,6 +4,7 @@
\alias{spark_write_geojson}
\alias{spark_write_geoparquet}
\alias{spark_write_geotiff}
+\alias{spark_write_raster}
\title{Write geospatial data from a Spark DataFrame.}
\usage{
spark_write_geojson(
@@ -32,6 +33,15 @@ spark_write_geotiff(
partition_by = NULL,
...
)
+
+spark_write_raster(
+ x,
+ path,
+ mode = NULL,
+ options = list(),
+ partition_by = NULL,
+ ...
+)
}
\arguments{
\item{x}{A Spark DataFrame or dplyr operation}
@@ -57,7 +67,8 @@ Functions to write geospatial data into a variety of formats from Spark DataFram
\itemize{
\item \code{spark_write_geojson}: to GeoJSON
\item \code{spark_write_geoparquet}: to GeoParquet
-\item \code{spark_write_geotiff}: to GeoTiff
+\item \code{spark_write_geotiff}: to GeoTiff from Array[Double] rasters
+\item \code{spark_write_raster}: to raster tiles after using RS output functions (\code{RS_AsXXX})
}
}
\examples{
diff --git a/R/tests/testthat/helper-initialize.R b/R/tests/testthat/helper-initialize.R
index b399345a..7104e722 100644
--- a/R/tests/testthat/helper-initialize.R
+++ b/R/tests/testthat/helper-initialize.R
@@ -21,7 +21,7 @@ testthat_spark_connection <- function(conn_retry_interval_s = 2) {
version <- Sys.getenv("SPARK_VERSION")
hadoop_version <- Sys.getenv("HADOOP_VERSION")
spark_installed <- spark_installed_versions()
- if (nrow(spark_installed[spark_installed$spark == version & spark_installed$hadoop_version == hadoop_version, ]) == 0) {
+ if (nrow(spark_installed[spark_installed$spark == version & spark_installed$hadoop == hadoop_version, ]) == 0) {
spark_install(version, hadoop_version)
}
diff --git a/R/tests/testthat/test-data-interface-raster.R b/R/tests/testthat/test-data-interface-raster.R
index 35f0ffb7..fef1aaf2 100644
--- a/R/tests/testthat/test-data-interface-raster.R
+++ b/R/tests/testthat/test-data-interface-raster.R
@@ -579,7 +579,7 @@ test_that("Should Pass geotiff file writing with handling invalid schema", {
})
-# Binary and RS_functions -----------------
+# Read Binary and RS_functions -----------------
# Only functions related to reading
test_that("Passed RS_FromGeoTiff from binary", {
@@ -765,3 +765,173 @@ test_that("Passed RS_Values with raster", {
sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name))
rm(a)
})
+
+
+# Write Binary and RS_functions -----------------
+test_that("Should read geotiff using binary source and write geotiff back to disk using raster source", {
+
+ ## Load
+ sdf_name <- random_string("spatial_sdf")
+ binary_sdf <- spark_read_binary(sc, dir = test_data("raster"), name = sdf_name)
+
+ tmp_dest <- tempfile()
+
+ binary_sdf %>%
+ spark_write_raster(path = tmp_dest)
+
+ sdf_name_2 <- random_string("spatial_sdf_2")
+ binary_2_sdf <- spark_read_binary(sc, dir = tmp_dest, name = sdf_name_2, recursive_file_lookup = TRUE)
+
+ expect_equal(
+ sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM ? LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name)),
+ sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM (SELECT RS_FromGeoTiff(content) as raster FROM ?) LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name_2))
+ )
+
+ ## Cleanup
+ sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name))
+ sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name_2))
+ rm(binary_sdf, binary_2_sdf, tmp_dest)
+
+
+
+})
+
+test_that("Should read and write geotiff using given options", {
+
+ ## Load
+ sdf_name <- random_string("spatial_sdf")
+ binary_sdf <- spark_read_binary(sc, dir = test_data("raster"), name = sdf_name)
+
+ tmp_dest <- tempfile()
+
+ binary_sdf %>%
+ spark_write_raster(path = tmp_dest,
+ options = list("rasterField" = "content",
+ "fileExtension" = ".tiff",
+ "pathField" = "path"
+ ))
+
+ sdf_name_2 <- random_string("spatial_sdf_2")
+ binary_2_sdf <- spark_read_binary(sc, dir = tmp_dest, name = sdf_name_2, recursive_file_lookup = TRUE)
+
+
+ expect_equal(
+ sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM ? LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name)),
+ sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM (SELECT RS_FromGeoTiff(content) as raster FROM ?) LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name_2))
+ )
+
+ ## Cleanup
+ sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name))
+ sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name_2))
+ rm(binary_sdf, binary_2_sdf, tmp_dest)
+
+})
+
+test_that("Should read and write via RS_FromGeoTiff and RS_AsGeoTiff", {
+
+
+ ## Load
+ sdf_name <- random_string("spatial_sdf")
+ binary_sdf <- spark_read_binary(sc, dir = test_data("raster"), name = sdf_name)
+
+ raster_sdf <-
+ binary_sdf %>%
+ mutate(raster = RS_FromGeoTiff(content)) %>%
+ mutate(content = RS_AsGeoTiff(raster))
+
+
+ tmp_dest <- tempfile()
+
+ raster_sdf %>%
+ spark_write_raster(path = tmp_dest,
+ options = list("rasterField" = "content",
+ "fileExtension" = ".tiff",
+ "pathField" = "path"
+ ))
+
+ sdf_name_2 <- random_string("spatial_sdf_2")
+ binary_2_sdf <- spark_read_binary(sc, dir = tmp_dest, name = sdf_name_2, recursive_file_lookup = TRUE)
+
+
+ expect_equal(
+ sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM ? LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name)),
+ sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM (SELECT RS_FromGeoTiff(content) as raster FROM ?) LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name_2))
+ )
+
+ ## Cleanup
+ sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name))
+ sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name_2))
+ rm(raster_sdf, binary_sdf, binary_2_sdf, tmp_dest)
+
+})
+
+test_that("Should handle null", {
+
+ ## Load
+ sdf_name <- random_string("spatial_sdf")
+ binary_sdf <- spark_read_binary(sc, dir = test_data("raster"), name = sdf_name)
+
+ raster_sdf <-
+ binary_sdf %>%
+ mutate(raster = RS_FromGeoTiff(NULL)) %>%
+ mutate(content = RS_AsGeoTiff(raster))
+
+ tmp_dest <- tempfile()
+
+ raster_sdf %>%
+ spark_write_raster(path = tmp_dest)
+
+ sdf_name_2 <- random_string("spatial_sdf_2")
+ binary_2_sdf <- spark_read_binary(sc, dir = tmp_dest, name = sdf_name_2, recursive_file_lookup = TRUE)
+
+ out <- sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM (SELECT RS_FromGeoTiff(content) as raster FROM ?) LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name_2))
+
+ expect_equal(
+ out$n,
+ 0
+ )
+
+ ## Cleanup
+ sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name))
+ sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name_2))
+ rm(raster_sdf, binary_sdf, binary_2_sdf, tmp_dest)
+
+})
+
+test_that("Should read RS_FromGeoTiff and write RS_AsArcGrid", {
+
+ ## Load
+ sdf_name <- random_string("spatial_sdf")
+ binary_sdf <- spark_read_binary(sc, dir = test_data("raster"), name = sdf_name)
+
+ raster_sdf <-
+ binary_sdf %>%
+ mutate(raster = RS_FromGeoTiff(content)) %>%
+ mutate(content = RS_AsArcGrid(raster)) %>%
+ sdf_register()
+
+ tmp_dest <- tempfile()
+
+ raster_sdf %>%
+ spark_write_raster(path = tmp_dest,
+ options = list("rasterField" = "content",
+ "fileExtension" = ".asc",
+ "pathField" = "path"
+ ))
+
+ sdf_name_2 <- random_string("spatial_sdf_2")
+ binary_2_sdf <- spark_read_binary(sc, dir = tmp_dest, name = sdf_name_2, recursive_file_lookup = TRUE)
+
+
+
+ expect_equal(
+ sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM ? LIMIT 1", DBI::dbQuoteIdentifier(sc, dbplyr::remote_name(raster_sdf))),
+ sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM (SELECT RS_FromGeoTiff(content) as raster FROM ?) LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name_2))
+ )
+
+ ## Cleanup
+ sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name))
+ sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name_2))
+ rm(raster_sdf, binary_sdf, binary_2_sdf, tmp_dest)
+
+})
diff --git a/R/vignettes/articles/raster.Rmd b/R/vignettes/articles/raster.Rmd
index e1553e34..8576e2b8 100644
--- a/R/vignettes/articles/raster.Rmd
+++ b/R/vignettes/articles/raster.Rmd
@@ -16,10 +16,11 @@ knitr::opts_chunk$set(
```
-Raster data in GeoTiff and ArcInfoAsciiGrid formats can be read into Spark.
+Raster data in GeoTiff and ArcInfoAsciiGrid formats can be read into and written from Spark.
# Using the RasterUDT
+## Read
Raster data in GeoTiff and ArcInfo Grid format can be loaded directly into Spark using the `sparklyr::spark_read_binary` and Sedona constructors `RS_FromGeoTiff` and `RS_FromArcInfoAsciiGrid`.
```{r include=FALSE}
@@ -44,13 +45,13 @@ raster
raster %>% sdf_schema()
```
- Once the data is loaded, raster functions are available in dplyr workflows:
-
- * [Raster operators](../../../api/sql/Raster-operators/)
- * [Raster input and output](../../../api/sql/Raster-loader/)
-
+Once the data is loaded, raster functions are available in dplyr workflows:
+
+* [Raster operators](../../../api/sql/Raster-operators/)
+* [Raster input and output](../../../api/sql/Raster-loader/)
+
Functions taking in `raster: Raster` arguments are meant to be used with data loaded with this reader, such as `RS_Value`, `RS_Values`, `RS_Envelope`. Functions taking in `Band: Array[Double]` arguments work with data loaded using the Sedona Geotiff DataFrame loader (see [below](#Using the Sedona Geotiff Dataframe Loader)).
-
+
For example, getting the number of bands:
```{r}
@@ -86,6 +87,42 @@ raster %>%
```
+## Write
+
+
+To write a Sedona Raster DataFrame to raster files, you need to (1) first convert the Raster DataFrame to a binary DataFrame using `RS_AsXXX` functions and (2) then write the binary DataFrame to raster files using Sedona's built-in `raster` data source.
+
+To write a Sedona binary DataFrame to external storage using Sedona's built-in `raster` data source, use the `spark_write_raster` function:
+
+```{r}
+dest_file <- tempfile()
+raster %>%
+ mutate(content = RS_AsGeoTiff(raster)) %>%
+ spark_write_raster(path = dest_file)
+
+dir(dest_file, recursive = TRUE)
+```
+
+
+Available options see [Raster writer](../../../api/sql/Raster-writer/):
+
+* rasterField: the binary column to be saved (if there is only one takes that column by default, otherwise specify)
+* fileExtension: `.tiff` by default, also accepts `.png`, `.jpeg`, `.asc`
+* pathField: optionally, the name of a column holding the output path for each raster file; if omitted, random UUIDs are generated.
+
+```{r}
+dest_file <- tempfile()
+raster %>%
+ mutate(content = RS_AsArcGrid(raster)) %>%
+ spark_write_raster(path = dest_file,
+ options = list("rasterField" = "content",
+ "fileExtension" = ".asc",
+ "pathField" = "path"
+ ))
+
+dir(dest_file, recursive = TRUE)
+```
+
# Using the Sedona Geotiff Dataframe Loader
@@ -96,7 +133,7 @@ data_tbl
## Using a direct SQL query: results are collected directly
sc %>%
- DBI::dbGetQuery("SELECT
+ DBI::dbGetQuery("SELECT
image.geometry as Geom,
image.height as height,
image.width as width,