Posted to commits@sedona.apache.org by ji...@apache.org on 2023/05/21 22:02:34 UTC

[sedona] branch master updated: [SEDONA-282] R raster write function (#838)

This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git


The following commit(s) were added to refs/heads/master by this push:
     new cd74c0df [SEDONA-282] R raster write function (#838)
cd74c0df is described below

commit cd74c0dfac7ddaf05c9f38c84e55021bcfb4eb1f
Author: gregleleu <33...@users.noreply.github.com>
AuthorDate: Sun May 21 18:02:29 2023 -0400

    [SEDONA-282] R raster write function (#838)
---
 R/DESCRIPTION                                 |   2 +-
 R/NAMESPACE                                   |   1 +
 R/R/data_interface.R                          |  28 ++++-
 R/man/spark_write_geojson.Rd                  |  13 +-
 R/tests/testthat/helper-initialize.R          |   2 +-
 R/tests/testthat/test-data-interface-raster.R | 172 +++++++++++++++++++++++++-
 R/vignettes/articles/raster.Rmd               |  53 ++++++--
 7 files changed, 258 insertions(+), 13 deletions(-)

diff --git a/R/DESCRIPTION b/R/DESCRIPTION
index f75b659b..8fd8b9a0 100644
--- a/R/DESCRIPTION
+++ b/R/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: apache.sedona
 Title: R Interface for Apache Sedona
-Version: 1.4.0
+Version: 1.4.0.9000
 Authors@R:
     c(person(family = "Apache Sedona",
              role = c("aut", "cre"),
diff --git a/R/NAMESPACE b/R/NAMESPACE
index 6d06c31f..16194ad4 100644
--- a/R/NAMESPACE
+++ b/R/NAMESPACE
@@ -33,6 +33,7 @@ export(spark_read_shapefile)
 export(spark_write_geojson)
 export(spark_write_geoparquet)
 export(spark_write_geotiff)
+export(spark_write_raster)
 export(to_spatial_rdd)
 import(sparklyr)
 importFrom(cli,cli_alert_info)
diff --git a/R/R/data_interface.R b/R/R/data_interface.R
index 4d68a514..a7cbfdfa 100644
--- a/R/R/data_interface.R
+++ b/R/R/data_interface.R
@@ -630,7 +630,8 @@ sedona_save_spatial_rdd <- function(x,
 #' 
 #' * `spark_write_geojson`: to GeoJSON
 #' * `spark_write_geoparquet`: to GeoParquet
-#' * `spark_write_geotiff`: to GeoTiff
+#' * `spark_write_geotiff`: to GeoTiff from Array\[Double\] rasters 
+#' * `spark_write_raster`: to raster tiles after using RS output functions (`RS_AsXXX`) 
 #'
 #'
 #' @param path The path to the file. Needs to be accessible from the cluster.
@@ -737,6 +738,31 @@ spark_write_geotiff <- function(x,
   
 }
 
+
+#' @export
+#' @rdname spark_write_geojson
+#' @importFrom sparklyr spark_write_source
+spark_write_raster <- function(x,
+                                   path,
+                                   mode = NULL,
+                                   options = list(),
+                                   partition_by = NULL,
+                                   ...) {
+  
+  spark_write_source(
+    x = x,
+    source = "raster",
+    mode = mode,
+    options = options,
+    partition_by = partition_by,
+    save_args = list(path),
+    ...
+  )
+  
+}
+
+
+
 # ------- Utilities ------------
 rdd_cls_from_type <- function(type = c("point", "polygon", "linestring")) {
   type <- match.arg(type)
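
For orientation, here is a minimal end-to-end sketch of the writer defined above. This is not part of the commit: it assumes a live sparklyr connection and a local directory of GeoTiff files, and the paths and table name are illustrative.

```r
library(sparklyr)
library(dplyr)
library(apache.sedona)

sc <- spark_connect(master = "local")

# Read raw GeoTiffs as a binary DataFrame, decode to RasterUDT,
# then re-encode the rasters to GeoTiff bytes in the `content` column.
rasters_sdf <- spark_read_binary(sc, dir = "/tmp/geotiffs_in", name = "rasters") %>%
  mutate(raster = RS_FromGeoTiff(content)) %>%
  mutate(content = RS_AsGeoTiff(raster))

# Write the binary column back out through Sedona's "raster" data source.
rasters_sdf %>%
  spark_write_raster(path = "/tmp/geotiffs_out")
```
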
diff --git a/R/man/spark_write_geojson.Rd b/R/man/spark_write_geojson.Rd
index 04c13eb0..aff5116f 100644
--- a/R/man/spark_write_geojson.Rd
+++ b/R/man/spark_write_geojson.Rd
@@ -4,6 +4,7 @@
 \alias{spark_write_geojson}
 \alias{spark_write_geoparquet}
 \alias{spark_write_geotiff}
+\alias{spark_write_raster}
 \title{Write geospatial data from a Spark DataFrame.}
 \usage{
 spark_write_geojson(
@@ -32,6 +33,15 @@ spark_write_geotiff(
   partition_by = NULL,
   ...
 )
+
+spark_write_raster(
+  x,
+  path,
+  mode = NULL,
+  options = list(),
+  partition_by = NULL,
+  ...
+)
 }
 \arguments{
 \item{x}{A Spark DataFrame or dplyr operation}
@@ -57,7 +67,8 @@ Functions to write geospatial data into a variety of formats from Spark DataFram
 \itemize{
 \item \code{spark_write_geojson}: to GeoJSON
 \item \code{spark_write_geoparquet}: to GeoParquet
-\item \code{spark_write_geotiff}: to GeoTiff
+\item \code{spark_write_geotiff}: to GeoTiff from Array[Double] rasters
+\item \code{spark_write_raster}: to raster tiles after using RS output functions (\code{RS_AsXXX})
 }
 }
 \examples{
diff --git a/R/tests/testthat/helper-initialize.R b/R/tests/testthat/helper-initialize.R
index b399345a..7104e722 100644
--- a/R/tests/testthat/helper-initialize.R
+++ b/R/tests/testthat/helper-initialize.R
@@ -21,7 +21,7 @@ testthat_spark_connection <- function(conn_retry_interval_s = 2) {
     version <- Sys.getenv("SPARK_VERSION")
     hadoop_version <- Sys.getenv("HADOOP_VERSION")
     spark_installed <- spark_installed_versions()
-    if (nrow(spark_installed[spark_installed$spark == version & spark_installed$hadoop_version == hadoop_version, ]) == 0) {
+    if (nrow(spark_installed[spark_installed$spark == version & spark_installed$hadoop == hadoop_version, ]) == 0) {
       spark_install(version, hadoop_version)
     }
 
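The one-line helper fix above is worth a note: `sparklyr::spark_installed_versions()` returns a data frame whose columns include `spark` and `hadoop`, not `hadoop_version`. Since `installed$hadoop_version` is `NULL` for a column that does not exist, the old filter always matched zero rows, so the guard never found an existing install and Spark was reinstalled on every run. A quick sketch of the corrected check (the version strings are illustrative):

```r
library(sparklyr)

installed <- spark_installed_versions()
names(installed)  # includes "spark" and "hadoop"; there is no "hadoop_version" column

# The corrected guard, as in the patched helper: install only when the
# requested spark/hadoop combination is not already present.
if (nrow(installed[installed$spark == "3.4.0" & installed$hadoop == "3", ]) == 0) {
  spark_install("3.4.0", "3")
}
```
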
diff --git a/R/tests/testthat/test-data-interface-raster.R b/R/tests/testthat/test-data-interface-raster.R
index 35f0ffb7..fef1aaf2 100644
--- a/R/tests/testthat/test-data-interface-raster.R
+++ b/R/tests/testthat/test-data-interface-raster.R
@@ -579,7 +579,7 @@ test_that("Should Pass geotiff file writing with handling invalid schema", {
 })
 
 
-# Binary and RS_functions  -----------------
+# Read Binary and RS_functions  -----------------
 # Only functions related to reading
 
 test_that("Passed RS_FromGeoTiff from binary", {
@@ -765,3 +765,173 @@ test_that("Passed RS_Values with raster", {
   sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name))
   rm(a)
 })
+
+
+# Write Binary and RS_functions  -----------------
+test_that("Should read geotiff using binary source and write geotiff back to disk using raster source", {
+  
+  ## Load
+  sdf_name <- random_string("spatial_sdf")
+  binary_sdf <- spark_read_binary(sc, dir = test_data("raster"), name = sdf_name)
+  
+  tmp_dest <- tempfile()
+  
+  binary_sdf %>% 
+    spark_write_raster(path = tmp_dest)
+  
+  sdf_name_2 <- random_string("spatial_sdf_2")
+  binary_2_sdf <- spark_read_binary(sc, dir = tmp_dest, name = sdf_name_2, recursive_file_lookup = TRUE)
+  
+  expect_equal(
+    sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM ? LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name)),
+    sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM (SELECT RS_FromGeoTiff(content) as raster FROM ?) LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name_2))
+  )
+  
+  ## Cleanup
+  sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name))
+  sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name_2))
+  rm(binary_sdf, binary_2_sdf, tmp_dest)
+  
+})
+
+test_that("Should read and write geotiff using given options", {
+  
+  ## Load
+  sdf_name <- random_string("spatial_sdf")
+  binary_sdf <- spark_read_binary(sc, dir = test_data("raster"), name = sdf_name)
+  
+  tmp_dest <- tempfile()
+  
+  binary_sdf %>% 
+    spark_write_raster(path = tmp_dest, 
+                       options = list("rasterField" = "content", 
+                                      "fileExtension" = ".tiff",
+                                      "pathField" = "path"
+                                      ))
+  
+  sdf_name_2 <- random_string("spatial_sdf_2")
+  binary_2_sdf <- spark_read_binary(sc, dir = tmp_dest, name = sdf_name_2, recursive_file_lookup = TRUE)
+  
+  
+  expect_equal(
+    sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM ? LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name)),
+    sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM (SELECT RS_FromGeoTiff(content) as raster FROM ?) LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name_2))
+  )
+  
+  ## Cleanup
+  sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name))
+  sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name_2))
+  rm(binary_sdf, binary_2_sdf, tmp_dest)
+  
+})
+
+test_that("Should read and write via RS_FromGeoTiff and RS_AsGeoTiff", {
+  
+  
+  ## Load
+  sdf_name <- random_string("spatial_sdf")
+  binary_sdf <- spark_read_binary(sc, dir = test_data("raster"), name = sdf_name)
+  
+  raster_sdf <- 
+    binary_sdf %>% 
+    mutate(raster = RS_FromGeoTiff(content)) %>% 
+    mutate(content = RS_AsGeoTiff(raster))
+  
+  
+  tmp_dest <- tempfile()
+  
+  raster_sdf %>% 
+    spark_write_raster(path = tmp_dest, 
+                       options = list("rasterField" = "content", 
+                                      "fileExtension" = ".tiff",
+                                      "pathField" = "path"
+                       ))
+  
+  sdf_name_2 <- random_string("spatial_sdf_2")
+  binary_2_sdf <- spark_read_binary(sc, dir = tmp_dest, name = sdf_name_2, recursive_file_lookup = TRUE)
+  
+  
+  expect_equal(
+    sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM ? LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name)),
+    sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM (SELECT RS_FromGeoTiff(content) as raster FROM ?) LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name_2))
+  )
+  
+  ## Cleanup
+  sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name))
+  sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name_2))
+  rm(raster_sdf, binary_sdf, binary_2_sdf, tmp_dest)
+  
+})
+
+test_that("Should handle null", {
+  
+  ## Load
+  sdf_name <- random_string("spatial_sdf")
+  binary_sdf <- spark_read_binary(sc, dir = test_data("raster"), name = sdf_name)
+  
+  raster_sdf <- 
+    binary_sdf %>% 
+    mutate(raster = RS_FromGeoTiff(NULL)) %>% 
+    mutate(content = RS_AsGeoTiff(raster))
+  
+  tmp_dest <- tempfile()
+  
+  raster_sdf %>% 
+    spark_write_raster(path = tmp_dest)
+  
+  sdf_name_2 <- random_string("spatial_sdf_2")
+  binary_2_sdf <- spark_read_binary(sc, dir = tmp_dest, name = sdf_name_2, recursive_file_lookup = TRUE)
+  
+  out <- sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM (SELECT RS_FromGeoTiff(content) as raster FROM ?) LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name_2))
+  
+  expect_equal(
+    out$n,
+    0
+  )
+  
+  ## Cleanup
+  sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name))
+  sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name_2))
+  rm(raster_sdf, binary_sdf, binary_2_sdf, tmp_dest)
+  
+})
+
+test_that("Should read RS_FromGeoTiff and write RS_AsArcGrid", {
+  
+  ## Load
+  sdf_name <- random_string("spatial_sdf")
+  binary_sdf <- spark_read_binary(sc, dir = test_data("raster"), name = sdf_name)
+  
+  raster_sdf <- 
+    binary_sdf %>% 
+    mutate(raster = RS_FromGeoTiff(content)) %>% 
+    mutate(content = RS_AsArcGrid(raster)) %>% 
+    sdf_register()
+  
+  tmp_dest <- tempfile()
+  
+  raster_sdf %>% 
+    spark_write_raster(path = tmp_dest, 
+                       options = list("rasterField" = "content", 
+                                      "fileExtension" = ".asc",
+                                      "pathField" = "path"
+                       ))
+  
+  sdf_name_2 <- random_string("spatial_sdf_2")
+  binary_2_sdf <- spark_read_binary(sc, dir = tmp_dest, name = sdf_name_2, recursive_file_lookup = TRUE)
+  
+  expect_equal(
+    sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM ? LIMIT 1", DBI::dbQuoteIdentifier(sc, dbplyr::remote_name(raster_sdf))),
+    sc %>% DBI::dbGetQuery("SELECT count(*) as n FROM (SELECT RS_FromGeoTiff(content) as raster FROM ?) LIMIT 1", DBI::dbQuoteIdentifier(sc, sdf_name_2))
+  )
+  
+  ## Cleanup
+  sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name))
+  sc %>% DBI::dbExecute(paste0("DROP TABLE ", sdf_name_2))
+  rm(raster_sdf, binary_sdf, binary_2_sdf, tmp_dest)
+  
+})
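
One behavior the new tests pin down is worth highlighting: rows whose raster binary is `NULL` are skipped by the writer rather than raising an error. A minimal sketch of that check, reusing the same helpers the tests rely on (`sc`, `test_data`, and `random_string` are assumed from the test setup):

```r
library(dplyr)

# Force every raster (and therefore every binary) to NULL.
null_sdf <- spark_read_binary(sc, dir = test_data("raster"), name = random_string("nulls")) %>%
  mutate(raster = RS_FromGeoTiff(NULL)) %>%
  mutate(content = RS_AsGeoTiff(raster))

dest <- tempfile()
null_sdf %>% spark_write_raster(path = dest)

# No raster files are produced for NULL content, so re-reading yields zero rows.
reread <- spark_read_binary(sc, dir = dest, name = random_string("reread"),
                            recursive_file_lookup = TRUE)
sdf_nrow(reread)  # expected: 0
```
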
diff --git a/R/vignettes/articles/raster.Rmd b/R/vignettes/articles/raster.Rmd
index e1553e34..8576e2b8 100644
--- a/R/vignettes/articles/raster.Rmd
+++ b/R/vignettes/articles/raster.Rmd
@@ -16,10 +16,11 @@ knitr::opts_chunk$set(
 ```
 
 
-Raster data in GeoTiff and ArcInfoAsciiGrid formats can be read into Spark.
+Raster data in GeoTiff and ArcInfoAsciiGrid formats can be read into Spark and written back out.
 
 # Using the RasterUDT
 
+## Read
 Raster data in GeoTiff and ArcInfo Grid format can be loaded directly into Spark using the `sparklyr::spark_read_binary` and Sedona constructors `RS_FromGeoTiff` and `RS_FromArcInfoAsciiGrid`.
 
 ```{r include=FALSE}
@@ -44,13 +45,13 @@ raster
 raster %>% sdf_schema()
 ```
 
- Once the data is loaded, raster functions are available in dplyr workflows:
- 
- * [Raster operators](../../../api/sql/Raster-operators/)
- * [Raster input and output](../../../api/sql/Raster-loader/)
- 
+Once the data is loaded, raster functions are available in dplyr workflows:
+
+* [Raster operators](../../../api/sql/Raster-operators/)
+* [Raster input and output](../../../api/sql/Raster-loader/)
+
 Functions taking in `raster: Raster` arguments are meant to be used with data loaded with this reader, such as `RS_Value`, `RS_Values`, `RS_Envelope`. Functions taking in `Band: Array[Double]` arguments work with data loaded using the Sedona Geotiff DataFrame loader (see [below](#Using the Sedona Geotiff Dataframe Loader)).
- 
+
 
 For example, getting the number of bands:
 ```{r}
@@ -86,6 +87,42 @@ raster %>%
 ```
 
 
+## Write
+
+
+To write a Sedona Raster DataFrame to raster files, first convert it to a binary DataFrame with the `RS_AsXXX` functions, then write the binary DataFrame out with Sedona's built-in `raster` data source.
+
+To write the binary DataFrame to external storage, use the `spark_write_raster` function:
+
+```{r}
+dest_file <- tempfile()
+raster %>% 
+  mutate(content = RS_AsGeoTiff(raster)) %>% 
+  spark_write_raster(path = dest_file)
+
+dir(dest_file, recursive = TRUE)
+```
+
+
+For the available options, see [Raster writer](../../../api/sql/Raster-writer/):
+
+* rasterField: the binary column to save (if there is only one binary column, it is used by default; otherwise, specify one)
+* fileExtension: `.tiff` by default; also accepts `.png`, `.jpeg`, `.asc`
+* pathField: a column whose values determine the output path of each raster file; if omitted, random UUIDs are generated
+
+```{r}
+dest_file <- tempfile()
+raster %>% 
+  mutate(content = RS_AsArcGrid(raster)) %>% 
+  spark_write_raster(path = dest_file, 
+                     options = list("rasterField" = "content", 
+                                    "fileExtension" = ".asc",
+                                    "pathField" = "path"
+                     ))
+
+dir(dest_file, recursive = TRUE)
+```
+
 
 # Using the Sedona Geotiff Dataframe Loader
 
@@ -96,7 +133,7 @@ data_tbl
 
 ## Using a direct SQL query: results are collected directly
 sc %>% 
-    DBI::dbGetQuery("SELECT 
+  DBI::dbGetQuery("SELECT 
              image.geometry as Geom, 
              image.height as height, 
              image.width as width,