You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sedona.apache.org by ji...@apache.org on 2022/04/06 18:35:50 UTC

[incubator-sedona] branch master updated: [SEDONA-96] Refactor ST_MakeValid to use GeometryFixer. (#596)

This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-sedona.git


The following commit(s) were added to refs/heads/master by this push:
     new 46fc348e [SEDONA-96] Refactor ST_MakeValid to use GeometryFixer. (#596)
46fc348e is described below

commit 46fc348e41db58a51eeb70c1c7e87c0f95abb1fa
Author: Martin Andersson <u....@gmail.com>
AuthorDate: Wed Apr 6 20:35:44 2022 +0200

    [SEDONA-96] Refactor ST_MakeValid to use GeometryFixer. (#596)
---
 docs/api/sql/Function.md                           | 35 ++++++++----
 pom.xml                                            |  2 +-
 .../sql/sedona_sql/expressions/Functions.scala     | 56 ++++++++-----------
 .../org/apache/sedona/sql/functionTestScala.scala  | 63 ++++++++--------------
 4 files changed, 70 insertions(+), 86 deletions(-)

diff --git a/docs/api/sql/Function.md b/docs/api/sql/Function.md
index 7085cdee..71b2da8a 100644
--- a/docs/api/sql/Function.md
+++ b/docs/api/sql/Function.md
@@ -162,27 +162,40 @@ FROM polygondf
 
 ## ST_MakeValid
 
-Introduction: Given an invalid polygon or multipolygon and removeHoles boolean flag,
- create a valid representation of the geometry.
+Introduction: Given an invalid geometry, create a valid representation of the geometry.
 
-Format: `ST_MakeValid (A:geometry, removeHoles:Boolean)`
+Collapsed geometries are either converted to empty (keepCollaped=true) or a valid geometry of lower dimension (keepCollapsed=false).
+Default is keepCollapsed=false.
+
+Format: `ST_MakeValid (A:geometry)`
+
+Format: `ST_MakeValid (A:geometry, keepCollapsed:Boolean)`
 
 Since: `v1.0.0`
 
 Spark SQL example:
 
 ```SQL
-SELECT geometryValid.polygon
-FROM table
-LATERAL VIEW ST_MakeValid(polygon, false) geometryValid AS polygon
+WITH linestring AS (
+    SELECT ST_GeomFromWKT('LINESTRING(1 1, 1 1)') AS geom
+) SELECT ST_MakeValid(geom), ST_MakeValid(geom, true) FROM linestring
+```
+
+Result:
+```
++------------------+------------------------+
+|st_makevalid(geom)|st_makevalid(geom, true)|
++------------------+------------------------+
+|  LINESTRING EMPTY|             POINT (1 1)|
++------------------+------------------------+
 ```
 
 !!!note
-    Might return multiple polygons from a only one invalid polygon
-    That's the reason why we need to use the LATERAL VIEW expression
-    
-!!!note
-    Throws an exception if the geometry isn't polygon or multipolygon
+    In Sedona up to and including version 1.2 the behaviour of ST_MakeValid was different.
+    Be sure to check you code when upgrading. 
+    The previous implementation only worked for (multi)polygons and had a different interpretation of the second, boolean, argument.
+    It would also sometimes return multiple geometries for a single geomtry input.
+
 
 ## ST_PrecisionReduce
 
diff --git a/pom.xml b/pom.xml
index d150132a..b783351c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -61,7 +61,7 @@
         <geotools.version>24.0</geotools.version>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
         <dependency.scope>provided</dependency.scope>
-        <jts.version>1.18.0</jts.version>
+        <jts.version>1.18.2</jts.version>
         <jts2geojson.version>0.16.1</jts2geojson.version>
     </properties>
 
diff --git a/sql/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/Functions.scala b/sql/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/Functions.scala
index effe50bb..fbe446d4 100644
--- a/sql/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/Functions.scala
+++ b/sql/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/Functions.scala
@@ -36,6 +36,7 @@ import org.apache.spark.unsafe.types.UTF8String
 import org.geotools.geometry.jts.JTS
 import org.geotools.referencing.CRS
 import org.locationtech.jts.algorithm.MinimumBoundingCircle
+import org.locationtech.jts.geom.util.GeometryFixer
 import org.locationtech.jts.geom.{PrecisionModel, _}
 import org.locationtech.jts.io.{ByteOrderValues, WKBWriter, WKTWriter}
 import org.locationtech.jts.linearref.LengthIndexedLine
@@ -343,49 +344,38 @@ case class ST_Intersection(inputExpressions: Seq[Expression])
 }
 
 /**
-  * Given an invalid polygon or multipolygon and removeHoles boolean flag, create a valid representation of the geometry
+  * Given an invalid geometry, create a valid representation of the geometry.
+  * See: http://lin-ear-th-inking.blogspot.com/2021/05/fixing-invalid-geometry-with-jts.html
   *
   * @param inputExpressions
   */
 case class ST_MakeValid(inputExpressions: Seq[Expression])
-  extends Generator with CodegenFallback with UserDataGeneratator {
-  assert(inputExpressions.length == 2)
-
-  override def elementSchema: StructType = new StructType().add("Geometry", new GeometryUDT)
-
-  override def toString: String = s" **${ST_MakeValid.getClass.getName}** "
-
-  override def eval(input: InternalRow): TraversableOnce[InternalRow] = {
-    val geometry = GeometrySerializer.deserialize(inputExpressions(0).eval(input).asInstanceOf[ArrayData])
-    val removeHoles = inputExpressions(1).eval(input).asInstanceOf[Boolean]
-
-    // in order to do flatMap on java collections(util.List[Polygon])
-    import scala.jdk.CollectionConverters._
+  extends Expression with CodegenFallback {
+  assert(inputExpressions.length == 1 || inputExpressions.length == 2)
 
-    // makeValid works only on polygon or multipolygon
-    if (!geometry.getGeometryType.equalsIgnoreCase("POLYGON") && !geometry.getGeometryType.equalsIgnoreCase("MULTIPOLYGON")) {
-      throw new IllegalArgumentException("ST_MakeValid works only on Polygons and MultiPolygons")
+  override def eval(input: InternalRow): Any = {
+    val geometry = inputExpressions.head.toGeometry(input)
+    val keepCollapsed = if (inputExpressions.length == 2) {
+      inputExpressions(1).eval(input).asInstanceOf[Boolean]
+    } else {
+      false
     }
-
-    val validGeometry = geometry match {
-      case g: MultiPolygon =>
-        (0 until g.getNumGeometries).flatMap(i => {
-          val polygon = g.getGeometryN(i).asInstanceOf[Polygon]
-          JTS.makeValid(polygon, removeHoles).asScala.iterator
-        })
-      case g: Polygon =>
-        JTS.makeValid(g, removeHoles).asScala.iterator
-      case _ => Nil
+    (geometry) match {
+      case (geometry: Geometry) => nullSafeEval(geometry, keepCollapsed)
+      case _ => null
     }
+  }
 
-    val result = validGeometry.map(g => {
-      val serializedGeometry = GeometrySerializer.serialize(g.asInstanceOf[Geometry])
-      InternalRow(new GenericArrayData(serializedGeometry))
-    })
-
-    result
+  private def nullSafeEval(geometry: Geometry, keepCollapsed: Boolean) = {
+    val fixer = new GeometryFixer(geometry)
+    fixer.setKeepCollapsed(keepCollapsed)
+    fixer.getResult.toGenericArrayData
   }
 
+  override def nullable: Boolean = true
+
+  override def dataType: DataType = GeometryUDT
+
   override def children: Seq[Expression] = inputExpressions
 
   protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]) = {
diff --git a/sql/src/test/scala/org/apache/sedona/sql/functionTestScala.scala b/sql/src/test/scala/org/apache/sedona/sql/functionTestScala.scala
index 9612fd56..6f55c05a 100644
--- a/sql/src/test/scala/org/apache/sedona/sql/functionTestScala.scala
+++ b/sql/src/test/scala/org/apache/sedona/sql/functionTestScala.scala
@@ -22,6 +22,7 @@ package org.apache.sedona.sql
 import org.apache.commons.codec.binary.Hex
 import org.apache.sedona.sql.implicits._
 import org.apache.spark.sql.functions._
+import org.apache.spark.sql.sedona_sql.expressions.ST_MakeValid
 import org.apache.spark.sql.{DataFrame, Row}
 import org.geotools.geometry.jts.WKTReader2
 import org.locationtech.jts.algorithm.MinimumBoundingCircle
@@ -203,62 +204,42 @@ class functionTestScala extends TestBaseScala with Matchers with GeometrySample
     it("Passed ST_MakeValid On Invalid Polygon") {
 
       val df = sparkSession.sql("SELECT ST_GeomFromWKT('POLYGON((1 5, 1 1, 3 3, 5 3, 7 1, 7 5, 5 3, 3 3, 1 5))') AS polygon")
-      df.createOrReplaceTempView("table")
 
-      val result = sparkSession.sql(
-        """
-          |SELECT geometryValid.polygon
-          |FROM table
-          |LATERAL VIEW ST_MakeValid(polygon, false) geometryValid AS polygon
-          |""".stripMargin
-      ).collect()
-
-      val wktReader = new WKTReader2()
-      val firstValidGeometry = wktReader.read("POLYGON ((1 5, 3 3, 1 1, 1 5))")
-      val secondValidGeometry = wktReader.read("POLYGON ((5 3, 7 5, 7 1, 5 3))")
-
-      assert(result.exists(row => row.getAs[Geometry](0).equals(firstValidGeometry)))
-      assert(result.exists(row => row.getAs[Geometry](0).equals(secondValidGeometry)))
+      val result = df.withColumn("polygon", expr("ST_MakeValid(polygon)")).collect()
+
+      assert(result.length == 1)
+      assert(result.take(1)(0).get(0).asInstanceOf[Geometry].toText() == "MULTIPOLYGON (((1 5, 3 3, 1 1, 1 5)), ((5 3, 7 5, 7 1, 5 3)))")
     }
 
     it("Passed ST_MakeValid On Invalid MultiPolygon") {
 
       val df = sparkSession.sql("SELECT ST_GeomFromWKT('MULTIPOLYGON(((0 0, 3 0, 3 3, 0 3, 0 0)), ((3 0, 6 0, 6 3, 3 3, 3 0)))') AS multipolygon")
-      df.createOrReplaceTempView("table")
 
-      val result = sparkSession.sql(
-        """
-          |SELECT geometryValid.multipolygon
-          |FROM table
-          |LATERAL VIEW ST_MakeValid(multipolygon, false) geometryValid AS multipolygon
-          |""".stripMargin
-      ).collect()
-
-      val wktReader = new WKTReader2()
-      val firstValidGeometry = wktReader.read("POLYGON ((0 3, 3 3, 3 0, 0 0, 0 3))")
-      val secondValidGeometry = wktReader.read("POLYGON ((3 3, 6 3, 6 0, 3 0, 3 3))")
-
-      assert(result.exists(row => row.getAs[Geometry](0).equals(firstValidGeometry)))
-      assert(result.exists(row => row.getAs[Geometry](0).equals(secondValidGeometry)))
+      val result = df.withColumn("multipolygon", expr("ST_MakeValid(multipolygon)")).collect()
+
+      assert(result.length == 1)
+      assert(result.take(1)(0).get(0).asInstanceOf[Geometry].toText() == "POLYGON ((0 3, 3 3, 6 3, 6 0, 3 0, 0 0, 0 3))")
     }
 
     it("Passed ST_MakeValid On Valid Polygon") {
 
       val df = sparkSession.sql("SELECT ST_GeomFromWKT('POLYGON((1 1, 8 1, 8 8, 1 8, 1 1))') AS polygon")
-      df.createOrReplaceTempView("table")
 
-      val result = sparkSession.sql(
-        """
-          |SELECT geometryValid.polygon
-          |FROM table
-          |LATERAL VIEW ST_MakeValid(polygon, false) geometryValid AS polygon
-          |""".stripMargin
-      ).collect()
+      val result = df.withColumn("polygon", expr("ST_MakeValid(polygon)")).collect()
+
+      assert(result.length == 1)
+      assert(result.take(1)(0).get(0).asInstanceOf[Geometry].toText() == "POLYGON ((1 1, 1 8, 8 8, 8 1, 1 1))")
+    }
+
+    it("Passed ST_MakeValid on Invalid LineString") {
+
+      val df = sparkSession.sql("SELECT ST_GeomFromWKT('LINESTRING(1 1, 1 1)') AS geom")
 
-      val wktReader = new WKTReader2()
-      val validGeometry = wktReader.read("POLYGON((1 1, 8 1, 8 8, 1 8, 1 1))")
+      val result = df.selectExpr("ST_MakeValid(geom)", "ST_MakeValid(geom, true)").collect()
 
-      assert(result.exists(row => row.getAs[Geometry](0).equals(validGeometry)))
+      assert(result.length == 1)
+      assert(result.take(1)(0).get(0).asInstanceOf[Geometry].toText() == "LINESTRING EMPTY")
+      assert(result.take(1)(0).get(1).asInstanceOf[Geometry].toText() == "POINT (1 1)")
     }
 
     it("Passed ST_SimplifyPreserveTopology") {