Posted to commits@spark.apache.org by do...@apache.org on 2024/03/27 14:40:18 UTC

(spark) branch master updated: [SPARK-47616][SQL] Add User Document for Mapping Spark SQL Data Types from MySQL

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 8d1539f7bb23 [SPARK-47616][SQL] Add User Document for Mapping Spark SQL Data Types from MySQL
8d1539f7bb23 is described below

commit 8d1539f7bb2319b746433067ab876754384246c2
Author: Kent Yao <ya...@apache.org>
AuthorDate: Wed Mar 27 07:40:05 2024 -0700

    [SPARK-47616][SQL] Add User Document for Mapping Spark SQL Data Types from MySQL
    
    ### What changes were proposed in this pull request?
    
    This PR adds a user document for mapping Spark SQL data types from MySQL. The write-side document is not included yet, as it may need further verification.
    
    ### Why are the changes needed?
    
    The conversion of data types from MySQL to Spark SQL is now solid, and end users can refer to this document.

    It also gives maintainers an overall perspective for reviewing the related changes in the upcoming 4.0.0 release.
    
    ### Does this PR introduce _any_ user-facing change?
    no
    
    ### How was this patch tested?
    
    Added tests for MySQL data types that were missing coverage.
    
    ![image](https://github.com/apache/spark/assets/8326978/da8eb241-1d05-4a01-b038-d7d329193674)
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #45736 from yaooqinn/SPARK-47616.
    
    Authored-by: Kent Yao <ya...@apache.org>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 .../spark/sql/jdbc/MySQLIntegrationSuite.scala     |  16 ++
 docs/sql-data-sources-jdbc.md                      | 246 +++++++++++++++++++++
 2 files changed, 262 insertions(+)

diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegrationSuite.scala
index 10049169caa1..1343f9af7e35 100644
--- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegrationSuite.scala
+++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegrationSuite.scala
@@ -90,6 +90,10 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationSuite {
     conn.prepareStatement("CREATE TABLE collections (" +
         "a SET('cap', 'hat', 'helmet'), b ENUM('S', 'M', 'L', 'XL'))").executeUpdate()
     conn.prepareStatement("INSERT INTO collections VALUES ('cap,hat', 'M')").executeUpdate()
+
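+    // SPARK-47616: table with a GEOMETRY column, used to verify the BinaryType mapping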
+    conn.prepareStatement("CREATE TABLE TBL_GEOMETRY (col0 GEOMETRY)").executeUpdate()
+    conn.prepareStatement("INSERT INTO TBL_GEOMETRY VALUES (ST_GeomFromText('POINT(0 0)'))")
+      .executeUpdate()
   }
 
   def testConnection(): Unit = {
@@ -191,6 +195,12 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationSuite {
       assert(rows(0).getAs[Timestamp](3).equals(Timestamp.valueOf("2009-02-13 23:31:30")))
       assert(rows(0).getAs[Date](4).equals(Date.valueOf("2001-01-01")))
     }
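+    // SPARK-47616: with yearIsDateType=false, the MySQL YEAR column is read as IntegerType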
+    val df = spark.read.format("jdbc")
+      .option("url", jdbcUrl)
+      .option("query", "select yr from dates")
+      .option("yearIsDateType", false)
+      .load()
+    checkAnswer(df, Row(2001))
   }
 
   test("SPARK-47406: MySQL datetime types with preferTimestampNTZ") {
@@ -318,6 +328,12 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationSuite {
         Row("cap,hat", "M") :: Row("cap,hat", "M") :: Nil)
     }
   }
+
+  test("SPARK-47616: Read GEOMETRY from MySQL") {
+    val df = spark.read.jdbc(jdbcUrl, "TBL_GEOMETRY", new Properties)
+    checkAnswer(df,
+      Row(Array[Byte](0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)))
+  }
 }
 
 
diff --git a/docs/sql-data-sources-jdbc.md b/docs/sql-data-sources-jdbc.md
index bc0573a37219..17a61078cccf 100644
--- a/docs/sql-data-sources-jdbc.md
+++ b/docs/sql-data-sources-jdbc.md
@@ -427,3 +427,249 @@ SELECT * FROM resultTable
 
 </div>
 </div>
+
+## Data Type Mapping
+
+### Mapping Spark SQL Data Types from MySQL
+
+The table below describes the data type conversions from MySQL data types to Spark SQL data types
+when reading data from a MySQL table using the built-in JDBC data source with MySQL Connector/J
+as the activated JDBC driver. Note that different JDBC drivers, such as MariaDB Connector/J,
+which can also connect to MySQL, may apply different mapping rules.
+
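+As a quick illustration, a read could look like the following (a minimal sketch; the JDBC URL,
+credentials, and table name below are placeholders):
+
+```scala
+// Read a MySQL table through the built-in JDBC data source with MySQL Connector/J.
+// The schema of the resulting DataFrame follows the mapping table below.
+val df = spark.read
+  .format("jdbc")
+  .option("url", "jdbc:mysql://localhost:3306/mydb") // placeholder URL
+  .option("dbtable", "my_table")                     // placeholder table name
+  .option("user", "username")
+  .option("password", "password")
+  .load()
+df.printSchema()
+```
+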
+<table>
+  <thead>
+    <tr>
+      <th><b>MySQL Data Type</b></th>
+      <th><b>Spark SQL Data Type</b></th>
+      <th><b>Remarks</b></th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>BIT(1)</td>
+      <td>BooleanType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>BIT(&gt;1)</td>
+      <td>LongType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>TINYINT(1)</td>
+      <td>BooleanType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>TINYINT(1)</td>
+      <td>ByteType</td>
+      <td>tinyInt1isBit=false</td>
+    </tr>
+    <tr>
+      <td>BOOLEAN</td>
+      <td>BooleanType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>BOOLEAN</td>
+      <td>ByteType</td>
+      <td>tinyInt1isBit=false</td>
+    </tr>
+    <tr>
+      <td>TINYINT(&gt;1)</td>
+      <td>ByteType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>TINYINT(any) UNSIGNED</td>
+      <td>ShortType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>SMALLINT</td>
+      <td>ShortType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>SMALLINT UNSIGNED</td>
+      <td>IntegerType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>MEDIUMINT [UNSIGNED]</td>
+      <td>IntegerType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>INT</td>
+      <td>IntegerType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>INT UNSIGNED</td>
+      <td>LongType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>BIGINT</td>
+      <td>LongType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>BIGINT UNSIGNED</td>
+      <td>DecimalType(20,0)</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>FLOAT</td>
+      <td>FloatType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>FLOAT UNSIGNED</td>
+      <td>DoubleType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>DOUBLE [UNSIGNED]</td>
+      <td>DoubleType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>DECIMAL(p,s) [UNSIGNED]</td>
+      <td>DecimalType(min(38, p), min(18, s))</td>
+      <td>The column type is bounded by DecimalType(38, 18); if any value of this column has an actual precision greater than 38, the read fails with DECIMAL_PRECISION_EXCEEDS_MAX_PRECISION</td>
+    </tr>
+    <tr>
+      <td>DATE</td>
+      <td>DateType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>DATETIME</td>
+      <td>TimestampType</td>
+      <td>preferTimestampNTZ=false (default) or spark.sql.timestampType=TIMESTAMP_LTZ</td>
+    </tr>
+    <tr>
+      <td>DATETIME</td>
+      <td>TimestampNTZType</td>
+      <td>preferTimestampNTZ=true or spark.sql.timestampType=TIMESTAMP_NTZ</td>
+    </tr>
+    <tr>
+      <td>TIMESTAMP</td>
+      <td>TimestampType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>TIME</td>
+      <td>TimestampType</td>
+      <td>preferTimestampNTZ=false (default) or spark.sql.timestampType=TIMESTAMP_LTZ</td>
+    </tr>
+    <tr>
+      <td>TIME</td>
+      <td>TimestampNTZType</td>
+      <td>preferTimestampNTZ=true or spark.sql.timestampType=TIMESTAMP_NTZ</td>
+    </tr>
+    <tr>
+      <td>YEAR</td>
+      <td>DateType</td>
+      <td>yearIsDateType=true</td>
+    </tr>
+    <tr>
+      <td>YEAR</td>
+      <td>IntegerType</td>
+      <td>yearIsDateType=false</td>
+    </tr>
+    <tr>
+      <td>CHAR(n)</td>
+      <td>CharType(n)</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>VARCHAR(n)</td>
+      <td>VarcharType(n)</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>BINARY(n)</td>
+      <td>BinaryType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>VARBINARY(n)</td>
+      <td>BinaryType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>CHAR(n) BINARY</td>
+      <td>BinaryType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>VARCHAR(n) BINARY</td>
+      <td>BinaryType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>BLOB</td>
+      <td>BinaryType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>TINYBLOB</td>
+      <td>BinaryType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>MEDIUMBLOB</td>
+      <td>BinaryType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>LONGBLOB</td>
+      <td>BinaryType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>TEXT</td>
+      <td>StringType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>TINYTEXT</td>
+      <td>StringType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>MEDIUMTEXT</td>
+      <td>StringType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>LONGTEXT</td>
+      <td>StringType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>JSON</td>
+      <td>StringType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>GEOMETRY</td>
+      <td>BinaryType</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>ENUM</td>
+      <td>CharType(n)</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>SET</td>
+      <td>CharType(n)</td>
+      <td></td>
+    </tr>
+  </tbody>
+</table>
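+
+Several of the mappings above depend on MySQL Connector/J connection properties or Spark SQL
+configurations, as noted in the Remarks column. As a minimal sketch (the URL and table name
+are placeholders), these options can be set on the read path like this:
+
+```scala
+// tinyInt1isBit=false     : TINYINT(1) and BOOLEAN columns map to ByteType instead of BooleanType
+// yearIsDateType=false    : YEAR columns map to IntegerType instead of DateType
+// preferTimestampNTZ=true : DATETIME and TIME columns map to TimestampNTZType
+val df = spark.read
+  .format("jdbc")
+  .option("url", "jdbc:mysql://localhost:3306/mydb") // placeholder URL
+  .option("dbtable", "my_table")                     // placeholder table name
+  .option("tinyInt1isBit", false)
+  .option("yearIsDateType", false)
+  .option("preferTimestampNTZ", true)
+  .load()
+```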


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org