You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/04/13 05:31:24 UTC

[spark] branch branch-3.0 updated: [SPARK-31398][SQL] Fix perf regression of loading dates before 1582 year by non-vectorized ORC reader

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 4121c43  [SPARK-31398][SQL] Fix perf regression of loading dates before 1582 year by non-vectorized ORC reader
4121c43 is described below

commit 4121c43c69c7dee1f9a2d0001a7d91ef590f8f91
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Mon Apr 13 05:29:54 2020 +0000

    [SPARK-31398][SQL] Fix perf regression of loading dates before 1582 year by non-vectorized ORC reader
    
    ### What changes were proposed in this pull request?
    In the regular ORC reader, when `spark.sql.orc.enableVectorizedReader` is set to `false`, I propose to use `DaysWritable` for reading DATE values from ORC files. Currently, days from ORC files are converted to java.sql.Date, and then to days in the Proleptic Gregorian calendar. So, the intermediate conversion to the Java type can be eliminated.
    
    ### Why are the changes needed?
    - The PR fixes regressions in loading dates before the year 1582 from ORC files when the vectorized ORC reader is off.
    - The changes improve performance of regular ORC reader for DATE columns.
      - x3.6 faster compared to the current master
      - x1.9-x4.3 faster against Spark 2.4.6
    
    Before (on JDK 8):
    ```
    Load dates from ORC:                      Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
    ------------------------------------------------------------------------------------------------------------------------
    after 1582, vec off                               39651          39686          31          2.5         396.5       1.0X
    after 1582, vec on                                 3647           3660          13         27.4          36.5      10.9X
    before 1582, vec off                              38155          38219          61          2.6         381.6       1.0X
    before 1582, vec on                                4041           4046           6         24.7          40.4       9.8X
    ```
    
    After (on JDK 8):
    ```
    Load dates from ORC:                      Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
    ------------------------------------------------------------------------------------------------------------------------
    after 1582, vec off                               10947          10971          28          9.1         109.5       1.0X
    after 1582, vec on                                 3677           3702          36         27.2          36.8       3.0X
    before 1582, vec off                              11456          11472          21          8.7         114.6       1.0X
    before 1582, vec on                                4079           4103          21         24.5          40.8       2.7X
    ```
    
    Spark 2.4.6:
    ```
    Load dates from ORC:                      Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
    ------------------------------------------------------------------------------------------------------------------------
    after 1582, vec off                               48169          48276          96          2.1         481.7       1.0X
    after 1582, vec on                                 5375           5410          41         18.6          53.7       9.0X
    before 1582, vec off                              22353          22482         198          4.5         223.5       2.2X
    before 1582, vec on                                5474           5475           1         18.3          54.7       8.8X
    ```
    
    ### Does this PR introduce any user-facing change?
    No
    
    ### How was this patch tested?
    - By existing test suites like `DateTimeUtilsSuite`
    - Checked for `hive-1.2` by:
    ```
    ./build/sbt -Phive-1.2 "test:testOnly *OrcHadoopFsRelationSuite"
    ```
    - Re-run `DateTimeRebaseBenchmark` in the environment:
    
    | Item | Description |
    | ---- | ----|
    | Region | us-west-2 (Oregon) |
    | Instance | r3.xlarge |
    | AMI | ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-20190722.1 (ami-06f2f779464715dc5) |
    | Java | OpenJDK 64-Bit Server VM 1.8.0_242 and OpenJDK 64-Bit Server VM 11.0.6+10 |
    
    Closes #28169 from MaxGekk/orc-optimize-dates.
    
    Authored-by: Max Gekk <ma...@gmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
    (cherry picked from commit cac8d1b3520c1196eb1fe72d9bcd9965604eed4a)
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../DateTimeRebaseBenchmark-jdk11-results.txt      | 88 +++++++++++-----------
 .../benchmarks/DateTimeRebaseBenchmark-results.txt | 88 +++++++++++-----------
 .../datasources/orc/OrcDeserializer.scala          |  3 +-
 .../execution/datasources/orc/OrcShimUtils.scala   |  6 +-
 .../execution/datasources/orc/OrcShimUtils.scala   |  6 +-
 5 files changed, 96 insertions(+), 95 deletions(-)

diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt
index bb7d8b0..edf9eef 100644
--- a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt
+++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt
@@ -6,49 +6,49 @@ OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Save dates to parquet:                    Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, noop                                  19037          19037           0          5.3         190.4       1.0X
-before 1582, noop                                 11543          11543           0          8.7         115.4       1.6X
-after 1582, rebase off                            31999          31999           0          3.1         320.0       0.6X
-after 1582, rebase on                             31657          31657           0          3.2         316.6       0.6X
-before 1582, rebase off                           23777          23777           0          4.2         237.8       0.8X
-before 1582, rebase on                            24767          24767           0          4.0         247.7       0.8X
+after 1582, noop                                  19946          19946           0          5.0         199.5       1.0X
+before 1582, noop                                 10910          10910           0          9.2         109.1       1.8X
+after 1582, rebase off                            32065          32065           0          3.1         320.7       0.6X
+after 1582, rebase on                             31424          31424           0          3.2         314.2       0.6X
+before 1582, rebase off                           21764          21764           0          4.6         217.6       0.9X
+before 1582, rebase on                            22700          22700           0          4.4         227.0       0.9X
 
 OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Load dates from parquet:                  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, vec off, rebase off                   12901          13119         211          7.8         129.0       1.0X
-after 1582, vec off, rebase on                    12968          13078         153          7.7         129.7       1.0X
-after 1582, vec on, rebase off                     3633           3673          37         27.5          36.3       3.6X
-after 1582, vec on, rebase on                      4988           5035          42         20.0          49.9       2.6X
-before 1582, vec off, rebase off                  12767          12856          77          7.8         127.7       1.0X
-before 1582, vec off, rebase on                   14012          14051          41          7.1         140.1       0.9X
-before 1582, vec on, rebase off                    3626           3641          15         27.6          36.3       3.6X
-before 1582, vec on, rebase on                     5388           5458         114         18.6          53.9       2.4X
+after 1582, vec off, rebase off                   13054          13273         193          7.7         130.5       1.0X
+after 1582, vec off, rebase on                    13337          13418         117          7.5         133.4       1.0X
+after 1582, vec on, rebase off                     3666           3740         116         27.3          36.7       3.6X
+after 1582, vec on, rebase on                      5097           5160          70         19.6          51.0       2.6X
+before 1582, vec off, rebase off                  12908          12984          76          7.7         129.1       1.0X
+before 1582, vec off, rebase on                   14514          14575          53          6.9         145.1       0.9X
+before 1582, vec on, rebase off                    3718           3790          63         26.9          37.2       3.5X
+before 1582, vec on, rebase on                     5472           5527          48         18.3          54.7       2.4X
 
 OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Save timestamps to parquet:               Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, noop                                   3075           3075           0         32.5          30.7       1.0X
-before 1582, noop                                  3058           3058           0         32.7          30.6       1.0X
-after 1582, rebase off                            17236          17236           0          5.8         172.4       0.2X
-after 1582, rebase on                             19237          19237           0          5.2         192.4       0.2X
-before 1582, rebase off                           17317          17317           0          5.8         173.2       0.2X
-before 1582, rebase on                            19982          19982           0          5.0         199.8       0.2X
+after 1582, noop                                   2718           2718           0         36.8          27.2       1.0X
+before 1582, noop                                  2774           2774           0         36.0          27.7       1.0X
+after 1582, rebase off                            16515          16515           0          6.1         165.2       0.2X
+after 1582, rebase on                             18619          18619           0          5.4         186.2       0.1X
+before 1582, rebase off                           16382          16382           0          6.1         163.8       0.2X
+before 1582, rebase on                            19580          19580           0          5.1         195.8       0.1X
 
 OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Load timestamps from parquet:             Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, vec off, rebase off                   14865          14957         142          6.7         148.6       1.0X
-after 1582, vec off, rebase on                    17154          17190          45          5.8         171.5       0.9X
-after 1582, vec on, rebase off                     4878           4912          56         20.5          48.8       3.0X
-after 1582, vec on, rebase on                      8604           8653          42         11.6          86.0       1.7X
-before 1582, vec off, rebase off                  14891          14929          33          6.7         148.9       1.0X
-before 1582, vec off, rebase on                   17708          17758          68          5.6         177.1       0.8X
-before 1582, vec on, rebase off                    4928           4946          17         20.3          49.3       3.0X
-before 1582, vec on, rebase on                     9147           9152           7         10.9          91.5       1.6X
+after 1582, vec off, rebase off                   15082          15112          43          6.6         150.8       1.0X
+after 1582, vec off, rebase on                    18035          18105         107          5.5         180.3       0.8X
+after 1582, vec on, rebase off                     4948           4970          19         20.2          49.5       3.0X
+after 1582, vec on, rebase on                      8761           8780          25         11.4          87.6       1.7X
+before 1582, vec off, rebase off                  15285          15389         103          6.5         152.8       1.0X
+before 1582, vec off, rebase on                   18277          18368          80          5.5         182.8       0.8X
+before 1582, vec on, rebase off                    4938           4954          15         20.3          49.4       3.1X
+before 1582, vec on, rebase on                     9220           9248          33         10.8          92.2       1.6X
 
 
 ================================================================================================
@@ -59,36 +59,36 @@ OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Save dates to ORC:                        Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, noop                                  18943          18943           0          5.3         189.4       1.0X
-before 1582, noop                                 11151          11151           0          9.0         111.5       1.7X
-after 1582                                        26738          26738           0          3.7         267.4       0.7X
-before 1582                                       19636          19636           0          5.1         196.4       1.0X
+after 1582, noop                                  19820          19820           0          5.0         198.2       1.0X
+before 1582, noop                                 10825          10825           0          9.2         108.2       1.8X
+after 1582                                        27705          27705           0          3.6         277.1       0.7X
+before 1582                                       19390          19390           0          5.2         193.9       1.0X
 
 OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Load dates from ORC:                      Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, vec off                               36176          36378         206          2.8         361.8       1.0X
-after 1582, vec on                                 3802           3829          29         26.3          38.0       9.5X
-before 1582, vec off                              35815          35882          74          2.8         358.1       1.0X
-before 1582, vec on                                4163           4174          10         24.0          41.6       8.7X
+after 1582, vec off                               10401          10426          33          9.6         104.0       1.0X
+after 1582, vec on                                 3788           3813          29         26.4          37.9       2.7X
+before 1582, vec off                              10832          10843          11          9.2         108.3       1.0X
+before 1582, vec on                                4182           4219          46         23.9          41.8       2.5X
 
 OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Save timestamps to ORC:                   Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, noop                                   3033           3033           0         33.0          30.3       1.0X
-before 1582, noop                                  3004           3004           0         33.3          30.0       1.0X
-after 1582                                        53907          53907           0          1.9         539.1       0.1X
-before 1582                                       57241          57241           0          1.7         572.4       0.1X
+after 1582, noop                                   2787           2787           0         35.9          27.9       1.0X
+before 1582, noop                                  2777           2777           0         36.0          27.8       1.0X
+after 1582                                        52456          52456           0          1.9         524.6       0.1X
+before 1582                                       54983          54983           0          1.8         549.8       0.1X
 
 OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Load timestamps from ORC:                 Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, vec off                               38266          38306          66          2.6         382.7       1.0X
-after 1582, vec on                                30233          30339         162          3.3         302.3       1.3X
-before 1582, vec off                              43584          43753         259          2.3         435.8       0.9X
-before 1582, vec on                               35681          35832         140          2.8         356.8       1.1X
+after 1582, vec off                               38336          38425          88          2.6         383.4       1.0X
+after 1582, vec on                                30089          30198          96          3.3         300.9       1.3X
+before 1582, vec off                              42739          42801          87          2.3         427.4       0.9X
+before 1582, vec on                               31530          31591          61          3.2         315.3       1.2X
 
 
diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt
index a79f6fb..602f3e9 100644
--- a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt
+++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt
@@ -6,49 +6,49 @@ OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Save dates to parquet:                    Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, noop                                  24030          24030           0          4.2         240.3       1.0X
-before 1582, noop                                 10900          10900           0          9.2         109.0       2.2X
-after 1582, rebase off                            35666          35666           0          2.8         356.7       0.7X
-after 1582, rebase on                             35872          35872           0          2.8         358.7       0.7X
-before 1582, rebase off                           22131          22131           0          4.5         221.3       1.1X
-before 1582, rebase on                            22967          22967           0          4.4         229.7       1.0X
+after 1582, noop                                  23453          23453           0          4.3         234.5       1.0X
+before 1582, noop                                 10821          10821           0          9.2         108.2       2.2X
+after 1582, rebase off                            35558          35558           0          2.8         355.6       0.7X
+after 1582, rebase on                             35892          35892           0          2.8         358.9       0.7X
+before 1582, rebase off                           22700          22700           0          4.4         227.0       1.0X
+before 1582, rebase on                            23247          23247           0          4.3         232.5       1.0X
 
 OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Load dates from parquet:                  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, vec off, rebase off                   12523          12589          58          8.0         125.2       1.0X
-after 1582, vec off, rebase on                    13387          13466          86          7.5         133.9       0.9X
-after 1582, vec on, rebase off                     3646           3678          41         27.4          36.5       3.4X
-after 1582, vec on, rebase on                      5104           5198         123         19.6          51.0       2.5X
-before 1582, vec off, rebase off                  13041          13101          98          7.7         130.4       1.0X
-before 1582, vec off, rebase on                   14077          14104          23          7.1         140.8       0.9X
-before 1582, vec on, rebase off                    3667           3726          87         27.3          36.7       3.4X
-before 1582, vec on, rebase on                     5504           5521          21         18.2          55.0       2.3X
+after 1582, vec off, rebase off                   12641          12690          43          7.9         126.4       1.0X
+after 1582, vec off, rebase on                    13318          13380          67          7.5         133.2       0.9X
+after 1582, vec on, rebase off                     3648           3659          11         27.4          36.5       3.5X
+after 1582, vec on, rebase on                      5160           5212          69         19.4          51.6       2.4X
+before 1582, vec off, rebase off                  13024          13065          36          7.7         130.2       1.0X
+before 1582, vec off, rebase on                   13810          13932         106          7.2         138.1       0.9X
+before 1582, vec on, rebase off                    3631           3695          57         27.5          36.3       3.5X
+before 1582, vec on, rebase on                     5791           5860          71         17.3          57.9       2.2X
 
 OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Save timestamps to parquet:               Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, noop                                   2782           2782           0         36.0          27.8       1.0X
-before 1582, noop                                  2778           2778           0         36.0          27.8       1.0X
-after 1582, rebase off                            16980          16980           0          5.9         169.8       0.2X
-after 1582, rebase on                             20023          20023           0          5.0         200.2       0.1X
-before 1582, rebase off                           17618          17618           0          5.7         176.2       0.2X
-before 1582, rebase on                            20416          20416           0          4.9         204.2       0.1X
+after 1582, noop                                   2750           2750           0         36.4          27.5       1.0X
+before 1582, noop                                  2833           2833           0         35.3          28.3       1.0X
+after 1582, rebase off                            16832          16832           0          5.9         168.3       0.2X
+after 1582, rebase on                             19688          19688           0          5.1         196.9       0.1X
+before 1582, rebase off                           17548          17548           0          5.7         175.5       0.2X
+before 1582, rebase on                            21343          21343           0          4.7         213.4       0.1X
 
 OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Load timestamps from parquet:             Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, vec off, rebase off                   15098          15146          55          6.6         151.0       1.0X
-after 1582, vec off, rebase on                    18127          18214          86          5.5         181.3       0.8X
-after 1582, vec on, rebase off                     4936           4984          42         20.3          49.4       3.1X
-after 1582, vec on, rebase on                      9685           9745          58         10.3          96.8       1.6X
-before 1582, vec off, rebase off                  15233          15259          25          6.6         152.3       1.0X
-before 1582, vec off, rebase on                   18710          18727          24          5.3         187.1       0.8X
-before 1582, vec on, rebase off                    4954           4980          24         20.2          49.5       3.0X
-before 1582, vec on, rebase on                    10224          10240          22          9.8         102.2       1.5X
+after 1582, vec off, rebase off                   15243          15329          82          6.6         152.4       1.0X
+after 1582, vec off, rebase on                    18296          18330          54          5.5         183.0       0.8X
+after 1582, vec on, rebase off                     4925           4927           2         20.3          49.2       3.1X
+after 1582, vec on, rebase on                      9647           9686          35         10.4          96.5       1.6X
+before 1582, vec off, rebase off                  14880          15105         267          6.7         148.8       1.0X
+before 1582, vec off, rebase on                   18474          18514          51          5.4         184.7       0.8X
+before 1582, vec on, rebase off                    4970           4978          10         20.1          49.7       3.1X
+before 1582, vec on, rebase on                     9938          10012          64         10.1          99.4       1.5X
 
 
 ================================================================================================
@@ -59,36 +59,36 @@ OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Save dates to ORC:                        Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, noop                                  23770          23770           0          4.2         237.7       1.0X
-before 1582, noop                                 10939          10939           0          9.1         109.4       2.2X
-after 1582                                        32879          32879           0          3.0         328.8       0.7X
-before 1582                                       20267          20267           0          4.9         202.7       1.2X
+after 1582, noop                                  23500          23500           0          4.3         235.0       1.0X
+before 1582, noop                                 10788          10788           0          9.3         107.9       2.2X
+after 1582                                        32237          32237           0          3.1         322.4       0.7X
+before 1582                                       20187          20187           0          5.0         201.9       1.2X
 
 OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Load dates from ORC:                      Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, vec off                               39651          39686          31          2.5         396.5       1.0X
-after 1582, vec on                                 3647           3660          13         27.4          36.5      10.9X
-before 1582, vec off                              38155          38219          61          2.6         381.6       1.0X
-before 1582, vec on                                4041           4046           6         24.7          40.4       9.8X
+after 1582, vec off                               10947          10971          28          9.1         109.5       1.0X
+after 1582, vec on                                 3677           3702          36         27.2          36.8       3.0X
+before 1582, vec off                              11456          11472          21          8.7         114.6       1.0X
+before 1582, vec on                                4079           4103          21         24.5          40.8       2.7X
 
 OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Save timestamps to ORC:                   Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, noop                                   2802           2802           0         35.7          28.0       1.0X
-before 1582, noop                                  2797           2797           0         35.8          28.0       1.0X
-after 1582                                        59877          59877           0          1.7         598.8       0.0X
-before 1582                                       61361          61361           0          1.6         613.6       0.0X
+after 1582, noop                                   2891           2891           0         34.6          28.9       1.0X
+before 1582, noop                                  2906           2906           0         34.4          29.1       1.0X
+after 1582                                        55812          55812           0          1.8         558.1       0.1X
+before 1582                                       57512          57512           0          1.7         575.1       0.1X
 
 OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Load timestamps from ORC:                 Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-after 1582, vec off                               48197          48288         118          2.1         482.0       1.0X
-after 1582, vec on                                38247          38351         128          2.6         382.5       1.3X
-before 1582, vec off                              53179          53359         249          1.9         531.8       0.9X
-before 1582, vec on                               44076          44268         269          2.3         440.8       1.1X
+after 1582, vec off                               46376          46410          33          2.2         463.8       1.0X
+after 1582, vec on                                35003          35189         163          2.9         350.0       1.3X
+before 1582, vec off                              52942          52979          34          1.9         529.4       0.9X
+before 1582, vec on                               42596          42747         158          2.3         426.0       1.1X
 
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
index 6d52d40..4ab009c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
@@ -23,6 +23,7 @@ import org.apache.orc.mapred.{OrcList, OrcMap, OrcStruct, OrcTimestamp}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeArrayData}
 import org.apache.spark.sql.catalyst.util._
+import org.apache.spark.sql.catalyst.util.RebaseDateTime.rebaseJulianToGregorianDays
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
@@ -108,7 +109,7 @@ class OrcDeserializer(
         updater.set(ordinal, bytes)
 
       case DateType => (ordinal, value) =>
-        updater.setInt(ordinal, DateTimeUtils.fromJavaDate(OrcShimUtils.getSqlDate(value)))
+        updater.setInt(ordinal, OrcShimUtils.getGregorianDays(value))
 
       case TimestampType => (ordinal, value) =>
         updater.setLong(ordinal, DateTimeUtils.fromJavaTimestamp(value.asInstanceOf[OrcTimestamp]))
diff --git a/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala b/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala
index ece5280..7fbc1cd 100644
--- a/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala
+++ b/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql.execution.datasources.orc
 
-import java.sql.Date
-
 import org.apache.orc.storage.common.`type`.HiveDecimal
 import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch
 import org.apache.orc.storage.ql.io.sarg.{SearchArgument => OrcSearchArgument}
@@ -38,7 +36,9 @@ private[sql] object OrcShimUtils {
   private[sql] type Operator = OrcOperator
   private[sql] type SearchArgument = OrcSearchArgument
 
-  def getSqlDate(value: Any): Date = value.asInstanceOf[DateWritable].get
+  def getGregorianDays(value: Any): Int = {
+    new DaysWritable(value.asInstanceOf[DateWritable]).gregorianDays
+  }
 
   def getDecimal(value: Any): Decimal = {
     val decimal = value.asInstanceOf[HiveDecimalWritable].getHiveDecimal()
diff --git a/sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala b/sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala
index 5666d31..60c5b7a 100644
--- a/sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala
+++ b/sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql.execution.datasources.orc
 
-import java.sql.Date
-
 import org.apache.hadoop.hive.common.`type`.HiveDecimal
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch
 import org.apache.hadoop.hive.ql.io.sarg.{SearchArgument => OrcSearchArgument}
@@ -39,7 +37,9 @@ private[sql] object OrcShimUtils {
   private[sql] type Operator = OrcOperator
   private[sql] type SearchArgument = OrcSearchArgument
 
-  def getSqlDate(value: Any): Date = value.asInstanceOf[DateWritable].get
+  def getGregorianDays(value: Any): Int = {
+    new DaysWritable(value.asInstanceOf[DateWritable]).gregorianDays
+  }
 
   def getDecimal(value: Any): Decimal = {
     val decimal = value.asInstanceOf[HiveDecimalWritable].getHiveDecimal()


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org