You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/01/09 07:00:36 UTC

[2/3] impala git commit: IMPALA-7924: Generate Thrift 11 Python Code

IMPALA-7924: Generate Thrift 11 Python Code

Upgrades the version of the toolchain in order to pull in Thrift 0.11.0.
Updates the CMake build to write generated Python code using Thrift 0.11
to shell/build/thrift-11-gen/gen-py/.

The Thrift 0.11 Python deserialization code has some big performance
improvements that allow faster parsing of runtime profiles. By adding
the ability to generate the Thrift Python code using Thrift 0.11 we can
take advantage of the Python performance improvements without going
through a full Thrift upgrade from 0.9 to 0.11.

Set USE_THRIFT11_GEN_PY=true and then run bin/set-pythonpath.sh to add
the Thrift 0.11 Python generated code to the PYTHONPATH rather than the
0.9 generated code.

Testing:
- Ran core tests

Change-Id: I3432c3e29d28ec3ef6a0a22156a18910f511fed0
Reviewed-on: http://gerrit.cloudera.org:8080/12036
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/fa78c594
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/fa78c594
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/fa78c594

Branch: refs/heads/master
Commit: fa78c594de39878902f3887a209f29f7976583d0
Parents: ae65ff8
Author: Sahil Takiar <ta...@gmail.com>
Authored: Mon Dec 3 16:09:04 2018 -0500
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Wed Jan 9 05:54:59 2019 +0000

----------------------------------------------------------------------
 CMakeLists.txt               |   7 +++
 bin/bootstrap_toolchain.py   |   1 +
 bin/impala-config.sh         |   4 +-
 bin/set-pythonpath.sh        |  10 +++-
 common/thrift/CMakeLists.txt | 112 ++++++++++++++++++++++++++------------
 5 files changed, 96 insertions(+), 38 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/fa78c594/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3fcc855..285e617 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -112,6 +112,7 @@ set_dep_root(RE2)
 set_dep_root(RAPIDJSON)
 set_dep_root(SNAPPY)
 set_dep_root(THRIFT)
+set(THRIFT11_ROOT $ENV{IMPALA_TOOLCHAIN}/thrift-$ENV{IMPALA_THRIFT11_VERSION})
 set_dep_root(ZLIB)
 set_dep_root(CCTZ)
 
@@ -263,6 +264,12 @@ message(STATUS "Thrift version: ${THRIFT_VERSION}")
 message(STATUS "Thrift contrib dir: ${THRIFT_CONTRIB_DIR}")
 message(STATUS "Thrift compiler: ${THRIFT_COMPILER}")
 
+if (NOT EXISTS ${THRIFT11_ROOT}/bin/thrift)
+  message(FATAL_ERROR "Unable to find Thrift 11 compiler")
+endif(NOT EXISTS ${THRIFT11_ROOT}/bin/thrift)
+
+set(THRIFT11_COMPILER ${THRIFT11_ROOT}/bin/thrift)
+
 # find flatbuffers headers, lib and compiler
 find_package(FlatBuffers REQUIRED)
 IMPALA_ADD_THIRDPARTY_LIB(flatbuffers ${FLATBUFFERS_INCLUDE_DIR}

http://git-wip-us.apache.org/repos/asf/impala/blob/fa78c594/bin/bootstrap_toolchain.py
----------------------------------------------------------------------
diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index d7bf5b4..286aaaa 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -466,6 +466,7 @@ if __name__ == "__main__":
       "libunwind", "lz4", "openldap", "openssl", "orc", "protobuf",
       "rapidjson", "re2", "snappy", "thrift", "tpc-h", "tpc-ds", "zlib"])
   packages.insert(0, Package("llvm", "5.0.1-asserts-p1"))
+  packages.insert(0, Package("thrift", os.environ.get("IMPALA_THRIFT11_VERSION")))
   bootstrap(toolchain_root, packages)
 
   # Download the CDH components if necessary.

http://git-wip-us.apache.org/repos/asf/impala/blob/fa78c594/bin/impala-config.sh
----------------------------------------------------------------------
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index 9e27f54..230554f 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -68,7 +68,7 @@ fi
 # moving to a different build of the toolchain, e.g. when a version is bumped or a
 # compile option is changed. The build id can be found in the output of the toolchain
 # build jobs, it is constructed from the build number and toolchain git hash prefix.
-export IMPALA_TOOLCHAIN_BUILD_ID=235-57748561c0
+export IMPALA_TOOLCHAIN_BUILD_ID=241-259ecff082
 # Versions of toolchain dependencies.
 # -----------------------------------
 export IMPALA_AVRO_VERSION=1.7.4-p4
@@ -143,6 +143,8 @@ export IMPALA_TPC_H_VERSION=2.17.0
 unset IMPALA_TPC_H_URL
 export IMPALA_THRIFT_VERSION=0.9.3-p4
 unset IMPALA_THRIFT_URL
+export IMPALA_THRIFT11_VERSION=0.11.0-p2
+unset IMPALA_THRIFT11_URL
 export IMPALA_ZLIB_VERSION=1.2.8
 unset IMPALA_ZLIB_URL
 

http://git-wip-us.apache.org/repos/asf/impala/blob/fa78c594/bin/set-pythonpath.sh
----------------------------------------------------------------------
diff --git a/bin/set-pythonpath.sh b/bin/set-pythonpath.sh
index 10d0c67..86a5fe7 100755
--- a/bin/set-pythonpath.sh
+++ b/bin/set-pythonpath.sh
@@ -16,7 +16,15 @@
 # under the License.
 
 # set the python path for test modules and beeswax
-PYTHONPATH=${IMPALA_HOME}:${IMPALA_HOME}/shell/gen-py:${IMPALA_HOME}/testdata/
+# setting USE_THRIFT11_GEN_PY will add Thrift 11 Python generated code rather than the
+# default Thrift Python code
+PYTHONPATH=${IMPALA_HOME}
+if [ -n "${USE_THRIFT11_GEN_PY:-}" ]; then
+  PYTHONPATH=${PYTHONPATH}:${IMPALA_HOME}/shell/build/thrift-11-gen/gen-py
+else
+  PYTHONPATH=${PYTHONPATH}:${IMPALA_HOME}/shell/gen-py
+fi
+PYTHONPATH=${PYTHONPATH}:${IMPALA_HOME}/testdata/
 
 # There should be just a single version of python that created the
 # site-packages directory. We find it by performing shell independent expansion

http://git-wip-us.apache.org/repos/asf/impala/blob/fa78c594/common/thrift/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/common/thrift/CMakeLists.txt b/common/thrift/CMakeLists.txt
index 87dcede..4d8646a 100644
--- a/common/thrift/CMakeLists.txt
+++ b/common/thrift/CMakeLists.txt
@@ -39,19 +39,19 @@ function(THRIFT_GEN VAR)
   ENDIF(NOT ARGN)
 
   set(${VAR})
-  foreach(FIL ${ARGN})
+  foreach(THRIFT_FILE ${ARGN})
     # Get full path
-    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-    # Get basename
-    get_filename_component(FIL_WE ${FIL} NAME_WE)
+    get_filename_component(ABS_THRIFT_FILE ${THRIFT_FILE} ABSOLUTE)
+    # Get basename without the file extension
+    get_filename_component(THRIFT_FILE_WE ${THRIFT_FILE} NAME_WE)
 
     # All the output files we can determine based on filename.
     #   - Does not include .skeleton.cpp files
     #   - Does not include java output files
-    set(OUTPUT_BE_FILE "${BE_OUTPUT_DIR}/gen-cpp/${FIL_WE}_types.cpp")
-    set(OUTPUT_BE_FILE ${OUTPUT_BE_FILE} "${BE_OUTPUT_DIR}/gen-cpp/${FIL_WE}_types.h")
-    set(OUTPUT_BE_FILE ${OUTPUT_BE_FILE} "${BE_OUTPUT_DIR}/gen-cpp/${FIL_WE}_constants.cpp")
-    set(OUTPUT_BE_FILE ${OUTPUT_BE_FILE} "${BE_OUTPUT_DIR}/gen-cpp/${FIL_WE}_constants.h")
+    set(OUTPUT_BE_FILE "${BE_OUTPUT_DIR}/gen-cpp/${THRIFT_FILE_WE}_types.cpp")
+    set(OUTPUT_BE_FILE ${OUTPUT_BE_FILE} "${BE_OUTPUT_DIR}/gen-cpp/${THRIFT_FILE_WE}_types.h")
+    set(OUTPUT_BE_FILE ${OUTPUT_BE_FILE} "${BE_OUTPUT_DIR}/gen-cpp/${THRIFT_FILE_WE}_constants.cpp")
+    set(OUTPUT_BE_FILE ${OUTPUT_BE_FILE} "${BE_OUTPUT_DIR}/gen-cpp/${THRIFT_FILE_WE}_constants.h")
     list(APPEND ${VAR} ${OUTPUT_BE_FILE})
 
     # BeeswaxService thrift generation
@@ -60,38 +60,38 @@ function(THRIFT_GEN VAR)
     # We need to generate C++ src file for the parent dependencies using the "-r" option.
     set(CPP_ARGS ${THRIFT_INCLUDE_DIR_OPTION} --gen cpp:moveable_types -o
         ${BE_OUTPUT_DIR})
-    IF (FIL STREQUAL "beeswax.thrift")
+    IF (THRIFT_FILE STREQUAL "beeswax.thrift")
       set(CPP_ARGS -r ${CPP_ARGS})
-    ENDIF(FIL STREQUAL "beeswax.thrift")
+    ENDIF(THRIFT_FILE STREQUAL "beeswax.thrift")
 
-    IF (FIL STREQUAL ${TCLI_SERVICE_THRIFT} OR FIL STREQUAL "parquet.thrift" OR
-        FIL STREQUAL "ImpalaService.thrift")
+    IF (THRIFT_FILE STREQUAL ${TCLI_SERVICE_THRIFT} OR THRIFT_FILE STREQUAL "parquet.thrift" OR
+        THRIFT_FILE STREQUAL "ImpalaService.thrift")
       # Do not generate Java HiveServer2 and Parquet files because we should use the jar
       # from Hive or Parquet.
       # Also do not generate ImpalaService.thrift because the generated code doesn't
       # compile with hive if the thrift version in hive is 0.9.0
       add_custom_command(
         OUTPUT ${OUTPUT_BE_FILE}
-        COMMAND ${THRIFT_COMPILER} ${CPP_ARGS} ${FIL}
-        COMMAND ${THRIFT_COMPILER} ${PYTHON_ARGS} ${FIL}
-        DEPENDS ${ABS_FIL}
-        COMMENT "Running thrift compiler on ${FIL}"
+        COMMAND ${THRIFT_COMPILER} ${CPP_ARGS} ${THRIFT_FILE}
+        COMMAND ${THRIFT_COMPILER} ${PYTHON_ARGS} ${THRIFT_FILE}
+        DEPENDS ${ABS_THRIFT_FILE}
+        COMMENT "Running thrift compiler on ${THRIFT_FILE}"
         VERBATIM
       )
-    ELSE (FIL STREQUAL ${TCLI_SERVICE_THRIFT} OR FIL STREQUAL "parquet.thrift" OR
-        FIL STREQUAL "ImpalaService.thrift")
+    ELSE (THRIFT_FILE STREQUAL ${TCLI_SERVICE_THRIFT} OR THRIFT_FILE STREQUAL "parquet.thrift" OR
+        THRIFT_FILE STREQUAL "ImpalaService.thrift")
       add_custom_command(
         OUTPUT ${OUTPUT_BE_FILE}
-        COMMAND ${THRIFT_COMPILER} ${CPP_ARGS} ${FIL}
-        COMMAND ${THRIFT_COMPILER} ${JAVA_FE_ARGS} ${FIL}
-        COMMAND ${THRIFT_COMPILER} ${PYTHON_ARGS} ${FIL}
-        DEPENDS ${ABS_FIL}
-        COMMENT "Running thrift compiler on ${FIL}"
+        COMMAND ${THRIFT_COMPILER} ${CPP_ARGS} ${THRIFT_FILE}
+        COMMAND ${THRIFT_COMPILER} ${JAVA_FE_ARGS} ${THRIFT_FILE}
+        COMMAND ${THRIFT_COMPILER} ${PYTHON_ARGS} ${THRIFT_FILE}
+        DEPENDS ${ABS_THRIFT_FILE}
+        COMMENT "Running thrift compiler on ${THRIFT_FILE}"
         VERBATIM
     )
-    ENDIF (FIL STREQUAL ${TCLI_SERVICE_THRIFT} OR FIL STREQUAL "parquet.thrift" OR
-        FIL STREQUAL "ImpalaService.thrift")
-  endforeach(FIL)
+    ENDIF (THRIFT_FILE STREQUAL ${TCLI_SERVICE_THRIFT} OR THRIFT_FILE STREQUAL "parquet.thrift" OR
+        THRIFT_FILE STREQUAL "ImpalaService.thrift")
+  endforeach(THRIFT_FILE)
 
   set(${VAR} ${${VAR}} PARENT_SCOPE)
 endfunction(THRIFT_GEN)
@@ -103,28 +103,61 @@ function(THRIFT_GEN_DS VAR)
   ENDIF(NOT ARGN)
 
   set(${VAR})
-  foreach(FIL ${ARGN})
-    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-    get_filename_component(FIL_WE ${FIL} NAME_WE)
+  foreach(THRIFT_FILE ${ARGN})
+    get_filename_component(ABS_THRIFT_FILE ${THRIFT_FILE} ABSOLUTE)
+    get_filename_component(THRIFT_FILE_WE ${THRIFT_FILE} NAME_WE)
 
     # This isn't the exact output file that's created, just a unique file
-    set(OUTPUT_FILE "${EXT_DS_OUTPUT_DIR}/${FIL_WE}.java")
+    set(OUTPUT_FILE "${EXT_DS_OUTPUT_DIR}/${THRIFT_FILE_WE}.java")
     list(APPEND ${VAR} ${OUTPUT_FILE})
     add_custom_command(
       OUTPUT ${OUTPUT_FILE}
-      COMMAND ${THRIFT_COMPILER} ${JAVA_EXT_DS_ARGS} ${FIL}
-      DEPENDS ${ABS_FIL}
-      COMMENT "Running thrift compiler for ext-data-source on ${FIL}"
+      COMMAND ${THRIFT_COMPILER} ${JAVA_EXT_DS_ARGS} ${THRIFT_FILE}
+      DEPENDS ${ABS_THRIFT_FILE}
+      COMMENT "Running thrift compiler for ext-data-source on ${THRIFT_FILE}"
       VERBATIM
     )
-  endforeach(FIL)
+  endforeach(THRIFT_FILE)
   set(${VAR} ${${VAR}} PARENT_SCOPE)
 endfunction(THRIFT_GEN_DS)
 
+# Helper function similar to THRIFT_GEN and THRIFT_GEN_DS, but specific to Thrift 11. For
+# now the method only generates Python code. The generated Python code is not used by
+# any other Impala module, it is merely meant to provide devs access to Thrift 11 Python
+# generated code until a full upgrade from Thrift 0.9 to 0.11 has been completed. Once
+# the complete upgrade to Thrift 11 has been complete (see IMPALA-7825), this method
+# should be removed.
+function(THRIFT11_GEN VAR)
+  IF (NOT ARGN)
+    MESSAGE(SEND_ERROR "Error: THRIFT11_GEN called without any src files")
+    RETURN()
+  ENDIF(NOT ARGN)
+
+  set(${VAR})
+  foreach(THRIFT_FILE ${ARGN})
+    # Get full path
+    get_filename_component(ABS_THRIFT__FILE ${THRIFT_FILE} ABSOLUTE)
+    # Get basename without the file extension
+    get_filename_component(THRIFT_FILE_WE ${THRIFT_FILE} NAME_WE)
+
+    # This isn't the exact output file that's created, just a unique file
+    set(OUTPUT_FILE "${THRIFT11_PYTHON_OUTPUT_DIR}/${THRIFT_FILE_WE}.py")
+    list(APPEND ${VAR} ${OUTPUT_FILE})
+    add_custom_command(
+      OUTPUT ${OUTPUT_FILE}
+      COMMAND ${THRIFT11_COMPILER} ${THRIFT11_PYTHON_ARGS} ${THRIFT_FILE}
+      DEPENDS ${ABS_THRIFT_FILE}
+      COMMENT "Running thrift 11 compiler on ${THRIFT_FILE}"
+      VERBATIM
+    )
+  endforeach(THRIFT_FILE)
+  set(${VAR} ${${VAR}} PARENT_SCOPE)
+endfunction(THRIFT11_GEN)
 
 set(HIVE_THRIFT_SOURCE_DIR "hive-$ENV{IMPALA_HIVE_MAJOR_VERSION}-api")
 set(TCLI_SERVICE_THRIFT "${HIVE_THRIFT_SOURCE_DIR}/TCLIService.thrift")
 message("Using Thrift compiler: ${THRIFT_COMPILER}")
+message("Using Thrift 11 compiler: ${THRIFT11_COMPILER}")
 set(THRIFT_INCLUDE_DIR_OPTION -I ${THRIFT_CONTRIB_DIR} -I $ENV{HIVE_SRC_DIR}/metastore/if
   -I ${HIVE_THRIFT_SOURCE_DIR})
 set(BE_OUTPUT_DIR ${CMAKE_SOURCE_DIR}/be/generated-sources)
@@ -132,17 +165,21 @@ set(FE_OUTPUT_DIR ${CMAKE_SOURCE_DIR}/fe/generated-sources)
 # TODO: avoid duplicating generated java classes
 set(EXT_DS_OUTPUT_DIR ${CMAKE_SOURCE_DIR}/ext-data-source/api/generated-sources)
 set(PYTHON_OUTPUT_DIR ${CMAKE_SOURCE_DIR}/shell/)
+set(THRIFT11_PYTHON_OUTPUT_DIR ${PYTHON_OUTPUT_DIR}/build/thrift-11-gen)
 MESSAGE("Found output dir: " ${PYTHON_OUTPUT_DIR})
 file(MAKE_DIRECTORY ${BE_OUTPUT_DIR})
 file(MAKE_DIRECTORY ${FE_OUTPUT_DIR})
 file(MAKE_DIRECTORY ${EXT_DS_OUTPUT_DIR})
 file(MAKE_DIRECTORY ${PYTHON_OUTPUT_DIR})
 file(MAKE_DIRECTORY ${HIVE_THRIFT_SOURCE_DIR})
+file(MAKE_DIRECTORY ${THRIFT11_PYTHON_OUTPUT_DIR})
 
 # Args passed to thrift for Java gen
 set(JAVA_FE_ARGS ${THRIFT_INCLUDE_DIR_OPTION} --gen java -o ${FE_OUTPUT_DIR})
 set(JAVA_EXT_DS_ARGS ${THRIFT_INCLUDE_DIR_OPTION} --gen java -o ${EXT_DS_OUTPUT_DIR})
 set(PYTHON_ARGS ${THRIFT_INCLUDE_DIR_OPTION} -r --gen py -o ${PYTHON_OUTPUT_DIR})
+set(THRIFT11_PYTHON_ARGS ${THRIFT_INCLUDE_DIR_OPTION} -r --gen py -o
+                         ${THRIFT11_PYTHON_OUTPUT_DIR})
 
 set (EXT_DATA_SRC_FILES
   ErrorCodes.thrift
@@ -211,7 +248,7 @@ add_custom_command(OUTPUT hive-2-api/TCLIService.thrift
 # a list of files they produce
 THRIFT_GEN(THRIFT_ALL_FILES ${SRC_FILES})
 THRIFT_GEN_DS(THRIFT_DATA_SRC_FILES ${EXT_DATA_SRC_FILES})
-
+THRIFT11_GEN(THRIFT11_ALL_FILES ${SRC_FILES})
 
 add_custom_target(thrift-generated-files-error DEPENDS ErrorCodes.thrift)
 add_custom_target(thrift-generated-files-metrics DEPENDS MetricDefs.thrift)
@@ -225,6 +262,9 @@ add_dependencies(thrift-cpp thrift-generated-files-metrics thrift-generated-file
 add_custom_target(thrift-ext-data-src ALL DEPENDS ${THRIFT_DATA_SRC_FILES})
 add_dependencies(thrift-ext-data-src thrift-cpp)
 
+add_custom_target(thrift11-py ALL DEPENDS ${THRIFT11_ALL_FILES})
+add_dependencies(thrift11-py thrift-ext-data-src)
+
 # Combined target for all thrift dependencies
 add_custom_target(thrift-deps ALL)
-add_dependencies(thrift-deps thrift-ext-data-src)
+add_dependencies(thrift-deps thrift11-py)