Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2020/09/10 04:32:30 UTC

[GitHub] [spark] HyukjinKwon commented on a change in pull request #29703: [SPARK-32017][PYTHON][BUILD] Make Pyspark Hadoop 3.2+ Variant available in PyPI

HyukjinKwon commented on a change in pull request #29703:
URL: https://github.com/apache/spark/pull/29703#discussion_r486056702



##########
File path: python/pyspark/install.py
##########
@@ -0,0 +1,170 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+import re
+import tarfile
+import traceback
+import urllib.request
+from shutil import rmtree
+# NOTE that we should not import pyspark here because this module is used in
+# setup.py, so we have to assume PySpark is not importable yet.
+
+DEFAULT_HADOOP = "hadoop3.2"
+DEFAULT_HIVE = "hive2.3"
+SUPPORTED_HADOOP_VERSIONS = ["hadoop2.7", "hadoop3.2", "without-hadoop"]
+SUPPORTED_HIVE_VERSIONS = ["hive1.2", "hive2.3"]
+UNSUPPORTED_COMBINATIONS = [
+    ("without-hadoop", "hive1.2"),
+    ("hadoop3.2", "hive1.2"),
+]
+
+
+def checked_package_name(spark_version, hadoop_version, hive_version):
+    if hive_version == "hive1.2":
+        return "%s-bin-%s-%s" % (spark_version, hadoop_version, hive_version)
+    else:
+        return "%s-bin-%s" % (spark_version, hadoop_version)
+
+
+def checked_versions(spark_version, hadoop_version, hive_version):
+    """
+    Check the valid combinations of supported versions in Spark distributions.
+
+    :param spark_version: Spark version. It should be X.X.X such as '3.0.0' or 'spark-3.0.0'.
+    :param hadoop_version: Hadoop version. It should be X.X such as '2.7' or 'hadoop2.7'.
+        'without' and 'without-hadoop' are supported as special keywords for Hadoop free
+        distribution.
+    :param hive_version: Hive version. It should be X.X such as '1.2' or 'hive1.2'.
+
+    :return: fully-qualified versions of Spark, Hadoop and Hive in a tuple.
+        For example, spark-3.0.0, hadoop3.2 and hive2.3.
+    """
+    if re.match("^[0-9]+\\.[0-9]+\\.[0-9]+$", spark_version):
+        spark_version = "spark-%s" % spark_version
+    if not spark_version.startswith("spark-"):
+        raise RuntimeError(
+            "Spark version should start with 'spark-' prefix; however, "
+            "got %s" % spark_version)
+
+    if hadoop_version == "without":
+        hadoop_version = "without-hadoop"
+    elif re.match("^[0-9]+\\.[0-9]+$", hadoop_version):
+        hadoop_version = "hadoop%s" % hadoop_version
+
+    if hadoop_version not in SUPPORTED_HADOOP_VERSIONS:
+        raise RuntimeError(
+            "Spark distribution of %s is not supported. Hadoop version should be "
+            "one of [%s]" % (hadoop_version, ", ".join(
+                SUPPORTED_HADOOP_VERSIONS)))
+
+    if re.match("^[0-9]+\\.[0-9]+$", hive_version):
+        hive_version = "hive%s" % hive_version
+
+    if hive_version not in SUPPORTED_HIVE_VERSIONS:
+        raise RuntimeError(
+            "Spark distribution of %s is not supported. Hive version should be "
+            "one of [%s]" % (hive_version, ", ".join(
+                SUPPORTED_HIVE_VERSIONS)))
+
+    if (hadoop_version, hive_version) in UNSUPPORTED_COMBINATIONS:
+        raise RuntimeError("Hive 1.2 should only be with Hadoop 2.7.")
+
+    return spark_version, hadoop_version, hive_version
+
+
+def install_spark(dest, spark_version, hadoop_version, hive_version):
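
For illustration, a minimal sketch of how the validation helpers above would behave when called directly. The import path pyspark.install and the example values are assumptions for the sketch, not taken from the PR:

# Illustrative usage only; assumes the new module is importable as pyspark.install.
from pyspark.install import checked_package_name, checked_versions

# Bare version strings are normalized to the distribution naming scheme.
versions = checked_versions(spark_version="3.0.0", hadoop_version="2.7", hive_version="2.3")
print(versions)  # ('spark-3.0.0', 'hadoop2.7', 'hive2.3')

# For the default Hive 2.3 builds, the package name omits the Hive suffix.
print(checked_package_name(*versions))  # 'spark-3.0.0-bin-hadoop2.7'

# Unsupported combinations raise RuntimeError, e.g. Hadoop 3.2 with Hive 1.2.
try:
    checked_versions("3.0.0", "3.2", "1.2")
except RuntimeError as error:
    print(error)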

Review comment:
       I basically referred to https://github.com/apache/spark/blob/b84ed4146d93b37adb2b83ca642c7978a1ac853e/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala#L70-L111
   and
   https://github.com/apache/spark/blob/f53d8c63e80172295e2fbc805c0c391bdececcaa/R/pkg/R/install.R#L68-L161
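
Since the body of install_spark is not shown in this excerpt, here is a rough sketch of the download-and-extract pattern the referenced install.R follows, using only imports already present in install.py. The helper name, mirror URL, and directory layout below are illustrative assumptions, not the implementation in this PR:

import os
import tarfile
import urllib.request

def download_and_extract(package_name, dest):
    # Illustrative mirror; the referenced scripts resolve a preferred mirror dynamically.
    spark_dir = package_name.split("-bin-")[0]  # e.g. 'spark-3.0.0'
    url = "https://archive.apache.org/dist/spark/%s/%s.tgz" % (spark_dir, package_name)
    os.makedirs(dest, exist_ok=True)
    tgz_path = os.path.join(dest, "%s.tgz" % package_name)
    urllib.request.urlretrieve(url, tgz_path)    # download the tarball
    with tarfile.open(tgz_path, "r:gz") as tar:  # extract into the destination
        tar.extractall(path=dest)
    os.remove(tgz_path)                          # drop the tarball once extracted

Fed the output of checked_package_name above, this would leave an extracted spark-3.0.0-bin-hadoop2.7/ directory under dest.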






