You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2021/09/21 14:36:20 UTC
[GitHub] [spark] dgd-contributor opened a new pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
dgd-contributor opened a new pull request #34058:
URL: https://github.com/apache/spark/pull/34058
### What changes were proposed in this pull request?
Support multi-index in new syntax to specify index data type
### Why are the changes needed?
Support multi-index in new syntax to specify index data type
https://issues.apache.org/jira/browse/SPARK-36707
### Does this PR introduce _any_ user-facing change?
After this PR, users can use:
``` python
>>> ps.DataFrame[[int, int],[int, int]]
typing.Tuple[pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.NameType, pyspark.pandas.typedef.typehints.NameType]
>>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
>>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
>>> pdf = pd.DataFrame([[1,2,3],[2,3,4],[4,5,6]], index=idx, columns=["a", "b", "c"])
>>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes]
typing.Tuple[pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.NameType, pyspark.pandas.typedef.typehints.NameType, pyspark.pandas.typedef.typehints.NameType]
>>> ps.DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
typing.Tuple[pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.NameType, pyspark.pandas.typedef.typehints.NameType]
>>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
typing.Tuple[pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.NameType, pyspark.pandas.typedef.typehints.NameType, pyspark.pandas.typedef.typehints.NameType]
```
### How was this patch tested?
existing tests
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] dgd-contributor commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
dgd-contributor commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r714430891
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -673,98 +673,146 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with an Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
return Tuple[extract_types(params)]
# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
def extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
Review comment:
Sorry, could you explain more?
I use this comment for :
``` python
>>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
>>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
>>> pdf = pd.DataFrame([[1,2,3],[2,3,4],[4,5,6]], index=idx, columns=["a", "b", "c"])
>>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
typing.Tuple[pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.NameType, pyspark.pandas.typedef.typehints.NameType, pyspark.pandas.typedef.typehints.NameType]
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925914256
**[Test build #143553 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143553/testReport)** for PR 34058 at commit [`49b8d19`](https://github.com/apache/spark/commit/49b8d1960a7ed57f777ff751e9a62d05de6d6010).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926435427
**[Test build #143596 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143596/testReport)** for PR 34058 at commit [`c8cf08e`](https://github.com/apache/spark/commit/c8cf08ee446302be514642f5c8e571b21eb242a2).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925589368
**[Test build #143545 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143545/testReport)** for PR 34058 at commit [`b7dbb9c`](https://github.com/apache/spark/commit/b7dbb9cd75d8d9bdb146e88b7861eae7cfcf26ab).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925674931
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/143547/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926439109
**[Test build #143596 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143596/testReport)** for PR 34058 at commit [`c8cf08e`](https://github.com/apache/spark/commit/c8cf08ee446302be514642f5c8e571b21eb242a2).
* This patch **fails to build**.
* This patch merges cleanly.
* This patch adds the following public classes _(experimental)_:
* ` (<class 'int'>,)`
* ` (<class 'int'>, <class 'int'>, <class 'int'>)`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925590237
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/143545/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925457249
**[Test build #143525 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143525/testReport)** for PR 34058 at commit [`8e4bedc`](https://github.com/apache/spark/commit/8e4bedcd38e7fa01a8663f766f9f2f20856db699).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925452366
ok to test
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926726480
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder-K8s/48115/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926666099
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/143603/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926666099
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/143603/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926398197
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/143594/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925670778
Kubernetes integration test status failure
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/48054/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925589368
**[Test build #143545 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143545/testReport)** for PR 34058 at commit [`b7dbb9c`](https://github.com/apache/spark/commit/b7dbb9cd75d8d9bdb146e88b7861eae7cfcf26ab).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r714421963
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -673,98 +673,146 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with an Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
return Tuple[extract_types(params)]
# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
def extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
Review comment:
I think this shouldn't be wrapped w/ `zip` but a tuple.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925750214
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder-K8s/48056/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925471508
Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/48033/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925472750
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/143525/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-924076359
Can one of the admins verify this patch?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r715274484
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,145 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
- return Tuple[extract_types(params)]
+ return Tuple[_extract_types(params)]
-# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
-def extract_types(params: Any) -> Tuple:
+def _extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
- index_param = params[0]
- index_type = type(
- "IndexNameType", (IndexNameTypeHolder,), {}
- ) # type: Type[IndexNameTypeHolder]
- if isinstance(index_param, tuple):
- if len(index_param) != 2:
- raise TypeError(
- "Type hints for index should be specified as "
- "DataFrame[('name', type), ...]; however, got %s" % index_param
- )
- name, tpe = index_param
- else:
- name, tpe = None, index_param
+ index_params = params[0]
+
+ if isinstance(index_params, tuple) and len(index_params) == 2:
+ index_params = tuple([slice(*index_params)])
+
+ index_params = (
+ _convert_tuples_to_zip(index_params)
+ if _is_valid_type_tuples(index_params)
+ else index_params
+ )
+ index_params = _prepare_a_tuple(index_params)
- index_type.name = name
- if isinstance(tpe, ExtensionDtype):
- index_type.tpe = tpe
+ if _is_valid_slices(index_params):
+ # Example:
+ # DataFrame[["id": int, "A": int], [int, int]]
+ new_index_params = _convert_slices_to_holders(index_params, is_index=True)
+ index_types = tuple(new_index_params)
else:
- index_type.tpe = tpe.type if isinstance(tpe, np.dtype) else tpe
+ # Examples:
+ # DataFrame[[float, float], [int, int]]
+ # DataFrame[pdf.dtypes, [int, int]]
+ index_types = _handle_list_of_types(index_params, origin, is_index=True)
data_types = params[1]
- if (
- isinstance(data_types, list)
- and len(data_types) >= 1
- and isinstance(data_types[0], tuple)
- ): # type: ignore
- # Example:
- # DataFrame[("index", int), [("id", int), ("A", int)]]
- data_types = zip((name for name, _ in data_types), (tpe for _, tpe in data_types))
- return (index_type,) + extract_types(data_types)
- elif all(not isinstance(param, slice) and not isinstance(param, Iterable) for param in params):
+ data_types = (
+ _convert_tuples_to_zip(data_types) if _is_valid_type_tuples(data_types) else data_types
+ )
+
+ return index_types + _extract_types(data_types)
+
+ else:
# Examples:
# DataFrame[float, float]
# DataFrame[pdf.dtypes]
+ return _handle_list_of_types(params, origin, is_index=False)
+
+
+def _is_valid_slices(params: Any) -> Any:
+ return all(
+ isinstance(param, slice) and param.step is None and param.stop is not None
+ for param in params
+ )
+
+
+def _convert_slices_to_holders(params: Any, is_index: bool) -> Any:
+ # Example:
+ # params = (slice("id", int, None), slice("A", int, None))
+ new_params = []
+ for param in params:
+ new_param = _get_holder(is_index)
+ new_param.name = param.start
+ # When the given argument is a numpy's dtype instance.
+ new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
+ new_params.append(new_param)
+ return new_params
+
+
+def _prepare_a_tuple(params: Any) -> Any:
+ if isinstance(params, zip): # type: ignore
+ # Example:
+ # params = [zip(pdf.columns, pdf.dtypes)]
+ # or
Review comment:
```suggestion
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r715276924
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,145 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
- return Tuple[extract_types(params)]
+ return Tuple[_extract_types(params)]
-# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
-def extract_types(params: Any) -> Tuple:
+def _extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
- index_param = params[0]
- index_type = type(
- "IndexNameType", (IndexNameTypeHolder,), {}
- ) # type: Type[IndexNameTypeHolder]
- if isinstance(index_param, tuple):
- if len(index_param) != 2:
- raise TypeError(
- "Type hints for index should be specified as "
- "DataFrame[('name', type), ...]; however, got %s" % index_param
- )
- name, tpe = index_param
- else:
- name, tpe = None, index_param
+ index_params = params[0]
+
+ if isinstance(index_params, tuple) and len(index_params) == 2:
+ index_params = tuple([slice(*index_params)])
+
+ index_params = (
+ _convert_tuples_to_zip(index_params)
+ if _is_valid_type_tuples(index_params)
+ else index_params
+ )
+ index_params = _prepare_a_tuple(index_params)
- index_type.name = name
- if isinstance(tpe, ExtensionDtype):
- index_type.tpe = tpe
+ if _is_valid_slices(index_params):
+ # Example:
+ # DataFrame[["id": int, "A": int], [int, int]]
+ new_index_params = _convert_slices_to_holders(index_params, is_index=True)
+ index_types = tuple(new_index_params)
else:
- index_type.tpe = tpe.type if isinstance(tpe, np.dtype) else tpe
+ # Examples:
+ # DataFrame[[float, float], [int, int]]
+ # DataFrame[pdf.dtypes, [int, int]]
+ index_types = _handle_list_of_types(index_params, origin, is_index=True)
data_types = params[1]
- if (
- isinstance(data_types, list)
- and len(data_types) >= 1
- and isinstance(data_types[0], tuple)
- ): # type: ignore
- # Example:
- # DataFrame[("index", int), [("id", int), ("A", int)]]
- data_types = zip((name for name, _ in data_types), (tpe for _, tpe in data_types))
- return (index_type,) + extract_types(data_types)
- elif all(not isinstance(param, slice) and not isinstance(param, Iterable) for param in params):
+ data_types = (
+ _convert_tuples_to_zip(data_types) if _is_valid_type_tuples(data_types) else data_types
+ )
+
+ return index_types + _extract_types(data_types)
+
+ else:
# Examples:
# DataFrame[float, float]
# DataFrame[pdf.dtypes]
+ return _handle_list_of_types(params, origin, is_index=False)
+
+
+def _is_valid_slices(params: Any) -> Any:
Review comment:
I would name it something like `_contains_names` or `_is_named_params`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] dgd-contributor commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
dgd-contributor commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-934001349
Please take a look at this PR https://github.com/apache/spark/pull/34176
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925489894
Kubernetes integration test status failure
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/48033/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926514413
Kubernetes integration test status failure
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/48108/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926432345
Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/48106/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926439178
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/143596/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926398174
**[Test build #143594 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143594/testReport)** for PR 34058 at commit [`0eb2dea`](https://github.com/apache/spark/commit/0eb2deae8b5ef487b7d92ebdf0c6bcfe2d6a2e3b).
* This patch **fails Python style tests**.
* This patch merges cleanly.
* This patch adds the following public classes _(experimental)_:
* ` (<class 'int'>,)`
* ` (<class 'int'>, <class 'int'>, <class 'int'>)`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925914256
**[Test build #143553 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143553/testReport)** for PR 34058 at commit [`49b8d19`](https://github.com/apache/spark/commit/49b8d1960a7ed57f777ff751e9a62d05de6d6010).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r715277365
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -793,11 +846,23 @@ def extract_types(params: Any) -> Tuple:
- DataFrame[index_type, [type, ...]]
- DataFrame[(index_name, index_type), [(name, type), ...]]
- DataFrame[dtype instance, dtypes instance]
- - DataFrame[(index_name, index_type), zip(names, types)]\n"""
+ - DataFrame[(index_name, index_type), zip(names, types)]
+ - DataFrame[[index_type, ...], [type, ...]]
+ - DataFrame[[(index_name, index_type), ...], [(name, type), ...]]
+ - DataFrame[dtypes instance, dtypes instance]
+ - DataFrame[zip(index_names, index_types), zip(names, types)]\n"""
+ "However, got %s." % str(origin)
)
+def _get_holder(is_index: bool) -> Union[Type[IndexNameTypeHolder], Type[NameTypeHolder]]:
Review comment:
let's remove this method. It doesn't virtually remove the duplication by dispatching on `is_index`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-927216403
Yeah, let's hold off for a while.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925472750
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/143525/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925671394
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder-K8s/48054/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926398197
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/143594/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] dgd-contributor commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
dgd-contributor commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r715443222
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,145 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
- return Tuple[extract_types(params)]
+ return Tuple[_extract_types(params)]
-# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
-def extract_types(params: Any) -> Tuple:
+def _extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
- index_param = params[0]
- index_type = type(
- "IndexNameType", (IndexNameTypeHolder,), {}
- ) # type: Type[IndexNameTypeHolder]
- if isinstance(index_param, tuple):
- if len(index_param) != 2:
- raise TypeError(
- "Type hints for index should be specified as "
- "DataFrame[('name', type), ...]; however, got %s" % index_param
- )
- name, tpe = index_param
- else:
- name, tpe = None, index_param
+ index_params = params[0]
+
+ if isinstance(index_params, tuple) and len(index_params) == 2:
+ index_params = tuple([slice(*index_params)])
+
+ index_params = (
+ _convert_tuples_to_zip(index_params)
+ if _is_valid_type_tuples(index_params)
+ else index_params
+ )
+ index_params = _prepare_a_tuple(index_params)
- index_type.name = name
- if isinstance(tpe, ExtensionDtype):
- index_type.tpe = tpe
+ if _is_valid_slices(index_params):
+ # Example:
+ # DataFrame[["id": int, "A": int], [int, int]]
+ new_index_params = _convert_slices_to_holders(index_params, is_index=True)
+ index_types = tuple(new_index_params)
else:
- index_type.tpe = tpe.type if isinstance(tpe, np.dtype) else tpe
+ # Examples:
+ # DataFrame[[float, float], [int, int]]
+ # DataFrame[pdf.dtypes, [int, int]]
+ index_types = _handle_list_of_types(index_params, origin, is_index=True)
data_types = params[1]
- if (
- isinstance(data_types, list)
- and len(data_types) >= 1
- and isinstance(data_types[0], tuple)
- ): # type: ignore
- # Example:
- # DataFrame[("index", int), [("id", int), ("A", int)]]
- data_types = zip((name for name, _ in data_types), (tpe for _, tpe in data_types))
- return (index_type,) + extract_types(data_types)
- elif all(not isinstance(param, slice) and not isinstance(param, Iterable) for param in params):
+ data_types = (
+ _convert_tuples_to_zip(data_types) if _is_valid_type_tuples(data_types) else data_types
+ )
+
+ return index_types + _extract_types(data_types)
+
+ else:
# Examples:
# DataFrame[float, float]
# DataFrame[pdf.dtypes]
+ return _handle_list_of_types(params, origin, is_index=False)
+
+
+def _is_valid_slices(params: Any) -> Any:
+ return all(
+ isinstance(param, slice) and param.step is None and param.stop is not None
+ for param in params
+ )
+
+
+def _convert_slices_to_holders(params: Any, is_index: bool) -> Any:
+ # Example:
+ # params = (slice("id", int, None), slice("A", int, None))
+ new_params = []
+ for param in params:
+ new_param = _get_holder(is_index)
+ new_param.name = param.start
+ # When the given argument is a numpy's dtype instance.
+ new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
+ new_params.append(new_param)
+ return new_params
+
+
+def _prepare_a_tuple(params: Any) -> Any:
Review comment:
Updated. Please take another look when you have time.
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,145 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
- return Tuple[extract_types(params)]
+ return Tuple[_extract_types(params)]
-# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
-def extract_types(params: Any) -> Tuple:
+def _extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
- index_param = params[0]
- index_type = type(
- "IndexNameType", (IndexNameTypeHolder,), {}
- ) # type: Type[IndexNameTypeHolder]
- if isinstance(index_param, tuple):
- if len(index_param) != 2:
- raise TypeError(
- "Type hints for index should be specified as "
- "DataFrame[('name', type), ...]; however, got %s" % index_param
- )
- name, tpe = index_param
- else:
- name, tpe = None, index_param
+ index_params = params[0]
+
+ if isinstance(index_params, tuple) and len(index_params) == 2:
+ index_params = tuple([slice(*index_params)])
+
+ index_params = (
+ _convert_tuples_to_zip(index_params)
+ if _is_valid_type_tuples(index_params)
+ else index_params
+ )
+ index_params = _prepare_a_tuple(index_params)
- index_type.name = name
- if isinstance(tpe, ExtensionDtype):
- index_type.tpe = tpe
+ if _is_valid_slices(index_params):
+ # Example:
+ # DataFrame[["id": int, "A": int], [int, int]]
+ new_index_params = _convert_slices_to_holders(index_params, is_index=True)
+ index_types = tuple(new_index_params)
else:
- index_type.tpe = tpe.type if isinstance(tpe, np.dtype) else tpe
+ # Examples:
+ # DataFrame[[float, float], [int, int]]
+ # DataFrame[pdf.dtypes, [int, int]]
+ index_types = _handle_list_of_types(index_params, origin, is_index=True)
data_types = params[1]
- if (
- isinstance(data_types, list)
- and len(data_types) >= 1
- and isinstance(data_types[0], tuple)
- ): # type: ignore
- # Example:
- # DataFrame[("index", int), [("id", int), ("A", int)]]
- data_types = zip((name for name, _ in data_types), (tpe for _, tpe in data_types))
- return (index_type,) + extract_types(data_types)
- elif all(not isinstance(param, slice) and not isinstance(param, Iterable) for param in params):
+ data_types = (
+ _convert_tuples_to_zip(data_types) if _is_valid_type_tuples(data_types) else data_types
+ )
+
+ return index_types + _extract_types(data_types)
+
+ else:
# Examples:
# DataFrame[float, float]
# DataFrame[pdf.dtypes]
+ return _handle_list_of_types(params, origin, is_index=False)
+
+
+def _is_valid_slices(params: Any) -> Any:
+ return all(
+ isinstance(param, slice) and param.step is None and param.stop is not None
+ for param in params
+ )
+
+
+def _convert_slices_to_holders(params: Any, is_index: bool) -> Any:
+ # Example:
+ # params = (slice("id", int, None), slice("A", int, None))
+ new_params = []
+ for param in params:
+ new_param = _get_holder(is_index)
Review comment:
updated
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r714711128
##########
File path: python/pyspark/pandas/frame.py
##########
@@ -2563,12 +2563,18 @@ def apply_func(pdf: pd.DataFrame) -> pd.DataFrame:
index_spark_columns = None
index_names: Optional[List[Optional[Tuple[Any, ...]]]] = None
- index_fields = None
+
if should_retain_index:
- index_spark_columns = [scol_for(sdf, index_field.struct_field.name)]
- index_fields = [index_field]
- if index_field.struct_field.name != SPARK_DEFAULT_INDEX_NAME:
- index_names = [(index_field.struct_field.name,)]
+ index_spark_columns = [
+ scol_for(sdf, index_field.struct_field.name) for index_field in index_fields
+ ]
+ if all(
+ [
+ index_field.struct_field.name != SPARK_DEFAULT_INDEX_NAME
Review comment:
Maybe we should use `SPARK_INDEX_NAME_PATTERN` and if the pattern matches.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925674931
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/143547/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925997015
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder-K8s/48062/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926469313
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder-K8s/48106/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r714451936
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -673,98 +673,146 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
return Tuple[extract_types(params)]
# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
def extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
Review comment:
I meant:
```
ps.DataFrame[(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
```
:-).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925996969
Kubernetes integration test status failure
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/48062/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925671394
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder-K8s/48054/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925711094
Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/48056/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925750170
Kubernetes integration test status failure
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/48056/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926514446
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder-K8s/48108/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r715290116
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,145 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
- return Tuple[extract_types(params)]
+ return Tuple[_extract_types(params)]
-# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
-def extract_types(params: Any) -> Tuple:
+def _extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
- index_param = params[0]
- index_type = type(
- "IndexNameType", (IndexNameTypeHolder,), {}
- ) # type: Type[IndexNameTypeHolder]
- if isinstance(index_param, tuple):
- if len(index_param) != 2:
- raise TypeError(
- "Type hints for index should be specified as "
- "DataFrame[('name', type), ...]; however, got %s" % index_param
- )
- name, tpe = index_param
- else:
- name, tpe = None, index_param
+ index_params = params[0]
+
+ if isinstance(index_params, tuple) and len(index_params) == 2:
+ index_params = tuple([slice(*index_params)])
+
+ index_params = (
+ _convert_tuples_to_zip(index_params)
+ if _is_valid_type_tuples(index_params)
+ else index_params
+ )
+ index_params = _prepare_a_tuple(index_params)
- index_type.name = name
- if isinstance(tpe, ExtensionDtype):
- index_type.tpe = tpe
+ if _is_valid_slices(index_params):
+ # Example:
+ # DataFrame[["id": int, "A": int], [int, int]]
+ new_index_params = _convert_slices_to_holders(index_params, is_index=True)
+ index_types = tuple(new_index_params)
else:
- index_type.tpe = tpe.type if isinstance(tpe, np.dtype) else tpe
+ # Examples:
+ # DataFrame[[float, float], [int, int]]
+ # DataFrame[pdf.dtypes, [int, int]]
+ index_types = _handle_list_of_types(index_params, origin, is_index=True)
data_types = params[1]
- if (
- isinstance(data_types, list)
- and len(data_types) >= 1
- and isinstance(data_types[0], tuple)
- ): # type: ignore
- # Example:
- # DataFrame[("index", int), [("id", int), ("A", int)]]
- data_types = zip((name for name, _ in data_types), (tpe for _, tpe in data_types))
- return (index_type,) + extract_types(data_types)
- elif all(not isinstance(param, slice) and not isinstance(param, Iterable) for param in params):
+ data_types = (
+ _convert_tuples_to_zip(data_types) if _is_valid_type_tuples(data_types) else data_types
+ )
+
+ return index_types + _extract_types(data_types)
+
+ else:
# Examples:
# DataFrame[float, float]
# DataFrame[pdf.dtypes]
+ return _handle_list_of_types(params, origin, is_index=False)
+
+
+def _is_valid_slices(params: Any) -> Any:
Review comment:
I thought it like: `slice` is always a pair of a name and type
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926514446
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder-K8s/48108/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r714452107
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -673,98 +673,146 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
return Tuple[extract_types(params)]
# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
def extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
Review comment:
ohh okay, its for dtype*s*
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -673,98 +673,146 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
return Tuple[extract_types(params)]
# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
def extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
Review comment:
ohh okay, it's for dtype**s**
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] dgd-contributor commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
dgd-contributor commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-924064228
CC @HyukjinKwon , Could you take a look when you have time? Thank you!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925490992
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder-K8s/48033/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926474907
Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/48108/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-924076359
Can one of the admins verify this patch?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925997015
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder-K8s/48062/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926469313
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder-K8s/48106/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925457249
**[Test build #143525 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143525/testReport)** for PR 34058 at commit [`8e4bedc`](https://github.com/apache/spark/commit/8e4bedcd38e7fa01a8663f766f9f2f20856db699).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925735009
Looks pretty promising otherwise. Thanks for working on this, @dgd-contributor.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] dgd-contributor closed pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
dgd-contributor closed pull request #34058:
URL: https://github.com/apache/spark/pull/34058
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926655843
**[Test build #143603 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143603/testReport)** for PR 34058 at commit [`271ad2d`](https://github.com/apache/spark/commit/271ad2d90bdf7a078a3068f796de404397c7634a).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926692788
Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/48115/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925673745
**[Test build #143547 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143547/testReport)** for PR 34058 at commit [`cffdd3e`](https://github.com/apache/spark/commit/cffdd3e88de1cf2de1a799f76999f31890d1bbd0).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925950995
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/143553/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925935993
**[Test build #143553 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143553/testReport)** for PR 34058 at commit [`49b8d19`](https://github.com/apache/spark/commit/49b8d1960a7ed57f777ff751e9a62d05de6d6010).
* This patch passes all tests.
* This patch merges cleanly.
* This patch adds no public classes.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926655843
**[Test build #143603 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143603/testReport)** for PR 34058 at commit [`271ad2d`](https://github.com/apache/spark/commit/271ad2d90bdf7a078a3068f796de404397c7634a).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926665803
**[Test build #143603 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143603/testReport)** for PR 34058 at commit [`271ad2d`](https://github.com/apache/spark/commit/271ad2d90bdf7a078a3068f796de404397c7634a).
* This patch **fails PySpark unit tests**.
* This patch merges cleanly.
* This patch adds the following public classes _(experimental)_:
* ` (<class 'int'>,)`
* ` (<class 'int'>, <class 'int'>, <class 'int'>)`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r715274867
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,145 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
- return Tuple[extract_types(params)]
+ return Tuple[_extract_types(params)]
-# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
-def extract_types(params: Any) -> Tuple:
+def _extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
- index_param = params[0]
- index_type = type(
- "IndexNameType", (IndexNameTypeHolder,), {}
- ) # type: Type[IndexNameTypeHolder]
- if isinstance(index_param, tuple):
- if len(index_param) != 2:
- raise TypeError(
- "Type hints for index should be specified as "
- "DataFrame[('name', type), ...]; however, got %s" % index_param
- )
- name, tpe = index_param
- else:
- name, tpe = None, index_param
+ index_params = params[0]
+
+ if isinstance(index_params, tuple) and len(index_params) == 2:
+ index_params = tuple([slice(*index_params)])
+
+ index_params = (
+ _convert_tuples_to_zip(index_params)
+ if _is_valid_type_tuples(index_params)
+ else index_params
+ )
+ index_params = _prepare_a_tuple(index_params)
- index_type.name = name
- if isinstance(tpe, ExtensionDtype):
- index_type.tpe = tpe
+ if _is_valid_slices(index_params):
+ # Example:
+ # DataFrame[["id": int, "A": int], [int, int]]
+ new_index_params = _convert_slices_to_holders(index_params, is_index=True)
+ index_types = tuple(new_index_params)
else:
- index_type.tpe = tpe.type if isinstance(tpe, np.dtype) else tpe
+ # Examples:
+ # DataFrame[[float, float], [int, int]]
+ # DataFrame[pdf.dtypes, [int, int]]
+ index_types = _handle_list_of_types(index_params, origin, is_index=True)
data_types = params[1]
- if (
- isinstance(data_types, list)
- and len(data_types) >= 1
- and isinstance(data_types[0], tuple)
- ): # type: ignore
- # Example:
- # DataFrame[("index", int), [("id", int), ("A", int)]]
- data_types = zip((name for name, _ in data_types), (tpe for _, tpe in data_types))
- return (index_type,) + extract_types(data_types)
- elif all(not isinstance(param, slice) and not isinstance(param, Iterable) for param in params):
+ data_types = (
+ _convert_tuples_to_zip(data_types) if _is_valid_type_tuples(data_types) else data_types
Review comment:
`_convert_tuples_to_zip(data_types) if _is_valid_type_tuples(data_types) else data_types` is repeated. Can we make one function instead of two `_convert_tuples_to_zip` and `_is_valid_type_tuples`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r715275958
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,145 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
- return Tuple[extract_types(params)]
+ return Tuple[_extract_types(params)]
-# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
-def extract_types(params: Any) -> Tuple:
+def _extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
- index_param = params[0]
- index_type = type(
- "IndexNameType", (IndexNameTypeHolder,), {}
- ) # type: Type[IndexNameTypeHolder]
- if isinstance(index_param, tuple):
- if len(index_param) != 2:
- raise TypeError(
- "Type hints for index should be specified as "
- "DataFrame[('name', type), ...]; however, got %s" % index_param
- )
- name, tpe = index_param
- else:
- name, tpe = None, index_param
+ index_params = params[0]
+
+ if isinstance(index_params, tuple) and len(index_params) == 2:
+ index_params = tuple([slice(*index_params)])
+
+ index_params = (
+ _convert_tuples_to_zip(index_params)
+ if _is_valid_type_tuples(index_params)
+ else index_params
+ )
+ index_params = _prepare_a_tuple(index_params)
- index_type.name = name
- if isinstance(tpe, ExtensionDtype):
- index_type.tpe = tpe
+ if _is_valid_slices(index_params):
+ # Example:
+ # DataFrame[["id": int, "A": int], [int, int]]
+ new_index_params = _convert_slices_to_holders(index_params, is_index=True)
+ index_types = tuple(new_index_params)
else:
- index_type.tpe = tpe.type if isinstance(tpe, np.dtype) else tpe
+ # Examples:
+ # DataFrame[[float, float], [int, int]]
+ # DataFrame[pdf.dtypes, [int, int]]
+ index_types = _handle_list_of_types(index_params, origin, is_index=True)
data_types = params[1]
- if (
- isinstance(data_types, list)
- and len(data_types) >= 1
- and isinstance(data_types[0], tuple)
- ): # type: ignore
- # Example:
- # DataFrame[("index", int), [("id", int), ("A", int)]]
- data_types = zip((name for name, _ in data_types), (tpe for _, tpe in data_types))
- return (index_type,) + extract_types(data_types)
- elif all(not isinstance(param, slice) and not isinstance(param, Iterable) for param in params):
+ data_types = (
+ _convert_tuples_to_zip(data_types) if _is_valid_type_tuples(data_types) else data_types
+ )
+
+ return index_types + _extract_types(data_types)
+
+ else:
# Examples:
# DataFrame[float, float]
# DataFrame[pdf.dtypes]
+ return _handle_list_of_types(params, origin, is_index=False)
+
+
+def _is_valid_slices(params: Any) -> Any:
+ return all(
+ isinstance(param, slice) and param.step is None and param.stop is not None
+ for param in params
+ )
+
+
+def _convert_slices_to_holders(params: Any, is_index: bool) -> Any:
+ # Example:
+ # params = (slice("id", int, None), slice("A", int, None))
+ new_params = []
+ for param in params:
+ new_param = _get_holder(is_index)
+ new_param.name = param.start
+ # When the given argument is a numpy's dtype instance.
+ new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
+ new_params.append(new_param)
+ return new_params
+
+
+def _prepare_a_tuple(params: Any) -> Any:
+ if isinstance(params, zip): # type: ignore
+ # Example:
+ # params = [zip(pdf.columns, pdf.dtypes)]
+ # or
+ # params = [zip(pdf.index.names, pdf.index.dtypes)]
+ params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
+
+ if isinstance(params, Iterable):
+ params = tuple(params)
+ else:
+ params = (params,)
+ return params
+
+
+def _convert_tuples_to_zip(params: Any) -> Any:
+ return zip((name for name, _ in params), (tpe for _, tpe in params))
+
+
+def _is_valid_type_tuples(params: Any) -> bool:
+ return isinstance(params, list) and len(params) >= 1 and isinstance(params[0], tuple)
+
+
+def _handle_list_of_types(params: Any, origin: Any, is_index: bool) -> Any:
Review comment:
Looks like the generalization seems a bit odds here. `_handle_list_of_types` address unnamed type holders and `_convert_slices_to_holders` address named type holders. Can we rename it properly?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925465871
**[Test build #143525 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143525/testReport)** for PR 34058 at commit [`8e4bedc`](https://github.com/apache/spark/commit/8e4bedcd38e7fa01a8663f766f9f2f20856db699).
* This patch passes all tests.
* This patch merges cleanly.
* This patch adds no public classes.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925490992
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder-K8s/48033/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] dgd-contributor commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
dgd-contributor commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r714939496
##########
File path: python/pyspark/pandas/frame.py
##########
@@ -2563,12 +2563,18 @@ def apply_func(pdf: pd.DataFrame) -> pd.DataFrame:
index_spark_columns = None
index_names: Optional[List[Optional[Tuple[Any, ...]]]] = None
- index_fields = None
+
if should_retain_index:
- index_spark_columns = [scol_for(sdf, index_field.struct_field.name)]
- index_fields = [index_field]
- if index_field.struct_field.name != SPARK_DEFAULT_INDEX_NAME:
- index_names = [(index_field.struct_field.name,)]
+ index_spark_columns = [
+ scol_for(sdf, index_field.struct_field.name) for index_field in index_fields
+ ]
+ if all(
+ [
+ index_field.struct_field.name != SPARK_DEFAULT_INDEX_NAME
Review comment:
Thanks, updated, please take another look when you have time!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r714712795
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,146 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
return Tuple[extract_types(params)]
# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
def extract_types(params: Any) -> Tuple:
Review comment:
feel free to name it `_extract_types`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r714712982
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,146 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
return Tuple[extract_types(params)]
# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
Review comment:
oh, can you remove this one while we're here? I fixed this already SPARK-36708
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r715290197
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,145 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
- return Tuple[extract_types(params)]
+ return Tuple[_extract_types(params)]
-# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
-def extract_types(params: Any) -> Tuple:
+def _extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
- index_param = params[0]
- index_type = type(
- "IndexNameType", (IndexNameTypeHolder,), {}
- ) # type: Type[IndexNameTypeHolder]
- if isinstance(index_param, tuple):
- if len(index_param) != 2:
- raise TypeError(
- "Type hints for index should be specified as "
- "DataFrame[('name', type), ...]; however, got %s" % index_param
- )
- name, tpe = index_param
- else:
- name, tpe = None, index_param
+ index_params = params[0]
+
+ if isinstance(index_params, tuple) and len(index_params) == 2:
+ index_params = tuple([slice(*index_params)])
+
+ index_params = (
+ _convert_tuples_to_zip(index_params)
+ if _is_valid_type_tuples(index_params)
+ else index_params
+ )
+ index_params = _prepare_a_tuple(index_params)
- index_type.name = name
- if isinstance(tpe, ExtensionDtype):
- index_type.tpe = tpe
+ if _is_valid_slices(index_params):
+ # Example:
+ # DataFrame[["id": int, "A": int], [int, int]]
+ new_index_params = _convert_slices_to_holders(index_params, is_index=True)
+ index_types = tuple(new_index_params)
else:
- index_type.tpe = tpe.type if isinstance(tpe, np.dtype) else tpe
+ # Examples:
+ # DataFrame[[float, float], [int, int]]
+ # DataFrame[pdf.dtypes, [int, int]]
+ index_types = _handle_list_of_types(index_params, origin, is_index=True)
data_types = params[1]
- if (
- isinstance(data_types, list)
- and len(data_types) >= 1
- and isinstance(data_types[0], tuple)
- ): # type: ignore
- # Example:
- # DataFrame[("index", int), [("id", int), ("A", int)]]
- data_types = zip((name for name, _ in data_types), (tpe for _, tpe in data_types))
- return (index_type,) + extract_types(data_types)
- elif all(not isinstance(param, slice) and not isinstance(param, Iterable) for param in params):
+ data_types = (
+ _convert_tuples_to_zip(data_types) if _is_valid_type_tuples(data_types) else data_types
+ )
+
+ return index_types + _extract_types(data_types)
+
+ else:
# Examples:
# DataFrame[float, float]
# DataFrame[pdf.dtypes]
+ return _handle_list_of_types(params, origin, is_index=False)
+
+
+def _is_valid_slices(params: Any) -> Any:
Review comment:
if it's not a `slice`, it's just type and uses a default type.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] dgd-contributor commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
dgd-contributor commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r714427614
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -673,98 +673,146 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
return Tuple[extract_types(params)]
# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
def extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
Review comment:
Thanks for reviewing,
above comment is ```DataFrame[pdf.index.dtype, pdf.dtypes]```
this one is ```DataFrame[pdf.index.dtypes, pdf.dtypes]```
This one is used for :
``` python
>>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
>>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
>>> pdf = pd.DataFrame([[1,2,3],[2,3,4],[4,5,6]], index=idx, columns=["a", "b", "c"])
>>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes]
typing.Tuple[pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.NameType, pyspark.pandas.typedef.typehints.NameType, pyspark.pandas.typedef.typehints.NameType]
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926397312
**[Test build #143594 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143594/testReport)** for PR 34058 at commit [`0eb2dea`](https://github.com/apache/spark/commit/0eb2deae8b5ef487b7d92ebdf0c6bcfe2d6a2e3b).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] dgd-contributor commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
dgd-contributor commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-927606088
I create new PR https://github.com/apache/spark/pull/34112 with ```Lead-authored-by``` and ```Co-authored-by``` tags from my personal account. Could you take a look? Thanks @HyukjinKwon
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926435427
**[Test build #143596 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143596/testReport)** for PR 34058 at commit [`c8cf08e`](https://github.com/apache/spark/commit/c8cf08ee446302be514642f5c8e571b21eb242a2).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] dgd-contributor commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
dgd-contributor commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r715284917
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,145 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
- return Tuple[extract_types(params)]
+ return Tuple[_extract_types(params)]
-# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
-def extract_types(params: Any) -> Tuple:
+def _extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
- index_param = params[0]
- index_type = type(
- "IndexNameType", (IndexNameTypeHolder,), {}
- ) # type: Type[IndexNameTypeHolder]
- if isinstance(index_param, tuple):
- if len(index_param) != 2:
- raise TypeError(
- "Type hints for index should be specified as "
- "DataFrame[('name', type), ...]; however, got %s" % index_param
- )
- name, tpe = index_param
- else:
- name, tpe = None, index_param
+ index_params = params[0]
+
+ if isinstance(index_params, tuple) and len(index_params) == 2:
+ index_params = tuple([slice(*index_params)])
+
+ index_params = (
+ _convert_tuples_to_zip(index_params)
+ if _is_valid_type_tuples(index_params)
+ else index_params
+ )
+ index_params = _prepare_a_tuple(index_params)
- index_type.name = name
- if isinstance(tpe, ExtensionDtype):
- index_type.tpe = tpe
+ if _is_valid_slices(index_params):
+ # Example:
+ # DataFrame[["id": int, "A": int], [int, int]]
+ new_index_params = _convert_slices_to_holders(index_params, is_index=True)
+ index_types = tuple(new_index_params)
else:
- index_type.tpe = tpe.type if isinstance(tpe, np.dtype) else tpe
+ # Examples:
+ # DataFrame[[float, float], [int, int]]
+ # DataFrame[pdf.dtypes, [int, int]]
+ index_types = _handle_list_of_types(index_params, origin, is_index=True)
data_types = params[1]
- if (
- isinstance(data_types, list)
- and len(data_types) >= 1
- and isinstance(data_types[0], tuple)
- ): # type: ignore
- # Example:
- # DataFrame[("index", int), [("id", int), ("A", int)]]
- data_types = zip((name for name, _ in data_types), (tpe for _, tpe in data_types))
- return (index_type,) + extract_types(data_types)
- elif all(not isinstance(param, slice) and not isinstance(param, Iterable) for param in params):
+ data_types = (
+ _convert_tuples_to_zip(data_types) if _is_valid_type_tuples(data_types) else data_types
+ )
+
+ return index_types + _extract_types(data_types)
+
+ else:
# Examples:
# DataFrame[float, float]
# DataFrame[pdf.dtypes]
+ return _handle_list_of_types(params, origin, is_index=False)
+
+
+def _is_valid_slices(params: Any) -> Any:
Review comment:
Thanks for your review!
I don't understand why we have ```name``` here?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925627459
Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/48054/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r715276357
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,145 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
- return Tuple[extract_types(params)]
+ return Tuple[_extract_types(params)]
-# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
-def extract_types(params: Any) -> Tuple:
+def _extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
- index_param = params[0]
- index_type = type(
- "IndexNameType", (IndexNameTypeHolder,), {}
- ) # type: Type[IndexNameTypeHolder]
- if isinstance(index_param, tuple):
- if len(index_param) != 2:
- raise TypeError(
- "Type hints for index should be specified as "
- "DataFrame[('name', type), ...]; however, got %s" % index_param
- )
- name, tpe = index_param
- else:
- name, tpe = None, index_param
+ index_params = params[0]
+
+ if isinstance(index_params, tuple) and len(index_params) == 2:
+ index_params = tuple([slice(*index_params)])
+
+ index_params = (
+ _convert_tuples_to_zip(index_params)
+ if _is_valid_type_tuples(index_params)
+ else index_params
+ )
+ index_params = _prepare_a_tuple(index_params)
- index_type.name = name
- if isinstance(tpe, ExtensionDtype):
- index_type.tpe = tpe
+ if _is_valid_slices(index_params):
+ # Example:
+ # DataFrame[["id": int, "A": int], [int, int]]
+ new_index_params = _convert_slices_to_holders(index_params, is_index=True)
+ index_types = tuple(new_index_params)
else:
- index_type.tpe = tpe.type if isinstance(tpe, np.dtype) else tpe
+ # Examples:
+ # DataFrame[[float, float], [int, int]]
+ # DataFrame[pdf.dtypes, [int, int]]
+ index_types = _handle_list_of_types(index_params, origin, is_index=True)
data_types = params[1]
- if (
- isinstance(data_types, list)
- and len(data_types) >= 1
- and isinstance(data_types[0], tuple)
- ): # type: ignore
- # Example:
- # DataFrame[("index", int), [("id", int), ("A", int)]]
- data_types = zip((name for name, _ in data_types), (tpe for _, tpe in data_types))
- return (index_type,) + extract_types(data_types)
- elif all(not isinstance(param, slice) and not isinstance(param, Iterable) for param in params):
+ data_types = (
+ _convert_tuples_to_zip(data_types) if _is_valid_type_tuples(data_types) else data_types
+ )
+
+ return index_types + _extract_types(data_types)
+
+ else:
# Examples:
# DataFrame[float, float]
# DataFrame[pdf.dtypes]
+ return _handle_list_of_types(params, origin, is_index=False)
+
+
+def _is_valid_slices(params: Any) -> Any:
+ return all(
+ isinstance(param, slice) and param.step is None and param.stop is not None
+ for param in params
+ )
+
+
+def _convert_slices_to_holders(params: Any, is_index: bool) -> Any:
+ # Example:
+ # params = (slice("id", int, None), slice("A", int, None))
+ new_params = []
+ for param in params:
+ new_param = _get_holder(is_index)
+ new_param.name = param.start
+ # When the given argument is a numpy's dtype instance.
+ new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
+ new_params.append(new_param)
+ return new_params
+
+
+def _prepare_a_tuple(params: Any) -> Any:
Review comment:
I would name it to `_to_tuple_of_params`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926463521
Kubernetes integration test status failure
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/48106/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925673745
**[Test build #143547 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143547/testReport)** for PR 34058 at commit [`cffdd3e`](https://github.com/apache/spark/commit/cffdd3e88de1cf2de1a799f76999f31890d1bbd0).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926439178
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/143596/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925750214
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder-K8s/48056/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r714421840
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -673,98 +673,146 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
return Tuple[extract_types(params)]
# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
def extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
Review comment:
this comment seems a duplicate w/ above.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925952512
Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/48062/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r715274552
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,145 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
- return Tuple[extract_types(params)]
+ return Tuple[_extract_types(params)]
-# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
-def extract_types(params: Any) -> Tuple:
+def _extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
- index_param = params[0]
- index_type = type(
- "IndexNameType", (IndexNameTypeHolder,), {}
- ) # type: Type[IndexNameTypeHolder]
- if isinstance(index_param, tuple):
- if len(index_param) != 2:
- raise TypeError(
- "Type hints for index should be specified as "
- "DataFrame[('name', type), ...]; however, got %s" % index_param
- )
- name, tpe = index_param
- else:
- name, tpe = None, index_param
+ index_params = params[0]
+
+ if isinstance(index_params, tuple) and len(index_params) == 2:
+ index_params = tuple([slice(*index_params)])
+
+ index_params = (
+ _convert_tuples_to_zip(index_params)
+ if _is_valid_type_tuples(index_params)
+ else index_params
+ )
+ index_params = _prepare_a_tuple(index_params)
- index_type.name = name
- if isinstance(tpe, ExtensionDtype):
- index_type.tpe = tpe
+ if _is_valid_slices(index_params):
+ # Example:
+ # DataFrame[["id": int, "A": int], [int, int]]
+ new_index_params = _convert_slices_to_holders(index_params, is_index=True)
+ index_types = tuple(new_index_params)
else:
- index_type.tpe = tpe.type if isinstance(tpe, np.dtype) else tpe
+ # Examples:
+ # DataFrame[[float, float], [int, int]]
+ # DataFrame[pdf.dtypes, [int, int]]
+ index_types = _handle_list_of_types(index_params, origin, is_index=True)
data_types = params[1]
- if (
- isinstance(data_types, list)
- and len(data_types) >= 1
- and isinstance(data_types[0], tuple)
- ): # type: ignore
- # Example:
- # DataFrame[("index", int), [("id", int), ("A", int)]]
- data_types = zip((name for name, _ in data_types), (tpe for _, tpe in data_types))
- return (index_type,) + extract_types(data_types)
- elif all(not isinstance(param, slice) and not isinstance(param, Iterable) for param in params):
+ data_types = (
+ _convert_tuples_to_zip(data_types) if _is_valid_type_tuples(data_types) else data_types
+ )
+
+ return index_types + _extract_types(data_types)
+
+ else:
# Examples:
# DataFrame[float, float]
# DataFrame[pdf.dtypes]
+ return _handle_list_of_types(params, origin, is_index=False)
+
+
+def _is_valid_slices(params: Any) -> Any:
+ return all(
+ isinstance(param, slice) and param.step is None and param.stop is not None
+ for param in params
+ )
+
+
+def _convert_slices_to_holders(params: Any, is_index: bool) -> Any:
+ # Example:
+ # params = (slice("id", int, None), slice("A", int, None))
+ new_params = []
+ for param in params:
+ new_param = _get_holder(is_index)
+ new_param.name = param.start
+ # When the given argument is a numpy's dtype instance.
+ new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
+ new_params.append(new_param)
+ return new_params
+
+
+def _prepare_a_tuple(params: Any) -> Any:
Review comment:
Shall we add some docstring? what's input and output?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925590212
**[Test build #143545 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143545/testReport)** for PR 34058 at commit [`b7dbb9c`](https://github.com/apache/spark/commit/b7dbb9cd75d8d9bdb146e88b7861eae7cfcf26ab).
* This patch **fails Python style tests**.
* This patch merges cleanly.
* This patch adds no public classes.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r715274372
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,145 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
- return Tuple[extract_types(params)]
+ return Tuple[_extract_types(params)]
-# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
-def extract_types(params: Any) -> Tuple:
+def _extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
- index_param = params[0]
- index_type = type(
- "IndexNameType", (IndexNameTypeHolder,), {}
- ) # type: Type[IndexNameTypeHolder]
- if isinstance(index_param, tuple):
- if len(index_param) != 2:
- raise TypeError(
- "Type hints for index should be specified as "
- "DataFrame[('name', type), ...]; however, got %s" % index_param
- )
- name, tpe = index_param
- else:
- name, tpe = None, index_param
+ index_params = params[0]
+
+ if isinstance(index_params, tuple) and len(index_params) == 2:
+ index_params = tuple([slice(*index_params)])
+
+ index_params = (
+ _convert_tuples_to_zip(index_params)
+ if _is_valid_type_tuples(index_params)
+ else index_params
+ )
+ index_params = _prepare_a_tuple(index_params)
- index_type.name = name
- if isinstance(tpe, ExtensionDtype):
- index_type.tpe = tpe
+ if _is_valid_slices(index_params):
+ # Example:
+ # DataFrame[["id": int, "A": int], [int, int]]
Review comment:
```
DataFrame[["id": int, "A": int], [int, int]]
```
do you mean:
```
DataFrame[[("id", int), ("A", int)], [int, int]]
```
? `DataFrame[["id": int, "A": int]...]` is an invalid syntax in Python.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] dgd-contributor commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
dgd-contributor commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r715284917
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,145 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
- return Tuple[extract_types(params)]
+ return Tuple[_extract_types(params)]
-# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
-def extract_types(params: Any) -> Tuple:
+def _extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
- index_param = params[0]
- index_type = type(
- "IndexNameType", (IndexNameTypeHolder,), {}
- ) # type: Type[IndexNameTypeHolder]
- if isinstance(index_param, tuple):
- if len(index_param) != 2:
- raise TypeError(
- "Type hints for index should be specified as "
- "DataFrame[('name', type), ...]; however, got %s" % index_param
- )
- name, tpe = index_param
- else:
- name, tpe = None, index_param
+ index_params = params[0]
+
+ if isinstance(index_params, tuple) and len(index_params) == 2:
+ index_params = tuple([slice(*index_params)])
+
+ index_params = (
+ _convert_tuples_to_zip(index_params)
+ if _is_valid_type_tuples(index_params)
+ else index_params
+ )
+ index_params = _prepare_a_tuple(index_params)
- index_type.name = name
- if isinstance(tpe, ExtensionDtype):
- index_type.tpe = tpe
+ if _is_valid_slices(index_params):
+ # Example:
+ # DataFrame[["id": int, "A": int], [int, int]]
+ new_index_params = _convert_slices_to_holders(index_params, is_index=True)
+ index_types = tuple(new_index_params)
else:
- index_type.tpe = tpe.type if isinstance(tpe, np.dtype) else tpe
+ # Examples:
+ # DataFrame[[float, float], [int, int]]
+ # DataFrame[pdf.dtypes, [int, int]]
+ index_types = _handle_list_of_types(index_params, origin, is_index=True)
data_types = params[1]
- if (
- isinstance(data_types, list)
- and len(data_types) >= 1
- and isinstance(data_types[0], tuple)
- ): # type: ignore
- # Example:
- # DataFrame[("index", int), [("id", int), ("A", int)]]
- data_types = zip((name for name, _ in data_types), (tpe for _, tpe in data_types))
- return (index_type,) + extract_types(data_types)
- elif all(not isinstance(param, slice) and not isinstance(param, Iterable) for param in params):
+ data_types = (
+ _convert_tuples_to_zip(data_types) if _is_valid_type_tuples(data_types) else data_types
+ )
+
+ return index_types + _extract_types(data_types)
+
+ else:
# Examples:
# DataFrame[float, float]
# DataFrame[pdf.dtypes]
+ return _handle_list_of_types(params, origin, is_index=False)
+
+
+def _is_valid_slices(params: Any) -> Any:
Review comment:
Thanks for your review!
I do not understand why we have ```name``` here?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r715277100
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -690,98 +696,145 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
- return Tuple[extract_types(params)]
+ return Tuple[_extract_types(params)]
-# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
-def extract_types(params: Any) -> Tuple:
+def _extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
- index_param = params[0]
- index_type = type(
- "IndexNameType", (IndexNameTypeHolder,), {}
- ) # type: Type[IndexNameTypeHolder]
- if isinstance(index_param, tuple):
- if len(index_param) != 2:
- raise TypeError(
- "Type hints for index should be specified as "
- "DataFrame[('name', type), ...]; however, got %s" % index_param
- )
- name, tpe = index_param
- else:
- name, tpe = None, index_param
+ index_params = params[0]
+
+ if isinstance(index_params, tuple) and len(index_params) == 2:
+ index_params = tuple([slice(*index_params)])
+
+ index_params = (
+ _convert_tuples_to_zip(index_params)
+ if _is_valid_type_tuples(index_params)
+ else index_params
+ )
+ index_params = _prepare_a_tuple(index_params)
- index_type.name = name
- if isinstance(tpe, ExtensionDtype):
- index_type.tpe = tpe
+ if _is_valid_slices(index_params):
+ # Example:
+ # DataFrame[["id": int, "A": int], [int, int]]
+ new_index_params = _convert_slices_to_holders(index_params, is_index=True)
+ index_types = tuple(new_index_params)
else:
- index_type.tpe = tpe.type if isinstance(tpe, np.dtype) else tpe
+ # Examples:
+ # DataFrame[[float, float], [int, int]]
+ # DataFrame[pdf.dtypes, [int, int]]
+ index_types = _handle_list_of_types(index_params, origin, is_index=True)
data_types = params[1]
- if (
- isinstance(data_types, list)
- and len(data_types) >= 1
- and isinstance(data_types[0], tuple)
- ): # type: ignore
- # Example:
- # DataFrame[("index", int), [("id", int), ("A", int)]]
- data_types = zip((name for name, _ in data_types), (tpe for _, tpe in data_types))
- return (index_type,) + extract_types(data_types)
- elif all(not isinstance(param, slice) and not isinstance(param, Iterable) for param in params):
+ data_types = (
+ _convert_tuples_to_zip(data_types) if _is_valid_type_tuples(data_types) else data_types
+ )
+
+ return index_types + _extract_types(data_types)
+
+ else:
# Examples:
# DataFrame[float, float]
# DataFrame[pdf.dtypes]
+ return _handle_list_of_types(params, origin, is_index=False)
+
+
+def _is_valid_slices(params: Any) -> Any:
+ return all(
+ isinstance(param, slice) and param.step is None and param.stop is not None
+ for param in params
+ )
+
+
+def _convert_slices_to_holders(params: Any, is_index: bool) -> Any:
+ # Example:
+ # params = (slice("id", int, None), slice("A", int, None))
+ new_params = []
+ for param in params:
+ new_param = _get_holder(is_index)
Review comment:
`ExtensionDtype` handling seems missing here?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926397312
**[Test build #143594 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143594/testReport)** for PR 34058 at commit [`0eb2dea`](https://github.com/apache/spark/commit/0eb2deae8b5ef487b7d92ebdf0c6bcfe2d6a2e3b).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926726443
Kubernetes integration test status failure
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/48115/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-926726480
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder-K8s/48115/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins removed a comment on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925590237
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/143545/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925950995
Refer to this link for build results (access rights to CI server needed):
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/143553/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r714421963
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -673,98 +673,146 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
return Tuple[extract_types(params)]
# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
def extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
Review comment:
I think this shouldn't be wrapped w/ `zip`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #34058:
URL: https://github.com/apache/spark/pull/34058#issuecomment-925674904
**[Test build #143547 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/143547/testReport)** for PR 34058 at commit [`cffdd3e`](https://github.com/apache/spark/commit/cffdd3e88de1cf2de1a799f76999f31890d1bbd0).
* This patch **fails Python style tests**.
* This patch merges cleanly.
* This patch adds no public classes.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a change in pull request #34058: [SPARK-36711][PYTHON] Support multi-index in new syntax
Posted by GitBox <gi...@apache.org>.
HyukjinKwon commented on a change in pull request #34058:
URL: https://github.com/apache/spark/pull/34058#discussion_r714452391
##########
File path: python/pyspark/pandas/typedef/typehints.py
##########
@@ -673,98 +673,146 @@ def create_tuple_for_frame_type(params: Any) -> object:
Typing data columns with an index:
>>> ps.DataFrame[int, [int, int]] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, int, int]
+ typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[pdf.index.dtype, pdf.dtypes] # doctest: +ELLIPSIS
- typing.Tuple[...IndexNameType, numpy.int64]
+ typing.Tuple[...IndexNameType, ...NameType]
>>> ps.DataFrame[("index", int), [("id", int), ("A", int)]] # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType, ...NameType]
>>> ps.DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
... # doctest: +ELLIPSIS
typing.Tuple[...IndexNameType, ...NameType]
+
+ Typing data columns with a Multi-index:
+ >>> arrays = [[1, 1, 2], ['red', 'blue', 'red']]
+ >>> idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ >>> pdf = pd.DataFrame({'a': range(3)}, index=idx)
+ >>> ps.DataFrame[[int, int], [int, int]] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[pdf.index.dtypes, pdf.dtypes] # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
+ >>> ps.DataFrame[[("index-1", int), ("index-2", int)], [("id", int), ("A", int)]]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...IndexNameType, ...NameType, ...NameType]
+ >>> ps.DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
+ ... # doctest: +ELLIPSIS
+ typing.Tuple[...IndexNameType, ...NameType]
"""
return Tuple[extract_types(params)]
# TODO(SPARK-36708): numpy.typing (numpy 1.21+) support for nested types.
def extract_types(params: Any) -> Tuple:
origin = params
- if isinstance(params, zip): # type: ignore
- # Example:
- # DataFrame[zip(pdf.columns, pdf.dtypes)]
- params = tuple(slice(name, tpe) for name, tpe in params) # type: ignore
- if isinstance(params, Iterable):
- params = tuple(params)
- else:
- params = (params,)
+ params = _prepare_a_tuple(params)
- if all(
- isinstance(param, slice)
- and param.start is not None
- and param.step is None
- and param.stop is not None
- for param in params
- ):
+ if _is_valid_slices(params):
# Example:
# DataFrame["id": int, "A": int]
- new_params = []
- for param in params:
- new_param = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder]
- new_param.name = param.start
- # When the given argument is a numpy's dtype instance.
- new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
- new_params.append(new_param)
-
+ new_params = _convert_slices_to_holders(params, is_index=False)
return tuple(new_params)
elif len(params) == 2 and isinstance(params[1], (zip, list, pd.Series)):
# Example:
# DataFrame[int, [int, int]]
# DataFrame[pdf.index.dtype, pdf.dtypes]
# DataFrame[("index", int), [("id", int), ("A", int)]]
# DataFrame[(pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]
+ #
+ # DataFrame[[int, int], [int, int]]
+ # DataFrame[pdf.index.dtypes, pdf.dtypes]
+ # DataFrame[[("index", int), ("index-2", int)], [("id", int), ("A", int)]]
+ # DataFrame[zip(pdf.index.names, pdf.index.dtypes), zip(pdf.columns, pdf.dtypes)]
Review comment:
okie, sounds good to me.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org