You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@superset.apache.org by GitBox <gi...@apache.org> on 2020/04/28 14:01:51 UTC

[GitHub] [incubator-superset] dpgaspar commented on a change in pull request #9661: feat: Add geospatial post processing operations

dpgaspar commented on a change in pull request #9661:
URL: https://github.com/apache/incubator-superset/pull/9661#discussion_r416621439



##########
File path: superset/charts/schemas.py
##########
@@ -265,15 +265,23 @@ class ChartDataSelectOptionsSchema(ChartDataPostProcessingOperationOptionsSchema
     columns = fields.List(
         fields.String(),
         description="Columns which to select from the input data, in the desired "
-        "order. If columns are renamed, the old column name should be "
+        "order. If columns are renamed, the original column name should be "
         "referenced here.",
         example=["country", "gender", "age"],
+        required=False,
+    )
+    drop = fields.List(

Review comment:
       Just an opinion, but I personally prefer `exclude`. If `columns` is not provided, will it select all columns?

##########
File path: superset/utils/pandas_postprocessing.py
##########
@@ -388,3 +398,92 @@ def cum(df: DataFrame, columns: Dict[str, str], operator: str) -> DataFrame:
             _("Invalid cumulative operator: %(operator)s", operator=operator)
         )
     return _append_columns(df, getattr(df_cum, operation)(), columns)
+
+
+def geohash_decode(
+    df: DataFrame, geohash: str, longitude: str, latitude: str
+) -> DataFrame:
+    """
+    Decode a geohash column into longitude and latitude
+
+    :param df: DataFrame containing geohash data
+    :param geohash: Name of source column containing geohash location.
+    :param longitude: Name of new column to be created containing longitude.
+    :param latitude: Name of new column to be created containing latitude.
+    :return: DataFrame with decoded longitudes and latitudes
+    """
+    try:
+        lonlat_df = DataFrame()
+        lonlat_df["latitude"], lonlat_df["longitude"] = zip(
+            *df[geohash].apply(geohash_lib.decode)
+        )
+        return _append_columns(
+            df, lonlat_df, {"latitude": latitude, "longitude": longitude}
+        )
+    except ValueError:
+        raise QueryObjectValidationError(_("Invalid geohash string"))
+
+
+def geohash_encode(
+    df: DataFrame, geohash: str, longitude: str, latitude: str,
+) -> DataFrame:
+    """
+    Encode longitude and latitude into geohash
+
+    :param df: DataFrame containing longitude and latitude data
+    :param geohash: Name of new column to be created containing geohash location.
+    :param longitude: Name of source column containing longitude.
+    :param latitude: Name of source column containing latitude.
+    :return: DataFrame with decoded longitudes and latitudes
+    """
+    try:
+        encode_df = df[[latitude, longitude]]
+        encode_df.columns = ["latitude", "longitude"]
+        encode_df["geohash"] = encode_df.apply(
+            lambda row: geohash_lib.encode(row["latitude"], row["longitude"]), axis=1,
+        )
+        return _append_columns(df, encode_df, {"geohash": geohash})
+    except ValueError:
+        QueryObjectValidationError(_("Invalid longitude/latitude"))
+
+
+def geodetic_parse(
+    df: DataFrame,
+    geodetic: str,
+    longitude: str,
+    latitude: str,
+    altitude: Optional[str] = None,
+) -> DataFrame:
+    """
+    Parse a column containing a geodetic point string
+    [Geopy](https://geopy.readthedocs.io/en/stable/#geopy.point.Point).
+
+    :param df: DataFrame containing geodetic point data
+    :param geodetic: Name of source column containing geodetic point string.
+    :param longitude: Name of new column to be created containing longitude.
+    :param latitude: Name of new column to be created containing latitude.
+    :param altitude: Name of new column to be created containing altitude.
+    :return: DataFrame with decoded longitudes and latitudes
+    """
+
+    def _parse_location(location: str) -> Tuple[float, float, float]:
+        """
+        Parse a string containing a geodetic point and return latitude, longitude
+        and altitude
+        """
+        point = Point(location)  # type: ignore
+        return point[0], point[1], point[2]
+
+    try:
+        geodetic_df = DataFrame()
+        (
+            geodetic_df["latitude"],
+            geodetic_df["longitude"],
+            geodetic_df["altitude"],
+        ) = zip(*df[geodetic].apply(_parse_location))

Review comment:
       nice!

##########
File path: superset/utils/pandas_postprocessing.py
##########
@@ -325,23 +324,32 @@ def rolling(  # pylint: disable=too-many-arguments
 
 @validate_column_args("columns", "rename")
 def select(
-    df: DataFrame, columns: List[str], rename: Optional[Dict[str, str]] = None
+    df: DataFrame,
+    columns: Optional[List[str]] = None,
+    drop: Optional[List[str]] = None,

Review comment:
       Can `drop` be added to `@validate_column_args`? The columns must exist in the dataframe, right?

##########
File path: tests/pandas_postprocessing_tests.py
##########
@@ -288,3 +316,83 @@ def test_cum(self):
             columns={"y": "y"},
             operator="abc",
         )
+
+    def test_geohash_decode(self):
+        # decode lon/lat from geohash
+        post_df = proc.geohash_decode(
+            df=lonlat_df[["city", "geohash"]],
+            geohash="geohash",
+            latitude="latitude",
+            longitude="longitude",
+        )
+        self.assertListEqual(
+            sorted(post_df.columns.tolist()),
+            sorted(["city", "geohash", "latitude", "longitude"]),
+        )
+        self.assertListEqual(
+            round_floats(series_to_list(post_df["longitude"]), 6),
+            round_floats(series_to_list(lonlat_df["longitude"]), 6),
+        )
+        self.assertListEqual(
+            round_floats(series_to_list(post_df["latitude"]), 6),
+            round_floats(series_to_list(lonlat_df["latitude"]), 6),
+        )
+
+    def test_geohash_encode(self):
+        # encode lon/lat into geohash
+        post_df = proc.geohash_encode(
+            df=lonlat_df[["city", "latitude", "longitude"]],
+            latitude="latitude",
+            longitude="longitude",
+            geohash="geohash",
+        )
+        self.assertListEqual(
+            sorted(post_df.columns.tolist()),
+            sorted(["city", "geohash", "latitude", "longitude"]),
+        )
+        self.assertListEqual(
+            series_to_list(post_df["geohash"]), series_to_list(lonlat_df["geohash"]),
+        )
+
+    def geodetic_parse(self):

Review comment:
       Rename to `def test_geodetic_parse`; without the `test_` prefix the test runner will not collect this method.

##########
File path: superset/charts/schemas.py
##########
@@ -335,12 +343,81 @@ class ChartDataPivotOptionsSchema(ChartDataPostProcessingOperationOptionsSchema)
     aggregates = ChartDataAggregateConfigField()
 
 
+class ChartDataGeohashDecodeOptionsSchema(
+    ChartDataPostProcessingOperationOptionsSchema
+):
+    """
+    Geohash decode operation config.
+    """
+
+    geohash = fields.String(
+        description="Name of source column containing geohash string", required=True,
+    )
+    latitude = fields.String(
+        description="Name of target column for decoded latitude", required=True,
+    )
+    longitude = fields.String(
+        description="Name of target column for decoded longitude", required=True,
+    )
+
+
+class ChartDataGeohashEncodeOptionsSchema(
+    ChartDataPostProcessingOperationOptionsSchema
+):
+    """
+    Geohash encode operation config.
+    """
+
+    latitude = fields.String(
+        description="Name of source latitude column", required=True,
+    )
+    longitude = fields.String(
+        description="Name of source longitude column", required=True,
+    )
+    geohash = fields.String(
+        description="Name of target column for encoded geohash string", required=True,
+    )
+
+
+class ChartDataGeodeticParseOptionsSchema(
+    ChartDataPostProcessingOperationOptionsSchema
+):
+    """
+    Geodetic point string parsing operation config.
+    """
+
+    geodetic = fields.String(
+        description="Name of source column containing geodetic point strings",
+        required=True,
+    )
+    latitude = fields.String(
+        description="Name of target column for decoded latitude", required=True,
+    )
+    longitude = fields.String(
+        description="Name of target column for decoded longitude", required=True,
+    )
+    altitude = fields.String(
+        description="Name of target column for decoded longitude. If omitted, "

Review comment:
       `Name of target column for decoded altitude`




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: notifications-unsubscribe@superset.apache.org
For additional commands, e-mail: notifications-help@superset.apache.org