You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@beam.apache.org by "ASF GitHub Bot (Jira)" <ji...@apache.org> on 2020/08/10 18:18:00 UTC

[jira] [Work logged] (BEAM-9179) Refactor Beam ZetaSQL type translation code

     [ https://issues.apache.org/jira/browse/BEAM-9179?focusedWorklogId=468788&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-468788 ]

ASF GitHub Bot logged work on BEAM-9179:
----------------------------------------

                Author: ASF GitHub Bot
            Created on: 10/Aug/20 18:17
            Start Date: 10/Aug/20 18:17
    Worklog Time Spent: 10m 
      Work Description: apilloud commented on a change in pull request #12507:
URL: https://github.com/apache/beam/pull/12507#discussion_r468087411



##########
File path: sdks/java/extensions/sql/zetasql/src/main/java/org/apache/beam/sdk/extensions/sql/zetasql/ZetaSqlCalciteTranslationUtils.java
##########
@@ -17,174 +17,333 @@
  */
 package org.apache.beam.sdk.extensions.sql.zetasql;
 
-import static com.google.zetasql.ZetaSQLType.TypeKind.TYPE_BOOL;
-import static com.google.zetasql.ZetaSQLType.TypeKind.TYPE_BYTES;
-import static com.google.zetasql.ZetaSQLType.TypeKind.TYPE_DATE;
-import static com.google.zetasql.ZetaSQLType.TypeKind.TYPE_DATETIME;
-import static com.google.zetasql.ZetaSQLType.TypeKind.TYPE_DOUBLE;
-import static com.google.zetasql.ZetaSQLType.TypeKind.TYPE_INT64;
-import static com.google.zetasql.ZetaSQLType.TypeKind.TYPE_NUMERIC;
-import static com.google.zetasql.ZetaSQLType.TypeKind.TYPE_STRING;
-import static com.google.zetasql.ZetaSQLType.TypeKind.TYPE_TIME;
-import static com.google.zetasql.ZetaSQLType.TypeKind.TYPE_TIMESTAMP;
-import static java.util.stream.Collectors.toList;
-
-import com.google.zetasql.ArrayType;
+import com.google.zetasql.CivilTimeEncoder;
 import com.google.zetasql.StructType;
 import com.google.zetasql.StructType.StructField;
 import com.google.zetasql.Type;
 import com.google.zetasql.TypeFactory;
+import com.google.zetasql.Value;
 import com.google.zetasql.ZetaSQLType.TypeKind;
+import com.google.zetasql.functions.ZetaSQLDateTime.DateTimestampPart;
+import java.math.BigDecimal;
+import java.time.LocalDateTime;
+import java.time.LocalTime;
 import java.util.List;
-import java.util.function.Function;
+import java.util.stream.Collectors;
 import org.apache.beam.sdk.annotations.Internal;
+import org.apache.beam.sdk.extensions.sql.meta.provider.bigquery.BeamBigQuerySqlDialect;
+import org.apache.beam.sdk.extensions.sql.zetasql.translation.SqlOperators;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.avatica.util.ByteString;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.avatica.util.TimeUnit;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.avatica.util.TimeUnitRange;
 import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rel.type.RelDataType;
 import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexBuilder;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.rex.RexNode;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.fun.SqlStdOperatorTable;
 import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.type.SqlTypeName;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.util.DateString;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.util.TimeString;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.util.TimestampString;
 import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap;
 
 /**
  * Utility methods for ZetaSQL <=> Calcite translation.
  *
- * <p>Unsupported ZetaSQL types: INT32, UINT32, UINT64, FLOAT, ENUM, PROTO, GEOGRAPHY
+ * <p>Unsupported ZetaSQL types: INT32, UINT32, UINT64, FLOAT, ENUM (internal), PROTO, GEOGRAPHY
  */
 @Internal
 public final class ZetaSqlCalciteTranslationUtils {
 
   private ZetaSqlCalciteTranslationUtils() {}
 
+  // TODO[BEAM-9178]: support DateTimestampPart.WEEK and "WEEK with weekday"s
+  private static final ImmutableMap<Integer, TimeUnit> TIME_UNIT_CASTING_MAP =
+      ImmutableMap.<Integer, TimeUnit>builder()
+          .put(DateTimestampPart.YEAR.getNumber(), TimeUnit.YEAR)
+          .put(DateTimestampPart.MONTH.getNumber(), TimeUnit.MONTH)
+          .put(DateTimestampPart.DAY.getNumber(), TimeUnit.DAY)
+          .put(DateTimestampPart.DAYOFWEEK.getNumber(), TimeUnit.DOW)
+          .put(DateTimestampPart.DAYOFYEAR.getNumber(), TimeUnit.DOY)
+          .put(DateTimestampPart.QUARTER.getNumber(), TimeUnit.QUARTER)
+          .put(DateTimestampPart.HOUR.getNumber(), TimeUnit.HOUR)
+          .put(DateTimestampPart.MINUTE.getNumber(), TimeUnit.MINUTE)
+          .put(DateTimestampPart.SECOND.getNumber(), TimeUnit.SECOND)
+          .put(DateTimestampPart.MILLISECOND.getNumber(), TimeUnit.MILLISECOND)
+          .put(DateTimestampPart.MICROSECOND.getNumber(), TimeUnit.MICROSECOND)
+          .put(DateTimestampPart.ISOYEAR.getNumber(), TimeUnit.ISOYEAR)
+          .put(DateTimestampPart.ISOWEEK.getNumber(), TimeUnit.WEEK)
+          .build();
+
   // Type conversion: Calcite => ZetaSQL
-  public static Type toZetaType(RelDataType calciteType) {
+  public static Type toZetaSqlType(RelDataType calciteType) {
     switch (calciteType.getSqlTypeName()) {
       case BIGINT:
-        return TypeFactory.createSimpleType(TYPE_INT64);
+        return TypeFactory.createSimpleType(TypeKind.TYPE_INT64);
       case DOUBLE:
-        return TypeFactory.createSimpleType(TYPE_DOUBLE);
+        return TypeFactory.createSimpleType(TypeKind.TYPE_DOUBLE);
       case BOOLEAN:
-        return TypeFactory.createSimpleType(TYPE_BOOL);
+        return TypeFactory.createSimpleType(TypeKind.TYPE_BOOL);
       case VARCHAR:
-        return TypeFactory.createSimpleType(TYPE_STRING);
+        return TypeFactory.createSimpleType(TypeKind.TYPE_STRING);
       case VARBINARY:
-        return TypeFactory.createSimpleType(TYPE_BYTES);
+        return TypeFactory.createSimpleType(TypeKind.TYPE_BYTES);
       case DECIMAL:
-        return TypeFactory.createSimpleType(TYPE_NUMERIC);
+        return TypeFactory.createSimpleType(TypeKind.TYPE_NUMERIC);
       case DATE:
-        return TypeFactory.createSimpleType(TYPE_DATE);
+        return TypeFactory.createSimpleType(TypeKind.TYPE_DATE);
       case TIME:
-        return TypeFactory.createSimpleType(TYPE_TIME);
+        return TypeFactory.createSimpleType(TypeKind.TYPE_TIME);
       case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
-        return TypeFactory.createSimpleType(TYPE_DATETIME);
+        return TypeFactory.createSimpleType(TypeKind.TYPE_DATETIME);
       case TIMESTAMP:
-        return TypeFactory.createSimpleType(TYPE_TIMESTAMP);
+        return TypeFactory.createSimpleType(TypeKind.TYPE_TIMESTAMP);
       case ARRAY:
-        return TypeFactory.createArrayType(toZetaType(calciteType.getComponentType()));
+        return TypeFactory.createArrayType(toZetaSqlType(calciteType.getComponentType()));
       case ROW:
-        List<StructField> structFields =
+        return TypeFactory.createStructType(
             calciteType.getFieldList().stream()
-                .map(f -> new StructField(f.getName(), toZetaType(f.getType())))
-                .collect(toList());
-
-        return TypeFactory.createStructType(structFields);
+                .map(f -> new StructField(f.getName(), toZetaSqlType(f.getType())))
+                .collect(Collectors.toList()));
       default:
-        throw new UnsupportedOperationException("Unsupported RelDataType: " + calciteType);
+        throw new UnsupportedOperationException(
+            "Unknown Calcite type: " + calciteType.getSqlTypeName().getName());
     }
   }
 
   // Type conversion: ZetaSQL => Calcite
-  public static SqlTypeName toCalciteTypeName(TypeKind type) {
-    switch (type) {
+  public static RelDataType toCalciteType(Type type, boolean nullable, RexBuilder rexBuilder) {
+    RelDataType nonNullType;
+    switch (type.getKind()) {
       case TYPE_INT64:
-        return SqlTypeName.BIGINT;
+        nonNullType = rexBuilder.getTypeFactory().createSqlType(SqlTypeName.BIGINT);
+        break;
       case TYPE_DOUBLE:
-        return SqlTypeName.DOUBLE;
+        nonNullType = rexBuilder.getTypeFactory().createSqlType(SqlTypeName.DOUBLE);
+        break;
       case TYPE_BOOL:
-        return SqlTypeName.BOOLEAN;
+        nonNullType = rexBuilder.getTypeFactory().createSqlType(SqlTypeName.BOOLEAN);
+        break;
       case TYPE_STRING:
-        return SqlTypeName.VARCHAR;
+        nonNullType = rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR);
+        break;
       case TYPE_BYTES:
-        return SqlTypeName.VARBINARY;
+        nonNullType = rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARBINARY);
+        break;
       case TYPE_NUMERIC:
-        return SqlTypeName.DECIMAL;
+        nonNullType = rexBuilder.getTypeFactory().createSqlType(SqlTypeName.DECIMAL);
+        break;
       case TYPE_DATE:
-        return SqlTypeName.DATE;
+        nonNullType = rexBuilder.getTypeFactory().createSqlType(SqlTypeName.DATE);
+        break;
       case TYPE_TIME:
-        return SqlTypeName.TIME;
+        nonNullType = rexBuilder.getTypeFactory().createSqlType(SqlTypeName.TIME);
+        break;
       case TYPE_DATETIME:
-        return SqlTypeName.TIMESTAMP_WITH_LOCAL_TIME_ZONE;
+        nonNullType =
+            rexBuilder.getTypeFactory().createSqlType(SqlTypeName.TIMESTAMP_WITH_LOCAL_TIME_ZONE);
+        break;
       case TYPE_TIMESTAMP:
-        // TODO: handle timestamp with time zone.
-        return SqlTypeName.TIMESTAMP;
-        // TODO[BEAM-9179] Add conversion code for ARRAY and ROW types
+        nonNullType = rexBuilder.getTypeFactory().createSqlType(SqlTypeName.TIMESTAMP);
+        break;
+      case TYPE_ARRAY:
+        // TODO: Should element type has the same nullability as the array type?
+        nonNullType = toCalciteArrayType(type.asArray().getElementType(), nullable, rexBuilder);
+        break;
+      case TYPE_STRUCT:
+        // TODO: Should field type has the same nullability as the struct type?
+        nonNullType = toCalciteStructType(type.asStruct(), nullable, rexBuilder);
+        break;
       default:
-        throw new UnsupportedOperationException("Unknown ZetaSQL type: " + type.name());
+        throw new UnsupportedOperationException("Unknown ZetaSQL type: " + type.getKind().name());
     }
+    return rexBuilder.getTypeFactory().createTypeWithNullability(nonNullType, nullable);
   }
 
-  public static RelDataType toRelDataType(RexBuilder rexBuilder, Type type, boolean isNullable) {
-    if (type.getKind().equals(TypeKind.TYPE_ARRAY)) {
-      return toArrayRelDataType(rexBuilder, type.asArray(), isNullable);
-    } else if (type.getKind().equals(TypeKind.TYPE_STRUCT)) {
-      return toStructRelDataType(rexBuilder, type.asStruct(), isNullable);
-    } else {
-      // TODO: Check type's nullability?
-      return toSimpleRelDataType(type.getKind(), rexBuilder, isNullable);
-    }
+  private static RelDataType toCalciteArrayType(
+      Type elementType, boolean nullable, RexBuilder rexBuilder) {
+    return rexBuilder
+        .getTypeFactory()
+        // -1 cardinality means unlimited array size
+        .createArrayType(toCalciteType(elementType, nullable, rexBuilder), -1);
   }
 
-  public static RelDataType toArrayRelDataType(
-      RexBuilder rexBuilder, ArrayType arrayType, boolean isNullable) {
-    // -1 cardinality means unlimited array size.
-    // TODO: is unlimited array size right for general case?
-    // TODO: whether isNullable should be ArrayType's nullablity (not its element type's?)
-    return nullable(
-        rexBuilder,
-        rexBuilder
-            .getTypeFactory()
-            .createArrayType(toRelDataType(rexBuilder, arrayType.getElementType(), isNullable), -1),
-        isNullable);
+  private static RelDataType toCalciteStructType(
+      StructType structType, boolean nullable, RexBuilder rexBuilder) {
+    List<StructField> fields = structType.getFieldList();
+    List<String> fieldNames = getFieldNameList(fields);
+    List<RelDataType> fieldTypes =
+        fields.stream()
+            .map(f -> toCalciteType(f.getType(), nullable, rexBuilder))
+            .collect(Collectors.toList());
+    return rexBuilder.getTypeFactory().createStructType(fieldTypes, fieldNames);
   }
 
-  private static List<String> toNameList(List<StructField> fields) {
+  private static List<String> getFieldNameList(List<StructField> fields) {
     ImmutableList.Builder<String> b = ImmutableList.builder();
     for (int i = 0; i < fields.size(); i++) {
       String name = fields.get(i).getName();
       if ("".equals(name)) {
-        name = "$col" + i;
+        name = "$col" + i; // empty field name is not allowed, generate an index-based name for it

Review comment:
       I believe a single column with an empty name is allowed. ZetaSQL allows multiple columns without names.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Issue Time Tracking
-------------------

    Worklog Id:     (was: 468788)
    Time Spent: 7h 20m  (was: 7h 10m)

> Refactor Beam ZetaSQL type translation code
> -------------------------------------------
>
>                 Key: BEAM-9179
>                 URL: https://issues.apache.org/jira/browse/BEAM-9179
>             Project: Beam
>          Issue Type: Improvement
>          Components: dsl-sql-zetasql
>            Reporter: Robin Qiu
>            Assignee: Robin Qiu
>            Priority: P2
>          Time Spent: 7h 20m
>  Remaining Estimate: 0h
>
> Currently, how types work in Beam ZetaSQL is very confusing: code that does type conversion between Calcite/Beam/ZetaSQL type systems are everywhere and is likely to be not consistent. In addition, the recent migration to BeamZetaSqlCalcRel makes the situation even worse: it adds another layer for type conversion and makes a lot of code (for making ZetaSQL functions manually) obsolete. This makes debugging and supporting new features very hard. A few examples:
>  * Support new ZetaSQL types (e.g. DATE/TIME/DATETIME)
>  * Fix the long-standing precision & timezone issue of TIMESTMAP type
>  * Support BeamSQL query parameters
> Therefore, I propose we work on the following items to make the story of Beam ZetaSQL type clearer:
>  * Define a clear mapping between Calcite/Beam/ZetaSQL types and values
>  * Refactor the codebase to move all type conversion code to a single source of truth
>  * Add thorough testing for it
>  * Remove obsolete code that is added to make functions work before BeamZetaSqlCalcRel



--
This message was sent by Atlassian Jira
(v8.3.4#803005)