You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@iceberg.apache.org by GitBox <gi...@apache.org> on 2022/12/01 19:27:14 UTC

[GitHub] [iceberg] szehon-ho commented on a diff in pull request #5376: Core: Add readable metrics columns to files metadata tables

szehon-ho commented on code in PR #5376:
URL: https://github.com/apache/iceberg/pull/5376#discussion_r1037486854


##########
core/src/main/java/org/apache/iceberg/MetricsUtil.java:
##########
@@ -56,4 +72,270 @@ public static MetricsModes.MetricsMode metricsMode(
     String columnName = inputSchema.findColumnName(fieldId);
     return metricsConfig.columnMode(columnName);
   }
+
+  /**
+   * Definitions of the readable per-column metrics: a name, a type function and a doc string for
+   * each metric.
+   *
+   * <p>The order of this list matters: {@link ReadableColMetricsStruct} maps projected positions
+   * to indexes in this list and returns its values by those indexes. The four size/count metrics
+   * are always typed as long; the two bounds take their type from the field that is passed to
+   * {@code ReadableMetricCol.type(field)}.
+   */
+  public static final List<ReadableMetricCol> READABLE_COL_METRICS =
+      ImmutableList.of(
+          new ReadableMetricCol("column_size", f -> Types.LongType.get(), "Total size on disk"),
+          new ReadableMetricCol(
+              "value_count", f -> Types.LongType.get(), "Total count, including null and NaN"),
+          new ReadableMetricCol("null_value_count", f -> Types.LongType.get(), "Null value count"),
+          new ReadableMetricCol("nan_value_count", f -> Types.LongType.get(), "NaN value count"),
+          new ReadableMetricCol("lower_bound", Types.NestedField::type, "Lower bound"),
+          new ReadableMetricCol("upper_bound", Types.NestedField::type, "Upper bound"));
+
+  /**
+   * Name constant for readable metrics. NOTE(review): presumably the name of the struct column
+   * that is added to metadata tables; confirm against the callers.
+   */
+  public static final String READABLE_METRICS = "readable_metrics";
+
+  /**
+   * Definition of a single readable metric: its name, its doc string, and a function that derives
+   * the metric's type from a field.
+   */
+  public static class ReadableMetricCol {
+    private final String name;
+    private final String doc;
+    private final Function<Types.NestedField, Type> typeFunction;
+
+    ReadableMetricCol(String name, Function<Types.NestedField, Type> typeFunction, String doc) {
+      this.name = name;
+      this.doc = doc;
+      this.typeFunction = typeFunction;
+    }
+
+    /** Returns the name of this metric. */
+    String name() {
+      return this.name;
+    }
+
+    /** Returns the doc string of this metric. */
+    String doc() {
+      return this.doc;
+    }
+
+    /** Returns the type of this metric for the given field, as computed by the type function. */
+    Type type(Types.NestedField field) {
+      return this.typeFunction.apply(field);
+    }
+  }
+
+  /**
+   * Represents a struct of metrics for a primitive column
+   *
+   * @param <T> primitive column type
+   */
+  public static class ReadableColMetricsStruct<T> implements StructLike {
+
+    private final String columnName;
+    private final Long columnSize;
+    private final Long valueCount;
+    private final Long nullValueCount;
+    private final Long nanValueCount;
+    private final T lowerBound;
+    private final T upperBound;
+    private final Map<Integer, Integer> projectionMap;
+
+    public ReadableColMetricsStruct(
+        String columnName,
+        Long columnSize,
+        Long valueCount,
+        Long nullValueCount,
+        Long nanValueCount,
+        T lowerBound,
+        T upperBound,
+        Types.NestedField projection) {
+      this.columnName = columnName;
+      this.columnSize = columnSize;
+      this.valueCount = valueCount;
+      this.nullValueCount = nullValueCount;
+      this.nanValueCount = nanValueCount;
+      this.lowerBound = lowerBound;
+      this.upperBound = upperBound;
+      this.projectionMap = readableMetricsProjection(projection);
+    }
+
+    @Override
+    public int size() {
+      return projectionMap.size();
+    }
+
+    @Override
+    public <T> T get(int pos, Class<T> javaClass) {
+      Object value = get(pos);
+      return value == null ? null : javaClass.cast(value);
+    }
+
+    @Override
+    public <T> void set(int pos, T value) {
+      throw new UnsupportedOperationException("ReadableMetricsStruct is read only");
+    }
+
+    private Object get(int pos) {
+      int projectedPos = projectionMap.get(pos);
+      switch (projectedPos) {
+        case 0:
+          return columnSize;
+        case 1:
+          return valueCount;
+        case 2:
+          return nullValueCount;
+        case 3:
+          return nanValueCount;
+        case 4:
+          return lowerBound;
+        case 5:
+          return upperBound;
+        default:
+          throw new IllegalArgumentException(
+              String.format("Invalid projected pos %d", projectedPos));
+      }
+    }
+
+    /** @return map of projected position to actual position of this struct's fields */
+    private Map<Integer, Integer> readableMetricsProjection(Types.NestedField projection) {
+      Map<Integer, Integer> result = Maps.newHashMap();
+
+      Set<String> projectedFields =
+          Sets.newHashSet(
+              projection.type().asStructType().fields().stream()
+                  .map(Types.NestedField::name)
+                  .collect(Collectors.toSet()));
+
+      int projectedIndex = 0;
+      for (int fieldIndex = 0; fieldIndex < READABLE_COL_METRICS.size(); fieldIndex++) {
+        ReadableMetricCol readableMetric = READABLE_COL_METRICS.get(fieldIndex);
+
+        if (projectedFields.contains(readableMetric.name())) {
+          result.put(projectedIndex, fieldIndex);
+          projectedIndex++;
+        }
+      }
+      return result;
+    }
+
+    String columnName() {
+      return columnName;
+    }
+  }
+
+  /**
+   * Represents a struct, consisting of all {@link ReadableColMetricsStruct} for all primitive
+   * columns of the table
+   */
+  public static class ReadableMetricsStruct implements StructLike {
+
+    private final List<StructLike> columnMetrics;
+
+    public ReadableMetricsStruct(List<StructLike> columnMetrics) {
+      this.columnMetrics = columnMetrics;
+    }
+
+    @Override
+    public int size() {
+      return columnMetrics.size();
+    }
+
+    @Override
+    public <T> T get(int pos, Class<T> javaClass) {
+      return javaClass.cast(columnMetrics.get(pos));
+    }
+
+    @Override
+    public <T> void set(int pos, T value) {
+      throw new UnsupportedOperationException("ReadableMetricsStruct is read only");
+    }
+  }
+
+  /**
+   * Calculates a dynamic schema for readable_metrics to add to metadata tables. The type will be
+   * the struct {@link ReadableMetricsStruct}, composed of {@link ReadableColMetricsStruct} for all
+   * primitive columns in the data table
+   *
+   * @param dataTableSchema schema of data table
+   * @param metadataTableSchema schema of existing metadata table (to ensure id uniqueness)
+   * @param baseId first id to assign. This algorithm assigns field ids by incrementing this value
+   *     and avoiding conflict with existing metadata table schema
+   * @return schema of readable_metrics struct
+   */
+  public static Schema readableMetricsSchema(
+      Schema dataTableSchema, Schema metadataTableSchema, int baseId) {
+    List<Types.NestedField> fields = Lists.newArrayList();
+    Set<Integer> usedIds = metadataTableSchema.idToName().keySet();
+
+    class NextFieldId {
+      private int next;
+
+      NextFieldId() {
+        this.next = baseId;
+      }
+
+      int next() {
+        do {
+          next++;
+        } while (usedIds.contains(next));
+        return next;
+      }
+    }
+    NextFieldId next = new NextFieldId();
+
+    Map<Integer, String> idToName = dataTableSchema.idToName();
+    for (int id : idToName.keySet()) {

Review Comment:
   Hm, as far as I can see, `schema.columns()` returns only the top-level columns, whereas `Schema.idToName()` returns the ids of all columns, including nested ones.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org
For additional commands, e-mail: issues-help@iceberg.apache.org