You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2022/06/03 22:47:12 UTC
[GitHub] [hudi] alexeykudinkin commented on a diff in pull request #5733: [HUDI-4176] Fixing `TableSchemaResolver` to avoid repeated `HoodieCommitMetadata` parsing

alexeykudinkin commented on code in PR #5733:
URL: https://github.com/apache/hudi/pull/5733#discussion_r889400793


##########
hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java:
##########
@@ -58,100 +60,46 @@
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.schema.MessageType;
 
+import javax.annotation.concurrent.ThreadSafe;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
+import java.util.concurrent.ConcurrentHashMap;
 
 import static org.apache.hudi.avro.AvroSchemaUtils.appendFieldsToSchema;
+import static org.apache.hudi.avro.AvroSchemaUtils.containsFieldInSchema;
 import static org.apache.hudi.avro.AvroSchemaUtils.createNullableSchema;
 
 /**
  * Helper class to read schema from data files and log files and to convert it between different formats.
- *
- * TODO(HUDI-3626) cleanup
  */
+@ThreadSafe
 public class TableSchemaResolver {
 
   private static final Logger LOG = LogManager.getLogger(TableSchemaResolver.class);
-  private final HoodieTableMetaClient metaClient;
-  private final boolean hasOperationField;
 
-  public TableSchemaResolver(HoodieTableMetaClient metaClient) {
-    this.metaClient = metaClient;
-    this.hasOperationField = hasOperationField();
-  }
+  private final HoodieTableMetaClient metaClient;
 
   /**
-   * Gets the schema for a hoodie table. Depending on the type of table, read from any file written in the latest
-   * commit. We will assume that the schema has not changed within a single atomic write.
+   * NOTE: {@link HoodieCommitMetadata} could be of non-trivial size for large tables (in 100s of Mbs)
+   *       and therefore we'd want to limit amount of throw-away work being performed while fetching
+   *       commits' metadata
    *
-   * @return Parquet schema for this table
+   *       Please check out corresponding methods to fetch commonly used instances of {@link HoodieCommitMetadata}:
+   *       {@link #getLatestCommitMetadataWithValidSchema()},
+   *       {@link #getLatestCommitMetadataWithValidSchema()},
+   *       {@link #getCachedCommitMetadata(HoodieInstant)}
    */
-  private MessageType getTableParquetSchemaFromDataFile() {
-    HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
-    Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata =
-        activeTimeline.getLastCommitMetadataWithValidData();
-    try {
-      switch (metaClient.getTableType()) {
-        case COPY_ON_WRITE:
-          // For COW table, the file has data written must be in parquet or orc format currently.
-          if (instantAndCommitMetadata.isPresent()) {
-            HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
-            Iterator<String> filePaths = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().iterator();
-            return fetchSchemaFromFiles(filePaths);
-          } else {
-            throw new IllegalArgumentException("Could not find any data file written for commit, "
-                + "so could not get schema for table " + metaClient.getBasePath());
-          }
-        case MERGE_ON_READ:
-          // For MOR table, the file has data written may be a parquet file, .log file, orc file or hfile.
-          // Determine the file format based on the file name, and then extract schema from it.
-          if (instantAndCommitMetadata.isPresent()) {
-            HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
-            Iterator<String> filePaths = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().iterator();
-            return fetchSchemaFromFiles(filePaths);
-          } else {
-            throw new IllegalArgumentException("Could not find any data file written for commit, "
-                + "so could not get schema for table " + metaClient.getBasePath());
-          }
-        default:
-          LOG.error("Unknown table type " + metaClient.getTableType());
-          throw new InvalidTableException(metaClient.getBasePath());
-      }
-    } catch (IOException e) {
-      throw new HoodieException("Failed to read data schema", e);
-    }
-  }
+  private final Lazy<ConcurrentHashMap<HoodieInstant, HoodieCommitMetadata>> commitMetadataCache;

Review Comment:
   Discussed offline: `TableSchemaResolver` is a short-lived object not meant to be refreshed -- to get latest schema you will have to create another instance with the refreshed `HoodieMetaClient` instance



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org