You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2022/04/01 04:37:14 UTC

[GitHub] [hudi] vinothchandar commented on a change in pull request #5179: [HUDI-3290] Different file formats for the partition metadata file.

vinothchandar commented on a change in pull request #5179:
URL: https://github.com/apache/hudi/pull/5179#discussion_r840218221



##########
File path: hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java
##########
@@ -117,30 +133,119 @@ public void trySave(int taskPartitionId) {
     }
   }
 
+  private String getMetafileExtension() {
+    // To be backwards compatible, there is no extension to the properties file base partition metafile
+    return format.isPresent() ? format.get().getFileExtension() : "";
+  }
+
+  /**
+   * Write the partition metadata in the correct format in the given file path.
+   *
+   * @param filePath Path of the file to write
+   * @throws IOException
+   */
+  private void writeMetafile(Path filePath) throws IOException {
+    if (format.isPresent()) {
+      Schema schema = HoodieAvroUtils.getRecordKeySchema();
+
+      switch (format.get()) {
+        case PARQUET:
+          // Since we are only interested in saving metadata to the footer, the schema, blocksizes and other
+          // parameters are not important.
+          MessageType type = Types.buildMessage().optional(PrimitiveTypeName.INT64).named("dummyint").named("dummy");
+          HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(type, schema, Option.empty());

Review comment:
       cc @xushiyan and @vingov . This could be an issue potentially when testing.

##########
File path: hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java
##########
@@ -117,30 +133,119 @@ public void trySave(int taskPartitionId) {
     }
   }
 
+  private String getMetafileExtension() {
+    // To be backwards compatible, there is no extension to the properties file base partition metafile
+    return format.isPresent() ? format.get().getFileExtension() : "";
+  }
+
+  /**
+   * Write the partition metadata in the correct format in the given file path.
+   *
+   * @param filePath Path of the file to write
+   * @throws IOException
+   */
+  private void writeMetafile(Path filePath) throws IOException {
+    if (format.isPresent()) {
+      Schema schema = HoodieAvroUtils.getRecordKeySchema();
+
+      switch (format.get()) {
+        case PARQUET:
+          // Since we are only interested in saving metadata to the footer, the schema, blocksizes and other
+          // parameters are not important.
+          MessageType type = Types.buildMessage().optional(PrimitiveTypeName.INT64).named("dummyint").named("dummy");
+          HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(type, schema, Option.empty());
+          try (ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.UNCOMPRESSED, 1024, 1024)) {
+            for (String key : props.stringPropertyNames()) {
+              writeSupport.addFooterMetadata(key, props.getProperty(key));
+            }
+          }
+          break;
+        case ORC:
+          // Since we are only interested in saving metadata to the footer, the schema, blocksizes and other
+          // parameters are not important.
+          OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(fs.getConf()).fileSystem(fs)
+              .setSchema(AvroOrcUtils.createOrcSchema(schema));
+          try (Writer writer = OrcFile.createWriter(filePath, writerOptions)) {
+            for (String key : props.stringPropertyNames()) {
+              writer.addUserMetadata(key, ByteBuffer.wrap(props.getProperty(key).getBytes()));
+            }
+          }
+          break;
+        default:

Review comment:
       okay makes sense. I was worried that if we flip this config, then we will write the file in HFile (since that's the base format) for MDT (metadata table). Guess it will throw this error. lets punt . you are right

##########
File path: hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java
##########
@@ -117,30 +133,119 @@ public void trySave(int taskPartitionId) {
     }
   }
 
+  private String getMetafileExtension() {
+    // To be backwards compatible, there is no extension to the properties file base partition metafile
+    return format.isPresent() ? format.get().getFileExtension() : "";
+  }
+
+  /**
+   * Write the partition metadata in the correct format in the given file path.
+   *
+   * @param filePath Path of the file to write
+   * @throws IOException
+   */
+  private void writeMetafile(Path filePath) throws IOException {
+    if (format.isPresent()) {
+      Schema schema = HoodieAvroUtils.getRecordKeySchema();
+
+      switch (format.get()) {
+        case PARQUET:
+          // Since we are only interested in saving metadata to the footer, the schema, blocksizes and other
+          // parameters are not important.
+          MessageType type = Types.buildMessage().optional(PrimitiveTypeName.INT64).named("dummyint").named("dummy");
+          HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(type, schema, Option.empty());
+          try (ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.UNCOMPRESSED, 1024, 1024)) {

Review comment:
       okay makes sense. we can file a code cleanup task? 

##########
File path: hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java
##########
@@ -117,30 +133,119 @@ public void trySave(int taskPartitionId) {
     }
   }
 
+  private String getMetafileExtension() {
+    // To be backwards compatible, there is no extension to the properties file base partition metafile
+    return format.isPresent() ? format.get().getFileExtension() : "";
+  }
+
+  /**
+   * Write the partition metadata in the correct format in the given file path.
+   *
+   * @param filePath Path of the file to write
+   * @throws IOException
+   */
+  private void writeMetafile(Path filePath) throws IOException {
+    if (format.isPresent()) {
+      Schema schema = HoodieAvroUtils.getRecordKeySchema();
+
+      switch (format.get()) {
+        case PARQUET:
+          // Since we are only interested in saving metadata to the footer, the schema, blocksizes and other
+          // parameters are not important.
+          MessageType type = Types.buildMessage().optional(PrimitiveTypeName.INT64).named("dummyint").named("dummy");
+          HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(type, schema, Option.empty());
+          try (ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.UNCOMPRESSED, 1024, 1024)) {
+            for (String key : props.stringPropertyNames()) {
+              writeSupport.addFooterMetadata(key, props.getProperty(key));
+            }
+          }
+          break;
+        case ORC:
+          // Since we are only interested in saving metadata to the footer, the schema, blocksizes and other
+          // parameters are not important.
+          OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(fs.getConf()).fileSystem(fs)
+              .setSchema(AvroOrcUtils.createOrcSchema(schema));
+          try (Writer writer = OrcFile.createWriter(filePath, writerOptions)) {
+            for (String key : props.stringPropertyNames()) {
+              writer.addUserMetadata(key, ByteBuffer.wrap(props.getProperty(key).getBytes()));
+            }
+          }
+          break;
+        default:
+          throw new HoodieException("Unsupported format for partition metafiles: " + format.get());
+      }
+    } else {
+      // Backwards compatible properties file format
+      FSDataOutputStream os = fs.create(filePath, true);
+      props.store(os, "partition metadata");
+      os.hsync();
+      os.hflush();
+      os.close();
+    }
+  }
+
   /**
    * Read out the metadata for this partition.
    */
   public void readFromFS() throws IOException {
-    FSDataInputStream is = null;
+    Option<Path> metafilePath = getPartitionMetafilePath(fs, partitionPath);
+    if (!metafilePath.isPresent()) {
+      throw new HoodieException("Partition metafile not found in path " + partitionPath);
+    }
+
     try {
-      Path metaFile = new Path(partitionPath, HOODIE_PARTITION_METAFILE);
-      is = fs.open(metaFile);
-      props.load(is);
-    } catch (IOException ioe) {
-      throw new HoodieException("Error reading Hoodie partition metadata for " + partitionPath, ioe);
-    } finally {
-      if (is != null) {
-        is.close();
+      BaseFileUtils reader = BaseFileUtils.getInstance(metafilePath.toString());
+      Map<String, String> metadata = reader.readFooter(fs.getConf(), true, metafilePath.get(), PARTITION_DEPTH_KEY, COMMIT_TIME_KEY);
+      props.clear();
+      metadata.forEach((k, v) -> props.put(k, v));
+    } catch (UnsupportedOperationException e) {
+      // Properties file format
+      FSDataInputStream is = null;
+      try {
+        is = fs.open(metafilePath.get());
+        props.load(is);
+      } catch (IOException ioe) {
+        throw new HoodieException("Error reading Hoodie partition metadata from " + metafilePath, ioe);
+      } finally {
+        if (is != null) {
+          is.close();
+        }
       }
     }
   }
 
   // methods related to partition meta data
   public static boolean hasPartitionMetadata(FileSystem fs, Path partitionPath) {
+    return getPartitionMetafilePath(fs, partitionPath).isPresent();
+  }
+
+  /**
+   * Returns the name of the partition metadata.
+   *
+   * @param fs
+   * @param partitionPath
+   * @return Name of the partition metafile or empty option
+   */
+  public static Option<Path> getPartitionMetafilePath(FileSystem fs, Path partitionPath) {
+    // The partition listing is a costly operation so instead we are searching for existence of the files instead.
+    // This is in expected order as properties file based partition metafiles should be the most common.

Review comment:
       duh. you are right. I still like to avoid the exists call. let me think about it little more. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org