Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2022/04/04 10:39:22 UTC

[GitHub] [hudi] xushiyan commented on a diff in pull request #5179: [HUDI-3290] Different file formats for the partition metadata file.

xushiyan commented on code in PR #5179:
URL: https://github.com/apache/hudi/pull/5179#discussion_r841580208


##########
hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java:
##########
@@ -608,6 +614,16 @@ public String getMetadataPartitions() {
     return getStringOrDefault(TABLE_METADATA_PARTITIONS, StringUtils.EMPTY_STRING);
   }
   
+  /**
+   * Returns the format to use for partition meta files.
+   */
+  public Option<HoodieFileFormat> getPartitionMetafileFormat() {
+    if (Boolean.parseBoolean(getStringOrDefault(PARTITION_METAFILE_USE_BASE_FORMAT))) {

Review Comment:
   nit: use getBooleanOrDefault() directly instead of wrapping getStringOrDefault() in Boolean.parseBoolean().
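
   For illustration, a minimal sketch of that change (the Option.of(getBaseFileFormat()) branch is an assumption, since the quoted hunk cuts off right after the if):

     public Option<HoodieFileFormat> getPartitionMetafileFormat() {
       // getBooleanOrDefault() already parses the string value, so the explicit
       // Boolean.parseBoolean(getStringOrDefault(...)) wrapper is unnecessary.
       if (getBooleanOrDefault(PARTITION_METAFILE_USE_BASE_FORMAT)) {
         // Assumption: when the flag is set, the partition metafile uses the table's base file format.
         return Option.of(getBaseFileFormat());
       }
       return Option.empty();
     }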



##########
hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java:
##########
@@ -118,35 +136,112 @@ public void trySave(int taskPartitionId) {
     }
   }
 
+  private String getMetafileExtension() {
+    // To be backwards compatible, there is no extension to the properties file base partition metafile
+    return format.isPresent() ? format.get().getFileExtension() : StringUtils.EMPTY_STRING;
+  }
+
+  /**
+   * Write the partition metadata in the correct format in the given file path.
+   *
+   * @param filePath Path of the file to write
+   * @throws IOException
+   */
+  private void writeMetafile(Path filePath) throws IOException {
+    if (format.isPresent()) {
+      Schema schema = HoodieAvroUtils.getRecordKeySchema();
+
+      switch (format.get()) {
+        case PARQUET:
+          // Since we are only interested in saving metadata to the footer, the schema, blocksizes and other
+          // parameters are not important.
+          MessageType type = Types.buildMessage().optional(PrimitiveTypeName.INT64).named("dummyint").named("dummy");
+          HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(type, schema, Option.empty());
+          try (ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.UNCOMPRESSED, 1024, 1024)) {
+            for (String key : props.stringPropertyNames()) {
+              writeSupport.addFooterMetadata(key, props.getProperty(key));
+            }
+          }
+          break;
+        case ORC:
+          // Since we are only interested in saving metadata to the footer, the schema, blocksizes and other
+          // parameters are not important.
+          OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(fs.getConf()).fileSystem(fs)
+              .setSchema(AvroOrcUtils.createOrcSchema(schema));
+          try (Writer writer = OrcFile.createWriter(filePath, writerOptions)) {
+            for (String key : props.stringPropertyNames()) {
+              writer.addUserMetadata(key, ByteBuffer.wrap(props.getProperty(key).getBytes()));
+            }
+          }
+          break;
+        default:
+          throw new HoodieException("Unsupported format for partition metafiles: " + format.get());
+      }
+    } else {
+      // Backwards compatible properties file format
+      FSDataOutputStream os = fs.create(filePath, true);
+      props.store(os, "partition metadata");
+      os.hsync();
+      os.hflush();
+      os.close();
+    }
+  }
+
   /**
    * Read out the metadata for this partition.
    */
   public void readFromFS() throws IOException {
-    FSDataInputStream is = null;
-    try {
-      Path metaFile = new Path(partitionPath, HOODIE_PARTITION_METAFILE);
-      is = fs.open(metaFile);
+    // first try reading the text format (legacy, currently widespread)

Review Comment:
   Looks good. Just one question: since this is not a static method, why can't we rely on the Option<HoodieFileFormat> format field to decide how to read, instead of trial-and-error across formats?
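
   For context, a rough sketch of what reading based on the format field (rather than probing each format) could look like; the footer-reading helper name below is illustrative, not from the PR:

     public void readFromFS() throws IOException {
       Path metafilePath = new Path(partitionPath, HOODIE_PARTITION_METAFILE + getMetafileExtension());
       if (format.isPresent()) {
         // The format is known up front, so read the footer metadata for exactly that format.
         readMetadataFromFooter(metafilePath, format.get()); // illustrative helper
       } else {
         // Legacy layout: a plain properties file with no extension.
         try (FSDataInputStream is = fs.open(metafilePath)) {
           props.load(is);
         }
       }
     }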



##########
hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java:
##########
@@ -118,35 +136,112 @@ public void trySave(int taskPartitionId) {
     }
   }
 
+  private String getMetafileExtension() {
+    // To be backwards compatible, there is no extension to the properties file base partition metafile
+    return format.isPresent() ? format.get().getFileExtension() : StringUtils.EMPTY_STRING;
+  }
+
+  /**
+   * Write the partition metadata in the correct format in the given file path.
+   *
+   * @param filePath Path of the file to write
+   * @throws IOException
+   */
+  private void writeMetafile(Path filePath) throws IOException {
+    if (format.isPresent()) {
+      Schema schema = HoodieAvroUtils.getRecordKeySchema();
+
+      switch (format.get()) {
+        case PARQUET:
+          // Since we are only interested in saving metadata to the footer, the schema, blocksizes and other
+          // parameters are not important.
+          MessageType type = Types.buildMessage().optional(PrimitiveTypeName.INT64).named("dummyint").named("dummy");
+          HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(type, schema, Option.empty());
+          try (ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.UNCOMPRESSED, 1024, 1024)) {
+            for (String key : props.stringPropertyNames()) {
+              writeSupport.addFooterMetadata(key, props.getProperty(key));
+            }
+          }
+          break;
+        case ORC:
+          // Since we are only interested in saving metadata to the footer, the schema, blocksizes and other
+          // parameters are not important.
+          OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(fs.getConf()).fileSystem(fs)
+              .setSchema(AvroOrcUtils.createOrcSchema(schema));

Review Comment:
   Why not use the same dummy schema as in the Parquet case? I understand the schema does not matter, but it would be better to align across formats.
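
   For comparison, a sketch of the ORC branch using a single dummy column the way the Parquet branch does (whether this matches the intended design is for the author to confirm):

     // Mirror the Parquet branch: one dummy long column, since only the
     // user metadata written into the footer matters for partition metafiles.
     TypeDescription dummySchema = TypeDescription.createStruct()
         .addField("dummyint", TypeDescription.createLong());
     OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(fs.getConf()).fileSystem(fs)
         .setSchema(dummySchema);
     try (Writer writer = OrcFile.createWriter(filePath, writerOptions)) {
       for (String key : props.stringPropertyNames()) {
         writer.addUserMetadata(key, ByteBuffer.wrap(props.getProperty(key).getBytes()));
       }
     }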



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org