You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by "Ethan Guo (Jira)" <ji...@apache.org> on 2022/09/09 19:31:00 UTC

[jira] [Updated] (HUDI-4825) Commit metadata in Json contains redundant information

     [ https://issues.apache.org/jira/browse/HUDI-4825?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Ethan Guo updated HUDI-4825:
----------------------------
    Description: 
The commit metadata in JSON (*.commit, *.deltacommit) written to the Hudi timeline under .hoodie contains redundant fields that can be trimmed.  As shown below, the same set of write stats is written to both "partitionToWriteStats" and "writeStats", doubling the size and increasing the serde overhead.  Other fields like "totalRecordsDeleted", "writePartitionPaths", "fileIdAndRelativePaths", etc., can also be removed, since they are derived from "partitionToWriteStats" and are not directly used by the HoodieCommitMetadata class.

Example commit metadata:

 
{code:java}
{
  "partitionToWriteStats" : {
    "2022/1/31" : [ {
      "fileId" : "0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0",
      "path" : "2022/1/31/0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0_0-9-38_20220410134618909.parquet",
      "prevCommit" : "20220410134320333",
      "numWrites" : 250175,
      "numDeletes" : 0,
      "numUpdateWrites" : 0,
      "numInserts" : 50035,
      "totalWriteBytes" : 90720802,
      "totalWriteErrors" : 0,
      "tempPath" : null,
      "partitionPath" : "2022/1/31",
      "totalLogRecords" : 0,
      "totalLogFilesCompacted" : 0,
      "totalLogSizeCompacted" : 0,
      "totalUpdatedRecordsCompacted" : 0,
      "totalLogBlocks" : 0,
      "totalCorruptLogBlock" : 0,
      "totalRollbackBlocks" : 0,
      "fileSizeInBytes" : 90720802,
      "minEventTime" : null,
      "maxEventTime" : null
    } ],
    ...
  },
  "compacted" : false,
  "extraMetadata" : {
    "schema" : "{\"type\":\"record\",\"name\":\"hoodie_source\",\"namespace\":\"hoodie.source\",\"fields\":[{\"name\":\"key\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"partition\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"ts\",\"type\":[\"null\",\"long\"],\"default\":null},{\"name\":\"textField\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"decimalField\",\"type\":[\"null\",\"float\"],\"default\":null},{\"name\":\"longField\",\"type\":[\"null\",\"long\"],\"default\":null},{\"name\":\"arrayField\",\"type\":[\"null\",{\"type\":\"array\",\"items\":[\"int\",\"null\"]}],\"default\":null},{\"name\":\"mapField\",\"type\":[\"null\",{\"type\":\"map\",\"values\":[\"int\",\"null\"]}],\"default\":null},{\"name\":\"round\",\"type\":[\"null\",\"int\"],\"default\":null}]}",
    "deltastreamer.checkpoint.key" : "17"
  },
  "operationType" : "INSERT",
  "writeStats" : [ {
    "fileId" : "0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0",
    "path" : "2022/1/31/0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0_0-9-38_20220410134618909.parquet",
    "prevCommit" : "20220410134320333",
    "numWrites" : 250175,
    "numDeletes" : 0,
    "numUpdateWrites" : 0,
    "numInserts" : 50035,
    "totalWriteBytes" : 90720802,
    "totalWriteErrors" : 0,
    "tempPath" : null,
    "partitionPath" : "2022/1/31",
    "totalLogRecords" : 0,
    "totalLogFilesCompacted" : 0,
    "totalLogSizeCompacted" : 0,
    "totalUpdatedRecordsCompacted" : 0,
    "totalLogBlocks" : 0,
    "totalCorruptLogBlock" : 0,
    "totalRollbackBlocks" : 0,
    "fileSizeInBytes" : 90720802,
    "minEventTime" : null,
    "maxEventTime" : null
  }, 
  ... 
  ],
  "totalRecordsDeleted" : 0,
  "totalLogFilesSize" : 0,
  "totalScanTime" : 0,
  "totalCreateTime" : 0,
  "totalUpsertTime" : 309120,
  "minAndMaxEventTime" : {
    "Optional.empty" : {
      "val" : null,
      "present" : false
    }
  },
  "writePartitionPaths" : [ "2022/1/31", "2022/1/30", "2022/1/28", "2022/1/27", "2022/2/2", "2022/1/29", "2022/1/24", "2022/2/1", "2022/1/26", "2022/1/25" ],
  "fileIdAndRelativePaths" : {
    "3e31414c-fb4c-4ce9-aa27-a43640d94430-0" : "2022/1/25/3e31414c-fb4c-4ce9-aa27-a43640d94430-0_9-9-47_20220410134618909.parquet",
    ...
  },
  "totalLogRecordsCompacted" : 0,
  "totalLogFilesCompacted" : 0,
  "totalCompactedRecordsUpdated" : 0
} {code}
 

 

> Commit metadata in Json contains redundant information
> ------------------------------------------------------
>
>                 Key: HUDI-4825
>                 URL: https://issues.apache.org/jira/browse/HUDI-4825
>             Project: Apache Hudi
>          Issue Type: Bug
>            Reporter: Ethan Guo
>            Priority: Major
>
> The commit metadata in JSON (*.commit, *.deltacommit) written to the Hudi timeline under .hoodie contains redundant fields that can be trimmed.  As shown below, the same set of write stats is written to both "partitionToWriteStats" and "writeStats", doubling the size and increasing the serde overhead.  Other fields like "totalRecordsDeleted", "writePartitionPaths", "fileIdAndRelativePaths", etc., can also be removed, since they are derived from "partitionToWriteStats" and are not directly used by the HoodieCommitMetadata class.
> Example commit metadata:
>  
> {code:java}
> {
>   "partitionToWriteStats" : {
>     "2022/1/31" : [ {
>       "fileId" : "0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0",
>       "path" : "2022/1/31/0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0_0-9-38_20220410134618909.parquet",
>       "prevCommit" : "20220410134320333",
>       "numWrites" : 250175,
>       "numDeletes" : 0,
>       "numUpdateWrites" : 0,
>       "numInserts" : 50035,
>       "totalWriteBytes" : 90720802,
>       "totalWriteErrors" : 0,
>       "tempPath" : null,
>       "partitionPath" : "2022/1/31",
>       "totalLogRecords" : 0,
>       "totalLogFilesCompacted" : 0,
>       "totalLogSizeCompacted" : 0,
>       "totalUpdatedRecordsCompacted" : 0,
>       "totalLogBlocks" : 0,
>       "totalCorruptLogBlock" : 0,
>       "totalRollbackBlocks" : 0,
>       "fileSizeInBytes" : 90720802,
>       "minEventTime" : null,
>       "maxEventTime" : null
>     } ],
>     ...
>   },
>   "compacted" : false,
>   "extraMetadata" : {
>     "schema" : "{\"type\":\"record\",\"name\":\"hoodie_source\",\"namespace\":\"hoodie.source\",\"fields\":[{\"name\":\"key\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"partition\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"ts\",\"type\":[\"null\",\"long\"],\"default\":null},{\"name\":\"textField\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"decimalField\",\"type\":[\"null\",\"float\"],\"default\":null},{\"name\":\"longField\",\"type\":[\"null\",\"long\"],\"default\":null},{\"name\":\"arrayField\",\"type\":[\"null\",{\"type\":\"array\",\"items\":[\"int\",\"null\"]}],\"default\":null},{\"name\":\"mapField\",\"type\":[\"null\",{\"type\":\"map\",\"values\":[\"int\",\"null\"]}],\"default\":null},{\"name\":\"round\",\"type\":[\"null\",\"int\"],\"default\":null}]}",
>     "deltastreamer.checkpoint.key" : "17"
>   },
>   "operationType" : "INSERT",
>   "writeStats" : [ {
>     "fileId" : "0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0",
>     "path" : "2022/1/31/0cb6ac8a-ee31-4f00-a359-ba6ebfb80463-0_0-9-38_20220410134618909.parquet",
>     "prevCommit" : "20220410134320333",
>     "numWrites" : 250175,
>     "numDeletes" : 0,
>     "numUpdateWrites" : 0,
>     "numInserts" : 50035,
>     "totalWriteBytes" : 90720802,
>     "totalWriteErrors" : 0,
>     "tempPath" : null,
>     "partitionPath" : "2022/1/31",
>     "totalLogRecords" : 0,
>     "totalLogFilesCompacted" : 0,
>     "totalLogSizeCompacted" : 0,
>     "totalUpdatedRecordsCompacted" : 0,
>     "totalLogBlocks" : 0,
>     "totalCorruptLogBlock" : 0,
>     "totalRollbackBlocks" : 0,
>     "fileSizeInBytes" : 90720802,
>     "minEventTime" : null,
>     "maxEventTime" : null
>   }, 
>   ... 
>   ],
>   "totalRecordsDeleted" : 0,
>   "totalLogFilesSize" : 0,
>   "totalScanTime" : 0,
>   "totalCreateTime" : 0,
>   "totalUpsertTime" : 309120,
>   "minAndMaxEventTime" : {
>     "Optional.empty" : {
>       "val" : null,
>       "present" : false
>     }
>   },
>   "writePartitionPaths" : [ "2022/1/31", "2022/1/30", "2022/1/28", "2022/1/27", "2022/2/2", "2022/1/29", "2022/1/24", "2022/2/1", "2022/1/26", "2022/1/25" ],
>   "fileIdAndRelativePaths" : {
>     "3e31414c-fb4c-4ce9-aa27-a43640d94430-0" : "2022/1/25/3e31414c-fb4c-4ce9-aa27-a43640d94430-0_9-9-47_20220410134618909.parquet",
>     ...
>   },
>   "totalLogRecordsCompacted" : 0,
>   "totalLogFilesCompacted" : 0,
>   "totalCompactedRecordsUpdated" : 0
> } {code}
>  
>  



--
This message was sent by Atlassian Jira
(v8.20.10#820010)