You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by bl...@apache.org on 2019/08/13 17:14:28 UTC

[incubator-iceberg] branch master updated: Fix reading of old .metadata.json.gz files (#371)

This is an automated email from the ASF dual-hosted git repository.

blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new 5233fd4  Fix reading of old .metadata.json.gz files (#371)
5233fd4 is described below

commit 5233fd4a2d6cf3b29d1dcfb6095b2955e38cbec5
Author: Xabriel J. Collazo Mojica <xc...@adobe.com>
AuthorDate: Tue Aug 13 10:14:23 2019 -0700

    Fix reading of old .metadata.json.gz files (#371)
---
 .../org/apache/iceberg/TableMetadataParser.java    |  5 +++++
 .../iceberg/hadoop/HadoopTableOperations.java      | 14 ++++++++++++
 .../apache/iceberg/hadoop/HadoopTableTestBase.java | 26 +++++++++++++++++++++-
 .../apache/iceberg/hadoop/TestHadoopCommits.java   | 26 ++++++++++++++++++++++
 4 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/core/src/main/java/org/apache/iceberg/TableMetadataParser.java b/core/src/main/java/org/apache/iceberg/TableMetadataParser.java
index 99bc1d7..03f947d 100644
--- a/core/src/main/java/org/apache/iceberg/TableMetadataParser.java
+++ b/core/src/main/java/org/apache/iceberg/TableMetadataParser.java
@@ -116,6 +116,11 @@ public class TableMetadataParser {
     return codec.extension + ".metadata.json";
   }
 
+  public static String getOldFileExtension(Codec codec) {
+    // old naming appends the codec suffix (e.g. ".metadata.json.gz"); kept to read existing files
+    return ".metadata.json" + codec.extension;
+  }
+
   public static String toJson(TableMetadata metadata) {
     StringWriter writer = new StringWriter();
     try {
diff --git a/core/src/main/java/org/apache/iceberg/hadoop/HadoopTableOperations.java b/core/src/main/java/org/apache/iceberg/hadoop/HadoopTableOperations.java
index 0486bdd..67ae296 100644
--- a/core/src/main/java/org/apache/iceberg/hadoop/HadoopTableOperations.java
+++ b/core/src/main/java/org/apache/iceberg/hadoop/HadoopTableOperations.java
@@ -178,7 +178,17 @@ public class HadoopTableOperations implements TableOperations {
       if (fs.exists(metadataFile)) {
         return metadataFile;
       }
+
+      if (codec.equals(TableMetadataParser.Codec.GZIP)) {
+        // we have to be backward-compatible with .metadata.json.gz files
+        metadataFile = oldMetadataFilePath(metadataVersion, codec);
+        fs = getFileSystem(metadataFile, conf);
+        if (fs.exists(metadataFile)) {
+          return metadataFile;
+        }
+      }
     }
+
     return null;
   }
 
@@ -186,6 +196,10 @@ public class HadoopTableOperations implements TableOperations {
     return metadataPath("v" + metadataVersion + TableMetadataParser.getFileExtension(codec));
   }
 
+  private Path oldMetadataFilePath(int metadataVersion, TableMetadataParser.Codec codec) {
+    return metadataPath("v" + metadataVersion + TableMetadataParser.getOldFileExtension(codec));
+  }
+
   private Path metadataPath(String filename) {
     return new Path(new Path(location, "metadata"), filename);
   }
diff --git a/core/src/test/java/org/apache/iceberg/hadoop/HadoopTableTestBase.java b/core/src/test/java/org/apache/iceberg/hadoop/HadoopTableTestBase.java
index f771df6..11e85b4 100644
--- a/core/src/test/java/org/apache/iceberg/hadoop/HadoopTableTestBase.java
+++ b/core/src/test/java/org/apache/iceberg/hadoop/HadoopTableTestBase.java
@@ -22,9 +22,12 @@ package org.apache.iceberg.hadoop;
 import com.google.common.collect.Lists;
 import com.google.common.io.Files;
 import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.List;
+import java.util.zip.GZIPOutputStream;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.iceberg.DataFile;
 import org.apache.iceberg.DataFiles;
@@ -121,7 +124,8 @@ public class HadoopTableTestBase {
   }
 
   List<File> listMetadataJsonFiles() {
-    return Lists.newArrayList(metadataDir.listFiles((dir, name) -> name.endsWith(".metadata.json")));
+    return Lists.newArrayList(metadataDir.listFiles((dir, name) ->
+        name.endsWith(".metadata.json") || name.endsWith(".metadata.json.gz")));
   }
 
   File version(int versionNumber) {
@@ -142,4 +146,24 @@ public class HadoopTableTestBase {
     new File(metadataDir, ".version-hint.text.crc").delete();
     Files.write(String.valueOf(version), versionHintFile, StandardCharsets.UTF_8);
   }
+
+  /*
+   * Replaces each current metadata file with a gzip copy named with the old .metadata.json.gz extension.
+   * Assumes source files are not compressed; the uncompressed original is deleted after the copy.
+   */
+  void rewriteMetadataAsGzipWithOldExtension() throws IOException {
+    List<File> metadataJsonFiles = listMetadataJsonFiles();
+    for (File file : metadataJsonFiles) {
+      try (FileInputStream input = new FileInputStream(file);
+          GZIPOutputStream gzOutput = new GZIPOutputStream(new FileOutputStream(file.getAbsolutePath() + ".gz"))) {
+        byte[] buffer = new byte[8192];
+        int length;
+        while ((length = input.read(buffer)) != -1) {
+          gzOutput.write(buffer, 0, length);
+        }
+      }
+      // delete original metadata file
+      file.delete();
+    }
+  }
 }
diff --git a/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCommits.java b/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCommits.java
index 1c56123..2012122 100644
--- a/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCommits.java
+++ b/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCommits.java
@@ -367,4 +367,30 @@ public class TestHadoopCommits extends HadoopTableTestBase {
     Set<String> expected = Sets.newHashSet("v1.metadata.json", "v2.metadata.json");
     assertEquals("only v1 and v2 metadata.json should exist.", expected, actual);
   }
+
+  @Test
+  public void testCanReadOldCompressedManifestFiles() throws Exception {
+    assertTrue("Should create v1 metadata",
+        version(1).exists() && version(1).isFile());
+
+    // commit one append so that a second metadata version is written
+    table.newAppend()
+        .appendFile(FILE_A)
+        .commit();
+
+    // new metadata is never written with the old extension, so convert the existing files to .metadata.json.gz
+    // to exercise the backward-compatible read path
+    rewriteMetadataAsGzipWithOldExtension();
+
+    List<File> metadataFiles = listMetadataJsonFiles();
+
+    assertEquals("Should have two versions", 2, metadataFiles.size());
+    assertTrue("Metadata should be compressed with old format.",
+        metadataFiles.stream().allMatch(f -> f.getName().endsWith(".metadata.json.gz")));
+
+    Table reloaded = TABLES.load(tableLocation);
+
+    List<FileScanTask> tasks = Lists.newArrayList(reloaded.newScan().planFiles());
+    Assert.assertEquals("Should scan 1 files", 1, tasks.size());
+  }
 }