You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2022/03/16 13:42:03 UTC

[GitHub] [hudi] bvaradar commented on a change in pull request #4910: [RFC-33] [HUDI-2429][Stacked on HUDI-2560] Support full Schema evolution for Spark

bvaradar commented on a change in pull request #4910:
URL: https://github.com/apache/hudi/pull/4910#discussion_r828000988



##########
File path: hudi-common/src/main/java/org/apache/hudi/internal/schema/InternalSchema.java
##########
@@ -0,0 +1,292 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.internal.schema;
+
+import org.apache.hudi.common.util.StringUtils;
+import org.apache.hudi.internal.schema.Types.Field;
+import org.apache.hudi.internal.schema.Types.RecordType;
+import org.apache.hudi.internal.schema.utils.InternalSchemaUtils;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * Internal schema for hudi table.
+ * used to support schema evolution.
+ */
+public class InternalSchema implements Serializable {
+
+  private static final long DEFAULT_VERSION_ID = 0;
+
+  private final RecordType record;
+
+  private int maxColumnId;
+  private long versionId;
+
+  private transient Map<Integer, Field> idToField = null;
+  private transient Map<String, Integer> nameToId = null;
+  private transient Map<Integer, String> idToName = null;
+
+  public static InternalSchema getDummyInternalSchema() {
+    return new InternalSchema(-1L, new ArrayList<>());
+  }
+
+  public boolean isDummySchema() {

Review comment:
       Instead of "Dummy", rename to "Empty" everywhere.

##########
File path: hudi-common/src/main/java/org/apache/hudi/common/util/TableInternalSchemaUtils.java
##########
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.util;
+
+import com.github.benmanes.caffeine.cache.Cache;
+import com.github.benmanes.caffeine.cache.Caffeine;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.internal.schema.InternalSchema;
+import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager;
+import org.apache.hudi.internal.schema.utils.SerDeHelper;
+
+import java.util.List;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+public class TableInternalSchemaUtils {
+  // use segment lock to reduce competition.
+  // the lock size should be powers of 2 for better hash.
+  private static Object[] lockList = new Object[16];
+
+  static {
+    for (int i = 0; i < lockList.length; i++) {
+      lockList[i] = new Object();
+    }
+  }
+
+  // historySchemas cache maintain a map about (tablePath, HistorySchemas).
+  // this is a Global cache, all threads in one container/executor share the same cache.
+  private static final Cache<String, TreeMap<Long, InternalSchema>>
+      HISTORICAL_SCHEMA_CACHE = Caffeine.newBuilder().maximumSize(1000).weakValues().build();
+
+  /**
+   * search internalSchema based on versionID.
+   * first step: try to get internalSchema from hoodie commit files, we no need to add lock.
+   * if we cannot get internalSchema by first step, then we try to get internalSchema from cache.
+   *
+   * @param versionID schema version_id need to search
+   * @param tablePath current hoodie table base path
+   * @param hadoopConf hadoopConf
+   * @return internalSchema
+   */
+  public static InternalSchema searchSchemaAndCache(long versionID, String tablePath, Configuration hadoopConf) {
+    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setBasePath(tablePath).setConf(hadoopConf).build();
+    return searchSchemaAndCache(versionID, metaClient);
+  }
+
+  public static InternalSchema searchSchemaAndCache(long versionID, HoodieTableMetaClient metaClient) {
+    Option<InternalSchema> candidateSchema = searchSchema(versionID, metaClient);
+    if (candidateSchema.isPresent()) {
+      return candidateSchema.get();
+    }
+    String tablePath = metaClient.getBasePath();
+    // use segment lock to reduce competition.
+    synchronized (lockList[tablePath.hashCode() & (lockList.length - 1)]) {
+      TreeMap<Long, InternalSchema> historicalSchemas = HISTORICAL_SCHEMA_CACHE.getIfPresent(tablePath);
+      if (historicalSchemas == null || SerDeHelper.searchSchema(versionID, historicalSchemas) == null) {
+        historicalSchemas = getHistoricalSchemas(metaClient);
+        HISTORICAL_SCHEMA_CACHE.put(tablePath, historicalSchemas);
+      } else {
+        long maxVersionId = historicalSchemas.keySet().stream().max(Long::compareTo).get();
+        if (versionID > maxVersionId) {
+          historicalSchemas = getHistoricalSchemas(metaClient);
+          HISTORICAL_SCHEMA_CACHE.put(tablePath, historicalSchemas);
+        }
+      }
+      return SerDeHelper.searchSchema(versionID, historicalSchemas);
+    }
+  }
+
+  private static TreeMap<Long, InternalSchema> getHistoricalSchemas(HoodieTableMetaClient metaClient) {
+    TreeMap<Long, InternalSchema> result = new TreeMap<>();
+    FileBasedInternalSchemaStorageManager schemasManager = new FileBasedInternalSchemaStorageManager(metaClient);
+    String historySchemaStr = schemasManager.getHistorySchemaStr();
+    if (!StringUtils.isNullOrEmpty(historySchemaStr)) {
+      result = SerDeHelper.parseSchemas(historySchemaStr);
+    }
+    return result;
+  }
+
+  private static Option<InternalSchema> searchSchema(long versionID, HoodieTableMetaClient metaClient) {
+    try {
+      HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
+      List<HoodieInstant> instants = timeline.getInstants().filter(f -> f.getTimestamp().equals(String.valueOf(versionID))).collect(Collectors.toList());
+      if (instants.isEmpty()) {
+        return Option.empty();
+      }
+      byte[] data = timeline.getInstantDetails(instants.get(0)).get();
+      HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class);
+      String latestInternalSchemaStr = metadata.getMetadata(SerDeHelper.LATESTSCHEMA);
+      return SerDeHelper.fromJson(latestInternalSchemaStr);
+    } catch (Exception e) {
+      throw new HoodieException("Failed to read schema from commit metadata", e);
+    }
+  }
+
+  public static Pair<Option<String>, Option<String>> getInternalSchemaAndAvroSchemaForClusteringAndCompaction(HoodieTableMetaClient metaClient, String compactionAndClusteringInstant) {
+    // try to load internalSchema to support Schema Evolution
+    HoodieTimeline timelineBeforeCurrentCompaction = metaClient.getActiveTimeline().filterCompletedInstants().findInstantsBefore(compactionAndClusteringInstant);
+    Option<HoodieInstant> lastInstantBeforeCurrentCompaction =  timelineBeforeCurrentCompaction.lastInstant();
+    if (lastInstantBeforeCurrentCompaction.isPresent()) {
+      try {
+        // try to find internalSchema
+        byte[] data = timelineBeforeCurrentCompaction.getInstantDetails(lastInstantBeforeCurrentCompaction.get()).get();
+        HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class);
+        String internalSchemaStr = metadata.getMetadata(SerDeHelper.LATESTSCHEMA);
+        if (internalSchemaStr != null) {
+          String existingSchemaStr = metadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY);
+          return Pair.of(Option.of(internalSchemaStr), Option.of(existingSchemaStr));
+        }
+      } catch (Exception e) {
+        // swallow this exception

Review comment:
       Why are we swallowing the exception?

##########
File path: hudi-common/pom.xml
##########
@@ -108,6 +108,13 @@
       <artifactId>jackson-databind</artifactId>
     </dependency>
 
+    <!-- caffeine -->

Review comment:
       This has an Apache License. This new package should be included in the packaging bundles: packaging/hudi-utilities-bundle/pom.xml and packaging/hudi-spark-bundle/pom.xml.

##########
File path: hudi-common/src/main/java/org/apache/hudi/common/util/TableInternalSchemaUtils.java
##########
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.util;
+
+import com.github.benmanes.caffeine.cache.Cache;
+import com.github.benmanes.caffeine.cache.Caffeine;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.internal.schema.InternalSchema;
+import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager;
+import org.apache.hudi.internal.schema.utils.SerDeHelper;
+
+import java.util.List;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+public class TableInternalSchemaUtils {
+  // use segment lock to reduce competition.
+  // the lock size should be powers of 2 for better hash.
+  private static Object[] lockList = new Object[16];
+
+  static {
+    for (int i = 0; i < lockList.length; i++) {
+      lockList[i] = new Object();
+    }
+  }
+
+  // historySchemas cache maintain a map about (tablePath, HistorySchemas).
+  // this is a Global cache, all threads in one container/executor share the same cache.
+  private static final Cache<String, TreeMap<Long, InternalSchema>>
+      HISTORICAL_SCHEMA_CACHE = Caffeine.newBuilder().maximumSize(1000).weakValues().build();
+
+  /**
+   * search internalSchema based on versionID.
+   * first step: try to get internalSchema from hoodie commit files, we no need to add lock.
+   * if we cannot get internalSchema by first step, then we try to get internalSchema from cache.
+   *
+   * @param versionID schema version_id need to search
+   * @param tablePath current hoodie table base path
+   * @param hadoopConf hadoopConf
+   * @return internalSchema
+   */
+  public static InternalSchema searchSchemaAndCache(long versionID, String tablePath, Configuration hadoopConf) {
+    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setBasePath(tablePath).setConf(hadoopConf).build();
+    return searchSchemaAndCache(versionID, metaClient);
+  }
+
+  public static InternalSchema searchSchemaAndCache(long versionID, HoodieTableMetaClient metaClient) {

Review comment:
       Please add javadocs for all the public methods here

##########
File path: hudi-common/src/main/java/org/apache/hudi/common/util/TableInternalSchemaUtils.java
##########
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.util;
+
+import com.github.benmanes.caffeine.cache.Cache;
+import com.github.benmanes.caffeine.cache.Caffeine;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.internal.schema.InternalSchema;
+import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager;
+import org.apache.hudi.internal.schema.utils.SerDeHelper;
+
+import java.util.List;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+public class TableInternalSchemaUtils {

Review comment:
       This should be named InternalSchemaCache.

##########
File path: hudi-common/src/main/java/org/apache/hudi/common/util/TableInternalSchemaUtils.java
##########
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.util;
+
+import com.github.benmanes.caffeine.cache.Cache;
+import com.github.benmanes.caffeine.cache.Caffeine;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.internal.schema.InternalSchema;
+import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager;
+import org.apache.hudi.internal.schema.utils.SerDeHelper;
+
+import java.util.List;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+public class TableInternalSchemaUtils {
+  // use segment lock to reduce competition.
+  // the lock size should be powers of 2 for better hash.
+  private static Object[] lockList = new Object[16];
+
+  static {
+    for (int i = 0; i < lockList.length; i++) {
+      lockList[i] = new Object();
+    }
+  }
+
+  // historySchemas cache maintain a map about (tablePath, HistorySchemas).
+  // this is a Global cache, all threads in one container/executor share the same cache.

Review comment:
       What is the need for the global cache? From the call hierarchy, it looks like it is only being used on the driver side, but the comment says both driver and executor. Also, I don't see the cache being updated when a new commit with an updated schema gets added. Can we simply read the history schema for every commit and pass it along?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org