Posted to common-commits@hadoop.apache.org by sh...@apache.org on 2021/07/16 21:30:52 UTC

[hadoop] 02/03: Add namespace key for INode. (shv)

This is an automated email from the ASF dual-hosted git repository.

shv pushed a commit to branch fgl
in repository https://gitbox.apache.org/repos/asf/hadoop.git

commit 1829b58b77341b83c9c73f356fa479d9940f6a24
Author: Konstantin V Shvachko <sh...@apache.org>
AuthorDate: Fri May 7 17:51:58 2021 -0700

    Add namespace key for INode. (shv)
---
 .../org/apache/hadoop/util/PartitionedGSet.java    | 80 ++++++++++++++++++----
 .../hadoop/hdfs/server/namenode/FSDirMkdirOp.java  |  3 +
 .../apache/hadoop/hdfs/server/namenode/INode.java  | 40 ++++++++++-
 .../hadoop/hdfs/server/namenode/INodeMap.java      | 71 +++++++++++++++++--
 4 files changed, 176 insertions(+), 18 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/PartitionedGSet.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/PartitionedGSet.java
index 4b0cdc9..7ebb1b3 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/PartitionedGSet.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/PartitionedGSet.java
@@ -22,6 +22,7 @@ import java.util.Comparator;
 import java.util.Iterator;
 import java.util.Map.Entry;
 import java.util.NavigableMap;
+import java.util.Set;
 import java.util.TreeMap;
 
 import org.apache.hadoop.HadoopIllegalArgumentException;
@@ -44,7 +45,8 @@ import org.apache.hadoop.util.LightWeightGSet.LinkedElement;
 @InterfaceAudience.Private
 public class PartitionedGSet<K, E extends K> implements GSet<K, E> {
 
-  private static final int DEFAULT_PARTITION_CAPACITY = 2027;
+  private static final int DEFAULT_PARTITION_CAPACITY = 65536; // 4096; // 5120; // 2048; // 1027;
+  private static final float DEFAULT_PARTITION_OVERFLOW = 1.8f;
 
   /**
    * An ordered map of contiguous segments of elements.
@@ -81,8 +83,11 @@ public class PartitionedGSet<K, E extends K> implements GSet<K, E> {
       final E rootKey) {
     this.partitions = new TreeMap<K, PartitionEntry>(comparator);
     this.latchLock = latchLock;
-    addNewPartition(rootKey).put(rootKey);
-    this.size = 1;
+    // addNewPartition(rootKey).put(rootKey);
+    // this.size = 1;
+    this.size = 0;
+    LOG.info("Partition capacity = {}", DEFAULT_PARTITION_CAPACITY);
+    LOG.info("Partition overflow factor = {}", DEFAULT_PARTITION_OVERFLOW);
   }
 
   /**
@@ -90,16 +95,19 @@ public class PartitionedGSet<K, E extends K> implements GSet<K, E> {
    * @param key
    * @return
    */
-  private PartitionEntry addNewPartition(final K key) {
+  public PartitionEntry addNewPartition(final K key) {
+    Entry<K, PartitionEntry> lastEntry = partitions.lastEntry();
     PartitionEntry lastPart = null;
-    if(size > 0)
-      lastPart = partitions.lastEntry().getValue();
+    if(lastEntry != null)
+      lastPart = lastEntry.getValue();
 
     PartitionEntry newPart =
         new PartitionEntry(DEFAULT_PARTITION_CAPACITY);
     // assert size == 0 || newPart.partLock.isWriteTopLocked() :
     //      "Must hold write Lock: key = " + key;
-    partitions.put(key, newPart);
+    PartitionEntry oldPart = partitions.put(key, newPart);
+    assert oldPart == null :
+      "RangeMap already has a partition associated with " + key;
 
     LOG.debug("Total GSet size = {}", size);
     LOG.debug("Number of partitions = {}", partitions.size());
@@ -173,7 +181,7 @@ public class PartitionedGSet<K, E extends K> implements GSet<K, E> {
 
   private PartitionEntry addNewPartitionIfNeeded(
       PartitionEntry curPart, K key) {
-    if(curPart.size() < DEFAULT_PARTITION_CAPACITY * 1.1
+    if(curPart.size() < DEFAULT_PARTITION_CAPACITY * DEFAULT_PARTITION_OVERFLOW
         || curPart.contains(key)) {
       return curPart;
     }
@@ -197,12 +205,56 @@ public class PartitionedGSet<K, E extends K> implements GSet<K, E> {
   public void clear() {
     LOG.error("Total GSet size = {}", size);
     LOG.error("Number of partitions = {}", partitions.size());
+    printStats();
     // assert latchLock.hasWriteTopLock() : "Must hold write topLock";
     // SHV May need to clear all partitions?
     partitions.clear();
     size = 0;
   }
 
+  private void printStats() {
+    int partSizeMin = Integer.MAX_VALUE, partSizeAvg = 0, partSizeMax = 0;
+    long totalSize = 0;
+    int numEmptyPartitions = 0, numFullPartitions = 0;
+    Collection<PartitionEntry> parts = partitions.values();
+    Set<Entry<K, PartitionEntry>> entries = partitions.entrySet();
+    int i = 0;
+    for(Entry<K, PartitionEntry> e : entries) {
+      PartitionEntry part = e.getValue();
+      int s = part.size;
+      if(s == 0) numEmptyPartitions++;
+      if(s > DEFAULT_PARTITION_CAPACITY) numFullPartitions++;
+      totalSize += s;
+      partSizeMin = (s < partSizeMin ? s : partSizeMin);
+      partSizeMax = (partSizeMax < s ? s : partSizeMax);
+      Class<?> inodeClass = e.getKey().getClass();
+      try {
+        long[] key = (long[]) inodeClass.
+            getMethod("getNamespaceKey", int.class).invoke(e.getKey(), 2);
+        long[] firstKey = new long[0];
+        if(part.iterator().hasNext()) {
+          Object first = part.iterator().next();
+          firstKey = (long[]) inodeClass.getMethod(
+            "getNamespaceKey", int.class).invoke(first, 2);
+          Object parent = inodeClass.
+              getMethod("getParent").invoke(first);
+          long parentId = (parent == null ? 0L :
+            (long) inodeClass.getMethod("getId").invoke(parent));
+          firstKey[0] = parentId;
+        }
+        LOG.error("Partition #{}\t key: {}\t size: {}\t first: {}",
+            i++, key, s, firstKey);  // SHV should be info
+      } catch (Exception ex) {
+        LOG.error("Cannot find Method getNamespaceKey() in {}", inodeClass);
+      }
+    }
+    partSizeAvg = (int) (totalSize / parts.size());
+    LOG.error("Partition sizes: min = {}, avg = {}, max = {}, sum = {}",
+        partSizeMin, partSizeAvg, partSizeMax, totalSize);
+    LOG.error("Number of partitions: empty = {}, full = {}",
+        numEmptyPartitions, numFullPartitions);
+  }
+
   @Override
   public Collection<E> values() {
     // TODO Auto-generated method stub
@@ -234,15 +286,19 @@ public class PartitionedGSet<K, E extends K> implements GSet<K, E> {
 
     @Override
     public boolean hasNext() {
-      if(partitionIterator.hasNext()) {
-        return true;
+      while(!partitionIterator.hasNext()) {
+        if(!keyIterator.hasNext()) {
+          return false;
+        }
+        K curKey = keyIterator.next();
+        partitionIterator = getPartition(curKey).iterator();
       }
-      return keyIterator.hasNext();
+      return partitionIterator.hasNext();
     }
 
     @Override
     public E next() {
-      if(!partitionIterator.hasNext()) {
+      while(!partitionIterator.hasNext()) {
         K curKey = keyIterator.next();
         partitionIterator = getPartition(curKey).iterator();
       }
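
Note on the iterator hunk above: the old hasNext() returned true whenever keyIterator still had keys, even if every remaining partition was empty, so a later next() could fail. The new loop advances past empty partitions before answering. Below is a minimal self-contained sketch of the same skip-empty-partitions pattern; the List-of-lists and the CrossPartitionIterator/partIter/elemIter names are illustrative stand-ins, not the real TreeMap-of-PartitionEntry structure.

    import java.util.Collections;
    import java.util.Iterator;
    import java.util.List;
    import java.util.NoSuchElementException;

    // Sketch: iterate across partitions while skipping empty ones, so that
    // hasNext() never promises an element that next() cannot deliver.
    class CrossPartitionIterator<E> implements Iterator<E> {
      private final Iterator<List<E>> partIter;  // stand-in for keyIterator
      private Iterator<E> elemIter = Collections.emptyIterator();

      CrossPartitionIterator(List<List<E>> partitions) {
        this.partIter = partitions.iterator();
      }

      @Override
      public boolean hasNext() {
        // A loop, not a single if: consecutive partitions may all be empty.
        while (!elemIter.hasNext()) {
          if (!partIter.hasNext()) {
            return false;
          }
          elemIter = partIter.next().iterator();
        }
        return true;
      }

      @Override
      public E next() {
        if (!hasNext()) {
          throw new NoSuchElementException();
        }
        return elemIter.next();
      }
    }

(PartitionedGSet lives in hadoop-common while INode lives in hadoop-hdfs, which is presumably why printStats() above reaches getNamespaceKey() and getParent() through reflection rather than a compile-time dependency.)
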
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirMkdirOp.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirMkdirOp.java
index c8c6277..5a40906 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirMkdirOp.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirMkdirOp.java
@@ -265,10 +265,13 @@ class FSDirMkdirOp {
     // create the missing directories along the path
     INode[] missing = new INode[numMissing];
     final int last = iip.length();
+    INode parent = existing.getLastINode();
     for (int i = existing.length();  i < last; i++) {
       byte[] component = iip.getPathComponent(i);
       missing[i - existing.length()] =
           createDirectoryINode(fsd, existing, component, perm);
+      missing[i - existing.length()].setParent(parent.asDirectory());
+      parent = missing[i - existing.length()];
     }
     return missing;
   }
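
The loop above now sets each missing directory's parent to the INode created on the previous iteration, so the parent chain is walkable (e.g. by getNamespaceKey(), below) even before the new nodes are attached to the directory tree. A hedged sketch of the same chaining, with a toy Node type standing in for INode:

    // Toy stand-in for INode; only the parent chaining matters here.
    class Node {
      final String name;
      Node parent;
      Node(String name) { this.name = name; }
    }

    class MkdirChainSketch {
      // Each missing directory's parent is the node created on the previous
      // iteration, starting from the deepest existing ancestor.
      static Node[] createMissing(Node lastExisting, String[] components) {
        Node[] missing = new Node[components.length];
        Node parent = lastExisting;
        for (int i = 0; i < components.length; i++) {
          missing[i] = new Node(components[i]);
          missing[i].parent = parent;
          parent = missing[i];
        }
        return missing;
      }

      public static void main(String[] args) {
        Node[] m = createMissing(new Node("/"), new String[] {"a", "b", "c"});
        System.out.println(m[2].parent.parent.parent.name);  // prints "/"
      }
    }
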
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INode.java
index daff95c..42e462e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INode.java
@@ -47,6 +47,7 @@ import org.slf4j.LoggerFactory;
 import java.io.PrintStream;
 import java.io.PrintWriter;
 import java.io.StringWriter;
+import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
 
@@ -577,6 +578,43 @@ public abstract class INode implements INodeAttributes, Diff.Element<byte[]> {
     return name == null? null: DFSUtil.bytes2String(name);
   }
 
+  private long[] namespaceKey;
+
+  /**
+   * Key of an INode.
+   * Defines partitioning of INodes in the INodeMap.
+   *
+   * @param level how many levels to be included in the key
+   * @return
+   */
+  public long[] getNamespaceKey(int level) {
+    if(namespaceKey == null) {  // generate the namespace key
+      long[] buf = new long[level];
+      INode cur = this;
+      for(int l = 0; l < level; l++) {
+        long curId = (cur == null) ? 0L : cur.getId();
+        buf[level - l - 1] = curId;
+        cur =  (cur == null) ? null : cur.parent;
+      }
+      buf[0] = indexOf(buf);
+      namespaceKey = buf;
+    }
+    return namespaceKey;
+  }
+
+  private final static long LARGE_PRIME = 512927357;
+  public static long indexOf(long[] key) {
+    if(key[key.length-1] == INodeId.ROOT_INODE_ID) {
+      return key[0];
+    }
+    long idx = LARGE_PRIME * key[0];
+    idx = (idx ^ (idx >> 32)) & (INodeMap.NUM_RANGES_STATIC -1);
+    return idx;
+  }
+
+  /**
+   * Key of a snapshot Diff Element
+   */
   @Override
   public final byte[] getKey() {
     return getLocalNameBytes();
@@ -636,7 +674,7 @@ public abstract class INode implements INodeAttributes, Diff.Element<byte[]> {
 
   @Override
   public String toString() {
-    return getLocalName();
+    return getLocalName() + ": " + Arrays.toString(namespaceKey);
   }
 
   @VisibleForTesting
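
For reference, getNamespaceKey()/indexOf() above build a depth-2 key [parentId, inodeId] and then replace the parent id with a hashed range index in [0, NUM_RANGES_STATIC). A standalone sketch of that arithmetic (ROOT_INODE_ID is 16385 in HDFS; the sample ids in main() are made up):

    import java.util.Arrays;

    class NamespaceKeySketch {
      static final long LARGE_PRIME = 512927357;
      static final int NUM_RANGES_STATIC = 256;  // power of 2, as in INodeMap
      static final long ROOT_INODE_ID = 16385;   // INodeId.ROOT_INODE_ID

      // Same arithmetic as INode.indexOf(): fold the parent id into one of
      // NUM_RANGES_STATIC ranges. A key ending in the root id is returned
      // raw, which keeps the 256 static range keys (see INodeMap below)
      // pinned at their range index.
      static long indexOf(long[] key) {
        if (key[key.length - 1] == ROOT_INODE_ID) {
          return key[0];
        }
        long idx = LARGE_PRIME * key[0];
        return (idx ^ (idx >> 32)) & (NUM_RANGES_STATIC - 1);
      }

      public static void main(String[] args) {
        long[] key = {16386, 16388};  // [parentId, inodeId], hypothetical ids
        key[0] = indexOf(key);        // parent id replaced by its range index
        System.out.println(Arrays.toString(key));
      }
    }
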
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeMap.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeMap.java
index 88c3233..3b07dce 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeMap.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeMap.java
@@ -29,14 +29,63 @@ import org.apache.hadoop.util.GSet;
 import org.apache.hadoop.util.LatchLock;
 import org.apache.hadoop.util.LightWeightGSet;
 import org.apache.hadoop.util.PartitionedGSet;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 /**
  * Storing all the {@link INode}s and maintaining the mapping between INode ID
  * and INode.  
  */
 public class INodeMap {
+  static final int NAMESPACE_KEY_DEBTH = 2;
+  static final int NUM_RANGES_STATIC = 256;  // power of 2
+
+  public static class INodeKeyComparator implements Comparator<INode> {
+    INodeKeyComparator() {
+      FSDirectory.LOG.info("Namespace key debth = {}", NAMESPACE_KEY_DEBTH);
+    }
+
+    @Override
+    public int compare(INode i1, INode i2) {
+      if (i1 == null || i2 == null) {
+        throw new NullPointerException("Cannot compare null INodes");
+      }
+      long[] key1 = i1.getNamespaceKey(NAMESPACE_KEY_DEBTH);
+      long[] key2 = i2.getNamespaceKey(NAMESPACE_KEY_DEBTH);
+      for(int l = 0; l < NAMESPACE_KEY_DEBTH; l++) {
+        if(key1[l] == key2[l]) continue;
+        return (key1[l] < key2[l] ? -1 : 1);
+      }
+      return 0;
+    }
+  }
+
+  /**
+   * INodeKeyComparator with Hashed Parent
+   *
+   */
+  public static class HPINodeKeyComparator implements Comparator<INode> {
+    HPINodeKeyComparator() {
+      FSDirectory.LOG.info("Namespace key debth = {}", NAMESPACE_KEY_DEBTH);
+    }
+
+    @Override
+    public int compare(INode i1, INode i2) {
+      if (i1 == null || i2 == null) {
+        throw new NullPointerException("Cannot compare null INodes");
+      }
+      long[] key1 = i1.getNamespaceKey(NAMESPACE_KEY_DEBTH);
+      long[] key2 = i2.getNamespaceKey(NAMESPACE_KEY_DEBTH);
+      long key1_0 = INode.indexOf(key1);
+      long key2_0 = INode.indexOf(key2);
+      if(key1_0 != key2_0)
+        return (key1_0 < key2_0 ? -1 : 1);
+      for(int l = 1; l < NAMESPACE_KEY_DEBTH; l++) {
+        if(key1[l] == key2[l]) continue;
+        return (key1[l] < key2[l] ? -1 : 1);
+      }
+      return 0;
+    }
+  }
+
   public static class INodeIdComparator implements Comparator<INode> {
     @Override
     public int compare(INode i1, INode i2) {
@@ -50,8 +99,6 @@ public class INodeMap {
   }
 
   public class INodeMapLock extends LatchLock<ReentrantReadWriteLock> {
-    Logger LOG = LoggerFactory.getLogger(INodeMapLock.class);
-
     private ReentrantReadWriteLock childLock;
 
     INodeMapLock() {
@@ -146,8 +193,22 @@ public class INodeMap {
     this.namesystem = ns;
     // Compute the map capacity by allocating 1% of total memory
     int capacity = LightWeightGSet.computeCapacity(1, "INodeMap");
-    this.map = new PartitionedGSet<>(capacity, new INodeIdComparator(),
+    this.map = new PartitionedGSet<>(capacity, new INodeKeyComparator(),
             new INodeMapLock(), rootDir);
+
+    // Pre-populate initial empty partitions
+    PartitionedGSet<INode, INodeWithAdditionalFields> pgs =
+        (PartitionedGSet<INode, INodeWithAdditionalFields>) map;
+    PermissionStatus perm = new PermissionStatus(
+        "", "", new FsPermission((short) 0));
+    for(int p = 0; p < NUM_RANGES_STATIC; p++) {
+      INodeDirectory key = new INodeDirectory(
+          INodeId.ROOT_INODE_ID, "range key".getBytes(), perm, 0);
+      key.setParent(new INodeDirectory((long)p, null, perm, 0));
+      pgs.addNewPartition(key);
+    }
+
+    map.put(rootDir);
   }
 
   /**
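
Taken together, INodeKeyComparator and the pre-populated partitions above give the PartitionedGSet 256 fixed ranges keyed by the hashed parent id. A hedged sketch of how such range keys behave in a plain TreeMap: long[2] arrays stand in for INodes, floorEntry plays the role of PartitionedGSet locating the partition for a key, and the range keys here end in 0 rather than ROOT_INODE_ID (a simplification that does not change the floor lookup).

    import java.util.Comparator;
    import java.util.TreeMap;

    class KeyRangeSketch {
      public static void main(String[] args) {
        // Lexicographic comparison of depth-2 keys, as in INodeKeyComparator.
        Comparator<long[]> cmp = (k1, k2) -> {
          for (int l = 0; l < 2; l++) {
            if (k1[l] != k2[l]) {
              return k1[l] < k2[l] ? -1 : 1;
            }
          }
          return 0;
        };
        TreeMap<long[], String> partitions = new TreeMap<>(cmp);
        for (long p = 0; p < 256; p++) {        // 256 static range keys
          partitions.put(new long[] {p, 0}, "partition-" + p);
        }
        long[] inodeKey = {42, 16388};  // hashed parent range 42, inode 16388
        // The containing partition is the greatest range key <= inodeKey:
        System.out.println(partitions.floorEntry(inodeKey).getValue());
      }
    }
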
