You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@accumulo.apache.org by GitBox <gi...@apache.org> on 2022/09/14 12:37:44 UTC

[GitHub] [accumulo] milleruntime commented on a diff in pull request #2900: Update "du" command to compute disk usage by scanning the metadata table

milleruntime commented on code in PR #2900:
URL: https://github.com/apache/accumulo/pull/2900#discussion_r970716726


##########
core/src/main/java/org/apache/accumulo/core/util/tables/TableDiskUsage.java:
##########
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.accumulo.core.util.tables;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import org.apache.accumulo.core.clientImpl.ClientContext;
+import org.apache.accumulo.core.data.TableId;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public abstract class TableDiskUsage {
+
+  private static final Logger log = LoggerFactory.getLogger(TableDiskUsage.class);
+  private int nextInternalId = 0;
+  private Map<TableId,Integer> internalIds = new HashMap<>();
+  private Map<Integer,TableId> externalIds = new HashMap<>();
+  private Map<String,Integer[]> tableFiles = new HashMap<>();
+  private Map<String,Long> fileSizes = new HashMap<>();
+
+  protected void addTableIfAbsent(TableId tableId) {
+    if (!internalIds.containsKey(tableId)) {
+      addTable(tableId);
+    }
+  }
+
+  protected void addTable(TableId tableId) {
+    if (internalIds.containsKey(tableId)) {
+      throw new IllegalArgumentException("Already added table " + tableId);
+    }
+
+    // Keep an internal counter for each table added
+    int iid = nextInternalId++;
+
+    // Store the table id to the internal id
+    internalIds.put(tableId, iid);
+    // Store the internal id to the table id
+    externalIds.put(iid, tableId);
+  }
+
+  protected void linkFileAndTable(TableId tableId, String file) {
+    // get the internal id for this table
+    int internalId = internalIds.get(tableId);
+
+    // Initialize a bitset for tables (internal IDs) that reference this file
+    Integer[] tables = tableFiles.get(file);
+    if (tables == null) {
+      tables = new Integer[internalIds.size()];
+      for (int i = 0; i < tables.length; i++) {
+        tables[i] = 0;
+      }
+      tableFiles.put(file, tables);
+    }
+
+    // Update the bitset to track that this table has seen this file
+    tables[internalId] = 1;
+  }
+
+  protected void addFileSize(String file, long size) {
+    fileSizes.put(file, size);
+  }
+
+  protected Map<List<TableId>,Long> calculateSharedUsage() {

Review Comment:
   ```suggestion
     private Map<List<TableId>,Long> calculateSharedUsage() {
   ```



##########
server/base/src/main/java/org/apache/accumulo/server/client/ClientServiceHandler.java:
##########
@@ -439,10 +440,10 @@ public List<TDiskUsage> getDiskUsage(Set<String> tables, TCredentials credential
       }
 
       // use the same set of tableIds that were validated above to avoid race conditions
-      Map<TreeSet<String>,Long> diskUsage =
-          TableDiskUsage.getDiskUsage(tableIds, context.getVolumeManager(), context);
+      Map<SortedSet<String>,Long> diskUsage =
+          HdfsTableDiskUsage.getDiskUsage(tableIds, context.getVolumeManager(), context);

Review Comment:
   ```suggestion
         HdfsTableDiskUsage hdfsTableDiskUsage = new HdfsTableDiskUsage();
         Map<SortedSet<String>,Long> diskUsage =
             hdfsTableDiskUsage.getDiskUsage(tableIds, context.getVolumeManager(), context);
   ```



##########
core/src/main/java/org/apache/accumulo/core/util/tables/TableDiskUsage.java:
##########
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.accumulo.core.util.tables;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import org.apache.accumulo.core.clientImpl.ClientContext;
+import org.apache.accumulo.core.data.TableId;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public abstract class TableDiskUsage {
+
+  private static final Logger log = LoggerFactory.getLogger(TableDiskUsage.class);
+  private int nextInternalId = 0;
+  private Map<TableId,Integer> internalIds = new HashMap<>();
+  private Map<Integer,TableId> externalIds = new HashMap<>();
+  private Map<String,Integer[]> tableFiles = new HashMap<>();
+  private Map<String,Long> fileSizes = new HashMap<>();
+
+  protected void addTableIfAbsent(TableId tableId) {
+    if (!internalIds.containsKey(tableId)) {
+      addTable(tableId);
+    }
+  }
+
+  protected void addTable(TableId tableId) {
+    if (internalIds.containsKey(tableId)) {
+      throw new IllegalArgumentException("Already added table " + tableId);
+    }
+
+    // Keep an internal counter for each table added
+    int iid = nextInternalId++;
+
+    // Store the table id to the internal id
+    internalIds.put(tableId, iid);
+    // Store the internal id to the table id
+    externalIds.put(iid, tableId);
+  }
+
+  protected void linkFileAndTable(TableId tableId, String file) {
+    // get the internal id for this table
+    int internalId = internalIds.get(tableId);
+
+    // Initialize a bitset for tables (internal IDs) that reference this file
+    Integer[] tables = tableFiles.get(file);
+    if (tables == null) {
+      tables = new Integer[internalIds.size()];
+      for (int i = 0; i < tables.length; i++) {
+        tables[i] = 0;
+      }
+      tableFiles.put(file, tables);
+    }
+
+    // Update the bitset to track that this table has seen this file
+    tables[internalId] = 1;
+  }
+
+  protected void addFileSize(String file, long size) {
+    fileSizes.put(file, size);
+  }
+
+  protected Map<List<TableId>,Long> calculateSharedUsage() {
+    // Bitset of tables that contain a file and total usage by all files that share that usage
+    Map<List<Integer>,Long> usage = new HashMap<>();
+
+    if (log.isTraceEnabled()) {
+      log.trace("fileSizes {}", fileSizes);
+    }
+    // For each file w/ referenced-table bitset
+    for (Entry<String,Integer[]> entry : tableFiles.entrySet()) {
+      if (log.isTraceEnabled()) {
+        log.trace("file {} table bitset {}", entry.getKey(), Arrays.toString(entry.getValue()));
+      }
+      List<Integer> key = Arrays.asList(entry.getValue());
+      Long size = fileSizes.get(entry.getKey());
+
+      Long tablesUsage = usage.getOrDefault(key, 0L);
+      tablesUsage += size;
+      usage.put(key, tablesUsage);
+    }
+
+    final Map<List<TableId>,Long> externalUsage = new HashMap<>();
+
+    for (Entry<List<Integer>,Long> entry : usage.entrySet()) {
+      List<TableId> externalKey = new ArrayList<>();
+      List<Integer> key = entry.getKey();
+      // table bitset
+      for (int i = 0; i < key.size(); i++)
+        if (key.get(i) != 0) {
+          // Convert by internal id to the table id
+          externalKey.add(externalIds.get(i));
+        }
+
+      // list of table ids and size of files shared across the tables
+      externalUsage.put(externalKey, entry.getValue());
+    }
+
+    // mapping of all enumerations of files being referenced by tables and total size of files
+    // that share the same reference
+    return externalUsage;
+  }
+
+  protected static SortedMap<SortedSet<String>,Long> buildSharedUsageMap(final TableDiskUsage tdu,

Review Comment:
   ```suggestion
     protected SortedMap<SortedSet<String>,Long> buildSharedUsageMap(final TableDiskUsage tdu,
   ```



##########
core/src/main/java/org/apache/accumulo/core/util/tables/MetadataTableDiskUsage.java:
##########
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.accumulo.core.util.tables;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+
+import org.apache.accumulo.core.client.AccumuloClient;
+import org.apache.accumulo.core.client.Scanner;
+import org.apache.accumulo.core.client.TableNotFoundException;
+import org.apache.accumulo.core.client.admin.TableDiskUsageResult;
+import org.apache.accumulo.core.clientImpl.ClientContext;
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.TableId;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.dataImpl.KeyExtent;
+import org.apache.accumulo.core.metadata.MetadataTable;
+import org.apache.accumulo.core.metadata.RootTable;
+import org.apache.accumulo.core.metadata.TabletFile;
+import org.apache.accumulo.core.metadata.schema.DataFileValue;
+import org.apache.accumulo.core.metadata.schema.MetadataSchema;
+import org.apache.accumulo.core.security.Authorizations;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * This utility class will scan the Accumulo Metadata table to compute the disk usage for a table or
+ * table(s) by using the size value stored in columns that contain the column family
+ * {@link MetadataSchema.TabletsSection.DataFileColumnFamily}.
+ *
+ * This class will also optionally track shared files to compute shared usage across all tables
+ * that are provided as part of the Set of tables when getting disk usage.
+ *
+ * Because the metadata table is used for computing usage and not the actual files in HDFS the
+ * results will be an estimate. Older entries may exist with no file metadata (resulting in size 0)
+ * and other actions in the cluster can impact the estimated size such as flushes, tablet splits,
+ * compactions, etc.
+ *
+ * For the most accurate information a compaction should first be run on the set of tables being
+ * computed.
+ */
+public class MetadataTableDiskUsage extends TableDiskUsage {
+
+  public MetadataTableDiskUsage(final Set<TableId> tableIds) {
+    // Add each tableID
+    Objects.requireNonNull(tableIds).forEach(tableId -> addTable(tableId));
+  }
+
+  /**
+   * Compute the estimated disk usage for the given set of tables by scanning the Metadata table for
+   * file sizes. Also will compute shared usage across tables.
+   *
+   * @param tableNames
+   *          set of tables to compute an estimated disk usage for
+   * @param auths
+   *          authorizations to scan the metadata table
+   * @return the computed estimated usage results
+   *
+   * @throws TableNotFoundException
+   *           if the table(s) do not exist
+   */
+  public static TableDiskUsageResult getDiskUsage(Set<String> tableNames, AccumuloClient client,

Review Comment:
   ```suggestion
     public TableDiskUsageResult getDiskUsage(Set<String> tableNames, AccumuloClient client,
   ```



##########
core/src/main/java/org/apache/accumulo/core/util/tables/MetadataTableDiskUsage.java:
##########
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.accumulo.core.util.tables;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+
+import org.apache.accumulo.core.client.AccumuloClient;
+import org.apache.accumulo.core.client.Scanner;
+import org.apache.accumulo.core.client.TableNotFoundException;
+import org.apache.accumulo.core.client.admin.TableDiskUsageResult;
+import org.apache.accumulo.core.clientImpl.ClientContext;
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.TableId;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.dataImpl.KeyExtent;
+import org.apache.accumulo.core.metadata.MetadataTable;
+import org.apache.accumulo.core.metadata.RootTable;
+import org.apache.accumulo.core.metadata.TabletFile;
+import org.apache.accumulo.core.metadata.schema.DataFileValue;
+import org.apache.accumulo.core.metadata.schema.MetadataSchema;
+import org.apache.accumulo.core.security.Authorizations;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * This utility class will scan the Accumulo Metadata table to compute the disk usage for a table or
+ * table(s) by using the size value stored in columns that contain the column family
+ * {@link MetadataSchema.TabletsSection.DataFileColumnFamily}.
+ *
+ * This class will also optionally track shared files to compute shared usage across all tables
+ * that are provided as part of the Set of tables when getting disk usage.
+ *
+ * Because the metadata table is used for computing usage and not the actual files in HDFS the
+ * results will be an estimate. Older entries may exist with no file metadata (resulting in size 0)
+ * and other actions in the cluster can impact the estimated size such as flushes, tablet splits,
+ * compactions, etc.
+ *
+ * For the most accurate information a compaction should first be run on the set of tables being
+ * computed.
+ */
+public class MetadataTableDiskUsage extends TableDiskUsage {
+
+  public MetadataTableDiskUsage(final Set<TableId> tableIds) {
+    // Add each tableID
+    Objects.requireNonNull(tableIds).forEach(tableId -> addTable(tableId));
+  }
+
+  /**
+   * Compute the estimated disk usage for the given set of tables by scanning the Metadata table for
+   * file sizes. Also will compute shared usage across tables.
+   *
+   * @param tableNames
+   *          set of tables to compute an estimated disk usage for
+   * @param auths
+   *          authorizations to scan the metadata table
+   * @return the computed estimated usage results
+   *
+   * @throws TableNotFoundException
+   *           if the table(s) do not exist
+   */
+  public static TableDiskUsageResult getDiskUsage(Set<String> tableNames, AccumuloClient client,
+      Authorizations auths) throws TableNotFoundException {
+    return getDiskUsage(tableNames, true, client, auths);
+  }
+
+  /**
+   * Compute the estimated disk usage for the given set of tables by scanning the Metadata table for
+   * file sizes. Optionally computes shared usage across tables.
+   *
+   * @param tableNames
+   *          set of tables to compute an estimated disk usage for
+   * @param computeShared
+   *          whether to compute size metrics across shared files
+   * @param auths
+   *          authorizations to scan the metadata table
+   * @return the computed estimated usage results
+   *
+   * @throws TableNotFoundException
+   *           if the table(s) do not exist
+   */
+  public static TableDiskUsageResult getDiskUsage(Set<String> tableNames, boolean computeShared,

Review Comment:
   ```suggestion
     public TableDiskUsageResult getDiskUsage(Set<String> tableNames, boolean computeShared,
   ```



##########
server/base/src/main/java/org/apache/accumulo/server/util/HdfsTableDiskUsage.java:
##########
@@ -308,13 +184,12 @@ static class Opts extends ServerUtilOpts {
 
   public static void main(String[] args) throws Exception {
     Opts opts = new Opts();
-    opts.parseArgs(TableDiskUsage.class.getName(), args);
-    Span span = TraceUtil.startSpan(TableDiskUsage.class, "main");
+    opts.parseArgs(HdfsTableDiskUsage.class.getName(), args);
+    Span span = TraceUtil.startSpan(HdfsTableDiskUsage.class, "main");
     try (Scope scope = span.makeCurrent()) {
       try (AccumuloClient client = Accumulo.newClient().from(opts.getClientProps()).build()) {
         VolumeManager fs = opts.getServerContext().getVolumeManager();
-        org.apache.accumulo.server.util.TableDiskUsage.printDiskUsage(opts.tables, fs, client,
-            false);
+        HdfsTableDiskUsage.printDiskUsage(opts.tables, fs, client, false);

Review Comment:
   ```suggestion
           HdfsTableDiskUsage hdfsTableDiskUsage = new HdfsTableDiskUsage();
           hdfsTableDiskUsage.printDiskUsage(opts.tables, fs, client, false);
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@accumulo.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org