Posted to commits@accumulo.apache.org by dl...@apache.org on 2022/09/26 11:10:29 UTC

[accumulo] branch main updated: Update "du" command to compute disk usage by scanning the metadata table (#2900)

This is an automated email from the ASF dual-hosted git repository.

dlmarion pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/accumulo.git


The following commit(s) were added to refs/heads/main by this push:
     new b5c57f0c1d Update "du" command to compute disk usage by scanning the metadata table (#2900)
b5c57f0c1d is described below

commit b5c57f0c1de5e7dbe8dcaf9eab397c12599bac7d
Author: Christopher L. Shannon <ch...@gmail.com>
AuthorDate: Mon Sep 26 07:10:23 2022 -0400

    Update "du" command to compute disk usage by scanning the metadata table (#2900)
    
    This commit changes the source of the file size information used to
    compute shared usage across tables: sizes now come from the metadata
    table instead of HDFS, which improves performance. The result is an
    estimate because the metadata table contains estimated sizes that
    depend on what is happening in the cluster. Running a compaction
    before running the command gives the most accurate information.
    
    Closes #2820
---
 .../accumulo/core/client/admin/DiskUsage.java      |   6 +-
 .../core/client/admin/TableOperations.java         |  14 +-
 .../server/client/ClientServiceHandler.java        |   9 +-
 .../accumulo/server/util/TableDiskUsage.java       | 179 +++++++-----
 .../accumulo/server/util/TableDiskUsageTest.java   | 306 +++++++++++++++++++++
 .../apache/accumulo/shell/commands/DUCommand.java  |  23 +-
 .../apache/accumulo/test/TableOperationsIT.java    |   2 +-
 7 files changed, 450 insertions(+), 89 deletions(-)
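
Before the per-file diffs, a minimal sketch of how a client can exercise the new metadata-based
estimate through the public API (an illustrative addition, not part of this commit; the table
names and the client properties path are hypothetical):

    import java.util.Set;

    import org.apache.accumulo.core.client.Accumulo;
    import org.apache.accumulo.core.client.AccumuloClient;
    import org.apache.accumulo.core.client.admin.DiskUsage;

    public class DiskUsageExample {
      public static void main(String[] args) throws Exception {
        try (AccumuloClient client =
            Accumulo.newClient().from("/path/to/accumulo-client.properties").build()) {
          // Sizes come from the metadata table, so they are estimates; compacting the
          // tables first yields the most accurate numbers.
          for (DiskUsage du : client.tableOperations().getDiskUsage(Set.of("table1", "table2"))) {
            System.out.println(du.getTables() + " -> " + du.getUsage() + " bytes");
          }
        }
      }
    }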

diff --git a/core/src/main/java/org/apache/accumulo/core/client/admin/DiskUsage.java b/core/src/main/java/org/apache/accumulo/core/client/admin/DiskUsage.java
index 85a2983eef..a82e00a59e 100644
--- a/core/src/main/java/org/apache/accumulo/core/client/admin/DiskUsage.java
+++ b/core/src/main/java/org/apache/accumulo/core/client/admin/DiskUsage.java
@@ -24,9 +24,9 @@ import java.util.SortedSet;
 public class DiskUsage {
 
   protected final SortedSet<String> tables;
-  protected Long usage;
+  protected long usage;
 
-  public DiskUsage(SortedSet<String> tables, Long usage) {
+  public DiskUsage(SortedSet<String> tables, long usage) {
     this.tables = tables;
     this.usage = usage;
   }
@@ -35,7 +35,7 @@ public class DiskUsage {
     return tables;
   }
 
-  public Long getUsage() {
+  public long getUsage() {
     return usage;
   }
 
diff --git a/core/src/main/java/org/apache/accumulo/core/client/admin/TableOperations.java b/core/src/main/java/org/apache/accumulo/core/client/admin/TableOperations.java
index e6d74f75a1..3ce87e2886 100644
--- a/core/src/main/java/org/apache/accumulo/core/client/admin/TableOperations.java
+++ b/core/src/main/java/org/apache/accumulo/core/client/admin/TableOperations.java
@@ -1076,11 +1076,21 @@ public interface TableOperations {
       throws AccumuloException, TableNotFoundException;
 
   /**
-   * Gets the number of bytes being used in the files for a set of tables
+   * Gets the number of bytes being used by the files for a set of tables. This operation will scan
+   * the metadata table for file size information to compute the size metrics for the tables.
+   *
+   * Because the metadata table is used for computing usage and not the actual files in HDFS, the
+   * results will be an estimate. Older entries may exist with no file metadata (resulting in size
+   * 0) and other actions in the cluster can impact the estimated size such as flushes, tablet
+   * splits, compactions, etc.
+   *
+   * For more accurate information a compaction should first be run on all files for the set of
+   * tables being computed.
    *
    * @param tables
    *          a set of tables
-   * @return a list of disk usage objects containing linked table names and sizes
+   * @return a list of disk usage objects containing linked table names and sizes for the given set
+   *         of tables
    * @since 1.6.0
    */
   List<DiskUsage> getDiskUsage(Set<String> tables)
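
As the javadoc above suggests, compacting first gives a more accurate answer. A hedged sketch of
a caller doing that (again with hypothetical table names and properties path; flushing and
waiting via CompactionConfig is one reasonable way to bring the metadata sizes up to date before
measuring):

    import java.util.Set;

    import org.apache.accumulo.core.client.Accumulo;
    import org.apache.accumulo.core.client.AccumuloClient;
    import org.apache.accumulo.core.client.admin.CompactionConfig;
    import org.apache.accumulo.core.client.admin.DiskUsage;

    public class CompactThenDiskUsage {
      public static void main(String[] args) throws Exception {
        Set<String> tables = Set.of("table1", "table2");
        try (AccumuloClient client =
            Accumulo.newClient().from("/path/to/accumulo-client.properties").build()) {
          // Flush and compact each table, waiting for completion, so the file sizes
          // recorded in the metadata table reflect the data actually on disk.
          for (String table : tables) {
            client.tableOperations().compact(table,
                new CompactionConfig().setFlush(true).setWait(true));
          }
          for (DiskUsage du : client.tableOperations().getDiskUsage(tables)) {
            // An entry whose getTables() lists more than one table is space held in
            // files shared by exactly that set of tables.
            System.out.println(du.getTables() + " -> " + du.getUsage() + " bytes");
          }
        }
      }
    }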
diff --git a/server/base/src/main/java/org/apache/accumulo/server/client/ClientServiceHandler.java b/server/base/src/main/java/org/apache/accumulo/server/client/ClientServiceHandler.java
index 112e8ede41..c7962f0362 100644
--- a/server/base/src/main/java/org/apache/accumulo/server/client/ClientServiceHandler.java
+++ b/server/base/src/main/java/org/apache/accumulo/server/client/ClientServiceHandler.java
@@ -27,7 +27,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
-import java.util.TreeSet;
+import java.util.SortedSet;
 
 import org.apache.accumulo.core.classloader.ClassLoaderUtil;
 import org.apache.accumulo.core.client.AccumuloSecurityException;
@@ -439,15 +439,14 @@ public class ClientServiceHandler implements ClientService.Iface {
       }
 
       // use the same set of tableIds that were validated above to avoid race conditions
-      Map<TreeSet<String>,Long> diskUsage =
-          TableDiskUsage.getDiskUsage(tableIds, context.getVolumeManager(), context);
+      Map<SortedSet<String>,Long> diskUsage = TableDiskUsage.getDiskUsage(tableIds, context);
       List<TDiskUsage> retUsages = new ArrayList<>();
-      for (Map.Entry<TreeSet<String>,Long> usageItem : diskUsage.entrySet()) {
+      for (Map.Entry<SortedSet<String>,Long> usageItem : diskUsage.entrySet()) {
         retUsages.add(new TDiskUsage(new ArrayList<>(usageItem.getKey()), usageItem.getValue()));
       }
       return retUsages;
 
-    } catch (TableNotFoundException | IOException e) {
+    } catch (TableNotFoundException e) {
       throw new TException(e);
     }
   }
diff --git a/server/base/src/main/java/org/apache/accumulo/server/util/TableDiskUsage.java b/server/base/src/main/java/org/apache/accumulo/server/util/TableDiskUsage.java
index be057ade8f..0046caa121 100644
--- a/server/base/src/main/java/org/apache/accumulo/server/util/TableDiskUsage.java
+++ b/server/base/src/main/java/org/apache/accumulo/server/util/TableDiskUsage.java
@@ -29,6 +29,8 @@ import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
+import java.util.SortedMap;
+import java.util.SortedSet;
 import java.util.TreeMap;
 import java.util.TreeSet;
 
@@ -42,25 +44,40 @@ import org.apache.accumulo.core.data.TableId;
 import org.apache.accumulo.core.data.Value;
 import org.apache.accumulo.core.dataImpl.KeyExtent;
 import org.apache.accumulo.core.metadata.MetadataTable;
-import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.DataFileColumnFamily;
+import org.apache.accumulo.core.metadata.RootTable;
+import org.apache.accumulo.core.metadata.TabletFile;
+import org.apache.accumulo.core.metadata.schema.DataFileValue;
+import org.apache.accumulo.core.metadata.schema.MetadataSchema;
 import org.apache.accumulo.core.security.Authorizations;
 import org.apache.accumulo.core.trace.TraceUtil;
 import org.apache.accumulo.core.util.NumUtil;
 import org.apache.accumulo.server.cli.ServerUtilOpts;
 import org.apache.accumulo.server.fs.VolumeManager;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.RemoteIterator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import com.beust.jcommander.Parameter;
-import com.google.common.base.Joiner;
 
 import io.opentelemetry.api.trace.Span;
 import io.opentelemetry.context.Scope;
 
+/**
+ * This utility class will scan the Accumulo Metadata table to compute the disk usage for one or
+ * more tables by using the size value stored in columns that contain the column family
+ * {@link MetadataSchema.TabletsSection.DataFileColumnFamily}.
+ *
+ * This class will also track shared files to compute shared usage across all tables that are
+ * provided as part of the Set of tables when getting disk usage.
+ *
+ * Because the metadata table is used for computing usage and not the actual files in HDFS, the
+ * results will be an estimate. Older entries may exist with no file metadata (resulting in size 0)
+ * and other actions in the cluster can impact the estimated size such as flushes, tablet splits,
+ * compactions, etc.
+ *
+ * For more accurate information a compaction should first be run on all files for the set of tables
+ * being computed.
+ */
 public class TableDiskUsage {
 
   private static final Logger log = LoggerFactory.getLogger(TableDiskUsage.class);
@@ -71,8 +88,9 @@ public class TableDiskUsage {
   private Map<String,Long> fileSizes = new HashMap<>();
 
   void addTable(TableId tableId) {
-    if (internalIds.containsKey(tableId))
+    if (internalIds.containsKey(tableId)) {
       throw new IllegalArgumentException("Already added table " + tableId);
+    }
 
     // Keep an internal counter for each table added
     int iid = nextInternalId++;
@@ -91,8 +109,9 @@ public class TableDiskUsage {
     Integer[] tables = tableFiles.get(file);
     if (tables == null) {
       tables = new Integer[internalIds.size()];
-      for (int i = 0; i < tables.length; i++)
+      for (int i = 0; i < tables.length; i++) {
         tables[i] = 0;
+      }
       tableFiles.put(file, tables);
     }
 
@@ -121,8 +140,9 @@ public class TableDiskUsage {
       Long size = fileSizes.get(entry.getKey());
 
       Long tablesUsage = usage.get(key);
-      if (tablesUsage == null)
+      if (tablesUsage == null) {
         tablesUsage = 0L;
+      }
 
       tablesUsage += size;
 
@@ -137,9 +157,10 @@ public class TableDiskUsage {
       List<Integer> key = entry.getKey();
       // table bitset
       for (int i = 0; i < key.size(); i++)
-        if (key.get(i) != 0)
+        if (key.get(i) != 0) {
           // Convert by internal id to the table id
           externalKey.add(externalIds.get(i));
+        }
 
       // list of table ids and size of files shared across the tables
       externalUsage.put(externalKey, entry.getValue());
@@ -154,83 +175,90 @@ public class TableDiskUsage {
     void print(String line);
   }
 
-  public static void printDiskUsage(Collection<String> tableNames, VolumeManager fs,
-      AccumuloClient client, boolean humanReadable) throws TableNotFoundException, IOException {
-    printDiskUsage(tableNames, fs, client, System.out::println, humanReadable);
+  public static void printDiskUsage(Collection<String> tableNames, AccumuloClient client,
+      boolean humanReadable) throws TableNotFoundException, IOException {
+    printDiskUsage(tableNames, client, System.out::println, humanReadable);
   }
 
-  public static Map<TreeSet<String>,Long> getDiskUsage(Set<TableId> tableIds, VolumeManager fs,
-      AccumuloClient client) throws IOException {
-    TableDiskUsage tdu = new TableDiskUsage();
+  /**
+   * Compute the estimated disk usage for the given set of tables by scanning the Metadata table for
+   * file sizes. Optionally computes shared usage across tables.
+   *
+   * @param tableIds
+   *          set of tables to compute an estimated disk usage for
+   * @param client
+   *          accumulo client used to scan
+   * @return the computed estimated usage results
+   *
+   * @throws TableNotFoundException
+   *           if the table(s) do not exist
+   */
+  public static Map<SortedSet<String>,Long> getDiskUsage(Set<TableId> tableIds,
+      AccumuloClient client) throws TableNotFoundException {
+    final TableDiskUsage tdu = new TableDiskUsage();
 
     // Add each tableID
     for (TableId tableId : tableIds)
       tdu.addTable(tableId);
 
-    HashSet<TableId> tablesReferenced = new HashSet<>(tableIds);
     HashSet<TableId> emptyTableIds = new HashSet<>();
-    HashSet<String> nameSpacesReferenced = new HashSet<>();
 
     // For each table ID
     for (TableId tableId : tableIds) {
-      Scanner mdScanner;
-      try {
-        mdScanner = client.createScanner(MetadataTable.NAME, Authorizations.EMPTY);
-      } catch (TableNotFoundException e) {
-        throw new RuntimeException(e);
-      }
-      mdScanner.fetchColumnFamily(DataFileColumnFamily.NAME);
-      mdScanner.setRange(new KeyExtent(tableId, null, null).toMetaRange());
+      // if the table to compute usage is for the metadata table itself then we need to scan the
+      // root table, else we scan the metadata table
+      try (Scanner mdScanner = tableId.equals(MetadataTable.ID)
+          ? client.createScanner(RootTable.NAME, Authorizations.EMPTY)
+          : client.createScanner(MetadataTable.NAME, Authorizations.EMPTY)) {
+        mdScanner.fetchColumnFamily(MetadataSchema.TabletsSection.DataFileColumnFamily.NAME);
+        mdScanner.setRange(new KeyExtent(tableId, null, null).toMetaRange());
+
+        final Set<TabletFile> files = new HashSet<>();
+
+        // Read each file referenced by that table
+        for (Map.Entry<Key,Value> entry : mdScanner) {
+          final TabletFile file =
+              new TabletFile(new Path(entry.getKey().getColumnQualifier().toString()));
+
+          // get the table referenced by the file which may not be the same as the current
+          // table we are scanning if the file is shared between multiple tables
+          final TableId fileTableRef = file.getTableId();
+
+          // if this is a ref to a different table than the one we are scanning then we need
+          // to make sure the table is also linked for this shared file if the table is
+          // part of the set of tables we are running du on so we can track shared usages
+          if (!fileTableRef.equals(tableId) && tableIds.contains(fileTableRef)) {
+            // link the table and the shared file for computing shared sizes
+            tdu.linkFileAndTable(fileTableRef, file.getFileName());
+          }
 
-      if (!mdScanner.iterator().hasNext()) {
-        emptyTableIds.add(tableId);
-      }
+          // link the file to the table we are scanning for
+          tdu.linkFileAndTable(tableId, file.getFileName());
 
-      // Read each file referenced by that table
-      for (Entry<Key,Value> entry : mdScanner) {
-        String file = entry.getKey().getColumnQualifier().toString();
-        String[] parts = file.split("/");
-        // the filename
-        String uniqueName = parts[parts.length - 1];
-        if (file.contains(":") || file.startsWith("../")) {
-          String ref = parts[parts.length - 3];
-          // Track any tables which are referenced externally by the current table
-          if (!ref.equals(tableId.canonical())) {
-            tablesReferenced.add(TableId.of(ref));
-          }
-          if (file.contains(":") && parts.length > 3) {
-            List<String> base = Arrays.asList(Arrays.copyOf(parts, parts.length - 3));
-            nameSpacesReferenced.add(Joiner.on("/").join(base));
+          // add the file size for the table if not already seen for this scan
+          if (files.add(file)) {
+            // This tracks the file size for individual files for computing shared file statistics
+            // later
+            tdu.addFileSize(file.getFileName(),
+                new DataFileValue(entry.getValue().get()).getSize());
           }
         }
 
-        // add this file to this table
-        tdu.linkFileAndTable(tableId, uniqueName);
-      }
-    }
-
-    // Each table seen (provided by user, or reference by table the user provided)
-    for (TableId tableId : tablesReferenced) {
-      for (String tableDir : nameSpacesReferenced) {
-        // Find each file and add its size
-        Path path = new Path(tableDir + "/" + tableId);
-        if (!fs.exists(path)) {
-          log.debug("Table ID directory {} does not exist.", path);
-          continue;
-        }
-        log.info("Get all files recursively in {}", path);
-        RemoteIterator<LocatedFileStatus> ri = fs.listFiles(path, true);
-        while (ri.hasNext()) {
-          FileStatus status = ri.next();
-          String name = status.getPath().getName();
-          tdu.addFileSize(name, status.getLen());
+        // Track tables that are empty with no metadata
+        if (files.isEmpty()) {
+          emptyTableIds.add(tableId);
         }
       }
     }
 
-    Map<TableId,String> reverseTableIdMap = ((ClientContext) client).getTableIdToNameMap();
+    return buildSharedUsageMap(tdu, ((ClientContext) client), emptyTableIds);
+  }
+
+  protected static Map<SortedSet<String>,Long> buildSharedUsageMap(final TableDiskUsage tdu,
+      final ClientContext clientContext, final Set<TableId> emptyTableIds) {
+    final Map<TableId,String> reverseTableIdMap = clientContext.getTableIdToNameMap();
 
-    TreeMap<TreeSet<String>,Long> usage = new TreeMap<>((o1, o2) -> {
+    SortedMap<SortedSet<String>,Long> usage = new TreeMap<>((o1, o2) -> {
       int len1 = o1.size();
       int len2 = o2.size();
 
@@ -247,8 +275,9 @@ public class TableDiskUsage {
 
         int cmp = s1.compareTo(s2);
 
-        if (cmp != 0)
+        if (cmp != 0) {
           return cmp;
+        }
 
         count++;
       }
@@ -259,8 +288,9 @@ public class TableDiskUsage {
     for (Entry<List<TableId>,Long> entry : tdu.calculateUsage().entrySet()) {
       TreeSet<String> tableNames = new TreeSet<>();
       // Convert size shared by each table id into size shared by each table name
-      for (TableId tableId : entry.getKey())
+      for (TableId tableId : entry.getKey()) {
         tableNames.add(reverseTableIdMap.get(tableId));
+      }
 
       // Make table names to shared file size
       usage.put(tableNames, entry.getValue());
@@ -277,25 +307,25 @@ public class TableDiskUsage {
     return usage;
   }
 
-  public static void printDiskUsage(Collection<String> tableNames, VolumeManager fs,
-      AccumuloClient client, Printer printer, boolean humanReadable)
-      throws TableNotFoundException, IOException {
+  public static void printDiskUsage(Collection<String> tableNames, AccumuloClient client,
+      Printer printer, boolean humanReadable) throws TableNotFoundException, IOException {
 
     HashSet<TableId> tableIds = new HashSet<>();
 
     // Get table IDs for all tables requested to be 'du'
     for (String tableName : tableNames) {
       TableId tableId = ((ClientContext) client).getTableId(tableName);
-      if (tableId == null)
+      if (tableId == null) {
         throw new TableNotFoundException(null, tableName, "Table " + tableName + " not found");
+      }
 
       tableIds.add(tableId);
     }
 
-    Map<TreeSet<String>,Long> usage = getDiskUsage(tableIds, fs, client);
+    Map<SortedSet<String>,Long> usage = getDiskUsage(tableIds, client);
 
     String valueFormat = humanReadable ? "%9s" : "%,24d";
-    for (Entry<TreeSet<String>,Long> entry : usage.entrySet()) {
+    for (Entry<SortedSet<String>,Long> entry : usage.entrySet()) {
       Object value = humanReadable ? NumUtil.bigNumberForSize(entry.getValue()) : entry.getValue();
       printer.print(String.format(valueFormat + " %s", value, entry.getKey()));
     }
@@ -313,8 +343,7 @@ public class TableDiskUsage {
     try (Scope scope = span.makeCurrent()) {
       try (AccumuloClient client = Accumulo.newClient().from(opts.getClientProps()).build()) {
         VolumeManager fs = opts.getServerContext().getVolumeManager();
-        org.apache.accumulo.server.util.TableDiskUsage.printDiskUsage(opts.tables, fs, client,
-            false);
+        org.apache.accumulo.server.util.TableDiskUsage.printDiskUsage(opts.tables, client, false);
       } finally {
         span.end();
       }
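
The bookkeeping in this class (linkFileAndTable, addFileSize, calculateUsage) amounts to grouping
file sizes by the set of tables that reference each file. A self-contained sketch of that idea,
with made-up file names and sizes, independent of the real TableDiskUsage internals:

    import java.util.HashMap;
    import java.util.Map;
    import java.util.Set;

    public class SharedUsageSketch {
      public static void main(String[] args) {
        // file name -> estimated size in bytes (from DataFileValue in the metadata table)
        Map<String,Long> fileSizes =
            Map.of("C0001.rf", 1024L, "C0003.rf", 2048L, "C0005.rf", 84520L);
        // file name -> tables referencing it; a cloned or bulk-imported file can be shared
        Map<String,Set<String>> fileTables = Map.of(
            "C0001.rf", Set.of("table1"),
            "C0003.rf", Set.of("table2", "table3"),
            "C0005.rf", Set.of("table3"));

        // Sum sizes per distinct set of referencing tables, mirroring the shape of the
        // Map<SortedSet<String>,Long> that getDiskUsage() returns.
        Map<Set<String>,Long> usage = new HashMap<>();
        fileSizes.forEach((file, size) -> usage.merge(fileTables.get(file), size, Long::sum));
        usage.forEach((tables, bytes) -> System.out.printf("%,24d %s%n", bytes, tables));
      }
    }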
diff --git a/server/base/src/test/java/org/apache/accumulo/server/util/TableDiskUsageTest.java b/server/base/src/test/java/org/apache/accumulo/server/util/TableDiskUsageTest.java
new file mode 100644
index 0000000000..9eaa2d55c6
--- /dev/null
+++ b/server/base/src/test/java/org/apache/accumulo/server/util/TableDiskUsageTest.java
@@ -0,0 +1,306 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.accumulo.server.util;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+import java.util.stream.Collectors;
+
+import org.apache.accumulo.core.Constants;
+import org.apache.accumulo.core.client.Scanner;
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.TableId;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.dataImpl.KeyExtent;
+import org.apache.accumulo.core.metadata.MetadataTable;
+import org.apache.accumulo.core.metadata.RootTable;
+import org.apache.accumulo.core.metadata.TabletFile;
+import org.apache.accumulo.core.metadata.schema.DataFileValue;
+import org.apache.accumulo.core.metadata.schema.MetadataSchema;
+import org.apache.accumulo.core.security.Authorizations;
+import org.apache.accumulo.server.ServerContext;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.easymock.EasyMock;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+public class TableDiskUsageTest {
+
+  private static final String volume1 = "hdfs://nn1/acc";
+  private static final String volume2 = "hdfs://nn2/acc";
+
+  private static final TableId tableId1 = TableId.of("1");
+  private static final TableId tableId2 = TableId.of("2");
+  private static final TableId tableId3 = TableId.of("3");
+  private static final String tabletName1 = "t-0001";
+  private static final String tabletName2 = "t-0002";
+  private static final String tabletName3 = "t-0003";
+  private static final String tabletName4 = "t-0004";
+
+  private static final Map<TableId,String> tableIdToNameMap = new HashMap<>();
+
+  @BeforeAll
+  public static void beforeClass() {
+    tableIdToNameMap.put(RootTable.ID, MetadataTable.NAME);
+    tableIdToNameMap.put(MetadataTable.ID, MetadataTable.NAME);
+    tableIdToNameMap.put(tableId1, "table1");
+    tableIdToNameMap.put(tableId2, "table2");
+    tableIdToNameMap.put(tableId3, "table3");
+  }
+
+  @Test
+  public void testSingleTableMultipleTablets() throws Exception {
+    final ServerContext client = EasyMock.createMock(ServerContext.class);
+    final Scanner scanner = EasyMock.createMock(Scanner.class);
+    mockScan(client, scanner, 1);
+
+    Map<Key,Value> tableEntries = new HashMap<>();
+    appendFileMetadata(tableEntries, getTabletFile(tableId1, tabletName1, "C0001.rf"), 1024);
+    appendFileMetadata(tableEntries, getTabletFile(tableId1, tabletName1, "C0002.rf"), 1024);
+    appendFileMetadata(tableEntries, getTabletFile(tableId1, tabletName2, "C0003.rf"), 2048);
+    mockTableScan(scanner, tableEntries, tableId1);
+
+    EasyMock.replay(client, scanner);
+
+    Map<SortedSet<String>,Long> result = TableDiskUsage.getDiskUsage(tableSet(tableId1), client);
+
+    assertEquals(4096, getTotalUsage(result, tableId1));
+    assertEquals(1, result.size());
+    Map.Entry<SortedSet<String>,Long> firstResult = result.entrySet().stream().findFirst().get();
+    assertEquals(1, firstResult.getKey().size());
+    assertTrue(firstResult.getKey().contains(getTableName(tableId1)));
+    assertEquals(4096, firstResult.getValue());
+
+    EasyMock.verify(client, scanner);
+  }
+
+  @Test
+  public void testMultipleVolumes() throws Exception {
+    final ServerContext client = EasyMock.createMock(ServerContext.class);
+    final Scanner scanner = EasyMock.createMock(Scanner.class);
+    mockScan(client, scanner, 1);
+
+    Map<Key,Value> tableEntries = new HashMap<>();
+    appendFileMetadata(tableEntries, getTabletFile(tableId1, tabletName1, "C0001.rf"), 1024);
+    appendFileMetadata(tableEntries, getTabletFile(tableId1, tabletName1, "C0002.rf"), 1024);
+    appendFileMetadata(tableEntries, getTabletFile(volume2, tableId1, tabletName2, "C0003.rf"),
+        2048);
+    appendFileMetadata(tableEntries, getTabletFile(volume2, tableId1, tabletName2, "C0004.rf"),
+        10000);
+    mockTableScan(scanner, tableEntries, tableId1);
+
+    EasyMock.replay(client, scanner);
+
+    Map<SortedSet<String>,Long> result = TableDiskUsage.getDiskUsage(tableSet(tableId1), client);
+
+    assertEquals(14096, getTotalUsage(result, tableId1));
+    assertEquals(1, result.size());
+    Map.Entry<SortedSet<String>,Long> firstResult = result.entrySet().stream().findFirst().get();
+    assertEquals(1, firstResult.getKey().size());
+    assertEquals(14096, firstResult.getValue());
+
+    EasyMock.verify(client, scanner);
+  }
+
+  @Test
+  public void testMetadataTable() throws Exception {
+    final ServerContext client = EasyMock.createMock(ServerContext.class);
+    final Scanner scanner = EasyMock.createMock(Scanner.class);
+
+    // Expect root table instead to be scanned
+    EasyMock.expect(client.createScanner(RootTable.NAME, Authorizations.EMPTY)).andReturn(scanner);
+    EasyMock.expect(client.getTableIdToNameMap()).andReturn(tableIdToNameMap);
+
+    Map<Key,Value> tableEntries = new HashMap<>();
+    appendFileMetadata(tableEntries,
+        getTabletFile(MetadataTable.ID, MetadataTable.NAME, "C0001.rf"), 1024);
+    mockTableScan(scanner, tableEntries, MetadataTable.ID);
+
+    EasyMock.replay(client, scanner);
+
+    Map<SortedSet<String>,Long> result =
+        TableDiskUsage.getDiskUsage(tableSet(MetadataTable.ID), client);
+
+    assertEquals(1024, getTotalUsage(result, MetadataTable.ID));
+    assertEquals(1, result.size());
+    Map.Entry<SortedSet<String>,Long> firstResult = result.entrySet().stream().findFirst().get();
+    assertEquals(1024, firstResult.getValue());
+
+    EasyMock.verify(client, scanner);
+  }
+
+  @Test
+  public void testDuplicateFile() throws Exception {
+    final ServerContext client = EasyMock.createMock(ServerContext.class);
+    final Scanner scanner = EasyMock.createMock(Scanner.class);
+    mockScan(client, scanner, 1);
+
+    Map<Key,Value> tableEntries = new HashMap<>();
+    appendFileMetadata(tableEntries, getTabletFile(tableId1, tabletName1, "C0001.rf"), 1024);
+    appendFileMetadata(tableEntries, getTabletFile(tableId1, tabletName1, "C0001.rf"), 1024);
+    mockTableScan(scanner, tableEntries, tableId1);
+
+    EasyMock.replay(client, scanner);
+
+    Map<SortedSet<String>,Long> result = TableDiskUsage.getDiskUsage(tableSet(tableId1), client);
+
+    assertEquals(1024, getTotalUsage(result, tableId1));
+    assertEquals(1, result.size());
+    Map.Entry<SortedSet<String>,Long> firstResult = result.entrySet().stream().findFirst().get();
+    assertEquals(1, firstResult.getKey().size());
+    assertTrue(firstResult.getKey().contains(getTableName(tableId1)));
+    assertEquals(1024, firstResult.getValue());
+
+    EasyMock.verify(client, scanner);
+  }
+
+  @Test
+  public void testEmptyTable() throws Exception {
+    final ServerContext client = EasyMock.createMock(ServerContext.class);
+    final Scanner scanner = EasyMock.createMock(Scanner.class);
+    mockScan(client, scanner, 1);
+
+    Map<Key,Value> tableEntries = new HashMap<>();
+    mockTableScan(scanner, tableEntries, tableId1);
+
+    EasyMock.replay(client, scanner);
+
+    Map<SortedSet<String>,Long> result = TableDiskUsage.getDiskUsage(tableSet(tableId1), client);
+
+    assertEquals(0, getTotalUsage(result, tableId1));
+    assertEquals(1, result.size());
+    Map.Entry<SortedSet<String>,Long> firstResult = result.entrySet().stream().findFirst().get();
+    assertEquals(1, firstResult.getKey().size());
+    assertEquals(0, firstResult.getValue());
+
+    EasyMock.verify(client, scanner);
+  }
+
+  @Test
+  public void testMultipleTables() throws Exception {
+    final ServerContext client = EasyMock.createMock(ServerContext.class);
+    final Scanner scanner = EasyMock.createMock(Scanner.class);
+    mockScan(client, scanner, 3);
+
+    Map<Key,Value> tableEntries1 = new HashMap<>();
+    appendFileMetadata(tableEntries1, getTabletFile(tableId1, tabletName1, "C0001.rf"), 1024);
+    appendFileMetadata(tableEntries1, getTabletFile(tableId1, tabletName1, "C0002.rf"), 4096);
+    mockTableScan(scanner, tableEntries1, tableId1);
+
+    Map<Key,Value> tableEntries2 = new HashMap<>();
+    appendFileMetadata(tableEntries2, getTabletFile(tableId2, tabletName2, "C0003.rf"), 2048);
+    appendFileMetadata(tableEntries2, getTabletFile(tableId2, tabletName2, "C0004.rf"), 3000);
+    mockTableScan(scanner, tableEntries2, tableId2);
+
+    Map<Key,Value> tableEntries3 = new HashMap<>();
+    // shared file
+    appendFileMetadata(tableEntries3, getTabletFile(tableId2, tabletName2, "C0003.rf"), 2048);
+    appendFileMetadata(tableEntries3, getTabletFile(tableId3, tabletName3, "C0005.rf"), 84520);
+    appendFileMetadata(tableEntries3, getTabletFile(tableId3, tabletName3, "C0006.rf"), 3000);
+    appendFileMetadata(tableEntries3, getTabletFile(tableId3, tabletName4, "C0007.rf"), 98456);
+    mockTableScan(scanner, tableEntries3, tableId3);
+
+    EasyMock.replay(client, scanner);
+
+    Map<SortedSet<String>,Long> result =
+        TableDiskUsage.getDiskUsage(tableSet(tableId1, tableId2, tableId3), client);
+
+    assertEquals(5120, getTotalUsage(result, tableId1));
+    assertEquals(5048, getTotalUsage(result, tableId2));
+    assertEquals(188024, getTotalUsage(result, tableId3));
+
+    // Make sure all shared tables exist in map
+    assertEquals(4, result.size());
+    assertTrue(result.containsKey(tableNameSet(tableId1)));
+    assertTrue(result.containsKey(tableNameSet(tableId2)));
+    assertTrue(result.containsKey(tableNameSet(tableId2, tableId3)));
+    assertTrue(result.containsKey(tableNameSet(tableId3)));
+
+    // Make sure all the shared disk usage computations are correct
+    assertEquals(5120, result.get(tableNameSet(tableId1)));
+    assertEquals(3000, result.get(tableNameSet(tableId2)));
+    assertEquals(2048, result.get(tableNameSet(tableId2, tableId3)));
+    assertEquals(185976, result.get(tableNameSet(tableId3)));
+
+    EasyMock.verify(client, scanner);
+  }
+
+  private static TreeSet<String> tableNameSet(TableId... tableIds) {
+    return Set.of(tableIds).stream().map(tableId -> getTableName(tableId))
+        .collect(Collectors.toCollection(TreeSet::new));
+  }
+
+  // Need to use a LinkedHashSet for predictable order due to the fact that
+  // we are using mock scanners that always return results in the same order
+  private static Set<TableId> tableSet(TableId... tableIds) {
+    return new LinkedHashSet<>(List.of(tableIds));
+  }
+
+  private static Long getTotalUsage(Map<SortedSet<String>,Long> result, TableId tableId) {
+    return result.entrySet().stream()
+        .filter(entry -> entry.getKey().contains(getTableName(tableId)))
+        .mapToLong(entry -> entry.getValue()).sum();
+  }
+
+  private static String getTableName(TableId tableId) {
+    return tableIdToNameMap.get(tableId);
+  }
+
+  private static void appendFileMetadata(Map<Key,Value> tableEntries, TabletFile file, long size) {
+    tableEntries.put(
+        new Key(new Text(file.getTableId() + "<"),
+            MetadataSchema.TabletsSection.DataFileColumnFamily.NAME, file.getMetaInsertText()),
+        new DataFileValue(size, 1).encodeAsValue());
+  }
+
+  private static TabletFile getTabletFile(String volume, TableId tableId, String tablet,
+      String fileName) {
+    return new TabletFile(new Path(
+        volume + Constants.HDFS_TABLES_DIR + "/" + tableId + "/" + tablet + "/" + fileName));
+  }
+
+  private static TabletFile getTabletFile(TableId tableId, String tablet, String fileName) {
+    return getTabletFile(volume1, tableId, tablet, fileName);
+  }
+
+  private void mockScan(ServerContext client, Scanner scanner, int times) throws Exception {
+    EasyMock.expect(client.createScanner(MetadataTable.NAME, Authorizations.EMPTY))
+        .andReturn(scanner).times(times);
+    EasyMock.expect(client.getTableIdToNameMap()).andReturn(tableIdToNameMap);
+  }
+
+  private void mockTableScan(Scanner scanner, Map<Key,Value> tableEntries, TableId tableId) {
+    scanner.fetchColumnFamily(MetadataSchema.TabletsSection.DataFileColumnFamily.NAME);
+    EasyMock.expectLastCall().once();
+    scanner.setRange(new KeyExtent(tableId, null, null).toMetaRange());
+    EasyMock.expectLastCall().once();
+    EasyMock.expect(scanner.iterator()).andReturn(tableEntries.entrySet().iterator());
+    scanner.close();
+    EasyMock.expectLastCall().once();
+  }
+}
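
For reference, the encode/decode round trip these tests rely on, in isolation: a size is written
the way appendFileMetadata() does above and read back the way the new getDiskUsage() scan does.
The file path is made up but follows the same tables-dir layout the tests use:

    import org.apache.accumulo.core.data.Value;
    import org.apache.accumulo.core.metadata.TabletFile;
    import org.apache.accumulo.core.metadata.schema.DataFileValue;
    import org.apache.hadoop.fs.Path;

    public class DataFileValueRoundTrip {
      public static void main(String[] args) {
        // Encode an estimated size and entry count into the metadata column value.
        Value encoded = new DataFileValue(1024, 1).encodeAsValue();

        // Decode it back the way TableDiskUsage reads the DataFileColumnFamily entries.
        long size = new DataFileValue(encoded.get()).getSize();
        System.out.println("estimated size: " + size);

        // The column qualifier holds the file path; TabletFile exposes the table id that
        // owns the file, which is how shared (cloned/bulk-imported) files are detected.
        TabletFile file = new TabletFile(new Path("hdfs://nn1/acc/tables/1/t-0001/C0001.rf"));
        System.out.println(file.getTableId() + " " + file.getFileName());
      }
    }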
diff --git a/shell/src/main/java/org/apache/accumulo/shell/commands/DUCommand.java b/shell/src/main/java/org/apache/accumulo/shell/commands/DUCommand.java
index 8f1fac248c..feddd5592d 100644
--- a/shell/src/main/java/org/apache/accumulo/shell/commands/DUCommand.java
+++ b/shell/src/main/java/org/apache/accumulo/shell/commands/DUCommand.java
@@ -37,6 +37,18 @@ import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.Option;
 import org.apache.commons.cli.Options;
 
+/**
+ * "du" command that will compute disk usage for tables and shared usage across tables by scanning
+ * the metadata table for file size information.
+ *
+ * Because the metadata table is used for computing usage and not the actual files in HDFS, the
+ * results will be an estimate. Older entries may exist with no file metadata (resulting in size 0)
+ * and other actions in the cluster can impact the estimated size such as flushes, tablet splits,
+ * compactions, etc.
+ *
+ * For more accurate information a compaction should first be run on the set of tables being
+ * computed.
+ */
 public class DUCommand extends Command {
 
   private Option optTablePattern, optHumanReadble, optNamespace;
@@ -95,9 +107,14 @@ public class DUCommand extends Command {
 
   @Override
   public String description() {
-    return "prints how much space, in bytes, is used by files referenced by a"
-        + " table. When multiple tables are specified it prints how much space, in"
-        + " bytes, is used by files shared between tables, if any.";
+    return "Prints estimated space, in bytes, used by files referenced by a "
+        + "table or tables.  When multiple tables are specified it prints how much space, in "
+        + "bytes, are used by files shared between tables, if any. Because the metadata table "
+        + "is used for the file size information and not the actual files in HDFS the results "
+        + "will be an estimate. Older entries may exist with no file metadata (resulting in size 0) and "
+        + "other actions in the cluster can impact the estimated size such as flushes, tablet splits, "
+        + "compactions, etc. For more accurate information a compaction should first be run on all of the files for the "
+        + "set of tables being computed.";
   }
 
   @Override
diff --git a/test/src/main/java/org/apache/accumulo/test/TableOperationsIT.java b/test/src/main/java/org/apache/accumulo/test/TableOperationsIT.java
index c8c9bc2f50..faced1af4d 100644
--- a/test/src/main/java/org/apache/accumulo/test/TableOperationsIT.java
+++ b/test/src/main/java/org/apache/accumulo/test/TableOperationsIT.java
@@ -104,7 +104,7 @@ public class TableOperationsIT extends AccumuloClusterHarness {
     List<DiskUsage> diskUsage =
         accumuloClient.tableOperations().getDiskUsage(Collections.singleton(tableName));
     assertEquals(1, diskUsage.size());
-    assertEquals(0, (long) diskUsage.get(0).getUsage());
+    assertEquals(0, diskUsage.get(0).getUsage());
     assertEquals(tableName, diskUsage.get(0).getTables().iterator().next());
 
     accumuloClient.securityOperations().revokeTablePermission(getAdminPrincipal(), tableName,