Posted to commits@accumulo.apache.org by dl...@apache.org on 2022/09/22 14:50:51 UTC

[accumulo] branch main updated: GarbageCollector table existence sanity check (#2792)

This is an automated email from the ASF dual-hosted git repository.

dlmarion pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/accumulo.git


The following commit(s) were added to refs/heads/main by this push:
     new e948b9ad47 GarbageCollector table existence sanity check (#2792)
e948b9ad47 is described below

commit e948b9ad4799d8a9034966c32015f7b700827428
Author: Dave Marion <dl...@apache.org>
AuthorDate: Thu Sep 22 10:50:45 2022 -0400

    GarbageCollector table existence sanity check (#2792)
    
    Ensure all tables are checked when removing references to candidates
    that are still in use. This is done by getting tableIds from ZooKeeper at
    the start of the reference scan and again at the end of the reference scan.
    During the reference scan, the list of tableIds seen is tracked. After all
    candidates that are still in use have been removed, there is one more
    consistency check to ensure we looked at all tables.
    
    Closes #1377
    
    Co-authored-by: Michael Wall <mj...@apache.org>
---
 .../main/java/org/apache/accumulo/gc/GCRun.java    |  83 ++++++-
 .../accumulo/gc/GarbageCollectionAlgorithm.java    |  68 +++++-
 .../accumulo/gc/GarbageCollectionEnvironment.java  |  20 +-
 .../apache/accumulo/gc/GarbageCollectionTest.java  | 242 ++++++++++++++++++++-
 4 files changed, 394 insertions(+), 19 deletions(-)
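
For readers skimming the diff, a minimal standalone sketch of the consistency
check this commit adds may help. The class name TableCheckSketch and the
String-keyed sets below are illustrative stand-ins, not the committed API;
the real check operates on TableId sets and appears in
GarbageCollectionAlgorithm further down.

    import java.util.HashSet;
    import java.util.Set;

    // Illustrative sketch of the before/seen/after check described in the
    // commit message; names and types are simplified stand-ins.
    public class TableCheckSketch {

      // Fails if a table id present in ZooKeeper both before and after the
      // reference scan was never seen while scanning the metadata table.
      static void ensureAllTablesChecked(Set<String> before, Set<String> seen,
          Set<String> after) {
        // Tables added or deleted mid-scan are excused: only ids present in
        // both snapshots must have been seen.
        Set<String> mustHaveSeen = new HashSet<>(before);
        mustHaveSeen.retainAll(after);

        if (mustHaveSeen.isEmpty() && !seen.isEmpty()) {
          throw new RuntimeException("references were seen but ZooKeeper reported no table ids");
        }

        mustHaveSeen.removeAll(seen);
        if (!mustHaveSeen.isEmpty()) {
          throw new IllegalStateException("tables never seen during scan: " + mustHaveSeen);
        }
      }

      public static void main(String[] args) {
        // Passes: table "4" was added mid-scan, so it is excused.
        ensureAllTablesChecked(Set.of("1", "2", "3"), Set.of("1", "2", "3"),
            Set.of("1", "2", "3", "4"));
        // Fails: table "3" existed the whole time but was never seen.
        try {
          ensureAllTablesChecked(Set.of("1", "2", "3"), Set.of("1", "2"), Set.of("1", "2", "3"));
        } catch (IllegalStateException e) {
          System.out.println("detected a missed table: " + e.getMessage());
        }
      }
    }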

diff --git a/server/gc/src/main/java/org/apache/accumulo/gc/GCRun.java b/server/gc/src/main/java/org/apache/accumulo/gc/GCRun.java
index cb38bddc0b..8fbd7d11e6 100644
--- a/server/gc/src/main/java/org/apache/accumulo/gc/GCRun.java
+++ b/server/gc/src/main/java/org/apache/accumulo/gc/GCRun.java
@@ -18,6 +18,7 @@
  */
 package org.apache.accumulo.gc;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.apache.accumulo.core.metadata.schema.TabletMetadata.ColumnType.DIR;
 import static org.apache.accumulo.core.metadata.schema.TabletMetadata.ColumnType.FILES;
 import static org.apache.accumulo.core.metadata.schema.TabletMetadata.ColumnType.SCANS;
@@ -26,6 +27,7 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
@@ -49,9 +51,11 @@ import org.apache.accumulo.core.gc.Reference;
 import org.apache.accumulo.core.gc.ReferenceDirectory;
 import org.apache.accumulo.core.gc.ReferenceFile;
 import org.apache.accumulo.core.manager.state.tables.TableState;
+import org.apache.accumulo.core.metadata.MetadataTable;
 import org.apache.accumulo.core.metadata.RootTable;
 import org.apache.accumulo.core.metadata.ValidationUtil;
 import org.apache.accumulo.core.metadata.schema.Ample;
+import org.apache.accumulo.core.metadata.schema.Ample.DataLevel;
 import org.apache.accumulo.core.metadata.schema.MetadataSchema;
 import org.apache.accumulo.core.metadata.schema.TabletMetadata;
 import org.apache.accumulo.core.metadata.schema.TabletsMetadata;
@@ -59,6 +63,8 @@ import org.apache.accumulo.core.security.Authorizations;
 import org.apache.accumulo.core.util.Pair;
 import org.apache.accumulo.core.util.threads.ThreadPools;
 import org.apache.accumulo.core.volume.Volume;
+import org.apache.accumulo.fate.util.UtilWaitThread;
+import org.apache.accumulo.fate.zookeeper.ZooReader;
 import org.apache.accumulo.server.ServerContext;
 import org.apache.accumulo.server.fs.VolumeManager;
 import org.apache.accumulo.server.fs.VolumeUtil;
@@ -66,6 +72,8 @@ import org.apache.accumulo.server.gc.GcVolumeUtil;
 import org.apache.accumulo.server.replication.proto.Replication;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
+import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.KeeperException.NoNodeException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -171,8 +179,44 @@ public class GCRun implements GarbageCollectionEnvironment {
   }
 
   @Override
-  public Set<TableId> getTableIDs() {
-    return context.getTableIdToNameMap().keySet();
+  public Map<TableId,TableState> getTableIDs() throws InterruptedException {
+    final String tablesPath = context.getZooKeeperRoot() + Constants.ZTABLES;
+    final ZooReader zr = context.getZooReader();
+    int retries = 1;
+    IllegalStateException ioe = null;
+    while (retries <= 10) {
+      try {
+        zr.sync(tablesPath);
+        final Map<TableId,TableState> tids = new HashMap<>();
+        for (String table : zr.getChildren(tablesPath)) {
+          TableId tableId = TableId.of(table);
+          TableState tableState = null;
+          String statePath = context.getZooKeeperRoot() + Constants.ZTABLES + "/"
+              + tableId.canonical() + Constants.ZTABLE_STATE;
+          try {
+            byte[] state = zr.getData(statePath);
+            if (state == null) {
+              tableState = TableState.UNKNOWN;
+            } else {
+              tableState = TableState.valueOf(new String(state, UTF_8));
+            }
+          } catch (NoNodeException e) {
+            tableState = TableState.UNKNOWN;
+          }
+          tids.put(tableId, tableState);
+        }
+        return tids;
+      } catch (KeeperException e) {
+        retries++;
+        if (ioe == null) {
+          ioe = new IllegalStateException("Error getting table ids from ZooKeeper");
+        }
+        ioe.addSuppressed(e);
+        log.error("Error getting tables from ZooKeeper, retrying in {} seconds", retries, e);
+        UtilWaitThread.sleepUninterruptibly(retries, TimeUnit.SECONDS);
+      }
+    }
+    throw ioe;
   }
 
   @Override
@@ -462,4 +506,39 @@ public class GCRun implements GarbageCollectionEnvironment {
   public long getCandidatesStat() {
     return candidates;
   }
+
+  /**
+   * Return a set of all TableIDs in the
+   * {@link org.apache.accumulo.core.metadata.schema.Ample.DataLevel} for which we are considering
+   * deletes. When gathering candidates at DataLevel.USER this will return all user table ids in an
+   * ONLINE or OFFLINE state. When gathering candidates at DataLevel.METADATA this will return the
+   * table id for the accumulo.metadata table. When gathering candidates at DataLevel.ROOT this will
+   * return the table id for the accumulo.root table.
+   *
+   * @return The table ids
+   * @throws InterruptedException
+   *           if interrupted when calling ZooKeeper
+   */
+  @Override
+  public Set<TableId> getCandidateTableIDs() throws InterruptedException {
+    if (level == DataLevel.ROOT) {
+      return Set.of(RootTable.ID);
+    } else if (level == DataLevel.METADATA) {
+      return Set.of(MetadataTable.ID);
+    } else if (level == DataLevel.USER) {
+      Set<TableId> tableIds = new HashSet<>();
+      getTableIDs().forEach((k, v) -> {
+        if (v == TableState.ONLINE || v == TableState.OFFLINE) {
+          // Don't return tables that are NEW, DELETING, or in an
+          // UNKNOWN state.
+          tableIds.add(k);
+        }
+      });
+      tableIds.remove(MetadataTable.ID);
+      tableIds.remove(RootTable.ID);
+      return tableIds;
+    } else {
+      throw new IllegalArgumentException("Unexpected level in GC Env: " + this.level.name());
+    }
+  }
 }
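
The new getTableIDs() above retries ZooKeeper reads with a growing backoff and
attaches each failure to a single exception via addSuppressed. Below is a
condensed, runnable sketch of that idiom; fetch() is a hypothetical stand-in
for the sync/getChildren/getData calls, and the sketch is not the committed
code.

    import java.util.Map;
    import java.util.concurrent.TimeUnit;

    // Condensed sketch of the bounded-retry idiom in GCRun.getTableIDs().
    public class RetrySketch {

      // Hypothetical stand-in for the ZooKeeper reads; always fails here so
      // the retry path is exercised.
      static Map<String, String> fetch() throws Exception {
        throw new Exception("simulated ZooKeeper failure");
      }

      static Map<String, String> getWithRetries() {
        int retries = 1;
        IllegalStateException failure = null;
        while (retries <= 10) {
          try {
            return fetch();
          } catch (Exception e) {
            retries++;
            if (failure == null) {
              failure = new IllegalStateException("Error getting table ids");
            }
            // Keep every attempt's cause attached to the one exception we may throw.
            failure.addSuppressed(e);
            try {
              // Back off a little longer after each failed attempt, as the real
              // code does with sleepUninterruptibly(retries, TimeUnit.SECONDS).
              TimeUnit.SECONDS.sleep(retries);
            } catch (InterruptedException ie) {
              Thread.currentThread().interrupt();
              throw failure;
            }
          }
        }
        throw failure;
      }

      public static void main(String[] args) {
        try {
          getWithRetries();
        } catch (IllegalStateException e) {
          System.out.println(e.getMessage() + ", attempts: " + e.getSuppressed().length);
        }
      }
    }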
diff --git a/server/gc/src/main/java/org/apache/accumulo/gc/GarbageCollectionAlgorithm.java b/server/gc/src/main/java/org/apache/accumulo/gc/GarbageCollectionAlgorithm.java
index e91f47a682..dc6d41eefe 100644
--- a/server/gc/src/main/java/org/apache/accumulo/gc/GarbageCollectionAlgorithm.java
+++ b/server/gc/src/main/java/org/apache/accumulo/gc/GarbageCollectionAlgorithm.java
@@ -23,6 +23,7 @@ import static java.util.function.Predicate.not;
 
 import java.io.IOException;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
@@ -44,6 +45,7 @@ import org.apache.accumulo.server.replication.proto.Replication.Status;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Iterators;
 import com.google.common.collect.PeekingIterator;
 
@@ -134,11 +136,14 @@ public class GarbageCollectionAlgorithm {
   }
 
   private void removeCandidatesInUse(GarbageCollectionEnvironment gce,
-      SortedMap<String,String> candidateMap) {
-    var refStream = gce.getReferences();
-    Iterator<Reference> iter = refStream.iterator();
+      SortedMap<String,String> candidateMap) throws InterruptedException {
+
+    Set<TableId> tableIdsBefore = gce.getCandidateTableIDs();
+    Set<TableId> tableIdsSeen = new HashSet<>();
+    Iterator<Reference> iter = gce.getReferences().iterator();
     while (iter.hasNext()) {
       Reference ref = iter.next();
+      tableIdsSeen.add(ref.getTableId());
 
       if (ref.isDirectory()) {
         var dirReference = (ReferenceDirectory) ref;
@@ -172,6 +177,9 @@ public class GarbageCollectionAlgorithm {
           log.debug("Candidate was still in use: {}", relativePath);
       }
     }
+    Set<TableId> tableIdsAfter = gce.getCandidateTableIDs();
+    ensureAllTablesChecked(Collections.unmodifiableSet(tableIdsBefore),
+        Collections.unmodifiableSet(tableIdsSeen), Collections.unmodifiableSet(tableIdsAfter));
   }
 
   private long removeBlipCandidates(GarbageCollectionEnvironment gce,
@@ -216,6 +224,46 @@ public class GarbageCollectionAlgorithm {
     return blipCount;
   }
 
+  /**
+   * Double check no tables were missed during GC
+   */
+  @VisibleForTesting
+  protected void ensureAllTablesChecked(Set<TableId> tableIdsBefore, Set<TableId> tableIdsSeen,
+      Set<TableId> tableIdsAfter) {
+
+    // if a table was added or deleted during this run, it is acceptable to not
+    // have seen those tables ids when scanning the metadata table. So get the intersection
+    final Set<TableId> tableIdsMustHaveSeen = new HashSet<>(tableIdsBefore);
+    tableIdsMustHaveSeen.retainAll(tableIdsAfter);
+
+    if (tableIdsMustHaveSeen.isEmpty() && !tableIdsSeen.isEmpty()) {
+      // This exception will end up terminating the current GC loop iteration
+      // in SimpleGarbageCollector.run and will be logged. SimpleGarbageCollector
+      // will start the loop again.
+      throw new RuntimeException("Garbage collection will not proceed because "
+          + "table ids were seen in the metadata table and none were seen in ZooKeeper. "
+          + "This can have two causes. First, the total number of tables going to/from "
+          + "zero during a GC cycle will cause this. Second, it could be caused by "
+          + "corruption of the metadata table and/or ZooKeeper. Only the second cause "
+          + "is problematic, but there is no way to distinguish between the two causes, "
+          + "so this GC cycle will not proceed. The first cause should be transient "
+          + "and one would not expect to see this message repeated in subsequent GC cycles.");
+    }
+
+    // From that intersection, remove all the table ids that were seen.
+    tableIdsMustHaveSeen.removeAll(tableIdsSeen);
+
+    // If anything is left, then we missed a table and may not have removed rfile references
+    // from the candidates list that are actually still in use, which would
+    // result in those rfiles being deleted in the next step of the GC process
+    if (!tableIdsMustHaveSeen.isEmpty()) {
+      // maybe a scan failed?
+      throw new IllegalStateException("Saw table IDs in ZK that were not in metadata table:  "
+          + tableIdsMustHaveSeen + " TableIDs before GC: " + tableIdsBefore
+          + ", TableIDs during GC: " + tableIdsSeen + ", TableIDs after GC: " + tableIdsAfter);
+    }
+  }
+
   protected void confirmDeletesFromReplication(GarbageCollectionEnvironment gce,
       SortedMap<String,String> candidateMap) {
     var replicationNeededIterator = gce.getReplicationNeededIterator();
@@ -255,7 +303,7 @@ public class GarbageCollectionAlgorithm {
   }
 
   private void cleanUpDeletedTableDirs(GarbageCollectionEnvironment gce,
-      SortedMap<String,String> candidateMap) throws IOException {
+      SortedMap<String,String> candidateMap) throws InterruptedException, IOException {
     HashSet<TableId> tableIdsWithDeletes = new HashSet<>();
 
     // find the table ids that had dirs deleted
@@ -268,7 +316,7 @@ public class GarbageCollectionAlgorithm {
       }
     }
 
-    Set<TableId> tableIdsInZookeeper = gce.getTableIDs();
+    Set<TableId> tableIdsInZookeeper = gce.getTableIDs().keySet();
 
     tableIdsWithDeletes.removeAll(tableIdsInZookeeper);
 
@@ -280,7 +328,7 @@ public class GarbageCollectionAlgorithm {
   }
 
   private long confirmDeletesTrace(GarbageCollectionEnvironment gce,
-      SortedMap<String,String> candidateMap) throws TableNotFoundException {
+      SortedMap<String,String> candidateMap) throws InterruptedException, TableNotFoundException {
     long blips = 0;
     Span confirmDeletesSpan = TraceUtil.startSpan(this.getClass(), "confirmDeletes");
     try (Scope scope = confirmDeletesSpan.makeCurrent()) {
@@ -297,7 +345,8 @@ public class GarbageCollectionAlgorithm {
   }
 
   private void deleteConfirmedCandidates(GarbageCollectionEnvironment gce,
-      SortedMap<String,String> candidateMap) throws IOException, TableNotFoundException {
+      SortedMap<String,String> candidateMap)
+      throws InterruptedException, IOException, TableNotFoundException {
     Span deleteSpan = TraceUtil.startSpan(this.getClass(), "deleteFiles");
     try (Scope deleteScope = deleteSpan.makeCurrent()) {
       gce.deleteConfirmedCandidates(candidateMap);
@@ -311,7 +360,8 @@ public class GarbageCollectionAlgorithm {
     cleanUpDeletedTableDirs(gce, candidateMap);
   }
 
-  public long collect(GarbageCollectionEnvironment gce) throws TableNotFoundException, IOException {
+  public long collect(GarbageCollectionEnvironment gce)
+      throws InterruptedException, TableNotFoundException, IOException {
 
     Iterator<String> candidatesIter = gce.getCandidates();
     long totalBlips = 0;
@@ -336,7 +386,7 @@ public class GarbageCollectionAlgorithm {
    * Given a sub-list of possible deletion candidates, process and remove valid deletion candidates.
    */
   private long deleteBatch(GarbageCollectionEnvironment gce, List<String> currentBatch)
-      throws TableNotFoundException, IOException {
+      throws InterruptedException, TableNotFoundException, IOException {
 
     long origSize = currentBatch.size();
     gce.incrementCandidatesStat(origSize);
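
The comment inside ensureAllTablesChecked notes that the RuntimeException ends
only the current GC pass. A rough, self-contained paraphrase of the loop it
unwinds to follows; this is the expected shape, not the SimpleGarbageCollector
source.

    import java.util.concurrent.TimeUnit;

    // Rough paraphrase of the loop the sanity-check exception unwinds to: the
    // current pass aborts before any deletes, the failure is logged, and the
    // next cycle starts fresh.
    public class GcLoopSketch {

      interface Collector {
        // Stand-in for GarbageCollectionAlgorithm.collect(gce).
        long collect() throws Exception;
      }

      static void runCycles(Collector collector, long delaySeconds, int cycles)
          throws InterruptedException {
        for (int i = 0; i < cycles; i++) {
          try {
            collector.collect();
          } catch (Exception e) {
            // A failed consistency check lands here; nothing was deleted this pass.
            System.err.println("GC cycle failed, retrying next cycle: " + e.getMessage());
          }
          TimeUnit.SECONDS.sleep(delaySeconds);
        }
      }
    }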
diff --git a/server/gc/src/main/java/org/apache/accumulo/gc/GarbageCollectionEnvironment.java b/server/gc/src/main/java/org/apache/accumulo/gc/GarbageCollectionEnvironment.java
index a754747cce..931f5a8ba2 100644
--- a/server/gc/src/main/java/org/apache/accumulo/gc/GarbageCollectionEnvironment.java
+++ b/server/gc/src/main/java/org/apache/accumulo/gc/GarbageCollectionEnvironment.java
@@ -30,6 +30,7 @@ import java.util.stream.Stream;
 import org.apache.accumulo.core.client.TableNotFoundException;
 import org.apache.accumulo.core.data.TableId;
 import org.apache.accumulo.core.gc.Reference;
+import org.apache.accumulo.core.manager.state.tables.TableState;
 import org.apache.accumulo.core.metadata.MetadataTable;
 import org.apache.accumulo.core.metadata.RootTable;
 import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.DataFileColumnFamily;
@@ -73,11 +74,26 @@ public interface GarbageCollectionEnvironment {
   Stream<Reference> getReferences();
 
   /**
-   * Return the set of tableIDs for the given instance this GarbageCollector is running over
+   * Return a set of all TableIDs that should be seen in {@link #getReferences()} at the current
+   * time. Immediately after this method returns, the information it produced may be out of date
+   * relative to {@link #getReferences()}. See also the javadoc for
+   * {@link GCRun#getCandidateTableIDs()}.
+   *
+   * @return set of table ids
+   * @throws InterruptedException
+   *           if interrupted when calling ZooKeeper
+   */
+  Set<TableId> getCandidateTableIDs() throws InterruptedException;
+
+  /**
+   * Return the map of tableIDs and TableStates for the given instance this GarbageCollector is
+   * running over
    *
   * @return The map of table ids to table states.
+   * @throws InterruptedException
+   *           if interrupted when calling ZooKeeper
    */
-  Set<TableId> getTableIDs();
+  Map<TableId,TableState> getTableIDs() throws InterruptedException;
 
   /**
    * Delete the given files from the provided {@link Map} of relative path to absolute path for each
diff --git a/server/gc/src/test/java/org/apache/accumulo/gc/GarbageCollectionTest.java b/server/gc/src/test/java/org/apache/accumulo/gc/GarbageCollectionTest.java
index 47db92398f..425508a925 100644
--- a/server/gc/src/test/java/org/apache/accumulo/gc/GarbageCollectionTest.java
+++ b/server/gc/src/test/java/org/apache/accumulo/gc/GarbageCollectionTest.java
@@ -24,6 +24,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
@@ -40,6 +41,10 @@ import org.apache.accumulo.core.data.TableId;
 import org.apache.accumulo.core.gc.Reference;
 import org.apache.accumulo.core.gc.ReferenceDirectory;
 import org.apache.accumulo.core.gc.ReferenceFile;
+import org.apache.accumulo.core.manager.state.tables.TableState;
+import org.apache.accumulo.core.metadata.MetadataTable;
+import org.apache.accumulo.core.metadata.RootTable;
+import org.apache.accumulo.core.metadata.schema.Ample;
 import org.apache.accumulo.server.replication.proto.Replication.Status;
 import org.junit.jupiter.api.Test;
 
@@ -55,6 +60,16 @@ public class GarbageCollectionTest {
     ArrayList<TableId> tablesDirsToDelete = new ArrayList<>();
     TreeMap<String,Status> filesToReplicate = new TreeMap<>();
 
+    private final Ample.DataLevel level;
+
+    TestGCE(Ample.DataLevel level) {
+      this.level = level;
+    }
+
+    TestGCE() {
+      this.level = Ample.DataLevel.USER;
+    }
+
     @Override
     public Iterator<String> getCandidates() throws TableNotFoundException {
       return List.copyOf(candidates).iterator();
@@ -80,8 +95,10 @@ public class GarbageCollectionTest {
     }
 
     @Override
-    public Set<TableId> getTableIDs() {
-      return tableIds;
+    public Map<TableId,TableState> getTableIDs() {
+      HashMap<TableId,TableState> results = new HashMap<>();
+      tableIds.forEach((t) -> results.put(t, TableState.ONLINE));
+      return results;
     }
 
     @Override
@@ -96,20 +113,39 @@ public class GarbageCollectionTest {
     }
 
     public void addFileReference(String tableId, String endRow, String file) {
-      references.put(tableId + ":" + endRow + ":" + file,
-          new ReferenceFile(TableId.of(tableId), file));
+      TableId tid = TableId.of(tableId);
+      references.put(tableId + ":" + endRow + ":" + file, new ReferenceFile(tid, file));
+      tableIds.add(tid);
     }
 
     public void removeFileReference(String tableId, String endRow, String file) {
       references.remove(tableId + ":" + endRow + ":" + file);
+      removeLastTableIdRef(TableId.of(tableId));
     }
 
     public void addDirReference(String tableId, String endRow, String dir) {
-      references.put(tableId + ":" + endRow, new ReferenceDirectory(TableId.of(tableId), dir));
+      TableId tid = TableId.of(tableId);
+      references.put(tableId + ":" + endRow, new ReferenceDirectory(tid, dir));
+      tableIds.add(tid);
     }
 
     public void removeDirReference(String tableId, String endRow) {
       references.remove(tableId + ":" + endRow);
+      removeLastTableIdRef(TableId.of(tableId));
+    }
+
+    /*
+     * this is to be called from removeDirReference or removeFileReference.
+     *
+     * If you just removed the last reference to a table, we need to remove it from the tableIds in
+     * ZooKeeper
+     */
+    private void removeLastTableIdRef(TableId tableId) {
+      boolean inUse = references.keySet().stream().map(k -> k.substring(0, k.indexOf(':')))
+          .anyMatch(tid -> tableId.canonical().equals(tid));
+      if (!inUse) {
+        assertTrue(tableIds.remove(tableId));
+      }
     }
 
     @Override
@@ -122,6 +158,29 @@ public class GarbageCollectionTest {
     public Iterator<Entry<String,Status>> getReplicationNeededIterator() {
       return filesToReplicate.entrySet().iterator();
     }
+
+    @Override
+    public Set<TableId> getCandidateTableIDs() {
+      if (level == Ample.DataLevel.ROOT) {
+        return Set.of(RootTable.ID);
+      } else if (level == Ample.DataLevel.METADATA) {
+        return Collections.singleton(MetadataTable.ID);
+      } else if (level == Ample.DataLevel.USER) {
+        Set<TableId> tableIds = new HashSet<>();
+        getTableIDs().forEach((k, v) -> {
+          if (v == TableState.ONLINE || v == TableState.OFFLINE) {
+            // Don't return tables that are NEW, DELETING, or in an
+            // UNKNOWN state.
+            tableIds.add(k);
+          }
+        });
+        tableIds.remove(MetadataTable.ID);
+        tableIds.remove(RootTable.ID);
+        return tableIds;
+      } else {
+        throw new IllegalArgumentException("unknown level " + level);
+      }
+    }
   }
 
   private void assertRemoved(TestGCE gce, String... refs) {
@@ -366,7 +425,6 @@ public class GarbageCollectionTest {
 
     gca.collect(gce);
     assertRemoved(gce, "/4/t0/F002.rf");
-
   }
 
   @Test
@@ -511,6 +569,8 @@ public class GarbageCollectionTest {
 
     // Removing the dir reference for a table will delete all tablet directories
     gce.removeDirReference("5", null);
+    // but we need to add a file ref
+    gce.addFileReference("8", "m", "/t-0/F00.rf");
     gca.collect(gce);
     assertRemoved(gce, "hdfs://foo.com:6000/user/foo/tables/5/t-0");
 
@@ -614,6 +674,7 @@ public class GarbageCollectionTest {
     GarbageCollectionAlgorithm gca = new GarbageCollectionAlgorithm();
 
     TestGCE gce = new TestGCE();
+
     gce.candidates.add("/1636/default_tablet");
     gce.addDirReference("1636", null, "default_tablet");
     gca.collect(gce);
@@ -631,6 +692,7 @@ public class GarbageCollectionTest {
 
     // have an indirect file reference
     gce = new TestGCE();
+
     gce.addFileReference("1636", null, "../9/default_tablet/someFile");
     gce.addDirReference("1636", null, "default_tablet");
     gce.candidates.add("/9/default_tablet/someFile");
@@ -651,6 +713,7 @@ public class GarbageCollectionTest {
     assertEquals(0, blipCount);
 
     gce = new TestGCE();
+
     gce.blips.add("/1636/b-0001");
     gce.candidates.add("/1636/b-0001/I0000");
     blipCount = gca.collect(gce);
@@ -658,6 +721,7 @@ public class GarbageCollectionTest {
     assertEquals(1, blipCount);
 
     gce = new TestGCE();
+
     gce.blips.add("/1029/b-0001");
     gce.blips.add("/1029/b-0002");
     gce.blips.add("/1029/b-0003");
@@ -685,6 +749,7 @@ public class GarbageCollectionTest {
     gce.candidates.add("/6/t-0");
     gce.candidates.add("hdfs://foo:6000/accumulo/tables/7/t-0/");
 
+    gce.addDirReference("4", null, "t-0");
     gce.addDirReference("7", null, "t-0");
 
     gca.collect(gce);
@@ -778,4 +843,169 @@ public class GarbageCollectionTest {
     assertEquals(1, gce.deletes.size());
     assertEquals("hdfs://foo.com:6000/accumulo/tables/2/t-00002/A000002.rf", gce.deletes.get(0));
   }
+
+  @Test
+  public void testMissingTableIds() throws Exception {
+    GarbageCollectionAlgorithm gca = new GarbageCollectionAlgorithm();
+
+    TestGCE gce = new TestGCE(Ample.DataLevel.USER);
+
+    gce.candidates.add("hdfs://foo.com:6000/user/foo/tables/a/t-0/F00.rf");
+
+    gce.addFileReference("a", null, "hdfs://foo.com:6000/user/foo/tables/a/t-0/F00.rf");
+    gce.addFileReference("c", null, "hdfs://foo.com:6000/user/foo/tables/c/t-0/F00.rf");
+
+    // add 2 more table ids that will not be seen by the scan
+    gce.tableIds.addAll(makeUnmodifiableSet("b", "d"));
+
+    String msg = assertThrows(RuntimeException.class, () -> gca.collect(gce)).getMessage();
+    assertTrue((msg.contains("[b, d]") || msg.contains("[d, b]"))
+        && msg.contains("Saw table IDs in ZK that were not in metadata table:"), msg);
+  }
+
+  // below are tests for potential failure conditions of the GC process. Some of these cases were
+  // observed on clusters. Some were hypotheses based on observations. The result was that
+  // candidate entries were not removed when they should have been and therefore files were
+  // removed from HDFS that were actually still in use
+
+  private Set<TableId> makeUnmodifiableSet(String... args) {
+    Set<TableId> t = new HashSet<>();
+    for (String arg : args) {
+      t.add(TableId.of(arg));
+    }
+    return Collections.unmodifiableSet(t);
+  }
+
+  /**
+   * Happy path: no tables added or removed during this portion, and all the tables were checked
+   */
+  @Test
+  public void testNormalGCRun() {
+    Set<TableId> tablesBefore = makeUnmodifiableSet("1", "2", "3");
+    Set<TableId> tablesSeen = makeUnmodifiableSet("2", "1", "3");
+    Set<TableId> tablesAfter = makeUnmodifiableSet("1", "3", "2");
+
+    new GarbageCollectionAlgorithm().ensureAllTablesChecked(tablesBefore, tablesSeen, tablesAfter);
+  }
+
+  /**
+   * A table was added during this portion and we don't see it; this should be fine
+   */
+  @Test
+  public void testTableAddedInMiddle() {
+    Set<TableId> tablesBefore = makeUnmodifiableSet("1", "2", "3");
+    Set<TableId> tablesSeen = makeUnmodifiableSet("2", "1", "3");
+    Set<TableId> tablesAfter = makeUnmodifiableSet("1", "3", "2", "4");
+
+    new GarbageCollectionAlgorithm().ensureAllTablesChecked(tablesBefore, tablesSeen, tablesAfter);
+  }
+
+  /**
+   * Test that a table was added during this portion and we see it. This means that the table was
+   * added after the candidates were grabbed, so there should be nothing to remove
+   */
+  @Test
+  public void testTableAddedInMiddleTwo() {
+    Set<TableId> tablesBefore = makeUnmodifiableSet("1", "2", "3");
+    Set<TableId> tablesSeen = makeUnmodifiableSet("2", "1", "3", "4");
+    Set<TableId> tablesAfter = makeUnmodifiableSet("1", "3", "2", "4");
+
+    new GarbageCollectionAlgorithm().ensureAllTablesChecked(tablesBefore, tablesSeen, tablesAfter);
+  }
+
+  /**
+   * Test that the table was deleted during this portion and we don't see it. This means any
+   * candidates from the deleted table will stay on the candidate list, and during the delete step
+   * the GC will try to remove them
+   */
+  @Test
+  public void testTableDeletedInMiddle() {
+    Set<TableId> tablesBefore = makeUnmodifiableSet("1", "2", "3", "4");
+    Set<TableId> tablesSeen = makeUnmodifiableSet("2", "1", "4");
+    Set<TableId> tablesAfter = makeUnmodifiableSet("1", "2", "4");
+
+    new GarbageCollectionAlgorithm().ensureAllTablesChecked(tablesBefore, tablesSeen, tablesAfter);
+  }
+
+  /**
+   * A table was deleted during this portion and we DO see it. This means candidates from the
+   * deleted table may get removed from the candidate list, which should be ok, as the delete
+   * table function should be responsible for removing those
+   */
+  @Test
+  public void testTableDeletedInMiddleTwo() {
+    Set<TableId> tablesBefore = makeUnmodifiableSet("1", "2", "3", "4");
+    Set<TableId> tablesSeen = makeUnmodifiableSet("2", "1", "4", "3");
+    Set<TableId> tablesAfter = makeUnmodifiableSet("1", "2", "4");
+
+    new GarbageCollectionAlgorithm().ensureAllTablesChecked(tablesBefore, tablesSeen, tablesAfter);
+  }
+
+  /**
+   * This test simulates missing an entire table when looking for what files are in use. If you
+   * add custom splits to the metadata at table boundaries, this can happen with a failed scan.
+   * Recall that the ~tab:~pr entry for the first row of a new table is empty, so there is no way
+   * to check the prior row. If you split a couple of tables in the metadata at the table
+   * boundary, say table ids 2, 3, 4, and then miss scanning table 3 but get 4, it is possible
+   * other consistency checks will miss this
+   */
+  @Test
+  public void testMissEntireTable() {
+    Set<TableId> tablesBefore = makeUnmodifiableSet("1", "2", "3", "4");
+    Set<TableId> tablesSeen = makeUnmodifiableSet("1", "2", "4");
+    Set<TableId> tablesAfter = makeUnmodifiableSet("1", "2", "3", "4");
+
+    GarbageCollectionAlgorithm gca = new GarbageCollectionAlgorithm();
+    String msg = assertThrows(RuntimeException.class,
+        () -> gca.ensureAllTablesChecked(tablesBefore, tablesSeen, tablesAfter)).getMessage();
+    assertTrue(msg.startsWith("Saw table IDs in ZK that were not in metadata table:"));
+
+  }
+
+  /**
+   * this test simulates getting nothing from ZK for table ids, which should not happen, but just in
+   * case let's test
+   */
+  @Test
+  public void testZKHadNoTables() {
+    Set<TableId> tablesBefore = makeUnmodifiableSet();
+    Set<TableId> tablesSeen = makeUnmodifiableSet("1", "2");
+    Set<TableId> tablesAfter = makeUnmodifiableSet();
+
+    GarbageCollectionAlgorithm gca = new GarbageCollectionAlgorithm();
+    String msg = assertThrows(RuntimeException.class,
+        () -> gca.ensureAllTablesChecked(tablesBefore, tablesSeen, tablesAfter)).getMessage();
+    assertTrue(msg.startsWith("Garbage collection will not proceed because"));
+  }
+
+  /**
+   * simulates missing a table when checking references, and a table being added
+   */
+  @Test
+  public void testMissingTableAndTableAdd() {
+    Set<TableId> tablesBefore = makeUnmodifiableSet("1", "2", "3");
+    Set<TableId> tablesSeen = makeUnmodifiableSet("1", "2", "4");
+    Set<TableId> tablesAfter = makeUnmodifiableSet("1", "2", "3", "4");
+
+    GarbageCollectionAlgorithm gca = new GarbageCollectionAlgorithm();
+    String msg = assertThrows(RuntimeException.class,
+        () -> gca.ensureAllTablesChecked(tablesBefore, tablesSeen, tablesAfter)).getMessage();
+    assertTrue(msg.startsWith("Saw table IDs in ZK that were not in metadata table:  [3]"));
+  }
+
+  /**
+   * simulates missing a table when checking references, and a table being deleted
+   */
+  @Test
+  public void testMissingTableAndTableDeleted() {
+    Set<TableId> tablesBefore = makeUnmodifiableSet("1", "2", "3", "4");
+    Set<TableId> tablesSeen = makeUnmodifiableSet("1", "2", "4");
+    Set<TableId> tablesAfter = makeUnmodifiableSet("1", "2", "3");
+
+    GarbageCollectionAlgorithm gca = new GarbageCollectionAlgorithm();
+    String msg = assertThrows(RuntimeException.class,
+        () -> gca.ensureAllTablesChecked(tablesBefore, tablesSeen, tablesAfter)).getMessage();
+    assertTrue(msg.startsWith("Saw table IDs in ZK that were not in metadata table:  [3]"));
+
+  }
 }