You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@hbase.apache.org by GitBox <gi...@apache.org> on 2019/11/04 21:13:54 UTC

[GitHub] [hbase] cbaenziger commented on a change in pull request #785: HBASE-23239 Reporting on status of backing MOB files from client-facing cells

cbaenziger commented on a change in pull request #785: HBASE-23239 Reporting on status of backing MOB files from client-facing cells
URL: https://github.com/apache/hbase/pull/785#discussion_r342267711
 
 

 ##########
 File path: hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mob/mapreduce/MobRefReporter.java
 ##########
 @@ -0,0 +1,485 @@
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.mob.mapreduce;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.UUID;
+import java.util.Base64;
+
+import com.google.protobuf.ServiceException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.Cell;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.Admin;
+import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
+import org.apache.hadoop.hbase.client.Connection;
+import org.apache.hadoop.hbase.client.ConnectionFactory;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.client.TableDescriptor;
+import org.apache.hadoop.hbase.io.HFileLink;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
+import org.apache.hadoop.hbase.mapreduce.TableMapper;
+import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
+import org.apache.hadoop.hbase.mob.MobConstants;
+import org.apache.hadoop.hbase.mob.MobUtils;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.util.FSUtils;
+import org.apache.hadoop.hbase.util.HFileArchiveUtil;
+import org.apache.hadoop.hbase.util.Pair;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.yetus.audience.InterfaceAudience;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * Scans a given table + CF for all mob reference cells to get the list of backing mob files.
+ * For each referenced file we attempt to verify that said file is on the FileSystem in a place
+ * that the MOB system will look when attempting to resolve the actual value.
+ *
+ * The job includes counters that can help provide a rough sketch of the mob data.
+ *
+ * <pre>
+ * Map-Reduce Framework
+ *         Map input records=10000
+ * ...
+ *         Reduce output records=99
+ * ...
+ * CELLS_PER_ROW_DIGITS
+ *         1=10000
+ * MOB
+ *         NUM_CELLS=52364
+ * PROBLEM
+ *         IMPACTED_ROWS=338
+ *         MOB_FILES=2
+ * PROBLEM_ROWS_PER_FILE_DIGITS
+ *         3=2
+ * SIZE_PER_CELL_DIGITS
+ *         5=627
+ *         6=51392
+ *         7=345
+ * SIZE_PER_ROW_DIGITS
+ *         6=6838
+ *         7=3162
+ * </pre>
+ *
+ *   * Map-Reduce Framework:Map input records - the number of rows with mob references
+ *   * Map-Reduce Framework:Reduce output records - the number of unique hfiles referenced
+ *   * MOB:NUM_CELLS - the total number of mob reference cells
+ *   * PROBLEM:IMPACTED_ROWS - the number of rows that reference hfiles with an issue
+ *   * PROBLEM:MOB_FILES - the number of unique hfiles that have an issue
+ *   * CELLS_PER_ROW_DIGITS: - this counter group gives a histogram of the order of magnitude of the
+ *         number of cells in a given row by grouping by the number of digits used in each count.
+ *         This allows us to see more about the distribution of cells than what we can determine
+ *         with just the cell count and the row count. In this particular example we can see that
+ *         all of our rows have somewhere between 1 - 9 cells.
+ *   * PROBLEM_ROWS_PER_FILE_DIGITS: - this counter group gives a histogram of the order of
+ *         magnitude of the number of rows in each of the hfiles with a problem. e.g. in the
+ *         example there are 2 hfiles and they each have the same order of magnitude number of rows,
+ *         namely 3 digits worth i.e. 100-999.
+ *   * SIZE_PER_CELL_DIGITS: - this counter group gives a histogram of the order of magnitude of
+ *         the size of mob values according to our reference cells. e.g. in the example above we
+ *         have between 5 and 7 digits in our cell sizes i.e. 10,000 - 9,999,999 bytes. From this
+ *         histogram we can also see that _most_ cells are 100,000 - 999,999 bytes and the smaller
+ *         and bigger ones are outliers making up less than 2% of mob cells.
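+ *   * SIZE_PER_ROW_DIGITS: - this counter group gives a histogram of the order of magnitude of the
+ *         size of the mob values in each row, i.e. the sum of the value sizes recorded in all of
+ *         the row's mob reference cells. e.g. in the example above every row holds between 6 and
+ *         7 digits worth of mob data i.e. 100,000 - 9,999,999 bytes.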
+ *
+ * Generates a report that gives one file status per line, with tabs dividing fields.
+ *
+ * <pre>
+ * RESULT OF LOOKUP	FILE REF	comma separated, base64 encoded rows when there's a problem
+ * </pre>
+ *
+ * e.g.
+ *
+ * <pre>
+ * MOB DIR	09c576e28a65ed2ead0004d192ffaa382019110184b30a1c7e034573bf8580aef8393402
+ * MISSING FILE    28e252d7f013973174750d483d358fa020191101f73536e7133f4cd3ab1065edf588d509        MmJiMjMyYzBiMTNjNzc0OTY1ZWY4NTU4ZjBmYmQ2MTUtNTIz,MmEzOGE0YTkzMTZjNDllNWE4MzM1MTdjNDVkMzEwNzAtODg=
+ * </pre>
+ *
+ * Possible results are listed; the first three indicate things are working properly.
+ *   * MOB DIR - the reference is in the normal MOB area for the given table and CF
+ *   * HLINK TO ARCHIVE FOR SAME TABLE - the reference is present in the archive area for this
+ *         table and CF
+ *   * HLINK TO ARCHIVE FOR OTHER TABLE - the reference is present in a different table and CF,
+ *         either in the MOB or archive areas (e.g. from a snapshot restore or clone)
+ *   * ARCHIVE WITH HLINK BUT NOT FROM OUR TABLE - the reference is currently present in the archive
+ *         area for this table and CF, but it is kept there because a _different_ table has a
+ *         reference to it (e.g. from a snapshot clone). If these other tables are removed then
+ *         the file will likely be deleted unless there is a snapshot also referencing it.
+ *   * ARCHIVE BUT NO HLINKS - the reference is currently present in the archive for this table and
+ *         CF, but there are no references present to prevent its removal. Unless it is newer than
+ *         the general TTL (default 5 minutes) or referenced in a snapshot it will be subject to
+ *         cleaning.
+ *   * ARCHIVE BUT IOE WHILE CHECKING HLINKS - Check the job logs to see why things failed while
+ *         looking for why this file is being kept around.
+ *   * MISSING FILE - We couldn't find the reference on the FileSystem. Note that MOB cells contain
+ *         a pointer to the table the reference was originally created in, much like HFileLinks do,
+ *         but that pointer is implemented via a server-side tag. It is thus possible that lookups
+ *         to impacted rows will still work (e.g. if the HFileLink in our table's mob area was
+ *         manually removed, but the original file in the referenced table is still around for some
+ *         reason.) However, even in this edge case HBase's internal cleaning systems should
+ *         eventually remove that file once it is no longer needed by another table or snapshot.
+ *   * HLINK BUT POINTS TO MISSING FILE - There is a pointer in our mob area for this table and CF
+ *         to a file elsewhere on the FileSystem, however the file it points to no longer exists.
+ *   * MISSING FILE BUT IOE WHILE CHECKING HLINKS - We could not find the referenced file, however
+ *         you should check the job logs to see why we couldn't check to see if there is a pointer
+ *         to the referenced file in our archive or another table's archive or mob area.
+ *
+ */
+@InterfaceAudience.Private
+public class MobRefReporter extends Configured implements Tool {
+  private static Logger LOG = LoggerFactory.getLogger(MobRefReporter.class);
+  public static final String NAME = "mobrefs";
+  static final String REPORT_JOB_ID = "mob.report.job.id";
+  static final String REPORT_START_DATETIME = "mob.report.job.start";
+  static final int SCAN_CACHING = 10000;
+  static final long ONE_DAY = 24 * 60 * 60 * 1000;
+
+  public static class MobRefMapper extends TableMapper<Text, ImmutableBytesWritable> { // emits (mob file name, row key) pairs
+    @Override
+    public void map(ImmutableBytesWritable r, Result columns, Context context) throws IOException,
+        InterruptedException {
+      if (columns == null) { // no Result for this row: nothing to report
+        return;
+      }
+      Cell[] cells = columns.rawCells();
+      if (cells == null || cells.length == 0) { // empty row: return before touching any counter
+        return;
+      }
+      Set<String> files = new HashSet<>(); // mob file names already written for this row
+      long count = 0; // number of valid mob reference cells seen in this row
+      long size = 0; // sum of MobUtils.getMobValueLength(c) over those cells
+      for (Cell c : cells) {
+        if (MobUtils.hasValidMobRefCellValue(c)) { // cells failing this check are ignored entirely
+          // TODO confirm there aren't tags
+          String fileName = MobUtils.getMobFileName(c);
+          if (!files.contains(fileName)) { // write each file name at most once per row
+            context.write(new Text(fileName), r);
+            files.add(fileName);
+          }
+          final int cellsize = MobUtils.getMobValueLength(c); // counter name below is the decimal digit count of this value
+          context.getCounter("SIZE_PER_CELL_DIGITS", Integer.toString(Integer.toString(cellsize).length())).increment(1L);
+          size += cellsize;
+          count++;
+        }
+      } // per-row counters below also fire when count == 0; "0" has one digit so such rows land in bucket "1"
+      context.getCounter("CELLS_PER_ROW_DIGITS", Integer.toString(Long.toString(count).length())).increment(1L);
+      context.getCounter("SIZE_PER_ROW_DIGITS", Integer.toString(Long.toString(size).length())).increment(1L);
+      context.getCounter("MOB","NUM_CELLS").increment(count);
+    }
+  }
+
+  public static class MobRefReducer extends
+      Reducer<Text, ImmutableBytesWritable, Text, Text> {
+
+    TableName table;
+    String mobRegion;
+    Path mob;
+    Path archive;
+    String seperator;
+
+    /* Results that mean things are fine */
+    final Text MOB_DIR = new Text("MOB DIR");
+    final Text HLINK_RESTORE = new Text("HLINK TO ARCHIVE FOR SAME TABLE");
+    final Text HLINK_CLONE = new Text("HLINK TO ARCHIVE FOR OTHER TABLE");
+    /* Results that mean something is incorrect */
+    final Text ARCHIVE_ERROR_BAD_LINK = new Text("ARCHIVE WITH HLINK BUT NOT FROM OUR TABLE");
+    final Text ARCHIVE_ERROR_STALE = new Text("ARCHIVE BUT NO HLINKS");
+    final Text ARCHIVE_ERROR_IO = new Text("ARCHIVE BUT IOE WHILE CHECKING HLINKS");
+    /* Results that mean data is probably already gone */
+    final Text MISSING = new Text("MISSING FILE");
+    final Text HLINK_ERROR_DANGLING = new Text("HLINK BUT POINTS TO MISSING FILE");
+    final Text MISSING_IO = new Text("MISSING FILE BUT IOE WHILE CHECKING HLINKS");
+    final Base64.Encoder base64 = Base64.getEncoder();
+
+    @Override
+    public void setup(Context context) throws IOException, InterruptedException { // fills table, mobRegion, mob, archive and seperator from the job configuration
+      final Configuration conf = context.getConfiguration();
+      final String tableName = conf.get(TableInputFormat.INPUT_TABLE);
+      if (null == tableName) { // the table name is required to derive every path below
+        throw new IOException("Job configuration did not include table.");
+      }
+      table = TableName.valueOf(tableName);
+      mobRegion = MobUtils.getMobRegionInfo(table).getEncodedName(); // reduce() compares this against parsed back-reference names
+      final String family = conf.get(TableInputFormat.SCAN_COLUMN_FAMILY);
+      if (null == family) { // the column family is likewise required
+        throw new IOException("Job configuration did not include column family");
+      }
+      mob = MobUtils.getMobFamilyPath(conf, table, family); // first location reduce() checks for each referenced file
+      LOG.info("Using active mob area '{}'", mob);
+      archive = HFileArchiveUtil.getStoreArchivePath(conf, table, // second location reduce() checks
+          MobUtils.getMobRegionInfo(table).getEncodedName(), family); // NOTE(review): same expression as mobRegion above; could reuse the field
+      LOG.info("Using archive mob area '{}'", archive);
+      seperator = conf.get(TextOutputFormat.SEPERATOR, "\t"); // falls back to a tab; presumably joins report fields (usage not visible here)
+    }
+
+    @Override
+    public void reduce(Text key, Iterable<ImmutableBytesWritable> rows, Context context)
+        throws IOException, InterruptedException {
+      final Configuration conf = context.getConfiguration();
+      final String file = key.toString();
+      // active mob area
+      if (mob.getFileSystem(conf).exists(new Path(mob, file))) {
+        LOG.debug("Found file '{}' in mob area", file);
+        context.write(MOB_DIR, key);
+      // archive area - is there an hlink back reference (from a snapshot from same table)
+      } else if (archive.getFileSystem(conf).exists(new Path(archive, file))) {
+
+        Path backRefDir = HFileLink.getBackReferencesDir(archive, file);
+        try {
+          FileStatus[] backRefs = FSUtils.listStatus(archive.getFileSystem(conf), backRefDir);
+          if (backRefs != null) {
+            boolean found = false;
+            for (FileStatus backRef : backRefs) {
+              Pair<TableName, String> refParts = HFileLink.parseBackReferenceName(
+                  backRef.getPath().getName());
+              if (table.equals(refParts.getFirst()) && mobRegion.equals(refParts.getSecond())) {
+                Path hlinkPath = HFileLink.getHFileFromBackReference(MobUtils.getMobHome(conf),
+                    backRef.getPath());
+                if (hlinkPath.getFileSystem(conf).exists(hlinkPath)) {
+                  found = true;
+                } else {
+                  LOG.warn("Found file '{}' in archive area with a back reference to the mob area "
+                      + "for our table, but the mob area doesn't have a corresponding hfilelink.",
+                      file);
+                }
+              }
+            }
+            if (found) {
+              LOG.debug("Found file '{}' in archive area. has proper hlink back references to "
+                  + "suggest it's from a restored snapshot for this table.", file);
+              context.write(HLINK_RESTORE, key);
+            } else {
+              LOG.warn("Found file '{}' in archive area, but the hlink back references do not "
+                  + "properly point to the mob area for our table.", file);
+              context.write(ARCHIVE_ERROR_BAD_LINK, encodeRows(context, key, rows));
+            }
+          } else {
+            LOG.warn("Found file '{}' in archive area, but there are no hlinks pointing to it. Not "
+                + "yet used snapshot or an error.", file);
+            context.write(ARCHIVE_ERROR_STALE, encodeRows(context, key, rows));
+          }
+        } catch (IOException e) {
+          LOG.warn("Found file '{}' in archive area, but got an error while checking "
+              + "on back references.", file, e);
+          context.write(ARCHIVE_ERROR_IO, encodeRows(context, key, rows));
+        }
+
+      } else {
+        // hlink in active mob area (from a snapshot of a different table)
+        // we're doing this ourselves instead of using FSUtils.getReferenceFilePaths because
+        // we know the mob region never splits, so we can only have HFileLink references
+        // and looking for just them is cheaper than listing everything.
+        try {
+          FileStatus[] hlinks = mob.getFileSystem(conf).globStatus(new Path(mob + "/*=*-" + file));
+          if (hlinks != null && hlinks.length != 0) {
+            if (hlinks.length != 1) {
+              LOG.warn("Found file '{}' as hfilelinks in the mob area, but there are more than " +
+                  "one: {}", file, Arrays.deepToString(hlinks));
+            }
+            HFileLink found = null;
+            for (FileStatus hlink : hlinks) {
+              HFileLink tmp = HFileLink.buildFromHFileLinkPattern(conf, hlink.getPath());
+              if (tmp.exists(archive.getFileSystem(conf))) {
+                found = tmp;
+                break;
+              } else {
+                LOG.debug("Target file doesn't exist for ref {}", tmp);
+              }
+            }
+            if (found != null) {
+              LOG.debug("Found file '{}' as a ref in the mob area: {}", file, found);
+              context.write(HLINK_CLONE, key);
+            } else {
+              LOG.warn("Found file '{}' as ref(s) in the mob area but they don't point to an hfile "
 
 Review comment:
   Nit: `don't` to `do not`?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services