You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafodion.apache.org by db...@apache.org on 2015/12/18 18:19:07 UTC
[1/2] incubator-trafodion git commit: [TRAFODION-1618] Fix row
estimation logic to scale to more than 255 columns
Repository: incubator-trafodion
Updated Branches:
refs/heads/master b2b3e6b01 -> 55a5d00c6
[TRAFODION-1618] Fix row estimation logic to scale to more than 255 columns
Project: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/commit/1ba7bd28
Tree: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/tree/1ba7bd28
Diff: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/diff/1ba7bd28
Branch: refs/heads/master
Commit: 1ba7bd280f24679049fac386a00c61144a99d35e
Parents: 52afc68
Author: Dave Birdsall <db...@apache.org>
Authored: Tue Dec 15 21:44:03 2015 +0000
Committer: Dave Birdsall <db...@apache.org>
Committed: Tue Dec 15 21:44:03 2015 +0000
----------------------------------------------------------------------
.../java/org/trafodion/sql/HBaseClient.java | 148 ++++++++++++++-----
1 file changed, 114 insertions(+), 34 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/1ba7bd28/core/sql/src/main/java/org/trafodion/sql/HBaseClient.java
----------------------------------------------------------------------
diff --git a/core/sql/src/main/java/org/trafodion/sql/HBaseClient.java b/core/sql/src/main/java/org/trafodion/sql/HBaseClient.java
index 1860fc8..98ac1b1 100644
--- a/core/sql/src/main/java/org/trafodion/sql/HBaseClient.java
+++ b/core/sql/src/main/java/org/trafodion/sql/HBaseClient.java
@@ -1032,11 +1032,58 @@ public class HBaseClient {
float defCacheFraction = 0.4f;
return config.getFloat("hfile.block.cache.size",defCacheFraction);
}
+
+ // if we make the method below public later, should think about whether this is the
+ // right class to host this method
+
+ // compares two qualifiers as unsigned, lexicographically ordered byte strings
+ static private boolean isQualifierLessThanOrEqual(KeyValue nextKv,
+ KeyValue currKv)
+ {
+ int currLength = currKv.getQualifierLength();
+ int currOffset = currKv.getQualifierOffset();
+ byte [] currQual = currKv.getQualifierArray();
+ int nextLength = nextKv.getQualifierLength();
+ int nextOffset = nextKv.getQualifierOffset();
+ byte [] nextQual = nextKv.getQualifierArray();
+
+ // If we later decide we need a performance-critical version of this method,
+ // we should just use a native method that calls C memcmp.
+
+ int minLength = nextLength;
+ if (currLength < nextLength)
+ minLength = currLength;
+
+ for (int i = 0; i < minLength; i++) {
+ // ugh... have to do some gymnastics to make this an
+ // unsigned comparison
+ int nextQualI = nextQual[i+nextOffset];
+ if (nextQualI < 0)
+ nextQualI = nextQualI + 256;
+ int currQualI = currQual[i+currOffset];
+ if (currQualI < 0)
+ currQualI = currQualI + 256;
+
+ if (nextQualI < currQualI)
+ return true;
+ else if (nextQualI > currQualI)
+ return false;
+ // else equal, move on to next byte
+ }
+
+ // the first minLength bytes are the same; the shorter array
+ // is regarded as less, and equal-length arrays are equal (so true)
+
+ boolean rc = (nextLength <= currLength);
+
+ return rc;
+ }
+
// Estimates row count for tblName by iterating over the HFiles for
// the table, extracting the KeyValue entry count from the file's
// trailer block, summing the counts, and dividing by the number of
// columns in the table. An adjustment is made for the estimated
- // number of missing (null) values by sampling the first several
+ // number of missing values by sampling the first several
// hundred KeyValues to see how many are missing.
public boolean estimateRowCount(String tblName, int partialRowSize,
int numCols, long[] rc)
@@ -1051,7 +1098,8 @@ public class HBaseClient {
final int ROWS_TO_SAMPLE = 500;
int putKVsSampled = 0;
int nonPutKVsSampled = 0;
- int nullCount = 0;
+ int missingKVsCount = 0;
+ int sampleRowCount = 0;
long totalEntries = 0; // KeyValues in all HFiles for table
long totalSizeBytes = 0; // Size of all HFiles for table
long estimatedTotalPuts = 0;
@@ -1060,7 +1108,10 @@ public class HBaseClient {
// Access the file system to go directly to the table's HFiles.
// Create a reader for the file to access the entry count stored
// in the trailer block, and a scanner to iterate over a few
- // hundred KeyValues to estimate the incidence of nulls.
+ // hundred KeyValues to estimate the incidence of missing
+ // KeyValues. KeyValues may be missing because the column has
+ // a null value, or because the column has a default value that
+ // has not been materialized.
long nano1, nano2;
nano1 = System.nanoTime();
FileSystem fileSystem = FileSystem.get(config);
@@ -1088,36 +1139,65 @@ public class HBaseClient {
//printQualifiers(reader, 100);
if (ROWS_TO_SAMPLE > 0 &&
totalEntries == reader.getEntries()) { // first file only
- // Trafodion column qualifiers are ordinal numbers, which
- // makes it easy to count missing (null) values. We also count
- // the non-Put KVs (typically delete-row markers) to estimate
- // their frequency in the full file set.
+
+ // Trafodion column qualifiers are ordinal numbers, but each is stored as
+ // a varying-length unsigned little-endian integer, and qualifiers are read
+ // in lexicographical byte order. So, in a table with 260 columns, the column
+ // qualifiers (if present) will be read in this order:
+ // 1 (x'01'), 257 (x'0101'), 2 (x'02'), 258 (x'0201'), 3 (x'03'),
+ // 259 (x'0301'), 4 (x'04'), 260 (x'0401'), 5 (x'05'), 6 (x'06'),
+ // 7 (x'07'), ...
+ // We have crossed the boundary to the next row if and only if the
+ // next qualifier read is less than or equal to the previous,
+ // compared unsigned, lexicographically.
+
HFileScanner scanner = reader.getScanner(false, false, false);
scanner.seekTo(); //position at beginning of first data block
- byte currQual = 0;
- byte nextQual;
- do {
- KeyValue kv = scanner.getKeyValue();
- if (kv.getType() == KeyValue.Type.Put.getCode()) {
- nextQual = kv.getQualifier()[0];
- if (nextQual <= currQual)
- nullCount += ((numCols - currQual) // nulls at end of this row
- + (nextQual - 1)); // nulls at start of next row
- else
- nullCount += (nextQual - currQual - 1);
- currQual = nextQual;
- putKVsSampled++;
- } else {
- nonPutKVsSampled++; // don't count these toward the number
- } // we want to scan
- } while ((putKVsSampled + nullCount) < (numCols * ROWS_TO_SAMPLE)
- && (more = scanner.next()));
-
- // If all rows were read, count any nulls at end of last row.
- if (!more && putKVsSampled > 0)
- nullCount += (numCols - currQual);
-
- if (logger.isDebugEnabled()) logger.debug("Sampled " + nullCount + " nulls.");
+
+ // the next line should succeed, as we know the HFile is non-empty
+ KeyValue currKv = scanner.getKeyValue();
+ while ((more) && (currKv.getType() != KeyValue.Type.Put.getCode())) {
+ nonPutKVsSampled++;
+ more = scanner.next();
+ currKv = scanner.getKeyValue();
+ }
+ if (more) {
+ // now we have the first KeyValue in the HFile
+
+ int putKVsThisRow = 1;
+ putKVsSampled++;
+ sampleRowCount++; // we have at least one row
+ more = scanner.next();
+
+ while ((more) && (sampleRowCount <= ROWS_TO_SAMPLE)) {
+ KeyValue nextKv = scanner.getKeyValue();
+ if (nextKv.getType() == KeyValue.Type.Put.getCode()) {
+ if (isQualifierLessThanOrEqual(nextKv,currKv)) {
+ // we have crossed a row boundary
+ sampleRowCount++;
+ missingKVsCount += (numCols - putKVsThisRow);
+ putKVsThisRow = 1;
+ } else {
+ putKVsThisRow++;
+ }
+ currKv = nextKv;
+ putKVsSampled++;
+ } else {
+ nonPutKVsSampled++; // don't count these toward the number
+ }
+ more = scanner.next();
+ }
+ }
+
+ if (sampleRowCount > ROWS_TO_SAMPLE) {
+ // we counted one Put KeyValue past row number ROWS_TO_SAMPLE (the
+ // first KeyValue of the following row), so back that one out
+ putKVsSampled--;
+ sampleRowCount--;
+ }
+
+ if (logger.isDebugEnabled())
+ logger.debug("Sampled " + missingKVsCount + " missing values.");
} // code for first file
} finally {
reader.close(false);
@@ -1131,7 +1211,7 @@ public class HBaseClient {
{
estimatedTotalPuts = (putKVsSampled * totalEntries) /
(putKVsSampled + nonPutKVsSampled);
- estimatedEntries = ((putKVsSampled + nullCount) * estimatedTotalPuts)
+ estimatedEntries = ((putKVsSampled + missingKVsCount) * estimatedTotalPuts)
/ putKVsSampled;
}
@@ -1172,9 +1252,9 @@ public class HBaseClient {
if (logger.isDebugEnabled()) logger.debug(tblName + " contains a total of " + totalEntries + " KeyValues in all HFiles.");
if (logger.isDebugEnabled()) logger.debug("Based on a sample, it is estimated that " + estimatedTotalPuts +
" of these KeyValues are of type Put.");
- if (putKVsSampled + nullCount > 0)
+ if (putKVsSampled + missingKVsCount > 0)
if (logger.isDebugEnabled()) logger.debug("Sampling indicates a null incidence of " +
- (nullCount * 100)/(putKVsSampled + nullCount) +
+ (missingKVsCount * 100)/(putKVsSampled + missingKVsCount) +
" percent.");
if (logger.isDebugEnabled()) logger.debug("Estimated number of actual values (including nulls) is " + estimatedEntries);
if (logger.isDebugEnabled()) logger.debug("Estimated row count in HFiles = " + estimatedEntries +
[2/2] incubator-trafodion git commit: Merge [TRAFODION-1618] PR 229
Fix row est to scale past 255 columns
Posted by db...@apache.org.
Merge [TRAFODION-1618] PR 229 Fix row est to scale past 255 columns
Project: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/commit/55a5d00c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/tree/55a5d00c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/diff/55a5d00c
Branch: refs/heads/master
Commit: 55a5d00c6ac64e46da4ccc6bff382bdcdaee7c12
Parents: b2b3e6b 1ba7bd2
Author: Dave Birdsall <db...@apache.org>
Authored: Fri Dec 18 17:18:00 2015 +0000
Committer: Dave Birdsall <db...@apache.org>
Committed: Fri Dec 18 17:18:00 2015 +0000
----------------------------------------------------------------------
.../java/org/trafodion/sql/HBaseClient.java | 148 ++++++++++++++-----
1 file changed, 114 insertions(+), 34 deletions(-)
----------------------------------------------------------------------