Posted to commits@trafodion.apache.org by db...@apache.org on 2015/12/18 18:19:07 UTC

[1/2] incubator-trafodion git commit: [TRAFODION-1618] Fix row estimation logic to scale to more than 255 columns

Repository: incubator-trafodion
Updated Branches:
  refs/heads/master b2b3e6b01 -> 55a5d00c6


[TRAFODION-1618] Fix row estimation logic to scale to more than 255 columns


Project: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/commit/1ba7bd28
Tree: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/tree/1ba7bd28
Diff: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/diff/1ba7bd28

Branch: refs/heads/master
Commit: 1ba7bd280f24679049fac386a00c61144a99d35e
Parents: 52afc68
Author: Dave Birdsall <db...@apache.org>
Authored: Tue Dec 15 21:44:03 2015 +0000
Committer: Dave Birdsall <db...@apache.org>
Committed: Tue Dec 15 21:44:03 2015 +0000

----------------------------------------------------------------------
 .../java/org/trafodion/sql/HBaseClient.java     | 148 ++++++++++++++-----
 1 file changed, 114 insertions(+), 34 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/1ba7bd28/core/sql/src/main/java/org/trafodion/sql/HBaseClient.java
----------------------------------------------------------------------
diff --git a/core/sql/src/main/java/org/trafodion/sql/HBaseClient.java b/core/sql/src/main/java/org/trafodion/sql/HBaseClient.java
index 1860fc8..98ac1b1 100644
--- a/core/sql/src/main/java/org/trafodion/sql/HBaseClient.java
+++ b/core/sql/src/main/java/org/trafodion/sql/HBaseClient.java
@@ -1032,11 +1032,58 @@ public class HBaseClient {
         float defCacheFraction = 0.4f;
         return config.getFloat("hfile.block.cache.size",defCacheFraction);
     }
+
+    // if we make the method below public later, should think about whether this is the
+    // right class to host this method
+
+    // compares two qualifiers as unsigned, lexicographically ordered byte strings
+    static private boolean isQualifierLessThanOrEqual(KeyValue nextKv,
+                                                      KeyValue currKv)
+    {
+       int currLength = currKv.getQualifierLength(); 
+       int currOffset = currKv.getQualifierOffset();
+       byte [] currQual = currKv.getQualifierArray();
+       int nextLength = nextKv.getQualifierLength(); 
+       int nextOffset = nextKv.getQualifierOffset();
+       byte [] nextQual = nextKv.getQualifierArray();   
+
+       // If we later decide we need a performance-critical version of this method,
+       // we should just use a native method that calls C memcmp.
+
+       int minLength = nextLength;
+       if (currLength < nextLength)
+         minLength = currLength;
+
+       for (int i = 0; i < minLength; i++) {
+         // ugh... have to do some gymnastics to make this an
+         // unsigned comparison
+         int nextQualI = nextQual[i+nextOffset];
+         if (nextQualI < 0)
+           nextQualI = nextQualI + 256;
+         int currQualI = currQual[i+currOffset];
+         if (currQualI < 0)
+           currQualI = currQualI + 256;
+
+         if (nextQualI < currQualI)
+           return true;
+         else if (nextQualI > currQualI)
+           return false;
+         // else equal, move on to next byte
+       }
+
+       // the first minLength bytes are the same; the shorter array
+       // is regarded as less
+
+       boolean rc = (nextLength <= currLength);      
+
+       return rc;
+    }
+
     // Estimates row count for tblName by iterating over the HFiles for
     // the table, extracting the KeyValue entry count from the file's
     // trailer block, summing the counts, and dividing by the number of
     // columns in the table. An adjustment is made for the estimated
-    // number of missing (null) values by sampling the first several
+    // number of missing values by sampling the first several
     // hundred KeyValues to see how many are missing.
     public boolean estimateRowCount(String tblName, int partialRowSize,
                                     int numCols, long[] rc)
@@ -1051,7 +1098,8 @@ public class HBaseClient {
       final int ROWS_TO_SAMPLE = 500;
       int putKVsSampled = 0;
       int nonPutKVsSampled = 0;
-      int nullCount = 0;
+      int missingKVsCount = 0;
+      int sampleRowCount = 0;
       long totalEntries = 0;   // KeyValues in all HFiles for table
       long totalSizeBytes = 0; // Size of all HFiles for table 
       long estimatedTotalPuts = 0;
@@ -1060,7 +1108,10 @@ public class HBaseClient {
       // Access the file system to go directly to the table's HFiles.
       // Create a reader for the file to access the entry count stored
       // in the trailer block, and a scanner to iterate over a few
-      // hundred KeyValues to estimate the incidence of nulls.
+      // hundred KeyValues to estimate the incidence of missing 
+      // KeyValues. KeyValues may be missing because the column has
+      // a null value, or because the column has a default value that
+      // has not been materialized.
       long nano1, nano2;
       nano1 = System.nanoTime();
       FileSystem fileSystem = FileSystem.get(config);
@@ -1088,36 +1139,65 @@ public class HBaseClient {
           //printQualifiers(reader, 100);
           if (ROWS_TO_SAMPLE > 0 &&
               totalEntries == reader.getEntries()) {  // first file only
-            // Trafodion column qualifiers are ordinal numbers, which
-            // makes it easy to count missing (null) values. We also count
-            // the non-Put KVs (typically delete-row markers) to estimate
-            // their frequency in the full file set.
+
+            // Trafodion column qualifiers are ordinal numbers, but are represented
+            // as varying length unsigned little-endian integers in lexicographical
+            // order. So, for example, in a table with 260 columns, the column
+            // qualifiers (if present) will be read in this order: 
+            // 1 (x'01'), 257 (x'0101'), 2 (x'02'), 258 (x'0201'), 3 (x'03'),
+            // 259 (x'0301'), 4 (x'04'), 260 (x'0401'), 5 (x'05'), 6 (x'06'), 
+            // 7 (x'07'), ...
+            // We have crossed the boundary to the next row if and only if the
+            // next qualifier read is less than or equal to the previous, 
+            // compared unsigned, lexicographically.
+
             HFileScanner scanner = reader.getScanner(false, false, false);
             scanner.seekTo();  //position at beginning of first data block
-            byte currQual = 0;
-            byte nextQual;
-            do {
-              KeyValue kv = scanner.getKeyValue();
-              if (kv.getType() == KeyValue.Type.Put.getCode()) {
-                nextQual = kv.getQualifier()[0];
-                if (nextQual <= currQual)
-                  nullCount += ((numCols - currQual)  // nulls at end of this row
-                              + (nextQual - 1));      // nulls at start of next row
-                else
-                  nullCount += (nextQual - currQual - 1);
-                currQual = nextQual;
-                putKVsSampled++;
-              } else {
-                nonPutKVsSampled++;  // don't count these toward the number
-              }                      //   we want to scan
-            } while ((putKVsSampled + nullCount) < (numCols * ROWS_TO_SAMPLE)
-                     && (more = scanner.next()));
-
-            // If all rows were read, count any nulls at end of last row.
-            if (!more && putKVsSampled > 0)
-              nullCount += (numCols - currQual);
-
-            if (logger.isDebugEnabled()) logger.debug("Sampled " + nullCount + " nulls.");
+
+            // the next line should succeed, as we know the HFile is non-empty
+            KeyValue currKv = scanner.getKeyValue();
+            while ((more) && (currKv.getType() != KeyValue.Type.Put.getCode())) {
+              nonPutKVsSampled++;
+              more = scanner.next();
+              currKv = scanner.getKeyValue();
+            }
+            if (more) {
+              // now we have the first KeyValue in the HFile
+
+              int putKVsThisRow = 1;
+              putKVsSampled++;
+              sampleRowCount++;  // we have at least one row
+              more = scanner.next();
+    
+              while ((more) && (sampleRowCount <= ROWS_TO_SAMPLE)) {
+                KeyValue nextKv = scanner.getKeyValue();
+                if (nextKv.getType() == KeyValue.Type.Put.getCode()) {
+                  if (isQualifierLessThanOrEqual(nextKv,currKv)) {
+                    // we have crossed a row boundary
+                    sampleRowCount++;
+                    missingKVsCount += (numCols - putKVsThisRow);
+                    putKVsThisRow = 1;
+                  } else {
+                    putKVsThisRow++;
+                  }
+                  currKv = nextKv;
+                  putKVsSampled++;
+                } else {
+                  nonPutKVsSampled++;  // don't count these toward the number
+                } 
+                more = scanner.next();
+              }
+            }   
+  
+            if (sampleRowCount > ROWS_TO_SAMPLE) {
+              // we read one KeyValue beyond the ROWS_TO_SAMPLE-eth row, so
+              // adjust counts for that
+              putKVsSampled--;
+              sampleRowCount--;
+            }
+
+            if (logger.isDebugEnabled())
+              logger.debug("Sampled " + missingKVsCount + " missing values.");
           }  // code for first file
         } finally {
           reader.close(false);
@@ -1131,7 +1211,7 @@ public class HBaseClient {
         {
           estimatedTotalPuts = (putKVsSampled * totalEntries) / 
                                (putKVsSampled + nonPutKVsSampled);
-          estimatedEntries = ((putKVsSampled + nullCount) * estimatedTotalPuts)
+          estimatedEntries = ((putKVsSampled + missingKVsCount) * estimatedTotalPuts)
                                    / putKVsSampled;
         }
 
@@ -1172,9 +1252,9 @@ public class HBaseClient {
       if (logger.isDebugEnabled()) logger.debug(tblName + " contains a total of " + totalEntries + " KeyValues in all HFiles.");
       if (logger.isDebugEnabled()) logger.debug("Based on a sample, it is estimated that " + estimatedTotalPuts +
                    " of these KeyValues are of type Put.");
-      if (putKVsSampled + nullCount > 0)
+      if (putKVsSampled + missingKVsCount > 0)
         if (logger.isDebugEnabled()) logger.debug("Sampling indicates a null incidence of " + 
-                     (nullCount * 100)/(putKVsSampled + nullCount) +
+                     (missingKVsCount * 100)/(putKVsSampled + missingKVsCount) +
                      " percent.");
       if (logger.isDebugEnabled()) logger.debug("Estimated number of actual values (including nulls) is " + estimatedEntries);
       if (logger.isDebugEnabled()) logger.debug("Estimated row count in HFiles = " + estimatedEntries +
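
A short, standalone Java sketch of the unsigned lexicographic comparison that the new isQualifierLessThanOrEqual method performs may help in following the qualifier ordering described in the commit comment above. The class and method below are illustrative only (they are not part of the commit) and assume the varying-length, little-endian qualifier encoding that the comment describes.

    // Illustrative sketch only; not part of the commit.
    public class QualifierOrderSketch {

        // Compare two byte strings as unsigned, lexicographically ordered values.
        // If one is a prefix of the other, the shorter string sorts first, matching
        // the tie-break in isQualifierLessThanOrEqual.
        static int compareUnsignedLex(byte[] a, byte[] b) {
            int minLen = Math.min(a.length, b.length);
            for (int i = 0; i < minLen; i++) {
                int ai = a[i] & 0xFF;   // mask to treat the byte as unsigned
                int bi = b[i] & 0xFF;
                if (ai != bi)
                    return ai - bi;
            }
            return a.length - b.length;
        }

        public static void main(String[] args) {
            byte[] q1   = { 0x01 };        // column 1   (x'01')
            byte[] q257 = { 0x01, 0x01 };  // column 257 (x'0101', little-endian)
            byte[] q2   = { 0x02 };        // column 2   (x'02')

            // Within a single row the scanner returns qualifiers in ascending
            // lexicographic order: 1, 257, 2, 258, ... A qualifier that compares
            // less than or equal to the previous one signals a row boundary.
            System.out.println(compareUnsignedLex(q1, q257) < 0);  // true
            System.out.println(compareUnsignedLex(q257, q2) < 0);  // true
            System.out.println(compareUnsignedLex(q2, q1) <= 0);   // false: no row boundary
        }
    }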


[2/2] incubator-trafodion git commit: Merge [TRAFODION-1618] PR 229 Fix row est to scale past 255 columns

Posted by db...@apache.org.
Merge [TRAFODION-1618] PR 229 Fix row est to scale past 255 columns


Project: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/commit/55a5d00c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/tree/55a5d00c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/diff/55a5d00c

Branch: refs/heads/master
Commit: 55a5d00c6ac64e46da4ccc6bff382bdcdaee7c12
Parents: b2b3e6b 1ba7bd2
Author: Dave Birdsall <db...@apache.org>
Authored: Fri Dec 18 17:18:00 2015 +0000
Committer: Dave Birdsall <db...@apache.org>
Committed: Fri Dec 18 17:18:00 2015 +0000

----------------------------------------------------------------------
 .../java/org/trafodion/sql/HBaseClient.java     | 148 ++++++++++++++-----
 1 file changed, 114 insertions(+), 34 deletions(-)
----------------------------------------------------------------------
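
For readers working through the estimation formulas shown in the [1/2] diff above, here is a rough worked example with made-up numbers. It is not part of either commit; the inputs are hypothetical, and the final division by the column count follows the method's leading comment rather than code visible in the diff.

    // Illustrative sketch only, with made-up inputs; not part of the commit.
    public class RowEstimateSketch {
        public static void main(String[] args) {
            long totalEntries     = 1000000L; // KeyValues across all HFiles (from trailer blocks)
            long putKVsSampled    = 4500L;    // Put KeyValues seen in the sample
            long nonPutKVsSampled = 500L;     // e.g. delete markers seen in the sample
            long missingKVsCount  = 1500L;    // KeyValues absent due to nulls or unmaterialized defaults
            int  numCols          = 12;       // columns in the table

            // Scale the sampled Put fraction up to the full file set.
            long estimatedTotalPuts =
                (putKVsSampled * totalEntries) / (putKVsSampled + nonPutKVsSampled);

            // Inflate by the sampled incidence of missing KeyValues.
            long estimatedEntries =
                ((putKVsSampled + missingKVsCount) * estimatedTotalPuts) / putKVsSampled;

            // Per the method's leading comment, the row estimate divides by the column count.
            long estimatedRows = estimatedEntries / numCols;

            System.out.println("estimatedTotalPuts = " + estimatedTotalPuts); // 900000
            System.out.println("estimatedEntries   = " + estimatedEntries);   // 1200000
            System.out.println("estimatedRows      = " + estimatedRows);      // 100000
        }
    }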