Posted to commits@hive.apache.org by nz...@apache.org on 2011/06/03 18:45:40 UTC
svn commit: r1131106 [1/11] - in /hive/trunk:
common/src/java/org/apache/hadoop/hive/conf/
contrib/src/java/org/apache/hadoop/hive/contrib/serde2/
contrib/src/java/org/apache/hadoop/hive/contrib/serde2/s3/
hbase-handler/src/java/org/apache/hadoop/hive/...
Author: nzhang
Date: Fri Jun 3 16:45:37 2011
New Revision: 1131106
URL: http://svn.apache.org/viewvc?rev=1131106&view=rev
Log:
HIVE-2185. extend table statistics to store the size of uncompressed data (+extend interfaces for collecting other types of statistics) (Tomasz Nykiel via Ning Zhang)
Added:
hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsUtils.java
hive/trunk/hbase-handler/src/test/queries/hbase_stats2.q
hive/trunk/hbase-handler/src/test/results/hbase_stats2.q.out
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/jdbc/JDBCStatsUtils.java
hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/TestStatsPublisherEnhanced.java
hive/trunk/ql/src/test/queries/clientpositive/stats14.q
hive/trunk/ql/src/test/queries/clientpositive/stats15.q
hive/trunk/ql/src/test/results/clientpositive/stats14.q.out
hive/trunk/ql/src/test/results/clientpositive/stats15.q.out
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/SerDeStats.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/SerDeStatsStruct.java
hive/trunk/serde/src/test/org/apache/hadoop/hive/serde2/TestStatsSerde.java
Modified:
hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java
hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/TypedBytesSerDe.java
hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/s3/S3LogDeserializer.java
hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseSerDe.java
hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsAggregator.java
hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsPublisher.java
hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsSetupConstants.java
hive/trunk/hbase-handler/src/test/queries/hbase_stats.q
hive/trunk/hbase-handler/src/test/results/hbase_stats.q.out
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Stat.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsAggregator.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsPublisher.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsSetupConst.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/jdbc/JDBCStatsAggregator.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/jdbc/JDBCStatsPublisher.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/jdbc/JDBCStatsSetupConstants.java
hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/TestStatsPublisher.java
hive/trunk/ql/src/test/org/apache/hadoop/hive/serde2/TestSerDe.java
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin1.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin2.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin3.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin4.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin5.q.out
hive/trunk/ql/src/test/results/clientpositive/combine2.q.out
hive/trunk/ql/src/test/results/clientpositive/filter_join_breaktask.q.out
hive/trunk/ql/src/test/results/clientpositive/join_map_ppr.q.out
hive/trunk/ql/src/test/results/clientpositive/merge3.q.out
hive/trunk/ql/src/test/results/clientpositive/merge4.q.out
hive/trunk/ql/src/test/results/clientpositive/pcr.q.out
hive/trunk/ql/src/test/results/clientpositive/sample10.q.out
hive/trunk/ql/src/test/results/clientpositive/stats11.q.out
hive/trunk/ql/src/test/results/clientpositive/union22.q.out
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/Deserializer.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/MetadataTypedColumnsetSerDe.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/Serializer.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/TypedSerDe.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableSerDe.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/dynamic_type/DynamicSerDe.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinarySerDe.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/LazyBinaryStruct.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/thrift/ThriftDeserializer.java
Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1131106&r1=1131105&r2=1131106&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Fri Jun 3 16:45:37 2011
@@ -395,6 +395,8 @@ public class HiveConf extends Configurat
0), // maximum # of retries to insert/select/delete the stats DB
HIVE_STATS_RETRIES_WAIT("hive.stats.retries.wait",
3000), // # milliseconds to wait before the next retry
+ HIVE_STATS_COLLECT_RAWDATASIZE("hive.stats.collect.rawdatasize", true),
+ // should the raw data size be collected when analyzing tables
// Concurrency
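
The new flag gates collection of the uncompressed data size alongside the existing row count. A minimal sketch of how a caller could read it, assuming HiveConf's usual getBoolVar accessor and the enum constant introduced above:

import org.apache.hadoop.hive.conf.HiveConf;

public class RawDataSizeFlagExample {
  public static void main(String[] args) {
    HiveConf conf = new HiveConf();
    // defaults to true, so rawDataSize is collected during inserts and
    // ANALYZE unless hive.stats.collect.rawdatasize is set to false
    boolean collectRaw =
        conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_COLLECT_RAWDATASIZE);
    System.out.println("collect rawDataSize: " + collectRaw);
  }
}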
Modified: hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java
URL: http://svn.apache.org/viewvc/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java?rev=1131106&r1=1131105&r2=1131106&view=diff
==============================================================================
--- hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java (original)
+++ hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java Fri Jun 3 16:45:37 2011
@@ -31,6 +31,7 @@ import org.apache.hadoop.conf.Configurat
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
@@ -45,25 +46,25 @@ import org.apache.hadoop.io.Writable;
/**
* RegexSerDe uses regular expression (regex) to serialize/deserialize.
- *
+ *
* It can deserialize the data using regex and extracts groups as columns. It
* can also serialize the row object using a format string.
- *
+ *
* In deserialization stage, if a row does not match the regex, then all columns
* in the row will be NULL. If a row matches the regex but has less than
* expected groups, the missing groups will be NULL. If a row matches the regex
* but has more than expected groups, the additional groups are just ignored.
- *
+ *
* In serialization stage, it uses java string formatter to format the columns
* into a row. If the output type of the column in a query is not a string, it
* will be automatically converted to String by Hive.
- *
+ *
* For the format of the format String, please refer to {@link http
* ://java.sun.com/j2se/1.5.0/docs/api/java/util/Formatter.html#syntax}
- *
+ *
* NOTE: Obviously, all columns have to be strings. Users can use
* "CAST(a AS INT)" to convert columns to other types.
- *
+ *
* NOTE: This implementation is using String, and javaStringObjectInspector. A
* more efficient implementation should use UTF-8 encoded Text and
* writableStringObjectInspector. We should switch to that when we have a UTF-8
@@ -257,4 +258,9 @@ public class RegexSerDe implements SerDe
return outputRowText;
}
+ public SerDeStats getSerDeStats() {
+ // no support for statistics
+ return null;
+ }
+
}
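
This getSerDeStats() stub recurs in every SerDe touched by the patch: implementations that do not track statistics simply return null, while the lazy SerDes modified further down record the uncompressed bytes they process. A hypothetical sketch of the stats-aware variant, assuming SerDeStats exposes a setRawDataSize(long) setter (the shape the lazy SerDes rely on):

import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.io.Text;

public class StatsAwareSerDeSketch {
  private final SerDeStats stats = new SerDeStats();
  private long serializedSize; // uncompressed bytes of the last row written

  public Text serialize(String row) {
    Text out = new Text(row);          // stand-in for real serialization
    serializedSize = out.getLength();  // record the raw, pre-compression size
    return out;
  }

  public SerDeStats getSerDeStats() {
    stats.setRawDataSize(serializedSize); // assumed setter on the new class
    return stats;
  }
}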
Modified: hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/TypedBytesSerDe.java
URL: http://svn.apache.org/viewvc/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/TypedBytesSerDe.java?rev=1131106&r1=1131105&r2=1131106&view=diff
==============================================================================
--- hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/TypedBytesSerDe.java (original)
+++ hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/TypedBytesSerDe.java Fri Jun 3 16:45:37 2011
@@ -34,16 +34,17 @@ import org.apache.hadoop.hive.ql.io.NonS
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
@@ -380,4 +381,9 @@ public class TypedBytesSerDe implements
}
}
}
+
+ public SerDeStats getSerDeStats() {
+ // no support for statistics
+ return null;
+ }
}
Modified: hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/s3/S3LogDeserializer.java
URL: http://svn.apache.org/viewvc/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/s3/S3LogDeserializer.java?rev=1131106&r1=1131105&r2=1131106&view=diff
==============================================================================
--- hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/s3/S3LogDeserializer.java (original)
+++ hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/s3/S3LogDeserializer.java Fri Jun 3 16:45:37 2011
@@ -29,6 +29,7 @@ import org.apache.commons.logging.LogFac
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ReflectionStructObjectInspector;
@@ -201,4 +202,9 @@ public class S3LogDeserializer implement
}
+ public SerDeStats getSerDeStats() {
+ // no support for statistics
+ return null;
+ }
+
}
Modified: hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseSerDe.java
URL: http://svn.apache.org/viewvc/hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseSerDe.java?rev=1131106&r1=1131105&r2=1131106&view=diff
==============================================================================
--- hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseSerDe.java (original)
+++ hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseSerDe.java Fri Jun 3 16:45:37 2011
@@ -34,19 +34,20 @@ import org.apache.hadoop.hive.serde.Cons
import org.apache.hadoop.hive.serde2.ByteStream;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.lazy.LazyFactory;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
-import org.apache.hadoop.hive.serde2.lazy.LazyUtils;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.SerDeParameters;
+import org.apache.hadoop.hive.serde2.lazy.LazyUtils;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
@@ -589,4 +590,9 @@ public class HBaseSerDe implements SerDe
int getKeyColumnOffset() {
return iKey;
}
+
+ public SerDeStats getSerDeStats() {
+ // no support for statistics
+ return null;
+ }
}
Modified: hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsAggregator.java
URL: http://svn.apache.org/viewvc/hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsAggregator.java?rev=1131106&r1=1131105&r2=1131106&view=diff
==============================================================================
--- hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsAggregator.java (original)
+++ hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsAggregator.java Fri Jun 3 16:45:37 2011
@@ -19,26 +19,20 @@
package org.apache.hadoop.hive.hbase;
import java.io.IOException;
-import java.sql.SQLException;
import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Iterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
-import org.apache.hadoop.hbase.client.HBaseAdmin;
-import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Delete;
+import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
-import org.apache.hadoop.hbase.client.Get;
+import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.PrefixFilter;
-import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.ql.stats.StatsAggregator;
-import org.apache.hadoop.hive.ql.stats.StatsSetupConst;
/**
@@ -47,7 +41,6 @@ import org.apache.hadoop.hive.ql.stats.S
public class HBaseStatsAggregator implements StatsAggregator {
private HTable htable;
- private byte[] rowCountFamily, rowCountColumn;
private final Log LOG = LogFactory.getLog(this.getClass().getName());
/**
@@ -57,17 +50,13 @@ public class HBaseStatsAggregator implem
try {
HBaseConfiguration hbaseConf = new HBaseConfiguration(hiveconf);
- HBaseAdmin hbase = new HBaseAdmin(hbaseConf);
-
- rowCountFamily = Bytes.toBytes(HBaseStatsSetupConstants.PART_STAT_ROW_COUNT_COLUMN_FAMILY);
- rowCountColumn = Bytes.toBytes(HBaseStatsSetupConstants.PART_STAT_ROW_COUNT_COLUMN_NAME);
- htable = new HTable(HBaseStatsSetupConstants.PART_STAT_TABLE_NAME);
+ htable = new HTable(hbaseConf, HBaseStatsSetupConstants.PART_STAT_TABLE_NAME);
return true;
} catch (IOException e) {
LOG.error("Error during HBase connection. ", e);
return false;
- }
+ }
}
/**
@@ -75,41 +64,34 @@ public class HBaseStatsAggregator implem
*/
public String aggregateStats(String rowID, String key) {
- if (key != StatsSetupConst.ROW_COUNT) {
- LOG.warn("Warning. Invalid statistic. Currently " +
- "row count is the only supported statistic");
+ byte[] family, column;
+ if (!HBaseStatsUtils.isValidStatistic(key)) {
+ LOG.warn("Warning. Invalid statistic: " + key + ", supported stats: " +
+ HBaseStatsUtils.getSupportedStatistics());
return null;
}
- int retValue = 0;
+ family = HBaseStatsUtils.getFamilyName();
+ column = HBaseStatsUtils.getColumnName(key);
+
try {
+
+ long retValue = 0;
Scan scan = new Scan();
- scan.addColumn(rowCountFamily, rowCountColumn);
+ scan.addColumn(family, column);
// Filter the row by its ID
// The complete key is "tableName/PartSpecs/jobID/taskID"
- // This is a prefix filter, the prefix is "tableName/PartSpecs/JobID", i.e. the taskID is ignored
- // In SQL, this is equivalent to "Select * FROM tableName where ID LIKE 'tableName/PartSpecs/JobID%';"
+ // This is a prefix filter, the prefix is "tableName/PartSpecs/JobID", i.e. the taskID is
+ // ignored. In SQL, this is equivalent to
+ // "Select * FROM tableName where ID LIKE 'tableName/PartSpecs/JobID%';"
PrefixFilter filter = new PrefixFilter(Bytes.toBytes(rowID));
scan.setFilter(filter);
ResultScanner scanner = htable.getScanner(scan);
- ArrayList<Delete> toDelete = new ArrayList<Delete>();
- for (Result result: scanner) {
- retValue += Integer.parseInt(Bytes.toString(result.getValue(rowCountFamily, rowCountColumn)));
- /* Automatic Cleaning:
- IMPORTANT: Since we publish and aggregate only 1 value (1 column) which is the row count, it
- is valid to delete the row after aggregation (automatic cleaning) because we know that there is no
- other values to aggregate.
- If ;in the future; other values are aggregated and published, then we cannot do cleaning except
- when we are sure that all values are aggregated, or we can separate the implementation of cleaning
- through a separate method which the developer has to call it manually in the code.
- */
- Delete delete = new Delete(result.getRow());
- toDelete.add(delete);
+ for (Result result : scanner) {
+ retValue += Long.parseLong(Bytes.toString(result.getValue(family, column)));
}
- htable.delete(toDelete);
-
- return Integer.toString(retValue);
+ return Long.toString(retValue);
} catch (IOException e) {
LOG.error("Error during publishing aggregation. ", e);
return null;
@@ -131,17 +113,15 @@ public class HBaseStatsAggregator implem
scan.setFilter(filter);
ResultScanner scanner = htable.getScanner(scan);
ArrayList<Delete> toDelete = new ArrayList<Delete>();
- for (Result result: scanner) {
- Delete delete = new Delete(result.getRow());
+ for (Result result : scanner) {
+ Delete delete = new Delete(result.getRow());
toDelete.add(delete);
}
- htable.delete(toDelete);
+ htable.delete(toDelete);
return true;
- }
- catch (IOException e) {
+ } catch (IOException e) {
LOG.error("Error during publishing aggregation. ", e);
return false;
}
}
-
-}
\ No newline at end of file
+}
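
The rewritten aggregateStats() is now generic over statistic types: it scans every HBase row whose ID begins with "tableName/PartSpecs/jobID" (the taskID suffix is deliberately excluded from the prefix) and sums the Long values stored in the requested stat's column. An illustrative caller with a made-up row-ID prefix, following the StatsAggregator interface as used in this patch:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.hbase.HBaseStatsAggregator;
import org.apache.hadoop.hive.ql.stats.StatsSetupConst;

public class AggregatorSketch {
  public static void main(String[] args) {
    HBaseStatsAggregator agg = new HBaseStatsAggregator();
    if (agg.connect(new Configuration())) {
      // one aggregateStats() call per stat type; each returns the sum over
      // all task rows for this job, or null on an invalid stat / IO error
      String rows = agg.aggregateStats("stats_part/ds=2010-04-08/job_1",
          StatsSetupConst.ROW_COUNT);
      String rawSize = agg.aggregateStats("stats_part/ds=2010-04-08/job_1",
          StatsSetupConst.RAW_DATA_SIZE);
      System.out.println(rows + " rows, " + rawSize + " raw bytes");
    }
  }
}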
Modified: hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsPublisher.java
URL: http://svn.apache.org/viewvc/hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsPublisher.java?rev=1131106&r1=1131105&r2=1131106&view=diff
==============================================================================
--- hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsPublisher.java (original)
+++ hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsPublisher.java Fri Jun 3 16:45:37 2011
@@ -19,6 +19,7 @@
package org.apache.hadoop.hive.hbase;
import java.io.IOException;
+import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -31,9 +32,8 @@ import org.apache.hadoop.hbase.client.HB
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
-import org.apache.hadoop.hbase.client.RowLock;
import org.apache.hadoop.hbase.util.Bytes;
-import org.apache.hadoop.hive.ql.stats.*;
+import org.apache.hadoop.hive.ql.stats.StatsPublisher;
/**
* A class that implements the StatsPublisher interface through HBase.
@@ -41,7 +41,6 @@ import org.apache.hadoop.hive.ql.stats.*
public class HBaseStatsPublisher implements StatsPublisher {
private HTable htable;
- private byte[] rowCountFamily, rowCountColumn;
private final Log LOG = LogFactory.getLog(this.getClass().getName());
/**
@@ -51,10 +50,7 @@ public class HBaseStatsPublisher impleme
try {
HBaseConfiguration hbaseConf = new HBaseConfiguration(hiveconf);
- HBaseAdmin hbase = new HBaseAdmin(hbaseConf);
- rowCountFamily = Bytes.toBytes(HBaseStatsSetupConstants.PART_STAT_ROW_COUNT_COLUMN_FAMILY);
- rowCountColumn = Bytes.toBytes(HBaseStatsSetupConstants.PART_STAT_ROW_COUNT_COLUMN_NAME);
- htable = new HTable(HBaseStatsSetupConstants.PART_STAT_TABLE_NAME);
+ htable = new HTable(hbaseConf, HBaseStatsSetupConstants.PART_STAT_TABLE_NAME);
// for performance reason, defer update until the closeConnection
htable.setAutoFlush(false);
} catch (IOException e) {
@@ -68,28 +64,47 @@ public class HBaseStatsPublisher impleme
/**
* Writes temporary statistics into HBase;
*/
- public boolean publishStat(String rowID, String key, String value) {
+ public boolean publishStat(String rowID, Map<String, String> stats) {
+
+ // Write in HBase
- if (key != StatsSetupConst.ROW_COUNT) {
- LOG.warn("Warning. Invalid statistic. Currently " +
- "row count is the only supported statistic");
+ if (!HBaseStatsUtils.isValidStatisticSet(stats.keySet())) {
+ LOG.warn("Warning. Invalid statistic: " + stats.keySet().toString()
+ + ", supported stats: "
+ + HBaseStatsUtils.getSupportedStatistics());
return false;
}
- // Write in HBase
try {
+
+ // check the basic stat (e.g., row_count)
+
Get get = new Get(Bytes.toBytes(rowID));
Result result = htable.get(get);
- int val = Integer.parseInt(value);
- int oldVal = 0;
+
+ byte[] family = HBaseStatsUtils.getFamilyName();
+ byte[] column = HBaseStatsUtils.getColumnName(HBaseStatsUtils.getBasicStat());
+
+ long val = Long.parseLong(HBaseStatsUtils.getStatFromMap(HBaseStatsUtils.getBasicStat(),
+ stats));
+ long oldVal = 0;
+
if (!result.isEmpty()) {
- oldVal = Integer.parseInt(Bytes.toString(result.getValue(rowCountFamily, rowCountColumn)));
+ oldVal = Long.parseLong(Bytes.toString(result.getValue(family, column)));
}
- if (oldVal < val) {
- Put row = new Put(Bytes.toBytes(rowID));
- row.add(rowCountFamily, rowCountColumn, Bytes.toBytes(Integer.toString(val)));
- htable.put(row);
+
+ if (oldVal >= val) {
+ return true; // we do not need to publish anything
+ }
+
+ // we need to update
+ Put row = new Put(Bytes.toBytes(rowID));
+ for (String statType : HBaseStatsUtils.getSupportedStatistics()) {
+ column = HBaseStatsUtils.getColumnName(statType);
+ row.add(family, column, Bytes.toBytes(HBaseStatsUtils.getStatFromMap(statType, stats)));
}
+
+ htable.put(row);
return true;
} catch (IOException e) {
@@ -118,16 +133,11 @@ public class HBaseStatsPublisher impleme
HBaseConfiguration hbaseConf = new HBaseConfiguration(hiveconf);
HBaseAdmin hbase = new HBaseAdmin(hbaseConf);
- rowCountFamily = Bytes.toBytes(HBaseStatsSetupConstants.PART_STAT_ROW_COUNT_COLUMN_FAMILY);
- rowCountColumn = Bytes.toBytes(HBaseStatsSetupConstants.PART_STAT_ROW_COUNT_COLUMN_NAME);
-
// Creating table if not exists
if (!hbase.tableExists(HBaseStatsSetupConstants.PART_STAT_TABLE_NAME)) {
HTableDescriptor table = new HTableDescriptor(HBaseStatsSetupConstants.PART_STAT_TABLE_NAME);
-
- HColumnDescriptor rowCount = new HColumnDescriptor(rowCountFamily);
- table.addFamily(rowCount);
-
+ HColumnDescriptor family = new HColumnDescriptor(HBaseStatsUtils.getFamilyName());
+ table.addFamily(family);
hbase.createTable(table);
}
} catch (IOException e) {
@@ -136,5 +146,5 @@ public class HBaseStatsPublisher impleme
}
return true;
- }
+ }
}
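
publishStat() now takes all statistics for a row at once, keyed by stat type; the basic stat (row count) must be present because the publisher compares it with the stored value and only overwrites when the new count is larger. An illustrative call with made-up IDs, where the key layout "tableName/PartSpecs/jobID/taskID" comes from the comments above:

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.hbase.HBaseStatsPublisher;
import org.apache.hadoop.hive.ql.stats.StatsSetupConst;

public class PublisherSketch {
  public static void main(String[] args) {
    HBaseStatsPublisher pub = new HBaseStatsPublisher();
    pub.connect(new Configuration());
    Map<String, String> stats = new HashMap<String, String>();
    stats.put(StatsSetupConst.ROW_COUNT, "500");      // basic stat, required
    stats.put(StatsSetupConst.RAW_DATA_SIZE, "5312"); // additional stat
    pub.publishStat("stats_part/ds=2010-04-08/job_1/task_01", stats);
    pub.closeConnection(); // flushes the deferred puts (autoFlush is off)
  }
}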
Modified: hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsSetupConstants.java
URL: http://svn.apache.org/viewvc/hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsSetupConstants.java?rev=1131106&r1=1131105&r2=1131106&view=diff
==============================================================================
--- hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsSetupConstants.java (original)
+++ hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsSetupConstants.java Fri Jun 3 16:45:37 2011
@@ -21,9 +21,14 @@ package org.apache.hadoop.hive.hbase;
public final class HBaseStatsSetupConstants {
public static final String PART_STAT_TABLE_NAME = "PARTITION_STAT_TBL";
-
+
+ public static final String PART_STAT_COLUMN_FAMILY = "PARTITION_STAT_FAMILY";
+
+ //supported stats
+
public static final String PART_STAT_ROW_COUNT_COLUMN_NAME = "ROW_COUNT";
-
- public static final String PART_STAT_ROW_COUNT_COLUMN_FAMILY = "ROW_COUNT_FAMILY";
+
+ public static final String PART_STAT_RAW_DATA_SIZE_COLUMN_NAME = "RAW_DATA_SIZE";
+
}
Added: hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsUtils.java?rev=1131106&view=auto
==============================================================================
--- hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsUtils.java (added)
+++ hive/trunk/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsUtils.java Fri Jun 3 16:45:37 2011
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.hbase;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hive.ql.stats.StatsSetupConst;
+
+
+
+public class HBaseStatsUtils {
+
+ private static final List<String> supportedStats = new ArrayList<String>();
+ private static final Map<String, String> columnNameMapping = new HashMap<String, String>();
+
+ static {
+ // supported statistics
+ supportedStats.add(StatsSetupConst.ROW_COUNT);
+ supportedStats.add(StatsSetupConst.RAW_DATA_SIZE);
+
+ // row count statistics
+ columnNameMapping.put(StatsSetupConst.ROW_COUNT,
+ HBaseStatsSetupConstants.PART_STAT_ROW_COUNT_COLUMN_NAME);
+
+ // raw data size
+ columnNameMapping.put(StatsSetupConst.RAW_DATA_SIZE,
+ HBaseStatsSetupConstants.PART_STAT_RAW_DATA_SIZE_COLUMN_NAME);
+
+ }
+
+ /**
+ * Returns the set of supported statistics
+ */
+ public static List<String> getSupportedStatistics() {
+ return supportedStats;
+ }
+
+ /**
+ * Retrieves the value for a particular stat from the published map.
+ *
+ * @param statType
+ * - statistic type to be retrieved from the map
+ * @param stats
+ * - stats map
+ * @return value for the given statistic as string, "0" if the statistic is not present
+ */
+ public static String getStatFromMap(String statType, Map<String, String> stats) {
+ String value = stats.get(statType);
+ if (value == null) {
+ return "0";
+ }
+ return value;
+ }
+
+ /**
+ * Check if the set to be published is within the supported statistics.
+ * It must also contain at least the basic statistics (used for comparison).
+ *
+ * @param stats
+ * - stats to be published
+ * @return true if is a valid statistic set, false otherwise
+ */
+
+ public static boolean isValidStatisticSet(Collection<String> stats) {
+ if(!stats.contains(getBasicStat())) {
+ return false;
+ }
+ for (String stat : stats) {
+ if (!supportedStats.contains(stat)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Check if a particular statistic type is supported
+ *
+ * @param statType
+ * - statistic to be published
+ * @return true if statType is supported, false otherwise
+ */
+ public static boolean isValidStatistic(String statType) {
+ return supportedStats.contains(statType);
+ }
+
+ /**
+ * Returns the HBase column where the statistics for the given type are stored.
+ *
+ * @param statType
+ * - supported statistic.
+ * @return column name for the given statistic.
+ */
+ public static byte[] getColumnName(String statType) {
+ return Bytes.toBytes(columnNameMapping.get(statType));
+ }
+
+ /**
+ * Returns the family name for stored statistics.
+ */
+ public static byte[] getFamilyName() {
+ return Bytes.toBytes(HBaseStatsSetupConstants.PART_STAT_COLUMN_FAMILY);
+ }
+
+ /**
+ * Returns the basic type of the supported statistics.
+ * It is used to determine which statistics are fresher.
+ */
+
+ public static String getBasicStat() {
+ return supportedStats.get(0);
+ }
+
+}
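
Taken together, these helpers centralize what the publisher and aggregator used to hard-code: validation of stat sets, defaulting of missing values, and the stat-to-column mapping. A few calls showing their behavior (values made up):

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.hive.hbase.HBaseStatsUtils;
import org.apache.hadoop.hive.ql.stats.StatsSetupConst;

public class StatsUtilsSketch {
  public static void main(String[] args) {
    Map<String, String> stats = new HashMap<String, String>();
    stats.put(StatsSetupConst.ROW_COUNT, "500"); // the basic stat
    // true: the basic stat is present and nothing unsupported was passed
    boolean ok = HBaseStatsUtils.isValidStatisticSet(stats.keySet());
    // "0": absent statistics default to zero rather than null
    String raw = HBaseStatsUtils.getStatFromMap(StatsSetupConst.RAW_DATA_SIZE, stats);
    // bytes of "ROW_COUNT", the HBase column for the row-count stat
    byte[] col = HBaseStatsUtils.getColumnName(StatsSetupConst.ROW_COUNT);
    System.out.println(ok + " " + raw + " " + col.length);
  }
}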
Modified: hive/trunk/hbase-handler/src/test/queries/hbase_stats.q
URL: http://svn.apache.org/viewvc/hive/trunk/hbase-handler/src/test/queries/hbase_stats.q?rev=1131106&r1=1131105&r2=1131106&view=diff
==============================================================================
--- hive/trunk/hbase-handler/src/test/queries/hbase_stats.q (original)
+++ hive/trunk/hbase-handler/src/test/queries/hbase_stats.q Fri Jun 3 16:45:37 2011
@@ -1,4 +1,6 @@
set datanucleus.cache.collections=false;
+set hive.stats.autogather=true;
+set hive.stats.atomic=false;
set hive.stats.dbclass=hbase;
@@ -7,15 +9,22 @@ insert overwrite table stats_src select
analyze table stats_src compute statistics;
desc formatted stats_src;
-create table hbase_part like srcpart;
+create table stats_part like srcpart;
-insert overwrite table hbase_part partition (ds='2010-04-08', hr = '11') select key, value from src;
-insert overwrite table hbase_part partition (ds='2010-04-08', hr = '12') select key, value from src;
+insert overwrite table stats_part partition (ds='2010-04-08', hr = '11') select key, value from src;
+insert overwrite table stats_part partition (ds='2010-04-08', hr = '12') select key, value from src;
-analyze table hbase_part partition(ds='2008-04-08', hr=11) compute statistics;
-analyze table hbase_part partition(ds='2008-04-08', hr=12) compute statistics;
+analyze table stats_part partition(ds='2010-04-08', hr='11') compute statistics;
+analyze table stats_part partition(ds='2010-04-08', hr='12') compute statistics;
-desc formatted hbase_part;
-desc formatted hbase_part partition (ds='2010-04-08', hr = '11');
-desc formatted hbase_part partition (ds='2010-04-08', hr = '12');
+insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src;
+desc formatted stats_part;
+desc formatted stats_part partition (ds='2010-04-08', hr = '11');
+desc formatted stats_part partition (ds='2010-04-08', hr = '12');
+
+analyze table stats_part partition(ds, hr) compute statistics;
+desc formatted stats_part;
+
+drop table stats_src;
+drop table stats_part;
Added: hive/trunk/hbase-handler/src/test/queries/hbase_stats2.q
URL: http://svn.apache.org/viewvc/hive/trunk/hbase-handler/src/test/queries/hbase_stats2.q?rev=1131106&view=auto
==============================================================================
--- hive/trunk/hbase-handler/src/test/queries/hbase_stats2.q (added)
+++ hive/trunk/hbase-handler/src/test/queries/hbase_stats2.q Fri Jun 3 16:45:37 2011
@@ -0,0 +1,31 @@
+set datanucleus.cache.collections=false;
+set hive.stats.autogather=true;
+set hive.stats.atomic=false;
+set hive.stats.collect.uncompressedsize=false;
+
+set hive.stats.dbclass=hbase;
+
+create table stats_src like src;
+insert overwrite table stats_src select * from src;
+analyze table stats_src compute statistics;
+desc formatted stats_src;
+
+create table stats_part like srcpart;
+
+insert overwrite table stats_part partition (ds='2010-04-08', hr = '11') select key, value from src;
+insert overwrite table stats_part partition (ds='2010-04-08', hr = '12') select key, value from src;
+
+analyze table stats_part partition(ds='2010-04-08', hr='11') compute statistics;
+analyze table stats_part partition(ds='2010-04-08', hr='12') compute statistics;
+
+insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src;
+
+desc formatted stats_part;
+desc formatted stats_part partition (ds='2010-04-08', hr = '11');
+desc formatted stats_part partition (ds='2010-04-08', hr = '12');
+
+analyze table stats_part partition(ds, hr) compute statistics;
+desc formatted stats_part;
+
+drop table stats_src;
+drop table stats_part;
Modified: hive/trunk/hbase-handler/src/test/results/hbase_stats.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/hbase-handler/src/test/results/hbase_stats.q.out?rev=1131106&r1=1131105&r2=1131106&view=diff
==============================================================================
--- hive/trunk/hbase-handler/src/test/results/hbase_stats.q.out (original)
+++ hive/trunk/hbase-handler/src/test/results/hbase_stats.q.out Fri Jun 3 16:45:37 2011
@@ -37,14 +37,19 @@ value string
# Detailed Table Information
Database: default
Owner: null
-CreateTime: Tue Jan 25 14:48:20 PST 2011
+CreateTime: Wed Jun 01 18:17:56 PDT 2011
LastAccessTime: UNKNOWN
Protect Mode: None
Retention: 0
-Location: pfile:/data/users/jsichi/open/hive-trunk/build/hbase-handler/test/data/warehouse/stats_src
+Location: pfile:/data/users/tomasz/apache-hive/build/hbase-handler/test/data/warehouse/stats_src
Table Type: MANAGED_TABLE
Table Parameters:
- transient_lastDdlTime 1295995714
+ numFiles 1
+ numPartitions 0
+ numRows 500
+ rawDataSize 5312
+ totalSize 5812
+ transient_lastDdlTime 1306977484
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -56,75 +61,97 @@ Bucket Columns: []
Sort Columns: []
Storage Desc Params:
serialization.format 1
-PREHOOK: query: create table hbase_part like srcpart
+PREHOOK: query: create table stats_part like srcpart
PREHOOK: type: CREATETABLE
-POSTHOOK: query: create table hbase_part like srcpart
+POSTHOOK: query: create table stats_part like srcpart
POSTHOOK: type: CREATETABLE
-POSTHOOK: Output: default@hbase_part
+POSTHOOK: Output: default@stats_part
POSTHOOK: Lineage: stats_src.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: stats_src.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
-PREHOOK: query: insert overwrite table hbase_part partition (ds='2010-04-08', hr = '11') select key, value from src
+PREHOOK: query: insert overwrite table stats_part partition (ds='2010-04-08', hr = '11') select key, value from src
PREHOOK: type: QUERY
PREHOOK: Input: default@src
-PREHOOK: Output: default@hbase_part@ds=2010-04-08/hr=11
-POSTHOOK: query: insert overwrite table hbase_part partition (ds='2010-04-08', hr = '11') select key, value from src
+PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=11
+POSTHOOK: query: insert overwrite table stats_part partition (ds='2010-04-08', hr = '11') select key, value from src
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
-POSTHOOK: Output: default@hbase_part@ds=2010-04-08/hr=11
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=11
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
POSTHOOK: Lineage: stats_src.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: stats_src.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
-PREHOOK: query: insert overwrite table hbase_part partition (ds='2010-04-08', hr = '12') select key, value from src
+PREHOOK: query: insert overwrite table stats_part partition (ds='2010-04-08', hr = '12') select key, value from src
PREHOOK: type: QUERY
PREHOOK: Input: default@src
-PREHOOK: Output: default@hbase_part@ds=2010-04-08/hr=12
-POSTHOOK: query: insert overwrite table hbase_part partition (ds='2010-04-08', hr = '12') select key, value from src
+PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=12
+POSTHOOK: query: insert overwrite table stats_part partition (ds='2010-04-08', hr = '12') select key, value from src
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
-POSTHOOK: Output: default@hbase_part@ds=2010-04-08/hr=12
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=12
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
POSTHOOK: Lineage: stats_src.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: stats_src.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
-PREHOOK: query: analyze table hbase_part partition(ds='2008-04-08', hr=11) compute statistics
+PREHOOK: query: analyze table stats_part partition(ds='2010-04-08', hr='11') compute statistics
PREHOOK: type: QUERY
-PREHOOK: Input: default@hbase_part@ds=2008-04-08/hr=11
-PREHOOK: Output: default@hbase_part
-POSTHOOK: query: analyze table hbase_part partition(ds='2008-04-08', hr=11) compute statistics
+PREHOOK: Input: default@stats_part@ds=2010-04-08/hr=11
+PREHOOK: Output: default@stats_part
+PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=11
+POSTHOOK: query: analyze table stats_part partition(ds='2010-04-08', hr='11') compute statistics
POSTHOOK: type: QUERY
-POSTHOOK: Input: default@hbase_part@ds=2008-04-08/hr=11
-POSTHOOK: Output: default@hbase_part
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Input: default@stats_part@ds=2010-04-08/hr=11
+POSTHOOK: Output: default@stats_part
+POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=11
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
POSTHOOK: Lineage: stats_src.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: stats_src.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
-PREHOOK: query: analyze table hbase_part partition(ds='2008-04-08', hr=12) compute statistics
+PREHOOK: query: analyze table stats_part partition(ds='2010-04-08', hr='12') compute statistics
PREHOOK: type: QUERY
-PREHOOK: Input: default@hbase_part@ds=2008-04-08/hr=12
-PREHOOK: Output: default@hbase_part
-POSTHOOK: query: analyze table hbase_part partition(ds='2008-04-08', hr=12) compute statistics
+PREHOOK: Input: default@stats_part@ds=2010-04-08/hr=12
+PREHOOK: Output: default@stats_part
+PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=12
+POSTHOOK: query: analyze table stats_part partition(ds='2010-04-08', hr='12') compute statistics
POSTHOOK: type: QUERY
-POSTHOOK: Input: default@hbase_part@ds=2008-04-08/hr=12
-POSTHOOK: Output: default@hbase_part
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Input: default@stats_part@ds=2010-04-08/hr=12
+POSTHOOK: Output: default@stats_part
+POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=12
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
POSTHOOK: Lineage: stats_src.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: stats_src.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
-PREHOOK: query: desc formatted hbase_part
+PREHOOK: query: insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=13
+POSTHOOK: query: insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_src.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_src.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: desc formatted stats_part
PREHOOK: type: DESCTABLE
-POSTHOOK: query: desc formatted hbase_part
+POSTHOOK: query: desc formatted stats_part
POSTHOOK: type: DESCTABLE
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
POSTHOOK: Lineage: stats_src.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: stats_src.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
# col_name data_type comment
@@ -141,14 +168,19 @@ hr string
# Detailed Table Information
Database: default
Owner: null
-CreateTime: Tue Jan 25 14:49:11 PST 2011
+CreateTime: Wed Jun 01 18:18:04 PDT 2011
LastAccessTime: UNKNOWN
Protect Mode: None
Retention: 0
-Location: pfile:/data/users/jsichi/open/hive-trunk/build/hbase-handler/test/data/warehouse/hbase_part
+Location: pfile:/data/users/tomasz/apache-hive/build/hbase-handler/test/data/warehouse/stats_part
Table Type: MANAGED_TABLE
Table Parameters:
- transient_lastDdlTime 1295995751
+ numFiles 3
+ numPartitions 3
+ numRows 1500
+ rawDataSize 15936
+ totalSize 17436
+ transient_lastDdlTime 1306977503
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -160,14 +192,16 @@ Bucket Columns: []
Sort Columns: []
Storage Desc Params:
serialization.format 1
-PREHOOK: query: desc formatted hbase_part partition (ds='2010-04-08', hr = '11')
+PREHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '11')
PREHOOK: type: DESCTABLE
-POSTHOOK: query: desc formatted hbase_part partition (ds='2010-04-08', hr = '11')
+POSTHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '11')
POSTHOOK: type: DESCTABLE
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
POSTHOOK: Lineage: stats_src.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: stats_src.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
# col_name data_type comment
@@ -184,13 +218,17 @@ hr string
# Detailed Partition Information
Partition Value: [2010-04-08, 11]
Database: default
-Table: hbase_part
-CreateTime: Tue Jan 25 14:49:26 PST 2011
+Table: stats_part
+CreateTime: Wed Jun 01 18:18:08 PDT 2011
LastAccessTime: UNKNOWN
Protect Mode: None
-Location: pfile:/data/users/jsichi/open/hive-trunk/build/hbase-handler/test/data/warehouse/hbase_part/ds=2010-04-08/hr=11
+Location: pfile:/data/users/tomasz/apache-hive/build/hbase-handler/test/data/warehouse/stats_part/ds=2010-04-08/hr=11
Partition Parameters:
- transient_lastDdlTime 1295995766
+ numFiles 1
+ numRows 500
+ rawDataSize 5312
+ totalSize 5812
+ transient_lastDdlTime 1306977496
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -202,14 +240,16 @@ Bucket Columns: []
Sort Columns: []
Storage Desc Params:
serialization.format 1
-PREHOOK: query: desc formatted hbase_part partition (ds='2010-04-08', hr = '12')
+PREHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '12')
PREHOOK: type: DESCTABLE
-POSTHOOK: query: desc formatted hbase_part partition (ds='2010-04-08', hr = '12')
+POSTHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '12')
POSTHOOK: type: DESCTABLE
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
-POSTHOOK: Lineage: hbase_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
POSTHOOK: Lineage: stats_src.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: stats_src.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
# col_name data_type comment
@@ -226,13 +266,93 @@ hr string
# Detailed Partition Information
Partition Value: [2010-04-08, 12]
Database: default
-Table: hbase_part
-CreateTime: Tue Jan 25 14:49:52 PST 2011
+Table: stats_part
+CreateTime: Wed Jun 01 18:18:12 PDT 2011
LastAccessTime: UNKNOWN
Protect Mode: None
-Location: pfile:/data/users/jsichi/open/hive-trunk/build/hbase-handler/test/data/warehouse/hbase_part/ds=2010-04-08/hr=12
+Location: pfile:/data/users/tomasz/apache-hive/build/hbase-handler/test/data/warehouse/stats_part/ds=2010-04-08/hr=12
Partition Parameters:
- transient_lastDdlTime 1295995792
+ numFiles 1
+ numRows 500
+ rawDataSize 5312
+ totalSize 5812
+ transient_lastDdlTime 1306977499
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+InputFormat: org.apache.hadoop.mapred.TextInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1
+PREHOOK: query: analyze table stats_part partition(ds, hr) compute statistics
+PREHOOK: type: QUERY
+PREHOOK: Input: default@stats_part@ds=2010-04-08/hr=11
+PREHOOK: Input: default@stats_part@ds=2010-04-08/hr=12
+PREHOOK: Input: default@stats_part@ds=2010-04-08/hr=13
+PREHOOK: Output: default@stats_part
+PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=11
+PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=12
+PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=13
+POSTHOOK: query: analyze table stats_part partition(ds, hr) compute statistics
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@stats_part@ds=2010-04-08/hr=11
+POSTHOOK: Input: default@stats_part@ds=2010-04-08/hr=12
+POSTHOOK: Input: default@stats_part@ds=2010-04-08/hr=13
+POSTHOOK: Output: default@stats_part
+POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=11
+POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=12
+POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_src.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_src.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: desc formatted stats_part
+PREHOOK: type: DESCTABLE
+POSTHOOK: query: desc formatted stats_part
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_src.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_src.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+# col_name data_type comment
+
+key string default
+value string default
+
+# Partition Information
+# col_name data_type comment
+
+ds string None
+hr string None
+
+# Detailed Table Information
+Database: default
+Owner: null
+CreateTime: Wed Jun 01 18:18:04 PDT 2011
+LastAccessTime: UNKNOWN
+Protect Mode: None
+Retention: 0
+Location: pfile:/data/users/tomasz/apache-hive/build/hbase-handler/test/data/warehouse/stats_part
+Table Type: MANAGED_TABLE
+Table Parameters:
+ numFiles 3
+ numPartitions 3
+ numRows 1500
+ rawDataSize 15936
+ totalSize 17436
+ transient_lastDdlTime 1306977508
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -244,3 +364,35 @@ Bucket Columns: []
Sort Columns: []
Storage Desc Params:
serialization.format 1
+PREHOOK: query: drop table stats_src
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@stats_src
+PREHOOK: Output: default@stats_src
+POSTHOOK: query: drop table stats_src
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@stats_src
+POSTHOOK: Output: default@stats_src
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_src.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_src.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: drop table stats_part
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@stats_part
+PREHOOK: Output: default@stats_part
+POSTHOOK: query: drop table stats_part
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@stats_part
+POSTHOOK: Output: default@stats_part
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=12).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_src.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: stats_src.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]