You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by bl...@apache.org on 2015/07/02 01:53:39 UTC
parquet-mr git commit: PARQUET-308: Add ParquetWriter#getDataSize accessor.
Repository: parquet-mr
Updated Branches:
refs/heads/master c7720ca4c -> a747456bf
PARQUET-308: Add ParquetWriter#getDataSize accessor.
This returns the current file position plus the amount of data buffered
in the current row group as an estimate of final data size.
Author: Ryan Blue <bl...@apache.org>
Closes #212 from rdblue/PARQUET-308-add-data-size-accessor and squashes the following commits:
1c0d798 [Ryan Blue] PARQUET-308: Add ParquetWriter#getDataSize accessor.
Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/a747456b
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/a747456b
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/a747456b
Branch: refs/heads/master
Commit: a747456bfe077da467ff036172968a37f3b1e893
Parents: c7720ca
Author: Ryan Blue <bl...@apache.org>
Authored: Wed Jul 1 16:53:34 2015 -0700
Committer: Ryan Blue <bl...@apache.org>
Committed: Wed Jul 1 16:53:34 2015 -0700
----------------------------------------------------------------------
.../apache/parquet/hadoop/InternalParquetRecordWriter.java | 9 +++++++++
.../main/java/org/apache/parquet/hadoop/ParquetWriter.java | 7 +++++++
2 files changed, 16 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/a747456b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java
index d12086d..37e8db5 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java
@@ -59,6 +59,7 @@ class InternalParquetRecordWriter<T> {
private long recordCount = 0;
private long recordCountForNextMemCheck = MINIMUM_RECORD_COUNT_FOR_CHECK;
+ private long lastRowGroupEndPos = 0;
private ColumnWriteStore columnStore;
private ColumnChunkPageWriteStore pageStore;
@@ -122,6 +123,13 @@ class InternalParquetRecordWriter<T> {
checkBlockSizeReached();
}
+ /**
+ * @return the total size of data written to the file and buffered in memory
+ */
+ public long getDataSize() {
+ return lastRowGroupEndPos + columnStore.getBufferedSize();
+ }
+
private void checkBlockSizeReached() throws IOException {
if (recordCount >= recordCountForNextMemCheck) { // checking the memory size is relatively expensive, so let's not do it for every record.
long memSize = columnStore.getBufferedSize();
@@ -133,6 +141,7 @@ class InternalParquetRecordWriter<T> {
flushRowGroupToStore();
initStore();
recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
+ this.lastRowGroupEndPos = parquetFileWriter.getPos();
} else {
recordCountForNextMemCheck = min(
max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long)(nextRowGroupSize / ((float)recordSize))) / 2), // will check halfway
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/a747456b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java
index 70abdac..e3b7953 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java
@@ -301,6 +301,13 @@ public class ParquetWriter<T> implements Closeable {
}
/**
+ * @return the total size of data written to the file and buffered in memory
+ */
+ public long getDataSize() {
+ return writer.getDataSize();
+ }
+
+ /**
* An abstract builder class for ParquetWriter instances.
*
* Object models should extend this builder to provide writer configuration