You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by bl...@apache.org on 2015/07/02 01:53:39 UTC

parquet-mr git commit: PARQUET-308: Add ParquetWriter#getDataSize accessor.

Repository: parquet-mr
Updated Branches:
  refs/heads/master c7720ca4c -> a747456bf


PARQUET-308: Add ParquetWriter#getDataSize accessor.

This returns the file position recorded at the end of the last flushed
row group plus the amount of data currently buffered for the in-progress
row group, as an estimate of the final data size.

Author: Ryan Blue <bl...@apache.org>

Closes #212 from rdblue/PARQUET-308-add-data-size-accessor and squashes the following commits:

1c0d798 [Ryan Blue] PARQUET-308: Add ParquetWriter#getDataSize accessor.


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/a747456b
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/a747456b
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/a747456b

Branch: refs/heads/master
Commit: a747456bfe077da467ff036172968a37f3b1e893
Parents: c7720ca
Author: Ryan Blue <bl...@apache.org>
Authored: Wed Jul 1 16:53:34 2015 -0700
Committer: Ryan Blue <bl...@apache.org>
Committed: Wed Jul 1 16:53:34 2015 -0700

----------------------------------------------------------------------
 .../apache/parquet/hadoop/InternalParquetRecordWriter.java  | 9 +++++++++
 .../main/java/org/apache/parquet/hadoop/ParquetWriter.java  | 7 +++++++
 2 files changed, 16 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/a747456b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java
index d12086d..37e8db5 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java
@@ -59,6 +59,7 @@ class InternalParquetRecordWriter<T> {
 
   private long recordCount = 0;
   private long recordCountForNextMemCheck = MINIMUM_RECORD_COUNT_FOR_CHECK;
+  private long lastRowGroupEndPos = 0;
 
   private ColumnWriteStore columnStore;
   private ColumnChunkPageWriteStore pageStore;
@@ -122,6 +123,13 @@ class InternalParquetRecordWriter<T> {
     checkBlockSizeReached();
   }
 
+  /**
+   * @return the total size of data written to the file and buffered in memory
+   */
+  public long getDataSize() {
+    return lastRowGroupEndPos + columnStore.getBufferedSize();
+  }
+
   private void checkBlockSizeReached() throws IOException {
     if (recordCount >= recordCountForNextMemCheck) { // checking the memory size is relatively expensive, so let's not do it for every record.
       long memSize = columnStore.getBufferedSize();
@@ -133,6 +141,7 @@ class InternalParquetRecordWriter<T> {
         flushRowGroupToStore();
         initStore();
         recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
+        this.lastRowGroupEndPos = parquetFileWriter.getPos();
       } else {
         recordCountForNextMemCheck = min(
             max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long)(nextRowGroupSize / ((float)recordSize))) / 2), // will check halfway

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/a747456b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java
index 70abdac..e3b7953 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java
@@ -301,6 +301,13 @@ public class ParquetWriter<T> implements Closeable {
   }
 
   /**
+   * @return the total size of data written to the file and buffered in memory
+   */
+  public long getDataSize() {
+    return writer.getDataSize();
+  }
+
+  /**
    * An abstract builder class for ParquetWriter instances.
    *
    * Object models should extend this builder to provide writer configuration