You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ju...@apache.org on 2016/08/15 16:25:56 UTC

arrow git commit: ARROW-245: add endianness to RecordBatch

Repository: arrow
Updated Branches:
  refs/heads/master e8724f837 -> 689cd270e


ARROW-245: add endianness to RecordBatch

Author: Julien Le Dem <ju...@dremio.com>

Closes #113 from julienledem/arrow_245_endianness and squashes the following commits:

e4cd749 [Julien Le Dem] fix linter error
c727844 [Julien Le Dem] Fix NOTICE; typo; doc wording
88aaee3 [Julien Le Dem] move endianness to Schema
e5f7355 [Julien Le Dem] clarifying big endian support
36caf3c [Julien Le Dem] autodetect endianness
7477de1 [Julien Le Dem] update Layout.md endianness; add image source file
eea3edd [Julien Le Dem] update cpp to use the new field
9b56874 [Julien Le Dem] ARROW-245: add endianness to RecordBatch


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/689cd270
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/689cd270
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/689cd270

Branch: refs/heads/master
Commit: 689cd270e923d4f3f15913843c2569b36e87c4db
Parents: e8724f8
Author: Julien Le Dem <ju...@dremio.com>
Authored: Mon Aug 15 09:25:51 2016 -0700
Committer: Julien Le Dem <ju...@dremio.com>
Committed: Mon Aug 15 09:25:51 2016 -0700

----------------------------------------------------------------------
 NOTICE.txt                             |   5 +++++
 cpp/src/arrow/ipc/metadata-internal.cc |  20 ++++++++++++++++++--
 format/Arrow.graffle                   | Bin 0 -> 3646 bytes
 format/Arrow.png                       | Bin 0 -> 86598 bytes
 format/Layout.md                       |   9 ++++++++-
 format/Message.fbs                     |  11 +++++++++++
 6 files changed, 42 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/689cd270/NOTICE.txt
----------------------------------------------------------------------
diff --git a/NOTICE.txt b/NOTICE.txt
index 0310c89..a851016 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -7,3 +7,8 @@ The Apache Software Foundation (http://www.apache.org/).
 This product includes software from the SFrame project (BSD, 3-clause).
 * Copyright (C) 2015 Dato, Inc.
 * Copyright (c) 2009 Carnegie Mellon University.
+
+This product includes software from the NumPy project (BSD-new)
+ https://github.com/numpy/numpy/blob/e1f191c46f2eebd6cb892a4bfe14d9dd43a06c4e/numpy/core/src/multiarray/multiarraymodule.c#L2910
+ * Copyright (c) 1995, 1996, 1997 Jim Hugunin, hugunin@mit.edu
+ * Copyright (c) 2005 Travis E. Oliphant oliphant@ee.byu.edu Brigham Young University

http://git-wip-us.apache.org/repos/asf/arrow/blob/689cd270/cpp/src/arrow/ipc/metadata-internal.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc
index e6b47de..1d3edf0 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -243,6 +243,17 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, std::shared_ptr<Field>*
 
 // Implement MessageBuilder
 
+// Returns the endianness of the system we are running on.
+// Based on the NUMPY_API function. See NOTICE.txt
+flatbuf::Endianness endianness() {
+  union {
+    uint32_t i;
+    char c[4];
+  } bint = {0x01020304};
+
+  return bint.c[0] == 1 ? flatbuf::Endianness_Big : flatbuf::Endianness_Little;
+}
+
 Status MessageBuilder::SetSchema(const Schema* schema) {
   header_type_ = flatbuf::MessageHeader_Schema;
 
@@ -254,7 +265,11 @@ Status MessageBuilder::SetSchema(const Schema* schema) {
     field_offsets.push_back(offset);
   }
 
-  header_ = flatbuf::CreateSchema(fbb_, fbb_.CreateVector(field_offsets)).Union();
+  header_ = flatbuf::CreateSchema(
+                fbb_,
+                endianness(),
+                fbb_.CreateVector(field_offsets))
+                .Union();
   body_length_ = 0;
   return Status::OK();
 }
@@ -263,7 +278,8 @@ Status MessageBuilder::SetRecordBatch(int32_t length, int64_t body_length,
     const std::vector<flatbuf::FieldNode>& nodes,
     const std::vector<flatbuf::Buffer>& buffers) {
   header_type_ = flatbuf::MessageHeader_RecordBatch;
-  header_ = flatbuf::CreateRecordBatch(fbb_, length, fbb_.CreateVectorOfStructs(nodes),
+  header_ = flatbuf::CreateRecordBatch(fbb_, length,
+                fbb_.CreateVectorOfStructs(nodes),
                 fbb_.CreateVectorOfStructs(buffers))
                 .Union();
   body_length_ = body_length;

http://git-wip-us.apache.org/repos/asf/arrow/blob/689cd270/format/Arrow.graffle
----------------------------------------------------------------------
diff --git a/format/Arrow.graffle b/format/Arrow.graffle
new file mode 100644
index 0000000..453e850
Binary files /dev/null and b/format/Arrow.graffle differ

http://git-wip-us.apache.org/repos/asf/arrow/blob/689cd270/format/Arrow.png
----------------------------------------------------------------------
diff --git a/format/Arrow.png b/format/Arrow.png
new file mode 100644
index 0000000..361dc82
Binary files /dev/null and b/format/Arrow.png differ

http://git-wip-us.apache.org/repos/asf/arrow/blob/689cd270/format/Layout.md
----------------------------------------------------------------------
diff --git a/format/Layout.md b/format/Layout.md
index 815c47f..5eaefee 100644
--- a/format/Layout.md
+++ b/format/Layout.md
@@ -78,7 +78,14 @@ Base requirements
 
 ## Byte Order ([Endianness][3])
 
-The Arrow format is little endian.
+The Arrow format is little endian by default.
+The Schema metadata has an endianness field indicating the endianness of RecordBatches.
+Typically this is the endianness of the system where the RecordBatch was generated.
+The main use case is exchanging RecordBatches between systems with the same endianness.
+At first we will return an error when trying to read a Schema with an endianness
+that does not match the underlying system. The reference implementation is focused on
+little endian and provides tests for it. Eventually we may provide automatic conversion
+via byte swapping.
 
 ## Alignment and Padding
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/689cd270/format/Message.fbs
----------------------------------------------------------------------
diff --git a/format/Message.fbs b/format/Message.fbs
index 6a351b9..3f688c1 100644
--- a/format/Message.fbs
+++ b/format/Message.fbs
@@ -88,9 +88,20 @@ table Field {
 }
 
 /// ----------------------------------------------------------------------
+/// Endianness of the platform that produces the RecordBatch
+
+enum Endianness:int { Little, Big }
+
+/// ----------------------------------------------------------------------
 /// A Schema describes the columns in a row batch
 
 table Schema {
+
+  /// Endianness of the buffer.
+  /// It is Little Endian by default.
+  /// If the endianness doesn't match the underlying system, the vectors need to be converted.
+  endianness: Endianness=Little;
+
   fields: [Field];
 }