You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ju...@apache.org on 2016/08/15 16:25:56 UTC
arrow git commit: ARROW-245: add endianness to RecordBatch
Repository: arrow
Updated Branches:
refs/heads/master e8724f837 -> 689cd270e
ARROW-245: add endianness to RecordBatch
Author: Julien Le Dem <ju...@dremio.com>
Closes #113 from julienledem/arrow_245_endianness and squashes the following commits:
e4cd749 [Julien Le Dem] fix linter error
c727844 [Julien Le Dem] Fix NOTICE; typo; doc wording
88aaee3 [Julien Le Dem] move endianness to Schema
e5f7355 [Julien Le Dem] clarifying big endian support
36caf3c [Julien Le Dem] autodetect endianness
7477de1 [Julien Le Dem] update Layout.md endianness; add image source file
eea3edd [Julien Le Dem] update cpp to use the new field
9b56874 [Julien Le Dem] ARROW-245: add endianness to RecordBatch
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/689cd270
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/689cd270
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/689cd270
Branch: refs/heads/master
Commit: 689cd270e923d4f3f15913843c2569b36e87c4db
Parents: e8724f8
Author: Julien Le Dem <ju...@dremio.com>
Authored: Mon Aug 15 09:25:51 2016 -0700
Committer: Julien Le Dem <ju...@dremio.com>
Committed: Mon Aug 15 09:25:51 2016 -0700
----------------------------------------------------------------------
NOTICE.txt | 5 +++++
cpp/src/arrow/ipc/metadata-internal.cc | 20 ++++++++++++++++++--
format/Arrow.graffle | Bin 0 -> 3646 bytes
format/Arrow.png | Bin 0 -> 86598 bytes
format/Layout.md | 9 ++++++++-
format/Message.fbs | 11 +++++++++++
6 files changed, 42 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/689cd270/NOTICE.txt
----------------------------------------------------------------------
diff --git a/NOTICE.txt b/NOTICE.txt
index 0310c89..a851016 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -7,3 +7,8 @@ The Apache Software Foundation (http://www.apache.org/).
This product includes software from the SFrame project (BSD, 3-clause).
* Copyright (C) 2015 Dato, Inc.
* Copyright (c) 2009 Carnegie Mellon University.
+
+This product includes software from the Numpy project (BSD-new)
+ https://github.com/numpy/numpy/blob/e1f191c46f2eebd6cb892a4bfe14d9dd43a06c4e/numpy/core/src/multiarray/multiarraymodule.c#L2910
+ * Copyright (c) 1995, 1996, 1997 Jim Hugunin, hugunin@mit.edu
+ * Copyright (c) 2005 Travis E. Oliphant oliphant@ee.byu.edu Brigham Young University
http://git-wip-us.apache.org/repos/asf/arrow/blob/689cd270/cpp/src/arrow/ipc/metadata-internal.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc
index e6b47de..1d3edf0 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -243,6 +243,17 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, std::shared_ptr<Field>*
// Implement MessageBuilder
+// Returns the endianness of the system we are running on.
+// Based on the NUMPY_API function; see NOTICE.txt.
+flatbuf::Endianness endianness() {
+ union {
+ uint32_t i;
+ char c[4];
+ } bint = {0x01020304};
+
+ return bint.c[0] == 1 ? flatbuf::Endianness_Big : flatbuf::Endianness_Little;
+}
+
Status MessageBuilder::SetSchema(const Schema* schema) {
header_type_ = flatbuf::MessageHeader_Schema;
@@ -254,7 +265,11 @@ Status MessageBuilder::SetSchema(const Schema* schema) {
field_offsets.push_back(offset);
}
- header_ = flatbuf::CreateSchema(fbb_, fbb_.CreateVector(field_offsets)).Union();
+ header_ = flatbuf::CreateSchema(
+ fbb_,
+ endianness(),
+ fbb_.CreateVector(field_offsets))
+ .Union();
body_length_ = 0;
return Status::OK();
}
@@ -263,7 +278,8 @@ Status MessageBuilder::SetRecordBatch(int32_t length, int64_t body_length,
const std::vector<flatbuf::FieldNode>& nodes,
const std::vector<flatbuf::Buffer>& buffers) {
header_type_ = flatbuf::MessageHeader_RecordBatch;
- header_ = flatbuf::CreateRecordBatch(fbb_, length, fbb_.CreateVectorOfStructs(nodes),
+ header_ = flatbuf::CreateRecordBatch(fbb_, length,
+ fbb_.CreateVectorOfStructs(nodes),
fbb_.CreateVectorOfStructs(buffers))
.Union();
body_length_ = body_length;
http://git-wip-us.apache.org/repos/asf/arrow/blob/689cd270/format/Arrow.graffle
----------------------------------------------------------------------
diff --git a/format/Arrow.graffle b/format/Arrow.graffle
new file mode 100644
index 0000000..453e850
Binary files /dev/null and b/format/Arrow.graffle differ
http://git-wip-us.apache.org/repos/asf/arrow/blob/689cd270/format/Arrow.png
----------------------------------------------------------------------
diff --git a/format/Arrow.png b/format/Arrow.png
new file mode 100644
index 0000000..361dc82
Binary files /dev/null and b/format/Arrow.png differ
http://git-wip-us.apache.org/repos/asf/arrow/blob/689cd270/format/Layout.md
----------------------------------------------------------------------
diff --git a/format/Layout.md b/format/Layout.md
index 815c47f..5eaefee 100644
--- a/format/Layout.md
+++ b/format/Layout.md
@@ -78,7 +78,14 @@ Base requirements
## Byte Order ([Endianness][3])
-The Arrow format is little endian.
+The Arrow format is little endian by default.
+The Schema metadata has an endianness field indicating endianness of RecordBatches.
+Typically this is the endianness of the system where the RecordBatch was generated.
+The main use case is exchanging RecordBatches between systems with the same Endianness.
+At first we will return an error when trying to read a Schema with an endianness
+that does not match the underlying system. The reference implementation is focused on
+Little Endian and provides tests for it. Eventually we may provide automatic conversion
+via byte swapping.
## Alignment and Padding
http://git-wip-us.apache.org/repos/asf/arrow/blob/689cd270/format/Message.fbs
----------------------------------------------------------------------
diff --git a/format/Message.fbs b/format/Message.fbs
index 6a351b9..3f688c1 100644
--- a/format/Message.fbs
+++ b/format/Message.fbs
@@ -88,9 +88,20 @@ table Field {
}
/// ----------------------------------------------------------------------
+/// Endianness of the platform that produces the RecordBatch
+
+enum Endianness:int { Little, Big }
+
+/// ----------------------------------------------------------------------
/// A Schema describes the columns in a row batch
table Schema {
+
+ /// endianness of the buffer
+ /// it is Little Endian by default
+ /// if endianness doesn't match the underlying system then the vectors need to be converted
+ endianness: Endianness=Little;
+
fields: [Field];
}