You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ag...@apache.org on 2021/04/18 17:00:47 UTC
[arrow-datafusion] branch master updated: Remove format dir
This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new d8904eb Remove format dir
d8904eb is described below
commit d8904eb03406dfb05e96b01e096888485302765f
Author: Andy Grove <an...@gmail.com>
AuthorDate: Sun Apr 18 11:00:35 2021 -0600
Remove format dir
---
format/File.fbs | 52 -------
format/Flight.proto | 335 ---------------------------------------
format/Message.fbs | 140 -----------------
format/README.rst | 25 ---
format/Schema.fbs | 407 ------------------------------------------------
format/SparseTensor.fbs | 228 ---------------------------
format/Tensor.fbs | 54 -------
7 files changed, 1241 deletions(-)
diff --git a/format/File.fbs b/format/File.fbs
deleted file mode 100644
index 906d494..0000000
--- a/format/File.fbs
+++ /dev/null
@@ -1,52 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-include "Schema.fbs";
-
-namespace org.apache.arrow.flatbuf;
-
-/// ----------------------------------------------------------------------
-/// Arrow File metadata
-///
-
-table Footer {
- version: org.apache.arrow.flatbuf.MetadataVersion;
-
- schema: org.apache.arrow.flatbuf.Schema;
-
- dictionaries: [ Block ];
-
- recordBatches: [ Block ];
-
- /// User-defined metadata
- custom_metadata: [ KeyValue ];
-}
-
-struct Block {
-
- /// Index to the start of the RecordBlock (note this is past the Message header)
- offset: long;
-
- /// Length of the metadata
- metaDataLength: int;
-
- /// Length of the data (this is aligned so there can be a gap between this and
- /// the metadata).
- bodyLength: long;
-}
-
-root_type Footer;
diff --git a/format/Flight.proto b/format/Flight.proto
deleted file mode 100644
index b291d9d..0000000
--- a/format/Flight.proto
+++ /dev/null
@@ -1,335 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- * <p>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-syntax = "proto3";
-
-option java_package = "org.apache.arrow.flight.impl";
-option go_package = "github.com/apache/arrow/go/flight;flight";
-option csharp_namespace = "Apache.Arrow.Flight.Protocol";
-
-package arrow.flight.protocol;
-
-/*
- * A flight service is an endpoint for retrieving or storing Arrow data. A
- * flight service can expose one or more predefined endpoints that can be
- * accessed using the Arrow Flight Protocol. Additionally, a flight service
- * can expose a set of actions that are available.
- */
-service FlightService {
-
- /*
- * Handshake between client and server. Depending on the server, the
- * handshake may be required to determine the token that should be used for
- * future operations. Both request and response are streams to allow multiple
- * round-trips depending on auth mechanism.
- */
- rpc Handshake(stream HandshakeRequest) returns (stream HandshakeResponse) {}
-
- /*
- * Get a list of available streams given a particular criteria. Most flight
- * services will expose one or more streams that are readily available for
- * retrieval. This api allows listing the streams available for
- * consumption. A user can also provide a criteria. The criteria can limit
- * the subset of streams that can be listed via this interface. Each flight
- * service allows its own definition of how to consume criteria.
- */
- rpc ListFlights(Criteria) returns (stream FlightInfo) {}
-
- /*
- * For a given FlightDescriptor, get information about how the flight can be
- * consumed. This is a useful interface if the consumer of the interface
- * already can identify the specific flight to consume. This interface can
- * also allow a consumer to generate a flight stream through a specified
- * descriptor. For example, a flight descriptor might be something that
- * includes a SQL statement or a Pickled Python operation that will be
- * executed. In those cases, the descriptor will not be previously available
- * within the list of available streams provided by ListFlights but will be
- * available for consumption for the duration defined by the specific flight
- * service.
- */
- rpc GetFlightInfo(FlightDescriptor) returns (FlightInfo) {}
-
- /*
- * For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema.
- * This is used when a consumer needs the Schema of a flight stream. Similar to
- * GetFlightInfo this interface may generate a new flight that was not previously
- * available in ListFlights.
- */
- rpc GetSchema(FlightDescriptor) returns (SchemaResult) {}
-
- /*
- * Retrieve a single stream associated with a particular descriptor
- * associated with the referenced ticket. A Flight can be composed of one or
- * more streams where each stream can be retrieved using a separate opaque
- * ticket that the flight service uses for managing a collection of streams.
- */
- rpc DoGet(Ticket) returns (stream FlightData) {}
-
- /*
- * Push a stream to the flight service associated with a particular
- * flight stream. This allows a client of a flight service to upload a stream
- * of data. Depending on the particular flight service, a client consumer
- * could be allowed to upload a single stream per descriptor or an unlimited
- * number. In the latter, the service might implement a 'seal' action that
- * can be applied to a descriptor once all streams are uploaded.
- */
- rpc DoPut(stream FlightData) returns (stream PutResult) {}
-
- /*
- * Open a bidirectional data channel for a given descriptor. This
- * allows clients to send and receive arbitrary Arrow data and
- * application-specific metadata in a single logical stream. In
- * contrast to DoGet/DoPut, this is more suited for clients
- * offloading computation (rather than storage) to a Flight service.
- */
- rpc DoExchange(stream FlightData) returns (stream FlightData) {}
-
- /*
- * Flight services can support an arbitrary number of simple actions in
- * addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut
- * operations that are potentially available. DoAction allows a flight client
- * to do a specific action against a flight service. An action includes
- * opaque request and response objects that are specific to the type of action
- * being undertaken.
- */
- rpc DoAction(Action) returns (stream Result) {}
-
- /*
- * A flight service exposes all of the available action types that it has
- * along with descriptions. This allows different flight consumers to
- * understand the capabilities of the flight service.
- */
- rpc ListActions(Empty) returns (stream ActionType) {}
-
-}
-
-/*
- * The request that a client provides to a server on handshake.
- */
-message HandshakeRequest {
-
- /*
- * A defined protocol version
- */
- uint64 protocol_version = 1;
-
- /*
- * Arbitrary auth/handshake info.
- */
- bytes payload = 2;
-}
-
-message HandshakeResponse {
-
- /*
- * A defined protocol version
- */
- uint64 protocol_version = 1;
-
- /*
- * Arbitrary auth/handshake info.
- */
- bytes payload = 2;
-}
-
-/*
- * A message for doing simple auth.
- */
-message BasicAuth {
- string username = 2;
- string password = 3;
-}
-
-message Empty {}
-
-/*
- * Describes an available action, including both the name used for execution
- * along with a short description of the purpose of the action.
- */
-message ActionType {
- string type = 1;
- string description = 2;
-}
-
-/*
- * A service specific expression that can be used to return a limited set
- * of available Arrow Flight streams.
- */
-message Criteria {
- bytes expression = 1;
-}
-
-/*
- * An opaque action specific for the service.
- */
-message Action {
- string type = 1;
- bytes body = 2;
-}
-
-/*
- * An opaque result returned after executing an action.
- */
-message Result {
- bytes body = 1;
-}
-
-/*
- * Wrap the result of a getSchema call
- */
-message SchemaResult {
- // schema of the dataset as described in Schema.fbs::Schema.
- bytes schema = 1;
-}
-
-/*
- * The name or tag for a Flight. May be used as a way to retrieve or generate
- * a flight or be used to expose a set of previously defined flights.
- */
-message FlightDescriptor {
-
- /*
- * Describes what type of descriptor is defined.
- */
- enum DescriptorType {
-
- // Protobuf pattern, not used.
- UNKNOWN = 0;
-
- /*
- * A named path that identifies a dataset. A path is composed of a string
- * or list of strings describing a particular dataset. This is conceptually
- * similar to a path inside a filesystem.
- */
- PATH = 1;
-
- /*
- * An opaque command to generate a dataset.
- */
- CMD = 2;
- }
-
- DescriptorType type = 1;
-
- /*
- * Opaque value used to express a command. Should only be defined when
- * type = CMD.
- */
- bytes cmd = 2;
-
- /*
- * List of strings identifying a particular dataset. Should only be defined
- * when type = PATH.
- */
- repeated string path = 3;
-}
-
-/*
- * The access coordinates for retrieval of a dataset. With a FlightInfo, a
- * consumer is able to determine how to retrieve a dataset.
- */
-message FlightInfo {
- // schema of the dataset as described in Schema.fbs::Schema.
- bytes schema = 1;
-
- /*
- * The descriptor associated with this info.
- */
- FlightDescriptor flight_descriptor = 2;
-
- /*
- * A list of endpoints associated with the flight. To consume the whole
- * flight, all endpoints must be consumed.
- */
- repeated FlightEndpoint endpoint = 3;
-
- // Set these to -1 if unknown.
- int64 total_records = 4;
- int64 total_bytes = 5;
-}
-
-/*
- * A particular stream or split associated with a flight.
- */
-message FlightEndpoint {
-
- /*
- * Token used to retrieve this stream.
- */
- Ticket ticket = 1;
-
- /*
- * A list of URIs where this ticket can be redeemed. If the list is
- * empty, the expectation is that the ticket can only be redeemed on the
- * current service where the ticket was generated.
- */
- repeated Location location = 2;
-}
-
-/*
- * A location where a Flight service will accept retrieval of a particular
- * stream given a ticket.
- */
-message Location {
- string uri = 1;
-}
-
-/*
- * An opaque identifier that the service can use to retrieve a particular
- * portion of a stream.
- */
-message Ticket {
- bytes ticket = 1;
-}
-
-/*
- * A batch of Arrow data as part of a stream of batches.
- */
-message FlightData {
-
- /*
- * The descriptor of the data. This is only relevant when a client is
- * starting a new DoPut stream.
- */
- FlightDescriptor flight_descriptor = 1;
-
- /*
- * Header for message data as described in Message.fbs::Message.
- */
- bytes data_header = 2;
-
- /*
- * Application-defined metadata.
- */
- bytes app_metadata = 3;
-
- /*
- * The actual batch of Arrow data. Preferably handled with minimal copies;
- * it comes last in the definition to help with sidecar patterns (it is
- * expected that some implementations will fetch this field off the wire
- * with specialized code to avoid extra memory copies).
- */
- bytes data_body = 1000;
-}
-
-/**
- * The response message associated with the submission of a DoPut.
- */
-message PutResult {
- bytes app_metadata = 1;
-}
diff --git a/format/Message.fbs b/format/Message.fbs
deleted file mode 100644
index f1c18d7..0000000
--- a/format/Message.fbs
+++ /dev/null
@@ -1,140 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-include "Schema.fbs";
-include "SparseTensor.fbs";
-include "Tensor.fbs";
-
-namespace org.apache.arrow.flatbuf;
-
-/// ----------------------------------------------------------------------
-/// Data structures for describing a table row batch (a collection of
-/// equal-length Arrow arrays)
-
-/// Metadata about a field at some level of a nested type tree (but not
-/// its children).
-///
-/// For example, a List<Int16> with values `[[1, 2, 3], null, [4], [5, 6], null]`
-/// would have {length: 5, null_count: 2} for its List node, and {length: 6,
-/// null_count: 0} for its Int16 node, as separate FieldNode structs
-struct FieldNode {
- /// The number of value slots in the Arrow array at this level of a nested
- /// tree
- length: long;
-
- /// The number of observed nulls. Fields with null_count == 0 may choose not
- /// to write their physical validity bitmap out as a materialized buffer,
- /// instead setting the length of the bitmap buffer to 0.
- null_count: long;
-}
-
-enum CompressionType:byte {
- // LZ4 frame format, for portability, as provided by lz4frame.h or wrappers
- // thereof. Not to be confused with "raw" (also called "block") format
- // provided by lz4.h
- LZ4_FRAME,
-
- // Zstandard
- ZSTD
-}
-
-/// Provided for forward compatibility in case we need to support different
-/// strategies for compressing the IPC message body (like whole-body
-/// compression rather than buffer-level) in the future
-enum BodyCompressionMethod:byte {
- /// Each constituent buffer is first compressed with the indicated
- /// compressor, and then written with the uncompressed length in the first 8
- /// bytes as a 64-bit little-endian signed integer followed by the compressed
- /// buffer bytes (and then padding as required by the protocol). The
- /// uncompressed length may be set to -1 to indicate that the data that
- /// follows is not compressed, which can be useful for cases where
- /// compression does not yield appreciable savings.
- BUFFER
-}
-
-/// Optional compression for the memory buffers constituting IPC message
-/// bodies. Intended for use with RecordBatch but could be used for other
-/// message types
-table BodyCompression {
- /// Compressor library
- codec: CompressionType = LZ4_FRAME;
-
- /// Indicates the way the record batch body was compressed
- method: BodyCompressionMethod = BUFFER;
-}
-
-/// A data header describing the shared memory layout of a "record" or "row"
-/// batch. Some systems call this a "row batch" internally and others a "record
-/// batch".
-table RecordBatch {
- /// number of records / rows. The arrays in the batch should all have this
- /// length
- length: long;
-
- /// Nodes correspond to the pre-ordered flattened logical schema
- nodes: [FieldNode];
-
- /// Buffers correspond to the pre-ordered flattened buffer tree
- ///
- /// The number of buffers appended to this list depends on the schema. For
- /// example, most primitive arrays will have 2 buffers, 1 for the validity
- /// bitmap and 1 for the values. For struct arrays, there will only be a
- /// single buffer for the validity (nulls) bitmap
- buffers: [Buffer];
-
- /// Optional compression of the message body
- compression: BodyCompression;
-}
-
-/// For sending dictionary encoding information. Any Field can be
-/// dictionary-encoded, but in this case none of its children may be
-/// dictionary-encoded.
-/// There is one vector / column per dictionary, but that vector / column
-/// may be spread across multiple dictionary batches by using the isDelta
-/// flag
-
-table DictionaryBatch {
- id: long;
- data: RecordBatch;
-
- /// If isDelta is true the values in the dictionary are to be appended to a
- /// dictionary with the indicated id. If isDelta is false this dictionary
- /// should replace the existing dictionary.
- isDelta: bool = false;
-}
-
-/// ----------------------------------------------------------------------
-/// The root Message type
-
-/// This union enables us to easily send different message types without
-/// redundant storage, and in the future we can easily add new message types.
-///
-/// Arrow implementations do not need to implement all of the message types,
-/// which may include experimental metadata types. For maximum compatibility,
-/// it is best to send data using RecordBatch
-union MessageHeader {
- Schema, DictionaryBatch, RecordBatch, Tensor, SparseTensor
-}
-
-table Message {
- version: org.apache.arrow.flatbuf.MetadataVersion;
- header: MessageHeader;
- bodyLength: long;
- custom_metadata: [ KeyValue ];
-}
-
-root_type Message;
diff --git a/format/README.rst b/format/README.rst
deleted file mode 100644
index 0eaad49..0000000
--- a/format/README.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-Arrow Protocol Files
-====================
-
-This folder contains binary protocol definitions for the Arrow columnar format
-and other parts of the project, like the Flight RPC framework.
-
-For documentation about the Arrow format, see the `docs/source/format`
-directory.
diff --git a/format/Schema.fbs b/format/Schema.fbs
deleted file mode 100644
index 3b00dd4..0000000
--- a/format/Schema.fbs
+++ /dev/null
@@ -1,407 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-/// Logical types, vector layouts, and schemas
-
-namespace org.apache.arrow.flatbuf;
-
-enum MetadataVersion:short {
- /// 0.1.0 (October 2016).
- V1,
-
- /// 0.2.0 (February 2017). Non-backwards compatible with V1.
- V2,
-
- /// 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2.
- V3,
-
- /// >= 0.8.0 (December 2017). Non-backwards compatible with V3.
- V4,
-
- /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
- /// metadata and IPC messages). Implementations are recommended to provide a
- /// V4 compatibility mode with V5 format changes disabled.
- ///
- /// Incompatible changes between V4 and V5:
- /// - Union buffer layout has changed. In V5, Unions don't have a validity
- /// bitmap buffer.
- V5,
-}
-
-/// Represents Arrow Features that might not have full support
-/// within implementations. This is intended to be used in
-/// two scenarios:
-/// 1. A mechanism for readers of Arrow Streams
-/// and files to understand that the stream or file makes
-/// use of a feature that isn't supported or unknown to
-/// the implementation (and therefore can meet the Arrow
-/// forward compatibility guarantees).
-/// 2. A means of negotiating between a client and server
-/// what features a stream is allowed to use. The enum
-/// values here are intended to represent higher level
-/// features; additional details may be negotiated
-/// with key-value pairs specific to the protocol.
-///
-/// Enums added to this list should be assigned power-of-two values
-/// to facilitate exchanging and comparing bitmaps for supported
-/// features.
-enum Feature : long {
- /// Needed to make flatbuffers happy.
- UNUSED = 0,
- /// The stream makes use of multiple full dictionaries with the
- /// same ID and assumes clients implement dictionary replacement
- /// correctly.
- DICTIONARY_REPLACEMENT = 1,
- /// The stream makes use of compressed bodies as described
- /// in Message.fbs.
- COMPRESSED_BODY = 2
-}
-
-/// These are stored in the flatbuffer in the Type union below
-
-table Null {
-}
-
-/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct
-/// (according to the physical memory layout). We used Struct_ here as
-/// Struct is a reserved word in Flatbuffers
-table Struct_ {
-}
-
-table List {
-}
-
-/// Same as List, but with 64-bit offsets, allowing to represent
-/// extremely large data values.
-table LargeList {
-}
-
-table FixedSizeList {
- /// Number of list items per value
- listSize: int;
-}
-
-/// A Map is a logical nested type that is represented as
-///
-/// List<entries: Struct<key: K, value: V>>
-///
-/// In this layout, the keys and values are each respectively contiguous. We do
-/// not constrain the key and value types, so the application is responsible
-/// for ensuring that the keys are hashable and unique. Whether the keys are sorted
-/// may be set in the metadata for this field.
-///
-/// In a field with Map type, the field has a child Struct field, which then
-/// has two children: the first the key type and the second the value type. The names of the
-/// child fields may be respectively "entries", "key", and "value", but this is
-/// not enforced.
-///
-/// Map
-/// ```text
-/// - child[0] entries: Struct
-/// - child[0] key: K
-/// - child[1] value: V
-/// ```
-/// Neither the "entries" field nor the "key" field may be nullable.
-///
-/// The metadata is structured so that Arrow systems without special handling
-/// for Map can make Map an alias for List. The "layout" attribute for the Map
-/// field must have the same contents as a List.
-table Map {
- /// Set to true if the keys within each value are sorted
- keysSorted: bool;
-}
-
-enum UnionMode:short { Sparse, Dense }
-
-/// A union is a complex type with children in Field
-/// By default ids in the type vector refer to the offsets in the children
-/// optionally typeIds provides an indirection between the child offset and the type id
-/// for each child `typeIds[offset]` is the id used in the type vector
-table Union {
- mode: UnionMode;
- typeIds: [ int ]; // optional, describes typeid of each child.
-}
-
-table Int {
- bitWidth: int; // restricted to 8, 16, 32, and 64 in v1
- is_signed: bool;
-}
-
-enum Precision:short {HALF, SINGLE, DOUBLE}
-
-table FloatingPoint {
- precision: Precision;
-}
-
-/// Unicode with UTF-8 encoding
-table Utf8 {
-}
-
-/// Opaque binary data
-table Binary {
-}
-
-/// Same as Utf8, but with 64-bit offsets, allowing to represent
-/// extremely large data values.
-table LargeUtf8 {
-}
-
-/// Same as Binary, but with 64-bit offsets, allowing to represent
-/// extremely large data values.
-table LargeBinary {
-}
-
-table FixedSizeBinary {
- /// Number of bytes per value
- byteWidth: int;
-}
-
-table Bool {
-}
-
-/// Exact decimal value represented as an integer value in two's
-/// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers
-/// are used. The representation uses the endianness indicated
-/// in the Schema.
-table Decimal {
- /// Total number of decimal digits
- precision: int;
-
- /// Number of digits after the decimal point "."
- scale: int;
-
- /// Number of bits per value. The only accepted widths are 128 and 256.
- /// We use bitWidth for consistency with Int::bitWidth.
- bitWidth: int = 128;
-}
-
-enum DateUnit: short {
- DAY,
- MILLISECOND
-}
-
-/// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX
-/// epoch (1970-01-01), stored in either of two units:
-///
-/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
-/// leap seconds), where the values are evenly divisible by 86400000
-/// * Days (32 bits) since the UNIX epoch
-table Date {
- unit: DateUnit = MILLISECOND;
-}
-
-enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND }
-
-/// Time type. The physical storage type depends on the unit
-/// - SECOND and MILLISECOND: 32 bits
-/// - MICROSECOND and NANOSECOND: 64 bits
-table Time {
- unit: TimeUnit = MILLISECOND;
- bitWidth: int = 32;
-}
-
-/// Time elapsed from the Unix epoch, 00:00:00.000 on 1 January 1970, excluding
-/// leap seconds, as a 64-bit integer. Note that UNIX time does not include
-/// leap seconds.
-///
-/// The Timestamp metadata supports both "time zone naive" and "time zone
-/// aware" timestamps. Read about the timezone attribute for more detail
-table Timestamp {
- unit: TimeUnit;
-
- /// The time zone is a string indicating the name of a time zone, one of:
- ///
- /// * As used in the Olson time zone database (the "tz database" or
- /// "tzdata"), such as "America/New_York"
- /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
- ///
- /// Whether a timezone string is present indicates different semantics about
- /// the data:
- ///
- /// * If the time zone is null or equal to an empty string, the data is "time
- /// zone naive" and shall be displayed *as is* to the user, not localized
- /// to the locale of the user. This data can be thought of as UTC but
- /// without having "UTC" as the time zone, it is not considered to be
- /// localized to any time zone
- ///
- /// * If the time zone is set to a valid value, values can be displayed as
- /// "localized" to that time zone, even though the underlying 64-bit
- /// integers are identical to the same data stored in UTC. Converting
- /// between time zones is a metadata-only operation and does not change the
- /// underlying values
- timezone: string;
-}
-
-enum IntervalUnit: short { YEAR_MONTH, DAY_TIME}
-// A "calendar" interval which models types that don't necessarily
-// have a precise duration without the context of a base timestamp (e.g.
-// days can differ in length during daylight saving time transitions).
-// YEAR_MONTH - Indicates the number of elapsed whole months, stored as
-// 4-byte integers.
-// DAY_TIME - Indicates the number of elapsed days and milliseconds,
-// stored as 2 contiguous 32-bit integers (8-bytes in total). Support
-// of this IntervalUnit is not required for full arrow compatibility.
-table Interval {
- unit: IntervalUnit;
-}
-
-// An absolute length of time unrelated to any calendar artifacts.
-//
-// For the purposes of Arrow Implementations, adding this value to a Timestamp
-// ("t1") naively (i.e. simply summing the two numbers) is acceptable even
-// though in some cases the resulting Timestamp (t2) would not account for
-// leap-seconds during the elapsed time between "t1" and "t2". Similarly,
-// representing the difference between two Unix timestamps is acceptable, but
-// would yield a value that is possibly a few seconds off from the true elapsed
-// time.
-//
-// The resolution defaults to millisecond, but can be any of the other
-// supported TimeUnit values as with Timestamp and Time types. This type is
-// always represented as an 8-byte integer.
-table Duration {
- unit: TimeUnit = MILLISECOND;
-}
-
-/// ----------------------------------------------------------------------
-/// Top-level Type value, enabling extensible type-specific metadata. We can
-/// add new logical types to Type without breaking backwards compatibility
-
-union Type {
- Null,
- Int,
- FloatingPoint,
- Binary,
- Utf8,
- Bool,
- Decimal,
- Date,
- Time,
- Timestamp,
- Interval,
- List,
- Struct_,
- Union,
- FixedSizeBinary,
- FixedSizeList,
- Map,
- Duration,
- LargeBinary,
- LargeUtf8,
- LargeList,
-}
-
-/// ----------------------------------------------------------------------
-/// user defined key value pairs to add custom metadata to arrow
-/// key namespacing is the responsibility of the user
-
-table KeyValue {
- key: string;
- value: string;
-}
-
-/// ----------------------------------------------------------------------
-/// Dictionary encoding metadata
-/// Maintained for forwards compatibility, in the future
-/// Dictionaries might be explicit maps between integers and values
-/// allowing for non-contiguous index values
-enum DictionaryKind : short { DenseArray }
-table DictionaryEncoding {
- /// The known dictionary id in the application where this data is used. In
- /// the file or streaming formats, the dictionary ids are found in the
- /// DictionaryBatch messages
- id: long;
-
- /// The dictionary indices are constrained to be non-negative integers. If
- /// this field is null, the indices must be signed int32. To maximize
- /// cross-language compatibility and performance, implementations are
- /// recommended to prefer signed integer types over unsigned integer types
- /// and to avoid uint64 indices unless they are required by an application.
- indexType: Int;
-
- /// By default, dictionaries are not ordered, or the order does not have
- /// semantic meaning. In some statistical applications, dictionary-encoding
- /// is used to represent ordered categorical data, and we provide a way to
- /// preserve that metadata here
- isOrdered: bool;
-
- dictionaryKind: DictionaryKind;
-}
-
-/// ----------------------------------------------------------------------
-/// A field represents a named column in a record / row batch or child of a
-/// nested type.
-
-table Field {
- /// Name is not required, e.g. in a List
- name: string;
-
- /// Whether or not this field can contain nulls. Should be true in general.
- nullable: bool;
-
- /// This is the type of the decoded value if the field is dictionary encoded.
- type: Type;
-
- /// Present only if the field is dictionary encoded.
- dictionary: DictionaryEncoding;
-
- /// children apply only to nested data types like Struct, List and Union. For
- /// primitive types children will have length 0.
- children: [ Field ];
-
- /// User-defined metadata
- custom_metadata: [ KeyValue ];
-}
-
-/// ----------------------------------------------------------------------
-/// Endianness of the platform producing the data
-
-enum Endianness:short { Little, Big }
-
-/// ----------------------------------------------------------------------
-/// A Buffer represents a single contiguous memory segment
-struct Buffer {
- /// The relative offset into the shared memory page where the bytes for this
- /// buffer start
- offset: long;
-
- /// The absolute length (in bytes) of the memory buffer. The memory is found
- /// from offset (inclusive) to offset + length (non-inclusive). When building
- /// messages using the encapsulated IPC message, padding bytes may be written
- /// after a buffer, but such padding bytes do not need to be accounted for in
- /// the size here.
- length: long;
-}
-
-/// ----------------------------------------------------------------------
-/// A Schema describes the columns in a row batch
-
-table Schema {
-
- /// endianness of the buffer
- /// it is Little Endian by default
- /// if endianness doesn't match the underlying system then the vectors need to be converted
- endianness: Endianness=Little;
-
- fields: [Field];
- // User-defined metadata
- custom_metadata: [ KeyValue ];
-
- /// Features used in the stream/file.
- features : [ Feature ];
-}
-
-root_type Schema;
diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs
deleted file mode 100644
index a6fd2f9..0000000
--- a/format/SparseTensor.fbs
+++ /dev/null
@@ -1,228 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-/// EXPERIMENTAL: Metadata for n-dimensional sparse arrays, aka "sparse tensors".
-/// Arrow implementations in general are not required to implement this type
-
-include "Tensor.fbs";
-
-namespace org.apache.arrow.flatbuf;
-
-/// ----------------------------------------------------------------------
-/// EXPERIMENTAL: Data structures for sparse tensors
-
-/// Coordinate (COO) format of sparse tensor index.
-///
-/// COO's index list is represented as an NxM matrix,
-/// where N is the number of non-zero values,
-/// and M is the number of dimensions of a sparse tensor.
-///
-/// indicesBuffer stores the location and size of the data of this indices
-/// matrix. The value type and the strides of the indices matrix are
-/// specified in the indicesType and indicesStrides fields.
-///
-/// For example, let X be a 2x3x4x5 tensor, and it has the following
-/// 6 non-zero values:
-/// ```text
-/// X[0, 1, 2, 0] := 1
-/// X[1, 1, 2, 3] := 2
-/// X[0, 2, 1, 0] := 3
-/// X[0, 1, 3, 0] := 4
-/// X[0, 1, 2, 1] := 5
-/// X[1, 2, 0, 4] := 6
-/// ```
-/// In COO format, the index matrix of X is the following 4x6 matrix:
-/// ```text
-/// [[0, 0, 0, 0, 1, 1],
-/// [1, 1, 1, 2, 1, 2],
-/// [2, 2, 3, 1, 2, 0],
-/// [0, 1, 0, 0, 3, 4]]
-/// ```
-/// When isCanonical is true, the indices are sorted in lexicographical order
-/// (row-major order) and contain no duplicate entries. Otherwise,
-/// the indices may be unsorted or may contain duplicate entries.
-table SparseTensorIndexCOO {
- /// The type of values in indicesBuffer
- indicesType: Int (required);
-
- /// Non-negative byte offsets to advance one value cell along each dimension
- /// If omitted, default to row-major order (C-like).
- indicesStrides: [long];
-
- /// The location and size of the indices matrix's data
- indicesBuffer: Buffer (required);
-
- /// This flag is true if and only if the indices matrix is sorted in
- /// row-major order, and does not have duplicated entries.
- /// This sort order is the same as that of TensorFlow's SparseTensor,
- /// but it is the inverse of the order of SciPy's canonical coo_matrix
- /// (SciPy employs column-major order for its coo_matrix).
- isCanonical: bool;
-}
-
-enum SparseMatrixCompressedAxis: short { Row, Column }
-
-/// Compressed Sparse format, that is matrix-specific.
-table SparseMatrixIndexCSX {
- /// Which axis, row or column, is compressed
- compressedAxis: SparseMatrixCompressedAxis;
-
- /// The type of values in indptrBuffer
- indptrType: Int (required);
-
- /// indptrBuffer stores the location and size of indptr array that
- /// represents the range of the rows.
- /// The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data.
- /// The length of this array is 1 + (the number of rows), and the type
- /// of index value is long.
- ///
- /// For example, let X be the following 6x4 matrix:
- /// ```text
- /// X := [[0, 1, 2, 0],
- /// [0, 0, 3, 0],
- /// [0, 4, 0, 5],
- /// [0, 0, 0, 0],
- /// [6, 0, 7, 8],
- /// [0, 9, 0, 0]].
- /// ```
- /// The array of non-zero values in X is:
- /// ```text
- /// values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9].
- /// ```
- /// And the indptr of X is:
- /// ```text
- /// indptr(X) = [0, 2, 3, 5, 5, 8, 9].
- /// ```
- indptrBuffer: Buffer (required);
-
- /// The type of values in indicesBuffer
- indicesType: Int (required);
-
- /// indicesBuffer stores the location and size of the array that
- /// contains the column indices of the corresponding non-zero values.
- /// The type of index value is long.
- ///
- /// For example, the indices of the above X is:
- /// ```text
- /// indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1].
- /// ```
- /// Note that the indices are sorted in lexicographical order for each row.
- indicesBuffer: Buffer (required);
-}
-
-/// Compressed Sparse Fiber (CSF) sparse tensor index.
-table SparseTensorIndexCSF {
- /// CSF is a generalization of compressed sparse row (CSR) index.
- /// See [smith2017knl](http://shaden.io/pub-files/smith2017knl.pdf)
- ///
- /// CSF index recursively compresses each dimension of a tensor into a set
- /// of prefix trees. Each path from a root to leaf forms one tensor
- /// non-zero index. CSF is implemented with two arrays of buffers and one
- /// array of integers.
- ///
- /// For example, let X be a 2x3x4x5 tensor and let it have the following
- /// 8 non-zero values:
- /// ```text
- /// X[0, 0, 0, 1] := 1
- /// X[0, 0, 0, 2] := 2
- /// X[0, 1, 0, 0] := 3
- /// X[0, 1, 0, 2] := 4
- /// X[0, 1, 1, 0] := 5
- /// X[1, 1, 1, 0] := 6
- /// X[1, 1, 1, 1] := 7
- /// X[1, 1, 1, 2] := 8
- /// ```
- /// As a prefix tree this would be represented as:
- /// ```text
- /// 0 1
- /// / \ |
- /// 0 1 1
- /// / / \ |
- /// 0 0 1 1
- /// /| /| | /| |
- /// 1 2 0 2 0 0 1 2
- /// ```
- /// The type of values in indptrBuffers
- indptrType: Int (required);
-
- /// indptrBuffers stores the sparsity structure.
- /// Each pair of consecutive dimensions in a tensor corresponds to a buffer in
- /// indptrBuffers. A pair of consecutive values at `indptrBuffers[dim][i]`
- /// and `indptrBuffers[dim][i + 1]` signifies a range of nodes in
- /// `indicesBuffers[dim + 1]` that are children of the `indicesBuffers[dim][i]` node.
- ///
- /// For example, the indptrBuffers for the above X is:
- /// ```text
- /// indptrBuffer(X) = [
- /// [0, 2, 3],
- /// [0, 1, 3, 4],
- /// [0, 2, 4, 5, 8]
- /// ].
- /// ```
- indptrBuffers: [Buffer] (required);
-
- /// The type of values in indicesBuffers
- indicesType: Int (required);
-
- /// indicesBuffers stores values of nodes.
- /// Each tensor dimension corresponds to a buffer in indicesBuffers.
- /// For example, the indicesBuffers for the above X is:
- /// ```text
- /// indicesBuffer(X) = [
- /// [0, 1],
- /// [0, 1, 1],
- /// [0, 0, 1, 1],
- /// [1, 2, 0, 2, 0, 0, 1, 2]
- /// ].
- /// ```
- indicesBuffers: [Buffer] (required);
-
- /// axisOrder stores the sequence in which dimensions were traversed to
- /// produce the prefix tree.
- /// For example, the axisOrder for the above X is:
- /// ```text
- /// axisOrder(X) = [0, 1, 2, 3].
- /// ```
- axisOrder: [int] (required);
-}
-
-union SparseTensorIndex {
- SparseTensorIndexCOO,
- SparseMatrixIndexCSX,
- SparseTensorIndexCSF
-}
-
-table SparseTensor {
- /// The type of data contained in a value cell.
- /// Currently only fixed-width value types are supported,
- /// no strings or nested types.
- type: Type (required);
-
- /// The dimensions of the tensor, optionally named.
- shape: [TensorDim] (required);
-
- /// The number of non-zero values in a sparse tensor.
- non_zero_length: long;
-
- /// Sparse tensor index
- sparseIndex: SparseTensorIndex (required);
-
- /// The location and size of the tensor's data
- data: Buffer (required);
-}
-
-root_type SparseTensor;
diff --git a/format/Tensor.fbs b/format/Tensor.fbs
deleted file mode 100644
index 409297c..0000000
--- a/format/Tensor.fbs
+++ /dev/null
@@ -1,54 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-/// EXPERIMENTAL: Metadata for n-dimensional arrays, aka "tensors" or
-/// "ndarrays". Arrow implementations in general are not required to implement
-/// this type
-
-include "Schema.fbs";
-
-namespace org.apache.arrow.flatbuf;
-
-/// ----------------------------------------------------------------------
-/// Data structures for dense tensors
-
-/// Shape data for a single axis in a tensor
-table TensorDim {
- /// Length of dimension
- size: long;
-
- /// Name of the dimension, optional
- name: string;
-}
-
-table Tensor {
- /// The type of data contained in a value cell. Currently only fixed-width
- /// value types are supported, no strings or nested types
- type: Type (required);
-
- /// The dimensions of the tensor, optionally named
- shape: [TensorDim] (required);
-
- /// Non-negative byte offsets to advance one value cell along each dimension
- /// If omitted, default to row-major order (C-like).
- strides: [long];
-
- /// The location and size of the tensor's data
- data: Buffer (required);
-}
-
-root_type Tensor;