You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/05/09 19:55:32 UTC
arrow git commit: ARROW-874: [JS] Read dictionary-encoded vectors
Repository: arrow
Updated Branches:
refs/heads/master 670612e6f -> 22c738cc7
ARROW-874: [JS] Read dictionary-encoded vectors
Author: Brian Hulette <br...@ccri.com>
Author: Emilio Lahr-Vivaz <el...@ccri.com>
Author: Brian Hulette <hu...@gmail.com>
Closes #655 from TheNeuralBit/js_dictionary and squashes the following commits:
4fbaf9d [Brian Hulette] add unit tests, fix errors in file format dictionary reading
d89c84b [Brian Hulette] Add dictionary support for file format
5bdf8a1 [Emilio Lahr-Vivaz] dictionary encoding
c6eff38 [Brian Hulette] Updated API documenation
4c84362 [Brian Hulette] added struct_example tests (Struct type, and multiple record batches)
5a2efe6 [Brian Hulette] add tests for streaming format
9aed94b [Brian Hulette] Fix file format, unit tests
e2e0d4d [Emilio Lahr-Vivaz] renaming reader method
844b5e9 [Emilio Lahr-Vivaz] fix for file format
bfb7754 [Emilio Lahr-Vivaz] cleanup
162c9be [Emilio Lahr-Vivaz] working for streaming
290497f [Emilio Lahr-Vivaz] js support multiple record batches
4b3b412 [Emilio Lahr-Vivaz] initial support for streaming file format, added FixedSizeList
c9d705d [Brian Hulette] Created npm build script
53db587 [Brian Hulette] Fixes to make tests pass
304c669 [Brian Hulette] Added basic js unit tests
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/22c738cc
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/22c738cc
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/22c738cc
Branch: refs/heads/master
Commit: 22c738cc7b26bff9e7319d438dd3fef1238d46ad
Parents: 670612e
Author: Brian Hulette <br...@ccri.com>
Authored: Tue May 9 15:55:27 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue May 9 15:55:27 2017 -0400
----------------------------------------------------------------------
js/.gitignore | 2 +
js/README.md | 17 +-
js/bin/arrow2csv.js | 9 +-
js/bin/arrow_schema.js | 8 +-
js/examples/read_file.html | 6 +-
js/flatbuffers.sh | 19 +
js/lib/arrow.ts | 440 +++++++++++++++++++----
js/lib/bitarray.ts | 17 +-
js/lib/types.ts | 581 ++++++++++++++++++++++---------
js/package.json | 12 +-
js/postinstall.sh | 18 -
js/spec/arrow.js | 179 ++++++++++
js/spec/dictionary-stream.arrow | Bin 0 -> 1776 bytes
js/spec/dictionary.arrow | Bin 0 -> 2522 bytes
js/spec/simple-stream.arrow | Bin 0 -> 1188 bytes
js/spec/simple.arrow | Bin 0 -> 1642 bytes
js/spec/struct_example-stream.arrow | Bin 0 -> 1884 bytes
js/spec/struct_example.arrow | Bin 0 -> 2354 bytes
18 files changed, 1026 insertions(+), 282 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/.gitignore
----------------------------------------------------------------------
diff --git a/js/.gitignore b/js/.gitignore
index 3b97e3a..f67c1cc 100644
--- a/js/.gitignore
+++ b/js/.gitignore
@@ -2,3 +2,5 @@ lib/*_generated.js
dist
node_modules
typings
+.idea
+*.iml
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/README.md
----------------------------------------------------------------------
diff --git a/js/README.md b/js/README.md
index 98ef756..cdabf54 100644
--- a/js/README.md
+++ b/js/README.md
@@ -20,6 +20,7 @@ From this directory, run:
$ npm install # pull dependencies
$ tsc # build typescript
$ webpack # bundle for the browser
+$ npm test # run unit tests
```
### Usage
@@ -42,9 +43,19 @@ Include `dist/arrow-bundle.js` in a `<script />` tag:
See [examples/read_file.html](examples/read_file.html) for a usage example - or try it out now at [theneuralbit.github.io/arrow](http://theneuralbit.github.io/arrow)
### API
-##### `arrow.loadSchema(buffer)`
+##### `arrow.getReader(buffer)`
+Returns an `ArrowReader` object representing the Arrow file or stream contained in
+the `buffer`.
+
+##### `ArrowReader.loadNextBatch()`
+Loads the next record batch and returns its length.
+
+##### `ArrowReader.getSchema()`
Returns a JSON representation of the file's Arrow schema.
-##### `arrow.loadVectors(buffer)`
-Returns a dictionary of `Vector` objects, one for each column, indexed by the column's name.
+##### `ArrowReader.getVectors()`
+Returns a list of `Vector` objects, one for each column.
Vector objects have, at minimum, a `get(i)` method and a `length` attribute.
+
+##### `ArrowReader.getVector(name: String)`
+Returns the `Vector` object for the column named `name`.
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/bin/arrow2csv.js
----------------------------------------------------------------------
diff --git a/js/bin/arrow2csv.js b/js/bin/arrow2csv.js
index 48df2f9..8122e95 100755
--- a/js/bin/arrow2csv.js
+++ b/js/bin/arrow2csv.js
@@ -19,7 +19,7 @@
var fs = require('fs')
var process = require('process');
-var loadVectors = require('../dist/arrow.js').loadVectors;
+var arrow = require('../dist/arrow.js');
var program = require('commander');
function list (val) {
@@ -38,10 +38,11 @@ if (!program.schema) {
}
var buf = fs.readFileSync(process.argv[process.argv.length - 1]);
-var vectors = loadVectors(buf);
+var reader = arrow.getReader(buf);
+reader.loadNextBatch();
-for (var i = 0; i < vectors[program.schema[0]].length; i += 1|0) {
+for (var i = 0; i < reader.getVector(program.schema[0]).length; i += 1|0) {
console.log(program.schema.map(function (field) {
- return '' + vectors[field].get(i);
+ return '' + reader.getVector(field).get(i);
}).join(','));
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/bin/arrow_schema.js
----------------------------------------------------------------------
diff --git a/js/bin/arrow_schema.js b/js/bin/arrow_schema.js
index 7044778..44dabb4 100755
--- a/js/bin/arrow_schema.js
+++ b/js/bin/arrow_schema.js
@@ -19,7 +19,11 @@
var fs = require('fs');
var process = require('process');
-var loadSchema = require('../dist/arrow.js').loadSchema;
+var arrow = require('../dist/arrow.js');
var buf = fs.readFileSync(process.argv[process.argv.length - 1]);
-console.log(JSON.stringify(loadSchema(buf), null, '\t'));
+var reader = arrow.getReader(buf);
+console.log(JSON.stringify(reader.getSchema(), null, '\t'));
+//console.log(JSON.stringify(reader.getVectors(), null, '\t'));
+console.log('block count: ' + reader.getBatchCount());
+
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/examples/read_file.html
----------------------------------------------------------------------
diff --git a/js/examples/read_file.html b/js/examples/read_file.html
index 02b6f08..933b142 100644
--- a/js/examples/read_file.html
+++ b/js/examples/read_file.html
@@ -40,9 +40,11 @@ function addCell (tr, type, name) {
}
reader.onload = function (evt) {
var buf = new Uint8Array(evt.target.result);
- var schema = arrow.loadSchema(buf);
- var vectors = arrow.loadVectors(buf);
+ var schema = arrow.loadSchemaFromStream(buf);
+ var vectors = arrow.loadVectorsFromStream(buf);
var length = vectors[schema[0].name].length;
+console.log(JSON.stringify(schema, null, '\t'));
+console.log(JSON.stringify(vectors, null, '\t'));
var thead = document.getElementById("thead");
var tbody = document.getElementById("tbody");
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/flatbuffers.sh
----------------------------------------------------------------------
diff --git a/js/flatbuffers.sh b/js/flatbuffers.sh
new file mode 100755
index 0000000..99d2815
--- /dev/null
+++ b/js/flatbuffers.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. See accompanying LICENSE file.
+
+echo "Compiling flatbuffer schemas..."
+#flatc -o lib --js ../format/Message.fbs ../format/File.fbs
+flatc -o lib --js ../format/*.fbs
+rm -f lib/Arrow_generated.js
+cat lib/*_generated.js > lib/Arrow_generated.js
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/lib/arrow.ts
----------------------------------------------------------------------
diff --git a/js/lib/arrow.ts b/js/lib/arrow.ts
index 0762885..74def4d 100644
--- a/js/lib/arrow.ts
+++ b/js/lib/arrow.ts
@@ -17,48 +17,186 @@
import { flatbuffers } from 'flatbuffers';
import { org } from './Arrow_generated';
-var arrow = org.apache.arrow;
import { vectorFromField, Vector } from './types';
-export function loadVectors(buf) {
- var fileLength = buf.length, bb, footerLengthOffset, footerLength,
- footerOffset, footer, schema, field, type, type_str, i,
- len, rb_metas, rb_meta, rtrn, recordBatchBlock, recordBatchBlocks = [];
- var vectors : Vector[] = [];
+import ByteBuffer = flatbuffers.ByteBuffer;
+var Footer = org.apache.arrow.flatbuf.Footer;
+var Message = org.apache.arrow.flatbuf.Message;
+var MessageHeader = org.apache.arrow.flatbuf.MessageHeader;
+var RecordBatch = org.apache.arrow.flatbuf.RecordBatch;
+var DictionaryBatch = org.apache.arrow.flatbuf.DictionaryBatch;
+var Schema = org.apache.arrow.flatbuf.Schema;
+var Type = org.apache.arrow.flatbuf.Type;
+var VectorType = org.apache.arrow.flatbuf.VectorType;
+
+export class ArrowReader {
+
+ private bb;
+ private schema: any = [];
+ private vectors: Vector[];
+ private vectorMap: any = {};
+ private dictionaries: any = {};
+ private batches: any = [];
+ private batchIndex: number = 0;
+
+ constructor(bb, schema, vectors: Vector[], batches, dictionaries) {
+ this.bb = bb;
+ this.schema = schema;
+ this.vectors = vectors;
+ for (var i = 0; i < vectors.length; i += 1|0) {
+ this.vectorMap[vectors[i].name] = vectors[i]
+ }
+ this.batches = batches;
+ this.dictionaries = dictionaries;
+ }
+
+ loadNextBatch() {
+ if (this.batchIndex < this.batches.length) {
+ var batch = this.batches[this.batchIndex];
+ this.batchIndex += 1;
+ loadVectors(this.bb, this.vectors, batch);
+ return batch.length;
+ } else {
+ return 0;
+ }
+ }
+
+ getSchema() {
+ return this.schema;
+ }
+
+ getVectors() {
+ return this.vectors;
+ }
+
+ getVector(name) {
+ return this.vectorMap[name];
+ }
- bb = new flatbuffers.ByteBuffer(buf);
+ getBatchCount() {
+ return this.batches.length;
+ }
+
+ // the index of the next batch to be loaded
+ getBatchIndex() {
+ return this.batchIndex;
+ }
- footer = _loadFooter(bb);
+ // set the index of the next batch to be loaded
+ setBatchIndex(i: number) {
+ this.batchIndex = i;
+ }
+}
- schema = footer.schema();
+export function getSchema(buf) { return getReader(buf).getSchema(); }
+
+export function getReader(buf) : ArrowReader {
+ if (_checkMagic(buf, 0)) {
+ return getFileReader(buf);
+ } else {
+ return getStreamReader(buf);
+ }
+}
+
+export function getStreamReader(buf) : ArrowReader {
+ var bb = new ByteBuffer(buf);
+
+ var schema = _loadSchema(bb),
+ field,
+ vectors: Vector[] = [],
+ i,j,
+ iLen,jLen,
+ batch,
+ recordBatches = [],
+ dictionaryBatches = [],
+ dictionaries = {};
+
+ for (i = 0, iLen = schema.fieldsLength(); i < iLen; i += 1|0) {
+ field = schema.fields(i);
+ _createDictionaryVectors(field, dictionaries);
+ vectors.push(vectorFromField(field, dictionaries));
+ }
+
+ while (bb.position() < bb.capacity()) {
+ batch = _loadBatch(bb);
+ if (batch == null) {
+ break;
+ } else if (batch.type == MessageHeader.DictionaryBatch) {
+ dictionaryBatches.push(batch);
+ } else if (batch.type == MessageHeader.RecordBatch) {
+ recordBatches.push(batch)
+ } else {
+ console.error("Expected batch type" + MessageHeader.RecordBatch + " or " +
+ MessageHeader.DictionaryBatch + " but got " + batch.type);
+ }
+ }
+
+ // load dictionary vectors
+ for (i = 0; i < dictionaryBatches.length; i += 1|0) {
+ batch = dictionaryBatches[i];
+ loadVectors(bb, [dictionaries[batch.id]], batch);
+ }
+
+ return new ArrowReader(bb, parseSchema(schema), vectors, recordBatches, dictionaries);
+}
+
+export function getFileReader (buf) : ArrowReader {
+ var bb = new ByteBuffer(buf);
+
+ var footer = _loadFooter(bb);
+
+ var schema = footer.schema();
+ var i, len, field,
+ vectors: Vector[] = [],
+ block,
+ batch,
+ recordBatchBlocks = [],
+ dictionaryBatchBlocks = [],
+ dictionaries = {};
for (i = 0, len = schema.fieldsLength(); i < len; i += 1|0) {
field = schema.fields(i);
- vectors.push(vectorFromField(field));
+ _createDictionaryVectors(field, dictionaries);
+ vectors.push(vectorFromField(field, dictionaries));
+ }
+
+ for (i = 0; i < footer.dictionariesLength(); i += 1|0) {
+ block = footer.dictionaries(i);
+ dictionaryBatchBlocks.push({
+ offset: block.offset().low,
+ metaDataLength: block.metaDataLength(),
+ bodyLength: block.bodyLength().low,
+ })
}
for (i = 0; i < footer.recordBatchesLength(); i += 1|0) {
- recordBatchBlock = footer.recordBatches(i);
+ block = footer.recordBatches(i);
recordBatchBlocks.push({
- offset: recordBatchBlock.offset(),
- metaDataLength: recordBatchBlock.metaDataLength(),
- bodyLength: recordBatchBlock.bodyLength(),
+ offset: block.offset().low,
+ metaDataLength: block.metaDataLength(),
+ bodyLength: block.bodyLength().low,
})
}
- loadBuffersIntoVectors(recordBatchBlocks, bb, vectors);
- var rtrn : any = {};
- for (var i : any = 0; i < vectors.length; i += 1|0) {
- rtrn[vectors[i].name] = vectors[i]
+ var dictionaryBatches = dictionaryBatchBlocks.map(function (block) {
+ bb.setPosition(block.offset);
+ // TODO: Make sure this is a dictionary batch
+ return _loadBatch(bb);
+ });
+
+ var recordBatches = recordBatchBlocks.map(function (block) {
+ bb.setPosition(block.offset);
+ // TODO: Make sure this is a record batch
+ return _loadBatch(bb);
+ });
+
+ // load dictionary vectors
+ for (i = 0; i < dictionaryBatches.length; i += 1|0) {
+ batch = dictionaryBatches[i];
+ loadVectors(bb, [dictionaries[batch.id]], batch);
}
- return rtrn;
-}
-export function loadSchema(buf) {
- var footer = _loadFooter(new flatbuffers.ByteBuffer(buf));
- var schema = footer.schema();
-
- return parseSchema(schema);
+ return new ArrowReader(bb, parseSchema(schema), vectors, recordBatches, dictionaries);
}
function _loadFooter(bb) {
@@ -81,7 +219,7 @@ function _loadFooter(bb) {
var footerLengthOffset: number = fileLength - MAGIC.length - 4;
bb.setPosition(footerLengthOffset);
- var footerLength: number = Int64FromByteBuffer(bb, footerLengthOffset)
+ var footerLength: number = Int32FromByteBuffer(bb, footerLengthOffset)
if (footerLength <= 0 || footerLength + MAGIC.length*2 + 4 > fileLength) {
console.log("Invalid footer length: " + footerLength)
@@ -89,19 +227,166 @@ function _loadFooter(bb) {
var footerOffset: number = footerLengthOffset - footerLength;
bb.setPosition(footerOffset);
- var footer = arrow.flatbuf.Footer.getRootAsFooter(bb);
+ var footer = Footer.getRootAsFooter(bb);
return footer;
}
-function Int64FromByteBuffer(bb, offset) {
+function _loadSchema(bb) {
+ var message =_loadMessage(bb);
+ if (message.headerType() != MessageHeader.Schema) {
+ console.error("Expected header type " + MessageHeader.Schema + " but got " + message.headerType());
+ return;
+ }
+ return message.header(new Schema());
+}
+
+function _loadBatch(bb) {
+ var message = _loadMessage(bb);
+ if (message == null) {
+ return;
+ } else if (message.headerType() == MessageHeader.RecordBatch) {
+ var batch = { header: message.header(new RecordBatch()), length: message.bodyLength().low }
+ return _loadRecordBatch(bb, batch);
+ } else if (message.headerType() == MessageHeader.DictionaryBatch) {
+ var batch = { header: message.header(new DictionaryBatch()), length: message.bodyLength().low }
+ return _loadDictionaryBatch(bb, batch);
+ } else {
+ console.error("Expected header type " + MessageHeader.RecordBatch + " or " + MessageHeader.DictionaryBatch +
+ " but got " + message.headerType());
+ return;
+ }
+}
+
+function _loadRecordBatch(bb, batch) {
+ var data = batch.header;
+ var i, nodes_ = [], nodesLength = data.nodesLength();
+ var buffer, buffers_ = [], buffersLength = data.buffersLength();
+
+ for (i = 0; i < nodesLength; i += 1) {
+ nodes_.push(data.nodes(i));
+ }
+ for (i = 0; i < buffersLength; i += 1) {
+ buffer = data.buffers(i);
+ buffers_.push({ offset: bb.position() + buffer.offset().low, length: buffer.length().low });
+ }
+ // position the buffer after the body to read the next message
+ bb.setPosition(bb.position() + batch.length);
+
+ return { nodes: nodes_, buffers: buffers_, length: data.length().low, type: MessageHeader.RecordBatch };
+}
+
+function _loadDictionaryBatch(bb, batch) {
+ var id_ = batch.header.id().toFloat64().toString(), data = batch.header.data();
+ var i, nodes_ = [], nodesLength = data.nodesLength();
+ var buffer, buffers_ = [], buffersLength = data.buffersLength();
+
+ for (i = 0; i < nodesLength; i += 1) {
+ nodes_.push(data.nodes(i));
+ }
+ for (i = 0; i < buffersLength; i += 1) {
+ buffer = data.buffers(i);
+ buffers_.push({ offset: bb.position() + buffer.offset().low, length: buffer.length().low });
+ }
+ // position the buffer after the body to read the next message
+ bb.setPosition(bb.position() + batch.length);
+
+ return { id: id_, nodes: nodes_, buffers: buffers_, length: data.length().low, type: MessageHeader.DictionaryBatch };
+}
+
+function _loadMessage(bb) {
+ var messageLength: number = Int32FromByteBuffer(bb, bb.position());
+ if (messageLength == 0) {
+ return;
+ }
+ bb.setPosition(bb.position() + 4);
+ var message = Message.getRootAsMessage(bb);
+ // position the buffer at the end of the message so it's ready to read further
+ bb.setPosition(bb.position() + messageLength);
+
+ return message;
+}
+
+function _createDictionaryVectors(field, dictionaries) {
+ var encoding = field.dictionary();
+ if (encoding != null) {
+ var id = encoding.id().toFloat64().toString();
+ if (dictionaries[id] == null) {
+ // create a field for the dictionary
+ var dictionaryField = _createDictionaryField(id, field);
+ dictionaries[id] = vectorFromField(dictionaryField, null);
+ }
+ }
+
+ // recursively examine child fields
+ for (var i = 0, len = field.childrenLength(); i < len; i += 1|0) {
+ _createDictionaryVectors(field.children(i), dictionaries);
+ }
+}
+
+function _createDictionaryField(id, field) {
+ var builder = new flatbuffers.Builder();
+ var nameOffset = builder.createString("dict-" + id);
+
+ var typeType = field.typeType();
+ var typeOffset;
+ if (typeType === Type.Int) {
+ var type = field.type(new org.apache.arrow.flatbuf.Int());
+ org.apache.arrow.flatbuf.Int.startInt(builder);
+ org.apache.arrow.flatbuf.Int.addBitWidth(builder, type.bitWidth());
+ org.apache.arrow.flatbuf.Int.addIsSigned(builder, type.isSigned());
+ typeOffset = org.apache.arrow.flatbuf.Int.endInt(builder);
+ } else if (typeType === Type.FloatingPoint) {
+ var type = field.type(new org.apache.arrow.flatbuf.FloatingPoint());
+ org.apache.arrow.flatbuf.FloatingPoint.startFloatingPoint(builder);
+ org.apache.arrow.flatbuf.FloatingPoint.addPrecision(builder, type.precision());
+ typeOffset = org.apache.arrow.flatbuf.FloatingPoint.endFloatingPoint(builder);
+ } else if (typeType === Type.Utf8) {
+ org.apache.arrow.flatbuf.Utf8.startUtf8(builder);
+ typeOffset = org.apache.arrow.flatbuf.Utf8.endUtf8(builder);
+ } else if (typeType === Type.Date) {
+ var type = field.type(new org.apache.arrow.flatbuf.Date());
+ org.apache.arrow.flatbuf.Date.startDate(builder);
+ org.apache.arrow.flatbuf.Date.addUnit(builder, type.unit());
+ typeOffset = org.apache.arrow.flatbuf.Date.endDate(builder);
+ } else {
+ throw "Unimplemented dictionary type " + typeType;
+ }
+ if (field.childrenLength() > 0) {
+ throw "Dictionary encoded fields can't have children"
+ }
+ var childrenOffset = org.apache.arrow.flatbuf.Field.createChildrenVector(builder, []);
+
+ var layout, layoutOffsets = [];
+ for (var i = 0, len = field.layoutLength(); i < len; i += 1|0) {
+ layout = field.layout(i);
+ org.apache.arrow.flatbuf.VectorLayout.startVectorLayout(builder);
+ org.apache.arrow.flatbuf.VectorLayout.addBitWidth(builder, layout.bitWidth());
+ org.apache.arrow.flatbuf.VectorLayout.addType(builder, layout.type());
+ layoutOffsets.push(org.apache.arrow.flatbuf.VectorLayout.endVectorLayout(builder));
+ }
+ var layoutOffset = org.apache.arrow.flatbuf.Field.createLayoutVector(builder, layoutOffsets);
+
+ org.apache.arrow.flatbuf.Field.startField(builder);
+ org.apache.arrow.flatbuf.Field.addName(builder, nameOffset);
+ org.apache.arrow.flatbuf.Field.addNullable(builder, field.nullable());
+ org.apache.arrow.flatbuf.Field.addTypeType(builder, typeType);
+ org.apache.arrow.flatbuf.Field.addType(builder, typeOffset);
+ org.apache.arrow.flatbuf.Field.addChildren(builder, childrenOffset);
+ org.apache.arrow.flatbuf.Field.addLayout(builder, layoutOffset);
+ var offset = org.apache.arrow.flatbuf.Field.endField(builder);
+ builder.finish(offset);
+
+ return org.apache.arrow.flatbuf.Field.getRootAsField(builder.bb);
+}
+
+function Int32FromByteBuffer(bb, offset) {
return ((bb.bytes_[offset + 3] & 255) << 24) |
((bb.bytes_[offset + 2] & 255) << 16) |
((bb.bytes_[offset + 1] & 255) << 8) |
((bb.bytes_[offset] & 255));
}
-
var MAGIC_STR = "ARROW1";
var MAGIC = new Uint8Array(MAGIC_STR.length);
for (var i = 0; i < MAGIC_STR.length; i += 1|0) {
@@ -118,27 +403,28 @@ function _checkMagic(buf, index) {
}
var TYPEMAP = {}
-TYPEMAP[arrow.flatbuf.Type.NONE] = "NONE";
-TYPEMAP[arrow.flatbuf.Type.Null] = "Null";
-TYPEMAP[arrow.flatbuf.Type.Int] = "Int";
-TYPEMAP[arrow.flatbuf.Type.FloatingPoint] = "FloatingPoint";
-TYPEMAP[arrow.flatbuf.Type.Binary] = "Binary";
-TYPEMAP[arrow.flatbuf.Type.Utf8] = "Utf8";
-TYPEMAP[arrow.flatbuf.Type.Bool] = "Bool";
-TYPEMAP[arrow.flatbuf.Type.Decimal] = "Decimal";
-TYPEMAP[arrow.flatbuf.Type.Date] = "Date";
-TYPEMAP[arrow.flatbuf.Type.Time] = "Time";
-TYPEMAP[arrow.flatbuf.Type.Timestamp] = "Timestamp";
-TYPEMAP[arrow.flatbuf.Type.Interval] = "Interval";
-TYPEMAP[arrow.flatbuf.Type.List] = "List";
-TYPEMAP[arrow.flatbuf.Type.Struct_] = "Struct";
-TYPEMAP[arrow.flatbuf.Type.Union] = "Union";
+TYPEMAP[Type.NONE] = "NONE";
+TYPEMAP[Type.Null] = "Null";
+TYPEMAP[Type.Int] = "Int";
+TYPEMAP[Type.FloatingPoint] = "FloatingPoint";
+TYPEMAP[Type.Binary] = "Binary";
+TYPEMAP[Type.Utf8] = "Utf8";
+TYPEMAP[Type.Bool] = "Bool";
+TYPEMAP[Type.Decimal] = "Decimal";
+TYPEMAP[Type.Date] = "Date";
+TYPEMAP[Type.Time] = "Time";
+TYPEMAP[Type.Timestamp] = "Timestamp";
+TYPEMAP[Type.Interval] = "Interval";
+TYPEMAP[Type.List] = "List";
+TYPEMAP[Type.FixedSizeList] = "FixedSizeList";
+TYPEMAP[Type.Struct_] = "Struct";
+TYPEMAP[Type.Union] = "Union";
var VECTORTYPEMAP = {};
-VECTORTYPEMAP[arrow.flatbuf.VectorType.OFFSET] = 'OFFSET';
-VECTORTYPEMAP[arrow.flatbuf.VectorType.DATA] = 'DATA';
-VECTORTYPEMAP[arrow.flatbuf.VectorType.VALIDITY] = 'VALIDITY';
-VECTORTYPEMAP[arrow.flatbuf.VectorType.TYPE] = 'TYPE';
+VECTORTYPEMAP[VectorType.OFFSET] = 'OFFSET';
+VECTORTYPEMAP[VectorType.DATA] = 'DATA';
+VECTORTYPEMAP[VectorType.VALIDITY] = 'VALIDITY';
+VECTORTYPEMAP[VectorType.TYPE] = 'TYPE';
function parseField(field) {
var children = [];
@@ -149,7 +435,6 @@ function parseField(field) {
var layouts = [];
for (var i = 0; i < field.layoutLength(); i += 1|0) {
layouts.push(VECTORTYPEMAP[field.layout(i).type()]);
-
}
return {
@@ -170,32 +455,39 @@ function parseSchema(schema) {
return result;
}
-function parseBuffer(buffer) {
- return {
- offset: buffer.offset(),
- length: buffer.length()
- };
+function loadVectors(bb, vectors: Vector[], recordBatch) {
+ var indices = { bufferIndex: 0, nodeIndex: 0 }, i;
+ for (i = 0; i < vectors.length; i += 1) {
+ loadVector(bb, vectors[i], recordBatch, indices);
+ }
}
-function loadBuffersIntoVectors(recordBatchBlocks, bb, vectors : Vector[]) {
- var fieldNode, recordBatchBlock, recordBatch, numBuffers, bufReader = {index: 0, node_index: 1}, field_ctr = 0;
- var buffer = bb.bytes_.buffer;
- var baseOffset = bb.bytes_.byteOffset;
- for (var i = recordBatchBlocks.length - 1; i >= 0; i -= 1|0) {
- recordBatchBlock = recordBatchBlocks[i];
- bb.setPosition(recordBatchBlock.offset.low);
- recordBatch = arrow.flatbuf.RecordBatch.getRootAsRecordBatch(bb);
- bufReader.index = 0;
- bufReader.node_index = 0;
- numBuffers = recordBatch.buffersLength();
-
- //console.log('num buffers: ' + recordBatch.buffersLength());
- //console.log('num nodes: ' + recordBatch.nodesLength());
-
- while (bufReader.index < numBuffers) {
- //console.log('Allocating buffers starting at ' + bufReader.index + '/' + numBuffers + ' to field ' + field_ctr);
- vectors[field_ctr].loadData(recordBatch, buffer, bufReader, baseOffset + recordBatchBlock.offset.low + recordBatchBlock.metaDataLength)
- field_ctr += 1;
- }
+/**
+ * Loads a vector with data from a batch
+ * recordBatch: { nodes: org.apache.arrow.flatbuf.FieldNode[], buffers: { offset: number, length: number }[] }
+ */
+function loadVector(bb, vector: Vector, recordBatch, indices) {
+ var node = recordBatch.nodes[indices.nodeIndex], ownBuffersLength, ownBuffers = [], i;
+ indices.nodeIndex += 1;
+
+ // dictionary vectors are always ints, so will have a data vector plus optional null vector
+ if (vector.field.dictionary() == null) {
+ ownBuffersLength = vector.field.layoutLength();
+ } else if (vector.field.nullable()) {
+ ownBuffersLength = 2;
+ } else {
+ ownBuffersLength = 1;
+ }
+
+ for (i = 0; i < ownBuffersLength; i += 1) {
+ ownBuffers.push(recordBatch.buffers[indices.bufferIndex + i]);
+ }
+ indices.bufferIndex += ownBuffersLength;
+
+ vector.loadData(bb, node, ownBuffers);
+
+ var children = vector.getChildVectors();
+ for (i = 0; i < children.length; i++) {
+ loadVector(bb, children[i], recordBatch, indices);
}
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/lib/bitarray.ts
----------------------------------------------------------------------
diff --git a/js/lib/bitarray.ts b/js/lib/bitarray.ts
index 82fff32..fc3c091 100644
--- a/js/lib/bitarray.ts
+++ b/js/lib/bitarray.ts
@@ -17,22 +17,9 @@
export class BitArray {
private view: Uint8Array;
- constructor(buffer: ArrayBuffer, offset: number, length: number) {
- //if (ArrayBuffer.isView(buffer)) {
- // var og_view = buffer;
- // buffer = buffer.buffer;
- // offset = og_view.offset;
- // length = og_view.length/og_view.BYTES_PER_ELEMENT*8;
- //} else if (buffer instanceof ArrayBuffer) {
- var offset = offset || 0;
- var length = length;// || buffer.length*8;
- //} else if (buffer instanceof Number) {
- // length = buffer;
- // buffer = new ArrayBuffer(Math.ceil(length/8));
- // offset = 0;
- //}
- this.view = new Uint8Array(buffer, offset, Math.ceil(length/8));
+ constructor(buffer: ArrayBuffer, offset: number, length: number) {
+ this.view = new Uint8Array(buffer, offset || 0, Math.ceil(length / 8));
}
get(i) {
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/lib/types.ts
----------------------------------------------------------------------
diff --git a/js/lib/types.ts b/js/lib/types.ts
index bbc7558..d656c6a 100644
--- a/js/lib/types.ts
+++ b/js/lib/types.ts
@@ -18,7 +18,8 @@
import { BitArray } from './bitarray';
import { TextDecoder } from 'text-encoding';
import { org } from './Arrow_generated';
-var arrow = org.apache.arrow;
+
+var Type = org.apache.arrow.flatbuf.Type;
interface ArrayView {
slice(start: number, end: number) : ArrayView
@@ -26,72 +27,90 @@ interface ArrayView {
}
export abstract class Vector {
+ field: any;
name: string;
length: number;
null_count: number;
- constructor(name: string) {
- this.name = name;
+
+ constructor(field) {
+ this.field = field;
+ this.name = field.name();
}
+
/* Access datum at index i */
abstract get(i);
/* Return array representing data in the range [start, end) */
abstract slice(start: number, end: number);
-
- /* Use recordBatch fieldNodes and Buffers to construct this Vector */
- public loadData(recordBatch: any, buffer: any, bufReader: any, baseOffset: any) {
- var fieldNode = recordBatch.nodes(bufReader.node_index);
- this.length = fieldNode.length();
- this.null_count = fieldNode.length();
- bufReader.node_index += 1|0;
-
- this.loadBuffers(recordBatch, buffer, bufReader, baseOffset);
- }
-
- protected abstract loadBuffers(recordBatch: any, buffer: any, bufReader: any, baseOffset: any);
-
- /* Helper function for loading a VALIDITY buffer (for Nullable types) */
- static loadValidityBuffer(recordBatch, buffer, bufReader, baseOffset) : BitArray {
- var buf_meta = recordBatch.buffers(bufReader.index);
- var offset = baseOffset + buf_meta.offset().low;
- var length = buf_meta.length().low;
- bufReader.index += 1|0;
- return new BitArray(buffer, offset, length*8);
- }
-
- /* Helper function for loading an OFFSET buffer */
- static loadOffsetBuffer(recordBatch, buffer, bufReader, baseOffset) : Int32Array {
- var buf_meta = recordBatch.buffers(bufReader.index);
- var offset = baseOffset + buf_meta.offset().low;
- var length = buf_meta.length().low/Int32Array.BYTES_PER_ELEMENT;
- bufReader.index += 1|0;
- return new Int32Array(buffer, offset, length);
+ /* Return array of child vectors, for container types */
+ abstract getChildVectors();
+
+ /**
+ * Use recordBatch fieldNodes and Buffers to construct this Vector
+ * bb: flatbuffers.ByteBuffer
+ * node: org.apache.arrow.flatbuf.FieldNode
+ * buffers: { offset: number, length: number }[]
+ */
+ public loadData(bb, node, buffers) {
+ this.length = node.length().low;
+ this.null_count = node.nullCount().low;
+ this.loadBuffers(bb, node, buffers);
+ }
+
+ protected abstract loadBuffers(bb, node, buffers);
+
+ /**
+ * Helper function for loading a VALIDITY buffer (for Nullable types)
+ * bb: flatbuffers.ByteBuffer
+ * buffer: org.apache.arrow.flatbuf.Buffer
+ */
+ static loadValidityBuffer(bb, buffer) : BitArray {
+ var arrayBuffer = bb.bytes_.buffer;
+ var offset = bb.bytes_.byteOffset + buffer.offset;
+ return new BitArray(arrayBuffer, offset, buffer.length * 8);
+ }
+
+ /**
+ * Helper function for loading an OFFSET buffer
+ * buffer: org.apache.arrow.flatbuf.Buffer
+ */
+ static loadOffsetBuffer(bb, buffer) : Int32Array {
+ var arrayBuffer = bb.bytes_.buffer;
+ var offset = bb.bytes_.byteOffset + buffer.offset;
+ var length = buffer.length / Int32Array.BYTES_PER_ELEMENT;
+ return new Int32Array(arrayBuffer, offset, length);
}
}
class SimpleVector<T extends ArrayView> extends Vector {
protected dataView: T;
- private TypedArray: {new(buffer: any, offset: number, length: number) : T, BYTES_PER_ELEMENT: number};
+ private TypedArray: { new(buffer: any, offset: number, length: number): T, BYTES_PER_ELEMENT: number };
- constructor (TypedArray: {new(buffer: any, offset: number, length: number): T, BYTES_PER_ELEMENT: number}, name: string) {
- super(name);
+ constructor (field, TypedArray: { new(buffer: any, offset: number, length: number): T, BYTES_PER_ELEMENT: number }) {
+ super(field);
this.TypedArray = TypedArray;
}
+ getChildVectors() {
+ return [];
+ }
+
get(i) {
return this.dataView[i];
}
- loadBuffers(recordBatch, buffer, bufReader, baseOffset) {
- this.dataView = this.loadDataBuffer(recordBatch, buffer, bufReader, baseOffset);
+ loadBuffers(bb, node, buffers) {
+ this.loadDataBuffer(bb, buffers[0]);
}
- loadDataBuffer(recordBatch, buffer, bufReader, baseOffset) : T {
- var buf_meta = recordBatch.buffers(bufReader.index);
- var offset = baseOffset + buf_meta.offset().low;
- var length = buf_meta.length().low/this.TypedArray.BYTES_PER_ELEMENT;
- bufReader.index += 1|0;
- return new this.TypedArray(buffer, offset, length);
+ /**
+ * buffer: org.apache.arrow.flatbuf.Buffer
+ */
+ protected loadDataBuffer(bb, buffer) {
+ var arrayBuffer = bb.bytes_.buffer;
+ var offset = bb.bytes_.byteOffset + buffer.offset;
+ var length = buffer.length / this.TypedArray.BYTES_PER_ELEMENT;
+ this.dataView = new this.TypedArray(arrayBuffer, offset, length);
}
getDataView() {
@@ -108,77 +127,173 @@ class SimpleVector<T extends ArrayView> extends Vector {
}
+/**
+ * SimpleVector variant that also keeps a validity bitmap; get() returns null
+ * for every slot whose validity bit is not set.
+ */
class NullableSimpleVector<T extends ArrayView> extends SimpleVector<T> {
- private validityView: BitArray;
+
+ // protected (rather than private) so subclasses can consult it in their own get()
+ protected validityView: BitArray;
+
+ /** Value at index i, or null when the validity bit for i is unset */
+ get(i: number) {
+ if (this.validityView.get(i)) {
+ return this.dataView[i];
+ } else {
+ return null;
+ }
+ }
+
+ /** buffers[0] is read as the validity bitmap, buffers[1] as the data buffer */
+ loadBuffers(bb, node, buffers) {
+ this.validityView = Vector.loadValidityBuffer(bb, buffers[0]);
+ this.loadDataBuffer(bb, buffers[1]);
+ }
+
+ /** Get the validity bitmap set by loadBuffers */
+ getValidityVector() {
+ return this.validityView;
+ }
+}
+
+// Fixed-width primitive vectors. Each class binds SimpleVector (or its
+// nullable variant) to the typed array matching the Arrow type's width AND
+// signedness; signed vectors must use Int*Array so that negative values are
+// not read back as large positive numbers.
+class Uint8Vector extends SimpleVector<Uint8Array> { constructor(field) { super(field, Uint8Array); }; }
+class Uint16Vector extends SimpleVector<Uint16Array> { constructor(field) { super(field, Uint16Array); }; }
+class Uint32Vector extends SimpleVector<Uint32Array> { constructor(field) { super(field, Uint32Array); }; }
+class Int8Vector extends SimpleVector<Int8Array> { constructor(field) { super(field, Int8Array); }; }
+class Int16Vector extends SimpleVector<Int16Array> { constructor(field) { super(field, Int16Array); }; }
+class Int32Vector extends SimpleVector<Int32Array> { constructor(field) { super(field, Int32Array); }; }
+class Float32Vector extends SimpleVector<Float32Array> { constructor(field) { super(field, Float32Array); }; }
+class Float64Vector extends SimpleVector<Float64Array> { constructor(field) { super(field, Float64Array); }; }
+
+class NullableUint8Vector extends NullableSimpleVector<Uint8Array> { constructor(field) { super(field, Uint8Array); }; }
+class NullableUint16Vector extends NullableSimpleVector<Uint16Array> { constructor(field) { super(field, Uint16Array); }; }
+class NullableUint32Vector extends NullableSimpleVector<Uint32Array> { constructor(field) { super(field, Uint32Array); }; }
+class NullableInt8Vector extends NullableSimpleVector<Int8Array> { constructor(field) { super(field, Int8Array); }; }
+class NullableInt16Vector extends NullableSimpleVector<Int16Array> { constructor(field) { super(field, Int16Array); }; }
+class NullableInt32Vector extends NullableSimpleVector<Int32Array> { constructor(field) { super(field, Int32Array); }; }
+class NullableFloat32Vector extends NullableSimpleVector<Float32Array> { constructor(field) { super(field, Float32Array); }; }
+class NullableFloat64Vector extends NullableSimpleVector<Float64Array> { constructor(field) { super(field, Float64Array); }; }
+
+/**
+ * Unsigned 64-bit integers, read as two consecutive Uint32 words per value.
+ * get() returns the pair as { low, high } instead of a single number.
+ */
+class Uint64Vector extends SimpleVector<Uint32Array> {
+ constructor(field) {
+ super(field, Uint32Array);
+ }
get(i: number) {
- if (this.validityView.get(i)) return this.dataView[i];
- else return null
+ // word 2*i becomes low and word 2*i+1 becomes high
+ return { low: this.dataView[i * 2], high: this.dataView[(i * 2) + 1] };
}
+}
- loadBuffers(recordBatch, buffer, bufReader, baseOffset) {
- this.validityView = Vector.loadValidityBuffer(recordBatch, buffer, bufReader, baseOffset);
- this.dataView = this.loadDataBuffer(recordBatch, buffer, bufReader, baseOffset);
+/**
+ * Nullable counterpart of Uint64Vector: get() returns { low, high } built from
+ * two consecutive Uint32 words, or null when the validity bit is unset.
+ */
+class NullableUint64Vector extends NullableSimpleVector<Uint32Array> {
+ constructor(field) {
+ super(field, Uint32Array);
}
+ get(i: number) {
+ if (this.validityView.get(i)) {
+ // word 2*i becomes low and word 2*i+1 becomes high
+ return { low: this.dataView[i * 2], high: this.dataView[(i * 2) + 1] };
+ } else {
+ return null;
+ }
+ }
}
-class Uint8Vector extends SimpleVector<Uint8Array> { constructor(name: string) { super(Uint8Array, name); }; }
-class Uint16Vector extends SimpleVector<Uint16Array> { constructor(name: string) { super(Uint16Array, name); }; }
-class Uint32Vector extends SimpleVector<Uint32Array> { constructor(name: string) { super(Uint32Array, name); }; }
-class Int8Vector extends SimpleVector<Uint8Array> { constructor(name: string) { super(Uint8Array, name); }; }
-class Int16Vector extends SimpleVector<Uint16Array> { constructor(name: string) { super(Uint16Array, name); }; }
-class Int32Vector extends SimpleVector<Uint32Array> { constructor(name: string) { super(Uint32Array, name); }; }
-class Float32Vector extends SimpleVector<Float32Array> { constructor(name: string) { super(Float32Array, name); }; }
-class Float64Vector extends SimpleVector<Float64Array> { constructor(name: string) { super(Float64Array, name); }; }
-
-class NullableUint8Vector extends NullableSimpleVector<Uint8Array> { constructor(name: string) { super(Uint8Array, name); }; }
-class NullableUint16Vector extends NullableSimpleVector<Uint16Array> { constructor(name: string) { super(Uint16Array, name); }; }
-class NullableUint32Vector extends NullableSimpleVector<Uint32Array> { constructor(name: string) { super(Uint32Array, name); }; }
-class NullableInt8Vector extends NullableSimpleVector<Uint8Array> { constructor(name: string) { super(Uint8Array, name); }; }
-class NullableInt16Vector extends NullableSimpleVector<Uint16Array> { constructor(name: string) { super(Uint16Array, name); }; }
-class NullableInt32Vector extends NullableSimpleVector<Uint32Array> { constructor(name: string) { super(Uint32Array, name); }; }
-class NullableFloat32Vector extends NullableSimpleVector<Float32Array> { constructor(name: string) { super(Float32Array, name); }; }
-class NullableFloat64Vector extends NullableSimpleVector<Float64Array> { constructor(name: string) { super(Float64Array, name); }; }
+/**
+ * Non-nullable signed 64-bit integers, read as two consecutive 32-bit words
+ * per value and returned as { low, high }.
+ *
+ * Extends SimpleVector (not NullableSimpleVector) like every other
+ * non-nullable vector, so loadBuffers() reads buffers[0] as the data buffer
+ * instead of treating it as a validity bitmap.
+ */
+class Int64Vector extends SimpleVector<Uint32Array> {
+    constructor(field) {
+        super(field, Uint32Array);
+    }
+
+    /** { low, high } word pair for the value at index i */
+    get(i: number) {
+        return { low: this.dataView[i * 2], high: this.dataView[(i * 2) + 1] };
+    }
+}
+
+/**
+ * Nullable signed 64-bit integers. Values are exposed as { low, high } pairs
+ * of 32-bit words; slots without a validity bit yield null.
+ */
+class NullableInt64Vector extends NullableSimpleVector<Uint32Array> {
+    constructor(field) {
+        super(field, Uint32Array);
+    }
+
+    /** { low, high } for index i, or null when the slot is not valid */
+    get(i: number) {
+        if (!this.validityView.get(i)) {
+            return null;
+        }
+        var words = this.dataView;
+        return { low: words[i * 2], high: words[(i * 2) + 1] };
+    }
+}
+
+/**
+ * Date values read as 64-bit integers (two 32-bit words per value) and
+ * returned as JavaScript Date objects. The combined number is handed to
+ * `new Date(number)`, i.e. it is interpreted as milliseconds since the epoch.
+ */
+class DateVector extends SimpleVector<Uint32Array> {
+    constructor(field) {
+        super(field, Uint32Array);
+    }
+
+    /** Date for the value at index i */
+    get (i) {
+        var low = super.get(2 * i);
+        // The backing array is unsigned, so reinterpret the high word as a
+        // signed 32-bit value; otherwise negative timestamps (dates before
+        // 1970) turn into huge positive numbers and an Invalid Date.
+        var high = super.get(2 * i + 1) | 0;
+        return new Date(high * Math.pow(2, 32) + low);
+    }
+}
+
+/**
+ * DateVector with a validity bitmap: get() yields null for slots whose
+ * validity bit is unset and defers to DateVector.get() otherwise.
+ */
+class NullableDateVector extends DateVector {
+    private validityView: BitArray;
+
+    /** buffers[0] is read as the validity bitmap, buffers[1] as the data buffer */
+    loadBuffers(bb, node, buffers) {
+        var bitmap = Vector.loadValidityBuffer(bb, buffers[0]);
+        this.validityView = bitmap;
+        this.loadDataBuffer(bb, buffers[1]);
+    }
+
+    get (i) {
+        if (!this.validityView.get(i)) {
+            return null;
+        }
+        return super.get(i);
+    }
+
+    /** Get the validity bitmap set by loadBuffers */
+    getValidityVector() {
+        return this.validityView;
+    }
+}
+/**
+ * Variable-length UTF-8 strings. offsetView[i] and offsetView[i + 1] delimit
+ * the bytes of string i inside dataView; get() decodes that byte range.
+ */
class Utf8Vector extends SimpleVector<Uint8Array> {
protected offsetView: Int32Array;
static decoder: TextDecoder = new TextDecoder('utf8');
- constructor(name: string) {
- super(Uint8Array, name);
+ constructor(field) {
+ super(field, Uint8Array);
}
- loadBuffers(recordBatch, buffer, bufReader, baseOffset) {
- this.offsetView = Vector.loadOffsetBuffer(recordBatch, buffer, bufReader, baseOffset);
- this.dataView = this.loadDataBuffer(recordBatch, buffer, bufReader, baseOffset);
+ /** buffers[0] is read as the offset buffer, buffers[1] as the data buffer */
+ loadBuffers(bb, node, buffers) {
+ this.offsetView = Vector.loadOffsetBuffer(bb, buffers[0]);
+ this.loadDataBuffer(bb, buffers[1]);
}
get(i) {
- return Utf8Vector.decoder.decode
- (this.dataView.slice(this.offsetView[i], this.offsetView[i + 1]));
+ // subarray() is a view over the same bytes, so decoding does not need the copy slice() makes
+ return Utf8Vector.decoder.decode(this.dataView.subarray(this.offsetView[i], this.offsetView[i + 1]));
}
slice(start: number, end: number) {
- var rtrn: string[] = [];
+ var result: string[] = [];
for (var i: number = start; i < end; i += 1|0) {
- rtrn.push(this.get(i));
+ result.push(this.get(i));
}
- return rtrn;
+ return result;
+ }
+
+ /** Get the Int32Array of string offsets set by loadBuffers */
+ getOffsetView() {
+ return this.offsetView;
}
}
+/**
+ * Utf8Vector with a validity bitmap: get() returns null for slots whose
+ * validity bit is unset and the decoded string otherwise.
+ */
class NullableUtf8Vector extends Utf8Vector {
private validityView: BitArray;
- loadBuffers(recordBatch, buffer, bufReader, baseOffset) {
- this.validityView = Vector.loadValidityBuffer(recordBatch, buffer, bufReader, baseOffset);
- this.offsetView = Vector.loadOffsetBuffer(recordBatch, buffer, bufReader, baseOffset);
- this.dataView = this.loadDataBuffer(recordBatch, buffer, bufReader, baseOffset);
+ // buffers are read in the order validity, offsets, data
+ loadBuffers(bb, node, buffers) {
+ this.validityView = Vector.loadValidityBuffer(bb, buffers[0]);
+ this.offsetView = Vector.loadOffsetBuffer(bb, buffers[1]);
+ this.loadDataBuffer(bb, buffers[2]);
}
get(i) {
- if (!this.validityView.get(i)) return null;
- return super.get(i);
+ if (this.validityView.get(i)) {
+ return super.get(i);
+ } else {
+ return null;
+ }
+ }
+
+ /** Get the validity bitmap set by loadBuffers */
+ getValidityVector() {
+ return this.validityView;
}
}
@@ -186,14 +301,17 @@ class NullableUtf8Vector extends Utf8Vector {
class ListVector extends Uint32Vector {
private dataVector: Vector;
- constructor(name, dataVector : Vector) {
- super(name);
+ constructor(field, dataVector: Vector) {
+ super(field);
this.dataVector = dataVector;
}
- loadBuffers(recordBatch, buffer, bufReader, baseOffset) {
- super.loadBuffers(recordBatch, buffer, bufReader, baseOffset);
- this.dataVector.loadData(recordBatch, buffer, bufReader, baseOffset);
+ getChildVectors() {
+ return [this.dataVector];
+ }
+
+ loadBuffers(bb, node, buffers) {
+ super.loadBuffers(bb, node, buffers);
this.length -= 1;
}
@@ -210,119 +328,262 @@ class ListVector extends Uint32Vector {
return "length: " + (this.length);
}
- slice(start : number, end : number) { return []; };
+    /** Collect get(i) for every i in [start, end) into a new array */
+    slice(start: number, end: number) {
+        var items = [];
+        var index = start;
+        while (index < end) {
+            items.push(this.get(index));
+            index += 1;
+        }
+        return items;
+    }
}
+/**
+ * ListVector with a validity bitmap: get() returns null for slots whose
+ * validity bit is unset and defers to ListVector.get() otherwise.
+ */
class NullableListVector extends ListVector {
private validityView: BitArray;
- loadBuffers(recordBatch, buffer, bufReader, baseOffset) {
- this.validityView = Vector.loadValidityBuffer(recordBatch, buffer, bufReader, baseOffset);
- super.loadBuffers(recordBatch, buffer, bufReader, baseOffset);
+ // Does not call super.loadBuffers(); buffers[0] is the validity bitmap and
+ // buffers[1] is loaded as the data buffer of this vector.
+ loadBuffers(bb, node, buffers) {
+ this.validityView = Vector.loadValidityBuffer(bb, buffers[0]);
+ this.loadDataBuffer(bb, buffers[1]);
+ // same length adjustment as ListVector.loadBuffers
+ this.length -= 1;
}
get(i) {
- if (!this.validityView.get(i)) return null;
- return super.get(i);
+ if (this.validityView.get(i)) {
+ return super.get(i);
+ } else {
+ return null;
+ }
+ }
+
+ /** Get the validity bitmap set by loadBuffers */
+ getValidityVector() {
+ return this.validityView;
+ }
+}
+
+/**
+ * List vector whose entries all have the same number of elements. It owns no
+ * buffers itself; entry i is the range [i * size, (i + 1) * size) of the
+ * child data vector.
+ */
+class FixedSizeListVector extends Vector {
+    private size: number
+    private dataVector: Vector;
+
+    constructor(field, size: number, dataVector: Vector) {
+        super(field);
+        this.size = size;
+        this.dataVector = dataVector;
+    }
+
+    /** The single child is the vector holding the list elements */
+    getChildVectors() {
+        return [this.dataVector];
+    }
+
+    loadBuffers(bb, node, buffers) {
+        // no buffers to load
+    }
+
+    /** Elements of list i, obtained by slicing the child vector */
+    get(i: number) {
+        var begin = i * this.size;
+        var end = (i + 1) * this.size;
+        return this.dataVector.slice(begin, end);
+    }
+
+    /** Collect get(i) for every i in [start, end) into a new array */
+    slice(start : number, end : number) {
+        var lists = [];
+        var index = start;
+        while (index < end) {
+            lists.push(this.get(index));
+            index += 1;
+        }
+        return lists;
+    }
+
+    /** Number of elements in each list entry */
+    getListSize() {
+        return this.size;
+    }
+}
+
+/**
+ * FixedSizeListVector with a validity bitmap: get() returns null for slots
+ * whose validity bit is unset and the sliced child values otherwise.
+ */
+class NullableFixedSizeListVector extends FixedSizeListVector {
+ private validityView: BitArray;
+
+ // the validity bitmap (buffers[0]) is the only buffer this vector loads
+ loadBuffers(bb, node, buffers) {
+ this.validityView = Vector.loadValidityBuffer(bb, buffers[0]);
+ }
+
+ get(i: number) {
+ if (this.validityView.get(i)) {
+ return super.get(i);
+ } else {
+ return null;
+ }
+ }
+
+ /** Get the validity bitmap set by loadBuffers */
+ getValidityVector() {
+ return this.validityView;
}
}
+/**
+ * Struct column: a validity bitmap plus one child vector per struct member.
+ * get(i) returns an array with each child's value at i, or null when the
+ * validity bit for i is unset.
+ */
class StructVector extends Vector {
private validityView: BitArray;
- private vectors : Vector[];
- constructor(name: string, vectors: Vector[]) {
- super(name);
+ private vectors: Vector[];
+
+ constructor(field, vectors: Vector[]) {
+ super(field);
this.vectors = vectors;
}
- loadBuffers(recordBatch, buffer, bufReader, baseOffset) {
- this.validityView = Vector.loadValidityBuffer(recordBatch, buffer, bufReader, baseOffset);
- this.vectors.forEach((v: Vector) => v.loadData(recordBatch, buffer, bufReader, baseOffset));
+ /** The child vectors, in the order passed to the constructor */
+ getChildVectors() {
+ return this.vectors;
+ }
+
+ // Only the validity bitmap is loaded here; unlike the removed version above,
+ // this method no longer loads the child vectors.
+ loadBuffers(bb, node, buffers) {
+ this.validityView = Vector.loadValidityBuffer(bb, buffers[0]);
}
get(i : number) {
- if (!this.validityView.get(i)) return null;
- return this.vectors.map((v: Vector) => v.get(i));
+ if (this.validityView.get(i)) {
+ return this.vectors.map((v: Vector) => v.get(i));
+ } else {
+ return null;
+ }
}
slice(start : number, end : number) {
- var rtrn = [];
- for (var i: number = start; i < end; i += 1|0) {
- rtrn.push(this.get(i));
+ var result = [];
+ for (var i = start; i < end; i += 1|0) {
+ result.push(this.get(i));
}
- return rtrn;
+ return result;
+ }
+
+ /** Get the validity bitmap set by loadBuffers */
+ getValidityVector() {
+ return this.validityView;
}
}
-class DateVector extends SimpleVector<Uint32Array> {
- constructor (name: string) {
- super(Uint32Array, name);
+/**
+ * Dictionary-encoded column. `indices` holds one encoded value per slot and
+ * `dictionary` holds the decoded values; get(i) looks the index up in the
+ * dictionary, and a null index decodes to null.
+ */
+class DictionaryVector extends Vector {
+
+ private indices: Vector;
+ private dictionary: Vector;
+
+ constructor (field, indices: Vector, dictionary: Vector) {
+ super(field);
+ this.indices = indices;
+ this.dictionary = dictionary;
}
- get (i) {
- return new Date(super.get(2*i+1)*Math.pow(2,32) + super.get(2*i));
+ /** Decoded value at index i, or null when the encoded index is null */
+ get(i) {
+ var encoded = this.indices.get(i);
+ if (encoded == null) {
+ return null;
+ } else {
+ return this.dictionary.get(encoded);
+ }
}
-}
-class NullableDateVector extends DateVector {
- private validityView: BitArray;
+ /** Get the dictionary encoded value */
+ public getEncoded(i) {
+ return this.indices.get(i);
+ }
- loadBuffers(recordBatch, buffer, bufReader, baseOffset) {
- this.validityView = Vector.loadValidityBuffer(recordBatch, buffer, bufReader, baseOffset);
- super.loadBuffers(recordBatch, buffer, bufReader, baseOffset);
+ /** Decoded values for every index in [start, end), as a new array */
+ slice(start, end) {
+ // decode each slot through get() so slice() agrees with get()
+ var result = [];
+ for (var i = start; i < end; i += 1|0) {
+ result.push(this.get(i));
+ }
+ return result;
}
- get (i) {
- if (!this.validityView.get(i)) return null;
- return super.get(i);
+ getChildVectors() {
+ return this.indices.getChildVectors();
+ }
+
+ // all buffers belong to the index vector; the dictionary is supplied at construction
+ loadBuffers(bb, node, buffers) {
+ this.indices.loadData(bb, node, buffers);
+ }
+
+ /** Get the index (encoded) vector */
+ public getIndexVector() {
+ return this.indices;
+ }
+
+ /** Get the dictionary vector */
+ public getDictionaryVector() {
+ return this.dictionary;
+ }
+
+ toString() {
+ return this.indices.toString();
}
}
-var BASIC_TYPES = [arrow.flatbuf.Type.Int, arrow.flatbuf.Type.FloatingPoint, arrow.flatbuf.Type.Utf8, arrow.flatbuf.Type.Date];
-
-export function vectorFromField(field) : Vector {
- var typeType = field.typeType();
- if (BASIC_TYPES.indexOf(typeType) >= 0) {
- var type = field.typeType();
- if (type === arrow.flatbuf.Type.Int) {
- type = field.type(new arrow.flatbuf.Int());
- var VectorConstructor : {new(string) : Vector};
- if (type.isSigned()) {
- if (type.bitWidth() == 32)
- VectorConstructor = field.nullable() ? NullableInt32Vector : Int32Vector;
- else if (type.bitWidth() == 16)
- VectorConstructor = field.nullable() ? NullableInt16Vector : Int16Vector;
- else if (type.bitWidth() == 8)
- VectorConstructor = field.nullable() ? NullableInt8Vector : Int8Vector;
+/**
+ * Build the (still empty) Vector that matches a schema field.
+ *
+ * field: schema field; its dictionary(), nullable(), typeType(), type() and
+ * children accessors are consulted
+ * dictionaries: object mapping a dictionary id (as a string) to the Vector
+ * holding that dictionary's values
+ *
+ * Fields without a dictionary are mapped by type: List and FixedSizeList
+ * recurse into children(0), Struct_ recurses into every child, and Int,
+ * FloatingPoint, Utf8 and Date map to the matching (Nullable) vector class.
+ * Fields with a dictionary become a DictionaryVector over an int index vector.
+ *
+ * Throws a string for an unimplemented type or FloatingPoint precision.
+ */
+export function vectorFromField(field, dictionaries) : Vector {
+ var dictionary = field.dictionary(), nullable = field.nullable();
+ if (dictionary == null) {
+ var typeType = field.typeType();
+ if (typeType === Type.List) {
+ var dataVector = vectorFromField(field.children(0), dictionaries);
+ return nullable ? new NullableListVector(field, dataVector) : new ListVector(field, dataVector);
+ } else if (typeType === Type.FixedSizeList) {
+ var dataVector = vectorFromField(field.children(0), dictionaries);
+ var size = field.type(new org.apache.arrow.flatbuf.FixedSizeList()).listSize();
+ if (nullable) {
+ return new NullableFixedSizeListVector(field, size, dataVector);
} else {
- if (type.bitWidth() == 32)
- VectorConstructor = field.nullable() ? NullableUint32Vector : Uint32Vector;
- else if (type.bitWidth() == 16)
- VectorConstructor = field.nullable() ? NullableUint16Vector : Uint16Vector;
- else if (type.bitWidth() == 8)
- VectorConstructor = field.nullable() ? NullableUint8Vector : Uint8Vector;
+ return new FixedSizeListVector(field, size, dataVector);
+ }
+ } else if (typeType === Type.Struct_) {
+ // a StructVector is created whether or not the field is nullable
+ var vectors : Vector[] = [];
+ for (var i : number = 0; i < field.childrenLength(); i += 1|0) {
+ vectors.push(vectorFromField(field.children(i), dictionaries));
+ }
+ return new StructVector(field, vectors);
+ } else {
+ if (typeType === Type.Int) {
+ var type = field.type(new org.apache.arrow.flatbuf.Int());
+ return _createIntVector(field, type.bitWidth(), type.isSigned(), nullable)
+ } else if (typeType === Type.FloatingPoint) {
+ var precision = field.type(new org.apache.arrow.flatbuf.FloatingPoint()).precision();
+ if (precision == org.apache.arrow.flatbuf.Precision.SINGLE) {
+ return nullable ? new NullableFloat32Vector(field) : new Float32Vector(field);
+ } else if (precision == org.apache.arrow.flatbuf.Precision.DOUBLE) {
+ return nullable ? new NullableFloat64Vector(field) : new Float64Vector(field);
+ } else {
+ throw "Unimplemented FloatingPoint precision " + precision;
+ }
+ } else if (typeType === Type.Utf8) {
+ return nullable ? new NullableUtf8Vector(field) : new Utf8Vector(field);
+ } else if (typeType === Type.Date) {
+ return nullable ? new NullableDateVector(field) : new DateVector(field);
+ } else {
+ throw "Unimplemented type " + typeType;
}
- } else if (type === arrow.flatbuf.Type.FloatingPoint) {
- type = field.type(new arrow.flatbuf.FloatingPoint());
- if (type.precision() == arrow.flatbuf.Precision.SINGLE)
- VectorConstructor = field.nullable() ? NullableFloat32Vector : Float32Vector;
- else if (type.precision() == arrow.flatbuf.Precision.DOUBLE)
- VectorConstructor = field.nullable() ? NullableFloat64Vector : Float64Vector;
- } else if (type === arrow.flatbuf.Type.Utf8) {
- VectorConstructor = field.nullable() ? NullableUtf8Vector : Utf8Vector;
- } else if (type === arrow.flatbuf.Type.Date) {
- VectorConstructor = field.nullable() ? NullableDateVector : DateVector;
}
+ } else {
+ // determine arrow type - default is signed 32 bit int
+ var type = dictionary.indexType(), bitWidth = 32, signed = true;
+ if (type != null) {
+ bitWidth = type.bitWidth();
+ signed = type.isSigned();
+ }
+ var indices = _createIntVector(field, bitWidth, signed, nullable);
+ // dictionaries is keyed by the dictionary id converted to a string
+ return new DictionaryVector(field, indices, dictionaries[dictionary.id().toFloat64().toString()]);
+ }
+}
- return new VectorConstructor(field.name());
- } else if (typeType === arrow.flatbuf.Type.List) {
- var dataVector = vectorFromField(field.children(0));
- return field.nullable() ? new NullableListVector(field.name(), dataVector) : new ListVector(field.name(), dataVector);
- } else if (typeType === arrow.flatbuf.Type.Struct_) {
- var vectors : Vector[] = [];
- for (var i : number = 0; i < field.childrenLength(); i += 1|0) {
- vectors.push(vectorFromField(field.children(i)));
+/**
+ * Pick the integer vector class for a bit width / signedness / nullability
+ * combination and instantiate it for `field`.
+ *
+ * bitWidth: 8, 16, 32 or 64; any other value throws a string
+ * signed: truthy selects the Int* classes, falsy the Uint* classes
+ * nullable: truthy selects the Nullable* variant
+ */
+function _createIntVector(field, bitWidth, signed, nullable) {
+ if (bitWidth == 64) {
+ if (signed) {
+ return nullable ? new NullableInt64Vector(field) : new Int64Vector(field);
+ } else {
+ return nullable ? new NullableUint64Vector(field) : new Uint64Vector(field);
+ }
+ } else if (bitWidth == 32) {
+ if (signed) {
+ return nullable ? new NullableInt32Vector(field) : new Int32Vector(field);
+ } else {
+ return nullable ? new NullableUint32Vector(field) : new Uint32Vector(field);
+ }
+ } else if (bitWidth == 16) {
+ if (signed) {
+ return nullable ? new NullableInt16Vector(field) : new Int16Vector(field);
+ } else {
+ return nullable ? new NullableUint16Vector(field) : new Uint16Vector(field);
+ }
+ } else if (bitWidth == 8) {
+ if (signed) {
+ return nullable ? new NullableInt8Vector(field) : new Int8Vector(field);
+ } else {
+ return nullable ? new NullableUint8Vector(field) : new Uint8Vector(field);
}
- return new StructVector(field.name(), vectors);
+ } else {
+ throw "Unimplemented Int bit width " + bitWidth;
}
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/package.json
----------------------------------------------------------------------
diff --git a/js/package.json b/js/package.json
index b1e583b..8687f50 100644
--- a/js/package.json
+++ b/js/package.json
@@ -4,16 +4,20 @@
"description": "",
"main": "dist/arrow.js",
"scripts": {
- "postinstall": "./postinstall.sh",
- "test": "echo \"Error: no test specified\" && exit 1"
+ "postinstall": "./flatbuffers.sh",
+ "build": "./flatbuffers.sh && tsc && webpack",
+ "test": "./node_modules/mocha/bin/mocha ./spec/arrow.js"
},
"author": "",
"license": "Apache-2.0",
"devDependencies": {
- "flatbuffers": "^1.5.0",
- "text-encoding": "^0.6.4"
+ "chai": "^3.5.0",
+ "mocha": "^3.3.0",
+ "webpack": "^2.3.3"
},
"dependencies": {
+ "flatbuffers": "^1.5.0",
+ "text-encoding": "^0.6.4",
"commander": "^2.9.0"
}
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/postinstall.sh
----------------------------------------------------------------------
diff --git a/js/postinstall.sh b/js/postinstall.sh
deleted file mode 100755
index 1e6622f..0000000
--- a/js/postinstall.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. See accompanying LICENSE file.
-
-echo "Compiling flatbuffer schemas..."
-#flatc -o lib --js ../format/Message.fbs ../format/File.fbs
-flatc -o lib --js ../format/*.fbs
-cat lib/*_generated.js > lib/Arrow_generated.js
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/spec/arrow.js
----------------------------------------------------------------------
diff --git a/js/spec/arrow.js b/js/spec/arrow.js
new file mode 100644
index 0000000..61a6f81
--- /dev/null
+++ b/js/spec/arrow.js
@@ -0,0 +1,179 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+var fs = require('fs');
+var chai = require('chai');
+var assert = chai.assert;
+var path= require('path');
+var arrow = require('../dist/arrow.js');
+
+test_files = [
+ {
+ name: 'simple',
+ batches: 1,
+ fields: [
+ {
+ "name": "foo",
+ "type": "Int",
+ "data": [[1, null, 3, 4, 5]]
+ },
+ {
+ "name": "bar",
+ "type": "FloatingPoint",
+ "data": [[1.0, null, null, 4.0, 5.0]]
+ },
+ {
+ "name": "baz",
+ "type": "Utf8",
+ "data": [["aa", null, null, "bbb", "cccc"]]
+ }
+ ]
+ },
+ {
+ name: 'struct_example',
+ batches: 2,
+ fields: [
+ {
+ "name": "struct_nullable",
+ "type": "Struct",
+ "data": [
+ [
+ null,
+ [null, 'MhRNxD4'],
+ [137773603, '3F9HBxK'],
+ [410361374, 'aVd88fp'],
+ null,
+ [null, '3loZrRf'],
+ null
+ ], [
+ null,
+ [null,null],
+ [null,null],
+ null,
+ [null, '78SLiRw'],
+ null,
+ null,
+ [null, '0ilsf82'],
+ [null, 'LjS9MbU'],
+ [null, null],
+ ]
+ ]
+ }
+ ]
+ },
+ {
+ name: 'dictionary',
+ batches: 2,
+ fields: [
+ {
+ "name": "example-csv",
+ "type": "Struct",
+ "data": [
+ [
+ ["Hermione", 25, new Float32Array([-53.235599517822266, 40.231998443603516])],
+ ["Severus", 30, new Float32Array([-62.22999954223633, 3])],
+ ], [
+ ["Harry", 20, new Float32Array([23, -100.23652648925781])]
+ ]
+ ]
+ }
+ ]
+ },
+];
+
+var buf;
+
+function makeSchemaChecks(fields) {
+ describe('schema', function () {
+ var schema;
+ beforeEach(function () {
+ schema = arrow.getSchema(buf);
+ });
+
+ it('should read the number of fields', function () {
+ assert.lengthOf(schema, fields.length);
+ });
+
+ it("should understand fields", function () {
+ for (i = 0; i < fields.length; i += 1|0) {
+ assert.equal(schema[i].name, fields[i].name);
+ assert.equal(schema[i].type, fields[i].type,
+ 'bad type for field ' + schema[i].name);
+ }
+ });
+ });
+}
+
+function makeDataChecks (batches, fields) {
+ describe('data', function() {
+ var reader;
+ beforeEach(function () {
+ reader = arrow.getReader(buf)
+ });
+ it('should read the correct number of record batches', function () {
+ assert.equal(reader.getBatchCount(), batches);
+ });
+ fields.forEach(function (field, i) {
+ it('should read ' + field.type + ' vector ' + field.name, function () {
+ for (var batch_idx = 0; batch_idx < batches; batch_idx += 1|0) {
+ reader.loadNextBatch();
+ var batch = field.data[batch_idx];
+ var vector = reader.getVector(field.name)
+ assert.isDefined(vector, "vector " + field.name);
+ assert.lengthOf(vector, batch.length, "vector " + field.name)
+ for (i = 0; i < vector.length; i += 1|0) {
+ if (field.type == "Date") {
+ assert.equal(vector.get(i).getTime(), batch[i].getTime(),
+ "vector " + field.name + " index " + i);
+ } else {
+ assert.deepEqual(vector.get(i), batch[i],
+ "vector " + field.name + " index " + i);
+ }
+ }
+ }
+ });
+ });
+ });
+}
+
+describe('arrow random-access file', function () {
+ test_files.forEach(function (test_file) {
+ describe(test_file.name, function () {
+ var fields = test_file.fields
+ beforeEach(function () {
+ buf = fs.readFileSync(path.resolve(__dirname, test_file.name + '.arrow'));
+ });
+
+ makeSchemaChecks(fields);
+ makeDataChecks(test_file.batches, fields);
+ })
+ });
+});
+
+describe('arrow streaming file format', function () {
+ test_files.forEach(function (test_file) {
+ describe(test_file.name, function () {
+ var fields = test_file.fields
+ beforeEach(function () {
+ buf = fs.readFileSync(path.resolve(__dirname, test_file.name + '-stream.arrow'));
+ });
+
+ makeSchemaChecks(fields);
+ makeDataChecks(test_file.batches, fields);
+ })
+ });
+});
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/spec/dictionary-stream.arrow
----------------------------------------------------------------------
diff --git a/js/spec/dictionary-stream.arrow b/js/spec/dictionary-stream.arrow
new file mode 100644
index 0000000..17ca48b
Binary files /dev/null and b/js/spec/dictionary-stream.arrow differ
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/spec/dictionary.arrow
----------------------------------------------------------------------
diff --git a/js/spec/dictionary.arrow b/js/spec/dictionary.arrow
new file mode 100644
index 0000000..34d41db
Binary files /dev/null and b/js/spec/dictionary.arrow differ
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/spec/simple-stream.arrow
----------------------------------------------------------------------
diff --git a/js/spec/simple-stream.arrow b/js/spec/simple-stream.arrow
new file mode 100644
index 0000000..2c68c0e
Binary files /dev/null and b/js/spec/simple-stream.arrow differ
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/spec/simple.arrow
----------------------------------------------------------------------
diff --git a/js/spec/simple.arrow b/js/spec/simple.arrow
new file mode 100644
index 0000000..838db6d
Binary files /dev/null and b/js/spec/simple.arrow differ
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/spec/struct_example-stream.arrow
----------------------------------------------------------------------
diff --git a/js/spec/struct_example-stream.arrow b/js/spec/struct_example-stream.arrow
new file mode 100644
index 0000000..4e97b70
Binary files /dev/null and b/js/spec/struct_example-stream.arrow differ
http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/spec/struct_example.arrow
----------------------------------------------------------------------
diff --git a/js/spec/struct_example.arrow b/js/spec/struct_example.arrow
new file mode 100644
index 0000000..3d2c018
Binary files /dev/null and b/js/spec/struct_example.arrow differ