You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by bh...@apache.org on 2018/12/04 03:12:46 UTC
[arrow] branch master updated: ARROW-2909: [JS] Add convenience function for creating a table from a list of vectors
This is an automated email from the ASF dual-hosted git repository.
bhulette pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 0729cb7 ARROW-2909: [JS] Add convenience function for creating a table from a list of vectors
0729cb7 is described below
commit 0729cb771bd51f60423b52d44a50bddc45653d90
Author: Brian Hulette <hu...@gmail.com>
AuthorDate: Mon Dec 3 19:12:34 2018 -0800
ARROW-2909: [JS] Add convenience function for creating a table from a list of vectors
Simplifies the creation of a `Table` from JS Arrays:
```js
const LENGTH = 20000;
const idVec = Arrow.vector.IntVector.from(
Uint32Array.from({length: LENGTH}, () => Math.round(Math.random() * 2E9))
);
const latVec = Arrow.vector.FloatVector.from(
Float32Array.from({length: LENGTH}, () => Number((Math.random() * 180 - 90).toFixed(1)))
);
const lngVec = Arrow.vector.FloatVector.from(
Float32Array.from({length: LENGTH}, () => Number((Math.random() * 360 - 180).toFixed(1)))
);
const table = Arrow.Table.fromVectors([idVec, latVec, lngVec], ['id', 'lat', 'lng'])
console.log(table.schema.fields.map((f) => f.name));
// [ 'id', 'lat', 'lng' ]
console.log(table.schema.fields.map((f) => f.type));
// [ Uint32 [Int] { TType: 2, children: undefined, isSigned: false, bitWidth: 32 },
// Float32 [Float] { TType: 3, children: undefined, precision: 1 },
// Float32 [Float] { TType: 3, children: undefined, precision: 1 } ]
```
Author: Brian Hulette <hu...@gmail.com>
Closes #2322 from TheNeuralBit/from-vectors and squashes the following commits:
12b4c286 <Brian Hulette> Update "Table from JS Arrays" example
cfc4948a <Brian Hulette> Use Table.fromVectors in table-tests
f6551a3e <Brian Hulette> Add convenience function for constructing a Table from a list of Vectors
---
js/README.md | 26 +--
js/src/Arrow.externs.js | 2 +
js/src/Arrow.ts | 1 +
js/src/recordbatch.ts | 4 +-
js/src/table.ts | 7 +
js/src/type.ts | 4 +-
js/test/unit/table-tests.ts | 392 ++++++++------------------------------------
7 files changed, 86 insertions(+), 350 deletions(-)
diff --git a/js/README.md b/js/README.md
index e048ba1..15d7ed0 100644
--- a/js/README.md
+++ b/js/README.md
@@ -94,26 +94,14 @@ console.log(table.toString());
### Create a Table from JavaScript arrays
```es6
-const fields = [{
- name: 'precipitation',
- type: { name: 'floatingpoint', precision: 'SINGLE'},
- nullable: false, children: []
- }, {
- name: 'date',
- type: { name: 'date', unit: 'MILLISECOND' },
- nullable: false, children: []
- }];
-const rainAmounts = Array.from({length: LENGTH}, () => Number((Math.random() * 20).toFixed(1)));
-const rainDates = Array.from({length: LENGTH}, (_, i) => Date.now() - 1000 * 60 * 60 * 24 * i);
-
const LENGTH = 2000;
-const rainfall = arrow.Table.from({
- schema: { fields: fields },
- batches: [{
- count: LENGTH,
- columns: [
- {name: "precipitation", count: LENGTH, VALIDITY: [], DATA: rainAmounts },
- {name: "date", count: LENGTH, VALIDITY: [], DATA: rainDates } ] }] })
+const rainAmounts = Float32Array.from({length: LENGTH}, () => Number((Math.random() * 20).toFixed(1)));
+const rainDates = Array.from({length: LENGTH}, (_, i) => new Date(Date.now() - 1000 * 60 * 60 * 24 * i));
+
+const rainfall = arrow.Table.fromVectors(
+ [FloatVector.from(rainAmounts), DateVector.from(rainDates)],
+ ['precipitation', 'date']
+);
```
### Load data with `fetch`
diff --git a/js/src/Arrow.externs.js b/js/src/Arrow.externs.js
index f01ea5c..7ad0665 100644
--- a/js/src/Arrow.externs.js
+++ b/js/src/Arrow.externs.js
@@ -32,6 +32,8 @@ var Table = function() {};
/** @type {?} */
Table.from = function() {};
/** @type {?} */
+Table.fromVectors = function() {};
+/** @type {?} */
Table.fromAsync = function() {};
/** @type {?} */
Table.fromStruct = function() {};
diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts
index b1f4a3a..c76578b 100644
--- a/js/src/Arrow.ts
+++ b/js/src/Arrow.ts
@@ -238,6 +238,7 @@ try {
// set them via string indexers to save them from the mangler
Schema['from'] = Schema.from;
Table['from'] = Table.from;
+Table['fromVectors'] = Table.fromVectors;
Table['fromAsync'] = Table.fromAsync;
Table['fromStruct'] = Table.fromStruct;
Table['empty'] = Table.empty;
diff --git a/js/src/recordbatch.ts b/js/src/recordbatch.ts
index 91ea5cf..cfc236d 100644
--- a/js/src/recordbatch.ts
+++ b/js/src/recordbatch.ts
@@ -25,8 +25,8 @@ import { valueToString, leftPad } from './util/pretty';
import Long = flatbuffers.Long;
export class RecordBatch<T extends StructData = StructData> extends StructVector<T> {
- public static from<R extends StructData = StructData>(vectors: Vector[]) {
- return new RecordBatch<R>(Schema.from(vectors),
+ public static from<R extends StructData = StructData>(vectors: Vector[], names?: string[]) {
+ return new RecordBatch<R>(Schema.from(vectors, names),
Math.max(...vectors.map((v) => v.length)),
vectors
);
diff --git a/js/src/table.ts b/js/src/table.ts
index 634092f..3559cd8 100644
--- a/js/src/table.ts
+++ b/js/src/table.ts
@@ -38,6 +38,9 @@ export interface DataFrame<T extends StructData = StructData> {
export class Table<T extends StructData = StructData> implements DataFrame {
static empty<R extends StructData = StructData>() { return new Table<R>(new Schema([]), []); }
+ static fromVectors<R extends StructData = StructData>(vectors: Vector[], names?: string[]) {
+ return new Table<R>([RecordBatch.from<R>(vectors, names)])
+ }
static from<R extends StructData = StructData>(sources?: Iterable<Uint8Array | Buffer | string> | object | string) {
if (sources) {
let schema: Schema | undefined;
@@ -199,6 +202,10 @@ export class Table<T extends StructData = StructData> implements DataFrame {
}
}
+// protect batches, batchesUnion from es2015/umd mangler
+(<any> Table.prototype).batches = Object.freeze([]);
+(<any> Table.prototype).batchesUnion = Object.freeze([]);
+
class FilteredDataFrame<T extends StructData = StructData> implements DataFrame<T> {
private predicate: Predicate;
private batches: RecordBatch<T>[];
diff --git a/js/src/type.ts b/js/src/type.ts
index 811086c..3f75903 100644
--- a/js/src/type.ts
+++ b/js/src/type.ts
@@ -47,8 +47,8 @@ function generateDictionaryMap(fields: Field[]) {
}
export class Schema {
- public static from(vectors: Vector[]) {
- return new Schema(vectors.map((v, i) => new Field('' + i, v.type)));
+ public static from(vectors: Vector[], names?: string[]) {
+ return new Schema(vectors.map((v, i) => new Field(names ? names[i] : ('' + i), v.type)));
}
// @ts-ignore
protected _bodyLength: number;
diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts
index 4ee1411..3a90a0d 100644
--- a/js/test/unit/table-tests.ts
+++ b/js/test/unit/table-tests.ts
@@ -16,13 +16,20 @@
// under the License.
import '../jest-extensions';
+import { TextEncoder } from 'text-encoding-utf-8';
import Arrow, { vector, RecordBatch } from '../Arrow';
const { predicate, Table } = Arrow;
+const { DictionaryVector, IntVector, FloatVector, Utf8Vector } = Arrow.vector;
+const { Dictionary, Utf8, Int } = Arrow.type;
+
const { col, lit, custom, and, or, And, Or } = predicate;
+const utf8Encoder = new TextEncoder('utf-8');
+
+const NAMES = ['f32', 'i32', 'dictionary'];
const F32 = 0, I32 = 1, DICT = 2;
const test_data = [
{
@@ -336,338 +343,69 @@ function leftPad(str: string, fill: string, n: number) {
return (new Array(n + 1).join(fill) + str).slice(-1 * n);
}
+function makeUtf8Vector(values) {
+ const n = values.length;
+ let offset = 0;
+ const offsets = Uint32Array.of(0, ...values.map((d) => { offset += d.length; return offset; }));
+ return new Utf8Vector(new Arrow.data.FlatListData(new Utf8(), n, null, offsets, utf8Encoder.encode(values.join(''))));
+}
+
+function getTestVectors(f32Values, i32Values, dictionaryValues) {
+ const f32Vec = FloatVector.from(
+ Float32Array.from(f32Values)
+ );
+
+ const i32Vec = IntVector.from(
+ Int32Array.from(i32Values)
+ );
+
+ const dictionaryVec = new DictionaryVector(
+ new Arrow.data.DictionaryData(
+ new Dictionary(new Utf8(), new Int(true, 8)),
+ makeUtf8Vector(['a', 'b', 'c']),
+ IntVector.from(Int8Array.from(dictionaryValues)).data
+ )
+ );
+
+ return [f32Vec, i32Vec, dictionaryVec];
+}
+
export function getSingleRecordBatchTable() {
- return Table.from({
- 'schema': {
- 'fields': [
- {
- 'name': 'f32',
- 'type': {
- 'name': 'floatingpoint',
- 'precision': 'SINGLE'
- },
- 'nullable': false,
- 'children': [],
- },
- {
- 'name': 'i32',
- 'type': {
- 'name': 'int',
- 'isSigned': true,
- 'bitWidth': 32
- },
- 'nullable': false,
- 'children': [],
- },
- {
- 'name': 'dictionary',
- 'type': {
- 'name': 'utf8'
- },
- 'nullable': false,
- 'children': [],
- 'dictionary': {
- 'id': 0,
- 'indexType': {
- 'name': 'int',
- 'isSigned': true,
- 'bitWidth': 8
- },
- 'isOrdered': false
- }
- }
- ]
- },
- 'dictionaries': [{
- 'id': 0,
- 'data': {
- 'count': 3,
- 'columns': [
- {
- 'name': 'DICT0',
- 'count': 3,
- 'VALIDITY': [],
- 'OFFSET': [
- 0,
- 1,
- 2,
- 3
- ],
- 'DATA': [
- 'a',
- 'b',
- 'c',
- ]
- }
- ]
- }
- }],
- 'batches': [{
- 'count': 7,
- 'columns': [
- {
- 'name': 'f32',
- 'count': 7,
- 'VALIDITY': [],
- 'DATA': [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3]
- },
- {
- 'name': 'i32',
- 'count': 7,
- 'VALIDITY': [],
- 'DATA': [-1, 1, -1, 1, -1, 1, -1]
- },
- {
- 'name': 'dictionary',
- 'count': 7,
- 'VALIDITY': [],
- 'DATA': [0, 1, 2, 0, 1, 2, 0]
- }
- ]
- }]
- });
+ const vectors = getTestVectors(
+ [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3],
+ [-1, 1, -1, 1, -1, 1, -1],
+ [0, 1, 2, 0, 1, 2, 0]
+ );
+
+ return Table.fromVectors(
+ vectors,
+ NAMES
+ );
}
function getMultipleRecordBatchesTable() {
- return Table.from({
- 'schema': {
- 'fields': [
- {
- 'name': 'f32',
- 'type': {
- 'name': 'floatingpoint',
- 'precision': 'SINGLE'
- },
- 'nullable': false,
- 'children': [],
- },
- {
- 'name': 'i32',
- 'type': {
- 'name': 'int',
- 'isSigned': true,
- 'bitWidth': 32
- },
- 'nullable': false,
- 'children': [],
- },
- {
- 'name': 'dictionary',
- 'type': {
- 'name': 'utf8'
- },
- 'nullable': false,
- 'children': [],
- 'dictionary': {
- 'id': 0,
- 'indexType': {
- 'name': 'int',
- 'isSigned': true,
- 'bitWidth': 8
- },
- 'isOrdered': false
- }
- }
- ]
- },
- 'dictionaries': [{
- 'id': 0,
- 'data': {
- 'count': 3,
- 'columns': [
- {
- 'name': 'DICT0',
- 'count': 3,
- 'VALIDITY': [],
- 'OFFSET': [
- 0,
- 1,
- 2,
- 3
- ],
- 'DATA': [
- 'a',
- 'b',
- 'c',
- ]
- }
- ]
- }
- }],
- 'batches': [{
- 'count': 3,
- 'columns': [
- {
- 'name': 'f32',
- 'count': 3,
- 'VALIDITY': [],
- 'DATA': [-0.3, -0.2, -0.1]
- },
- {
- 'name': 'i32',
- 'count': 3,
- 'VALIDITY': [],
- 'DATA': [-1, 1, -1]
- },
- {
- 'name': 'dictionary',
- 'count': 3,
- 'VALIDITY': [],
- 'DATA': [0, 1, 2]
- }
- ]
- }, {
- 'count': 3,
- 'columns': [
- {
- 'name': 'f32',
- 'count': 3,
- 'VALIDITY': [],
- 'DATA': [0, 0.1, 0.2]
- },
- {
- 'name': 'i32',
- 'count': 3,
- 'VALIDITY': [],
- 'DATA': [1, -1, 1]
- },
- {
- 'name': 'dictionary',
- 'count': 3,
- 'VALIDITY': [],
- 'DATA': [0, 1, 2]
- }
- ]
- }, {
- 'count': 3,
- 'columns': [
- {
- 'name': 'f32',
- 'count': 3,
- 'VALIDITY': [],
- 'DATA': [0.3, 0.2, 0.1]
- },
- {
- 'name': 'i32',
- 'count': 3,
- 'VALIDITY': [],
- 'DATA': [-1, 1, -1]
- },
- {
- 'name': 'dictionary',
- 'count': 3,
- 'VALIDITY': [],
- 'DATA': [0, 1, 2]
- }
- ]
- }]
- });
+ const b1 = Arrow.RecordBatch.from(getTestVectors(
+ [-0.3, -0.2, -0.1],
+ [-1, 1, -1],
+ [0, 1, 2]
+ ), NAMES);
+
+ const b2 = Arrow.RecordBatch.from(getTestVectors(
+ [0, 0.1, 0.2],
+ [1, -1, 1],
+ [0, 1, 2]
+ ), NAMES);
+
+ const b3 = Arrow.RecordBatch.from(getTestVectors(
+ [0.3, 0.2, 0.1],
+ [-1, 1, -1],
+ [0, 1, 2]
+ ), NAMES);
+
+ return new Table([b1, b2, b3])
}
function getStructTable() {
- return Table.from({
- 'schema': {
- 'fields': [
- {
- 'name': 'struct',
- 'type': {
- 'name': 'struct'
- },
- 'nullable': false,
- 'children': [
- {
- 'name': 'f32',
- 'type': {
- 'name': 'floatingpoint',
- 'precision': 'SINGLE'
- },
- 'nullable': false,
- 'children': [],
- },
- {
- 'name': 'i32',
- 'type': {
- 'name': 'int',
- 'isSigned': true,
- 'bitWidth': 32
- },
- 'nullable': false,
- 'children': [],
- },
- {
- 'name': 'dictionary',
- 'type': {
- 'name': 'utf8'
- },
- 'nullable': false,
- 'children': [],
- 'dictionary': {
- 'id': 0,
- 'indexType': {
- 'name': 'int',
- 'isSigned': true,
- 'bitWidth': 8
- },
- 'isOrdered': false
- }
- }
- ]
- }
- ]
- },
- 'dictionaries': [{
- 'id': 0,
- 'data': {
- 'count': 3,
- 'columns': [
- {
- 'name': 'DICT0',
- 'count': 3,
- 'VALIDITY': [],
- 'OFFSET': [
- 0,
- 1,
- 2,
- 3
- ],
- 'DATA': [
- 'a',
- 'b',
- 'c',
- ]
- }
- ]
- }
- }],
- 'batches': [{
- 'count': 7,
- 'columns': [
- {
- 'name': 'struct',
- 'count': 7,
- 'VALIDITY': [],
- 'children': [
- {
- 'name': 'f32',
- 'count': 7,
- 'VALIDITY': [],
- 'DATA': [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3]
- },
- {
- 'name': 'i32',
- 'count': 7,
- 'VALIDITY': [],
- 'DATA': [-1, 1, -1, 1, -1, 1, -1]
- },
- {
- 'name': 'dictionary',
- 'count': 7,
- 'VALIDITY': [],
- 'DATA': [0, 1, 2, 0, 1, 2, 0]
- }
- ]
- }
- ]
- }]
- });
+ const structVec = getSingleRecordBatchTable().batchesUnion
+ return Table.fromVectors([structVec], ['struct'])
}