You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by bh...@apache.org on 2018/12/04 03:12:46 UTC

[arrow] branch master updated: ARROW-2909: [JS] Add convenience function for creating a table from a list of vectors

This is an automated email from the ASF dual-hosted git repository.

bhulette pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 0729cb7  ARROW-2909: [JS] Add convenience function for creating a table from a list of vectors
0729cb7 is described below

commit 0729cb771bd51f60423b52d44a50bddc45653d90
Author: Brian Hulette <hu...@gmail.com>
AuthorDate: Mon Dec 3 19:12:34 2018 -0800

    ARROW-2909: [JS] Add convenience function for creating a table from a list of vectors
    
    Simplifies the creation of a `Table` from JS Arrays:
    
    ```js
    const LENGTH = 20000;
    
    const idVec = Arrow.vector.IntVector.from(
      Uint32Array.from({length: LENGTH}, () => Math.round(Math.random() * 2E9))
    );
    const latVec = Arrow.vector.FloatVector.from(
      Float32Array.from({length: LENGTH}, () => Number((Math.random() * 180 - 90).toFixed(1)))
    );
    const lngVec = Arrow.vector.FloatVector.from(
      Float32Array.from({length: LENGTH}, () => Number((Math.random() * 360 - 180).toFixed(1)))
    );
    
    const table = Arrow.Table.fromVectors([idVec, latVec, lngVec], ['id', 'lat', 'lng'])
    
    console.log(table.schema.fields.map((f) => f.name));
    // [ 'id', 'lat', 'lng' ]
    
    console.log(table.schema.fields.map((f) => f.type));
    // [ Uint32 [Int] { TType: 2, children: undefined, isSigned: false, bitWidth: 32 },
    //   Float32 [Float] { TType: 3, children: undefined, precision: 1 },
    //   Float32 [Float] { TType: 3, children: undefined, precision: 1 } ]
    ```
    
    Author: Brian Hulette <hu...@gmail.com>
    
    Closes #2322 from TheNeuralBit/from-vectors and squashes the following commits:
    
    12b4c286 <Brian Hulette> Update "Table from JS Arrays" example
    cfc4948a <Brian Hulette> Use Table.fromVectors in table-tests
    f6551a3e <Brian Hulette> Add convenience function for constructing a Table from a list of Vectors
---
 js/README.md                |  26 +--
 js/src/Arrow.externs.js     |   2 +
 js/src/Arrow.ts             |   1 +
 js/src/recordbatch.ts       |   4 +-
 js/src/table.ts             |   7 +
 js/src/type.ts              |   4 +-
 js/test/unit/table-tests.ts | 392 ++++++++------------------------------------
 7 files changed, 86 insertions(+), 350 deletions(-)

diff --git a/js/README.md b/js/README.md
index e048ba1..15d7ed0 100644
--- a/js/README.md
+++ b/js/README.md
@@ -94,26 +94,14 @@ console.log(table.toString());
 ### Create a Table from JavaScript arrays
 
 ```es6
-const fields = [{
-        name: 'precipitation',
-        type: { name: 'floatingpoint', precision: 'SINGLE'},
-        nullable: false, children: []
-    }, {
-        name: 'date',
-        type: { name: 'date', unit: 'MILLISECOND' },
-        nullable: false, children: []
-    }];
-const rainAmounts = Array.from({length: LENGTH}, () => Number((Math.random() * 20).toFixed(1)));
-const rainDates = Array.from({length: LENGTH}, (_, i) => Date.now() - 1000 * 60 * 60 * 24 * i);
-
 const LENGTH = 2000;
-const rainfall = arrow.Table.from({
-  schema: { fields: fields },
-  batches: [{
-    count: LENGTH,
-    columns: [
-      {name: "precipitation", count: LENGTH, VALIDITY: [], DATA: rainAmounts },
-      {name: "date",          count: LENGTH, VALIDITY: [], DATA: rainDates } ] }] })
+const rainAmounts = Float32Array.from({length: LENGTH}, () => Number((Math.random() * 20).toFixed(1)));
+const rainDates = Array.from({length: LENGTH}, (_, i) => new Date(Date.now() - 1000 * 60 * 60 * 24 * i));
+
+const rainfall = arrow.Table.fromVectors(
+  [FloatVector.from(rainAmounts), DateVector.from(rainDates)],
+  ['precipitation', 'date']
+);
 ```
 
 ### Load data with `fetch`
diff --git a/js/src/Arrow.externs.js b/js/src/Arrow.externs.js
index f01ea5c..7ad0665 100644
--- a/js/src/Arrow.externs.js
+++ b/js/src/Arrow.externs.js
@@ -32,6 +32,8 @@ var Table = function() {};
 /** @type {?} */
 Table.from = function() {};
 /** @type {?} */
+Table.fromVectors = function() {};
+/** @type {?} */
 Table.fromAsync = function() {};
 /** @type {?} */
 Table.fromStruct = function() {};
diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts
index b1f4a3a..c76578b 100644
--- a/js/src/Arrow.ts
+++ b/js/src/Arrow.ts
@@ -238,6 +238,7 @@ try {
 // set them via string indexers to save them from the mangler
 Schema['from'] = Schema.from;
 Table['from'] = Table.from;
+Table['fromVectors'] = Table.fromVectors;
 Table['fromAsync'] = Table.fromAsync;
 Table['fromStruct'] = Table.fromStruct;
 Table['empty'] = Table.empty;
diff --git a/js/src/recordbatch.ts b/js/src/recordbatch.ts
index 91ea5cf..cfc236d 100644
--- a/js/src/recordbatch.ts
+++ b/js/src/recordbatch.ts
@@ -25,8 +25,8 @@ import { valueToString, leftPad } from './util/pretty';
 import Long = flatbuffers.Long;
 
 export class RecordBatch<T extends StructData = StructData> extends StructVector<T> {
-    public static from<R extends StructData = StructData>(vectors: Vector[]) {
-        return new RecordBatch<R>(Schema.from(vectors),
+    public static from<R extends StructData = StructData>(vectors: Vector[], names?: string[]) {
+      return new RecordBatch<R>(Schema.from(vectors, names),
             Math.max(...vectors.map((v) => v.length)),
             vectors
         );
diff --git a/js/src/table.ts b/js/src/table.ts
index 634092f..3559cd8 100644
--- a/js/src/table.ts
+++ b/js/src/table.ts
@@ -38,6 +38,9 @@ export interface DataFrame<T extends StructData = StructData> {
 
 export class Table<T extends StructData = StructData> implements DataFrame {
     static empty<R extends StructData = StructData>() { return new Table<R>(new Schema([]), []); }
+    static fromVectors<R extends StructData = StructData>(vectors: Vector[], names?: string[]) {
+       return new Table<R>([RecordBatch.from<R>(vectors, names)])
+    }
     static from<R extends StructData = StructData>(sources?: Iterable<Uint8Array | Buffer | string> | object | string) {
         if (sources) {
             let schema: Schema | undefined;
@@ -199,6 +202,10 @@ export class Table<T extends StructData = StructData> implements DataFrame {
     }
 }
 
+// protect batches, batchesUnion from es2015/umd mangler
+(<any> Table.prototype).batches = Object.freeze([]);
+(<any> Table.prototype).batchesUnion = Object.freeze([]);
+
 class FilteredDataFrame<T extends StructData = StructData> implements DataFrame<T> {
     private predicate: Predicate;
     private batches: RecordBatch<T>[];
diff --git a/js/src/type.ts b/js/src/type.ts
index 811086c..3f75903 100644
--- a/js/src/type.ts
+++ b/js/src/type.ts
@@ -47,8 +47,8 @@ function generateDictionaryMap(fields: Field[]) {
 }
 
 export class Schema {
-    public static from(vectors: Vector[]) {
-        return new Schema(vectors.map((v, i) => new Field('' + i, v.type)));
+    public static from(vectors: Vector[], names?: string[]) {
+        return new Schema(vectors.map((v, i) => new Field(names ? names[i] : ('' + i), v.type)));
     }
     // @ts-ignore
     protected _bodyLength: number;
diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts
index 4ee1411..3a90a0d 100644
--- a/js/test/unit/table-tests.ts
+++ b/js/test/unit/table-tests.ts
@@ -16,13 +16,20 @@
 // under the License.
 
 import '../jest-extensions';
+import { TextEncoder } from 'text-encoding-utf-8';
 
 import Arrow, { vector, RecordBatch } from '../Arrow';
 
 const { predicate, Table } = Arrow;
 
+const { DictionaryVector, IntVector, FloatVector, Utf8Vector } = Arrow.vector;
+const { Dictionary, Utf8, Int } = Arrow.type;
+
 const { col, lit, custom, and, or, And, Or } = predicate;
 
+const utf8Encoder = new TextEncoder('utf-8');
+
+const NAMES = ['f32', 'i32', 'dictionary'];
 const F32 = 0, I32 = 1, DICT = 2;
 const test_data = [
     {
@@ -336,338 +343,69 @@ function leftPad(str: string, fill: string, n: number) {
     return (new Array(n + 1).join(fill) + str).slice(-1 * n);
 }
 
+function makeUtf8Vector(values) {
+    const n = values.length;
+    let offset = 0;
+    const offsets = Uint32Array.of(0, ...values.map((d) => { offset += d.length; return offset; }));
+    return new Utf8Vector(new Arrow.data.FlatListData(new Utf8(), n, null, offsets, utf8Encoder.encode(values.join(''))));
+}
+
+function getTestVectors(f32Values, i32Values, dictionaryValues) {
+    const f32Vec = FloatVector.from(
+        Float32Array.from(f32Values)
+    );
+
+    const i32Vec = IntVector.from(
+        Int32Array.from(i32Values)
+    );
+
+    const dictionaryVec = new DictionaryVector(
+        new Arrow.data.DictionaryData(
+            new Dictionary(new Utf8(), new Int(true, 8)),
+            makeUtf8Vector(['a', 'b', 'c']),
+            IntVector.from(Int8Array.from(dictionaryValues)).data
+        )
+    );
+
+    return [f32Vec, i32Vec, dictionaryVec];
+}
+
 export function getSingleRecordBatchTable() {
-    return Table.from({
-        'schema': {
-            'fields': [
-                {
-                    'name': 'f32',
-                    'type': {
-                        'name': 'floatingpoint',
-                        'precision': 'SINGLE'
-                    },
-                    'nullable': false,
-                    'children': [],
-                },
-                {
-                    'name': 'i32',
-                    'type': {
-                        'name': 'int',
-                        'isSigned': true,
-                        'bitWidth': 32
-                    },
-                    'nullable': false,
-                    'children': [],
-                },
-                {
-                    'name': 'dictionary',
-                    'type': {
-                        'name': 'utf8'
-                    },
-                    'nullable': false,
-                    'children': [],
-                    'dictionary': {
-                        'id': 0,
-                        'indexType': {
-                            'name': 'int',
-                            'isSigned': true,
-                            'bitWidth': 8
-                        },
-                        'isOrdered': false
-                    }
-                }
-            ]
-        },
-        'dictionaries': [{
-            'id': 0,
-            'data': {
-                'count': 3,
-                'columns': [
-                    {
-                        'name': 'DICT0',
-                        'count': 3,
-                        'VALIDITY': [],
-                        'OFFSET': [
-                            0,
-                            1,
-                            2,
-                            3
-                        ],
-                        'DATA': [
-                            'a',
-                            'b',
-                            'c',
-                        ]
-                    }
-                ]
-            }
-        }],
-        'batches': [{
-            'count': 7,
-            'columns': [
-                {
-                    'name': 'f32',
-                    'count': 7,
-                    'VALIDITY': [],
-                    'DATA': [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3]
-                },
-                {
-                    'name': 'i32',
-                    'count': 7,
-                    'VALIDITY': [],
-                    'DATA': [-1, 1, -1, 1, -1, 1, -1]
-                },
-                {
-                    'name': 'dictionary',
-                    'count': 7,
-                    'VALIDITY': [],
-                    'DATA': [0, 1, 2, 0, 1, 2, 0]
-                }
-            ]
-        }]
-    });
+    const vectors = getTestVectors(
+        [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3],
+        [-1, 1, -1, 1, -1, 1, -1],
+        [0, 1, 2, 0, 1, 2, 0]
+    );
+
+    return Table.fromVectors(
+        vectors,
+        NAMES
+    );
 }
 
 function getMultipleRecordBatchesTable() {
-    return Table.from({
-        'schema': {
-            'fields': [
-                {
-                    'name': 'f32',
-                    'type': {
-                        'name': 'floatingpoint',
-                        'precision': 'SINGLE'
-                    },
-                    'nullable': false,
-                    'children': [],
-                },
-                {
-                    'name': 'i32',
-                    'type': {
-                        'name': 'int',
-                        'isSigned': true,
-                        'bitWidth': 32
-                    },
-                    'nullable': false,
-                    'children': [],
-                },
-                {
-                    'name': 'dictionary',
-                    'type': {
-                        'name': 'utf8'
-                    },
-                    'nullable': false,
-                    'children': [],
-                    'dictionary': {
-                        'id': 0,
-                        'indexType': {
-                            'name': 'int',
-                            'isSigned': true,
-                            'bitWidth': 8
-                        },
-                        'isOrdered': false
-                    }
-                }
-            ]
-        },
-        'dictionaries': [{
-            'id': 0,
-            'data': {
-                'count': 3,
-                'columns': [
-                    {
-                        'name': 'DICT0',
-                        'count': 3,
-                        'VALIDITY': [],
-                        'OFFSET': [
-                            0,
-                            1,
-                            2,
-                            3
-                        ],
-                        'DATA': [
-                            'a',
-                            'b',
-                            'c',
-                        ]
-                    }
-                ]
-            }
-        }],
-        'batches': [{
-            'count': 3,
-            'columns': [
-                {
-                    'name': 'f32',
-                    'count': 3,
-                    'VALIDITY': [],
-                    'DATA': [-0.3, -0.2, -0.1]
-                },
-                {
-                    'name': 'i32',
-                    'count': 3,
-                    'VALIDITY': [],
-                    'DATA': [-1, 1, -1]
-                },
-                {
-                    'name': 'dictionary',
-                    'count': 3,
-                    'VALIDITY': [],
-                    'DATA': [0, 1, 2]
-                }
-            ]
-        }, {
-            'count': 3,
-            'columns': [
-                {
-                    'name': 'f32',
-                    'count': 3,
-                    'VALIDITY': [],
-                    'DATA': [0, 0.1, 0.2]
-                },
-                {
-                    'name': 'i32',
-                    'count': 3,
-                    'VALIDITY': [],
-                    'DATA': [1, -1, 1]
-                },
-                {
-                    'name': 'dictionary',
-                    'count': 3,
-                    'VALIDITY': [],
-                    'DATA': [0, 1, 2]
-                }
-            ]
-        }, {
-            'count': 3,
-            'columns': [
-                {
-                    'name': 'f32',
-                    'count': 3,
-                    'VALIDITY': [],
-                    'DATA': [0.3, 0.2, 0.1]
-                },
-                {
-                    'name': 'i32',
-                    'count': 3,
-                    'VALIDITY': [],
-                    'DATA': [-1, 1, -1]
-                },
-                {
-                    'name': 'dictionary',
-                    'count': 3,
-                    'VALIDITY': [],
-                    'DATA': [0, 1, 2]
-                }
-            ]
-        }]
-    });
+    const b1 = Arrow.RecordBatch.from(getTestVectors(
+        [-0.3, -0.2, -0.1],
+        [-1, 1, -1],
+        [0, 1, 2]
+    ), NAMES);
+
+    const b2 = Arrow.RecordBatch.from(getTestVectors(
+        [0, 0.1, 0.2],
+        [1, -1, 1],
+        [0, 1, 2]
+    ), NAMES);
+
+    const b3 = Arrow.RecordBatch.from(getTestVectors(
+        [0.3, 0.2, 0.1],
+        [-1, 1, -1],
+        [0, 1, 2]
+    ), NAMES);
+
+    return new Table([b1, b2, b3])
 }
 
 function getStructTable() {
-    return Table.from({
-        'schema': {
-            'fields': [
-                {
-                    'name': 'struct',
-                    'type': {
-                        'name': 'struct'
-                    },
-                    'nullable': false,
-                    'children': [
-                        {
-                            'name': 'f32',
-                            'type': {
-                                'name': 'floatingpoint',
-                                'precision': 'SINGLE'
-                            },
-                            'nullable': false,
-                            'children': [],
-                        },
-                        {
-                            'name': 'i32',
-                            'type': {
-                                'name': 'int',
-                                'isSigned': true,
-                                'bitWidth': 32
-                            },
-                            'nullable': false,
-                            'children': [],
-                        },
-                        {
-                            'name': 'dictionary',
-                            'type': {
-                                'name': 'utf8'
-                            },
-                            'nullable': false,
-                            'children': [],
-                            'dictionary': {
-                                'id': 0,
-                                'indexType': {
-                                    'name': 'int',
-                                    'isSigned': true,
-                                    'bitWidth': 8
-                                },
-                                'isOrdered': false
-                            }
-                        }
-                    ]
-                }
-            ]
-        },
-        'dictionaries': [{
-            'id': 0,
-            'data': {
-                'count': 3,
-                'columns': [
-                    {
-                        'name': 'DICT0',
-                        'count': 3,
-                        'VALIDITY': [],
-                        'OFFSET': [
-                            0,
-                            1,
-                            2,
-                            3
-                        ],
-                        'DATA': [
-                            'a',
-                            'b',
-                            'c',
-                        ]
-                    }
-                ]
-            }
-        }],
-        'batches': [{
-            'count': 7,
-            'columns': [
-                {
-                    'name': 'struct',
-                    'count': 7,
-                    'VALIDITY': [],
-                    'children': [
-                        {
-                            'name': 'f32',
-                            'count': 7,
-                            'VALIDITY': [],
-                            'DATA': [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3]
-                        },
-                        {
-                            'name': 'i32',
-                            'count': 7,
-                            'VALIDITY': [],
-                            'DATA': [-1, 1, -1, 1, -1, 1, -1]
-                        },
-                        {
-                            'name': 'dictionary',
-                            'count': 7,
-                            'VALIDITY': [],
-                            'DATA': [0, 1, 2, 0, 1, 2, 0]
-                        }
-                    ]
-                }
-            ]
-        }]
-    });
+    const structVec = getSingleRecordBatchTable().batchesUnion
+    return Table.fromVectors([structVec], ['struct'])
 }