You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by do...@apache.org on 2022/04/19 01:14:22 UTC

[arrow] branch master updated: ARROW-16210: [JS] Implement tableFromJSON and support struct vector in vectorFromArray

This is an automated email from the ASF dual-hosted git repository.

domoritz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new d24e4be501 ARROW-16210: [JS] Implement tableFromJSON and support struct vector in vectorFromArray
d24e4be501 is described below

commit d24e4be501530d69d727b2586ee20d0c76ed4f43
Author: Dominik Moritz <do...@gmail.com>
AuthorDate: Mon Apr 18 21:14:12 2022 -0400

    ARROW-16210: [JS] Implement tableFromJSON and support struct vector in vectorFromArray
    
    Uses vector construction to then make a table.
    
    Closes #12908 from domoritz/dom/tableFromJSON
    
    Authored-by: Dominik Moritz <do...@gmail.com>
    Signed-off-by: Dominik Moritz <do...@gmail.com>
---
 js/src/Arrow.dom.ts                 |  2 +-
 js/src/Arrow.ts                     |  2 +-
 js/src/factories.ts                 | 41 ++++++++++++++++++++++++++-----
 js/src/interfaces.ts                |  5 ++++
 js/test/unit/table/table-test.ts    | 27 +++++++++++++++++++-
 js/test/unit/vector/vector-tests.ts | 49 +++++++++++++++++++++++++++++++++++--
 6 files changed, 115 insertions(+), 11 deletions(-)

diff --git a/js/src/Arrow.dom.ts b/js/src/Arrow.dom.ts
index d2c44cfe44..2fdef60c1f 100644
--- a/js/src/Arrow.dom.ts
+++ b/js/src/Arrow.dom.ts
@@ -64,7 +64,7 @@ export {
     Table, makeTable, tableFromArrays,
     Schema, Field,
     Visitor,
-    Vector, makeVector, vectorFromArray,
+    Vector, makeVector, vectorFromArray, tableFromJSON,
     ByteStream, AsyncByteStream, AsyncByteQueue,
     RecordBatchReader, RecordBatchFileReader, RecordBatchStreamReader, AsyncRecordBatchFileReader, AsyncRecordBatchStreamReader,
     RecordBatchWriter, RecordBatchFileWriter, RecordBatchStreamWriter, RecordBatchJSONWriter,
diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts
index 4f41ede107..dc44e10b92 100644
--- a/js/src/Arrow.ts
+++ b/js/src/Arrow.ts
@@ -62,7 +62,7 @@ export { StructRow } from './row/struct.js';
 export type { StructRowProxy } from './row/struct.js';
 
 export { Builder } from './builder.js';
-export { makeBuilder, vectorFromArray, builderThroughIterable, builderThroughAsyncIterable } from './factories.js';
+export { makeBuilder, vectorFromArray, tableFromJSON, builderThroughIterable, builderThroughAsyncIterable } from './factories.js';
 export type { BuilderOptions } from './builder.js';
 export { BoolBuilder } from './builder/bool.js';
 export { NullBuilder } from './builder/null.js';
diff --git a/js/src/factories.ts b/js/src/factories.ts
index 6032339e8f..6f06efba6c 100644
--- a/js/src/factories.ts
+++ b/js/src/factories.ts
@@ -15,14 +15,17 @@
 // specific language governing permissions and limitations
 // under the License.
 
-import { Field } from './schema.js';
+import { Field, Schema } from './schema.js';
 import * as dtypes from './type.js';
 import { Data, DataProps } from './data.js';
-import { BuilderType } from './interfaces.js';
+import { BuilderType, JavaScriptDataType } from './interfaces.js';
 import { Vector, makeVector } from './vector.js';
 import { Builder, BuilderOptions } from './builder.js';
 import { instance as getBuilderConstructor } from './visitor/builderctor.js';
 import { ArrayDataType, BigIntArray, JavaScriptArrayDataType, TypedArray, TypedArrayDataType } from './interfaces.js';
+import { Table } from './table.js';
+import { RecordBatch } from './recordbatch.js';
+import { compareTypes } from './visitor/typecomparator.js';
 
 export function makeBuilder<T extends dtypes.DataType = any, TNull = any>(options: BuilderOptions<T, TNull>): BuilderType<T, TNull> {
 
@@ -56,6 +59,7 @@ export function makeBuilder<T extends dtypes.DataType = any, TNull = any>(option
  * const vf64 = vectorFromArray([1, 2, 3]);
  * const vi8 = vectorFromArray([1, 2, 3], new Int8);
  * const vdict = vectorFromArray(['foo', 'bar']);
+ * const vstruct = vectorFromArray([{a: 'foo', b: 42}, {a: 'bar', b: 12}]);
  * ```
  */
 export function vectorFromArray(values: readonly (null | undefined)[], type?: dtypes.Null): Vector<dtypes.Null>;
@@ -88,13 +92,23 @@ export function vectorFromArray(init: any, type?: dtypes.DataType) {
     return vector;
 }
 
+/**
+ * Creates a {@link Table} from an array of objects by first building a struct vector with {@link vectorFromArray}.
+ * The struct's child fields become the table schema; only `vector.data[0]` is wrapped in the single RecordBatch.
+ * @param array An array of objects, one object per row.
+ */
+export function tableFromJSON<T extends Record<string, unknown>>(array: T[]): Table<{ [P in keyof T]: JavaScriptDataType<T[P]> }> {
+    const vector = vectorFromArray(array) as Vector<dtypes.Struct<any>>;
+    const batch = new RecordBatch(new Schema(vector.type.children), vector.data[0]);
+    return new Table(batch);
+}
+
 /** @ignore */
+function inferType<T extends readonly unknown[]>(values: T): JavaScriptArrayDataType<T>;
 function inferType(value: readonly unknown[]): dtypes.DataType {
     if (value.length === 0) { return new dtypes.Null; }
     let nullsCount = 0;
-    // @ts-ignore
     let arraysCount = 0;
-    // @ts-ignore
     let objectsCount = 0;
     let numbersCount = 0;
     let stringsCount = 0;
@@ -132,8 +146,24 @@ function inferType(value: readonly unknown[]): dtypes.DataType {
         return new dtypes.Bool;
     } else if (datesCount + nullsCount === value.length) {
         return new dtypes.DateMillisecond;
+    } else if (arraysCount + nullsCount === value.length) {
+        const array = value as Array<unknown>[];
+        const childType = inferType(array[array.findIndex((ary) => ary != null)]);
+        if (array.every((ary) => ary == null || compareTypes(childType, inferType(ary)))) {
+            return new dtypes.List(new Field('', childType, true));
+        }
+    } else if (objectsCount + nullsCount === value.length) {
+        const fields = new Map<string, Field>();
+        for (const row of value as Record<string, unknown>[]) {
+            for (const key of Object.keys(row)) {
+                if (!fields.has(key) && row[key] != null) {
+                    // use the type inferred from the first non-null value seen for each key
+                    fields.set(key, new Field(key, inferType([row[key]]), true));
+                }
+            }
+        }
+        return new dtypes.Struct([...fields.values()]);
     }
-    // TODO: add more types to infererence
 
     throw new TypeError('Unable to infer Vector type from input values, explicit type declaration expected');
 }
@@ -143,7 +173,6 @@ function inferType(value: readonly unknown[]): dtypes.DataType {
  * @see {@link builderThroughIterable}
  * @see {@link builderThroughAsyncIterable}
  */
-
 export interface IterableBuilderOptions<T extends dtypes.DataType = any, TNull = any> extends BuilderOptions<T, TNull> {
     highWaterMark?: number;
     queueingStrategy?: 'bytes' | 'count';
diff --git a/js/src/interfaces.ts b/js/src/interfaces.ts
index 3f85e16e03..8d61295919 100644
--- a/js/src/interfaces.ts
+++ b/js/src/interfaces.ts
@@ -160,6 +160,9 @@ export type TypedArrayDataType<T extends TypedArray | BigIntArray> =
     T extends Float64Array ? type.Float64 :
     never;
 
+/** @ignore */
+export type JavaScriptDataType<T> = JavaScriptArrayDataType<T[]>;
+
 /** @ignore */
 export type JavaScriptArrayDataType<T extends readonly unknown[]> =
     T extends readonly (null | undefined)[] ? type.Null :
@@ -168,6 +171,8 @@ export type JavaScriptArrayDataType<T extends readonly unknown[]> =
     T extends readonly (null | undefined | Date)[] ? type.Date_ :
     T extends readonly (null | undefined | bigint)[] ? type.Int64 :
     T extends readonly (null | undefined | number)[] ? type.Float64 :
+    T extends readonly (null | undefined | readonly (infer U)[])[] ? type.List<JavaScriptDataType<U>> :
+    T extends readonly (null | undefined | Record<string, unknown>)[] ? T extends readonly (null | undefined | infer U)[] ? type.Struct<{ [P in keyof U]: JavaScriptDataType<U[P]> }> : never :
     never;
 
 /** @ignore */
diff --git a/js/test/unit/table/table-test.ts b/js/test/unit/table/table-test.ts
index e055da3787..01af409098 100644
--- a/js/test/unit/table/table-test.ts
+++ b/js/test/unit/table/table-test.ts
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-import { Int32, Float32, Float64, Int8, makeTable, tableFromArrays, Dictionary } from 'apache-arrow';
+import { Bool, Dictionary, Float32, Float64, Int32, Int8, makeTable, tableFromArrays, tableFromJSON } from 'apache-arrow';
 
 describe('makeTable()', () => {
     test(`creates a new Table from Typed Arrays`, () => {
@@ -47,6 +47,9 @@ describe('tableFromArrays()', () => {
             d: ['foo', 'bar'],
         });
 
+        expect(table.numRows).toBe(3);
+        expect(table.numCols).toBe(4);
+
         expect(table.getChild('a')!.type).toBeInstanceOf(Float32);
         expect(table.getChild('b')!.type).toBeInstanceOf(Int8);
         expect(table.getChild('c')!.type).toBeInstanceOf(Float64);
@@ -54,3 +57,25 @@ describe('tableFromArrays()', () => {
         expect(table.getChild('e' as any)).toBeNull();
     });
 });
+
+
+describe('tableFromJSON()', () => {
+    test(`creates table from array of objects`, () => {
+        const table = tableFromJSON([{
+            a: 42,
+            b: true,
+            c: 'foo',
+        }, {
+            a: 12,
+            b: false,
+            c: 'bar',
+        }]);
+
+        expect(table.numRows).toBe(2);
+        expect(table.numCols).toBe(3);
+
+        expect(table.getChild('a')!.type).toBeInstanceOf(Float64);
+        expect(table.getChild('b')!.type).toBeInstanceOf(Bool);
+        expect(table.getChild('c')!.type).toBeInstanceOf(Dictionary);
+    });
+});
diff --git a/js/test/unit/vector/vector-tests.ts b/js/test/unit/vector/vector-tests.ts
index f76a521ee8..9ad3ca85b0 100644
--- a/js/test/unit/vector/vector-tests.ts
+++ b/js/test/unit/vector/vector-tests.ts
@@ -16,7 +16,7 @@
 // under the License.
 
 import {
-    DateDay, DateMillisecond, Dictionary, Field, Int32, List, makeVector, Utf8, util, Vector, vectorFromArray
+    Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Utf8, util, Vector, vectorFromArray
 } from 'apache-arrow';
 
 describe(`makeVectorFromArray`, () => {
@@ -33,6 +33,51 @@ describe(`makeVectorFromArray`, () => {
     });
 });
 
+describe(`StructVector`, () => {
+    test(`makeVectorFromArray`, () => {
+        const values: { a?: number; b?: string | null; c?: boolean | null }[] = [
+            { a: 1, b: null },
+            { a: 4, b: 'foo', c: null },
+            { a: 7, b: 'bar', c: true },
+            { a: 10, b: 'baz', c: true },
+        ];
+        const vector = vectorFromArray(values);
+
+        expect(vector.numChildren).toBe(3);
+        expect(vector).toHaveLength(4);
+        expect(vector.type.children[0].type).toBeInstanceOf(Float64);
+        expect(vector.type.children[1].type).toBeInstanceOf(Dictionary);
+        expect(vector.type.children[2].type).toBeInstanceOf(Bool);
+    });
+
+
+    const values: { a?: number; b?: string; c?: boolean }[] = [
+        { a: 1, b: 'foo', c: true },
+        { a: 4, b: 'foo', c: false },
+        { a: 7, b: 'bar', c: true },
+        { a: 10, b: 'baz', c: true },
+    ];
+    const vector = vectorFromArray(values);
+
+    test(`has list struct`, () => {
+        expect(vector.type).toBeInstanceOf(Struct);
+
+        expect(vector.type.children[0].type).toBeInstanceOf(Float64);
+        expect(vector.type.children[1].type).toBeInstanceOf(Dictionary);
+        expect(vector.type.children[2].type).toBeInstanceOf(Bool);
+
+        expect(vector.type.children[0].nullable).toBeTruthy();
+        expect(vector.type.children[1].nullable).toBeTruthy();
+        expect(vector.type.children[2].nullable).toBeTruthy();
+    });
+
+    test(`get value`, () => {
+        for (const [i, value] of values.entries()) {
+            expect(vector.get(i)!.toJSON()).toEqual(value);
+        }
+    });
+});
+
 describe(`DateVector`, () => {
     const extras = [
         new Date(2000, 0, 1),
@@ -153,7 +198,7 @@ describe(`Utf8Vector`, () => {
 
 describe(`ListVector`, () => {
     const values = [[1, 2], [1, 2, 3]];
-    const vector = vectorFromArray(values, new List(Field.new({ name: 'field', type: new Int32 })));
+    const vector = vectorFromArray(values);
 
     test(`has list type`, () => {
         expect(vector.type).toBeInstanceOf(List);