You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by do...@apache.org on 2022/04/19 01:14:22 UTC
[arrow] branch master updated: ARROW-16210: [JS] Implement tableFromJSON and support struct vector in vectorFromArray
This is an automated email from the ASF dual-hosted git repository.
domoritz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new d24e4be501 ARROW-16210: [JS] Implement tableFromJSON and support struct vector in vectorFromArray
d24e4be501 is described below
commit d24e4be501530d69d727b2586ee20d0c76ed4f43
Author: Dominik Moritz <do...@gmail.com>
AuthorDate: Mon Apr 18 21:14:12 2022 -0400
ARROW-16210: [JS] Implement tableFromJSON and support struct vector in vectorFromArray
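tableFromJSON builds a struct vector from the input objects with vectorFromArray and then wraps that vector's data in a RecordBatch to make a Table.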
Closes #12908 from domoritz/dom/tableFromJSON
Authored-by: Dominik Moritz <do...@gmail.com>
Signed-off-by: Dominik Moritz <do...@gmail.com>
---
js/src/Arrow.dom.ts | 2 +-
js/src/Arrow.ts | 2 +-
js/src/factories.ts | 41 ++++++++++++++++++++++++++-----
js/src/interfaces.ts | 5 ++++
js/test/unit/table/table-test.ts | 27 +++++++++++++++++++-
js/test/unit/vector/vector-tests.ts | 49 +++++++++++++++++++++++++++++++++++--
6 files changed, 115 insertions(+), 11 deletions(-)
diff --git a/js/src/Arrow.dom.ts b/js/src/Arrow.dom.ts
index d2c44cfe44..2fdef60c1f 100644
--- a/js/src/Arrow.dom.ts
+++ b/js/src/Arrow.dom.ts
@@ -64,7 +64,7 @@ export {
Table, makeTable, tableFromArrays,
Schema, Field,
Visitor,
- Vector, makeVector, vectorFromArray,
+ Vector, makeVector, vectorFromArray, tableFromJSON,
ByteStream, AsyncByteStream, AsyncByteQueue,
RecordBatchReader, RecordBatchFileReader, RecordBatchStreamReader, AsyncRecordBatchFileReader, AsyncRecordBatchStreamReader,
RecordBatchWriter, RecordBatchFileWriter, RecordBatchStreamWriter, RecordBatchJSONWriter,
diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts
index 4f41ede107..dc44e10b92 100644
--- a/js/src/Arrow.ts
+++ b/js/src/Arrow.ts
@@ -62,7 +62,7 @@ export { StructRow } from './row/struct.js';
export type { StructRowProxy } from './row/struct.js';
export { Builder } from './builder.js';
-export { makeBuilder, vectorFromArray, builderThroughIterable, builderThroughAsyncIterable } from './factories.js';
+export { makeBuilder, vectorFromArray, tableFromJSON, builderThroughIterable, builderThroughAsyncIterable } from './factories.js';
export type { BuilderOptions } from './builder.js';
export { BoolBuilder } from './builder/bool.js';
export { NullBuilder } from './builder/null.js';
diff --git a/js/src/factories.ts b/js/src/factories.ts
index 6032339e8f..6f06efba6c 100644
--- a/js/src/factories.ts
+++ b/js/src/factories.ts
@@ -15,14 +15,17 @@
// specific language governing permissions and limitations
// under the License.
-import { Field } from './schema.js';
+import { Field, Schema } from './schema.js';
import * as dtypes from './type.js';
import { Data, DataProps } from './data.js';
-import { BuilderType } from './interfaces.js';
+import { BuilderType, JavaScriptDataType } from './interfaces.js';
import { Vector, makeVector } from './vector.js';
import { Builder, BuilderOptions } from './builder.js';
import { instance as getBuilderConstructor } from './visitor/builderctor.js';
import { ArrayDataType, BigIntArray, JavaScriptArrayDataType, TypedArray, TypedArrayDataType } from './interfaces.js';
+import { Table } from './table.js';
+import { RecordBatch } from './recordbatch.js';
+import { compareTypes } from './visitor/typecomparator.js';
export function makeBuilder<T extends dtypes.DataType = any, TNull = any>(options: BuilderOptions<T, TNull>): BuilderType<T, TNull> {
@@ -56,6 +59,7 @@ export function makeBuilder<T extends dtypes.DataType = any, TNull = any>(option
* const vf64 = vectorFromArray([1, 2, 3]);
* const vi8 = vectorFromArray([1, 2, 3], new Int8);
* const vdict = vectorFromArray(['foo', 'bar']);
+ * const vstruct = vectorFromArray([{a: 'foo', b: 42}, {a: 'bar', b: 12}]);
* ```
*/
export function vectorFromArray(values: readonly (null | undefined)[], type?: dtypes.Null): Vector<dtypes.Null>;
@@ -88,13 +92,23 @@ export function vectorFromArray(init: any, type?: dtypes.DataType) {
return vector;
}
+/**
+ * Creates a {@link Table} from an array of objects.
+ *
+ * @param array An array of objects, one object per row of the resulting table.
+ */
+export function tableFromJSON<T extends Record<string, unknown>>(array: T[]): Table<{ [P in keyof T]: JavaScriptDataType<T[P]> }> {
+ const vector = vectorFromArray(array) as Vector<dtypes.Struct<any>>;
+ const batch = new RecordBatch(new Schema(vector.type.children), vector.data[0]);
+ return new Table(batch);
+}
+
/** @ignore */
+function inferType<T extends readonly unknown[]>(values: T): JavaScriptArrayDataType<T>;
function inferType(value: readonly unknown[]): dtypes.DataType {
if (value.length === 0) { return new dtypes.Null; }
let nullsCount = 0;
- // @ts-ignore
let arraysCount = 0;
- // @ts-ignore
let objectsCount = 0;
let numbersCount = 0;
let stringsCount = 0;
@@ -132,8 +146,24 @@ function inferType(value: readonly unknown[]): dtypes.DataType {
return new dtypes.Bool;
} else if (datesCount + nullsCount === value.length) {
return new dtypes.DateMillisecond;
+ } else if (arraysCount + nullsCount === value.length) {
+ const array = value as Array<unknown>[];
+ const childType = inferType(array[array.findIndex((ary) => ary != null)]);
+ if (array.every((ary) => ary == null || compareTypes(childType, inferType(ary)))) {
+ return new dtypes.List(new Field('', childType, true));
+ }
+ } else if (objectsCount + nullsCount === value.length) {
+ const fields = new Map<string, Field>();
+ for (const row of value as Record<string, unknown>[]) {
+ for (const key of Object.keys(row)) {
+ if (!fields.has(key) && row[key] != null) {
+ // use the type inferred for the first instance of a found key
+ fields.set(key, new Field(key, inferType([row[key]]), true));
+ }
+ }
+ }
+ return new dtypes.Struct([...fields.values()]);
}
- // TODO: add more types to infererence
throw new TypeError('Unable to infer Vector type from input values, explicit type declaration expected');
}
@@ -143,7 +173,6 @@ function inferType(value: readonly unknown[]): dtypes.DataType {
* @see {@link builderThroughIterable}
* @see {@link builderThroughAsyncIterable}
*/
-
export interface IterableBuilderOptions<T extends dtypes.DataType = any, TNull = any> extends BuilderOptions<T, TNull> {
highWaterMark?: number;
queueingStrategy?: 'bytes' | 'count';
diff --git a/js/src/interfaces.ts b/js/src/interfaces.ts
index 3f85e16e03..8d61295919 100644
--- a/js/src/interfaces.ts
+++ b/js/src/interfaces.ts
@@ -160,6 +160,9 @@ export type TypedArrayDataType<T extends TypedArray | BigIntArray> =
T extends Float64Array ? type.Float64 :
never;
+/** @ignore */
+export type JavaScriptDataType<T> = JavaScriptArrayDataType<T[]>;
+
/** @ignore */
export type JavaScriptArrayDataType<T extends readonly unknown[]> =
T extends readonly (null | undefined)[] ? type.Null :
@@ -168,6 +171,8 @@ export type JavaScriptArrayDataType<T extends readonly unknown[]> =
T extends readonly (null | undefined | Date)[] ? type.Date_ :
T extends readonly (null | undefined | bigint)[] ? type.Int64 :
T extends readonly (null | undefined | number)[] ? type.Float64 :
+ T extends readonly (null | undefined | readonly (infer U)[])[] ? type.List<JavaScriptDataType<U>> :
+ T extends readonly (null | undefined | Record<string, unknown>)[] ? T extends readonly (null | undefined | infer U)[] ? type.Struct<{ [P in keyof U]: JavaScriptDataType<U[P]> }> : never :
never;
/** @ignore */
diff --git a/js/test/unit/table/table-test.ts b/js/test/unit/table/table-test.ts
index e055da3787..01af409098 100644
--- a/js/test/unit/table/table-test.ts
+++ b/js/test/unit/table/table-test.ts
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-import { Int32, Float32, Float64, Int8, makeTable, tableFromArrays, Dictionary } from 'apache-arrow';
+import { Bool, Dictionary, Float32, Float64, Int32, Int8, makeTable, tableFromArrays, tableFromJSON } from 'apache-arrow';
describe('makeTable()', () => {
test(`creates a new Table from Typed Arrays`, () => {
@@ -47,6 +47,9 @@ describe('tableFromArrays()', () => {
d: ['foo', 'bar'],
});
+ expect(table.numRows).toBe(3);
+ expect(table.numCols).toBe(4);
+
expect(table.getChild('a')!.type).toBeInstanceOf(Float32);
expect(table.getChild('b')!.type).toBeInstanceOf(Int8);
expect(table.getChild('c')!.type).toBeInstanceOf(Float64);
@@ -54,3 +57,25 @@ describe('tableFromArrays()', () => {
expect(table.getChild('e' as any)).toBeNull();
});
});
+
+
+describe('tableFromJSON()', () => {
+ test(`creates table from array of objects`, () => {
+ const table = tableFromJSON([{
+ a: 42,
+ b: true,
+ c: 'foo',
+ }, {
+ a: 12,
+ b: false,
+ c: 'bar',
+ }]);
+
+ expect(table.numRows).toBe(2);
+ expect(table.numCols).toBe(3);
+
+ expect(table.getChild('a')!.type).toBeInstanceOf(Float64);
+ expect(table.getChild('b')!.type).toBeInstanceOf(Bool);
+ expect(table.getChild('c')!.type).toBeInstanceOf(Dictionary);
+ });
+});
diff --git a/js/test/unit/vector/vector-tests.ts b/js/test/unit/vector/vector-tests.ts
index f76a521ee8..9ad3ca85b0 100644
--- a/js/test/unit/vector/vector-tests.ts
+++ b/js/test/unit/vector/vector-tests.ts
@@ -16,7 +16,7 @@
// under the License.
import {
- DateDay, DateMillisecond, Dictionary, Field, Int32, List, makeVector, Utf8, util, Vector, vectorFromArray
+ Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Utf8, util, Vector, vectorFromArray
} from 'apache-arrow';
describe(`makeVectorFromArray`, () => {
@@ -33,6 +33,51 @@ describe(`makeVectorFromArray`, () => {
});
});
+describe(`StructVector`, () => {
+ test(`makeVectorFromArray`, () => {
+ const values: { a?: number; b?: string | null; c?: boolean | null }[] = [
+ { a: 1, b: null },
+ { a: 4, b: 'foo', c: null },
+ { a: 7, b: 'bar', c: true },
+ { a: 10, b: 'baz', c: true },
+ ];
+ const vector = vectorFromArray(values);
+
+ expect(vector.numChildren).toBe(3);
+ expect(vector).toHaveLength(4);
+ expect(vector.type.children[0].type).toBeInstanceOf(Float64);
+ expect(vector.type.children[1].type).toBeInstanceOf(Dictionary);
+ expect(vector.type.children[2].type).toBeInstanceOf(Bool);
+ });
+
+
+ const values: { a?: number; b?: string; c?: boolean }[] = [
+ { a: 1, b: 'foo', c: true },
+ { a: 4, b: 'foo', c: false },
+ { a: 7, b: 'bar', c: true },
+ { a: 10, b: 'baz', c: true },
+ ];
+ const vector = vectorFromArray(values);
+
+ test(`has list struct`, () => {
+ expect(vector.type).toBeInstanceOf(Struct);
+
+ expect(vector.type.children[0].type).toBeInstanceOf(Float64);
+ expect(vector.type.children[1].type).toBeInstanceOf(Dictionary);
+ expect(vector.type.children[2].type).toBeInstanceOf(Bool);
+
+ expect(vector.type.children[0].nullable).toBeTruthy();
+ expect(vector.type.children[1].nullable).toBeTruthy();
+ expect(vector.type.children[2].nullable).toBeTruthy();
+ });
+
+ test(`get value`, () => {
+ for (const [i, value] of values.entries()) {
+ expect(vector.get(i)!.toJSON()).toEqual(value);
+ }
+ });
+});
+
describe(`DateVector`, () => {
const extras = [
new Date(2000, 0, 1),
@@ -153,7 +198,7 @@ describe(`Utf8Vector`, () => {
describe(`ListVector`, () => {
const values = [[1, 2], [1, 2, 3]];
- const vector = vectorFromArray(values, new List(Field.new({ name: 'field', type: new Int32 })));
+ const vector = vectorFromArray(values);
test(`has list type`, () => {
expect(vector.type).toBeInstanceOf(List);