You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pt...@apache.org on 2018/09/27 18:09:00 UTC

[arrow] branch master updated: ARROW-3336: [JS] Fix IPC writer serializing sliced arrays

This is an automated email from the ASF dual-hosted git repository.

ptaylor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new b796b57  ARROW-3336: [JS] Fix IPC writer serializing sliced arrays
b796b57 is described below

commit b796b579b8aba472c6690a49f95e8e174d01a4bb
Author: ptaylor <pa...@me.com>
AuthorDate: Thu Sep 27 11:08:39 2018 -0700

    ARROW-3336: [JS] Fix IPC writer serializing sliced arrays
    
    Fixes https://issues.apache.org/jira/browse/ARROW-3336. Realized too late I branched from the https://github.com/apache/arrow/pull/2616, so that's why there's some extra commits at the front.
    
    Check out the relevant tests here: https://github.com/trxcllnt/arrow/blob/860f61046fca0081dbe0ce986a97408ed5934e22/js/test/unit/writer-tests.ts#L26. This test covers the primitive, nested, and dictionary vectors, but it'd be worth adding more (for example, Unions).
    
    Author: ptaylor <pa...@me.com>
    
    Closes #2638 from trxcllnt/js-fix-writing-sliced-arrays and squashes the following commits:
    
    bcd58caa <ptaylor> fix writer-tests.ts import
    fa4d8f54 <ptaylor> only use vector offset to serialize bitmaps and data buffers of variable-width vectors, as all other buffers have already been adjusted as part of the original call to Vector#slice()
    5ece0806 <ptaylor> reset internal children Array in case the childData for each has been sliced
    e093f7c0 <ptaylor> add test validating writer serializes sliced arrays correctly
    f9dd91b0 <ptaylor> move vector comparison extension into separate module
    380d6555 <ptaylor> fix table constructor args
---
 js/src/ipc/writer/binary.ts           | 12 +++---
 js/src/table.ts                       | 21 ++++++----
 js/src/vector/nested.ts               |  2 +-
 js/test/integration/validate-tests.ts | 58 +--------------------------
 js/test/jest-extensions.ts            | 74 +++++++++++++++++++++++++++++++++++
 js/test/unit/table-tests.ts           |  2 +-
 js/test/unit/writer-tests.ts          | 62 +++++++++++++++++++++++++++++
 7 files changed, 159 insertions(+), 72 deletions(-)

diff --git a/js/src/ipc/writer/binary.ts b/js/src/ipc/writer/binary.ts
index 4be3dc7..015e747 100644
--- a/js/src/ipc/writer/binary.ts
+++ b/js/src/ipc/writer/binary.ts
@@ -215,7 +215,7 @@ export class RecordBatchSerializer extends VectorVisitor {
                 // Set all to -1 to indicate that we haven't observed a first occurrence of a particular child yet
                 const childOffsets = new Int32Array(maxChildTypeId + 1).fill(-1);
                 const shiftedOffsets = new Int32Array(length);
-                const unshiftedOffsets = this.getZeroBasedValueOffsets(sliceOffset, length, valueOffsets);
+                const unshiftedOffsets = this.getZeroBasedValueOffsets(0, length, valueOffsets);
                 for (let typeId, shift, index = -1; ++index < length;) {
                     typeId = typeIds[index];
                     // ~(-1) used to be faster than x === -1, so maybe worth benchmarking the difference of these two impls for large dense unions:
@@ -257,9 +257,9 @@ export class RecordBatchSerializer extends VectorVisitor {
     }
     protected visitFlatVector<T extends FlatType>(vector: Vector<T>) {
         const { view, data } = vector;
-        const { offset, length, values } = data;
+        const { length, values } = data;
         const scaledLength = length * ((view as any).size || 1);
-        return this.addBuffer(values.subarray(offset, scaledLength));
+        return this.addBuffer(values.subarray(0, scaledLength));
     }
     protected visitFlatListVector<T extends FlatListType>(vector: Vector<T>) {
         const { data, length } = vector;
@@ -269,17 +269,17 @@ export class RecordBatchSerializer extends VectorVisitor {
         const byteLength = Math.min(lastOffset - firstOffset, values.byteLength - firstOffset);
         // Push in the order FlatList types read their buffers
         // valueOffsets buffer first
-        this.addBuffer(this.getZeroBasedValueOffsets(offset, length, valueOffsets));
+        this.addBuffer(this.getZeroBasedValueOffsets(0, length, valueOffsets));
         // sliced values buffer second
         this.addBuffer(values.subarray(firstOffset + offset, firstOffset + offset + byteLength));
         return this;
     }
     protected visitListVector<T extends SingleNestedType>(vector: Vector<T>) {
         const { data, length } = vector;
-        const { offset, valueOffsets } = <any> data;
+        const { valueOffsets } = <any> data;
         // If we have valueOffsets (ListVector), push that buffer first
         if (valueOffsets) {
-            this.addBuffer(this.getZeroBasedValueOffsets(offset, length, valueOffsets));
+            this.addBuffer(this.getZeroBasedValueOffsets(0, length, valueOffsets));
         }
         // Then insert the List's values child
         return this.visit((vector as any as ListVector<T>).getChildAt(0)!);
diff --git a/js/src/table.ts b/js/src/table.ts
index e3be9bb..e09e206 100644
--- a/js/src/table.ts
+++ b/js/src/table.ts
@@ -93,16 +93,21 @@ export class Table implements DataFrame {
     constructor(schema: Schema, batches: RecordBatch[]);
     constructor(schema: Schema, ...batches: RecordBatch[]);
     constructor(...args: any[]) {
-        let schema: Schema;
-        let batches: RecordBatch[];
+
+        let schema: Schema = null!;
+
         if (args[0] instanceof Schema) {
-            schema = args[0];
-            batches = Array.isArray(args[1][0]) ? args[1][0] : args[1];
-        } else if (args[0] instanceof RecordBatch) {
-            schema = (batches = args)[0].schema;
-        } else {
-            schema = (batches = args[0])[0].schema;
+            schema = args.shift();
+        }
+
+        let batches = args.reduce(function flatten(xs: any[], x: any): any[] {
+            return Array.isArray(x) ? x.reduce(flatten, xs) : [...xs, x];
+        }, []).filter((x: any): x is RecordBatch => x instanceof RecordBatch);
+
+        if (!schema && !(schema = batches[0] && batches[0].schema)) {
+            throw new TypeError('Table must be initialized with a Schema or at least one RecordBatch with a Schema');
         }
+
         this.schema = schema;
         this.batches = batches;
         this.batchesUnion = batches.length == 0 ?
diff --git a/js/src/vector/nested.ts b/js/src/vector/nested.ts
index 6d1bd4a..31bbee4 100644
--- a/js/src/vector/nested.ts
+++ b/js/src/vector/nested.ts
@@ -33,7 +33,7 @@ export abstract class NestedView<T extends NestedType> implements View<T> {
         this._children = children || new Array(this.numChildren);
     }
     public clone(data: Data<T>): this {
-        return new (<any> this.constructor)(data, this._children) as this;
+        return new (<any> this.constructor)(data, new Array(this.numChildren)) as this;
     }
     public isValid(): boolean {
         return true;
diff --git a/js/test/integration/validate-tests.ts b/js/test/integration/validate-tests.ts
index 5d0d3ff..0f1ebcc 100644
--- a/js/test/integration/validate-tests.ts
+++ b/js/test/integration/validate-tests.ts
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+import '../jest-extensions';
+
 import * as fs from 'fs';
 import * as path from 'path';
 
@@ -59,62 +61,6 @@ const jsonAndArrowPaths = toArray(zip(
 ))
 .filter(([p1, p2]) => p1 !== undefined && p2 !== undefined) as [string, string][];
 
-expect.extend({
-    toEqualVector([v1, format1, columnName]: [any, string, string], [v2, format2]: [any, string]) {
-
-        const format = (x: any, y: any, msg= ' ') => `${
-            this.utils.printExpected(x)}${
-                msg}${
-            this.utils.printReceived(y)
-        }`;
-
-        let getFailures = new Array<string>();
-        let propsFailures = new Array<string>();
-        let iteratorFailures = new Array<string>();
-        let allFailures = [
-            { title: 'get', failures: getFailures },
-            { title: 'props', failures: propsFailures },
-            { title: 'iterator', failures: iteratorFailures }
-        ];
-
-        let props = [
-            // 'name', 'nullable', 'metadata',
-            'type', 'length', 'nullCount'
-        ];
-
-        for (let i = -1, n = props.length; ++i < n;) {
-            const prop = props[i];
-            if (`${v1[prop]}` !== `${v2[prop]}`) {
-                propsFailures.push(`${prop}: ${format(v1[prop], v2[prop], ' !== ')}`);
-            }
-        }
-
-        for (let i = -1, n = v1.length; ++i < n;) {
-            let x1 = v1.get(i), x2 = v2.get(i);
-            if (this.utils.stringify(x1) !== this.utils.stringify(x2)) {
-                getFailures.push(`${i}: ${format(x1, x2, ' !== ')}`);
-            }
-        }
-
-        let i = -1;
-        for (let [x1, x2] of zip(v1, v2)) {
-            ++i;
-            if (this.utils.stringify(x1) !== this.utils.stringify(x2)) {
-                iteratorFailures.push(`${i}: ${format(x1, x2, ' !== ')}`);
-            }
-        }
-
-        return {
-            pass: allFailures.every(({ failures }) => failures.length === 0),
-            message: () => [
-                `${columnName}: (${format(format1, format2, ' !== ')})\n`,
-                ...allFailures.map(({ failures, title }) =>
-                    !failures.length ? `` : [`${title}:`, ...failures].join(`\n`))
-            ].join('\n')
-        };
-    }
-});
-
 describe(`Integration`, () => {
     for (const [jsonFilePath, arrowFilePath] of jsonAndArrowPaths) {
         let { name, dir } = path.parse(arrowFilePath);
diff --git a/js/test/jest-extensions.ts b/js/test/jest-extensions.ts
new file mode 100644
index 0000000..f45b70c
--- /dev/null
+++ b/js/test/jest-extensions.ts
@@ -0,0 +1,74 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { zip } from 'ix/iterable/zip';
+
+expect.extend({
+    toEqualVector([v1, format1, columnName]: [any, string, string], [v2, format2]: [any, string]) {
+
+        const format = (x: any, y: any, msg= ' ') => `${
+            this.utils.printExpected(x)}${
+                msg}${
+            this.utils.printReceived(y)
+        }`;
+
+        let getFailures = new Array<string>();
+        let propsFailures = new Array<string>();
+        let iteratorFailures = new Array<string>();
+        let allFailures = [
+            { title: 'get', failures: getFailures },
+            { title: 'props', failures: propsFailures },
+            { title: 'iterator', failures: iteratorFailures }
+        ];
+
+        let props = [
+            // 'name', 'nullable', 'metadata',
+            'type', 'length', 'nullCount'
+        ];
+
+        for (let i = -1, n = props.length; ++i < n;) {
+            const prop = props[i];
+            if (`${v1[prop]}` !== `${v2[prop]}`) {
+                propsFailures.push(`${prop}: ${format(v1[prop], v2[prop], ' !== ')}`);
+            }
+        }
+
+        for (let i = -1, n = v1.length; ++i < n;) {
+            let x1 = v1.get(i), x2 = v2.get(i);
+            if (this.utils.stringify(x1) !== this.utils.stringify(x2)) {
+                getFailures.push(`${i}: ${format(x1, x2, ' !== ')}`);
+            }
+        }
+
+        let i = -1;
+        for (let [x1, x2] of zip(v1, v2)) {
+            ++i;
+            if (this.utils.stringify(x1) !== this.utils.stringify(x2)) {
+                iteratorFailures.push(`${i}: ${format(x1, x2, ' !== ')}`);
+            }
+        }
+
+        return {
+            pass: allFailures.every(({ failures }) => failures.length === 0),
+            message: () => [
+                `${columnName}: (${format(format1, format2, ' !== ')})\n`,
+                ...allFailures.map(({ failures, title }) =>
+                    !failures.length ? `` : [`${title}:`, ...failures].join(`\n`))
+            ].join('\n')
+        };
+    }
+});
diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts
index 98e3eb2..6ec5b74 100644
--- a/js/test/unit/table-tests.ts
+++ b/js/test/unit/table-tests.ts
@@ -321,7 +321,7 @@ function leftPad(str: string, fill: string, n: number) {
     return (new Array(n + 1).join(fill) + str).slice(-1 * n);
 }
 
-function getSingleRecordBatchTable() {
+export function getSingleRecordBatchTable() {
     return Table.from({
         'schema': {
             'fields': [
diff --git a/js/test/unit/writer-tests.ts b/js/test/unit/writer-tests.ts
new file mode 100644
index 0000000..7bd63fc
--- /dev/null
+++ b/js/test/unit/writer-tests.ts
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import '../jest-extensions';
+
+import Arrow from '../Arrow';
+import { getSingleRecordBatchTable } from './table-tests';
+const { Table, RecordBatch } = Arrow;
+
+describe('Table.serialize()', () => {
+    test(`Serializes sliced RecordBatches`, () => {
+
+        const table = getSingleRecordBatchTable();
+        const batch = table.batches[0], half = batch.length / 2 | 0;
+
+        // First compare what happens when slicing from the batch level
+        let [batch1, batch2] = [batch.slice(0, half), batch.slice(half)];
+
+        compareBatchAndTable(table,    0, batch1, Table.from(new Table(batch1).serialize()));
+        compareBatchAndTable(table, half, batch2, Table.from(new Table(batch2).serialize()));
+
+        // Then compare what happens when creating a RecordBatch by slicing each child individually
+        batch1 = new RecordBatch(batch1.schema, batch1.length, batch1.schema.fields.map((_, i) => {
+            return batch.getChildAt(i)!.slice(0, half);
+        }));
+
+        batch2 = new RecordBatch(batch2.schema, batch2.length, batch2.schema.fields.map((_, i) => {
+            return batch.getChildAt(i)!.slice(half);
+        }));
+
+        compareBatchAndTable(table,    0, batch1, Table.from(new Table(batch1).serialize()));
+        compareBatchAndTable(table, half, batch2, Table.from(new Table(batch2).serialize()));
+    });
+});
+
+function compareBatchAndTable(source: Table, offset: number, batch: RecordBatch, table: Table) {
+    expect(batch.length).toEqual(table.length);
+    expect(table.numCols).toEqual(source.numCols);
+    expect(batch.numCols).toEqual(source.numCols);
+    for (let i = -1, n = source.numCols; ++i < n;) {
+        const v0 = source.getColumnAt(i)!.slice(offset, offset + batch.length);
+        const v1 = batch.getChildAt(i);
+        const v2 = table.getColumnAt(i);
+        const name = source.schema.fields[i].name;
+        (expect([v1, `batch`, name]) as any).toEqualVector([v0, `source`]);
+        (expect([v2, `table`, name]) as any).toEqualVector([v0, `source`]);
+    }
+}