You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pt...@apache.org on 2018/09/27 18:09:00 UTC
[arrow] branch master updated: ARROW-3336: [JS] Fix IPC writer
serializing sliced arrays
This is an automated email from the ASF dual-hosted git repository.
ptaylor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new b796b57 ARROW-3336: [JS] Fix IPC writer serializing sliced arrays
b796b57 is described below
commit b796b579b8aba472c6690a49f95e8e174d01a4bb
Author: ptaylor <pa...@me.com>
AuthorDate: Thu Sep 27 11:08:39 2018 -0700
ARROW-3336: [JS] Fix IPC writer serializing sliced arrays
Fixes https://issues.apache.org/jira/browse/ARROW-3336. Realized too late I branched from the https://github.com/apache/arrow/pull/2616, so that's why there's some extra commits at the front.
Check out the relevant tests here: https://github.com/trxcllnt/arrow/blob/860f61046fca0081dbe0ce986a97408ed5934e22/js/test/unit/writer-tests.ts#L26. This test covers the primitive, nested, and dictionary vectors, but it'd be worth adding more (for example, Unions).
Author: ptaylor <pa...@me.com>
Closes #2638 from trxcllnt/js-fix-writing-sliced-arrays and squashes the following commits:
bcd58caa <ptaylor> fix writer-tests.ts import
fa4d8f54 <ptaylor> only use vector offset to serialize bitmaps and data buffers of variable-width vectors, as all other buffers have already been adjusted as part of the original call to Vector#slice()
5ece0806 <ptaylor> reset internal children Array in case the childData for each has been sliced
e093f7c0 <ptaylor> add test validating writer serializes sliced arrays correctly
f9dd91b0 <ptaylor> move vector comparison extension into separate module
380d6555 <ptaylor> fix table constructor args
---
js/src/ipc/writer/binary.ts | 12 +++---
js/src/table.ts | 21 ++++++----
js/src/vector/nested.ts | 2 +-
js/test/integration/validate-tests.ts | 58 +--------------------------
js/test/jest-extensions.ts | 74 +++++++++++++++++++++++++++++++++++
js/test/unit/table-tests.ts | 2 +-
js/test/unit/writer-tests.ts | 62 +++++++++++++++++++++++++++++
7 files changed, 159 insertions(+), 72 deletions(-)
diff --git a/js/src/ipc/writer/binary.ts b/js/src/ipc/writer/binary.ts
index 4be3dc7..015e747 100644
--- a/js/src/ipc/writer/binary.ts
+++ b/js/src/ipc/writer/binary.ts
@@ -215,7 +215,7 @@ export class RecordBatchSerializer extends VectorVisitor {
// Set all to -1 to indicate that we haven't observed a first occurrence of a particular child yet
const childOffsets = new Int32Array(maxChildTypeId + 1).fill(-1);
const shiftedOffsets = new Int32Array(length);
- const unshiftedOffsets = this.getZeroBasedValueOffsets(sliceOffset, length, valueOffsets);
+ const unshiftedOffsets = this.getZeroBasedValueOffsets(0, length, valueOffsets);
for (let typeId, shift, index = -1; ++index < length;) {
typeId = typeIds[index];
// ~(-1) used to be faster than x === -1, so maybe worth benchmarking the difference of these two impls for large dense unions:
@@ -257,9 +257,9 @@ export class RecordBatchSerializer extends VectorVisitor {
}
protected visitFlatVector<T extends FlatType>(vector: Vector<T>) {
const { view, data } = vector;
- const { offset, length, values } = data;
+ const { length, values } = data;
const scaledLength = length * ((view as any).size || 1);
- return this.addBuffer(values.subarray(offset, scaledLength));
+ return this.addBuffer(values.subarray(0, scaledLength));
}
protected visitFlatListVector<T extends FlatListType>(vector: Vector<T>) {
const { data, length } = vector;
@@ -269,17 +269,17 @@ export class RecordBatchSerializer extends VectorVisitor {
const byteLength = Math.min(lastOffset - firstOffset, values.byteLength - firstOffset);
// Push in the order FlatList types read their buffers
// valueOffsets buffer first
- this.addBuffer(this.getZeroBasedValueOffsets(offset, length, valueOffsets));
+ this.addBuffer(this.getZeroBasedValueOffsets(0, length, valueOffsets));
// sliced values buffer second
this.addBuffer(values.subarray(firstOffset + offset, firstOffset + offset + byteLength));
return this;
}
protected visitListVector<T extends SingleNestedType>(vector: Vector<T>) {
const { data, length } = vector;
- const { offset, valueOffsets } = <any> data;
+ const { valueOffsets } = <any> data;
// If we have valueOffsets (ListVector), push that buffer first
if (valueOffsets) {
- this.addBuffer(this.getZeroBasedValueOffsets(offset, length, valueOffsets));
+ this.addBuffer(this.getZeroBasedValueOffsets(0, length, valueOffsets));
}
// Then insert the List's values child
return this.visit((vector as any as ListVector<T>).getChildAt(0)!);
diff --git a/js/src/table.ts b/js/src/table.ts
index e3be9bb..e09e206 100644
--- a/js/src/table.ts
+++ b/js/src/table.ts
@@ -93,16 +93,21 @@ export class Table implements DataFrame {
constructor(schema: Schema, batches: RecordBatch[]);
constructor(schema: Schema, ...batches: RecordBatch[]);
constructor(...args: any[]) {
- let schema: Schema;
- let batches: RecordBatch[];
+
+ let schema: Schema = null!;
+
if (args[0] instanceof Schema) {
- schema = args[0];
- batches = Array.isArray(args[1][0]) ? args[1][0] : args[1];
- } else if (args[0] instanceof RecordBatch) {
- schema = (batches = args)[0].schema;
- } else {
- schema = (batches = args[0])[0].schema;
+ schema = args.shift();
+ }
+
+ let batches = args.reduce(function flatten(xs: any[], x: any): any[] {
+ return Array.isArray(x) ? x.reduce(flatten, xs) : [...xs, x];
+ }, []).filter((x: any): x is RecordBatch => x instanceof RecordBatch);
+
+ if (!schema && !(schema = batches[0] && batches[0].schema)) {
+ throw new TypeError('Table must be initialized with a Schema or at least one RecordBatch with a Schema');
}
+
this.schema = schema;
this.batches = batches;
this.batchesUnion = batches.length == 0 ?
diff --git a/js/src/vector/nested.ts b/js/src/vector/nested.ts
index 6d1bd4a..31bbee4 100644
--- a/js/src/vector/nested.ts
+++ b/js/src/vector/nested.ts
@@ -33,7 +33,7 @@ export abstract class NestedView<T extends NestedType> implements View<T> {
this._children = children || new Array(this.numChildren);
}
public clone(data: Data<T>): this {
- return new (<any> this.constructor)(data, this._children) as this;
+ return new (<any> this.constructor)(data, new Array(this.numChildren)) as this;
}
public isValid(): boolean {
return true;
diff --git a/js/test/integration/validate-tests.ts b/js/test/integration/validate-tests.ts
index 5d0d3ff..0f1ebcc 100644
--- a/js/test/integration/validate-tests.ts
+++ b/js/test/integration/validate-tests.ts
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+import '../jest-extensions';
+
import * as fs from 'fs';
import * as path from 'path';
@@ -59,62 +61,6 @@ const jsonAndArrowPaths = toArray(zip(
))
.filter(([p1, p2]) => p1 !== undefined && p2 !== undefined) as [string, string][];
-expect.extend({
- toEqualVector([v1, format1, columnName]: [any, string, string], [v2, format2]: [any, string]) {
-
- const format = (x: any, y: any, msg= ' ') => `${
- this.utils.printExpected(x)}${
- msg}${
- this.utils.printReceived(y)
- }`;
-
- let getFailures = new Array<string>();
- let propsFailures = new Array<string>();
- let iteratorFailures = new Array<string>();
- let allFailures = [
- { title: 'get', failures: getFailures },
- { title: 'props', failures: propsFailures },
- { title: 'iterator', failures: iteratorFailures }
- ];
-
- let props = [
- // 'name', 'nullable', 'metadata',
- 'type', 'length', 'nullCount'
- ];
-
- for (let i = -1, n = props.length; ++i < n;) {
- const prop = props[i];
- if (`${v1[prop]}` !== `${v2[prop]}`) {
- propsFailures.push(`${prop}: ${format(v1[prop], v2[prop], ' !== ')}`);
- }
- }
-
- for (let i = -1, n = v1.length; ++i < n;) {
- let x1 = v1.get(i), x2 = v2.get(i);
- if (this.utils.stringify(x1) !== this.utils.stringify(x2)) {
- getFailures.push(`${i}: ${format(x1, x2, ' !== ')}`);
- }
- }
-
- let i = -1;
- for (let [x1, x2] of zip(v1, v2)) {
- ++i;
- if (this.utils.stringify(x1) !== this.utils.stringify(x2)) {
- iteratorFailures.push(`${i}: ${format(x1, x2, ' !== ')}`);
- }
- }
-
- return {
- pass: allFailures.every(({ failures }) => failures.length === 0),
- message: () => [
- `${columnName}: (${format(format1, format2, ' !== ')})\n`,
- ...allFailures.map(({ failures, title }) =>
- !failures.length ? `` : [`${title}:`, ...failures].join(`\n`))
- ].join('\n')
- };
- }
-});
-
describe(`Integration`, () => {
for (const [jsonFilePath, arrowFilePath] of jsonAndArrowPaths) {
let { name, dir } = path.parse(arrowFilePath);
diff --git a/js/test/jest-extensions.ts b/js/test/jest-extensions.ts
new file mode 100644
index 0000000..f45b70c
--- /dev/null
+++ b/js/test/jest-extensions.ts
@@ -0,0 +1,74 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { zip } from 'ix/iterable/zip';
+
+expect.extend({
+ toEqualVector([v1, format1, columnName]: [any, string, string], [v2, format2]: [any, string]) {
+
+ const format = (x: any, y: any, msg= ' ') => `${
+ this.utils.printExpected(x)}${
+ msg}${
+ this.utils.printReceived(y)
+ }`;
+
+ let getFailures = new Array<string>();
+ let propsFailures = new Array<string>();
+ let iteratorFailures = new Array<string>();
+ let allFailures = [
+ { title: 'get', failures: getFailures },
+ { title: 'props', failures: propsFailures },
+ { title: 'iterator', failures: iteratorFailures }
+ ];
+
+ let props = [
+ // 'name', 'nullable', 'metadata',
+ 'type', 'length', 'nullCount'
+ ];
+
+ for (let i = -1, n = props.length; ++i < n;) {
+ const prop = props[i];
+ if (`${v1[prop]}` !== `${v2[prop]}`) {
+ propsFailures.push(`${prop}: ${format(v1[prop], v2[prop], ' !== ')}`);
+ }
+ }
+
+ for (let i = -1, n = v1.length; ++i < n;) {
+ let x1 = v1.get(i), x2 = v2.get(i);
+ if (this.utils.stringify(x1) !== this.utils.stringify(x2)) {
+ getFailures.push(`${i}: ${format(x1, x2, ' !== ')}`);
+ }
+ }
+
+ let i = -1;
+ for (let [x1, x2] of zip(v1, v2)) {
+ ++i;
+ if (this.utils.stringify(x1) !== this.utils.stringify(x2)) {
+ iteratorFailures.push(`${i}: ${format(x1, x2, ' !== ')}`);
+ }
+ }
+
+ return {
+ pass: allFailures.every(({ failures }) => failures.length === 0),
+ message: () => [
+ `${columnName}: (${format(format1, format2, ' !== ')})\n`,
+ ...allFailures.map(({ failures, title }) =>
+ !failures.length ? `` : [`${title}:`, ...failures].join(`\n`))
+ ].join('\n')
+ };
+ }
+});
diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts
index 98e3eb2..6ec5b74 100644
--- a/js/test/unit/table-tests.ts
+++ b/js/test/unit/table-tests.ts
@@ -321,7 +321,7 @@ function leftPad(str: string, fill: string, n: number) {
return (new Array(n + 1).join(fill) + str).slice(-1 * n);
}
-function getSingleRecordBatchTable() {
+export function getSingleRecordBatchTable() {
return Table.from({
'schema': {
'fields': [
diff --git a/js/test/unit/writer-tests.ts b/js/test/unit/writer-tests.ts
new file mode 100644
index 0000000..7bd63fc
--- /dev/null
+++ b/js/test/unit/writer-tests.ts
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import '../jest-extensions';
+
+import Arrow from '../Arrow';
+import { getSingleRecordBatchTable } from './table-tests';
+const { Table, RecordBatch } = Arrow;
+
+describe('Table.serialize()', () => {
+ test(`Serializes sliced RecordBatches`, () => {
+
+ const table = getSingleRecordBatchTable();
+ const batch = table.batches[0], half = batch.length / 2 | 0;
+
+ // First compare what happens when slicing from the batch level
+ let [batch1, batch2] = [batch.slice(0, half), batch.slice(half)];
+
+ compareBatchAndTable(table, 0, batch1, Table.from(new Table(batch1).serialize()));
+ compareBatchAndTable(table, half, batch2, Table.from(new Table(batch2).serialize()));
+
+ // Then compare what happens when creating a RecordBatch by slicing each child individually
+ batch1 = new RecordBatch(batch1.schema, batch1.length, batch1.schema.fields.map((_, i) => {
+ return batch.getChildAt(i)!.slice(0, half);
+ }));
+
+ batch2 = new RecordBatch(batch2.schema, batch2.length, batch2.schema.fields.map((_, i) => {
+ return batch.getChildAt(i)!.slice(half);
+ }));
+
+ compareBatchAndTable(table, 0, batch1, Table.from(new Table(batch1).serialize()));
+ compareBatchAndTable(table, half, batch2, Table.from(new Table(batch2).serialize()));
+ });
+});
+
+function compareBatchAndTable(source: Table, offset: number, batch: RecordBatch, table: Table) {
+ expect(batch.length).toEqual(table.length);
+ expect(table.numCols).toEqual(source.numCols);
+ expect(batch.numCols).toEqual(source.numCols);
+ for (let i = -1, n = source.numCols; ++i < n;) {
+ const v0 = source.getColumnAt(i)!.slice(offset, offset + batch.length);
+ const v1 = batch.getChildAt(i);
+ const v2 = table.getColumnAt(i);
+ const name = source.schema.fields[i].name;
+ (expect([v1, `batch`, name]) as any).toEqualVector([v0, `source`]);
+ (expect([v2, `table`, name]) as any).toEqualVector([v0, `source`]);
+ }
+}