You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pt...@apache.org on 2019/04/12 18:20:43 UTC

[arrow] branch master updated: ARROW-5100: [JS] Remove swap while collapsing contiguous buffers

This is an automated email from the ASF dual-hosted git repository.

ptaylor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 8537420  ARROW-5100: [JS] Remove swap while collapsing contiguous buffers
8537420 is described below

commit 853742021ce1dc3123c614c1f4b12e8050b11a3c
Author: ptaylor <pa...@me.com>
AuthorDate: Fri Apr 12 11:20:29 2019 -0700

    ARROW-5100: [JS] Remove swap while collapsing contiguous buffers
    
    Fixes https://issues.apache.org/jira/browse/ARROW-5100
    
    Seems like this is a fairly rare edge case, but really perplexing when it happens. I saw it when manually creating Arrow Columns from data sliced out of an ArrayBuffer pool, and just so happened to get an overlapping + out-of-order set of buffers. I added a test case that demonstrates the issue too.
    
    Description from the issue:
    > We collapse contiguous Uint8Arrays that share the same underlying ArrayBuffer and have overlapping byte ranges. This was done to maintain true zero-copy behavior when using certain node core streams that use a buffer pool internally, and could write chunks of the same logical Arrow Message at out-of-order byte offsets in the pool.
    
    > Unfortunately this can also lead to a bug where, in rare cases, buffers are swapped while writing Arrow Messages too. We could have a flag to indicate whether we think collapsing out-of-order same-buffer chunks is safe, but I'm not sure if we can always know that, so I'd prefer to take it out and incur the copy cost.
    
    Author: ptaylor <pa...@me.com>
    
    Closes #4102 from trxcllnt/js/no-buffer-swap and squashes the following commits:
    
    0aaf813d <ptaylor> ensure built asset names match the js version
    2a5c5d51 <ptaylor> remove swap behavior from contiguous buffer collapse for safety
---
 js/gulp/arrow-task.js                 | 12 ++++++------
 js/src/util/buffer.ts                 |  8 ++------
 js/test/unit/table/serialize-tests.ts | 13 ++++++++++++-
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/js/gulp/arrow-task.js b/js/gulp/arrow-task.js
index 48e717e..0b95440 100644
--- a/js/gulp/arrow-task.js
+++ b/js/gulp/arrow-task.js
@@ -40,14 +40,14 @@ const arrowTask = ((cache) => memoizeTask(cache, function copyMain(target) {
     const esnextUmdSourceMapsGlob = `${targetDir(`esnext`, `umd`)}/*.map`;
     return Observable.forkJoin(
         observableFromStreams(gulp.src(dtsGlob),                 gulp.dest(out)), // copy d.ts files
-        observableFromStreams(gulp.src(cjsGlob),                 gulp.dest(out)), // copy es2015 cjs files
-        observableFromStreams(gulp.src(cjsSourceMapsGlob),       gulp.dest(out)), // copy es2015 cjs sourcemaps
-        observableFromStreams(gulp.src(esmSourceMapsGlob),       gulp.dest(out)), // copy es2015 esm sourcemaps
+        observableFromStreams(gulp.src(cjsGlob),                 gulp.dest(out)), // copy esnext cjs files
+        observableFromStreams(gulp.src(cjsSourceMapsGlob),       gulp.dest(out)), // copy esnext cjs sourcemaps
+        observableFromStreams(gulp.src(esmSourceMapsGlob),       gulp.dest(out)), // copy esnext esm sourcemaps
         observableFromStreams(gulp.src(es5UmdSourceMapsGlob),    gulp.dest(out)), // copy es5 umd sourcemap files, but don't rename
-        observableFromStreams(gulp.src(esnextUmdSourceMapsGlob), gulp.dest(out)), // copy es2015 umd sourcemap files, but don't rename
-        observableFromStreams(gulp.src(esmGlob),       gulpRename((p) => { p.extname = '.mjs'; }),          gulp.dest(out)), // copy es2015 esm files and rename to `.mjs`
+        observableFromStreams(gulp.src(esnextUmdSourceMapsGlob), gulp.dest(out)), // copy esnext umd sourcemap files, but don't rename
+        observableFromStreams(gulp.src(esmGlob),       gulpRename((p) => { p.extname = '.mjs'; }),          gulp.dest(out)), // copy esnext esm files and rename to `.mjs`
         observableFromStreams(gulp.src(es5UmdGlob),    gulpRename((p) => { p.basename += `.es5.min`; }),    gulp.dest(out)), // copy es5 umd files and add `.min`
-        observableFromStreams(gulp.src(esnextUmdGlob), gulpRename((p) => { p.basename += `.es2015.min`; }), gulp.dest(out)), // copy es2015 umd files and add `.es2015.min`
+        observableFromStreams(gulp.src(esnextUmdGlob), gulpRename((p) => { p.basename += `.esnext.min`; }), gulp.dest(out)), // copy esnext umd files and add `.esnext.min`
     ).publish(new ReplaySubject()).refCount();
 }))({});
 
diff --git a/js/src/util/buffer.ts b/js/src/util/buffer.ts
index e0fb0fd..2c56b1c 100644
--- a/js/src/util/buffer.ts
+++ b/js/src/util/buffer.ts
@@ -32,15 +32,11 @@ function collapseContiguousByteRanges(chunks: Uint8Array[]) {
     for (let x, y, i = 0, j = 0, n = chunks.length; ++i < n;) {
         x = result[j];
         y = chunks[i];
-        // continue x and y don't share the same underlying ArrayBuffer
-        if (!x || !y || x.buffer !== y.buffer) {
+        // continue if x and y don't share the same underlying ArrayBuffer, or if x isn't before y
+        if (!x || !y || x.buffer !== y.buffer || y.byteOffset < x.byteOffset) {
             y && (result[++j] = y);
             continue;
         }
-        // swap if y starts before x
-        if (y.byteOffset < x.byteOffset) {
-            x = chunks[i]; y = result[j];
-        }
         ({ byteOffset: xOffset, byteLength: xLen } = x);
         ({ byteOffset: yOffset, byteLength: yLen } = y);
         // continue if the byte ranges of x and y aren't contiguous
diff --git a/js/test/unit/table/serialize-tests.ts b/js/test/unit/table/serialize-tests.ts
index f337913..9dce2f5 100644
--- a/js/test/unit/table/serialize-tests.ts
+++ b/js/test/unit/table/serialize-tests.ts
@@ -18,7 +18,7 @@
 import '../../jest-extensions';
 import * as generate from '../../generate-test-data';
 import {
-    Table, Schema, Field, DataType, Dictionary, Int32, Float32, Utf8, Null
+    Table, Schema, Field, DataType, Dictionary, Int32, Float32, Utf8, Null, Int32Vector
 } from '../../Arrow';
 
 const toSchema = (...xs: [string, DataType][]) => new Schema(xs.map((x) => new Field(...x)));
@@ -34,6 +34,17 @@ function createTable<T extends { [key: string]: DataType } = any>(schema: Schema
 
 describe('Table#serialize()', () => {
 
+    test(`doesn't swap the order of buffers that share the same underlying ArrayBuffer but are in a different order`, () => {
+        const values = new Int32Array([0, 1, 2, 3, 4, 5, 6, 7]);
+        const expected = values.slice();
+        const x = Int32Vector.from(values.subarray(4, 8)); // back
+        const y = Int32Vector.from(values.subarray(0, 4)); // front
+        const source = Table.new([x, y], ['x', 'y']);
+        const table = Table.from(source.serialize());
+        expect(table.getColumn('x').toArray()).toEqual(expected.subarray(4, 8));
+        expect(table.getColumn('y').toArray()).toEqual(expected.subarray(0, 4));
+    });
+
     test(`Table#empty round-trips through serialization`, () => {
         const source = Table.empty();
         source.schema.metadata.set('foo', 'bar');