You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pt...@apache.org on 2018/07/09 02:00:37 UTC

[arrow] branch master updated: ARROW-2789: [JS] Add iterator to DataFrame

This is an automated email from the ASF dual-hosted git repository.

ptaylor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new ddcb8f5  ARROW-2789: [JS] Add iterator to DataFrame
ddcb8f5 is described below

commit ddcb8f58301ae7376bac91f17c2c1ee37fd8ea95
Author: Brian Hulette <hu...@gmail.com>
AuthorDate: Sun Jul 8 18:59:49 2018 -0700

    ARROW-2789: [JS] Add iterator to DataFrame
    
    Add an iterator to `FilteredDataFrame` and the `DataFrame` interface
    
    Author: Brian Hulette <hu...@gmail.com>
    
    Closes #2215 from TheNeuralBit/dataframe-improvements and squashes the following commits:
    
    dba0e618 <Brian Hulette> rename length getter back to count()
    a43269f5 <Brian Hulette> fix tests
    664a053c <Brian Hulette> Add iterator and readonly length member
---
 js/src/table.ts | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/js/src/table.ts b/js/src/table.ts
index de06dd7..8144c98 100644
--- a/js/src/table.ts
+++ b/js/src/table.ts
@@ -29,10 +29,11 @@ export type NextFunc = (idx: number, batch: RecordBatch) => void;
 export type BindFunc = (batch: RecordBatch) => void;
 
 export interface DataFrame {
+    count(): number;
     filter(predicate: Predicate): DataFrame;
     scan(next: NextFunc, bind?: BindFunc): void;
-    count(): number;
     countBy(col: (Col|string)): CountByResult;
+    [Symbol.iterator](): IterableIterator<Struct['TValue']>;
 }
 
 export class Table implements DataFrame {
@@ -143,7 +144,6 @@ export class Table implements DataFrame {
             }
         }
     }
-    public count(): number { return this.length; }
     public countBy(name: Col | string): CountByResult {
         const batches = this.batches, numBatches = batches.length;
         const count_by = typeof name === 'string' ? new Col(name) : name;
@@ -171,6 +171,9 @@ export class Table implements DataFrame {
         }
         return new CountByResult(vector.dictionary, IntVector.from(counts));
     }
+    public count(): number {
+        return this.length;
+    }
     public select(...columnNames: string[]) {
         return new Table(this.batches.map((batch) => batch.select(...columnNames)));
     }
@@ -239,6 +242,26 @@ class FilteredDataFrame implements DataFrame {
         }
         return sum;
     }
+    public *[Symbol.iterator](): IterableIterator<Struct['TValue']> {
+        // Generator over the rows that pass this.predicate.
+        // Visits this.batches in array order and, within each batch, every row
+        // index from 0 to batch.length - 1; for each index the bound predicate
+        // accepts, the value of batch.get(index) is yielded.
+        const batches = this.batches;
+        const numBatches = batches.length;
+        for (let batchIndex = -1; ++batchIndex < numBatches;) {
+            // current batch
+            const batch = batches[batchIndex];
+            // TODO: bind batches lazily
+            // bind(batch) runs once per batch, not once per row; its result is the
+            // function invoked as predicate(index, batch) in the loop below
+            const predicate = this.predicate.bind(batch);
+            // yield the row at each index the predicate accepts
+            for (let index = -1, numRows = batch.length; ++index < numRows;) {
+                if (predicate(index, batch)) { yield batch.get(index) as any; }
+            }
+        }
+    }
     public filter(predicate: Predicate): DataFrame {
         return new FilteredDataFrame(
             this.batches,