You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/12/05 18:00:31 UTC

[arrow] branch master updated: ARROW-3681: [Go] Add benchmarks for CSV reader

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 137a69e  ARROW-3681: [Go] Add benchmarks for CSV reader
137a69e is described below

commit 137a69e1fb6647dde5da9d18d9da47e7a9e694b0
Author: Sebastien Binet <bi...@cern.ch>
AuthorDate: Wed Dec 5 19:00:21 2018 +0100

    ARROW-3681: [Go] Add benchmarks for CSV reader
    
    This CL adds a set of benchmarks for the CSV reader type.
    E.g.:
    
    ```
    $> go test -run=NONE -bench=Read/rows=.*_cols=.*_chunks=-1 -benchmem
    goos: linux
    goarch: amd64
    pkg: github.com/apache/arrow/go/arrow/csv
    BenchmarkRead/rows=10_cols=1_chunks=-1-8         	  200000	     10219 ns/op	    9560 B/op	      73 allocs/op
    BenchmarkRead/rows=10_cols=10_chunks=-1-8        	   30000	     75434 ns/op	   47264 B/op	     368 allocs/op
    BenchmarkRead/rows=10_cols=100_chunks=-1-8       	    3000	    489027 ns/op	  426960 B/op	    3255 allocs/op
    BenchmarkRead/rows=10_cols=1000_chunks=-1-8      	     200	   5400913 ns/op	 4308912 B/op	   32072 allocs/op
    BenchmarkRead/rows=100_cols=1_chunks=-1-8        	   50000	     45297 ns/op	   30552 B/op	     268 allocs/op
    BenchmarkRead/rows=100_cols=10_chunks=-1-8       	    5000	    333999 ns/op	  195520 B/op	     661 allocs/op
    BenchmarkRead/rows=100_cols=100_chunks=-1-8      	     500	   2660322 ns/op	 1869777 B/op	    4538 allocs/op
    BenchmarkRead/rows=100_cols=1000_chunks=-1-8     	      50	  25683147 ns/op	18805425 B/op	   43256 allocs/op
    BenchmarkRead/rows=1000_cols=1_chunks=-1-8       	    5000	    423213 ns/op	  218968 B/op	    2086 allocs/op
    BenchmarkRead/rows=1000_cols=10_chunks=-1-8      	     500	   2420959 ns/op	 1591808 B/op	    2614 allocs/op
    BenchmarkRead/rows=1000_cols=100_chunks=-1-8     	      50	  21765485 ns/op	15474384 B/op	    7841 allocs/op
    BenchmarkRead/rows=1000_cols=1000_chunks=-1-8    	       5	 222083917 ns/op	154949808 B/op	   60060 allocs/op
    BenchmarkRead/rows=10000_cols=1_chunks=-1-8      	     500	   3938427 ns/op	 3083224 B/op	   20123 allocs/op
    BenchmarkRead/rows=10000_cols=10_chunks=-1-8     	      50	  22066971 ns/op	20298368 B/op	   20903 allocs/op
    BenchmarkRead/rows=10000_cols=100_chunks=-1-8    	       5	 209542066 ns/op	193038672 B/op	   28651 allocs/op
    BenchmarkRead/rows=10000_cols=1000_chunks=-1-8   	       1	2696959353 ns/op	1939814576 B/op	  106070 allocs/op
    BenchmarkRead/rows=100000_cols=1_chunks=-1-8     	      30	  35208837 ns/op	31869150 B/op	  200155 allocs/op
    BenchmarkRead/rows=100000_cols=10_chunks=-1-8    	       5	 219030269 ns/op	183553152 B/op	  201125 allocs/op
    BenchmarkRead/rows=100000_cols=100_chunks=-1-8   	       1	2421018029 ns/op	1692336464 B/op	  210762 allocs/op
    BenchmarkRead/rows=100000_cols=1000_chunks=-1-8  	       1	28196721844 ns/op	16891740336 B/op	  307082 allocs/op
    PASS
    ok  	github.com/apache/arrow/go/arrow/csv	107.802s
    ```
    
    Author: Sebastien Binet <bi...@cern.ch>
    
    Closes #3071 from sbinet/issue-3681 and squashes the following commits:
    
    8eb60c52 <Sebastien Binet> ARROW-3681:  Add benchmarks for CSV reader
---
 go/arrow/csv/csv_test.go | 66 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/go/arrow/csv/csv_test.go b/go/arrow/csv/csv_test.go
index f53cf17..aaafb37 100644
--- a/go/arrow/csv/csv_test.go
+++ b/go/arrow/csv/csv_test.go
@@ -485,3 +485,69 @@ rec[2]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7"
 		})
 	}
 }
+
+// BenchmarkRead measures csv.Reader performance across a grid of
+// row counts, column counts, and chunk settings. Each combination is
+// run as its own sub-benchmark (spaces in the name become underscores
+// in the reported benchmark name).
+func BenchmarkRead(b *testing.B) {
+	// gen builds an in-memory CSV payload with `rows` lines; each line
+	// holds `cols` groups of three ';'-separated fields (int, float,
+	// string), matching the schema constructed in benchRead.
+	gen := func(rows, cols int) []byte {
+		buf := new(bytes.Buffer)
+		for i := 0; i < rows; i++ {
+			for j := 0; j < cols; j++ {
+				if j > 0 {
+					fmt.Fprintf(buf, ";")
+				}
+				fmt.Fprintf(buf, "%d;%f;str-%d", i, float64(i), i)
+			}
+			fmt.Fprintf(buf, "\n")
+		}
+		return buf.Bytes()
+	}
+
+	for _, rows := range []int{10, 1e2, 1e3, 1e4, 1e5} {
+		for _, cols := range []int{1, 10, 100, 1000} {
+			// The payload is generated once per (rows, cols) pair and
+			// shared by all chunk settings to keep setup cost out of
+			// the per-chunk comparison.
+			raw := gen(rows, cols)
+			for _, chunks := range []int{-1, 0, 10, 100, 1000} {
+				b.Run(fmt.Sprintf("rows=%d cols=%d chunks=%d", rows, cols, chunks), func(b *testing.B) {
+					benchRead(b, raw, rows, cols, chunks)
+				})
+			}
+		}
+	}
+}
+
+func benchRead(b *testing.B, raw []byte, rows, cols, chunks int) {
+	mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
+	defer mem.AssertSize(b, 0)
+
+	var fields []arrow.Field
+	for i := 0; i < cols; i++ {
+		fields = append(fields, []arrow.Field{
+			arrow.Field{Name: fmt.Sprintf("i64-%d", i), Type: arrow.PrimitiveTypes.Int64},
+			arrow.Field{Name: fmt.Sprintf("f64-%d", i), Type: arrow.PrimitiveTypes.Float64},
+			arrow.Field{Name: fmt.Sprintf("str-%d", i), Type: arrow.BinaryTypes.String},
+		}...)
+	}
+
+	schema := arrow.NewSchema(fields, nil)
+	chunk := 0
+	if chunks != 0 {
+		chunk = rows / chunks
+	}
+	opts := []csv.Option{
+		csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'),
+		csv.WithChunk(chunk),
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		r := csv.NewReader(bytes.NewReader(raw), schema, opts...)
+
+		n := int64(0)
+		for r.Next() {
+			n += r.Record().NumRows()
+		}
+
+		r.Release()
+		if n != int64(rows) {
+			b.Fatalf("invalid number of rows. want=%d, got=%d", n, rows)
+		}
+	}
+}