You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/12/05 18:00:31 UTC
[arrow] branch master updated: ARROW-3681: [Go] Add benchmarks for
CSV reader
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 137a69e ARROW-3681: [Go] Add benchmarks for CSV reader
137a69e is described below
commit 137a69e1fb6647dde5da9d18d9da47e7a9e694b0
Author: Sebastien Binet <bi...@cern.ch>
AuthorDate: Wed Dec 5 19:00:21 2018 +0100
ARROW-3681: [Go] Add benchmarks for CSV reader
This CL adds a set of benchmarks for the CSV reader type.
E.g.:
```
$> go test -run=NONE -bench=Read/rows=.*_cols=.*_chunks=-1 -benchmem
goos: linux
goarch: amd64
pkg: github.com/apache/arrow/go/arrow/csv
BenchmarkRead/rows=10_cols=1_chunks=-1-8 200000 10219 ns/op 9560 B/op 73 allocs/op
BenchmarkRead/rows=10_cols=10_chunks=-1-8 30000 75434 ns/op 47264 B/op 368 allocs/op
BenchmarkRead/rows=10_cols=100_chunks=-1-8 3000 489027 ns/op 426960 B/op 3255 allocs/op
BenchmarkRead/rows=10_cols=1000_chunks=-1-8 200 5400913 ns/op 4308912 B/op 32072 allocs/op
BenchmarkRead/rows=100_cols=1_chunks=-1-8 50000 45297 ns/op 30552 B/op 268 allocs/op
BenchmarkRead/rows=100_cols=10_chunks=-1-8 5000 333999 ns/op 195520 B/op 661 allocs/op
BenchmarkRead/rows=100_cols=100_chunks=-1-8 500 2660322 ns/op 1869777 B/op 4538 allocs/op
BenchmarkRead/rows=100_cols=1000_chunks=-1-8 50 25683147 ns/op 18805425 B/op 43256 allocs/op
BenchmarkRead/rows=1000_cols=1_chunks=-1-8 5000 423213 ns/op 218968 B/op 2086 allocs/op
BenchmarkRead/rows=1000_cols=10_chunks=-1-8 500 2420959 ns/op 1591808 B/op 2614 allocs/op
BenchmarkRead/rows=1000_cols=100_chunks=-1-8 50 21765485 ns/op 15474384 B/op 7841 allocs/op
BenchmarkRead/rows=1000_cols=1000_chunks=-1-8 5 222083917 ns/op 154949808 B/op 60060 allocs/op
BenchmarkRead/rows=10000_cols=1_chunks=-1-8 500 3938427 ns/op 3083224 B/op 20123 allocs/op
BenchmarkRead/rows=10000_cols=10_chunks=-1-8 50 22066971 ns/op 20298368 B/op 20903 allocs/op
BenchmarkRead/rows=10000_cols=100_chunks=-1-8 5 209542066 ns/op 193038672 B/op 28651 allocs/op
BenchmarkRead/rows=10000_cols=1000_chunks=-1-8 1 2696959353 ns/op 1939814576 B/op 106070 allocs/op
BenchmarkRead/rows=100000_cols=1_chunks=-1-8 30 35208837 ns/op 31869150 B/op 200155 allocs/op
BenchmarkRead/rows=100000_cols=10_chunks=-1-8 5 219030269 ns/op 183553152 B/op 201125 allocs/op
BenchmarkRead/rows=100000_cols=100_chunks=-1-8 1 2421018029 ns/op 1692336464 B/op 210762 allocs/op
BenchmarkRead/rows=100000_cols=1000_chunks=-1-8 1 28196721844 ns/op 16891740336 B/op 307082 allocs/op
PASS
ok github.com/apache/arrow/go/arrow/csv 107.802s
```
Author: Sebastien Binet <bi...@cern.ch>
Closes #3071 from sbinet/issue-3681 and squashes the following commits:
8eb60c52 <Sebastien Binet> ARROW-3681: Add benchmarks for CSV reader
---
go/arrow/csv/csv_test.go | 66 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 66 insertions(+)
diff --git a/go/arrow/csv/csv_test.go b/go/arrow/csv/csv_test.go
index f53cf17..aaafb37 100644
--- a/go/arrow/csv/csv_test.go
+++ b/go/arrow/csv/csv_test.go
@@ -485,3 +485,69 @@ rec[2]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7"
})
}
}
+
+func BenchmarkRead(b *testing.B) {
+ gen := func(rows, cols int) []byte {
+ buf := new(bytes.Buffer)
+ for i := 0; i < rows; i++ {
+ for j := 0; j < cols; j++ {
+ if j > 0 {
+ fmt.Fprintf(buf, ";")
+ }
+ fmt.Fprintf(buf, "%d;%f;str-%d", i, float64(i), i)
+ }
+ fmt.Fprintf(buf, "\n")
+ }
+ return buf.Bytes()
+ }
+
+ for _, rows := range []int{10, 1e2, 1e3, 1e4, 1e5} {
+ for _, cols := range []int{1, 10, 100, 1000} {
+ raw := gen(rows, cols)
+ for _, chunks := range []int{-1, 0, 10, 100, 1000} {
+ b.Run(fmt.Sprintf("rows=%d cols=%d chunks=%d", rows, cols, chunks), func(b *testing.B) {
+ benchRead(b, raw, rows, cols, chunks)
+ })
+ }
+ }
+ }
+}
+
+func benchRead(b *testing.B, raw []byte, rows, cols, chunks int) {
+ mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
+ defer mem.AssertSize(b, 0)
+
+ var fields []arrow.Field
+ for i := 0; i < cols; i++ {
+ fields = append(fields, []arrow.Field{
+ arrow.Field{Name: fmt.Sprintf("i64-%d", i), Type: arrow.PrimitiveTypes.Int64},
+ arrow.Field{Name: fmt.Sprintf("f64-%d", i), Type: arrow.PrimitiveTypes.Float64},
+ arrow.Field{Name: fmt.Sprintf("str-%d", i), Type: arrow.BinaryTypes.String},
+ }...)
+ }
+
+ schema := arrow.NewSchema(fields, nil)
+ chunk := 0
+ if chunks != 0 {
+ chunk = rows / chunks
+ }
+ opts := []csv.Option{
+ csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'),
+ csv.WithChunk(chunk),
+ }
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ r := csv.NewReader(bytes.NewReader(raw), schema, opts...)
+
+ n := int64(0)
+ for r.Next() {
+ n += r.Record().NumRows()
+ }
+
+ r.Release()
+ if n != int64(rows) {
+ b.Fatalf("invalid number of rows. want=%d, got=%d", n, rows)
+ }
+ }
+}