You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/05/12 22:40:52 UTC

[GitHub] [arrow] mdepero opened a new pull request, #13141: [Go] Fix broken parquet plain boolean decoder

mdepero opened a new pull request, #13141:
URL: https://github.com/apache/arrow/pull/13141

   While reading parquet files using this library, we discovered that boolean fields with `PLAIN` encoding were not being read properly. It was discovered that this was due to how `bitOffset` was managed in the boolean decoder; this PR patches the decoder and also adds test coverage for reading plain boolean pages.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] ursabot commented on pull request #13141: ARROW-16563: [Go][Parquet] Fix broken parquet plain boolean decoder

Posted by GitBox <gi...@apache.org>.
ursabot commented on PR #13141:
URL: https://github.com/apache/arrow/pull/13141#issuecomment-1126994124

   Benchmark runs are scheduled for baseline = 390fb095b0cdd6b1116d083485fcc2debaf04aad and contender = 576c33c8705b7e03ef1ee0a7fc50c00305bb8fc1. 576c33c8705b7e03ef1ee0a7fc50c00305bb8fc1 is a master commit associated with this PR. Results will be available as each benchmark for each run completes.
   Conbench compare runs links:
   [Finished :arrow_down:0.0% :arrow_up:0.0%] [ec2-t3-xlarge-us-east-2](https://conbench.ursa.dev/compare/runs/68a1b8c0ea324e02b1776437fe098c69...69c20577ac3d463a9ce43f2f79a6b391/)
   [Failed :arrow_down:0.12% :arrow_up:0.0%] [test-mac-arm](https://conbench.ursa.dev/compare/runs/6096d9784e8a41b5ad7091393f3cfb3a...8699e7e1601c431c935fbc84aa24b2c1/)
   [Finished :arrow_down:3.93% :arrow_up:0.0%] [ursa-i9-9960x](https://conbench.ursa.dev/compare/runs/658876f0408148958a50484b3365045e...e0f3e0015e794a32b0f5360a1add406e/)
   [Finished :arrow_down:0.12% :arrow_up:0.0%] [ursa-thinkcentre-m75q](https://conbench.ursa.dev/compare/runs/d4a2eb29f0934963a9724a1cd0ef533e...b55c3b126ca54d20986ab43c6af2a42a/)
   Buildkite builds:
   [Finished] [`576c33c8` ec2-t3-xlarge-us-east-2](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ec2-t3-xlarge-us-east-2/builds/761)
   [Failed] [`576c33c8` test-mac-arm](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-test-mac-arm/builds/758)
   [Finished] [`576c33c8` ursa-i9-9960x](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-i9-9960x/builds/748)
   [Finished] [`576c33c8` ursa-thinkcentre-m75q](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-thinkcentre-m75q/builds/764)
   [Finished] [`390fb095` ec2-t3-xlarge-us-east-2](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ec2-t3-xlarge-us-east-2/builds/760)
   [Failed] [`390fb095` test-mac-arm](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-test-mac-arm/builds/757)
   [Finished] [`390fb095` ursa-i9-9960x](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-i9-9960x/builds/747)
   [Finished] [`390fb095` ursa-thinkcentre-m75q](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-thinkcentre-m75q/builds/763)
   Supported benchmarks:
   ec2-t3-xlarge-us-east-2: Supported benchmark langs: Python, R. Runs only benchmarks with cloud = True
   test-mac-arm: Supported benchmark langs: C++, Python, R
   ursa-i9-9960x: Supported benchmark langs: Python, R, JavaScript
   ursa-thinkcentre-m75q: Supported benchmark langs: C++, Java
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] mdepero commented on a diff in pull request #13141: ARROW-16563: [Go][Parquet] Fix broken parquet plain boolean decoder

Posted by GitBox <gi...@apache.org>.
mdepero commented on code in PR #13141:
URL: https://github.com/apache/arrow/pull/13141#discussion_r872644045


##########
go/parquet/internal/encoding/boolean_decoder.go:
##########
@@ -37,48 +37,57 @@ func (PlainBooleanDecoder) Type() parquet.Type {
 	return parquet.Types.Boolean
 }
 
+func (dec *PlainBooleanDecoder) SetData(nvals int, data []byte) error {
+	if err := dec.decoder.SetData(nvals, data); err != nil {
+		return err
+	}
+	dec.bitOffset = 0
+	return nil
+}
+
 // Decode fills out with bools decoded from the data at the current point
 // or until we reach the end of the data.
 //
 // Returns the number of values decoded
 func (dec *PlainBooleanDecoder) Decode(out []bool) (int, error) {
 	max := shared_utils.MinInt(len(out), dec.nvals)
 
-	unalignedExtract := func(start, end, curBitOffset int) int {
-		i := start
-		for ; curBitOffset < end && i < max; i, curBitOffset = i+1, curBitOffset+1 {
-			out[i] = (dec.data[0] & byte(1<<curBitOffset)) != 0
+	// attempts to read all remaining bool values from the current data byte
+	unalignedExtract := func(i int) int {
+		for ; dec.bitOffset < 8 && i < max; i, dec.bitOffset = i+1, dec.bitOffset+1 {
+			out[i] = (dec.data[0] & byte(1<<dec.bitOffset)) != 0
 		}
-		return i // return the number of bits we extracted
+		if dec.bitOffset == 8 {
+			// we read every bit from this byte
+			dec.bitOffset = 0
+			dec.data = dec.data[1:] // move data forward
+		}
+		return i // return the next index for out[]
 	}
 
 	// if we aren't at a byte boundary, then get bools until we hit
 	// a byte boundary with the bit offset.
 	i := 0
 	if dec.bitOffset != 0 {
-		i = unalignedExtract(0, 8, dec.bitOffset)
-		dec.bitOffset = (dec.bitOffset + i) % 8
+		i = unalignedExtract(i)
 	}
 
 	// determine the number of full bytes worth of bits we can decode
 	// given the number of values we want to decode.
 	bitsRemain := max - i
-	batch := bitsRemain / 8 * 8
+	batch := (bitsRemain / 8) * 8
 	if batch > 0 { // only go in here if there's at least one full byte to decode
-		if i > 0 { // skip our data forward if we decoded anything above
-			dec.data = dec.data[1:]
-			out = out[i:]
-		}
 		// determine the number of aligned bytes we can grab using SIMD optimized
 		// functions to improve performance.
 		alignedBytes := bitutil.BytesForBits(int64(batch))
-		utils.BytesToBools(dec.data[:alignedBytes], out)
-		dec.data = dec.data[alignedBytes:]
-		out = out[alignedBytes*8:]
+		utils.BytesToBools(dec.data[:alignedBytes], out[i:])
+
+		dec.data = dec.data[alignedBytes:] // move data forward
+		i += int(alignedBytes) * 8
 	}
 
 	// grab any trailing bits now that we've got our aligned bytes.
-	dec.bitOffset += unalignedExtract(dec.bitOffset, bitsRemain-batch, dec.bitOffset)
+	i = unalignedExtract(i)

Review Comment:
   +1, should be fixed and happy to merge after tests pass. Thanks for picking up quickly!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] github-actions[bot] commented on pull request #13141: ARROW-16563: [Go][Parquet] Fix broken parquet plain boolean decoder

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on PR #13141:
URL: https://github.com/apache/arrow/pull/13141#issuecomment-1125487852

   https://issues.apache.org/jira/browse/ARROW-16563


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] github-actions[bot] commented on pull request #13141: [Go] Fix broken parquet plain boolean decoder

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on PR #13141:
URL: https://github.com/apache/arrow/pull/13141#issuecomment-1125482778

   <!--
     Licensed to the Apache Software Foundation (ASF) under one
     or more contributor license agreements.  See the NOTICE file
     distributed with this work for additional information
     regarding copyright ownership.  The ASF licenses this file
     to you under the Apache License, Version 2.0 (the
     "License"); you may not use this file except in compliance
     with the License.  You may obtain a copy of the License at
   
       http://www.apache.org/licenses/LICENSE-2.0
   
     Unless required by applicable law or agreed to in writing,
     software distributed under the License is distributed on an
     "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
     KIND, either express or implied.  See the License for the
     specific language governing permissions and limitations
     under the License.
   -->
   
   Thanks for opening a pull request!
   
   If this is not a [minor PR](https://github.com/apache/arrow/blob/master/CONTRIBUTING.md#Minor-Fixes). Could you open an issue for this pull request on JIRA? https://issues.apache.org/jira/browse/ARROW
   
   Opening JIRAs ahead of time contributes to the [Openness](http://theapacheway.com/open/#:~:text=Openness%20allows%20new%20users%20the,must%20happen%20in%20the%20open.) of the Apache Arrow project.
   
   Then could you also rename pull request title in the following format?
   
       ARROW-${JIRA_ID}: [${COMPONENT}] ${SUMMARY}
   
   or
   
       MINOR: [${COMPONENT}] ${SUMMARY}
   
   See also:
   
     * [Other pull requests](https://github.com/apache/arrow/pulls/)
     * [Contribution Guidelines - How to contribute patches](https://arrow.apache.org/docs/developers/contributing.html#how-to-contribute-patches)
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] github-actions[bot] commented on pull request #13141: ARROW-16563: [Go][Parquet] Fix broken parquet plain boolean decoder

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on PR #13141:
URL: https://github.com/apache/arrow/pull/13141#issuecomment-1125487862

   :warning: Ticket **has not been started in JIRA**, please click 'Start Progress'.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade closed pull request #13141: ARROW-16563: [Go][Parquet] Fix broken parquet plain boolean decoder

Posted by GitBox <gi...@apache.org>.
zeroshade closed pull request #13141: ARROW-16563: [Go][Parquet] Fix broken parquet plain boolean decoder
URL: https://github.com/apache/arrow/pull/13141


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] ursabot commented on pull request #13141: ARROW-16563: [Go][Parquet] Fix broken parquet plain boolean decoder

Posted by GitBox <gi...@apache.org>.
ursabot commented on PR #13141:
URL: https://github.com/apache/arrow/pull/13141#issuecomment-1126994151

   ['Python', 'R'] benchmarks have high level of regressions.
   [ursa-i9-9960x](https://conbench.ursa.dev/compare/runs/658876f0408148958a50484b3365045e...e0f3e0015e794a32b0f5360a1add406e/)
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #13141: ARROW-16563: [Go][Parquet] Fix broken parquet plain boolean decoder

Posted by GitBox <gi...@apache.org>.
zeroshade commented on code in PR #13141:
URL: https://github.com/apache/arrow/pull/13141#discussion_r872634643


##########
go/parquet/internal/encoding/boolean_decoder.go:
##########
@@ -37,48 +37,57 @@ func (PlainBooleanDecoder) Type() parquet.Type {
 	return parquet.Types.Boolean
 }
 
+func (dec *PlainBooleanDecoder) SetData(nvals int, data []byte) error {
+	if err := dec.decoder.SetData(nvals, data); err != nil {
+		return err
+	}
+	dec.bitOffset = 0
+	return nil
+}
+
 // Decode fills out with bools decoded from the data at the current point
 // or until we reach the end of the data.
 //
 // Returns the number of values decoded
 func (dec *PlainBooleanDecoder) Decode(out []bool) (int, error) {
 	max := shared_utils.MinInt(len(out), dec.nvals)
 
-	unalignedExtract := func(start, end, curBitOffset int) int {
-		i := start
-		for ; curBitOffset < end && i < max; i, curBitOffset = i+1, curBitOffset+1 {
-			out[i] = (dec.data[0] & byte(1<<curBitOffset)) != 0
+	// attempts to read all remaining bool values from the current data byte
+	unalignedExtract := func(i int) int {
+		for ; dec.bitOffset < 8 && i < max; i, dec.bitOffset = i+1, dec.bitOffset+1 {
+			out[i] = (dec.data[0] & byte(1<<dec.bitOffset)) != 0
 		}
-		return i // return the number of bits we extracted
+		if dec.bitOffset == 8 {
+			// we read every bit from this byte
+			dec.bitOffset = 0
+			dec.data = dec.data[1:] // move data forward
+		}
+		return i // return the next index for out[]
 	}
 
 	// if we aren't at a byte boundary, then get bools until we hit
 	// a byte boundary with the bit offset.
 	i := 0
 	if dec.bitOffset != 0 {
-		i = unalignedExtract(0, 8, dec.bitOffset)
-		dec.bitOffset = (dec.bitOffset + i) % 8
+		i = unalignedExtract(i)
 	}
 
 	// determine the number of full bytes worth of bits we can decode
 	// given the number of values we want to decode.
 	bitsRemain := max - i
-	batch := bitsRemain / 8 * 8
+	batch := (bitsRemain / 8) * 8
 	if batch > 0 { // only go in here if there's at least one full byte to decode
-		if i > 0 { // skip our data forward if we decoded anything above
-			dec.data = dec.data[1:]
-			out = out[i:]
-		}
 		// determine the number of aligned bytes we can grab using SIMD optimized
 		// functions to improve performance.
 		alignedBytes := bitutil.BytesForBits(int64(batch))
-		utils.BytesToBools(dec.data[:alignedBytes], out)
-		dec.data = dec.data[alignedBytes:]
-		out = out[alignedBytes*8:]
+		utils.BytesToBools(dec.data[:alignedBytes], out[i:])
+
+		dec.data = dec.data[alignedBytes:] // move data forward
+		i += int(alignedBytes) * 8
 	}
 
 	// grab any trailing bits now that we've got our aligned bytes.
-	dec.bitOffset += unalignedExtract(dec.bitOffset, bitsRemain-batch, dec.bitOffset)
+	i = unalignedExtract(i)

Review Comment:
   the linter is throwing an error because this value is never used. the `i = ` should probably be removed.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org