Posted to commits@arrow.apache.org by ze...@apache.org on 2022/10/20 15:57:34 UTC

[arrow] branch master updated: ARROW-18081: [Go] Add Scalar Boolean functions (#14442)

This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new e06e98db35 ARROW-18081: [Go] Add Scalar Boolean functions (#14442)
e06e98db35 is described below

commit e06e98db356e602212019cfbae83fd3d5347292d
Author: Matt Topol <zo...@gmail.com>
AuthorDate: Thu Oct 20 11:57:26 2022 -0400

    ARROW-18081: [Go] Add Scalar Boolean functions (#14442)
    
    Authored-by: Matt Topol <zo...@gmail.com>
    Signed-off-by: Matt Topol <zo...@gmail.com>
---
 ci/scripts/go_test.sh                              |   2 +-
 go/arrow/bitutil/_lib/bitmap_ops.c                 |  14 +-
 go/arrow/bitutil/_lib/bitmap_ops_avx2_amd64.s      | 198 ++++++++++++
 go/arrow/bitutil/_lib/bitmap_ops_sse4_amd64.s      | 258 ++++++++++++++++
 go/arrow/bitutil/bitmap_ops.go                     |  46 +++
 go/arrow/bitutil/bitmap_ops_amd64.go               |   6 +
 go/arrow/bitutil/bitmap_ops_arm64.go               |   2 +
 go/arrow/bitutil/bitmap_ops_avx2_amd64.go          |  14 +
 go/arrow/bitutil/bitmap_ops_avx2_amd64.s           | 181 +++++++++++
 go/arrow/bitutil/bitmap_ops_noasm.go               |   2 +
 go/arrow/bitutil/bitmap_ops_ppc64le.go             |   2 +
 go/arrow/bitutil/bitmap_ops_s390x.go               |   2 +
 go/arrow/bitutil/bitmap_ops_sse4_amd64.go          |  14 +
 go/arrow/bitutil/bitmap_ops_sse4_amd64.s           | 245 +++++++++++++++
 go/arrow/bitutil/bitmaps.go                        | 175 ++++++++++-
 .../compute/internal/kernels/scalar_boolean.go     | 332 +++++++++++++++++++++
 go/arrow/compute/internal/kernels/types.go         |  43 +++
 go/arrow/compute/registry.go                       |   1 +
 go/arrow/compute/scalar_bool.go                    | 131 ++++++++
 go/arrow/compute/scalar_bool_test.go               | 152 ++++++++++
 go/go.sum                                          |   1 +
 21 files changed, 1812 insertions(+), 9 deletions(-)

diff --git a/ci/scripts/go_test.sh b/ci/scripts/go_test.sh
index e31fa55564..54b05c3cc2 100755
--- a/ci/scripts/go_test.sh
+++ b/ci/scripts/go_test.sh
@@ -61,7 +61,7 @@ pushd ${source_dir}/arrow
 TAGS="assert,test"
 if [[ -n "${ARROW_GO_TESTCGO}" ]]; then
     if [[ "${MSYSTEM}" = "MINGW64" ]]; then
-        export PATH=${MINGW_PREFIX}/bin:$PATH
+        export PATH=${MINGW_PREFIX}\\bin:${MINGW_PREFIX}\\lib:$PATH
     fi
     TAGS="${TAGS},ccalloc"
 fi
diff --git a/go/arrow/bitutil/_lib/bitmap_ops.c b/go/arrow/bitutil/_lib/bitmap_ops.c
index 96817b2f2b..f48b4d4d82 100644
--- a/go/arrow/bitutil/_lib/bitmap_ops.c
+++ b/go/arrow/bitutil/_lib/bitmap_ops.c
@@ -31,4 +31,16 @@ void FULL_NAME(bitmap_aligned_or)(const uint8_t* left, const uint8_t* right, uin
     for (int64_t i = 0; i < nbytes; ++i) {
         out[i] = left[i] | right[i];
     }
-}
\ No newline at end of file
+}
+
+void FULL_NAME(bitmap_aligned_and_not)(const uint8_t* left, const uint8_t* right, uint8_t* out, const int64_t nbytes) {
+    for (int64_t i = 0; i < nbytes; ++i) {
+        out[i] = left[i] & ~right[i];
+    }
+}
+
+void FULL_NAME(bitmap_aligned_xor)(const uint8_t* left, const uint8_t* right, uint8_t* out, const int64_t nbytes) {
+    for (int64_t i = 0; i < nbytes; ++i) {
+        out[i] = left[i] ^ right[i];
+    }
+}
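
The two C loops above are the scalar templates for the SIMD kernels that follow: clang compiles them into the _lib assembly listings, which are then translated into the Go assembly stubs further down. As a minimal standalone sketch (illustrative names only, not part of the package API), the same byte-wise semantics in pure Go, where the &^ operator is the direct equivalent of the C expression l & ~r:

    package main

    import "fmt"

    func bitmapAndNot(left, right, out []byte) {
        for i := range out {
            out[i] = left[i] &^ right[i] // Go's AND-NOT, same as l & ~r in C
        }
    }

    func bitmapXor(left, right, out []byte) {
        for i := range out {
            out[i] = left[i] ^ right[i]
        }
    }

    func main() {
        out := make([]byte, 1)
        bitmapAndNot([]byte{0b1100}, []byte{0b1010}, out)
        fmt.Printf("and_not: %04b\n", out[0]) // 0100
        bitmapXor([]byte{0b1100}, []byte{0b1010}, out)
        fmt.Printf("xor:     %04b\n", out[0]) // 0110
    }
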
diff --git a/go/arrow/bitutil/_lib/bitmap_ops_avx2_amd64.s b/go/arrow/bitutil/_lib/bitmap_ops_avx2_amd64.s
index 69f69d2970..a4010dab55 100644
--- a/go/arrow/bitutil/_lib/bitmap_ops_avx2_amd64.s
+++ b/go/arrow/bitutil/_lib/bitmap_ops_avx2_amd64.s
@@ -207,6 +207,204 @@ bitmap_aligned_or_avx2:                 # @bitmap_aligned_or_avx2
 .Lfunc_end1:
 	.size	bitmap_aligned_or_avx2, .Lfunc_end1-bitmap_aligned_or_avx2
                                         # -- End function
+	.globl	bitmap_aligned_and_not_avx2     # -- Begin function bitmap_aligned_and_not_avx2
+	.p2align	4, 0x90
+	.type	bitmap_aligned_and_not_avx2,@function
+bitmap_aligned_and_not_avx2:            # @bitmap_aligned_and_not_avx2
+# %bb.0:
+	push	rbp
+	mov	rbp, rsp
+	push	rbx
+	and	rsp, -8
+	test	rcx, rcx
+	jle	.LBB2_12
+# %bb.1:
+	cmp	rcx, 127
+	ja	.LBB2_7
+# %bb.2:
+	xor	r8d, r8d
+	jmp	.LBB2_3
+.LBB2_7:
+	lea	r8, [rdx + rcx]
+	lea	rax, [rdi + rcx]
+	cmp	rax, rdx
+	seta	r11b
+	lea	rax, [rsi + rcx]
+	cmp	r8, rdi
+	seta	bl
+	cmp	rax, rdx
+	seta	r10b
+	cmp	r8, rsi
+	seta	r9b
+	xor	r8d, r8d
+	test	r11b, bl
+	jne	.LBB2_3
+# %bb.8:
+	and	r10b, r9b
+	jne	.LBB2_3
+# %bb.9:
+	mov	r8, rcx
+	and	r8, -128
+	xor	eax, eax
+	.p2align	4, 0x90
+.LBB2_10:                               # =>This Inner Loop Header: Depth=1
+	vmovups	ymm0, ymmword ptr [rsi + rax]
+	vmovups	ymm1, ymmword ptr [rsi + rax + 32]
+	vmovups	ymm2, ymmword ptr [rsi + rax + 64]
+	vmovups	ymm3, ymmword ptr [rsi + rax + 96]
+	vandnps	ymm0, ymm0, ymmword ptr [rdi + rax]
+	vandnps	ymm1, ymm1, ymmword ptr [rdi + rax + 32]
+	vandnps	ymm2, ymm2, ymmword ptr [rdi + rax + 64]
+	vandnps	ymm3, ymm3, ymmword ptr [rdi + rax + 96]
+	vmovups	ymmword ptr [rdx + rax], ymm0
+	vmovups	ymmword ptr [rdx + rax + 32], ymm1
+	vmovups	ymmword ptr [rdx + rax + 64], ymm2
+	vmovups	ymmword ptr [rdx + rax + 96], ymm3
+	sub	rax, -128
+	cmp	r8, rax
+	jne	.LBB2_10
+# %bb.11:
+	cmp	r8, rcx
+	je	.LBB2_12
+.LBB2_3:
+	mov	r9, r8
+	not	r9
+	test	cl, 1
+	je	.LBB2_5
+# %bb.4:
+	mov	al, byte ptr [rsi + r8]
+	not	al
+	and	al, byte ptr [rdi + r8]
+	mov	byte ptr [rdx + r8], al
+	or	r8, 1
+.LBB2_5:
+	add	r9, rcx
+	je	.LBB2_12
+	.p2align	4, 0x90
+.LBB2_6:                                # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rsi + r8]
+	not	al
+	and	al, byte ptr [rdi + r8]
+	mov	byte ptr [rdx + r8], al
+	movzx	eax, byte ptr [rsi + r8 + 1]
+	not	al
+	and	al, byte ptr [rdi + r8 + 1]
+	mov	byte ptr [rdx + r8 + 1], al
+	add	r8, 2
+	cmp	rcx, r8
+	jne	.LBB2_6
+.LBB2_12:
+	lea	rsp, [rbp - 8]
+	pop	rbx
+	pop	rbp
+	vzeroupper
+	ret
+.Lfunc_end2:
+	.size	bitmap_aligned_and_not_avx2, .Lfunc_end2-bitmap_aligned_and_not_avx2
+                                        # -- End function
+	.globl	bitmap_aligned_xor_avx2         # -- Begin function bitmap_aligned_xor_avx2
+	.p2align	4, 0x90
+	.type	bitmap_aligned_xor_avx2,@function
+bitmap_aligned_xor_avx2:                # @bitmap_aligned_xor_avx2
+# %bb.0:
+	push	rbp
+	mov	rbp, rsp
+	push	rbx
+	and	rsp, -8
+	test	rcx, rcx
+	jle	.LBB3_12
+# %bb.1:
+	cmp	rcx, 127
+	ja	.LBB3_7
+# %bb.2:
+	xor	r10d, r10d
+	jmp	.LBB3_3
+.LBB3_7:
+	lea	r9, [rdx + rcx]
+	lea	rax, [rdi + rcx]
+	cmp	rax, rdx
+	seta	r11b
+	lea	rax, [rsi + rcx]
+	cmp	r9, rdi
+	seta	bl
+	cmp	rax, rdx
+	seta	r8b
+	cmp	r9, rsi
+	seta	r9b
+	xor	r10d, r10d
+	test	r11b, bl
+	jne	.LBB3_3
+# %bb.8:
+	and	r8b, r9b
+	jne	.LBB3_3
+# %bb.9:
+	mov	r10, rcx
+	and	r10, -128
+	xor	r8d, r8d
+	.p2align	4, 0x90
+.LBB3_10:                               # =>This Inner Loop Header: Depth=1
+	vmovups	ymm0, ymmword ptr [rsi + r8]
+	vmovups	ymm1, ymmword ptr [rsi + r8 + 32]
+	vmovups	ymm2, ymmword ptr [rsi + r8 + 64]
+	vmovups	ymm3, ymmword ptr [rsi + r8 + 96]
+	vxorps	ymm0, ymm0, ymmword ptr [rdi + r8]
+	vxorps	ymm1, ymm1, ymmword ptr [rdi + r8 + 32]
+	vxorps	ymm2, ymm2, ymmword ptr [rdi + r8 + 64]
+	vxorps	ymm3, ymm3, ymmword ptr [rdi + r8 + 96]
+	vmovups	ymmword ptr [rdx + r8], ymm0
+	vmovups	ymmword ptr [rdx + r8 + 32], ymm1
+	vmovups	ymmword ptr [rdx + r8 + 64], ymm2
+	vmovups	ymmword ptr [rdx + r8 + 96], ymm3
+	sub	r8, -128
+	cmp	r10, r8
+	jne	.LBB3_10
+# %bb.11:
+	cmp	r10, rcx
+	je	.LBB3_12
+.LBB3_3:
+	mov	r8, r10
+	not	r8
+	add	r8, rcx
+	mov	r9, rcx
+	and	r9, 3
+	je	.LBB3_5
+	.p2align	4, 0x90
+.LBB3_4:                                # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rsi + r10]
+	xor	al, byte ptr [rdi + r10]
+	mov	byte ptr [rdx + r10], al
+	add	r10, 1
+	add	r9, -1
+	jne	.LBB3_4
+.LBB3_5:
+	cmp	r8, 3
+	jb	.LBB3_12
+	.p2align	4, 0x90
+.LBB3_6:                                # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rsi + r10]
+	xor	al, byte ptr [rdi + r10]
+	mov	byte ptr [rdx + r10], al
+	movzx	eax, byte ptr [rsi + r10 + 1]
+	xor	al, byte ptr [rdi + r10 + 1]
+	mov	byte ptr [rdx + r10 + 1], al
+	movzx	eax, byte ptr [rsi + r10 + 2]
+	xor	al, byte ptr [rdi + r10 + 2]
+	mov	byte ptr [rdx + r10 + 2], al
+	movzx	eax, byte ptr [rsi + r10 + 3]
+	xor	al, byte ptr [rdi + r10 + 3]
+	mov	byte ptr [rdx + r10 + 3], al
+	add	r10, 4
+	cmp	rcx, r10
+	jne	.LBB3_6
+.LBB3_12:
+	lea	rsp, [rbp - 8]
+	pop	rbx
+	pop	rbp
+	vzeroupper
+	ret
+.Lfunc_end3:
+	.size	bitmap_aligned_xor_avx2, .Lfunc_end3-bitmap_aligned_xor_avx2
+                                        # -- End function
 	.ident	"Ubuntu clang version 11.1.0-6"
 	.section	".note.GNU-stack","",@progbits
 	.addrsig
diff --git a/go/arrow/bitutil/_lib/bitmap_ops_sse4_amd64.s b/go/arrow/bitutil/_lib/bitmap_ops_sse4_amd64.s
index 9d028155b7..840c1a623b 100644
--- a/go/arrow/bitutil/_lib/bitmap_ops_sse4_amd64.s
+++ b/go/arrow/bitutil/_lib/bitmap_ops_sse4_amd64.s
@@ -267,6 +267,264 @@ bitmap_aligned_or_sse4:                 # @bitmap_aligned_or_sse4
 .Lfunc_end1:
 	.size	bitmap_aligned_or_sse4, .Lfunc_end1-bitmap_aligned_or_sse4
                                         # -- End function
+	.globl	bitmap_aligned_and_not_sse4     # -- Begin function bitmap_aligned_and_not_sse4
+	.p2align	4, 0x90
+	.type	bitmap_aligned_and_not_sse4,@function
+bitmap_aligned_and_not_sse4:            # @bitmap_aligned_and_not_sse4
+# %bb.0:
+	push	rbp
+	mov	rbp, rsp
+	push	rbx
+	and	rsp, -8
+	test	rcx, rcx
+	jle	.LBB2_16
+# %bb.1:
+	cmp	rcx, 31
+	ja	.LBB2_7
+# %bb.2:
+	xor	r11d, r11d
+.LBB2_3:
+	mov	r8, r11
+	not	r8
+	test	cl, 1
+	je	.LBB2_5
+# %bb.4:
+	mov	al, byte ptr [rsi + r11]
+	not	al
+	and	al, byte ptr [rdi + r11]
+	mov	byte ptr [rdx + r11], al
+	or	r11, 1
+.LBB2_5:
+	add	r8, rcx
+	je	.LBB2_16
+	.p2align	4, 0x90
+.LBB2_6:                                # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rsi + r11]
+	not	al
+	and	al, byte ptr [rdi + r11]
+	mov	byte ptr [rdx + r11], al
+	movzx	eax, byte ptr [rsi + r11 + 1]
+	not	al
+	and	al, byte ptr [rdi + r11 + 1]
+	mov	byte ptr [rdx + r11 + 1], al
+	add	r11, 2
+	cmp	rcx, r11
+	jne	.LBB2_6
+	jmp	.LBB2_16
+.LBB2_7:
+	lea	r9, [rdx + rcx]
+	lea	rax, [rdi + rcx]
+	cmp	rax, rdx
+	seta	r10b
+	lea	rax, [rsi + rcx]
+	cmp	r9, rdi
+	seta	bl
+	cmp	rax, rdx
+	seta	r8b
+	cmp	r9, rsi
+	seta	r9b
+	xor	r11d, r11d
+	test	r10b, bl
+	jne	.LBB2_3
+# %bb.8:
+	and	r8b, r9b
+	jne	.LBB2_3
+# %bb.9:
+	mov	r11, rcx
+	and	r11, -32
+	lea	rax, [r11 - 32]
+	mov	r9, rax
+	shr	r9, 5
+	add	r9, 1
+	test	rax, rax
+	je	.LBB2_10
+# %bb.11:
+	mov	r10, r9
+	and	r10, -2
+	neg	r10
+	xor	r8d, r8d
+	.p2align	4, 0x90
+.LBB2_12:                               # =>This Inner Loop Header: Depth=1
+	movups	xmm0, xmmword ptr [rdi + r8]
+	movups	xmm1, xmmword ptr [rdi + r8 + 16]
+	movups	xmm2, xmmword ptr [rsi + r8]
+	andnps	xmm2, xmm0
+	movups	xmm0, xmmword ptr [rsi + r8 + 16]
+	andnps	xmm0, xmm1
+	movups	xmmword ptr [rdx + r8], xmm2
+	movups	xmmword ptr [rdx + r8 + 16], xmm0
+	movups	xmm0, xmmword ptr [rdi + r8 + 32]
+	movups	xmm1, xmmword ptr [rdi + r8 + 48]
+	movups	xmm2, xmmword ptr [rsi + r8 + 32]
+	andnps	xmm2, xmm0
+	movups	xmm0, xmmword ptr [rsi + r8 + 48]
+	andnps	xmm0, xmm1
+	movups	xmmword ptr [rdx + r8 + 32], xmm2
+	movups	xmmword ptr [rdx + r8 + 48], xmm0
+	add	r8, 64
+	add	r10, 2
+	jne	.LBB2_12
+# %bb.13:
+	test	r9b, 1
+	je	.LBB2_15
+.LBB2_14:
+	movups	xmm0, xmmword ptr [rdi + r8]
+	movups	xmm1, xmmword ptr [rdi + r8 + 16]
+	movups	xmm2, xmmword ptr [rsi + r8]
+	andnps	xmm2, xmm0
+	movups	xmm0, xmmword ptr [rsi + r8 + 16]
+	andnps	xmm0, xmm1
+	movups	xmmword ptr [rdx + r8], xmm2
+	movups	xmmword ptr [rdx + r8 + 16], xmm0
+.LBB2_15:
+	cmp	r11, rcx
+	jne	.LBB2_3
+.LBB2_16:
+	lea	rsp, [rbp - 8]
+	pop	rbx
+	pop	rbp
+	ret
+.LBB2_10:
+	xor	r8d, r8d
+	test	r9b, 1
+	jne	.LBB2_14
+	jmp	.LBB2_15
+.Lfunc_end2:
+	.size	bitmap_aligned_and_not_sse4, .Lfunc_end2-bitmap_aligned_and_not_sse4
+                                        # -- End function
+	.globl	bitmap_aligned_xor_sse4         # -- Begin function bitmap_aligned_xor_sse4
+	.p2align	4, 0x90
+	.type	bitmap_aligned_xor_sse4,@function
+bitmap_aligned_xor_sse4:                # @bitmap_aligned_xor_sse4
+# %bb.0:
+	push	rbp
+	mov	rbp, rsp
+	push	rbx
+	and	rsp, -8
+	test	rcx, rcx
+	jle	.LBB3_16
+# %bb.1:
+	cmp	rcx, 31
+	ja	.LBB3_7
+# %bb.2:
+	xor	r11d, r11d
+.LBB3_3:
+	mov	r8, r11
+	not	r8
+	add	r8, rcx
+	mov	r9, rcx
+	and	r9, 3
+	je	.LBB3_5
+	.p2align	4, 0x90
+.LBB3_4:                                # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rsi + r11]
+	xor	al, byte ptr [rdi + r11]
+	mov	byte ptr [rdx + r11], al
+	add	r11, 1
+	add	r9, -1
+	jne	.LBB3_4
+.LBB3_5:
+	cmp	r8, 3
+	jb	.LBB3_16
+	.p2align	4, 0x90
+.LBB3_6:                                # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rsi + r11]
+	xor	al, byte ptr [rdi + r11]
+	mov	byte ptr [rdx + r11], al
+	movzx	eax, byte ptr [rsi + r11 + 1]
+	xor	al, byte ptr [rdi + r11 + 1]
+	mov	byte ptr [rdx + r11 + 1], al
+	movzx	eax, byte ptr [rsi + r11 + 2]
+	xor	al, byte ptr [rdi + r11 + 2]
+	mov	byte ptr [rdx + r11 + 2], al
+	movzx	eax, byte ptr [rsi + r11 + 3]
+	xor	al, byte ptr [rdi + r11 + 3]
+	mov	byte ptr [rdx + r11 + 3], al
+	add	r11, 4
+	cmp	rcx, r11
+	jne	.LBB3_6
+	jmp	.LBB3_16
+.LBB3_7:
+	lea	r9, [rdx + rcx]
+	lea	rax, [rdi + rcx]
+	cmp	rax, rdx
+	seta	r10b
+	lea	rax, [rsi + rcx]
+	cmp	r9, rdi
+	seta	bl
+	cmp	rax, rdx
+	seta	r8b
+	cmp	r9, rsi
+	seta	r9b
+	xor	r11d, r11d
+	test	r10b, bl
+	jne	.LBB3_3
+# %bb.8:
+	and	r8b, r9b
+	jne	.LBB3_3
+# %bb.9:
+	mov	r11, rcx
+	and	r11, -32
+	lea	rax, [r11 - 32]
+	mov	r9, rax
+	shr	r9, 5
+	add	r9, 1
+	test	rax, rax
+	je	.LBB3_10
+# %bb.11:
+	mov	r10, r9
+	and	r10, -2
+	neg	r10
+	xor	r8d, r8d
+	.p2align	4, 0x90
+.LBB3_12:                               # =>This Inner Loop Header: Depth=1
+	movups	xmm0, xmmword ptr [rdi + r8]
+	movups	xmm1, xmmword ptr [rdi + r8 + 16]
+	movups	xmm2, xmmword ptr [rsi + r8]
+	xorps	xmm2, xmm0
+	movups	xmm0, xmmword ptr [rsi + r8 + 16]
+	xorps	xmm0, xmm1
+	movups	xmmword ptr [rdx + r8], xmm2
+	movups	xmmword ptr [rdx + r8 + 16], xmm0
+	movups	xmm0, xmmword ptr [rdi + r8 + 32]
+	movups	xmm1, xmmword ptr [rdi + r8 + 48]
+	movups	xmm2, xmmword ptr [rsi + r8 + 32]
+	xorps	xmm2, xmm0
+	movups	xmm0, xmmword ptr [rsi + r8 + 48]
+	xorps	xmm0, xmm1
+	movups	xmmword ptr [rdx + r8 + 32], xmm2
+	movups	xmmword ptr [rdx + r8 + 48], xmm0
+	add	r8, 64
+	add	r10, 2
+	jne	.LBB3_12
+# %bb.13:
+	test	r9b, 1
+	je	.LBB3_15
+.LBB3_14:
+	movups	xmm0, xmmword ptr [rdi + r8]
+	movups	xmm1, xmmword ptr [rdi + r8 + 16]
+	movups	xmm2, xmmword ptr [rsi + r8]
+	xorps	xmm2, xmm0
+	movups	xmm0, xmmword ptr [rsi + r8 + 16]
+	xorps	xmm0, xmm1
+	movups	xmmword ptr [rdx + r8], xmm2
+	movups	xmmword ptr [rdx + r8 + 16], xmm0
+.LBB3_15:
+	cmp	r11, rcx
+	jne	.LBB3_3
+.LBB3_16:
+	lea	rsp, [rbp - 8]
+	pop	rbx
+	pop	rbp
+	ret
+.LBB3_10:
+	xor	r8d, r8d
+	test	r9b, 1
+	jne	.LBB3_14
+	jmp	.LBB3_15
+.Lfunc_end3:
+	.size	bitmap_aligned_xor_sse4, .Lfunc_end3-bitmap_aligned_xor_sse4
+                                        # -- End function
 	.ident	"Ubuntu clang version 11.1.0-6"
 	.section	".note.GNU-stack","",@progbits
 	.addrsig
diff --git a/go/arrow/bitutil/bitmap_ops.go b/go/arrow/bitutil/bitmap_ops.go
index 62322b04b9..7db750a6dd 100644
--- a/go/arrow/bitutil/bitmap_ops.go
+++ b/go/arrow/bitutil/bitmap_ops.go
@@ -39,6 +39,29 @@ func alignedBitAndGo(left, right, out []byte) {
 	}
 }
 
+func alignedBitAndNotGo(left, right, out []byte) {
+	var (
+		nbytes = len(out)
+		i      = 0
+	)
+	if nbytes > uint64SizeBytes {
+		// case where we have enough bytes to operate on words
+		leftWords := bytesToUint64(left[i:])
+		rightWords := bytesToUint64(right[i:])
+		outWords := bytesToUint64(out[i:])
+
+		for w := range outWords {
+			outWords[w] = leftWords[w] &^ rightWords[w]
+		}
+
+		i += len(outWords) * uint64SizeBytes
+	}
+	// grab any remaining bytes that were fewer than a word
+	for ; i < nbytes; i++ {
+		out[i] = left[i] &^ right[i]
+	}
+}
+
 func alignedBitOrGo(left, right, out []byte) {
 	var (
 		nbytes = len(out)
@@ -61,3 +84,26 @@ func alignedBitOrGo(left, right, out []byte) {
 		out[i] = left[i] | right[i]
 	}
 }
+
+func alignedBitXorGo(left, right, out []byte) {
+	var (
+		nbytes = len(out)
+		i      = 0
+	)
+	if nbytes > uint64SizeBytes {
+		// case where we have enough bytes to operate on words
+		leftWords := bytesToUint64(left[i:])
+		rightWords := bytesToUint64(right[i:])
+		outWords := bytesToUint64(out[i:])
+
+		for w := range outWords {
+			outWords[w] = leftWords[w] ^ rightWords[w]
+		}
+
+		i += len(outWords) * uint64SizeBytes
+	}
+	// grab any remaining bytes that were fewer than a word
+	for ; i < nbytes; i++ {
+		out[i] = left[i] ^ right[i]
+	}
+}
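
Both pure-Go fallbacks above use the same word-at-a-time pattern: reinterpret the aligned prefix of each buffer as []uint64 via bytesToUint64, operate on 64 bits per iteration, then finish the sub-word tail byte by byte. A self-contained sketch of that pattern under slightly different plumbing (it copies values through encoding/binary rather than reinterpreting the slice in place, trading some speed for needing no package-internal helpers):

    package main

    import (
        "encoding/binary"
        "fmt"
    )

    // xorBytes XORs left and right into out, 8 bytes per step where
    // possible, then finishes the tail byte by byte; all three slices
    // are assumed to be at least len(out) long.
    func xorBytes(left, right, out []byte) {
        i := 0
        for ; i+8 <= len(out); i += 8 {
            l := binary.LittleEndian.Uint64(left[i:])
            r := binary.LittleEndian.Uint64(right[i:])
            binary.LittleEndian.PutUint64(out[i:], l^r)
        }
        for ; i < len(out); i++ {
            out[i] = left[i] ^ right[i]
        }
    }

    func main() {
        left := []byte{0xff, 0x0f, 0xaa, 0x55, 1, 2, 3, 4, 5}
        right := []byte{0x0f, 0xff, 0x55, 0xaa, 4, 3, 2, 1, 5}
        out := make([]byte, len(left))
        xorBytes(left, right, out)
        fmt.Printf("% x\n", out) // f0 f0 ff ff 05 01 01 05 00
    }
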
diff --git a/go/arrow/bitutil/bitmap_ops_amd64.go b/go/arrow/bitutil/bitmap_ops_amd64.go
index 9aa5a6dd56..ad0fd674ab 100644
--- a/go/arrow/bitutil/bitmap_ops_amd64.go
+++ b/go/arrow/bitutil/bitmap_ops_amd64.go
@@ -25,11 +25,17 @@ func init() {
 	if cpu.X86.HasAVX2 {
 		bitAndOp.opAligned = bitmapAlignedAndAVX2
 		bitOrOp.opAligned = bitmapAlignedOrAVX2
+		bitAndNotOp.opAligned = bitmapAlignedAndNotAVX2
+		bitXorOp.opAligned = bitmapAlignedXorAVX2
 	} else if cpu.X86.HasSSE42 {
 		bitAndOp.opAligned = bitmapAlignedAndSSE4
 		bitOrOp.opAligned = bitmapAlignedOrSSE4
+		bitAndNotOp.opAligned = bitmapAlignedAndNotSSE4
+		bitXorOp.opAligned = bitmapAlignedXorSSE4
 	} else {
 		bitAndOp.opAligned = alignedBitAndGo
 		bitOrOp.opAligned = alignedBitOrGo
+		bitAndNotOp.opAligned = alignedBitAndNotGo
+		bitXorOp.opAligned = alignedBitXorGo
 	}
 }
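
The init function above selects the widest implementation the host CPU supports once, at package load time, and stores it in each op's function pointer, so a call dispatches through a single indirect call instead of re-checking CPU features. A runnable sketch of that dispatch pattern; in this sketch every branch binds the generic loop so it runs on any machine, whereas the real package binds the AVX2/SSE4 assembly kernels:

    package main

    import (
        "fmt"

        "golang.org/x/sys/cpu"
    )

    var xorImpl func(left, right, out []byte)

    func xorGo(left, right, out []byte) {
        for i := range out {
            out[i] = left[i] ^ right[i]
        }
    }

    func init() {
        // the real package binds bitmapAlignedXorAVX2 / bitmapAlignedXorSSE4
        // in the first two cases; here all cases use the generic loop
        switch {
        case cpu.X86.HasAVX2:
            xorImpl = xorGo
        case cpu.X86.HasSSE42:
            xorImpl = xorGo
        default:
            xorImpl = xorGo
        }
    }

    func main() {
        out := make([]byte, 2)
        xorImpl([]byte{0xf0, 0x0f}, []byte{0xff, 0xff}, out)
        fmt.Printf("% x\n", out) // 0f f0
    }
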
diff --git a/go/arrow/bitutil/bitmap_ops_arm64.go b/go/arrow/bitutil/bitmap_ops_arm64.go
index 86c47639a9..28d95d84ad 100644
--- a/go/arrow/bitutil/bitmap_ops_arm64.go
+++ b/go/arrow/bitutil/bitmap_ops_arm64.go
@@ -22,4 +22,6 @@ package bitutil
 func init() {
 	bitAndOp.opAligned = alignedBitAndGo
 	bitOrOp.opAligned = alignedBitOrGo
+	bitAndNotOp.opAligned = alignedBitAndNotGo
+	bitXorOp.opAligned = alignedBitXorGo
 }
diff --git a/go/arrow/bitutil/bitmap_ops_avx2_amd64.go b/go/arrow/bitutil/bitmap_ops_avx2_amd64.go
index 731b9807b7..1c01bd0f38 100644
--- a/go/arrow/bitutil/bitmap_ops_avx2_amd64.go
+++ b/go/arrow/bitutil/bitmap_ops_avx2_amd64.go
@@ -36,3 +36,17 @@ func _bitmap_aligned_or_avx2(left, right, out unsafe.Pointer, length int64)
 func bitmapAlignedOrAVX2(left, right, out []byte) {
 	_bitmap_aligned_or_avx2(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out)))
 }
+
+//go:noescape
+func _bitmap_aligned_and_not_avx2(left, right, out unsafe.Pointer, length int64)
+
+func bitmapAlignedAndNotAVX2(left, right, out []byte) {
+	_bitmap_aligned_and_not_avx2(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out)))
+}
+
+//go:noescape
+func _bitmap_aligned_xor_avx2(left, right, out unsafe.Pointer, length int64)
+
+func bitmapAlignedXorAVX2(left, right, out []byte) {
+	_bitmap_aligned_xor_avx2(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out)))
+}
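
Each wrapper above hands the slices' base addresses and the output length straight to the assembly routine; the //go:noescape annotation promises the compiler that the assembly does not retain those pointers, so the backing arrays need not escape to the heap. A sketch of that calling convention with a pure-Go stand-in for the assembly body (hypothetical names; unsafe.Slice requires Go 1.17 or later):

    package main

    import (
        "fmt"
        "unsafe"
    )

    // stand-in for the assembly kernel; the real one is declared with
    // //go:noescape and implemented in bitmap_ops_avx2_amd64.s
    func _bitmap_aligned_xor(left, right, out unsafe.Pointer, length int64) {
        l := unsafe.Slice((*byte)(left), length)
        r := unsafe.Slice((*byte)(right), length)
        o := unsafe.Slice((*byte)(out), length)
        for i := range o {
            o[i] = l[i] ^ r[i]
        }
    }

    func bitmapAlignedXor(left, right, out []byte) {
        // pass base pointers plus a length, exactly as the wrappers above do
        _bitmap_aligned_xor(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]),
            unsafe.Pointer(&out[0]), int64(len(out)))
    }

    func main() {
        out := make([]byte, 1)
        bitmapAlignedXor([]byte{0xf0}, []byte{0xff}, out)
        fmt.Printf("%02x\n", out[0]) // 0f
    }
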
diff --git a/go/arrow/bitutil/bitmap_ops_avx2_amd64.s b/go/arrow/bitutil/bitmap_ops_avx2_amd64.s
index 2e2ade8961..00172e8659 100644
--- a/go/arrow/bitutil/bitmap_ops_avx2_amd64.s
+++ b/go/arrow/bitutil/bitmap_ops_avx2_amd64.s
@@ -190,3 +190,184 @@ LBB1_6:
 LBB1_12:
 	VZEROUPPER
 	RET
+
+TEXT ·_bitmap_aligned_and_not_avx2(SB), $0-32
+
+	MOVQ left+0(FP), DI
+	MOVQ right+8(FP), SI
+	MOVQ out+16(FP), DX
+	MOVQ length+24(FP), CX
+
+	WORD $0x8548; BYTE $0xc9 // test    rcx, rcx
+	JLE  LBB2_12
+	LONG $0x7ff98348         // cmp    rcx, 127
+	JA   LBB2_7
+	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
+	JMP  LBB2_3
+
+LBB2_7:
+	LONG $0x0a048d4c         // lea    r8, [rdx + rcx]
+	LONG $0x0f048d48         // lea    rax, [rdi + rcx]
+	WORD $0x3948; BYTE $0xd0 // cmp    rax, rdx
+	LONG $0xd3970f41         // seta    r11b
+	LONG $0x0e048d48         // lea    rax, [rsi + rcx]
+	WORD $0x3949; BYTE $0xf8 // cmp    r8, rdi
+	WORD $0x970f; BYTE $0xd3 // seta    bl
+	WORD $0x3948; BYTE $0xd0 // cmp    rax, rdx
+	LONG $0xd2970f41         // seta    r10b
+	WORD $0x3949; BYTE $0xf0 // cmp    r8, rsi
+	LONG $0xd1970f41         // seta    r9b
+	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
+	WORD $0x8441; BYTE $0xdb // test    r11b, bl
+	JNE  LBB2_3
+	WORD $0x2045; BYTE $0xca // and    r10b, r9b
+	JNE  LBB2_3
+	WORD $0x8949; BYTE $0xc8 // mov    r8, rcx
+	LONG $0x80e08349         // and    r8, -128
+	WORD $0xc031             // xor    eax, eax
+
+LBB2_10:
+	LONG $0x0410fcc5; BYTE $0x06   // vmovups    ymm0, yword [rsi + rax]
+	LONG $0x4c10fcc5; WORD $0x2006 // vmovups    ymm1, yword [rsi + rax + 32]
+	LONG $0x5410fcc5; WORD $0x4006 // vmovups    ymm2, yword [rsi + rax + 64]
+	LONG $0x5c10fcc5; WORD $0x6006 // vmovups    ymm3, yword [rsi + rax + 96]
+	LONG $0x0455fcc5; BYTE $0x07   // vandnps    ymm0, ymm0, yword [rdi + rax]
+	LONG $0x4c55f4c5; WORD $0x2007 // vandnps    ymm1, ymm1, yword [rdi + rax + 32]
+	LONG $0x5455ecc5; WORD $0x4007 // vandnps    ymm2, ymm2, yword [rdi + rax + 64]
+	LONG $0x5c55e4c5; WORD $0x6007 // vandnps    ymm3, ymm3, yword [rdi + rax + 96]
+	LONG $0x0411fcc5; BYTE $0x02   // vmovups    yword [rdx + rax], ymm0
+	LONG $0x4c11fcc5; WORD $0x2002 // vmovups    yword [rdx + rax + 32], ymm1
+	LONG $0x5411fcc5; WORD $0x4002 // vmovups    yword [rdx + rax + 64], ymm2
+	LONG $0x5c11fcc5; WORD $0x6002 // vmovups    yword [rdx + rax + 96], ymm3
+	LONG $0x80e88348               // sub    rax, -128
+	WORD $0x3949; BYTE $0xc0       // cmp    r8, rax
+	JNE  LBB2_10
+	WORD $0x3949; BYTE $0xc8       // cmp    r8, rcx
+	JE   LBB2_12
+
+LBB2_3:
+	WORD $0x894d; BYTE $0xc1 // mov    r9, r8
+	WORD $0xf749; BYTE $0xd1 // not    r9
+	WORD $0xc1f6; BYTE $0x01 // test    cl, 1
+	JE   LBB2_5
+	LONG $0x06048a42         // mov    al, byte [rsi + r8]
+	WORD $0xd0f6             // not    al
+	LONG $0x07042242         // and    al, byte [rdi + r8]
+	LONG $0x02048842         // mov    byte [rdx + r8], al
+	LONG $0x01c88349         // or    r8, 1
+
+LBB2_5:
+	WORD $0x0149; BYTE $0xc9 // add    r9, rcx
+	JE   LBB2_12
+
+LBB2_6:
+	LONG $0x04b60f42; BYTE $0x06   // movzx    eax, byte [rsi + r8]
+	WORD $0xd0f6                   // not    al
+	LONG $0x07042242               // and    al, byte [rdi + r8]
+	LONG $0x02048842               // mov    byte [rdx + r8], al
+	LONG $0x44b60f42; WORD $0x0106 // movzx    eax, byte [rsi + r8 + 1]
+	WORD $0xd0f6                   // not    al
+	LONG $0x07442242; BYTE $0x01   // and    al, byte [rdi + r8 + 1]
+	LONG $0x02448842; BYTE $0x01   // mov    byte [rdx + r8 + 1], al
+	LONG $0x02c08349               // add    r8, 2
+	WORD $0x394c; BYTE $0xc1       // cmp    rcx, r8
+	JNE  LBB2_6
+
+LBB2_12:
+	VZEROUPPER
+	RET
+
+TEXT ·_bitmap_aligned_xor_avx2(SB), $0-32
+
+	MOVQ left+0(FP), DI
+	MOVQ right+8(FP), SI
+	MOVQ out+16(FP), DX
+	MOVQ length+24(FP), CX
+
+	WORD $0x8548; BYTE $0xc9 // test    rcx, rcx
+	JLE  LBB3_12
+	LONG $0x7ff98348         // cmp    rcx, 127
+	JA   LBB3_7
+	WORD $0x3145; BYTE $0xd2 // xor    r10d, r10d
+	JMP  LBB3_3
+
+LBB3_7:
+	LONG $0x0a0c8d4c         // lea    r9, [rdx + rcx]
+	LONG $0x0f048d48         // lea    rax, [rdi + rcx]
+	WORD $0x3948; BYTE $0xd0 // cmp    rax, rdx
+	LONG $0xd3970f41         // seta    r11b
+	LONG $0x0e048d48         // lea    rax, [rsi + rcx]
+	WORD $0x3949; BYTE $0xf9 // cmp    r9, rdi
+	WORD $0x970f; BYTE $0xd3 // seta    bl
+	WORD $0x3948; BYTE $0xd0 // cmp    rax, rdx
+	LONG $0xd0970f41         // seta    r8b
+	WORD $0x3949; BYTE $0xf1 // cmp    r9, rsi
+	LONG $0xd1970f41         // seta    r9b
+	WORD $0x3145; BYTE $0xd2 // xor    r10d, r10d
+	WORD $0x8441; BYTE $0xdb // test    r11b, bl
+	JNE  LBB3_3
+	WORD $0x2045; BYTE $0xc8 // and    r8b, r9b
+	JNE  LBB3_3
+	WORD $0x8949; BYTE $0xca // mov    r10, rcx
+	LONG $0x80e28349         // and    r10, -128
+	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
+
+LBB3_10:
+	LONG $0x107ca1c4; WORD $0x0604             // vmovups    ymm0, yword [rsi + r8]
+	LONG $0x107ca1c4; WORD $0x064c; BYTE $0x20 // vmovups    ymm1, yword [rsi + r8 + 32]
+	LONG $0x107ca1c4; WORD $0x0654; BYTE $0x40 // vmovups    ymm2, yword [rsi + r8 + 64]
+	LONG $0x107ca1c4; WORD $0x065c; BYTE $0x60 // vmovups    ymm3, yword [rsi + r8 + 96]
+	LONG $0x577ca1c4; WORD $0x0704             // vxorps    ymm0, ymm0, yword [rdi + r8]
+	LONG $0x5774a1c4; WORD $0x074c; BYTE $0x20 // vxorps    ymm1, ymm1, yword [rdi + r8 + 32]
+	LONG $0x576ca1c4; WORD $0x0754; BYTE $0x40 // vxorps    ymm2, ymm2, yword [rdi + r8 + 64]
+	LONG $0x5764a1c4; WORD $0x075c; BYTE $0x60 // vxorps    ymm3, ymm3, yword [rdi + r8 + 96]
+	LONG $0x117ca1c4; WORD $0x0204             // vmovups    yword [rdx + r8], ymm0
+	LONG $0x117ca1c4; WORD $0x024c; BYTE $0x20 // vmovups    yword [rdx + r8 + 32], ymm1
+	LONG $0x117ca1c4; WORD $0x0254; BYTE $0x40 // vmovups    yword [rdx + r8 + 64], ymm2
+	LONG $0x117ca1c4; WORD $0x025c; BYTE $0x60 // vmovups    yword [rdx + r8 + 96], ymm3
+	LONG $0x80e88349                           // sub    r8, -128
+	WORD $0x394d; BYTE $0xc2                   // cmp    r10, r8
+	JNE  LBB3_10
+	WORD $0x3949; BYTE $0xca                   // cmp    r10, rcx
+	JE   LBB3_12
+
+LBB3_3:
+	WORD $0x894d; BYTE $0xd0 // mov    r8, r10
+	WORD $0xf749; BYTE $0xd0 // not    r8
+	WORD $0x0149; BYTE $0xc8 // add    r8, rcx
+	WORD $0x8949; BYTE $0xc9 // mov    r9, rcx
+	LONG $0x03e18349         // and    r9, 3
+	JE   LBB3_5
+
+LBB3_4:
+	LONG $0x04b60f42; BYTE $0x16 // movzx    eax, byte [rsi + r10]
+	LONG $0x17043242             // xor    al, byte [rdi + r10]
+	LONG $0x12048842             // mov    byte [rdx + r10], al
+	LONG $0x01c28349             // add    r10, 1
+	LONG $0xffc18349             // add    r9, -1
+	JNE  LBB3_4
+
+LBB3_5:
+	LONG $0x03f88349 // cmp    r8, 3
+	JB   LBB3_12
+
+LBB3_6:
+	LONG $0x04b60f42; BYTE $0x16   // movzx    eax, byte [rsi + r10]
+	LONG $0x17043242               // xor    al, byte [rdi + r10]
+	LONG $0x12048842               // mov    byte [rdx + r10], al
+	LONG $0x44b60f42; WORD $0x0116 // movzx    eax, byte [rsi + r10 + 1]
+	LONG $0x17443242; BYTE $0x01   // xor    al, byte [rdi + r10 + 1]
+	LONG $0x12448842; BYTE $0x01   // mov    byte [rdx + r10 + 1], al
+	LONG $0x44b60f42; WORD $0x0216 // movzx    eax, byte [rsi + r10 + 2]
+	LONG $0x17443242; BYTE $0x02   // xor    al, byte [rdi + r10 + 2]
+	LONG $0x12448842; BYTE $0x02   // mov    byte [rdx + r10 + 2], al
+	LONG $0x44b60f42; WORD $0x0316 // movzx    eax, byte [rsi + r10 + 3]
+	LONG $0x17443242; BYTE $0x03   // xor    al, byte [rdi + r10 + 3]
+	LONG $0x12448842; BYTE $0x03   // mov    byte [rdx + r10 + 3], al
+	LONG $0x04c28349               // add    r10, 4
+	WORD $0x394c; BYTE $0xd1       // cmp    rcx, r10
+	JNE  LBB3_6
+
+LBB3_12:
+	VZEROUPPER
+	RET
diff --git a/go/arrow/bitutil/bitmap_ops_noasm.go b/go/arrow/bitutil/bitmap_ops_noasm.go
index 785531c1c2..e25347791f 100644
--- a/go/arrow/bitutil/bitmap_ops_noasm.go
+++ b/go/arrow/bitutil/bitmap_ops_noasm.go
@@ -22,4 +22,6 @@ package bitutil
 func init() {
 	bitAndOp.opAligned = alignedBitAndGo
 	bitOrOp.opAligned = alignedBitOrGo
+	bitAndNotOp.opAligned = alignedBitAndNotGo
+	bitXorOp.opAligned = alignedBitXorGo
 }
diff --git a/go/arrow/bitutil/bitmap_ops_ppc64le.go b/go/arrow/bitutil/bitmap_ops_ppc64le.go
index 86c47639a9..28d95d84ad 100644
--- a/go/arrow/bitutil/bitmap_ops_ppc64le.go
+++ b/go/arrow/bitutil/bitmap_ops_ppc64le.go
@@ -22,4 +22,6 @@ package bitutil
 func init() {
 	bitAndOp.opAligned = alignedBitAndGo
 	bitOrOp.opAligned = alignedBitOrGo
+	bitAndNotOp.opAligned = alignedBitAndNotGo
+	bitXorOp.opAligned = alignedBitXorGo
 }
diff --git a/go/arrow/bitutil/bitmap_ops_s390x.go b/go/arrow/bitutil/bitmap_ops_s390x.go
index 86c47639a9..28d95d84ad 100644
--- a/go/arrow/bitutil/bitmap_ops_s390x.go
+++ b/go/arrow/bitutil/bitmap_ops_s390x.go
@@ -22,4 +22,6 @@ package bitutil
 func init() {
 	bitAndOp.opAligned = alignedBitAndGo
 	bitOrOp.opAligned = alignedBitOrGo
+	bitAndNotOp.opAligned = alignedBitAndNotGo
+	bitXorOp.opAligned = alignedBitXorGo
 }
diff --git a/go/arrow/bitutil/bitmap_ops_sse4_amd64.go b/go/arrow/bitutil/bitmap_ops_sse4_amd64.go
index 5d1fcf9682..f16bce12bb 100644
--- a/go/arrow/bitutil/bitmap_ops_sse4_amd64.go
+++ b/go/arrow/bitutil/bitmap_ops_sse4_amd64.go
@@ -36,3 +36,17 @@ func _bitmap_aligned_or_sse4(left, right, out unsafe.Pointer, length int64)
 func bitmapAlignedOrSSE4(left, right, out []byte) {
 	_bitmap_aligned_or_sse4(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out)))
 }
+
+//go:noescape
+func _bitmap_aligned_and_not_sse4(left, right, out unsafe.Pointer, length int64)
+
+func bitmapAlignedAndNotSSE4(left, right, out []byte) {
+	_bitmap_aligned_and_not_sse4(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out)))
+}
+
+//go:noescape
+func _bitmap_aligned_xor_sse4(left, right, out unsafe.Pointer, length int64)
+
+func bitmapAlignedXorSSE4(left, right, out []byte) {
+	_bitmap_aligned_xor_sse4(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out)))
+}
diff --git a/go/arrow/bitutil/bitmap_ops_sse4_amd64.s b/go/arrow/bitutil/bitmap_ops_sse4_amd64.s
index ad81cf6372..c15e186253 100644
--- a/go/arrow/bitutil/bitmap_ops_sse4_amd64.s
+++ b/go/arrow/bitutil/bitmap_ops_sse4_amd64.s
@@ -254,3 +254,248 @@ LBB1_10:
 	LONG $0x01c1f641         // test    r9b, 1
 	JNE  LBB1_14
 	JMP  LBB1_15
+
+TEXT ·_bitmap_aligned_and_not_sse4(SB), $0-32
+
+	MOVQ left+0(FP), DI
+	MOVQ right+8(FP), SI
+	MOVQ out+16(FP), DX
+	MOVQ length+24(FP), CX
+
+	WORD $0x8548; BYTE $0xc9 // test    rcx, rcx
+	JLE  LBB2_16
+	LONG $0x1ff98348         // cmp    rcx, 31
+	JA   LBB2_7
+	WORD $0x3145; BYTE $0xdb // xor    r11d, r11d
+
+LBB2_3:
+	WORD $0x894d; BYTE $0xd8 // mov    r8, r11
+	WORD $0xf749; BYTE $0xd0 // not    r8
+	WORD $0xc1f6; BYTE $0x01 // test    cl, 1
+	JE   LBB2_5
+	LONG $0x1e048a42         // mov    al, byte [rsi + r11]
+	WORD $0xd0f6             // not    al
+	LONG $0x1f042242         // and    al, byte [rdi + r11]
+	LONG $0x1a048842         // mov    byte [rdx + r11], al
+	LONG $0x01cb8349         // or    r11, 1
+
+LBB2_5:
+	WORD $0x0149; BYTE $0xc8 // add    r8, rcx
+	JE   LBB2_16
+
+LBB2_6:
+	LONG $0x04b60f42; BYTE $0x1e   // movzx    eax, byte [rsi + r11]
+	WORD $0xd0f6                   // not    al
+	LONG $0x1f042242               // and    al, byte [rdi + r11]
+	LONG $0x1a048842               // mov    byte [rdx + r11], al
+	LONG $0x44b60f42; WORD $0x011e // movzx    eax, byte [rsi + r11 + 1]
+	WORD $0xd0f6                   // not    al
+	LONG $0x1f442242; BYTE $0x01   // and    al, byte [rdi + r11 + 1]
+	LONG $0x1a448842; BYTE $0x01   // mov    byte [rdx + r11 + 1], al
+	LONG $0x02c38349               // add    r11, 2
+	WORD $0x394c; BYTE $0xd9       // cmp    rcx, r11
+	JNE  LBB2_6
+	JMP  LBB2_16
+
+LBB2_7:
+	LONG $0x0a0c8d4c         // lea    r9, [rdx + rcx]
+	LONG $0x0f048d48         // lea    rax, [rdi + rcx]
+	WORD $0x3948; BYTE $0xd0 // cmp    rax, rdx
+	LONG $0xd2970f41         // seta    r10b
+	LONG $0x0e048d48         // lea    rax, [rsi + rcx]
+	WORD $0x3949; BYTE $0xf9 // cmp    r9, rdi
+	WORD $0x970f; BYTE $0xd3 // seta    bl
+	WORD $0x3948; BYTE $0xd0 // cmp    rax, rdx
+	LONG $0xd0970f41         // seta    r8b
+	WORD $0x3949; BYTE $0xf1 // cmp    r9, rsi
+	LONG $0xd1970f41         // seta    r9b
+	WORD $0x3145; BYTE $0xdb // xor    r11d, r11d
+	WORD $0x8441; BYTE $0xda // test    r10b, bl
+	JNE  LBB2_3
+	WORD $0x2045; BYTE $0xc8 // and    r8b, r9b
+	JNE  LBB2_3
+	WORD $0x8949; BYTE $0xcb // mov    r11, rcx
+	LONG $0xe0e38349         // and    r11, -32
+	LONG $0xe0438d49         // lea    rax, [r11 - 32]
+	WORD $0x8949; BYTE $0xc1 // mov    r9, rax
+	LONG $0x05e9c149         // shr    r9, 5
+	LONG $0x01c18349         // add    r9, 1
+	WORD $0x8548; BYTE $0xc0 // test    rax, rax
+	JE   LBB2_10
+	WORD $0x894d; BYTE $0xca // mov    r10, r9
+	LONG $0xfee28349         // and    r10, -2
+	WORD $0xf749; BYTE $0xda // neg    r10
+	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
+
+LBB2_12:
+	LONG $0x04100f42; BYTE $0x07   // movups    xmm0, oword [rdi + r8]
+	LONG $0x4c100f42; WORD $0x1007 // movups    xmm1, oword [rdi + r8 + 16]
+	LONG $0x14100f42; BYTE $0x06   // movups    xmm2, oword [rsi + r8]
+	WORD $0x550f; BYTE $0xd0       // andnps    xmm2, xmm0
+	LONG $0x44100f42; WORD $0x1006 // movups    xmm0, oword [rsi + r8 + 16]
+	WORD $0x550f; BYTE $0xc1       // andnps    xmm0, xmm1
+	LONG $0x14110f42; BYTE $0x02   // movups    oword [rdx + r8], xmm2
+	LONG $0x44110f42; WORD $0x1002 // movups    oword [rdx + r8 + 16], xmm0
+	LONG $0x44100f42; WORD $0x2007 // movups    xmm0, oword [rdi + r8 + 32]
+	LONG $0x4c100f42; WORD $0x3007 // movups    xmm1, oword [rdi + r8 + 48]
+	LONG $0x54100f42; WORD $0x2006 // movups    xmm2, oword [rsi + r8 + 32]
+	WORD $0x550f; BYTE $0xd0       // andnps    xmm2, xmm0
+	LONG $0x44100f42; WORD $0x3006 // movups    xmm0, oword [rsi + r8 + 48]
+	WORD $0x550f; BYTE $0xc1       // andnps    xmm0, xmm1
+	LONG $0x54110f42; WORD $0x2002 // movups    oword [rdx + r8 + 32], xmm2
+	LONG $0x44110f42; WORD $0x3002 // movups    oword [rdx + r8 + 48], xmm0
+	LONG $0x40c08349               // add    r8, 64
+	LONG $0x02c28349               // add    r10, 2
+	JNE  LBB2_12
+	LONG $0x01c1f641               // test    r9b, 1
+	JE   LBB2_15
+
+LBB2_14:
+	LONG $0x04100f42; BYTE $0x07   // movups    xmm0, oword [rdi + r8]
+	LONG $0x4c100f42; WORD $0x1007 // movups    xmm1, oword [rdi + r8 + 16]
+	LONG $0x14100f42; BYTE $0x06   // movups    xmm2, oword [rsi + r8]
+	WORD $0x550f; BYTE $0xd0       // andnps    xmm2, xmm0
+	LONG $0x44100f42; WORD $0x1006 // movups    xmm0, oword [rsi + r8 + 16]
+	WORD $0x550f; BYTE $0xc1       // andnps    xmm0, xmm1
+	LONG $0x14110f42; BYTE $0x02   // movups    oword [rdx + r8], xmm2
+	LONG $0x44110f42; WORD $0x1002 // movups    oword [rdx + r8 + 16], xmm0
+
+LBB2_15:
+	WORD $0x3949; BYTE $0xcb // cmp    r11, rcx
+	JNE  LBB2_3
+
+LBB2_16:
+	RET
+
+LBB2_10:
+	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
+	LONG $0x01c1f641         // test    r9b, 1
+	JNE  LBB2_14
+	JMP  LBB2_15
+
+TEXT ·_bitmap_aligned_xor_sse4(SB), $0-32
+
+	MOVQ left+0(FP), DI
+	MOVQ right+8(FP), SI
+	MOVQ out+16(FP), DX
+	MOVQ length+24(FP), CX
+
+	WORD $0x8548; BYTE $0xc9 // test    rcx, rcx
+	JLE  LBB3_16
+	LONG $0x1ff98348         // cmp    rcx, 31
+	JA   LBB3_7
+	WORD $0x3145; BYTE $0xdb // xor    r11d, r11d
+
+LBB3_3:
+	WORD $0x894d; BYTE $0xd8 // mov    r8, r11
+	WORD $0xf749; BYTE $0xd0 // not    r8
+	WORD $0x0149; BYTE $0xc8 // add    r8, rcx
+	WORD $0x8949; BYTE $0xc9 // mov    r9, rcx
+	LONG $0x03e18349         // and    r9, 3
+	JE   LBB3_5
+
+LBB3_4:
+	LONG $0x04b60f42; BYTE $0x1e // movzx    eax, byte [rsi + r11]
+	LONG $0x1f043242             // xor    al, byte [rdi + r11]
+	LONG $0x1a048842             // mov    byte [rdx + r11], al
+	LONG $0x01c38349             // add    r11, 1
+	LONG $0xffc18349             // add    r9, -1
+	JNE  LBB3_4
+
+LBB3_5:
+	LONG $0x03f88349 // cmp    r8, 3
+	JB   LBB3_16
+
+LBB3_6:
+	LONG $0x04b60f42; BYTE $0x1e   // movzx    eax, byte [rsi + r11]
+	LONG $0x1f043242               // xor    al, byte [rdi + r11]
+	LONG $0x1a048842               // mov    byte [rdx + r11], al
+	LONG $0x44b60f42; WORD $0x011e // movzx    eax, byte [rsi + r11 + 1]
+	LONG $0x1f443242; BYTE $0x01   // xor    al, byte [rdi + r11 + 1]
+	LONG $0x1a448842; BYTE $0x01   // mov    byte [rdx + r11 + 1], al
+	LONG $0x44b60f42; WORD $0x021e // movzx    eax, byte [rsi + r11 + 2]
+	LONG $0x1f443242; BYTE $0x02   // xor    al, byte [rdi + r11 + 2]
+	LONG $0x1a448842; BYTE $0x02   // mov    byte [rdx + r11 + 2], al
+	LONG $0x44b60f42; WORD $0x031e // movzx    eax, byte [rsi + r11 + 3]
+	LONG $0x1f443242; BYTE $0x03   // xor    al, byte [rdi + r11 + 3]
+	LONG $0x1a448842; BYTE $0x03   // mov    byte [rdx + r11 + 3], al
+	LONG $0x04c38349               // add    r11, 4
+	WORD $0x394c; BYTE $0xd9       // cmp    rcx, r11
+	JNE  LBB3_6
+	JMP  LBB3_16
+
+LBB3_7:
+	LONG $0x0a0c8d4c         // lea    r9, [rdx + rcx]
+	LONG $0x0f048d48         // lea    rax, [rdi + rcx]
+	WORD $0x3948; BYTE $0xd0 // cmp    rax, rdx
+	LONG $0xd2970f41         // seta    r10b
+	LONG $0x0e048d48         // lea    rax, [rsi + rcx]
+	WORD $0x3949; BYTE $0xf9 // cmp    r9, rdi
+	WORD $0x970f; BYTE $0xd3 // seta    bl
+	WORD $0x3948; BYTE $0xd0 // cmp    rax, rdx
+	LONG $0xd0970f41         // seta    r8b
+	WORD $0x3949; BYTE $0xf1 // cmp    r9, rsi
+	LONG $0xd1970f41         // seta    r9b
+	WORD $0x3145; BYTE $0xdb // xor    r11d, r11d
+	WORD $0x8441; BYTE $0xda // test    r10b, bl
+	JNE  LBB3_3
+	WORD $0x2045; BYTE $0xc8 // and    r8b, r9b
+	JNE  LBB3_3
+	WORD $0x8949; BYTE $0xcb // mov    r11, rcx
+	LONG $0xe0e38349         // and    r11, -32
+	LONG $0xe0438d49         // lea    rax, [r11 - 32]
+	WORD $0x8949; BYTE $0xc1 // mov    r9, rax
+	LONG $0x05e9c149         // shr    r9, 5
+	LONG $0x01c18349         // add    r9, 1
+	WORD $0x8548; BYTE $0xc0 // test    rax, rax
+	JE   LBB3_10
+	WORD $0x894d; BYTE $0xca // mov    r10, r9
+	LONG $0xfee28349         // and    r10, -2
+	WORD $0xf749; BYTE $0xda // neg    r10
+	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
+
+LBB3_12:
+	LONG $0x04100f42; BYTE $0x07   // movups    xmm0, oword [rdi + r8]
+	LONG $0x4c100f42; WORD $0x1007 // movups    xmm1, oword [rdi + r8 + 16]
+	LONG $0x14100f42; BYTE $0x06   // movups    xmm2, oword [rsi + r8]
+	WORD $0x570f; BYTE $0xd0       // xorps    xmm2, xmm0
+	LONG $0x44100f42; WORD $0x1006 // movups    xmm0, oword [rsi + r8 + 16]
+	WORD $0x570f; BYTE $0xc1       // xorps    xmm0, xmm1
+	LONG $0x14110f42; BYTE $0x02   // movups    oword [rdx + r8], xmm2
+	LONG $0x44110f42; WORD $0x1002 // movups    oword [rdx + r8 + 16], xmm0
+	LONG $0x44100f42; WORD $0x2007 // movups    xmm0, oword [rdi + r8 + 32]
+	LONG $0x4c100f42; WORD $0x3007 // movups    xmm1, oword [rdi + r8 + 48]
+	LONG $0x54100f42; WORD $0x2006 // movups    xmm2, oword [rsi + r8 + 32]
+	WORD $0x570f; BYTE $0xd0       // xorps    xmm2, xmm0
+	LONG $0x44100f42; WORD $0x3006 // movups    xmm0, oword [rsi + r8 + 48]
+	WORD $0x570f; BYTE $0xc1       // xorps    xmm0, xmm1
+	LONG $0x54110f42; WORD $0x2002 // movups    oword [rdx + r8 + 32], xmm2
+	LONG $0x44110f42; WORD $0x3002 // movups    oword [rdx + r8 + 48], xmm0
+	LONG $0x40c08349               // add    r8, 64
+	LONG $0x02c28349               // add    r10, 2
+	JNE  LBB3_12
+	LONG $0x01c1f641               // test    r9b, 1
+	JE   LBB3_15
+
+LBB3_14:
+	LONG $0x04100f42; BYTE $0x07   // movups    xmm0, oword [rdi + r8]
+	LONG $0x4c100f42; WORD $0x1007 // movups    xmm1, oword [rdi + r8 + 16]
+	LONG $0x14100f42; BYTE $0x06   // movups    xmm2, oword [rsi + r8]
+	WORD $0x570f; BYTE $0xd0       // xorps    xmm2, xmm0
+	LONG $0x44100f42; WORD $0x1006 // movups    xmm0, oword [rsi + r8 + 16]
+	WORD $0x570f; BYTE $0xc1       // xorps    xmm0, xmm1
+	LONG $0x14110f42; BYTE $0x02   // movups    oword [rdx + r8], xmm2
+	LONG $0x44110f42; WORD $0x1002 // movups    oword [rdx + r8 + 16], xmm0
+
+LBB3_15:
+	WORD $0x3949; BYTE $0xcb // cmp    r11, rcx
+	JNE  LBB3_3
+
+LBB3_16:
+	RET
+
+LBB3_10:
+	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d
+	LONG $0x01c1f641         // test    r9b, 1
+	JNE  LBB3_14
+	JMP  LBB3_15
diff --git a/go/arrow/bitutil/bitmaps.go b/go/arrow/bitutil/bitmaps.go
index abd1b188a7..c23a123292 100644
--- a/go/arrow/bitutil/bitmaps.go
+++ b/go/arrow/bitutil/bitmaps.go
@@ -18,6 +18,7 @@ package bitutil
 
 import (
 	"bytes"
+	"errors"
 	"math/bits"
 	"unsafe"
 
@@ -374,9 +375,14 @@ func (bm *BitmapWordWriter) PutNextTrailingByte(b byte, validBits int) {
 	}
 }
 
-// CopyBitmap copies the bitmap indicated by src, starting at bit offset srcOffset,
-// and copying length bits into dst, starting at bit offset dstOffset.
-func CopyBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) {
+type transferMode int8
+
+const (
+	transferCopy transferMode = iota
+	transferInvert
+)
+
+func transferBitmap(mode transferMode, src []byte, srcOffset, length int, dst []byte, dstOffset int) {
 	if length == 0 {
 		// if there's nothing to write, end early.
 		return
@@ -393,12 +399,19 @@ func CopyBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) {
 		nwords := rdr.Words()
 		for nwords > 0 {
 			nwords--
-			wr.PutNextWord(rdr.NextWord())
+			if mode == transferInvert {
+				wr.PutNextWord(^rdr.NextWord())
+			} else {
+				wr.PutNextWord(rdr.NextWord())
+			}
 		}
 		nbytes := rdr.TrailingBytes()
 		for nbytes > 0 {
 			nbytes--
 			bt, validBits := rdr.NextTrailingByte()
+			if mode == transferInvert {
+				bt = ^bt
+			}
 			wr.PutNextTrailingByte(bt, validBits)
 		}
 		return
@@ -417,14 +430,33 @@ func CopyBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) {
 	// - high 5 bits: old bits from last byte of dest buffer
 	trailingBits := nbytes*8 - length
 	trailMask := byte(uint(1)<<(8-trailingBits)) - 1
-
-	copy(dst, src[:nbytes-1])
-	lastData := src[nbytes-1]
+	var lastData byte
+	if mode == transferInvert {
+		for i, b := range src[:nbytes-1] {
+			dst[i] = ^b
+		}
+		lastData = ^src[nbytes-1]
+	} else {
+		copy(dst, src[:nbytes-1])
+		lastData = src[nbytes-1]
+	}
 
 	dst[nbytes-1] &= ^trailMask
 	dst[nbytes-1] |= lastData & trailMask
 }
 
+// CopyBitmap copies the bitmap indicated by src, starting at bit offset srcOffset,
+// and copying length bits into dst, starting at bit offset dstOffset.
+func CopyBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) {
+	transferBitmap(transferCopy, src, srcOffset, length, dst, dstOffset)
+}
+
+// InvertBitmap copies a bit range of a bitmap, inverting it as it copies
+// over into the destination.
+func InvertBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) {
+	transferBitmap(transferInvert, src, srcOffset, length, dst, dstOffset)
+}
+
 type bitOp struct {
 	opWord    func(uint64, uint64) uint64
 	opByte    func(byte, byte) byte
@@ -440,6 +472,14 @@ var (
 		opWord: func(l, r uint64) uint64 { return l | r },
 		opByte: func(l, r byte) byte { return l | r },
 	}
+	bitAndNotOp = bitOp{
+		opWord: func(l, r uint64) uint64 { return l &^ r },
+		opByte: func(l, r byte) byte { return l &^ r },
+	}
+	bitXorOp = bitOp{
+		opWord: func(l, r uint64) uint64 { return l ^ r },
+		opByte: func(l, r byte) byte { return l ^ r },
+	}
 )
 
 func alignedBitmapOp(op bitOp, left, right []byte, lOffset, rOffset int64, out []byte, outOffset int64, length int64) {
@@ -532,6 +572,22 @@ func BitmapOrAlloc(mem memory.Allocator, left, right []byte, lOffset, rOffset in
 	return BitmapOpAlloc(mem, bitOrOp, left, right, lOffset, rOffset, length, outOffset)
 }
 
+func BitmapAndNot(left, right []byte, lOffset, rOffset int64, out []byte, outOffset int64, length int64) {
+	BitmapOp(bitAndNotOp, left, right, lOffset, rOffset, out, outOffset, length)
+}
+
+func BitmapAndNotAlloc(mem memory.Allocator, left, right []byte, lOffset, rOffset int64, length, outOffset int64) *memory.Buffer {
+	return BitmapOpAlloc(mem, bitAndNotOp, left, right, lOffset, rOffset, length, outOffset)
+}
+
+func BitmapXor(left, right []byte, lOffset, rOffset int64, out []byte, outOffset int64, length int64) {
+	BitmapOp(bitXorOp, left, right, lOffset, rOffset, out, outOffset, length)
+}
+
+func BitmapXorAlloc(mem memory.Allocator, left, right []byte, lOffset, rOffset int64, length, outOffset int64) *memory.Buffer {
+	return BitmapOpAlloc(mem, bitXorOp, left, right, lOffset, rOffset, length, outOffset)
+}
+
 func BitmapEquals(left, right []byte, lOffset, rOffset int64, length int64) bool {
 	if lOffset%8 == 0 && rOffset%8 == 0 {
 		// byte aligned, fast path, can use bytes.Equal (memcmp)
@@ -584,3 +640,108 @@ type OptionalBitIndexer struct {
 func (b *OptionalBitIndexer) GetBit(i int) bool {
 	return b.Bitmap == nil || BitIsSet(b.Bitmap, b.Offset+i)
 }
+
+type Bitmap struct {
+	Data        []byte
+	Offset, Len int64
+}
+
+func bitLength(bitmaps []Bitmap) (int64, error) {
+	for _, b := range bitmaps[1:] {
+		if b.Len != bitmaps[0].Len {
+			return -1, errors.New("bitmaps must be same length")
+		}
+	}
+	return bitmaps[0].Len, nil
+}
+
+func runVisitWordsAndWriteLoop(bitLen int64, rdrs []*BitmapWordReader, wrs []*BitmapWordWriter, visitor func(in, out []uint64)) {
+	const bitWidth int64 = int64(uint64SizeBits)
+
+	visited := make([]uint64, len(rdrs))
+	output := make([]uint64, len(wrs))
+
+	// every reader will have the same number of words, since they are
+	// all the same length. This can be inefficient in some cases: when
+	// offsets extend beyond a word boundary, every word has to be
+	// assembled from two adjoining words.
+	nwords := int64(rdrs[0].Words())
+	bitLen -= nwords * bitWidth
+	for nwords > 0 {
+		nwords--
+		for i := range visited {
+			visited[i] = rdrs[i].NextWord()
+		}
+		visitor(visited, output)
+		for i := range output {
+			wrs[i].PutNextWord(output[i])
+		}
+	}
+
+	// every reader will have the same number of trailing bytes, because
+	// we already confirmed they have the same length. Since offsets
+	// beyond a word boundary can require adjoining words, the trailing
+	// portion may still hold more than one full or partial word to write.
+	if bitLen == 0 {
+		return
+	}
+
+	// convert the word visitor to a byte visitor
+	byteVisitor := func(in, out []byte) {
+		for i, w := range in {
+			visited[i] = uint64(w)
+		}
+		visitor(visited, output)
+		for i, w := range output {
+			out[i] = byte(w)
+		}
+	}
+
+	visitedBytes := make([]byte, len(rdrs))
+	outputBytes := make([]byte, len(wrs))
+	nbytes := rdrs[0].trailingBytes
+	for nbytes > 0 {
+		nbytes--
+		memory.Set(visitedBytes, 0)
+		memory.Set(outputBytes, 0)
+
+		var validBits int
+		for i := range rdrs {
+			visitedBytes[i], validBits = rdrs[i].NextTrailingByte()
+		}
+		byteVisitor(visitedBytes, outputBytes)
+		for i, w := range outputBytes {
+			wrs[i].PutNextTrailingByte(w, validBits)
+		}
+	}
+}
+
+// VisitWordsAndWrite visits words of bits from each input bitmap and
+// collects outputs to a slice of output Bitmaps.
+//
+// All bitmaps must have identical lengths. The first bit in a visited
+// bitmap may be offset within the first visited word, but words will
+// otherwise contain densely packed bits loaded from the bitmap.
+//
+// NOTE: this function is efficient on 3+ sufficiently large bitmaps.
+// It also has a large prolog/epilog overhead and should be used
+// carefully in other cases. For 2 or fewer bitmaps, or for smaller
+// bitmaps, try BitmapReader or the other bitmap utilities.
+func VisitWordsAndWrite(args []Bitmap, out []Bitmap, visitor func(in, out []uint64)) error {
+	bitLen, err := bitLength(args)
+	if err != nil {
+		return err
+	}
+
+	rdrs, wrs := make([]*BitmapWordReader, len(args)), make([]*BitmapWordWriter, len(out))
+	for i, in := range args {
+		rdrs[i] = NewBitmapWordReader(in.Data, int(in.Offset), int(in.Len))
+	}
+	for i, o := range out {
+		wrs[i] = NewBitmapWordWriter(o.Data, int(o.Offset), int(o.Len))
+	}
+	runVisitWordsAndWriteLoop(bitLen, rdrs, wrs, visitor)
+	return nil
+}
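
Putting the new bitutil exports together, a short usage sketch matching the signatures added above (BitmapAndNot writes into a caller-supplied buffer, while the Alloc variants return a new memory.Buffer that the caller must release):

    package main

    import (
        "fmt"

        "github.com/apache/arrow/go/v10/arrow/bitutil"
        "github.com/apache/arrow/go/v10/arrow/memory"
    )

    func main() {
        left := []byte{0b10101010}
        right := []byte{0b11001100}

        // in-place variant: caller supplies the output buffer
        out := make([]byte, 1)
        bitutil.BitmapAndNot(left, right, 0, 0, out, 0, 8)
        fmt.Printf("and_not: %08b\n", out[0]) // 00100010

        // allocating variant: returns a buffer the caller must release
        buf := bitutil.BitmapXorAlloc(memory.NewGoAllocator(), left, right, 0, 0, 8, 0)
        defer buf.Release()
        fmt.Printf("xor:     %08b\n", buf.Bytes()[0]) // 01100110
    }
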
diff --git a/go/arrow/compute/internal/kernels/scalar_boolean.go b/go/arrow/compute/internal/kernels/scalar_boolean.go
new file mode 100644
index 0000000000..a458306451
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/scalar_boolean.go
@@ -0,0 +1,332 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernels
+
+import (
+	"github.com/apache/arrow/go/v10/arrow/bitutil"
+	"github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
+	"github.com/apache/arrow/go/v10/arrow/scalar"
+)
+
+type computeWordFN func(leftTrue, leftFalse, rightTrue, rightFalse uint64) (outValid, outData uint64)
+
+func computeKleene(computeWord computeWordFN, ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+	var (
+		inBMs = [4]bitutil.Bitmap{
+			{Data: left.Buffers[0].Buf, Offset: left.Offset, Len: left.Len},
+			{Data: left.Buffers[1].Buf, Offset: left.Offset, Len: left.Len},
+			{Data: right.Buffers[1].Buf, Offset: right.Offset, Len: right.Len},
+			{Data: right.Buffers[0].Buf, Offset: right.Offset, Len: right.Len},
+		}
+		outBMs = [2]bitutil.Bitmap{
+			{Data: out.Buffers[0].Buf, Offset: out.Offset, Len: out.Len},
+			{Data: out.Buffers[1].Buf, Offset: out.Offset, Len: out.Len},
+		}
+		apply = func(leftValid, leftData uint64, rightValid, rightData uint64) (outValidity, outData uint64) {
+			leftTrue, leftFalse := leftValid&leftData, leftValid&^leftData
+			rightTrue, rightFalse := rightValid&rightData, rightValid&^rightData
+			return computeWord(leftTrue, leftFalse, rightTrue, rightFalse)
+		}
+	)
+
+	switch {
+	case right.UpdateNullCount() == 0:
+		return bitutil.VisitWordsAndWrite(inBMs[:3], outBMs[:],
+			func(in, out []uint64) {
+				out[0], out[1] = apply(in[0], in[1], ^uint64(0), in[2])
+			})
+	case left.UpdateNullCount() == 0:
+		return bitutil.VisitWordsAndWrite(inBMs[1:], outBMs[:],
+			func(in, out []uint64) {
+				out[0], out[1] = apply(^uint64(0), in[0], in[2], in[1])
+			})
+	default:
+		return bitutil.VisitWordsAndWrite(inBMs[:], outBMs[:],
+			func(in, out []uint64) {
+				out[0], out[1] = apply(in[0], in[1], in[3], in[2])
+			})
+	}
+}
+
+type AndOpKernel struct {
+	commutativeBinaryKernel[AndOpKernel]
+}
+
+func (AndOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+	bitutil.BitmapAnd(left.Buffers[1].Buf, right.Buffers[1].Buf,
+		left.Offset, right.Offset, out.Buffers[1].Buf, out.Offset, left.Len)
+	return nil
+}
+
+func (AndOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error {
+	if !left.IsValid() {
+		return nil
+	}
+
+	outBM := out.Buffers[1].Buf
+	if left.(*scalar.Boolean).Value {
+		bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset),
+			int(right.Len), outBM, int(out.Offset))
+	} else {
+		bitutil.SetBitsTo(outBM, out.Offset, out.Len, false)
+	}
+	return nil
+}
+
+type KleeneAndOpKernel struct {
+	commutativeBinaryKernel[KleeneAndOpKernel]
+}
+
+func (KleeneAndOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+	if left.UpdateNullCount() == 0 && right.UpdateNullCount() == 0 {
+		bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+		out.Nulls = 0
+		return (AndOpKernel{}).Call(ctx, left, right, out)
+	}
+
+	computeWord := func(leftTrue, leftFalse, rightTrue, rightFalse uint64) (outValid, outData uint64) {
+		return leftFalse | rightFalse | (leftTrue & rightTrue), leftTrue & rightTrue
+	}
+	return computeKleene(computeWord, ctx, left, right, out)
+}
+
+func (KleeneAndOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error {
+	var (
+		leftTrue  = left.IsValid() && left.(*scalar.Boolean).Value
+		leftFalse = left.IsValid() && !left.(*scalar.Boolean).Value
+	)
+
+	switch {
+	case leftFalse:
+		bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+		out.Nulls = 0
+		bitutil.SetBitsTo(out.Buffers[1].Buf, out.Offset, out.Len, false)
+	case leftTrue:
+		if right.UpdateNullCount() == 0 {
+			bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+			out.Nulls = 0
+		} else {
+			bitutil.CopyBitmap(right.Buffers[0].Buf, int(right.Offset), int(right.Len),
+				out.Buffers[0].Buf, int(out.Offset))
+		}
+		bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+			out.Buffers[1].Buf, int(out.Offset))
+	default: // scalar was null: out[i] is valid iff right[i] was false
+		if right.UpdateNullCount() == 0 {
+			bitutil.InvertBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+				out.Buffers[0].Buf, int(out.Offset))
+		} else {
+			bitutil.BitmapAndNot(right.Buffers[0].Buf, right.Buffers[1].Buf, right.Offset,
+				right.Offset, out.Buffers[0].Buf, out.Offset, right.Len)
+		}
+		bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+			out.Buffers[1].Buf, int(out.Offset))
+	}
+	return nil
+}
+
+type OrOpKernel struct {
+	commutativeBinaryKernel[OrOpKernel]
+}
+
+func (OrOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+	bitutil.BitmapOr(left.Buffers[1].Buf, right.Buffers[1].Buf,
+		left.Offset, right.Offset, out.Buffers[1].Buf, out.Offset, left.Len)
+	return nil
+}
+
+func (OrOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error {
+	if !left.IsValid() {
+		return nil
+	}
+
+	outBM := out.Buffers[1].Buf
+	if left.(*scalar.Boolean).Value {
+		bitutil.SetBitsTo(outBM, out.Offset, out.Len, true)
+	} else {
+		bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset),
+			int(right.Len), outBM, int(out.Offset))
+	}
+	return nil
+}
+
+type KleeneOrOpKernel struct {
+	commutativeBinaryKernel[KleeneOrOpKernel]
+}
+
+func (KleeneOrOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+	if left.UpdateNullCount() == 0 && right.UpdateNullCount() == 0 {
+		bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+		out.Nulls = 0
+		return (OrOpKernel{}).Call(ctx, left, right, out)
+	}
+
+	computeWord := func(leftTrue, leftFalse, rightTrue, rightFalse uint64) (outValid, outData uint64) {
+		return leftTrue | rightTrue | (leftFalse & rightFalse), leftTrue | rightTrue
+	}
+	return computeKleene(computeWord, ctx, left, right, out)
+}
+
+func (KleeneOrOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error {
+	var (
+		leftTrue  = left.IsValid() && left.(*scalar.Boolean).Value
+		leftFalse = left.IsValid() && !left.(*scalar.Boolean).Value
+	)
+
+	switch {
+	case leftTrue:
+		bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+		out.Nulls = 0
+		bitutil.SetBitsTo(out.Buffers[1].Buf, out.Offset, out.Len, true) // all true case
+	case leftFalse:
+		if right.UpdateNullCount() == 0 {
+			bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+			out.Nulls = 0
+		} else {
+			bitutil.CopyBitmap(right.Buffers[0].Buf, int(right.Offset), int(right.Len),
+				out.Buffers[0].Buf, int(out.Offset))
+		}
+		bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+			out.Buffers[1].Buf, int(out.Offset))
+	default: // scalar was null: out[i] is valid iff right[i] was true
+		if right.UpdateNullCount() == 0 {
+			bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+				out.Buffers[0].Buf, int(out.Offset))
+		} else {
+			bitutil.BitmapAnd(right.Buffers[0].Buf, right.Buffers[1].Buf, right.Offset,
+				right.Offset, out.Buffers[0].Buf, out.Offset, right.Len)
+		}
+		bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+			out.Buffers[1].Buf, int(out.Offset))
+	}
+	return nil
+}
+
+type XorOpKernel struct {
+	commutativeBinaryKernel[XorOpKernel]
+}
+
+func (XorOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+	bitutil.BitmapXor(left.Buffers[1].Buf, right.Buffers[1].Buf,
+		left.Offset, right.Offset, out.Buffers[1].Buf, out.Offset, out.Len)
+	return nil
+}
+
+func (XorOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error {
+	if !left.IsValid() {
+		return nil
+	}
+
+	outBM := out.Buffers[1].Buf
+	if left.(*scalar.Boolean).Value {
+		bitutil.InvertBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+			outBM, int(out.Offset))
+	} else {
+		bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+			outBM, int(out.Offset))
+	}
+	return nil
+}
+
+func invertScalar(in scalar.Scalar) *scalar.Boolean {
+	if in.IsValid() {
+		return scalar.NewBooleanScalar(!in.(*scalar.Boolean).Value)
+	}
+	return in.(*scalar.Boolean)
+}
+
+type AndNotOpKernel struct{}
+
+func (AndNotOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+	bitutil.BitmapAndNot(left.Buffers[1].Buf, right.Buffers[1].Buf, left.Offset, right.Offset,
+		out.Buffers[1].Buf, out.Offset, right.Len)
+	return nil
+}
+
+func (AndNotOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error {
+	if !left.IsValid() {
+		return nil
+	}
+
+	outBM := out.Buffers[1].Buf
+	if left.(*scalar.Boolean).Value {
+		bitutil.InvertBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+			outBM, int(out.Offset))
+	} else {
+		bitutil.SetBitsTo(outBM, out.Offset, out.Len, false)
+	}
+	return nil
+}
+
+func (AndNotOpKernel) CallScalarRight(ctx *exec.KernelCtx, left *exec.ArraySpan, right scalar.Scalar, out *exec.ExecResult) error {
+	return (AndOpKernel{}).CallScalarRight(ctx, left, invertScalar(right), out)
+}
+
+type KleeneAndNotOpKernel struct{}
+
+func (KleeneAndNotOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+	if left.UpdateNullCount() == 0 && right.UpdateNullCount() == 0 {
+		bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+		out.Nulls = 0
+		return (AndNotOpKernel{}).Call(ctx, left, right, out)
+	}
+
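+	// word-at-a-time Kleene AND NOT: a result bit is known (valid) when left
+	// is a known false or right is a known true (either forces the result to
+	// false), or when left is true and right is false; the value bit is
+	// leftTrue & rightFalse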
+	computeWord := func(leftTrue, leftFalse, rightTrue, rightFalse uint64) (outValid, outData uint64) {
+		return leftFalse | rightTrue | (leftTrue & rightFalse), leftTrue & rightFalse
+	}
+
+	return computeKleene(computeWord, ctx, left, right, out)
+}
+
+func (KleeneAndNotOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error {
+	var (
+		leftTrue  = left.IsValid() && left.(*scalar.Boolean).Value
+		leftFalse = left.IsValid() && !left.(*scalar.Boolean).Value
+	)
+
+	switch {
+	case leftFalse:
+		bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+		out.Nulls = 0
+		bitutil.SetBitsTo(out.Buffers[1].Buf, out.Offset, out.Len, false)
+	case leftTrue:
+		if right.UpdateNullCount() == 0 {
+			bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+			out.Nulls = 0
+		} else {
+			bitutil.CopyBitmap(right.Buffers[0].Buf, int(right.Offset), int(right.Len),
+				out.Buffers[0].Buf, int(out.Offset))
+		}
+		bitutil.InvertBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+			out.Buffers[1].Buf, int(out.Offset))
+	default: // scalar was null: out[i] is valid iff right[i] was true
+		if right.UpdateNullCount() == 0 {
+			bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+				out.Buffers[0].Buf, int(out.Offset))
+		} else {
+			bitutil.BitmapAnd(right.Buffers[0].Buf, right.Buffers[1].Buf, right.Offset, right.Offset,
+				out.Buffers[0].Buf, out.Offset, right.Len)
+		}
+		bitutil.InvertBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+			out.Buffers[1].Buf, int(out.Offset))
+	}
+	return nil
+}
+
+func (KleeneAndNotOpKernel) CallScalarRight(ctx *exec.KernelCtx, left *exec.ArraySpan, right scalar.Scalar, out *exec.ExecResult) error {
+	return (KleeneAndOpKernel{}).CallScalarRight(ctx, left, invertScalar(right), out)
+}
diff --git a/go/arrow/compute/internal/kernels/types.go b/go/arrow/compute/internal/kernels/types.go
index eeae4b6c4e..073e1c608c 100644
--- a/go/arrow/compute/internal/kernels/types.go
+++ b/go/arrow/compute/internal/kernels/types.go
@@ -17,7 +17,12 @@
 package kernels
 
 import (
+	"fmt"
+
 	"github.com/apache/arrow/go/v10/arrow"
+	"github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
+	"github.com/apache/arrow/go/v10/arrow/internal/debug"
+	"github.com/apache/arrow/go/v10/arrow/scalar"
 )
 
 var (
@@ -62,3 +67,41 @@ const (
 	CmpLT
 	CmpLE
 )
+
+type simpleBinaryKernel interface {
+	Call(*exec.KernelCtx, *exec.ArraySpan, *exec.ArraySpan, *exec.ExecResult) error
+	CallScalarLeft(*exec.KernelCtx, scalar.Scalar, *exec.ArraySpan, *exec.ExecResult) error
+}
+
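+// commutativeBinaryKernel supplies CallScalarRight for kernels whose
+// operation is commutative by swapping the operands and delegating to the
+// embedding kernel's CallScalarLeft.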
+type commutativeBinaryKernel[T simpleBinaryKernel] struct{}
+
+func (commutativeBinaryKernel[T]) CallScalarRight(ctx *exec.KernelCtx, left *exec.ArraySpan, right scalar.Scalar, out *exec.ExecResult) error {
+	var t T
+	return t.CallScalarLeft(ctx, right, left, out)
+}
+
+type SimpleBinaryKernel interface {
+	simpleBinaryKernel
+	CallScalarRight(*exec.KernelCtx, *exec.ArraySpan, scalar.Scalar, *exec.ExecResult) error
+}
+
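+// SimpleBinary adapts a SimpleBinaryKernel to the kernel exec signature,
+// dispatching to Call, CallScalarRight, or CallScalarLeft depending on which
+// of the two operands are arrays; a scalar/scalar pairing is expected to be
+// resolved before execution reaches the kernel.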
+func SimpleBinary[K SimpleBinaryKernel](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error {
+	if batch.Len == 0 {
+		return nil
+	}
+
+	var k K
+	if batch.Values[0].IsArray() {
+		if batch.Values[1].IsArray() {
+			return k.Call(ctx, &batch.Values[0].Array, &batch.Values[1].Array, out)
+		}
+		return k.CallScalarRight(ctx, &batch.Values[0].Array, batch.Values[1].Scalar, out)
+	}
+
+	if batch.Values[1].IsArray() {
+		return k.CallScalarLeft(ctx, batch.Values[0].Scalar, &batch.Values[1].Array, out)
+	}
+
+	debug.Assert(false, "should be unreachable")
+	return fmt.Errorf("%w: should be unreachable", arrow.ErrInvalid)
+}
diff --git a/go/arrow/compute/registry.go b/go/arrow/compute/registry.go
index d56605f407..c28eea619a 100644
--- a/go/arrow/compute/registry.go
+++ b/go/arrow/compute/registry.go
@@ -46,6 +46,7 @@ func GetFunctionRegistry() FunctionRegistry {
 		registry = NewRegistry()
 		RegisterScalarCast(registry)
 		RegisterVectorSelection(registry)
+		RegisterScalarBoolean(registry)
 		RegisterScalarArithmetic(registry)
 	})
 	return registry
diff --git a/go/arrow/compute/scalar_bool.go b/go/arrow/compute/scalar_bool.go
new file mode 100644
index 0000000000..0a0f6afd19
--- /dev/null
+++ b/go/arrow/compute/scalar_bool.go
@@ -0,0 +1,131 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package compute
+
+import (
+	"fmt"
+
+	"github.com/apache/arrow/go/v10/arrow"
+	"github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
+	"github.com/apache/arrow/go/v10/arrow/compute/internal/kernels"
+)
+
+var (
+	andDoc = FunctionDoc{
+		Summary:     "Logical 'and' boolean values",
+		Description: "When a null is encountered in either input, a null is output.\nFor a different null behavior, see function 'and_kleene'",
+		ArgNames:    []string{"x", "y"},
+	}
+	andNotDoc = FunctionDoc{
+		Summary:     "Logical 'and not' boolean values",
+		Description: "When a null is encountered in either input, a null is output.\nFor a different null behavior, see function 'and_not_kleene'",
+		ArgNames:    []string{"x", "y"},
+	}
+	orDoc = FunctionDoc{
+		Summary:     "Logical 'or' boolean values",
+		Description: "When a null is encountered in either input, a null is output.\nFor a different null behavior, see function 'or_kleene'",
+		ArgNames:    []string{"x", "y"},
+	}
+	xorDoc = FunctionDoc{
+		Summary:     "Logical 'xor' boolean values",
+		Description: "When a null is encountered in either input, a null is output.",
+		ArgNames:    []string{"x", "y"},
+	}
+	andKleeneDoc = FunctionDoc{
+		Summary: "Logical 'and' boolean values (Kleene logic)",
+		Description: `This function behaves as follows with nulls:
+		
+		- true and null = null
+		- null and true = null
+		- false and null = false
+		- null and false = false
+		- null and null = null
+		
+		In other words, in this context, a null value really means "unknown"
+		and an unknown value "and" false is always false.
+		For a different null behavior, see function "and".`,
+		ArgNames: []string{"x", "y"},
+	}
+	andNotKleeneDoc = FunctionDoc{
+		Summary: "Logical 'and_not' boolean values (Kleene logic)",
+		Description: `This function behaves as follows with nulls:
+		
+		- true and not null = null
+		- null and not false = null
+		- false and not null = false
+		- null and not true = false
+		- null and not null = null
+		
+		In other words, in this context, a null value really means "unknown"
+		and an unknown value "and not" true is always false, as is false
+		"and not" an unknown value.
+		For a different null behavior, see function "and_not".`,
+		ArgNames: []string{"x", "y"},
+	}
+	orKleeneDoc = FunctionDoc{
+		Summary: "Logical 'or' boolean values (Kleene logic)",
+		Description: `This function behaves as follows with nulls:
+		
+		- true or null = true
+		- null or true = true
+		- false or null = null
+		- null or false = null
+		- null or null = null
+		
+		In other words, in this context, a null value really means "unknown"
+		and an unknown value "or" true is always true.
+		For a different null behavior, see function "and".`,
+		ArgNames: []string{"x", "y"},
+	}
+)
+
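+// makeFunction registers a scalar boolean function taking 'arity' boolean
+// arguments and returning boolean, panicking if the kernel cannot be added
+// or a function with the same name already exists.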
+func makeFunction(reg FunctionRegistry, name string, arity int, ex exec.ArrayKernelExec, doc FunctionDoc, nulls exec.NullHandling) {
+	fn := NewScalarFunction(name, Arity{NArgs: arity}, doc)
+
+	inTypes := make([]exec.InputType, arity)
+	for i := range inTypes {
+		inTypes[i] = exec.NewExactInput(arrow.FixedWidthTypes.Boolean)
+	}
+
+	k := exec.NewScalarKernel(inTypes, exec.NewOutputType(arrow.FixedWidthTypes.Boolean), ex, nil)
+	k.NullHandling = nulls
+
+	if err := fn.AddKernel(k); err != nil {
+		panic(err)
+	}
+
+	if !reg.AddFunction(fn, false) {
+		panic(fmt.Errorf("function '%s' already exists", name))
+	}
+}
+
+func RegisterScalarBoolean(reg FunctionRegistry) {
+	makeFunction(reg, "and", 2, kernels.SimpleBinary[kernels.AndOpKernel],
+		andDoc, exec.NullIntersection)
+	makeFunction(reg, "and_not", 2, kernels.SimpleBinary[kernels.AndNotOpKernel],
+		andNotDoc, exec.NullIntersection)
+	makeFunction(reg, "or", 2, kernels.SimpleBinary[kernels.OrOpKernel],
+		orDoc, exec.NullIntersection)
+	makeFunction(reg, "xor", 2, kernels.SimpleBinary[kernels.XorOpKernel],
+		xorDoc, exec.NullIntersection)
+	makeFunction(reg, "and_kleene", 2, kernels.SimpleBinary[kernels.KleeneAndOpKernel],
+		andKleeneDoc, exec.NullComputedPrealloc)
+	makeFunction(reg, "and_not_kleene", 2, kernels.SimpleBinary[kernels.KleeneAndNotOpKernel],
+		andNotKleeneDoc, exec.NullComputedPrealloc)
+	makeFunction(reg, "or_kleene", 2, kernels.SimpleBinary[kernels.KleeneOrOpKernel],
+		orKleeneDoc, exec.NullComputedPrealloc)
+}
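
As a reference, a minimal self-contained sketch of exercising one of the
functions registered above through the public compute API; the calls mirror
those in the test file below, and printing the resulting Datum is
illustrative only:

    package main

    import (
    	"context"
    	"fmt"
    	"strings"

    	"github.com/apache/arrow/go/v10/arrow"
    	"github.com/apache/arrow/go/v10/arrow/array"
    	"github.com/apache/arrow/go/v10/arrow/compute"
    	"github.com/apache/arrow/go/v10/arrow/memory"
    )

    func main() {
    	mem := memory.DefaultAllocator
    	left, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
    		strings.NewReader(`[true, false, null]`))
    	defer left.Release()
    	right, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
    		strings.NewReader(`[null, null, true]`))
    	defer right.Release()

    	// Kleene semantics: true or null = true; false or null = null
    	out, err := compute.CallFunction(context.Background(), "or_kleene", nil,
    		&compute.ArrayDatum{Value: left.Data()},
    		&compute.ArrayDatum{Value: right.Data()})
    	if err != nil {
    		panic(err)
    	}
    	defer out.Release()
    	fmt.Println(out) // expect values [true, null, true]
    }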
diff --git a/go/arrow/compute/scalar_bool_test.go b/go/arrow/compute/scalar_bool_test.go
new file mode 100644
index 0000000000..956118d265
--- /dev/null
+++ b/go/arrow/compute/scalar_bool_test.go
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package compute_test
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	"github.com/apache/arrow/go/v10/arrow"
+	"github.com/apache/arrow/go/v10/arrow/array"
+	"github.com/apache/arrow/go/v10/arrow/compute"
+	"github.com/apache/arrow/go/v10/arrow/memory"
+	"github.com/apache/arrow/go/v10/arrow/scalar"
+	"github.com/stretchr/testify/require"
+)
+
+func checkScalarBinary(t *testing.T, fn string, left, right, expected compute.Datum, opts compute.FunctionOptions) {
+	checkScalar(t, fn, []compute.Datum{left, right}, expected, opts)
+}
+
+func checkBooleanScalarArrayBinary(t *testing.T, ctx context.Context, funcName string, arr compute.Datum) {
+	mem := compute.GetAllocator(ctx)
+	for _, sc := range []scalar.Scalar{scalar.MakeNullScalar(arrow.FixedWidthTypes.Boolean), scalar.NewBooleanScalar(true), scalar.NewBooleanScalar(false)} {
+		constantArr, err := scalar.MakeArrayFromScalar(sc, int(arr.Len()), mem)
+		require.NoError(t, err)
+		defer constantArr.Release()
+
+		expected, err := compute.CallFunction(ctx, funcName, nil, &compute.ArrayDatum{Value: constantArr.Data()}, arr)
+		require.NoError(t, err)
+		defer expected.Release()
+
+		checkScalar(t, funcName, []compute.Datum{compute.NewDatum(sc), arr}, expected, nil)
+
+		expected, err = compute.CallFunction(ctx, funcName, nil, arr, &compute.ArrayDatum{Value: constantArr.Data()})
+		require.NoError(t, err)
+		defer expected.Release()
+		checkScalar(t, funcName, []compute.Datum{arr, compute.NewDatum(sc)}, expected, nil)
+	}
+}
+
+func TestBooleanKernels(t *testing.T) {
+	tests := []struct {
+		fn           string
+		expectedJSON string
+		commutative  bool
+	}{
+		{"and", `[true, false, null, false, null, null]`, true},
+		{"or", `[true, true, null, false, null, null]`, true},
+		{"xor", `[false, true, null, false, null, null]`, true},
+		{"and_not", `[false, true, null, false, false, null, null, null, null]`, false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.fn, func(t *testing.T) {
+			mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+			defer mem.AssertSize(t, 0)
+
+			var (
+				leftJSON  = `[true, true, true, false, false, null]`
+				rightJSON = `[true, false, null, false, null, null]`
+			)
+
+			if !tt.commutative {
+				leftJSON = `[true, true, true, false, false, false, null, null, null]`
+				rightJSON = `[true, false, null, true, false, null, true, false, null]`
+			}
+
+			left, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
+				strings.NewReader(leftJSON))
+			defer left.Release()
+			right, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
+				strings.NewReader(rightJSON))
+			defer right.Release()
+			exp, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean, strings.NewReader(tt.expectedJSON))
+			defer exp.Release()
+
+			checkScalarBinary(t, tt.fn, &compute.ArrayDatum{Value: left.Data()}, &compute.ArrayDatum{Value: right.Data()}, &compute.ArrayDatum{Value: exp.Data()}, nil)
+			ctx := compute.WithAllocator(context.Background(), mem)
+			checkBooleanScalarArrayBinary(t, ctx, tt.fn, &compute.ArrayDatum{Value: left.Data()})
+		})
+	}
+}
+
+func TestBooleanKleeneKernels(t *testing.T) {
+	tests := []struct {
+		fn           string
+		expectedJSON []string
+		commutative  bool
+	}{
+		{"and_kleene", []string{`[true, false, null, false, false, null]`, `[true, false, false, null, false]`, `[true, false, false, false]`}, true},
+		{"or_kleene", []string{`[true, true, true, false, null, null]`, `[true, true, false, true, null]`, `[true, true, false, true]`}, true},
+		{"and_not_kleene", []string{`[false, true, null, false, false, false, false, null, null]`, `[false, true, false, false]`}, false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.fn, func(t *testing.T) {
+			var (
+				leftJSON  = make([]string, len(tt.expectedJSON))
+				rightJSON = make([]string, len(tt.expectedJSON))
+			)
+
+			if tt.commutative {
+				leftJSON[0] = `[true, true, true, false, false, null]`
+				rightJSON[0] = `[true, false, null, false, null, null]`
+				leftJSON[1] = `[true, true, false, null, null]`
+				rightJSON[1] = `[true, false, false, true, false]`
+				leftJSON[2] = `[true, true, false, true]`
+				rightJSON[2] = `[true, false, false, false]`
+			} else {
+				leftJSON[0] = `[true, true, true, false, false, false, null, null, null]`
+				rightJSON[0] = `[true, false, null, true, false, null, true, false, null]`
+				leftJSON[1] = `[true, true, false, false]`
+				rightJSON[1] = `[true, false, true, false]`
+			}
+
+			for i := range tt.expectedJSON {
+				func() {
+					mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+					defer mem.AssertSize(t, 0)
+
+					left, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
+						strings.NewReader(leftJSON[i]))
+					defer left.Release()
+					right, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
+						strings.NewReader(rightJSON[i]))
+					defer right.Release()
+					exp, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean, strings.NewReader(tt.expectedJSON[i]))
+					defer exp.Release()
+
+					checkScalarBinary(t, tt.fn, &compute.ArrayDatum{Value: left.Data()}, &compute.ArrayDatum{Value: right.Data()}, &compute.ArrayDatum{Value: exp.Data()}, nil)
+					ctx := compute.WithAllocator(context.Background(), mem)
+					checkBooleanScalarArrayBinary(t, ctx, tt.fn, &compute.ArrayDatum{Value: left.Data()})
+				}()
+			}
+		})
+	}
+}
diff --git a/go/go.sum b/go/go.sum
index 04695d5559..b247b659cc 100644
--- a/go/go.sum
+++ b/go/go.sum
@@ -137,6 +137,7 @@ github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qq
 github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
 github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
 github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
+github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
 github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfFZQK844Gfx8o5WFuvpxWRwnSoipWe/p622j1v06w=
 github.com/ruudk/golang-pdf417 v0.0.0-20201230142125-a7e3863a1245/go.mod h1:pQAZKsJ8yyVxGRWYNEm9oFB8ieLgKFnamEyDmSA0BRk=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=