You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by bk...@apache.org on 2022/11/30 14:50:44 UTC

[arrow] branch feature/format-string-view updated (0eb21efb37 -> 7801b48486)

This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a change to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git


 discard 0eb21efb37 add cast to/from string_view
 discard 10a07d0e66 Extract visitation of views owning buffers
 discard 8f9792d68a wrote <=, needed >=
 discard 7435aa0821 Adding comparison and concatenation
 discard 79b5820d87 Added validation for StringView arrays
 discard 51b5fd7355 delete potentially internal viewing members for rvalues
 discard 95cdfdb0cb fixes in substrait, rename in LICENSE, owning scalars
 discard 522ccd74ce run binary data visitor tests on StringView/BinaryView
 discard df5220c4ce fix formatting
 discard 3a6f159ffc fix formatting
 discard 49457d909a implement inline visitor for StringView/BinaryView
 discard 75e4a3c8c8 add StringView/BinaryView to AllTypeIds
 discard 3f26147b6b enable JSON converter for StringView/BinaryView
 discard a38dfff766 BinaryViewBuilder: fix duplicate values in null bitmap
 discard 9261dd836b Draft basic scaffolding for Binary/StringView types and get compiling
     add 8f43a9fce9 ARROW-18322: [Python] Add PYARROW_WITH_FLIGHT to PyArrow C++ cmake (#14642)
     add c2f4671e6b ARROW-18326: [Go] Add option to support dictionary deltas with IPC (#14639)
     add d82e4c1a7c ARROW-18336: [Release][Docs] Don't update versions not in major release (#14653)
     add 610c7279b9 ARROW-18278: [Java] Adjust path in Maven generate-libs-jni-macos-linux (#14623)
     add 877776eb3c ARROW-18289: [Release][vcpkg] Add a script to update vcpkg's arrow port (#14610)
     add f269d5049d ARROW-18287: [C++][CMake] Add support for Brotli/utf8proc provided by vcpkg (#14609)
     add 50aa770f2a ARROW-18259: [C++][CMake] Add support for system Thrift CMake package (#14597)
     add ae554331e7 ARROW-18233: [Release][JS] don't install yarn to system (#14577)
     add b4a8320890 ARROW-18335: [CI][Release][JS] Use Node.js 16 as workaround (#14652)
     add e38141421d ARROW-18309: [Go] Fix delta bit packing decode panic (#14649)
     add 5f8cc745b1 ARROW-18321: [R] Add tests for binary_slice kernel (#14647)
     add e0e7ba824f ARROW-18235: [C++][Gandiva] Fix the like function implementation for escape chars (#14579)
     add 470e5b9f7c ARROW-18223: [Release][Homebrew] Detect reverse dependencies automatically (#14566)
     add 74459443fe ARROW-18222: [Release][MSYS2] Detect reverse dependencies automatically (#14565)
     add 0f87e6bf89 ARROW-18121: [Release][CI] Use Ubuntu 22.04 for verifying binaries (#14470)
     add b5b0282516 ARROW-15538: [C++] Expanding coverage of math functions from Substrait to Acero (#14434)
     add 917f70b8b5 ARROW-16817: [C++] Test ORC writer errors with invalid types (#14638)
     add 6697826746 ARROW-18120: [Release][Dev] Automate running binaries/wheels verifications (#14469)
     add 84c9ac73be ARROW-18342: [C++] AsofJoinNode support for Boolean data field (#14658)
     add c7a4ee78a3 ARROW-18348: [CI][Release][Yum] redhat-rpm-config is needed on AlmaLinux 9 (#14661)
     add 501b7997d4 ARROW-18256: [C++][Windows] Use IMPORTED_IMPLIB for external shared Thrift (#14595)
     add e9222ae00b ARROW-18332: [Go] Cast Dictionary types to value type (#14650)
     add 31dca2b262 MINOR: [R] Simplify compare_dplyr_binding test helper (#14676)
     add 5b368d2265 ARROW-18323: Enabling issue templates in GitHub issues (#14675)
     add b9dd41607c ARROW-18110: [Go] Scalar Comparisons (#14669)
     add 3718d8a3f9 ARROW-18343: [C++] Remove AllocateBitmap() with out parameter (#14657)
     add 42caa23b00 ARROW-18374: [Go][CI][Benchmarking] Fix Go Bench Script after Conbench change (#14689)
     add 7198676ac4 ARROW-18374: [Go][CI][Benchmarking] Fix Go benchmark github info (#14691)
     add c9293039b5 ARROW-17610: [C++] Support additional source types in SourceNode (#14207)
     add fd2595c3ce ARROW-18303: [Go] Allow easy compute module importing (#14690)
     add cf66f4882d ARROW-18366: [Packaging][RPM][Gandiva] Fix link error on AlmaLinux 9 (#14680)
     add 57b81cac8a ARROW-18225: [Python] Fully support filesystem in parquet.write_metadata (#14574)
     add 3cc982e519 ARROW-18341: [Doc][Python] Update note about bundling Arrow C++ on Windows (#14660)
     add f769f6b323 ARROW-18173: [Python] Drop older versions of Pandas (<1.0) (#14631)
     add b1110ae377 ARROW-17989: [C++][Python] Enable struct_field kernel to accept string field names (#14495)
     add 16ef5a8394 ARROW-18379: [Python] Change warnings to _warnings in _plasma_store_entry_point (#14695)
     add 7ae4705c62 ARROW-17985: [C++][Python] Improve s3fs error message when wrong region (#14601)
     add 1a9b1e8591 MINOR: [CI][C++] Bump aws-sdk-version on conda jobs (#14668)
     add 21309eaaeb ARROW-4709: [C++] Optimize for ordered JSON fields (#14100)
     add 59f99d2ca7 ARROW-18382: [C++] Set ADDRESS_SANITIZER in fuzzing builds (#14702)
     add 1e9eb61587 ARROW-18340: [Python] PyArrow C++ header files no longer always included in installed pyarrow (#14656)
     add 767f203dd1 MINOR: [C++] Fix CMake deps for minimal Flight benchmark build (#14700)
     add 945bcf6429 MINOR: [Go][CI] Shift test data to arrow-testing (#14706)
     add 62829c5ca8 ARROW-18360: [Python] Don't crash when schema=None in FlightClient.do_put (#14698)
     add ad54d6ca3a ARROW-18350: [C++] Use std::to_chars instead of std::to_string (#14666)
     add 1121bbcf2e ARROW-18111: [Go] Remaining scalar binary arithmetic (shifts, power, bitwise) (#14703)
     add b1f65ea44b ARROW-18383: [C++] Avoid global variables for thread pools and at-fork handlers (#14704)
     add 7a47e8dced ARROW-16673: [Java] Integrate C Data into allocator hierarchy (#14506)
     add ade42669e4 ARROW-18392: [Python] Fix test_s3fs_wrong_region; set anonymous=True (#14716)
     add 9a2aef7d9e ARROW-18397: [C++] Clear S3 region resolver client at S3 shutdown (#14718)
     add b4817115e2 MINOR: [Developer] Add triage users (#14719)
     add fb29effbb6 ARROW-18389: [CI][Python] Update nightly test-conda-python-3.7-pandas-0.24 to pandas >= 1.0 (#14714)
     add ada7e23959 MINOR: [Release] Add RC number and version to verify release candidate success message (#14664)
     add 94e45faf6a ARROW-17859: [C++] Use self-pipe in signal-receiving StopSource (#14250)
     add 25ca62ae0d ARROW-18373: Fix component drop-down, add license text (#14688)
     add c33bdabb63 MINOR: [Archery] Add collaborators to list of roles with access to trigger bot tasks (#14727)
     add 405b54ee35 ARROW-18292: [Release][Python] Upload .wheel/.tar.gz for release not RC (#14708)
     add 7276c359e8 ARROW-18384: [Release][MSYS2] Show pull request title (#14709)
     add 63f013cdb3 ARROW-17966: [C++] Adjust to new format for Substrait optional arguments (#14415)
     add 2078af7c71 ARROW-17836: [C++] Allow specifying alignment of buffers (#14225)
     add 8a93741349 ARROW-18390: [CI][Python] Update spark test modules to match spark master (#14715)
     add c0b311ee83 ARROW-18361: [CI][Conan] Merge upstream changes (#14671)
     add 409a95ddc2 ARROW-18406: [C++] Can't build Arrow with Substrait on Ubuntu 20.04 (#14735)
     add 4afe71030c ARROW-17887: [R][Doc] Improve readability of the Get Started and README pages (#14514)
     add 15a3b054f9 ARROW-18113: [C++] Add RandomAccessFile::ReadManyAsync (#14723)
     add 479941a9de ARROW-18334: [C++] Handle potential non-commutativity by rebinding (#14659)
     add 21649ae845 ARROW-18410: [Packaging][Ubuntu] Add support for Ubuntu 22.10 (#14740)
     add 1e418c33cb ARROW-18409: [GLib][Plasma] Suppress deprecated warning in building plasma-glib (#14739)
     add be023c1576 ARROW-18405: [Ruby] Avoid rebuilding chunked arrays in Arrow::Table.new (#14738)
     add d77ced27a0 ARROW-18407: [Release][Website] Use UTC for release date (#14737)
     add a594e38fad GH-14761: [Dev] Update labels on PR labeler to use new Component ones (#14762)
     add fde7b937c8 GH-14745: [R] {rlang} dependency must be at least version 1.0.0 because of check_dots_empty (#14744)
     add b1bcd6f3f1 ARROW-18380: [Dev] Update dev_pr GitHub workflows to accept both GitHub issues and JIRA (#14731)
     add ccb68afedf MINOR: Quick fix to the labeler for CPP files. (#14768)
     add 0f66b71486 MINOR: [R] Fix URLs in vignettes (#14770)
     add 3b0e1357d3 ARROW-18412: [C++][R] Windows build fails because of missing ChunkResolver symbols (#14774)
     add 2e9611a8e6 ARROW-18237: [Java] Extend Table code (#14573)
     new 8e1c1442fe Draft basic scaffolding for Binary/StringView types and get compiling
     new 20d89459f1 BinaryViewBuilder: fix duplicate values in null bitmap
     new 84666ed537 enable JSON converter for StringView/BinaryView
     new 5c8a6ecb8c add StringView/BinaryView to AllTypeIds
     new 2aaccd1b8c implement inline visitor for StringView/BinaryView
     new 864a74c81f fix formatting
     new 2f619ba07d fix formatting
     new 3d6a30a09d run binary data visitor tests on StringView/BinaryView
     new 94fcb95927 fixes in substrait, rename in LICENSE, owning scalars
     new 34efa83a8b delete potentially internal viewing members for rvalues
     new 7474342cf2 Added validation for StringView arrays
     new 04893f65e9 Adding comparison and concatenation
     new 4072a6b2ab wrote <=, needed >=
     new 21be7a9a25 Extract visitation of views owning buffers
     new 7801b48486 add cast to/from string_view

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version.  This situation occurs
when a user force-pushes a change (git push --force) and generates a repository
containing something like this:

 * -- * -- B -- O -- O -- O   (0eb21efb37)
            \
             N -- N -- N   refs/heads/feature/format-string-view (7801b48486)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

The 15 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .asf.yaml                                          |     4 +
 .github/ISSUE_TEMPLATE/bug_report.yaml             |    64 +
 .github/ISSUE_TEMPLATE/config.yml                  |     5 +-
 .github/ISSUE_TEMPLATE/feature_request.yaml        |    67 +
 .github/ISSUE_TEMPLATE/question.md                 |    26 -
 .github/ISSUE_TEMPLATE/usage_question.yaml         |    87 +
 .github/workflows/dev_pr.yml                       |     5 +-
 .github/workflows/dev_pr/helpers.js                |    64 +-
 .../dev_pr/{jira_check.js => issue_check.js}       |    91 +-
 .github/workflows/dev_pr/labeler.yml               |    28 +-
 .github/workflows/dev_pr/link.js                   |    71 +-
 .github/workflows/dev_pr/title_check.js            |     7 +-
 .github/workflows/dev_pr/title_check.md            |    13 +-
 .github/workflows/python.yml                       |     8 +-
 c_glib/plasma-glib/meson.build                     |     1 +
 ci/conan/all/conandata.yml                         |   113 +-
 ci/conan/all/conanfile.py                          |   406 +-
 ci/conan/all/patches/1.0.0-0001-cmake.patch        |   114 -
 ci/conan/all/patches/1.0.0-0002-jemalloc.patch     |    65 -
 .../patches/1.0.0-0005-fix-make12-namespace.patch  |    44 +
 ci/conan/all/patches/1.0.0-0006-fix-cmake.patch    |   355 +
 ...atch => 10.0.0-0001-mallctl-takes-size_t.patch} |    22 +-
 ci/conan/all/patches/10.0.0-0002-fix-cmake.patch   |   333 +
 ci/conan/all/patches/2.0.0-0001-cmake.patch        |    41 -
 ci/conan/all/patches/2.0.0-0002-jemalloc.patch     |    65 -
 .../all/patches/2.0.0-0006-gandiva-llvm-re2.patch  |   100 -
 .../all/patches/2.0.0-0007-fix-protoc-cmake.patch  |    33 -
 ci/conan/all/patches/2.0.0-0008-fix-cmake.patch    |   295 +
 ci/conan/all/patches/7.0.0-0001-cmake.patch        |    35 -
 ci/conan/all/patches/7.0.0-0002-jemalloc.patch     |    48 -
 .../patches/7.0.0-0004-remove-find-modules.patch   |    22 -
 .../all/patches/7.0.0-0005-use-find-package.patch  |   440 -
 ci/conan/all/patches/7.0.0-0007-fix-cmake.patch    |   369 +
 ci/conan/all/patches/8.0.0-0002-jemalloc.patch     |    48 -
 .../all/patches/8.0.0-0004-use-find-package.patch  |   401 -
 ci/conan/all/patches/8.0.0-0006-fix-cmake.patch    |   447 +
 ci/conan/all/test_package/CMakeLists.txt           |    16 +-
 ci/conan/all/test_package/conanfile.py             |    22 +-
 ci/conan/all/{ => test_v1_package}/CMakeLists.txt  |    10 +-
 .../{test_package => test_v1_package}/conanfile.py |    11 +-
 ci/conan/config.yml                                |     2 +
 ci/conan/merge_status.sh                           |     2 +-
 ci/conda_env_cpp.txt                               |     2 +-
 ci/scripts/go_bench_adapt.py                       |     7 +-
 ci/scripts/go_test.sh                              |     2 +-
 ci/scripts/integration_spark.sh                    |    39 +-
 cpp/CMakeLists.txt                                 |     9 +-
 cpp/cmake_modules/BuildUtils.cmake                 |     2 +-
 .../{FindBrotli.cmake => FindBrotliAlt.cmake}      |    43 +-
 .../{FindThrift.cmake => FindThriftAlt.cmake}      |   118 +-
 .../{Findjemalloc.cmake => FindjemallocAlt.cmake}  |    70 +-
 cpp/cmake_modules/Findutf8proc.cmake               |    18 +
 cpp/cmake_modules/ThirdpartyToolchain.cmake        |    21 +-
 cpp/src/arrow/adapters/orc/adapter_test.cc         |    16 +
 cpp/src/arrow/adapters/orc/util.cc                 |     4 +-
 cpp/src/arrow/array/builder_adaptive.cc            |    12 +-
 cpp/src/arrow/array/builder_adaptive.h             |    16 +-
 cpp/src/arrow/array/builder_base.h                 |     4 +-
 cpp/src/arrow/array/builder_binary.cc              |     6 +-
 cpp/src/arrow/array/builder_binary.h               |    22 +-
 cpp/src/arrow/array/builder_decimal.cc             |     8 +-
 cpp/src/arrow/array/builder_decimal.h              |     6 +-
 cpp/src/arrow/array/builder_dict.h                 |    49 +-
 cpp/src/arrow/array/builder_nested.h               |    12 +-
 cpp/src/arrow/array/builder_primitive.cc           |     9 +-
 cpp/src/arrow/array/builder_primitive.h            |    27 +-
 cpp/src/arrow/array/builder_time.h                 |    20 +-
 cpp/src/arrow/array/builder_union.cc               |     7 +-
 cpp/src/arrow/array/builder_union.h                |    24 +-
 cpp/src/arrow/buffer.cc                            |     8 +-
 cpp/src/arrow/buffer.h                             |    13 +-
 cpp/src/arrow/buffer_builder.h                     |    28 +-
 cpp/src/arrow/buffer_test.cc                       |    40 +
 cpp/src/arrow/builder.cc                           |     8 +-
 cpp/src/arrow/c/bridge.cc                          |    17 +-
 cpp/src/arrow/chunk_resolver.h                     |     2 +-
 cpp/src/arrow/compute/api_scalar.cc                |    12 +-
 cpp/src/arrow/compute/api_scalar.h                 |    16 +-
 cpp/src/arrow/compute/exec/aggregate.cc            |     6 +-
 cpp/src/arrow/compute/exec/asof_join_node.cc       |    19 +-
 cpp/src/arrow/compute/exec/asof_join_node_test.cc  |    22 +-
 cpp/src/arrow/compute/exec/exec_plan.cc            |     4 +-
 cpp/src/arrow/compute/exec/expression.cc           |    93 +-
 cpp/src/arrow/compute/exec/expression_test.cc      |   172 +-
 cpp/src/arrow/compute/exec/options.h               |    51 +
 cpp/src/arrow/compute/exec/plan_test.cc            |    79 +
 cpp/src/arrow/compute/exec/source_node.cc          |   136 +
 cpp/src/arrow/compute/exec/test_util.cc            |    40 +-
 cpp/src/arrow/compute/exec/test_util.h             |    28 +-
 cpp/src/arrow/compute/exec/union_node.cc           |     3 +-
 .../compute/kernels/base_arithmetic_internal.h     |     8 +
 cpp/src/arrow/compute/kernels/scalar_arithmetic.cc |     9 +
 .../compute/kernels/scalar_arithmetic_test.cc      |    94 +-
 cpp/src/arrow/compute/kernels/scalar_nested.cc     |    35 +-
 .../arrow/compute/kernels/scalar_nested_test.cc    |    46 +-
 cpp/src/arrow/dataset/dataset_writer.cc            |     7 +-
 cpp/src/arrow/datum.h                              |    21 +-
 cpp/src/arrow/datum_test.cc                        |    25 +
 .../arrow/engine/substrait/expression_internal.cc  |    73 +-
 cpp/src/arrow/engine/substrait/extension_set.cc    |   179 +-
 cpp/src/arrow/engine/substrait/extension_set.h     |    31 +-
 cpp/src/arrow/engine/substrait/function_test.cc    |   204 +-
 cpp/src/arrow/engine/substrait/plan_internal.cc    |    20 +
 .../arrow/engine/substrait/relation_internal.cc    |     4 +-
 cpp/src/arrow/engine/substrait/serde.cc            |    10 +
 cpp/src/arrow/engine/substrait/serde_test.cc       |    93 +-
 .../arrow/engine/substrait/test_plan_builder.cc    |    30 +-
 cpp/src/arrow/engine/substrait/test_plan_builder.h |     2 +
 cpp/src/arrow/filesystem/gcsfs_internal.cc         |    10 +-
 cpp/src/arrow/filesystem/s3_internal.h             |    36 +-
 cpp/src/arrow/filesystem/s3fs.cc                   |   193 +-
 cpp/src/arrow/filesystem/s3fs.h                    |     8 +
 cpp/src/arrow/flight/CMakeLists.txt                |     4 +-
 cpp/src/arrow/flight/cookie_internal.cc            |     5 +-
 cpp/src/arrow/flight/sql/column_metadata.cc        |    10 +-
 .../arrow/flight/transport/grpc/util_internal.cc   |     6 +-
 cpp/src/arrow/flight/transport/ucx/ucx_internal.cc |    10 +-
 cpp/src/arrow/flight/transport/ucx/ucx_server.cc   |    10 +-
 .../arrow/flight/transport/ucx/util_internal.cc    |     6 +-
 cpp/src/arrow/io/file_test.cc                      |    25 +-
 cpp/src/arrow/io/interfaces.cc                     |    24 +-
 cpp/src/arrow/io/interfaces.h                      |    21 +
 cpp/src/arrow/ipc/metadata_internal.cc             |     5 +-
 cpp/src/arrow/ipc/writer.cc                        |     4 +
 cpp/src/arrow/json/parser.cc                       |   138 +-
 cpp/src/arrow/json/parser_test.cc                  |    33 +
 cpp/src/arrow/memory_pool.cc                       |   158 +-
 cpp/src/arrow/memory_pool.h                        |    39 +-
 cpp/src/arrow/memory_pool_internal.h               |     9 +-
 cpp/src/arrow/memory_pool_jemalloc.cc              |    21 +-
 cpp/src/arrow/memory_pool_test.cc                  |     4 +-
 cpp/src/arrow/memory_pool_test.h                   |    17 +
 cpp/src/arrow/pretty_print.cc                      |     3 +-
 cpp/src/arrow/scalar.cc                            |     9 +
 cpp/src/arrow/scalar.h                             |    27 +-
 cpp/src/arrow/scalar_test.cc                       |    25 +
 cpp/src/arrow/stl_allocator.h                      |    11 +-
 cpp/src/arrow/testing/random.cc                    |   324 +-
 cpp/src/arrow/testing/random.h                     |   226 +-
 cpp/src/arrow/testing/random_test.cc               |    56 +
 cpp/src/arrow/type.cc                              |    85 +-
 cpp/src/arrow/type.h                               |     3 +
 cpp/src/arrow/type_fwd.h                           |     2 +
 cpp/src/arrow/type_test.cc                         |    52 +-
 cpp/src/arrow/util/atfork_internal.cc              |   164 +-
 cpp/src/arrow/util/cancel.cc                       |   190 +-
 cpp/src/arrow/util/cancel.h                        |     5 +
 cpp/src/arrow/util/cancel_test.cc                  |    68 +
 cpp/src/arrow/util/formatting.h                    |     3 +-
 cpp/src/arrow/util/int_util.cc                     |    14 +-
 cpp/src/arrow/util/io_util.cc                      |    76 +-
 cpp/src/arrow/util/io_util_test.cc                 |    31 +
 cpp/src/arrow/util/string.h                        |    70 +
 cpp/src/arrow/util/string_test.cc                  |    63 +
 cpp/src/arrow/util/thread_pool.cc                  |     1 +
 cpp/src/gandiva/regex_functions_holder.cc          |     9 +-
 cpp/src/gandiva/regex_functions_holder_test.cc     |    13 +-
 cpp/src/gandiva/tests/filter_test.cc               |    33 +
 cpp/src/parquet/arrow/schema.cc                    |     3 +-
 cpp/thirdparty/versions.txt                        |     4 +-
 dev/archery/archery/bot.py                         |     2 +-
 dev/archery/archery/crossbow/cli.py                |     5 -
 dev/archery/archery/crossbow/core.py               |     5 +-
 .../release/07-binary-verify.sh                    |    33 +-
 dev/release/binary-task.rb                         |     1 +
 dev/release/post-03-website.sh                     |     1 +
 dev/release/post-09-python.sh                      |    27 +-
 dev/release/post-11-bump-versions-test.rb          |    21 -
 dev/release/post-12-msys2.sh                       |    13 +-
 dev/release/post-13-homebrew.sh                    |    15 +-
 dev/release/{post-12-msys2.sh => post-14-vcpkg.sh} |    56 +-
 dev/release/rat_exclude_files.txt                  |     3 +-
 dev/release/utils-prepare.sh                       |    41 +-
 dev/release/verify-release-candidate.sh            |    28 +-
 dev/release/verify-yum.sh                          |     1 +
 .../apt/ubuntu-kinetic/Dockerfile                  |    32 +-
 .../apache-arrow/apt/ubuntu-kinetic-arm64/from     |     2 +-
 .../apache-arrow/apt/ubuntu-kinetic/Dockerfile     |    87 +
 .../apache-arrow/yum/almalinux-9/Dockerfile        |     3 +
 dev/tasks/linux-packages/package-task.rb           |     2 +
 dev/tasks/linux-packages/yum/build.sh              |     3 +
 dev/tasks/tasks.yml                                |     5 +-
 dev/tasks/verify-rc/github.linux.amd64.yml         |     2 +-
 docs/source/cpp/compute.rst                        |    76 +-
 docs/source/developers/java/building.rst           |    11 +-
 docs/source/developers/python.rst                  |    22 +-
 docs/source/developers/release.rst                 |     7 +-
 docs/source/python/install.rst                     |    15 +
 go/arrow/compute/arithmetic.go                     |   135 +
 go/arrow/compute/arithmetic_test.go                |   279 +-
 go/arrow/compute/cast.go                           |    44 +
 go/arrow/compute/cast_test.go                      |    51 +-
 go/arrow/compute/datum.go                          |    38 +-
 go/arrow/compute/datumkind_string.go               |     2 +
 go/arrow/compute/doc.go                            |    18 +-
 go/arrow/compute/exec.go                           |     2 +
 go/arrow/compute/exec_internals_test.go            |     2 +
 go/arrow/compute/exec_test.go                      |     2 +
 go/arrow/compute/executor.go                       |     3 +
 go/arrow/compute/expression.go                     |    76 +-
 go/arrow/compute/expression_test.go                |     2 +
 go/arrow/compute/funckind_string.go                |     2 +
 go/arrow/compute/functions.go                      |     2 +
 go/arrow/compute/functions_test.go                 |     2 +
 go/arrow/compute/go.mod                            |    51 -
 go/arrow/compute/go.sum                            |    76 -
 go/arrow/compute/internal/exec/hash_util.go        |     4 -
 go/arrow/compute/internal/exec/kernel.go           |     4 +
 go/arrow/compute/internal/exec/kernel_test.go      |     2 +
 go/arrow/compute/internal/exec/span.go             |     2 +
 go/arrow/compute/internal/exec/span_test.go        |     2 +
 go/arrow/compute/internal/exec/utils.go            |    93 +-
 go/arrow/compute/internal/exec/utils_test.go       |     2 +
 go/arrow/compute/internal/kernels/Makefile         |    17 +-
 .../internal/kernels/_lib/base_arithmetic.cc       |     9 +-
 .../kernels/_lib/base_arithmetic_avx2_amd64.s      |    34 +-
 .../kernels/_lib/base_arithmetic_sse4_amd64.s      |    34 +-
 .../internal/kernels/_lib/scalar_comparison.cc     |   241 +
 .../kernels/_lib/scalar_comparison_avx2_amd64.s    | 67763 +++++++++++++++++++
 .../kernels/_lib/scalar_comparison_sse4_amd64.s    | 59819 ++++++++++++++++
 .../compute/internal/kernels/base_arithmetic.go    |   126 +-
 .../internal/kernels/base_arithmetic_amd64.go      |     8 +-
 .../internal/kernels/base_arithmetic_avx2_amd64.go |     2 +-
 .../internal/kernels/base_arithmetic_avx2_amd64.s  |    34 +-
 .../internal/kernels/base_arithmetic_sse4_amd64.go |     2 +-
 .../internal/kernels/base_arithmetic_sse4_amd64.s  |    34 +-
 .../internal/kernels/basic_arithmetic_noasm.go     |     2 +-
 go/arrow/compute/internal/kernels/boolean_cast.go  |     2 +
 go/arrow/compute/internal/kernels/cast.go          |     4 +-
 go/arrow/compute/internal/kernels/cast_numeric.go  |     2 +
 .../compute/internal/kernels/cast_numeric_amd64.go |     2 +-
 .../internal/kernels/cast_numeric_avx2_amd64.go    |     2 +-
 .../internal/kernels/cast_numeric_avx2_amd64.s     |     2 +-
 .../internal/kernels/cast_numeric_neon_arm64.go    |     2 +-
 .../internal/kernels/cast_numeric_neon_arm64.s     |     2 +-
 .../internal/kernels/cast_numeric_sse4_amd64.go    |     2 +-
 .../internal/kernels/cast_numeric_sse4_amd64.s     |     2 +-
 go/arrow/compute/internal/kernels/cast_temporal.go |     2 +
 .../internal/kernels/compareoperator_string.go     |    30 +
 .../compute/internal/kernels/constant_factor.go    |     2 +
 .../internal/kernels/constant_factor_amd64.go      |     2 +-
 .../internal/kernels/constant_factor_avx2_amd64.go |     2 +-
 .../internal/kernels/constant_factor_avx2_amd64.s  |     2 +-
 .../internal/kernels/constant_factor_sse4_amd64.go |     2 +-
 .../internal/kernels/constant_factor_sse4_amd64.s  |     2 +-
 .../arrow/compute/internal/kernels/doc.go          |    11 +-
 go/arrow/compute/internal/kernels/helpers.go       |   187 +-
 go/arrow/compute/internal/kernels/numeric_cast.go  |     2 +
 .../compute/internal/kernels/scalar_arithmetic.go  |   208 +
 .../compute/internal/kernels/scalar_boolean.go     |     2 +
 .../internal/kernels/scalar_comparison_amd64.go    |   110 +
 .../kernels/scalar_comparison_avx2_amd64.go        |   109 +
 .../kernels/scalar_comparison_avx2_amd64.s         | 67310 ++++++++++++++++++
 .../scalar_comparison_noasm.go}                    |    13 +-
 .../kernels/scalar_comparison_sse4_amd64.go        |   109 +
 .../kernels/scalar_comparison_sse4_amd64.s         | 58288 ++++++++++++++++
 .../compute/internal/kernels/scalar_comparisons.go |   701 +
 go/arrow/compute/internal/kernels/string_casts.go  |     2 +
 go/arrow/compute/internal/kernels/types.go         |    21 +-
 .../compute/internal/kernels/vector_selection.go   |     2 +
 go/arrow/compute/no_exec.go                        |    45 -
 go/arrow/compute/registry.go                       |     3 +
 go/arrow/compute/registry_test.go                  |     2 +
 go/arrow/compute/scalar_bool.go                    |     2 +
 go/arrow/compute/scalar_bool_test.go               |     2 +
 go/arrow/compute/scalar_compare.go                 |   137 +
 go/arrow/compute/scalar_compare_test.go            |  1489 +
 go/arrow/compute/selection.go                      |     2 +
 go/arrow/compute/utils.go                          |    84 +
 go/arrow/compute/vector_selection_test.go          |     2 +
 go/arrow/decimal128/decimal128.go                  |     6 +-
 go/arrow/decimal256/decimal256.go                  |     9 +-
 go/arrow/doc.go                                    |     4 +
 go/arrow/internal/testing/gen/random_array_gen.go  |    42 +-
 go/arrow/ipc/ipc.go                                |     8 +
 go/arrow/ipc/ipc_test.go                           |    64 +-
 go/arrow/ipc/writer.go                             |    22 +-
 go/arrow/scalar/parse.go                           |    83 +
 go/arrow/scalar/scalar.go                          |    18 +-
 go/go.mod                                          |     2 +-
 go/go.sum                                          |   148 -
 go/internal/bitutils/bitmap_generate.go            |     2 +
 go/parquet/doc.go                                  |     3 +
 go/parquet/internal/encoding/delta_bit_packing.go  |    10 +-
 go/parquet/internal/encoding/encoding_test.go      |    61 +
 java/.gitignore                                    |     2 -
 .../java/org/apache/arrow/c/ArrayImporter.java     |    57 +-
 .../main/java/org/apache/arrow/c/ArrowArray.java   |     6 +
 .../apache/arrow/c/BufferImportTypeVisitor.java    |   322 +
 .../org/apache/arrow/c/CDataReferenceManager.java  |   124 -
 .../apache/arrow/c/ReferenceCountedArrowArray.java |    74 +
 .../org/apache/arrow/c/ArrowArrayUtilityTest.java  |   147 +
 .../java/org/apache/arrow/c/RoundtripTest.java     |    14 +-
 java/dataset/src/main/cpp/jni_util.cc              |    26 +-
 java/dataset/src/main/cpp/jni_util.h               |    11 +-
 .../org/apache/arrow/memory/AllocationManager.java |    55 +-
 .../org/apache/arrow/memory/BufferAllocator.java   |    31 +
 .../org/apache/arrow/memory/ForeignAllocation.java |    58 +
 .../arrow/memory/ForeignAllocationManager.java     |    45 +
 .../org/apache/arrow/memory/ReferenceManager.java  |    15 +-
 .../apache/arrow/memory/TestForeignAllocation.java |    72 +
 java/pom.xml                                       |     4 +-
 .../java/org/apache/arrow/vector/table/Row.java    |    99 +-
 .../java/org/apache/arrow/vector/table/Table.java  |     3 +-
 .../apache/arrow/vector/table/BaseTableTest.java   |     1 -
 .../org/apache/arrow/vector/table/TestUtils.java   |    58 +-
 python/pyarrow/__init__.py                         |     2 +-
 python/pyarrow/_compute.pyx                        |    39 +-
 python/pyarrow/_flight.pyx                         |     2 +-
 python/pyarrow/feather.py                          |     9 -
 python/pyarrow/includes/libarrow.pxd               |     6 +
 python/pyarrow/pandas-shim.pxi                     |    27 +-
 python/pyarrow/pandas_compat.py                    |     3 +-
 python/pyarrow/parquet/core.py                     |    24 +-
 python/pyarrow/src/CMakeLists.txt                  |     2 +-
 python/pyarrow/tests/parquet/test_dataset.py       |    10 +-
 python/pyarrow/tests/parquet/test_metadata.py      |    33 +
 python/pyarrow/tests/parquet/test_pandas.py        |     5 -
 python/pyarrow/tests/test_compute.py               |   114 +-
 python/pyarrow/tests/test_cpp_internals.py         |    17 +
 python/pyarrow/tests/test_flight.py                |    10 +
 python/pyarrow/tests/test_fs.py                    |    24 +
 python/pyarrow/tests/test_orc.py                   |     1 -
 python/pyarrow/tests/test_pandas.py                |   106 +-
 python/pyarrow/tests/test_schema.py                |     4 +-
 python/pyarrow/tests/test_substrait.py             |     5 +
 python/setup.py                                    |    18 +-
 r/.Rbuildignore                                    |     1 +
 r/DESCRIPTION                                      |     3 +-
 r/R/arrow-info.R                                   |     6 +-
 r/R/dataset.R                                      |     3 +-
 r/R/dplyr-funcs-string.R                           |    13 +-
 r/R/install-arrow.R                                |    10 +-
 r/R/json.R                                         |     2 +-
 r/R/parquet.R                                      |     3 +-
 r/README.md                                        |   355 +-
 r/STYLE.md                                         |     2 +-
 r/_pkgdown.yml                                     |   137 +-
 r/man/arrow-package.Rd                             |     2 +
 r/man/arrow_info.Rd                                |     6 +-
 r/man/install_arrow.Rd                             |    10 +-
 r/man/open_dataset.Rd                              |     3 +-
 r/man/read_json_arrow.Rd                           |     2 +-
 r/man/write_parquet.Rd                             |     3 +-
 r/pkgdown/favicon/apple-touch-icon-120x120.png     |   Bin 0 -> 4317 bytes
 r/pkgdown/favicon/apple-touch-icon-152x152.png     |   Bin 0 -> 5620 bytes
 r/pkgdown/favicon/apple-touch-icon-180x180.png     |   Bin 0 -> 6910 bytes
 r/pkgdown/favicon/apple-touch-icon-60x60.png       |   Bin 0 -> 2101 bytes
 r/pkgdown/favicon/apple-touch-icon-76x76.png       |   Bin 0 -> 2609 bytes
 r/pkgdown/favicon/apple-touch-icon.png             |   Bin 0 -> 6910 bytes
 r/pkgdown/favicon/favicon-16x16.png                |   Bin 0 -> 733 bytes
 r/pkgdown/favicon/favicon-32x32.png                |   Bin 0 -> 1236 bytes
 r/pkgdown/favicon/favicon.ico                      |   Bin 0 -> 15086 bytes
 r/src/memorypool.cpp                               |    18 +-
 r/tests/testthat/_snaps/dataset-write.md           |    30 +-
 r/tests/testthat/_snaps/dplyr-glimpse.md           |     6 +-
 r/tests/testthat/_snaps/dplyr-join.md              |    25 +-
 r/tests/testthat/helper-expectation.R              |    65 +-
 r/tests/testthat/test-compute-vector.R             |    21 +
 r/tests/testthat/test-dplyr-funcs-string.R         |    27 +-
 r/tools/nixlibs.R                                  |     8 +-
 r/vignettes/.gitignore                             |     1 +
 r/vignettes/array_indexing.png                     |   Bin 0 -> 31330 bytes
 r/vignettes/arrow.Rmd                              |   368 +-
 r/vignettes/data_objects.Rmd                       |   380 +
 r/vignettes/data_types.Rmd                         |   342 +
 r/vignettes/data_types.png                         |   Bin 0 -> 79799 bytes
 r/vignettes/data_wrangling.Rmd                     |   172 +
 r/vignettes/dataset.Rmd                            |   256 +-
 r/vignettes/developers/array_layout_integer.png    |   Bin 0 -> 26688 bytes
 r/vignettes/developers/array_layout_string.png     |   Bin 0 -> 19264 bytes
 r/vignettes/developers/chunked_array_layout.png    |   Bin 0 -> 26458 bytes
 r/vignettes/developers/data_object_layout.Rmd      |   179 +
 r/vignettes/developers/debugging.Rmd               |     9 +-
 r/vignettes/developers/docker.Rmd                  |     7 +-
 r/vignettes/developers/install_details.Rmd         |     7 +-
 r/vignettes/developers/record_batch_layout.png     |   Bin 0 -> 42465 bytes
 r/vignettes/developers/setup.Rmd                   |    15 +-
 r/vignettes/developers/table_layout.png            |   Bin 0 -> 47905 bytes
 r/vignettes/developers/workflow.Rmd                |     8 +-
 .../{bindings.Rmd => writing_bindings.Rmd}         |    10 +-
 r/vignettes/developing.Rmd                         |    75 +-
 r/vignettes/flight.Rmd                             |    54 +-
 r/vignettes/fs.Rmd                                 |   185 +-
 r/vignettes/install.Rmd                            |   267 +-
 r/vignettes/install_nightly.Rmd                    |    55 +
 r/vignettes/metadata.Rmd                           |    82 +
 r/vignettes/python.Rmd                             |   199 +-
 r/vignettes/read_write.Rmd                         |   163 +
 r/vignettes/record_batch.png                       |   Bin 0 -> 37226 bytes
 r/vignettes/table.png                              |   Bin 0 -> 50878 bytes
 ruby/red-arrow/lib/arrow/array.rb                  |     4 +
 ruby/red-arrow/lib/arrow/chunked-array.rb          |    12 +
 ruby/red-arrow/lib/arrow/raw-table-converter.rb    |    12 +-
 ruby/red-arrow/lib/arrow/tensor.rb                 |     4 +
 ruby/red-arrow/test/test-table.rb                  |    22 +-
 testing                                            |     2 +-
 397 files changed, 267923 insertions(+), 5371 deletions(-)
 create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yaml
 create mode 100644 .github/ISSUE_TEMPLATE/feature_request.yaml
 delete mode 100644 .github/ISSUE_TEMPLATE/question.md
 create mode 100644 .github/ISSUE_TEMPLATE/usage_question.yaml
 rename .github/workflows/dev_pr/{jira_check.js => issue_check.js} (51%)
 delete mode 100644 ci/conan/all/patches/1.0.0-0001-cmake.patch
 delete mode 100644 ci/conan/all/patches/1.0.0-0002-jemalloc.patch
 create mode 100644 ci/conan/all/patches/1.0.0-0005-fix-make12-namespace.patch
 create mode 100644 ci/conan/all/patches/1.0.0-0006-fix-cmake.patch
 rename ci/conan/all/patches/{8.0.0-0001-cmake.patch => 10.0.0-0001-mallctl-takes-size_t.patch} (65%)
 create mode 100644 ci/conan/all/patches/10.0.0-0002-fix-cmake.patch
 delete mode 100644 ci/conan/all/patches/2.0.0-0001-cmake.patch
 delete mode 100644 ci/conan/all/patches/2.0.0-0002-jemalloc.patch
 delete mode 100644 ci/conan/all/patches/2.0.0-0006-gandiva-llvm-re2.patch
 delete mode 100644 ci/conan/all/patches/2.0.0-0007-fix-protoc-cmake.patch
 create mode 100644 ci/conan/all/patches/2.0.0-0008-fix-cmake.patch
 delete mode 100644 ci/conan/all/patches/7.0.0-0001-cmake.patch
 delete mode 100644 ci/conan/all/patches/7.0.0-0002-jemalloc.patch
 delete mode 100644 ci/conan/all/patches/7.0.0-0004-remove-find-modules.patch
 delete mode 100644 ci/conan/all/patches/7.0.0-0005-use-find-package.patch
 create mode 100644 ci/conan/all/patches/7.0.0-0007-fix-cmake.patch
 delete mode 100644 ci/conan/all/patches/8.0.0-0002-jemalloc.patch
 delete mode 100644 ci/conan/all/patches/8.0.0-0004-use-find-package.patch
 create mode 100644 ci/conan/all/patches/8.0.0-0006-fix-cmake.patch
 rename ci/conan/all/{ => test_v1_package}/CMakeLists.txt (83%)
 copy ci/conan/all/{test_package => test_v1_package}/conanfile.py (83%)
 rename cpp/cmake_modules/{FindBrotli.cmake => FindBrotliAlt.cmake} (79%)
 rename cpp/cmake_modules/{FindThrift.cmake => FindThriftAlt.cmake} (56%)
 rename cpp/cmake_modules/{Findjemalloc.cmake => FindjemallocAlt.cmake} (56%)
 copy python/pyarrow/tests/test_cpp_internals.py => dev/release/07-binary-verify.sh (64%)
 mode change 100644 => 100755
 copy dev/release/{post-12-msys2.sh => post-14-vcpkg.sh} (59%)
 copy python/pyarrow/tests/test_cpp_internals.py => dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-kinetic/Dockerfile (61%)
 copy ci/conan/merge_status.sh => dev/tasks/linux-packages/apache-arrow/apt/ubuntu-kinetic-arm64/from (93%)
 create mode 100644 dev/tasks/linux-packages/apache-arrow/apt/ubuntu-kinetic/Dockerfile
 delete mode 100644 go/arrow/compute/go.mod
 delete mode 100644 go/arrow/compute/go.sum
 create mode 100644 go/arrow/compute/internal/kernels/_lib/scalar_comparison.cc
 create mode 100644 go/arrow/compute/internal/kernels/_lib/scalar_comparison_avx2_amd64.s
 create mode 100644 go/arrow/compute/internal/kernels/_lib/scalar_comparison_sse4_amd64.s
 create mode 100644 go/arrow/compute/internal/kernels/compareoperator_string.go
 rename go.work => go/arrow/compute/internal/kernels/doc.go (83%)
 create mode 100644 go/arrow/compute/internal/kernels/scalar_comparison_amd64.go
 create mode 100644 go/arrow/compute/internal/kernels/scalar_comparison_avx2_amd64.go
 create mode 100644 go/arrow/compute/internal/kernels/scalar_comparison_avx2_amd64.s
 copy go/arrow/compute/internal/{exec/hash_util.go => kernels/scalar_comparison_noasm.go} (77%)
 create mode 100644 go/arrow/compute/internal/kernels/scalar_comparison_sse4_amd64.go
 create mode 100644 go/arrow/compute/internal/kernels/scalar_comparison_sse4_amd64.s
 create mode 100644 go/arrow/compute/internal/kernels/scalar_comparisons.go
 delete mode 100644 go/arrow/compute/no_exec.go
 create mode 100644 go/arrow/compute/scalar_compare.go
 create mode 100644 go/arrow/compute/scalar_compare_test.go
 create mode 100644 java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java
 delete mode 100644 java/c/src/main/java/org/apache/arrow/c/CDataReferenceManager.java
 create mode 100644 java/c/src/main/java/org/apache/arrow/c/ReferenceCountedArrowArray.java
 create mode 100644 java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java
 create mode 100644 java/memory/memory-core/src/main/java/org/apache/arrow/memory/ForeignAllocation.java
 create mode 100644 java/memory/memory-core/src/main/java/org/apache/arrow/memory/ForeignAllocationManager.java
 create mode 100644 java/memory/memory-netty/src/test/java/org/apache/arrow/memory/TestForeignAllocation.java
 create mode 100644 r/pkgdown/favicon/apple-touch-icon-120x120.png
 create mode 100644 r/pkgdown/favicon/apple-touch-icon-152x152.png
 create mode 100644 r/pkgdown/favicon/apple-touch-icon-180x180.png
 create mode 100644 r/pkgdown/favicon/apple-touch-icon-60x60.png
 create mode 100644 r/pkgdown/favicon/apple-touch-icon-76x76.png
 create mode 100644 r/pkgdown/favicon/apple-touch-icon.png
 create mode 100644 r/pkgdown/favicon/favicon-16x16.png
 create mode 100644 r/pkgdown/favicon/favicon-32x32.png
 create mode 100644 r/pkgdown/favicon/favicon.ico
 create mode 100644 r/vignettes/.gitignore
 create mode 100644 r/vignettes/array_indexing.png
 create mode 100644 r/vignettes/data_objects.Rmd
 create mode 100644 r/vignettes/data_types.Rmd
 create mode 100644 r/vignettes/data_types.png
 create mode 100644 r/vignettes/data_wrangling.Rmd
 create mode 100644 r/vignettes/developers/array_layout_integer.png
 create mode 100644 r/vignettes/developers/array_layout_string.png
 create mode 100644 r/vignettes/developers/chunked_array_layout.png
 create mode 100644 r/vignettes/developers/data_object_layout.Rmd
 create mode 100644 r/vignettes/developers/record_batch_layout.png
 create mode 100644 r/vignettes/developers/table_layout.png
 rename r/vignettes/developers/{bindings.Rmd => writing_bindings.Rmd} (97%)
 create mode 100644 r/vignettes/install_nightly.Rmd
 create mode 100644 r/vignettes/metadata.Rmd
 create mode 100644 r/vignettes/read_write.Rmd
 create mode 100644 r/vignettes/record_batch.png
 create mode 100644 r/vignettes/table.png


[arrow] 14/15: Extract visitation of views owning buffers

Posted by bk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 21be7a9a25c59ba89709ccada71d771d1a2cd36b
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Mon Nov 28 14:09:59 2022 -0500

    Extract visitation of views owning buffers
---
 cpp/src/arrow/array/array_binary_test.cc |   8 +
 cpp/src/arrow/array/array_test.cc        |  52 +++----
 cpp/src/arrow/array/validate.cc          | 242 ++++++++++++++++++-------------
 cpp/src/arrow/buffer.h                   |   8 +
 cpp/src/arrow/testing/random.cc          |  10 ++
 5 files changed, 190 insertions(+), 130 deletions(-)

diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc
index 92fc16f775..f21abf681f 100644
--- a/cpp/src/arrow/array/array_binary_test.cc
+++ b/cpp/src/arrow/array/array_binary_test.cc
@@ -389,6 +389,14 @@ TEST(StringViewArray, Validate) {
                   .ValidateFull(),
               Ok());
 
+  // overlapping views and buffers are allowed
+  EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}),
+                         StringHeader(std::string_view{*buffer_s}.substr(5, 5)),
+                         StringHeader(std::string_view{*buffer_s}.substr(9, 4))},
+                        {buffer_s, SliceBuffer(buffer_s, 1, 1), SliceBuffer(buffer_s, 3, 6)})
+                  .ValidateFull(),
+              Ok());
+
   EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}),
                          // if a view points outside the buffers, that is invalid
                          StringHeader("from a galaxy far, far away"),
diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index c14d4f21ac..36b274a99a 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -663,32 +663,32 @@ TEST_F(TestArray, TestMakeEmptyArray) {
   FieldVector union_fields2({field("a", null()), field("b", list(large_utf8()))});
   std::vector<int8_t> union_type_codes{7, 42};
 
-  std::shared_ptr<DataType> types[] = {null(),
-                                       boolean(),
-                                       int8(),
-                                       uint16(),
-                                       int32(),
-                                       uint64(),
-                                       float64(),
-                                       binary(),
-                                       large_binary(),
-                                       fixed_size_binary(3),
-                                       decimal(16, 4),
-                                       utf8(),
-                                       large_utf8(),
-                                       list(utf8()),
-                                       list(int64()),
-                                       large_list(large_utf8()),
-                                       fixed_size_list(utf8(), 3),
-                                       fixed_size_list(int64(), 4),
-                                       dictionary(int32(), utf8()),
-                                       struct_({field("a", utf8()), field("b", int32())}),
-                                       sparse_union(union_fields1, union_type_codes),
-                                       sparse_union(union_fields2, union_type_codes),
-                                       dense_union(union_fields1, union_type_codes),
-                                       dense_union(union_fields2, union_type_codes)};
-
-  for (auto type : types) {
+  for (auto type : {null(),
+                    boolean(),
+                    int8(),
+                    uint16(),
+                    int32(),
+                    uint64(),
+                    float64(),
+                    binary(),
+                    binary_view(),
+                    large_binary(),
+                    fixed_size_binary(3),
+                    decimal(16, 4),
+                    utf8(),
+                    utf8_view(),
+                    large_utf8(),
+                    list(utf8()),
+                    list(int64()),
+                    large_list(large_utf8()),
+                    fixed_size_list(utf8(), 3),
+                    fixed_size_list(int64(), 4),
+                    dictionary(int32(), utf8()),
+                    struct_({field("a", utf8()), field("b", int32())}),
+                    sparse_union(union_fields1, union_type_codes),
+                    sparse_union(union_fields2, union_type_codes),
+                    dense_union(union_fields1, union_type_codes),
+                    dense_union(union_fields2, union_type_codes)}) {
     ARROW_SCOPED_TRACE("type = ", type->ToString());
     ASSERT_OK_AND_ASSIGN(auto array, MakeEmptyArray(type));
     ASSERT_OK(array->ValidateFull());
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index 53836efd97..2b83c84b28 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -30,13 +30,141 @@
 #include "arrow/util/decimal.h"
 #include "arrow/util/int_util_overflow.h"
 #include "arrow/util/logging.h"
+#include "arrow/util/sort.h"
+#include "arrow/util/string.h"
 #include "arrow/util/unreachable.h"
 #include "arrow/util/utf8.h"
 #include "arrow/visit_data_inline.h"
 #include "arrow/visit_type_inline.h"
 
-namespace arrow {
-namespace internal {
+namespace arrow::internal {
+
+/// visitor will be called once for each non-inlined StringHeader.
+/// It will be passed the index of each non-inlined StringHeader,
+/// as well as a `const shared_ptr<Buffer>&` of the buffer
+/// wherein the viewed memory resides, or nullptr if the viewed memory
+/// is not in a buffer managed by this array.
+template <typename Visitor>
+Status VisitNonInlinedViewsAndOwningBuffers(const ArrayData& data,
+                                            const Visitor& visitor) {
+  auto* headers = data.buffers[1]->data_as<StringHeader>();
+
+  static const std::shared_ptr<Buffer> kNullBuffer = nullptr;
+
+  if (data.buffers.size() == 2 ||
+      (data.buffers.size() == 3 && data.buffers.back() == nullptr)) {
+    // there are no character buffers, just visit a null buffer
+    for (int64_t i = 0; i < data.length; ++i) {
+      if (headers[i].IsInline()) continue;
+      RETURN_NOT_OK(visitor(i, kNullBuffer));
+    }
+    return Status::OK();
+  }
+
+  auto IsSubrangeOf = [](std::string_view super, StringHeader sub) {
+    return super.data() <= sub.data() &&
+           super.data() + super.size() >= sub.data() + sub.size();
+  };
+
+  std::vector<std::string_view> buffers;
+  std::vector<const std::shared_ptr<Buffer>*> owning_buffers;
+  for (auto it = data.buffers.begin() + 2; it != data.buffers.end(); ++it) {
+    if (*it != nullptr) {
+      buffers.emplace_back(**it);
+      owning_buffers.push_back(&*it);
+    }
+  }
+
+  const int not_found = static_cast<int>(buffers.size());
+
+  auto DoVisit = [&](auto get_buffer) {
+    DCHECK(!buffers.empty());
+
+    // owning_buffers[not_found] points to the null placeholder
+    owning_buffers.push_back(&kNullBuffer);
+
+    std::string_view buffer_containing_previous_view = buffers.front();
+    int buffer_i = 0;
+
+    for (int64_t i = 0; i < data.length; ++i) {
+      if (headers[i].IsInline()) continue;
+
+      if (ARROW_PREDICT_TRUE(IsSubrangeOf(buffer_containing_previous_view, headers[i]))) {
+        // Fast path: for most string view arrays, we'll have runs
+        // of views into the same buffer.
+      } else {
+        buffer_i = get_buffer(headers[i]);
+        if (buffer_i != not_found) {
+          // if we didn't find a buffer which owns headers[i], we can hope
+          // that there was just one out of line string and check
+          // buffer_containing_previous_view next iteration
+          buffer_containing_previous_view = buffers[buffer_i];
+        }
+      }
+
+      RETURN_NOT_OK(visitor(i, *owning_buffers[buffer_i]));
+    }
+    return Status::OK();
+  };
+
+  // Simplest check for view-in-buffer: loop through buffers and check each one.
+  auto Linear = [&](StringHeader view) {
+    int i = 0;
+    for (std::string_view buffer : buffers) {
+      if (IsSubrangeOf(buffer, view)) return i;
+      ++i;
+    }
+    return not_found;
+  };
+
+  if (buffers.size() <= 32) {
+    // If there are few buffers to search through, sorting/binary search is not
+    // worthwhile. TODO(bkietz) benchmark this and get a less magic number here.
+    return DoVisit(Linear);
+  }
+
+  auto DataPtrLess = [](std::string_view l, std::string_view r) {
+    return l.data() < r.data();
+  };
+
+  {
+    auto sort_indices = ArgSort(buffers, DataPtrLess);
+    Permute(sort_indices, &buffers);
+    Permute(sort_indices, &owning_buffers);
+  }
+
+  bool non_overlapping =
+      buffers.end() !=
+      std::adjacent_find(buffers.begin(), buffers.end(),
+                         [](std::string_view before, std::string_view after) {
+                           return before.data() + before.size() <= after.data();
+                         });
+  if (ARROW_PREDICT_FALSE(!non_overlapping)) {
+    // Using a binary search with overlapping buffers would not *uniquely* identify
+    // a potentially-containing buffer. Moreover this should be a fairly rare case
+    // so optimizing for it seems premature.
+    return DoVisit(Linear);
+  }
+
+  // More sophisticated check for view-in-buffer: binary search through the buffers.
+  return DoVisit([&](StringHeader view) {
+    // Find the first buffer whose data starts after the data in view-
+    // only buffers *before* this could contain view. Since we've additionally
+    // checked that the buffers do not overlap, only the buffer *immediately before*
+    // this could contain view.
+    int one_past_potential_super =
+        static_cast<int>(std::upper_bound(buffers.begin(), buffers.end(),
+                                          std::string_view{view}, DataPtrLess) -
+                         buffers.begin());
+
+    if (one_past_potential_super == 0) return not_found;
+
+    int i = one_past_potential_super - 1;
+    if (IsSubrangeOf(buffers[i], view)) return i;
+
+    return not_found;
+  });
+}
 
 namespace {
 
@@ -610,109 +738,16 @@ struct ValidateArrayImpl {
       return Status::OK();
     }
 
-    auto* headers = data.GetValues<StringHeader>(1);
-    std::string_view buffer_containing_previous_view;
-
-    auto IsSubrangeOf = [](std::string_view super, std::string_view sub) {
-      return super.data() <= sub.data() &&
-             super.data() + super.size() >= sub.data() + sub.size();
-    };
-
-    std::vector<std::string_view> buffers;
-    for (auto it = data.buffers.begin() + 2; it != data.buffers.end(); ++it) {
-      buffers.emplace_back(**it);
-    }
-
-    auto CheckViews = [&](auto in_a_buffer, auto check_previous_buffer) {
-      if constexpr (check_previous_buffer) {
-        buffer_containing_previous_view = buffers.front();
-      }
-
-      for (int64_t i = 0; i < data.length; ++i) {
-        if (headers[i].IsInline()) continue;
-
-        std::string_view view{headers[i]};
-
-        if constexpr (check_previous_buffer) {
-          if (ARROW_PREDICT_TRUE(IsSubrangeOf(buffer_containing_previous_view, view))) {
-            // Fast path: for most string view arrays, we'll have runs
-            // of views into the same buffer.
-            continue;
-          }
-        }
+    return VisitNonInlinedViewsAndOwningBuffers(
+        data, [&](int64_t i, const std::shared_ptr<Buffer>& owner) {
+          if (ARROW_PREDICT_TRUE(owner != nullptr)) return Status::OK();
 
-        if (!in_a_buffer(view)) {
+          auto* ptr = data.buffers[1]->data_as<StringHeader>()[i].data();
           return Status::Invalid(
-              "String view at slot ", i, " @", (std::uintptr_t)view.data(),
+              "String view at slot ", i, " @",
+              arrow::HexEncode(reinterpret_cast<uint8_t*>(&ptr), sizeof(ptr)),
               " views memory not resident in any buffer managed by the array");
-        }
-      }
-      return Status::OK();
-    };
-
-    if (buffers.empty()) {
-      // there are no character buffers; the only way this array
-      // can be valid is if all views are inline
-      return CheckViews([](std::string_view) { return std::false_type{}; },
-                        /*check_previous_buffer=*/std::false_type{});
-    }
-
-    // Simplest check for view-in-buffer: loop through buffers and check each one.
-    auto Linear = [&](std::string_view view) {
-      for (std::string_view buffer : buffers) {
-        if (IsSubrangeOf(buffer, view)) {
-          buffer_containing_previous_view = buffer;
-          return true;
-        }
-      }
-      return false;
-    };
-
-    if (buffers.size() <= 32) {
-      // If there are few buffers to search through, sorting/binary search is not
-      // worthwhile. TODO(bkietz) benchmark this and get a less magic number here.
-      return CheckViews(Linear,
-                        /*check_previous_buffer=*/std::true_type{});
-    }
-
-    auto DataPtrLess = [](std::string_view l, std::string_view r) {
-      return l.data() < r.data();
-    };
-
-    std::sort(buffers.begin(), buffers.end(), DataPtrLess);
-    bool non_overlapping =
-        buffers.end() !=
-        std::adjacent_find(buffers.begin(), buffers.end(),
-                           [](std::string_view before, std::string_view after) {
-                             return before.data() + before.size() <= after.data();
-                           });
-    if (ARROW_PREDICT_FALSE(!non_overlapping)) {
-      // Using a binary search with overlapping buffers would not *uniquely* identify
-      // a potentially-containing buffer. Moreover this should be a fairly rare case
-      // so optimizing for it seems premature.
-      return CheckViews(Linear,
-                        /*check_previous_buffer=*/std::true_type{});
-    }
-
-    // More sophisticated check for view-in-buffer: binary search through the buffers.
-    return CheckViews(
-        [&](std::string_view view) {
-          // Find the first buffer whose data starts after the data in view-
-          // only buffers *before* this could contain view. Since we've additionally
-          // checked that the buffers do not overlap, only the buffer *immediately before*
-          // this could contain view.
-          auto one_past_potential_super =
-              std::upper_bound(buffers.begin(), buffers.end(), view, DataPtrLess);
-
-          if (one_past_potential_super == buffers.begin()) return false;
-
-          auto potential_super = *(one_past_potential_super - 1);
-          if (!IsSubrangeOf(potential_super, view)) return false;
-
-          buffer_containing_previous_view = potential_super;
-          return true;
-        },
-        /*check_previous_buffer=*/std::true_type{});
+        });
   }
 
   template <typename ListType>
@@ -863,5 +898,4 @@ Status ValidateUTF8(const ArrayData& data) {
 ARROW_EXPORT
 Status ValidateUTF8(const Array& array) { return ValidateUTF8(*array.data()); }
 
-}  // namespace internal
-}  // namespace arrow
+}  // namespace arrow::internal
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index 27b1a1edac..85135df1a3 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -182,6 +182,10 @@ class ARROW_EXPORT Buffer {
 #endif
     return ARROW_PREDICT_TRUE(is_cpu_) ? data_ : NULLPTR;
   }
+  template <typename T>
+  const T* data_as() const {
+    return reinterpret_cast<const T*>(data());
+  }
 
   /// \brief Return a writable pointer to the buffer's data
   ///
@@ -199,6 +203,10 @@ class ARROW_EXPORT Buffer {
     return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast<uint8_t*>(data_)
                                                       : NULLPTR;
   }
+  template <typename T>
+  T* mutable_data_as() {
+    return reinterpret_cast<T*>(mutable_data());
+  }
 
   /// \brief Return the device address of the buffer's data
   uintptr_t address() const { return reinterpret_cast<uintptr_t>(data_); }
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index e45e296ff6..137a5c031a 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -828,6 +828,16 @@ std::shared_ptr<Array> RandomArrayGenerator::ArrayOf(const Field& field, int64_t
                   ->View(field.type());
     }
 
+    case Type::type::STRING_VIEW:
+    case Type::type::BINARY_VIEW: {
+      const auto min_length =
+          GetMetadata<int32_t>(field.metadata().get(), "min_length", 0);
+      const auto max_length =
+          GetMetadata<int32_t>(field.metadata().get(), "max_length", 20);
+      return *StringView(length, min_length, max_length, null_probability)
+                  ->View(field.type());
+    }
+
     case Type::type::DECIMAL128:
       return Decimal128(field.type(), length, null_probability, alignment, memory_pool);
 


[arrow] 15/15: add cast to/from string_view

Posted by bk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 7801b484864619a1ef0c7a4ebd8b0d27bfe231d3
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Tue Nov 29 16:02:29 2022 -0500

    add cast to/from string_view
---
 cpp/src/arrow/array/data.cc                        |  13 +-
 cpp/src/arrow/array/data.h                         |  21 +-
 cpp/src/arrow/array/validate.cc                    |  62 +++--
 cpp/src/arrow/compute/exec.cc                      |   3 +
 .../arrow/compute/kernels/scalar_cast_internal.cc  |  12 +-
 .../arrow/compute/kernels/scalar_cast_internal.h   |   3 -
 .../arrow/compute/kernels/scalar_cast_numeric.cc   |   4 +-
 .../arrow/compute/kernels/scalar_cast_string.cc    | 277 ++++++++++++++-------
 cpp/src/arrow/compute/kernels/scalar_cast_test.cc  |  66 +++--
 9 files changed, 300 insertions(+), 161 deletions(-)

diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc
index 0cfa9fcd2e..eb8249077d 100644
--- a/cpp/src/arrow/array/data.cc
+++ b/cpp/src/arrow/array/data.cc
@@ -145,7 +145,7 @@ void ArraySpan::SetMembers(const ArrayData& data) {
   }
   this->offset = data.offset;
 
-  for (int i = 0; i < static_cast<int>(data.buffers.size()); ++i) {
+  for (int i = 0; i < std::min(static_cast<int>(data.buffers.size()), 3); ++i) {
     const std::shared_ptr<Buffer>& buffer = data.buffers[i];
     // It is the invoker-of-kernels's responsibility to ensure that
     // const buffers are not written to accidentally.
@@ -291,6 +291,17 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
     }
     this->buffers[2].data = const_cast<uint8_t*>(data_buffer);
     this->buffers[2].size = data_size;
+  } else if (type_id == Type::BINARY_VIEW || type_id == Type::STRING_VIEW) {
+    const auto& scalar = checked_cast<const BaseBinaryScalar&>(value);
+    this->buffers[1].data = reinterpret_cast<uint8_t*>(this->scratch_space);
+    if (scalar.is_valid) {
+      *reinterpret_cast<StringHeader*>(this->buffers[1].data) = {scalar.value->data(),
+                                                                 scalar.value->size()};
+      this->buffers[2].data = const_cast<uint8_t*>(scalar.value->data());
+      this->buffers[2].size = scalar.value->size();
+    } else {
+      *reinterpret_cast<StringHeader*>(this->buffers[1].data) = {};
+    }
   } else if (type_id == Type::FIXED_SIZE_BINARY) {
     const auto& scalar = checked_cast<const BaseBinaryScalar&>(value);
     this->buffers[1].data = const_cast<uint8_t*>(scalar.value->data());
diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h
index e024483f66..b9991bd959 100644
--- a/cpp/src/arrow/array/data.h
+++ b/cpp/src/arrow/array/data.h
@@ -356,10 +356,10 @@ struct ARROW_EXPORT ArraySpan {
   void SetSlice(int64_t offset, int64_t length) {
     this->offset = offset;
     this->length = length;
-    if (this->type->id() != Type::NA) {
-      this->null_count = kUnknownNullCount;
-    } else {
+    if (this->type->id() == Type::NA) {
       this->null_count = this->length;
+    } else if (this->MayHaveNulls()) {
+      this->null_count = kUnknownNullCount;
     }
   }
 
@@ -375,6 +375,21 @@ struct ARROW_EXPORT ArraySpan {
 
 namespace internal {
 
+template <typename F>
+Status VisitSlices(ArraySpan input, int64_t slice_size, const F& f) {
+  int64_t num_slices = input.length / slice_size;
+  int64_t trailing_slice_size = input.length % slice_size;
+  int64_t offset = input.offset;
+
+  for (int64_t i = 0; i < num_slices; ++i) {
+    input.SetSlice(offset, slice_size);
+    ARROW_RETURN_NOT_OK(f(input));
+    offset += slice_size;
+  }
+  input.SetSlice(offset, trailing_slice_size);
+  return f(input);
+}
+
 void FillZeroLengthArray(const DataType* type, ArraySpan* span);
 
 /// Construct a zero-copy view of this ArrayData with the given type.
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index 2b83c84b28..4b10951724 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -171,39 +171,29 @@ namespace {
 struct UTF8DataValidator {
   const ArrayData& data;
 
-  Status Visit(const DataType&) { Unreachable("utf-8 validation of non string type"); }
-
-  Status Visit(const StringViewType&) {
-    util::InitializeUTF8();
-
-    const auto* values = data.GetValues<StringHeader>(1);
-    for (int64_t i = 0; i < data.length; ++i) {
-      if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(
-              reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size()))) {
-        return Status::Invalid("Invalid UTF8 sequence at string index ", i);
-      }
+  template <typename T>
+  Status Visit(const T&) {
+    if constexpr (std::is_same_v<T, StringType> || std::is_same_v<T, LargeStringType> ||
+                  std::is_same_v<T, StringViewType>) {
+      util::InitializeUTF8();
+
+      int64_t i = 0;
+      return VisitArraySpanInline<T>(
+          data,
+          [&](std::string_view v) {
+            if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) {
+              return Status::Invalid("Invalid UTF8 sequence at string index ", i);
+            }
+            ++i;
+            return Status::OK();
+          },
+          [&]() {
+            ++i;
+            return Status::OK();
+          });
+    } else {
+      Unreachable("utf-8 validation of non string type");
     }
-    return Status::OK();
-  }
-
-  template <typename StringType>
-  enable_if_string<StringType, Status> Visit(const StringType&) {
-    util::InitializeUTF8();
-
-    int64_t i = 0;
-    return VisitArraySpanInline<StringType>(
-        data,
-        [&](std::string_view v) {
-          if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) {
-            return Status::Invalid("Invalid UTF8 sequence at string index ", i);
-          }
-          ++i;
-          return Status::OK();
-        },
-        [&]() {
-          ++i;
-          return Status::OK();
-        });
   }
 };
 
@@ -304,6 +294,14 @@ struct ValidateArrayImpl {
     return Status::OK();
   }
 
+  Status Visit(const StringViewType& type) {
+    RETURN_NOT_OK(ValidateBinaryView(type));
+    if (full_validation) {
+      RETURN_NOT_OK(ValidateUTF8(data));
+    }
+    return Status::OK();
+  }
+
   Status Visit(const Date64Type& type) {
     RETURN_NOT_OK(ValidateFixedWidthBuffers());
 
diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc
index 8d3dcf0f2c..3b868cc716 100644
--- a/cpp/src/arrow/compute/exec.cc
+++ b/cpp/src/arrow/compute/exec.cc
@@ -232,6 +232,11 @@ void ComputeDataPreallocate(const DataType& type,
     case Type::LARGE_LIST:
       widths->emplace_back(64, /*added_length=*/1);
       return;
+    case Type::BINARY_VIEW:
+    case Type::STRING_VIEW:
+      // preallocate one StringHeader per element; 8 * sizeof converts bytes to bits
+      widths->emplace_back(8 * sizeof(StringHeader));
+      return;
     default:
       break;
   }
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
index 27a86135a6..8f5467ee84 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
@@ -170,12 +170,6 @@ Status CastFromNull(KernelContext* ctx, const ExecSpan& batch, ExecResult* out)
   return Status::OK();
 }
 
-Result<TypeHolder> ResolveOutputFromOptions(KernelContext* ctx,
-                                            const std::vector<TypeHolder>&) {
-  const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
-  return options.to_type;
-}
-
 /// You will see some of kernels with
 ///
 /// kOutputTargetType
@@ -184,8 +178,10 @@ Result<TypeHolder> ResolveOutputFromOptions(KernelContext* ctx,
 /// easiest initial way to get the requested cast type including the TimeUnit
 /// to the kernel (which is needed to compute the output) was through
 /// CastOptions
-
-OutputType kOutputTargetType(ResolveOutputFromOptions);
+OutputType kOutputTargetType([](KernelContext* ctx,
+                                const std::vector<TypeHolder>&) -> Result<TypeHolder> {
+  return CastState::Get(ctx).to_type;
+});
 
 Status ZeroCopyCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
   // TODO(wesm): alternative strategy for zero copy casts after ARROW-16576
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.h b/cpp/src/arrow/compute/kernels/scalar_cast_internal.h
index 4d9afab199..bd8f41ea9f 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.h
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.h
@@ -71,9 +71,6 @@ void AddZeroCopyCast(Type::type in_type_id, InputType in_type, OutputType out_ty
                      CastFunction* func);
 
 // OutputType::Resolver that returns a type the type from CastOptions
-Result<TypeHolder> ResolveOutputFromOptions(KernelContext* ctx,
-                                            const std::vector<TypeHolder>& args);
-
 ARROW_EXPORT extern OutputType kOutputTargetType;
 
 // Add generic casts to out_ty from:
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
index 00c7cacf9c..e68b08c804 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
@@ -726,7 +726,7 @@ std::shared_ptr<CastFunction> GetCastToFloating(std::string name) {
 }
 
 std::shared_ptr<CastFunction> GetCastToDecimal128() {
-  OutputType sig_out_ty(ResolveOutputFromOptions);
+  OutputType sig_out_ty = kOutputTargetType;
 
   auto func = std::make_shared<CastFunction>("cast_decimal", Type::DECIMAL128);
   AddCommonCasts(Type::DECIMAL128, sig_out_ty, func.get());
@@ -761,7 +761,7 @@ std::shared_ptr<CastFunction> GetCastToDecimal128() {
 }
 
 std::shared_ptr<CastFunction> GetCastToDecimal256() {
-  OutputType sig_out_ty(ResolveOutputFromOptions);
+  OutputType sig_out_ty = kOutputTargetType;
 
   auto func = std::make_shared<CastFunction>("cast_decimal256", Type::DECIMAL256);
   AddCommonCasts(Type::DECIMAL256, sig_out_ty, func.get());
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
index 44e233f98c..68ba6268ae 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
@@ -25,8 +25,10 @@
 #include "arrow/compute/kernels/scalar_cast_internal.h"
 #include "arrow/compute/kernels/temporal_internal.h"
 #include "arrow/result.h"
+#include "arrow/util/cpu_info.h"
 #include "arrow/util/formatting.h"
 #include "arrow/util/int_util.h"
+#include "arrow/util/unreachable.h"
 #include "arrow/util/utf8_internal.h"
 #include "arrow/visit_data_inline.h"
 
@@ -284,107 +286,192 @@ Status CastBinaryToBinaryOffsets<int64_t, int32_t>(KernelContext* ctx,
 }
 
 template <typename O, typename I>
-enable_if_base_binary<I, Status> BinaryToBinaryCastExec(KernelContext* ctx,
-                                                        const ExecSpan& batch,
-                                                        ExecResult* out) {
-  const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
+Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch,
+                              ExecResult* out) {
   const ArraySpan& input = batch[0].array;
 
-  if (!I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8) {
+  // This presupposes that one was created in the invocation layer
+  ArrayData* output = out->array_data().get();
+  output->SetNullCount(input.null_count);
+
+  const auto& options = CastState::Get(ctx);
+  bool check_utf8 = !I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8;
+  if (check_utf8) {
     InitializeUTF8();
-    ArraySpanVisitor<I> visitor;
-    Utf8Validator validator;
-    RETURN_NOT_OK(visitor.Visit(input, &validator));
   }
 
-  // Start with a zero-copy cast, but change indices to expected size
-  RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
-  return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
-      ctx, input, out->array_data().get());
-}
+  auto SimpleUtf8Validation = [&] {
+    if (check_utf8) {
+      Utf8Validator validator;
+      return ArraySpanVisitor<I>::Visit(input, &validator);
+    }
+    return Status::OK();
+  };
 
-template <typename O, typename I>
-enable_if_t<std::is_same<I, FixedSizeBinaryType>::value &&
-                !std::is_same<O, FixedSizeBinaryType>::value,
-            Status>
-BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
-  const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
-  const ArraySpan& input = batch[0].array;
+  constexpr bool kInputOffsets =
+      std::is_base_of_v<BinaryType, I> || std::is_base_of_v<LargeBinaryType, I>;
 
-  if (O::is_utf8 && !options.allow_invalid_utf8) {
-    InitializeUTF8();
-    ArraySpanVisitor<I> visitor;
-    Utf8Validator validator;
-    RETURN_NOT_OK(visitor.Visit(input, &validator));
+  constexpr bool kInputViews = std::is_base_of_v<BinaryViewType, I>;
+
+  constexpr bool kInputFixed = std::is_same_v<FixedSizeBinaryType, I>;
+
+  constexpr bool kOutputOffsets =
+      std::is_base_of_v<BinaryType, O> || std::is_base_of_v<LargeBinaryType, O>;
+
+  constexpr bool kOutputViews = std::is_base_of_v<BinaryViewType, O>;
+
+  constexpr bool kOutputFixed = std::is_same_v<FixedSizeBinaryType, O>;
+
+  if constexpr (kInputOffsets && kOutputOffsets) {
+    // FIXME(bkietz) this discards preallocated storage. It seems preferable to me to
+    // allocate a new null bitmap if necessary than to always allocate new offsets.
+    // Start with a zero-copy cast, but change indices to expected size
+    RETURN_NOT_OK(SimpleUtf8Validation());
+    RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+    return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
+        ctx, input, out->array_data().get());
   }
 
-  // Check for overflow
-  using output_offset_type = typename O::offset_type;
-  constexpr output_offset_type kMaxOffset =
-      std::numeric_limits<output_offset_type>::max();
-  const int32_t width = input.type->byte_width();
-  const int64_t max_offset = width * input.length;
-  if (max_offset > kMaxOffset) {
-    return Status::Invalid("Failed casting from ", input.type->ToString(), " to ",
-                           out->type()->ToString(), ": input array too large");
+  if constexpr (kInputViews && kOutputViews) {
+    return SimpleUtf8Validation() & ZeroCopyCastExec(ctx, batch, out);
   }
 
-  // This presupposes that one was created in the invocation layer
-  ArrayData* output = out->array_data().get();
+  if constexpr (kInputViews && kOutputOffsets) {
+    // FIXME(bkietz) this discards preallocated offset storage
+    typename TypeTraits<O>::BuilderType builder{ctx->memory_pool()};
 
-  // Copy buffers over, then generate indices
-  output->length = input.length;
-  output->SetNullCount(input.null_count);
-  if (input.offset == output->offset) {
-    output->buffers[0] = input.GetBuffer(0);
-  } else {
-    ARROW_ASSIGN_OR_RAISE(
-        output->buffers[0],
-        arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data,
-                                    input.offset, input.length));
-  }
+    RETURN_NOT_OK(builder.Reserve(input.length));
+    // TODO(bkietz) if ArraySpan::buffers were a SmallVector, we could have access to all
+    // the character data buffers here and reserve character data accordingly.
+
+    // sweep through L1-sized chunks to reduce the frequency of allocation
+    int64_t chunk_size = ctx->exec_context()->cpu_info()->CacheSize(
+                             ::arrow::internal::CpuInfo::CacheLevel::L1) /
+                         sizeof(StringHeader) / 4;
+
+    RETURN_NOT_OK(::arrow::internal::VisitSlices(
+        input, chunk_size, [&](const ArraySpan& input_slice) {
+          int64_t num_appended_chars = 0;
+          VisitArraySpanInline<I>(
+              input_slice,
+              [&](std::string_view v) {
+                num_appended_chars += static_cast<int64_t>(v.size());
+              },
+              [] {});
+          RETURN_NOT_OK(builder.ReserveData(num_appended_chars));
+
+          // Validate each value on its own while appending: checking the slice's
+          // concatenated bytes would accept a sequence split across two values.
+          Utf8Validator validator;
+          return VisitArraySpanInline<I>(
+              input_slice,
+              [&](std::string_view v) {
+                builder.UnsafeAppend(v);
+                return check_utf8 ? validator.VisitValue(v) : Status::OK();
+              },
+              [&] {
+                builder.UnsafeAppendNull();
+                return Status::OK();
+              });
+        }));
 
-  // This buffer is preallocated
-  output_offset_type* offsets = output->GetMutableValues<output_offset_type>(1);
-  offsets[0] = static_cast<output_offset_type>(input.offset * width);
-  for (int64_t i = 0; i < input.length; i++) {
-    offsets[i + 1] = offsets[i] + width;
+    return builder.FinishInternal(std::get_if<std::shared_ptr<ArrayData>>(&out->value));
   }
 
-  // Data buffer (index 1) for FWBinary becomes data buffer for VarBinary
-  // (index 2). After ARROW-16757, we need to copy this memory instead of
-  // zero-copy it because a Scalar value promoted to an ArraySpan may be
-  // referencing a temporary buffer whose scope does not extend beyond the
-  // kernel execution. In that scenario, the validity bitmap above can be
-  // zero-copied because it points to static memory (either a byte with a 1 or
-  // a 0 depending on whether the value is null or not).
-  std::shared_ptr<Buffer> input_data = input.GetBuffer(1);
-  if (input_data != nullptr) {
-    ARROW_ASSIGN_OR_RAISE(output->buffers[2], input_data->CopySlice(0, input_data->size(),
-                                                                    ctx->memory_pool()));
-  } else {
-    // TODO(wesm): it should already be nullptr, so we may be able to remove
-    // this
-    output->buffers[2] = nullptr;
+  if constexpr ((kInputOffsets || kInputFixed) && kOutputViews) {
+    // we can reuse the data buffer here and just add views which reference it
+    if (input.MayHaveNulls()) {
+      ARROW_ASSIGN_OR_RAISE(
+          output->buffers[0],
+          arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data,
+                                      input.offset, input.length));
+    }
+    // FIXME(bkietz) segfault due to null buffer owner
+    // output->buffers[2] = input.GetBuffer(kInputFixed ? 1 : 2);
+
+    auto* headers = output->buffers[1]->mutable_data_as<StringHeader>();
+    if (check_utf8) {
+      Utf8Validator validator;
+      return VisitArraySpanInline<I>(
+          input,
+          [&](std::string_view v) {
+            *headers++ = StringHeader{v};
+            return validator.VisitValue(v);
+          },
+          [&] {
+            *headers++ = StringHeader{};
+            return Status::OK();
+          });
+    } else {
+      VisitArraySpanInline<I>(
+          input, [&](std::string_view v) { *headers++ = StringHeader{v}; },
+          [&] { *headers++ = StringHeader{}; });
+      return Status::OK();
+    }
   }
 
-  return Status::OK();
-}
+  if constexpr (kInputFixed && kOutputOffsets) {
+    RETURN_NOT_OK(SimpleUtf8Validation());
 
-template <typename O, typename I>
-enable_if_t<std::is_same<I, FixedSizeBinaryType>::value &&
-                std::is_same<O, FixedSizeBinaryType>::value,
-            Status>
-BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
-  const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
-  const int32_t in_width = batch[0].type()->byte_width();
-  const int32_t out_width =
-      checked_cast<const FixedSizeBinaryType&>(*options.to_type).byte_width();
-  if (in_width != out_width) {
-    return Status::Invalid("Failed casting from ", batch[0].type()->ToString(), " to ",
-                           options.to_type.ToString(), ": widths must match");
+    using output_offset_type = typename O::offset_type;
+
+    int32_t width = input.type->byte_width();
+
+    if constexpr (std::is_same_v<output_offset_type, int32_t>) {
+      // Check for overflow
+      if (width * input.length > std::numeric_limits<int32_t>::max()) {
+        return Status::Invalid("Failed casting from ", input.type->ToString(), " to ",
+                               out->type()->ToString(), ": input array too large");
+      }
+    }
+
+    // Copy buffers over, then generate indices
+    output->length = input.length;
+    output->SetNullCount(input.null_count);
+    if (input.offset == output->offset) {
+      output->buffers[0] = input.GetBuffer(0);
+    } else {
+      ARROW_ASSIGN_OR_RAISE(
+          output->buffers[0],
+          arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data,
+                                      input.offset, input.length));
+    }
+
+    // This buffer is preallocated
+    auto* offsets = output->buffers[1]->mutable_data_as<output_offset_type>();
+    offsets[0] = static_cast<output_offset_type>(input.offset * width);
+    for (int64_t i = 0; i < input.length; i++) {
+      offsets[i + 1] = offsets[i] + width;
+    }
+
+    // Data buffer (index 1) for FWBinary becomes data buffer for VarBinary
+    // (index 2). After ARROW-16757, we need to copy this memory instead of
+    // zero-copy it because a Scalar value promoted to an ArraySpan may be
+    // referencing a temporary buffer whose scope does not extend beyond the
+    // kernel execution. In that scenario, the validity bitmap above can be
+    // zero-copied because it points to static memory (either a byte with a 1 or
+    // a 0 depending on whether the value is null or not).
+    if (std::shared_ptr<Buffer> input_data = input.GetBuffer(1)) {
+      ARROW_ASSIGN_OR_RAISE(
+          output->buffers[2],
+          input_data->CopySlice(0, input_data->size(), ctx->memory_pool()));
+    } else {
+      // TODO(wesm): it should already be nullptr, so we may be able to remove
+      // this
+      output->buffers[2] = nullptr;
+    }
+
+    return Status::OK();
   }
-  return ZeroCopyCastExec(ctx, batch, out);
+
+  if constexpr (kInputFixed && kOutputFixed) {
+    if (input.type->byte_width() != output->type->byte_width()) {
+      return Status::Invalid("Failed casting from ", input.type->ToString(), " to ",
+                             output->type->ToString(), ": widths must match");
+    }
+    return ZeroCopyCastExec(ctx, batch, out);
+  }
+
+  Unreachable();
 }
 
 #if defined(_MSC_VER)
@@ -447,6 +534,8 @@ template <typename OutType>
 void AddBinaryToBinaryCast(CastFunction* func) {
   AddBinaryToBinaryCast<OutType, StringType>(func);
   AddBinaryToBinaryCast<OutType, BinaryType>(func);
+  AddBinaryToBinaryCast<OutType, StringViewType>(func);
+  AddBinaryToBinaryCast<OutType, BinaryViewType>(func);
   AddBinaryToBinaryCast<OutType, LargeStringType>(func);
   AddBinaryToBinaryCast<OutType, LargeBinaryType>(func);
   AddBinaryToBinaryCast<OutType, FixedSizeBinaryType>(func);
@@ -459,6 +548,11 @@ std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() {
   AddCommonCasts(Type::BINARY, binary(), cast_binary.get());
   AddBinaryToBinaryCast<BinaryType>(cast_binary.get());
 
+  auto cast_binary_view =
+      std::make_shared<CastFunction>("cast_binary_view", Type::BINARY_VIEW);
+  AddCommonCasts(Type::BINARY_VIEW, binary_view(), cast_binary_view.get());
+  AddBinaryToBinaryCast<BinaryViewType>(cast_binary_view.get());
+
   auto cast_large_binary =
       std::make_shared<CastFunction>("cast_large_binary", Type::LARGE_BINARY);
   AddCommonCasts(Type::LARGE_BINARY, large_binary(), cast_large_binary.get());
@@ -471,6 +565,14 @@ std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() {
   AddTemporalToStringCasts<StringType>(cast_string.get());
   AddBinaryToBinaryCast<StringType>(cast_string.get());
 
+  auto cast_string_view =
+      std::make_shared<CastFunction>("cast_string_view", Type::STRING_VIEW);
+  AddCommonCasts(Type::STRING_VIEW, utf8_view(), cast_string_view.get());
+  AddNumberToStringCasts<StringViewType>(cast_string_view.get());
+  AddDecimalToStringCasts<StringViewType>(cast_string_view.get());
+  AddTemporalToStringCasts<StringViewType>(cast_string_view.get());
+  AddBinaryToBinaryCast<StringViewType>(cast_string_view.get());
+
   auto cast_large_string =
       std::make_shared<CastFunction>("cast_large_string", Type::LARGE_STRING);
   AddCommonCasts(Type::LARGE_STRING, large_utf8(), cast_large_string.get());
@@ -481,15 +583,16 @@ std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() {
 
   auto cast_fsb =
       std::make_shared<CastFunction>("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY);
-  AddCommonCasts(Type::FIXED_SIZE_BINARY, OutputType(ResolveOutputFromOptions),
-                 cast_fsb.get());
+  AddCommonCasts(Type::FIXED_SIZE_BINARY, kOutputTargetType, cast_fsb.get());
   DCHECK_OK(cast_fsb->AddKernel(
-      Type::FIXED_SIZE_BINARY, {InputType(Type::FIXED_SIZE_BINARY)},
-      OutputType(FirstType),
+      Type::FIXED_SIZE_BINARY, {InputType(Type::FIXED_SIZE_BINARY)}, kOutputTargetType,
       BinaryToBinaryCastExec<FixedSizeBinaryType, FixedSizeBinaryType>,
       NullHandling::COMPUTED_NO_PREALLOCATE));
 
-  return {cast_binary, cast_large_binary, cast_string, cast_large_string, cast_fsb};
+  return {
+      cast_binary,      cast_binary_view,  cast_large_binary, cast_string,
+      cast_string_view, cast_large_string, cast_fsb,
+  };
 }
 
 }  // namespace internal
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
index 85da81357b..9c10b85b3c 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
@@ -145,7 +145,7 @@ static std::shared_ptr<Array> MaskArrayWithNullsAt(std::shared_ptr<Array> input,
 
   using arrow::internal::Bitmap;
   Bitmap is_valid(masked->buffers[0], 0, input->length());
-  if (auto original = input->null_bitmap()) {
+  if (const auto& original = input->null_bitmap()) {
     is_valid.CopyFrom(Bitmap(original, input->offset(), input->length()));
   } else {
     is_valid.SetBitsTo(true);
@@ -154,7 +154,7 @@ static std::shared_ptr<Array> MaskArrayWithNullsAt(std::shared_ptr<Array> input,
   for (int i : indices_to_mask) {
     is_valid.SetBitTo(i, false);
   }
-  return MakeArray(masked);
+  return MakeArray(std::move(masked));
 }
 
 TEST(Cast, CanCast) {
@@ -167,6 +167,9 @@ TEST(Cast, CanCast) {
     }
   };
 
+  // Checked up front; do not return early, the expectations below must run.
+  ExpectCanCast(boolean(), {utf8()});
+
   auto ExpectCannotCast = [ExpectCanCast](std::shared_ptr<DataType> from,
                                           std::vector<std::shared_ptr<DataType>> to_set) {
     ExpectCanCast(from, to_set, /*expected=*/false);
@@ -198,17 +201,21 @@ TEST(Cast, CanCast) {
     ExpectCannotCast(from_numeric, {null()});
   }
 
-  for (auto from_base_binary : kBaseBinaryTypes) {
-    ExpectCanCast(from_base_binary, {boolean()});
-    ExpectCanCast(from_base_binary, kNumericTypes);
-    ExpectCanCast(from_base_binary, kBaseBinaryTypes);
-    ExpectCanCast(dictionary(int64(), from_base_binary), {from_base_binary});
+  auto base_binary_and_view_types = kBaseBinaryTypes;
+  base_binary_and_view_types.push_back(binary_view());
+  base_binary_and_view_types.push_back(utf8_view());
+
+  for (auto from : base_binary_and_view_types) {
+    ExpectCanCast(from, {boolean()});
+    ExpectCanCast(from, kNumericTypes);
+    ExpectCanCast(from, base_binary_and_view_types);
+    ExpectCanCast(dictionary(int64(), from), {from});
 
     // any cast which is valid for the dictionary is valid for the DictionaryArray
-    ExpectCanCast(dictionary(uint32(), from_base_binary), kBaseBinaryTypes);
-    ExpectCanCast(dictionary(int16(), from_base_binary), kNumericTypes);
+    ExpectCanCast(dictionary(uint32(), from), kBaseBinaryTypes);
+    ExpectCanCast(dictionary(int16(), from), kNumericTypes);
 
-    ExpectCannotCast(from_base_binary, {null()});
+    ExpectCannotCast(from, {null()});
   }
 
   ExpectCanCast(utf8(), {timestamp(TimeUnit::MILLI)});
@@ -1029,7 +1036,7 @@ TEST(Cast, DecimalToFloating) {
 }
 
 TEST(Cast, DecimalToString) {
-  for (auto string_type : {utf8(), large_utf8()}) {
+  for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
     for (auto decimal_type : {decimal128(5, 2), decimal256(5, 2)}) {
       CheckCast(ArrayFromJSON(decimal_type, R"(["0.00", null, "123.45", "999.99"])"),
                 ArrayFromJSON(string_type, R"(["0.00", null, "123.45", "999.99"])"));
@@ -1540,7 +1547,7 @@ TEST(Cast, TimeZeroCopy) {
 }
 
 TEST(Cast, DateToString) {
-  for (auto string_type : {utf8(), large_utf8()}) {
+  for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
     CheckCast(ArrayFromJSON(date32(), "[0, null]"),
               ArrayFromJSON(string_type, R"(["1970-01-01", null])"));
     CheckCast(ArrayFromJSON(date64(), "[86400000, null]"),
@@ -1549,7 +1556,7 @@ TEST(Cast, DateToString) {
 }
 
 TEST(Cast, TimeToString) {
-  for (auto string_type : {utf8(), large_utf8()}) {
+  for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
     CheckCast(ArrayFromJSON(time32(TimeUnit::SECOND), "[1, 62]"),
               ArrayFromJSON(string_type, R"(["00:00:01", "00:01:02"])"));
     CheckCast(
@@ -1559,7 +1566,7 @@ TEST(Cast, TimeToString) {
 }
 
 TEST(Cast, TimestampToString) {
-  for (auto string_type : {utf8(), large_utf8()}) {
+  for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
     CheckCast(
         ArrayFromJSON(timestamp(TimeUnit::SECOND), "[-30610224000, -5364662400]"),
         ArrayFromJSON(string_type, R"(["1000-01-01 00:00:00", "1800-01-01 00:00:00"])"));
@@ -1585,7 +1592,7 @@ TEST(Cast, TimestampToString) {
 }
 
 TEST_F(CastTimezone, TimestampWithZoneToString) {
-  for (auto string_type : {utf8(), large_utf8()}) {
+  for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
     CheckCast(
         ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), "[-30610224000, -5364662400]"),
         ArrayFromJSON(string_type,
@@ -1771,7 +1778,7 @@ TEST(Cast, DurationToDurationMultiplyOverflow) {
 }
 
 TEST(Cast, DurationToString) {
-  for (auto string_type : {utf8(), large_utf8()}) {
+  for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
     for (auto unit : TimeUnit::values()) {
       CheckCast(ArrayFromJSON(duration(unit), "[0, null, 1234567, 2000]"),
                 ArrayFromJSON(string_type, R"(["0", null, "1234567", "2000"])"));
@@ -2008,6 +2015,10 @@ TEST(Cast, StringToTimestamp) {
 }
 
 static void AssertBinaryZeroCopy(std::shared_ptr<Array> lhs, std::shared_ptr<Array> rhs) {
+  for (auto id : {lhs->type_id(), rhs->type_id()}) {
+    // views cannot be zero copied
+    if (id == Type::BINARY_VIEW || id == Type::STRING_VIEW) return;
+  }
   // null bitmap and data buffers are always zero-copied
   AssertBufferSame(*lhs, *rhs, 0);
   AssertBufferSame(*lhs, *rhs, 2);
@@ -2031,8 +2042,9 @@ static void AssertBinaryZeroCopy(std::shared_ptr<Array> lhs, std::shared_ptr<Arr
 }
 
 TEST(Cast, BinaryToString) {
-  for (auto bin_type : {binary(), large_binary()}) {
-    for (auto string_type : {utf8(), large_utf8()}) {
+  for (auto bin_type : {binary(), large_binary(), binary_view()}) {
+    for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
+      ARROW_SCOPED_TRACE(*bin_type, " to ", *string_type);
       // empty -> empty always works
       CheckCast(ArrayFromJSON(bin_type, "[]"), ArrayFromJSON(string_type, "[]"));
 
@@ -2050,13 +2062,14 @@ TEST(Cast, BinaryToString) {
       options.allow_invalid_utf8 = true;
       ASSERT_OK_AND_ASSIGN(auto strings, Cast(*invalid_utf8, string_type, options));
       ASSERT_RAISES(Invalid, strings->ValidateFull());
+
       AssertBinaryZeroCopy(invalid_utf8, strings);
     }
   }
 
   auto from_type = fixed_size_binary(3);
   auto invalid_utf8 = FixedSizeInvalidUtf8(from_type);
-  for (auto string_type : {utf8(), large_utf8()}) {
+  for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
     CheckCast(ArrayFromJSON(from_type, "[]"), ArrayFromJSON(string_type, "[]"));
 
     // invalid utf-8 masked by a null bit is not an error
@@ -2075,9 +2088,12 @@ TEST(Cast, BinaryToString) {
     // N.B. null buffer is not always the same if input sliced
     AssertBufferSame(*invalid_utf8, *strings, 0);
 
-    // ARROW-16757: we no longer zero copy, but the contents are equal
-    ASSERT_NE(invalid_utf8->data()->buffers[1].get(), strings->data()->buffers[2].get());
-    ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2]));
+    if (string_type->id() != Type::STRING_VIEW) {
+      // ARROW-16757: we no longer zero copy, but the contents are equal
+      ASSERT_NE(invalid_utf8->data()->buffers[1].get(),
+                strings->data()->buffers[2].get());
+      ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2]));
+    }
   }
 }
 
@@ -2146,7 +2162,7 @@ TEST(Cast, StringToString) {
 }
 
 TEST(Cast, IntToString) {
-  for (auto string_type : {utf8(), large_utf8()}) {
+  for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
     CheckCast(ArrayFromJSON(int8(), "[0, 1, 127, -128, null]"),
               ArrayFromJSON(string_type, R"(["0", "1", "127", "-128", null])"));
 
@@ -2178,7 +2194,7 @@ TEST(Cast, IntToString) {
 }
 
 TEST(Cast, FloatingToString) {
-  for (auto string_type : {utf8(), large_utf8()}) {
+  for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
     CheckCast(
         ArrayFromJSON(float32(), "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"),
         ArrayFromJSON(string_type, R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])"));
@@ -2190,7 +2206,7 @@ TEST(Cast, FloatingToString) {
 }
 
 TEST(Cast, BooleanToString) {
-  for (auto string_type : {utf8(), large_utf8()}) {
+  for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
     CheckCast(ArrayFromJSON(boolean(), "[true, true, false, null]"),
               ArrayFromJSON(string_type, R"(["true", "true", "false", null])"));
   }


[arrow] 03/15: enable JSON converter for StringView/BinaryView

Posted by bk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 84666ed5378ad5312fcb887a38cda66110ea2070
Author: Tobias Zagorni <to...@zagorni.eu>
AuthorDate: Tue Oct 18 17:16:57 2022 +0200

    enable JSON converter for StringView/BinaryView
---
 cpp/src/arrow/ipc/json_simple.cc | 4 ++++
 cpp/src/arrow/json/converter.cc  | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc
index eea0c97302..4d2d803f3f 100644
--- a/cpp/src/arrow/ipc/json_simple.cc
+++ b/cpp/src/arrow/ipc/json_simple.cc
@@ -847,6 +847,8 @@ Status GetDictConverter(const std::shared_ptr<DataType>& type,
     PARAM_CONVERTER_CASE(Type::BINARY, StringConverter, BinaryType)
     PARAM_CONVERTER_CASE(Type::LARGE_STRING, StringConverter, LargeStringType)
     PARAM_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter, LargeBinaryType)
+    PARAM_CONVERTER_CASE(Type::STRING_VIEW, StringConverter, StringViewType)
+    PARAM_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter, BinaryViewType)
     SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter,
                           FixedSizeBinaryType)
     SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter, Decimal128Type)
@@ -905,6 +907,8 @@ Status GetConverter(const std::shared_ptr<DataType>& type,
     SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter<BinaryType>)
     SIMPLE_CONVERTER_CASE(Type::LARGE_STRING, StringConverter<LargeStringType>)
     SIMPLE_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter<LargeBinaryType>)
+    SIMPLE_CONVERTER_CASE(Type::STRING_VIEW, StringConverter<StringViewType>)
+    SIMPLE_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter<BinaryViewType>)
     SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter<>)
     SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter<>)
     SIMPLE_CONVERTER_CASE(Type::DECIMAL256, Decimal256Converter<>)
diff --git a/cpp/src/arrow/json/converter.cc b/cpp/src/arrow/json/converter.cc
index ec9713e41f..62d8efaf6e 100644
--- a/cpp/src/arrow/json/converter.cc
+++ b/cpp/src/arrow/json/converter.cc
@@ -305,6 +305,8 @@ Status MakeConverter(const std::shared_ptr<DataType>& out_type, MemoryPool* pool
     CONVERTER_CASE(Type::STRING, BinaryConverter<StringType>);
     CONVERTER_CASE(Type::LARGE_BINARY, BinaryConverter<LargeBinaryType>);
     CONVERTER_CASE(Type::LARGE_STRING, BinaryConverter<LargeStringType>);
+    CONVERTER_CASE(Type::BINARY_VIEW, BinaryConverter<BinaryViewType>);
+    CONVERTER_CASE(Type::STRING_VIEW, BinaryConverter<StringViewType>);
     CONVERTER_CASE(Type::DECIMAL128, DecimalConverter<Decimal128Type>);
     CONVERTER_CASE(Type::DECIMAL256, DecimalConverter<Decimal256Type>);
     default:


[arrow] 06/15: fix formatting

Posted by bk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 864a74c81f187c597ad9517030b6187769b397f3
Author: Tobias Zagorni <to...@zagorni.eu>
AuthorDate: Tue Oct 18 17:24:45 2022 +0200

    fix formatting
---
 cpp/src/arrow/visit_data_inline.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/visit_data_inline.h b/cpp/src/arrow/visit_data_inline.h
index b6996d188c..3058b8501b 100644
--- a/cpp/src/arrow/visit_data_inline.h
+++ b/cpp/src/arrow/visit_data_inline.h
@@ -176,12 +176,10 @@ struct ArraySpanInlineVisitor<T, enable_if_binary_view_like<T>> {
     }
     return VisitBitBlocks(
         arr.buffers[0].data, arr.offset, arr.length,
-        [&](int64_t (index)) {
+        [&](int64_t(index)) {
           return valid_func(static_cast<std::string_view>(headers[index]));
         },
-        [&]() {
-          return null_func();
-        });
+        [&]() { return null_func(); });
   }
 
   template <typename ValidFunc, typename NullFunc>
@@ -201,7 +199,7 @@ struct ArraySpanInlineVisitor<T, enable_if_binary_view_like<T>> {
 
     VisitBitBlocksVoid(
         arr.buffers[0].data, arr.offset, arr.length,
-        [&](int64_t (index)) {
+        [&](int64_t(index)) {
           valid_func(static_cast<std::string_view>(headers[index]));
         },
         std::forward<NullFunc>(null_func));


[arrow] 12/15: Adding comparison and concatenation

Posted by bk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 04893f65e92f57bba7b8ff0bbc201dfd17ff3aa0
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Fri Nov 18 17:04:47 2022 -0500

    Adding comparison and concatenation
---
 cpp/src/arrow/array/builder_binary.h    | 12 ++++++++----
 cpp/src/arrow/array/concatenate.cc      | 24 +++++++++++++++++++++++-
 cpp/src/arrow/array/concatenate_test.cc |  8 ++++++++
 cpp/src/arrow/compare.cc                |  8 +++++++-
 cpp/src/arrow/testing/random.cc         | 14 +++++++++++---
 cpp/src/arrow/testing/random.h          | 16 ++++++++++++++++
 6 files changed, 73 insertions(+), 9 deletions(-)

diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index 30ab4b9d4a..ccfcb8b2b2 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -542,9 +542,16 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
  public:
   using TypeClass = BinaryViewType;
 
-  BinaryViewBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+  // this constructor provided for MakeBuilder compatibility
+  BinaryViewBuilder(const std::shared_ptr<DataType>&, MemoryPool* pool)
       : BinaryViewBuilder(pool) {}
 
+  explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(),
+                             int64_t alignment = kDefaultBufferAlignment)
+      : ArrayBuilder(pool, alignment),
+        data_builder_(pool, alignment),
+        data_heap_builder_(pool) {}
+
   int64_t current_block_bytes_remaining() const {
     return data_heap_builder_.current_remaining_bytes();
   }
@@ -683,9 +690,6 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
   std::shared_ptr<DataType> type() const override { return binary_view(); }
 
  protected:
-  explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool())
-      : ArrayBuilder(pool), data_builder_(pool), data_heap_builder_(pool) {}
-
   static constexpr int64_t ValueSizeLimit() {
     return std::numeric_limits<uint32_t>::max();
   }
diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc
index 3dd0ccea93..6f7d61283e 100644
--- a/cpp/src/arrow/array/concatenate.cc
+++ b/cpp/src/arrow/array/concatenate.cc
@@ -228,7 +228,29 @@ class ConcatenateImpl {
   }
 
   Status Visit(const BinaryViewType&) {
-    return Status::NotImplemented("binary / string view");
+    bool any_opted_out_of_view_validation = false;
+    out_->buffers.resize(2);
+
+    for (const auto& in_data : in_) {
+      auto begin = in_data->buffers.begin() + 2;
+      auto end = in_data->buffers.end();
+
+      if (BinaryViewArray::OptedOutOfViewValidation(*in_data)) {
+        any_opted_out_of_view_validation = true;
+        --end;
+      }
+
+      for (auto it = begin; it != end; ++it) {
+        out_->buffers.push_back(*it);
+      }
+    }
+
+    if (any_opted_out_of_view_validation) {
+      out_->buffers = BinaryViewArray::DoNotValidateViews(std::move(out_->buffers));
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto header_buffers, Buffers(1, sizeof(StringHeader)));
+    return ConcatenateBuffers(header_buffers, pool_).Value(&out_->buffers[1]);
   }
 
   Status Visit(const ListType&) {
diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc
index bff5d7eec1..1bc0c65bec 100644
--- a/cpp/src/arrow/array/concatenate_test.cc
+++ b/cpp/src/arrow/array/concatenate_test.cc
@@ -91,6 +91,7 @@ class ConcatenateTest : public ::testing::Test {
       for (auto null_probability : this->null_probabilities_) {
         std::shared_ptr<Array> array;
         factory(size, null_probability, &array);
+          ASSERT_OK(array->ValidateFull());
         auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front());
         auto slices = this->Slices(array, offsets);
         ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices));
@@ -154,6 +155,13 @@ TEST_F(ConcatenateTest, StringType) {
   });
 }
 
+TEST_F(ConcatenateTest, StringViewType) {
+  Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
+    *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, null_probability);
+    ASSERT_OK((**out).ValidateFull());
+  });
+}
+
 TEST_F(ConcatenateTest, LargeStringType) {
   Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
     *out =
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 68250f0288..5d1c3294c0 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -261,7 +261,13 @@ class RangeDataEqualsImpl {
 
   // Also matches StringViewType
   Status Visit(const BinaryViewType& type) {
-    return Status::NotImplemented("Binary / string view");
+    auto* left_values = left_.GetValues<StringHeader>(1) + left_start_idx_;
+    auto* right_values = right_.GetValues<StringHeader>(1) + right_start_idx_;
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      return std::equal(left_values + i, left_values + i + length,
+                        right_values + i, right_values + i + length);
+    });
+    return Status::OK();
   }
 
   // Also matches LargeStringType
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index 3213273474..e45e296ff6 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -362,13 +362,12 @@ std::shared_ptr<Array> RandomArrayGenerator::Decimal256(std::shared_ptr<DataType
   return gen.MakeRandomArray(size, null_probability, alignment, memory_pool);
 }
 
-template <typename TypeClass>
+template <typename TypeClass, typename offset_type = typename TypeClass::offset_type>
 static std::shared_ptr<Array> GenerateBinaryArray(RandomArrayGenerator* gen, int64_t size,
                                                   int32_t min_length, int32_t max_length,
                                                   double null_probability,
                                                   int64_t alignment,
                                                   MemoryPool* memory_pool) {
-  using offset_type = typename TypeClass::offset_type;
   using BuilderType = typename TypeTraits<TypeClass>::BuilderType;
   using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
   using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
@@ -386,7 +385,7 @@ static std::shared_ptr<Array> GenerateBinaryArray(RandomArrayGenerator* gen, int
                  /*null_probability=*/0);
 
   std::vector<uint8_t> str_buffer(max_length);
-  BuilderType builder(memory_pool, alignment);
+  BuilderType builder{memory_pool, alignment};
 
   for (int64_t i = 0; i < size; ++i) {
     if (lengths->IsValid(i)) {
@@ -429,6 +428,15 @@ std::shared_ptr<Array> RandomArrayGenerator::BinaryWithRepeats(
   return *strings->View(binary());
 }
 
+std::shared_ptr<Array> RandomArrayGenerator::StringView(int64_t size, int32_t min_length,
+                                                        int32_t max_length,
+                                                        double null_probability, 
+                                                        int64_t alignment,
+                                                        MemoryPool* memory_pool) {
+  return GenerateBinaryArray<StringViewType, uint32_t>(this, size, min_length, max_length,
+                                                       null_probability, alignment, memory_pool);
+}
+
 std::shared_ptr<Array> RandomArrayGenerator::StringWithRepeats(
     int64_t size, int64_t unique, int32_t min_length, int32_t max_length,
     double null_probability, int64_t alignment, MemoryPool* memory_pool) {
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
index b2e3a609a2..5b905896f2 100644
--- a/cpp/src/arrow/testing/random.h
+++ b/cpp/src/arrow/testing/random.h
@@ -367,6 +367,22 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
                                 int64_t alignment = kDefaultBufferAlignment,
                                 MemoryPool* memory_pool = default_memory_pool());
 
+  /// \brief Generate a random StringViewArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min_length the lower bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] max_length the upper bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] null_probability the probability of a value being null
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> StringView(int64_t size, int32_t min_length, int32_t max_length,
+                                    double null_probability = 0,
+                                    int64_t alignment = kDefaultBufferAlignment,
+                                    MemoryPool* memory_pool = default_memory_pool());
+
   /// \brief Generate a random LargeStringArray
   ///
   /// \param[in] size the size of the array to generate


[arrow] 08/15: run binary data visitor tests on StringView/BinaryView

Posted by bk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 3d6a30a09da0169369a67ed7a0d4afa478348292
Author: Tobias Zagorni <to...@zagorni.eu>
AuthorDate: Tue Oct 18 17:29:16 2022 +0200

    run binary data visitor tests on StringView/BinaryView
---
 cpp/src/arrow/array/array_binary_test.cc | 13 ++++++++++---
 cpp/src/arrow/testing/gtest_util.h       |  3 +++
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc
index 3bc9bb91a0..c9f1b1cfab 100644
--- a/cpp/src/arrow/array/array_binary_test.cc
+++ b/cpp/src/arrow/array/array_binary_test.cc
@@ -883,11 +883,15 @@ class TestBaseBinaryDataVisitor : public ::testing::Test {
   void SetUp() override { type_ = TypeTraits<TypeClass>::type_singleton(); }
 
   void TestBasics() {
-    auto array = ArrayFromJSON(type_, R"(["foo", null, "bar"])");
+    auto array = ArrayFromJSON(
+        type_,
+        R"(["foo", null, "bar", "inline_me", "allocate_me_aaaaa", "allocate_me_bbbb"])");
     BinaryAppender appender;
     ArraySpanVisitor<TypeClass> visitor;
     ASSERT_OK(visitor.Visit(*array->data(), &appender));
-    ASSERT_THAT(appender.data, ::testing::ElementsAreArray({"foo", "(null)", "bar"}));
+    ASSERT_THAT(appender.data,
+                ::testing::ElementsAreArray({"foo", "(null)", "bar", "inline_me",
+                                             "allocate_me_aaaaa", "allocate_me_bbbb"}));
     ARROW_UNUSED(visitor);  // Workaround weird MSVC warning
   }
 
@@ -904,7 +908,10 @@ class TestBaseBinaryDataVisitor : public ::testing::Test {
   std::shared_ptr<DataType> type_;
 };
 
-TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryArrowTypes);
+using BinaryAndBin = ::testing::Types<BinaryType, LargeBinaryType, StringType,
+                                      LargeStringType, BinaryViewType, StringViewType>;
+
+TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryOrBinaryViewLikeArrowTypes);
 
 TYPED_TEST(TestBaseBinaryDataVisitor, Basics) { this->TestBasics(); }
 
diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h
index 2708056295..fc319a6d10 100644
--- a/cpp/src/arrow/testing/gtest_util.h
+++ b/cpp/src/arrow/testing/gtest_util.h
@@ -176,6 +176,9 @@ using DecimalArrowTypes = ::testing::Types<Decimal128Type, Decimal256Type>;
 using BaseBinaryArrowTypes =
     ::testing::Types<BinaryType, LargeBinaryType, StringType, LargeStringType>;
 
+using BaseBinaryOrBinaryViewLikeArrowTypes =
+    ::testing::Types<BinaryType, LargeBinaryType, StringType, LargeStringType>;
+
 using BinaryArrowTypes = ::testing::Types<BinaryType, LargeBinaryType>;
 
 using StringArrowTypes = ::testing::Types<StringType, LargeStringType>;


[arrow] 11/15: Added validation for StringView arrays

Posted by bk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 7474342cf2214d88778dc33526013ec82537636a
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Fri Nov 18 13:07:57 2022 -0500

    Added validation for StringView arrays
---
 cpp/src/arrow/array/array_base.cc                  |   4 +-
 cpp/src/arrow/array/array_binary.h                 |  38 +++++-
 cpp/src/arrow/array/array_binary_test.cc           |  67 +++++++---
 cpp/src/arrow/array/array_test.cc                  |   4 +-
 cpp/src/arrow/array/builder_base.cc                |  17 ++-
 cpp/src/arrow/array/builder_binary.h               |   4 +-
 cpp/src/arrow/array/util.cc                        |  28 +++-
 cpp/src/arrow/array/validate.cc                    | 147 +++++++++++++++++++--
 cpp/src/arrow/compare.cc                           |   8 +-
 .../arrow/compute/kernels/scalar_nested_test.cc    |   3 +
 .../arrow/compute/kernels/scalar_string_test.cc    |  10 +-
 cpp/src/arrow/compute/kernels/vector_hash.cc       |  94 ++++---------
 cpp/src/arrow/scalar.cc                            |  20 +--
 cpp/src/arrow/scalar.h                             |  18 ++-
 cpp/src/arrow/testing/gtest_util.h                 |   6 +-
 cpp/src/arrow/type.h                               |  11 +-
 16 files changed, 331 insertions(+), 148 deletions(-)

diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc
index de9ab2e985..f4f860ca95 100644
--- a/cpp/src/arrow/array/array_base.cc
+++ b/cpp/src/arrow/array/array_base.cc
@@ -83,7 +83,9 @@ struct ScalarFromArraySlotImpl {
   }
 
   Status Visit(const BinaryViewArray& a) {
-    return Status::NotImplemented("ScalarFromArraySlot -> BinaryView");
+    StringHeader header = a.Value(index_);
+    std::string_view view{header};
+    return Finish(std::string{view});
   }
 
   Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); }
diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h
index 03ee77fab8..1c8947dde3 100644
--- a/cpp/src/arrow/array/array_binary.h
+++ b/cpp/src/arrow/array/array_binary.h
@@ -230,16 +230,37 @@ class ARROW_EXPORT BinaryViewArray : public PrimitiveArray {
 
   explicit BinaryViewArray(const std::shared_ptr<ArrayData>& data);
 
-  BinaryViewArray(int64_t length, const std::shared_ptr<Buffer>& data,
-                  const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+  /// By default, ValidateFull() will check each view in a BinaryViewArray or
+  /// StringViewArray to ensure it references a memory range owned by one of the array's
+  /// buffers.
+  ///
+  /// If the last character buffer is null, ValidateFull will skip this step. Use this
+  /// for arrays which view memory elsewhere.
+  static BufferVector DoNotValidateViews(BufferVector char_buffers) {
+    char_buffers.push_back(NULLPTR);
+    return char_buffers;
+  }
+
+  static bool OptedOutOfViewValidation(const ArrayData& data) {
+    return data.buffers.back() == NULLPTR;
+  }
+  bool OptedOutOfViewValidation() const { return OptedOutOfViewValidation(*data_); }
+
+  BinaryViewArray(int64_t length, std::shared_ptr<Buffer> data, BufferVector char_buffers,
+                  std::shared_ptr<Buffer> null_bitmap = NULLPTR,
                   int64_t null_count = kUnknownNullCount, int64_t offset = 0)
-      : PrimitiveArray(binary_view(), length, data, null_bitmap, null_count, offset) {}
+      : PrimitiveArray(binary_view(), length, std::move(data), std::move(null_bitmap),
+                       null_count, offset) {
+    for (auto& char_buffer : char_buffers) {
+      data_->buffers.push_back(std::move(char_buffer));
+    }
+  }
 
   const StringHeader* raw_values() const {
     return reinterpret_cast<const StringHeader*>(raw_values_) + data_->offset;
   }
 
-  StringHeader Value(int64_t i) const { return raw_values()[i]; }
+  const StringHeader& Value(int64_t i) const { return raw_values()[i]; }
 
   // For API compatibility with BinaryArray etc.
   std::string_view GetView(int64_t i) const { return std::string_view(Value(i)); }
@@ -264,10 +285,13 @@ class ARROW_EXPORT StringViewArray : public BinaryViewArray {
 
   explicit StringViewArray(const std::shared_ptr<ArrayData>& data);
 
-  StringViewArray(int64_t length, const std::shared_ptr<Buffer>& data,
-                  const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+  StringViewArray(int64_t length, std::shared_ptr<Buffer> data, BufferVector char_buffers,
+                  std::shared_ptr<Buffer> null_bitmap = NULLPTR,
                   int64_t null_count = kUnknownNullCount, int64_t offset = 0)
-      : BinaryViewArray(utf8_view(), length, data, null_bitmap, null_count, offset) {}
+      : BinaryViewArray(length, std::move(data), std::move(char_buffers),
+                        std::move(null_bitmap), null_count, offset) {
+    data_->type = utf8_view();
+  }
 
   /// \brief Validate that this array contains only valid UTF8 entries
   ///
diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc
index c9f1b1cfab..92fc16f775 100644
--- a/cpp/src/arrow/array/array_binary_test.cc
+++ b/cpp/src/arrow/array/array_binary_test.cc
@@ -32,6 +32,7 @@
 #include "arrow/status.h"
 #include "arrow/testing/builder.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/testing/matchers.h"
 #include "arrow/testing/util.h"
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
@@ -365,38 +366,73 @@ TYPED_TEST(TestStringArray, TestValidateOffsets) { this->TestValidateOffsets();
 
 TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); }
 
+TEST(StringViewArray, Validate) {
+  auto MakeArray = [](std::vector<StringHeader> headers, BufferVector char_buffers) {
+    auto length = static_cast<int64_t>(headers.size());
+    return StringViewArray(length, Buffer::Wrap(std::move(headers)),
+                           std::move(char_buffers));
+  };
+
+  // empty array is valid
+  EXPECT_THAT(MakeArray({}, {}).ValidateFull(), Ok());
+
+  // inline views need not have a corresponding buffer
+  EXPECT_THAT(MakeArray({"hello", "world", "inline me"}, {}).ValidateFull(), Ok());
+
+  auto buffer_s = Buffer::FromString("supercalifragilistic(sp?)");
+  auto buffer_y = Buffer::FromString("yyyyyyyyyyyyyyyyyyyyyyyyy");
+
+  // non-inline views are expected to reside in a buffer managed by the array
+  EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}),
+                         StringHeader(std::string_view{*buffer_y})},
+                        {buffer_s, buffer_y})
+                  .ValidateFull(),
+              Ok());
+
+  EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}),
+                         // if a view points outside the buffers, that is invalid
+                         StringHeader("from a galaxy far, far away"),
+                         StringHeader(std::string_view{*buffer_y})},
+                        {buffer_s, buffer_y})
+                  .ValidateFull(),
+              Raises(StatusCode::Invalid));
+
+  // ... unless specifically overridden
+  EXPECT_THAT(
+      MakeArray({"from a galaxy far, far away"}, StringViewArray::DoNotValidateViews({}))
+          .ValidateFull(),
+      Ok());
+}
+
 template <typename T>
 class TestUTF8Array : public ::testing::Test {
  public:
   using TypeClass = T;
-  using offset_type = typename TypeClass::offset_type;
   using ArrayType = typename TypeTraits<TypeClass>::ArrayType;
 
-  Status ValidateUTF8(int64_t length, std::vector<offset_type> offsets,
-                      std::string_view data, int64_t offset = 0) {
-    ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared<Buffer>(data),
-                  /*null_bitmap=*/nullptr, /*null_count=*/0, offset);
-    return arr.ValidateUTF8();
+  Status ValidateUTF8(const Array& arr) {
+    return checked_cast<const ArrayType&>(arr).ValidateUTF8();
   }
 
-  Status ValidateUTF8(const std::string& json) {
-    auto ty = TypeTraits<T>::type_singleton();
-    auto arr = ArrayFromJSON(ty, json);
-    return checked_cast<const ArrayType&>(*arr).ValidateUTF8();
+  Status ValidateUTF8(std::vector<std::string> values) {
+    std::shared_ptr<Array> arr;
+    ArrayFromVector<T, std::string>(values, &arr);
+    return ValidateUTF8(*arr);
   }
 
   void TestValidateUTF8() {
-    ASSERT_OK(ValidateUTF8(R"(["Voix", "ambiguë", "d’un", "cœur"])"));
-    ASSERT_OK(ValidateUTF8(1, {0, 4}, "\xf4\x8f\xbf\xbf"));  // \U0010ffff
+    ASSERT_OK(ValidateUTF8(*ArrayFromJSON(TypeTraits<T>::type_singleton(),
+                                          R"(["Voix", "ambiguë", "d’un", "cœur"])")));
+    ASSERT_OK(ValidateUTF8({"\xf4\x8f\xbf\xbf"}));  // \U0010ffff
 
-    ASSERT_RAISES(Invalid, ValidateUTF8(1, {0, 1}, "\xf4"));
+    ASSERT_RAISES(Invalid, ValidateUTF8({"\xf4"}));
 
     // More tests in TestValidateData() above
     // (ValidateFull() calls ValidateUTF8() internally)
   }
 };
 
-TYPED_TEST_SUITE(TestUTF8Array, StringArrowTypes);
+TYPED_TEST_SUITE(TestUTF8Array, StringOrStringViewArrowTypes);
 
 TYPED_TEST(TestUTF8Array, TestValidateUTF8) { this->TestValidateUTF8(); }
 
@@ -908,9 +944,6 @@ class TestBaseBinaryDataVisitor : public ::testing::Test {
   std::shared_ptr<DataType> type_;
 };
 
-using BinaryAndBin = ::testing::Types<BinaryType, LargeBinaryType, StringType,
-                                      LargeStringType, BinaryViewType, StringViewType>;
-
 TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryOrBinaryViewLikeArrowTypes);
 
 TYPED_TEST(TestBaseBinaryDataVisitor, Basics) { this->TestBasics(); }
diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index d4ad1578b7..c14d4f21ac 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -544,12 +544,14 @@ static ScalarVector GetScalars() {
       std::make_shared<DurationScalar>(60, duration(TimeUnit::SECOND)),
       std::make_shared<BinaryScalar>(hello),
       std::make_shared<LargeBinaryScalar>(hello),
+      std::make_shared<BinaryViewScalar>(hello),
       std::make_shared<FixedSizeBinaryScalar>(
           hello, fixed_size_binary(static_cast<int32_t>(hello->size()))),
       std::make_shared<Decimal128Scalar>(Decimal128(10), decimal(16, 4)),
       std::make_shared<Decimal256Scalar>(Decimal256(10), decimal(76, 38)),
       std::make_shared<StringScalar>(hello),
       std::make_shared<LargeStringScalar>(hello),
+      std::make_shared<StringViewScalar>(hello),
       std::make_shared<ListScalar>(ArrayFromJSON(int8(), "[1, 2, 3]")),
       ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"),
       std::make_shared<LargeListScalar>(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")),
@@ -594,7 +596,7 @@ TEST_F(TestArray, TestMakeArrayFromScalar) {
       ASSERT_EQ(array->null_count(), 0);
 
       // test case for ARROW-13321
-      for (int64_t i : std::vector<int64_t>{0, length / 2, length - 1}) {
+      for (int64_t i : {int64_t{0}, length / 2, length - 1}) {
         ASSERT_OK_AND_ASSIGN(auto s, array->GetScalar(i));
         AssertScalarsEqual(*s, *scalar, /*verbose=*/true);
       }
diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc
index e9d5fb44ac..3b2ee570f9 100644
--- a/cpp/src/arrow/array/builder_base.cc
+++ b/cpp/src/arrow/array/builder_base.cc
@@ -103,10 +103,7 @@ namespace {
 
 struct AppendScalarImpl {
   template <typename T>
-  enable_if_t<has_c_type<T>::value || is_decimal_type<T>::value ||
-                  is_fixed_size_binary_type<T>::value,
-              Status>
-  Visit(const T&) {
+  Status HandleFixedWidth(const T&) {
     auto builder = checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
     RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_)));
 
@@ -125,7 +122,17 @@ struct AppendScalarImpl {
   }
 
   template <typename T>
-  enable_if_base_binary<T, Status> Visit(const T&) {
+  enable_if_t<has_c_type<T>::value, Status> Visit(const T& t) {
+    return HandleFixedWidth(t);
+  }
+
+  Status Visit(const FixedSizeBinaryType& t) { return HandleFixedWidth(t); }
+  Status Visit(const Decimal128Type& t) { return HandleFixedWidth(t); }
+  Status Visit(const Decimal256Type& t) { return HandleFixedWidth(t); }
+
+  template <typename T>
+  enable_if_t<is_binary_like_type<T>::value || is_string_like_type<T>::value, Status>
+  Visit(const T&) {
     int64_t data_size = 0;
     for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
          raw++) {
diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index b9d926cb16..30ab4b9d4a 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -576,7 +576,6 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
   Status Append(StringHeader value) {
     ARROW_RETURN_NOT_OK(Reserve(1));
     UnsafeAppend(value);
-    UnsafeAppendToBitmap(true);
     return Status::OK();
   }
 
@@ -591,7 +590,6 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
       value = data_heap_builder_.UnsafeAppend(value, length);
     }
     UnsafeAppend(StringHeader(value, length));
-    UnsafeAppendToBitmap(true);
   }
 
   void UnsafeAppend(const char* value, int64_t length) {
@@ -653,7 +651,7 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
   }
 
   void UnsafeAppendEmptyValue() {
-    data_builder_.UnsafeAppend(StringHeader(""));
+    data_builder_.UnsafeAppend(StringHeader());
     UnsafeAppendToBitmap(true);
   }
 
diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc
index ac9d76d469..fe5a0dd575 100644
--- a/cpp/src/arrow/array/util.cc
+++ b/cpp/src/arrow/array/util.cc
@@ -355,6 +355,10 @@ class NullArrayFactory {
       return MaxOf(sizeof(typename T::offset_type) * (length_ + 1));
     }
 
+    Status Visit(const BinaryViewType& type) {
+      return MaxOf(sizeof(StringHeader) * length_);
+    }
+
     Status Visit(const FixedSizeListType& type) {
       return MaxOf(GetBufferLength(type.value_type(), type.list_size() * length_));
     }
@@ -463,6 +467,11 @@ class NullArrayFactory {
     return Status::OK();
   }
 
+  Status Visit(const BinaryViewType&) {
+    out_->buffers.resize(2, buffer_);
+    return Status::OK();
+  }
+
   template <typename T>
   enable_if_var_size_list<T, Status> Visit(const T& type) {
     out_->buffers.resize(2, buffer_);
@@ -599,14 +608,27 @@ class RepeatedArrayFactory {
     RETURN_NOT_OK(CreateBufferOf(value->data(), value->size(), &values_buffer));
     auto size = static_cast<typename T::offset_type>(value->size());
     RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer));
-    out_ = std::make_shared<typename TypeTraits<T>::ArrayType>(length_, offsets_buffer,
-                                                               values_buffer);
+    out_ = std::make_shared<typename TypeTraits<T>::ArrayType>(
+        length_, std::move(offsets_buffer), std::move(values_buffer));
     return Status::OK();
   }
 
   template <typename T>
   enable_if_binary_view_like<T, Status> Visit(const T&) {
-    return Status::NotImplemented("binary / string view");
+    const std::shared_ptr<Buffer>& value =
+        checked_cast<const typename TypeTraits<T>::ScalarType&>(scalar_).value;
+
+    StringHeader header{std::string_view{*value}};
+    std::shared_ptr<Buffer> header_buffer;
+    RETURN_NOT_OK(CreateBufferOf(&header, sizeof(header), &header_buffer));
+
+    BufferVector char_buffers;
+    if (!header.IsInline()) {
+      char_buffers.push_back(value);
+    }
+    out_ = std::make_shared<typename TypeTraits<T>::ArrayType>(
+        length_, std::move(header_buffer), std::move(char_buffers));
+    return Status::OK();
   }
 
   template <typename T>
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index cddb086005..53d74ba148 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -30,6 +30,7 @@
 #include "arrow/util/decimal.h"
 #include "arrow/util/int_util_overflow.h"
 #include "arrow/util/logging.h"
+#include "arrow/util/unreachable.h"
 #include "arrow/util/utf8.h"
 #include "arrow/visit_data_inline.h"
 #include "arrow/visit_type_inline.h"
@@ -42,10 +43,7 @@ namespace {
 struct UTF8DataValidator {
   const ArrayData& data;
 
-  Status Visit(const DataType&) {
-    // Default, should be unreachable
-    return Status::NotImplemented("");
-  }
+  Status Visit(const DataType&) { Unreachable("utf-8 validation of non string type"); }
 
   Status Visit(const StringViewType&) {
     util::InitializeUTF8();
@@ -86,10 +84,7 @@ struct BoundsChecker {
   int64_t min_value;
   int64_t max_value;
 
-  Status Visit(const DataType&) {
-    // Default, should be unreachable
-    return Status::NotImplemented("");
-  }
+  Status Visit(const DataType&) { Unreachable("bounds checking of non integer type"); }
 
   template <typename IntegerType>
   enable_if_integer<IntegerType, Status> Visit(const IntegerType&) {
@@ -260,9 +255,7 @@ struct ValidateArrayImpl {
 
   Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); }
 
-  Status Visit(const BinaryViewType& type) {
-    return Status::NotImplemented("binary / string view");
-  }
+  Status Visit(const BinaryViewType& type) { return ValidateBinaryView(type); }
 
   Status Visit(const ListType& type) { return ValidateListLike(type); }
 
@@ -455,7 +448,14 @@ struct ValidateArrayImpl {
       return Status::Invalid("Array length is negative");
     }
 
-    if (data.buffers.size() != layout.buffers.size()) {
+    if (layout.variadic_spec) {
+      if (data.buffers.size() < layout.buffers.size()) {
+        return Status::Invalid("Expected at least ", layout.buffers.size(),
+                               " buffers in array "
+                               "of type ",
+                               type.ToString(), ", got ", data.buffers.size());
+      }
+    } else if (data.buffers.size() != layout.buffers.size()) {
       return Status::Invalid("Expected ", layout.buffers.size(),
                              " buffers in array "
                              "of type ",
@@ -471,7 +471,9 @@ struct ValidateArrayImpl {
 
     for (int i = 0; i < static_cast<int>(data.buffers.size()); ++i) {
       const auto& buffer = data.buffers[i];
-      const auto& spec = layout.buffers[i];
+      const auto& spec = i < static_cast<int>(layout.buffers.size())
+                             ? layout.buffers[i]
+                             : *layout.variadic_spec;
 
       if (buffer == nullptr) {
         continue;
@@ -594,6 +596,125 @@ struct ValidateArrayImpl {
     return Status::OK();
   }
 
+  Status ValidateBinaryView(const BinaryViewType& type) {
+    int64_t headers_byte_size = data.buffers[1]->size();
+    int64_t required_headers = data.length + data.offset;
+    if (static_cast<int64_t>(headers_byte_size / sizeof(StringHeader)) <
+        required_headers) {
+      return Status::Invalid("Header buffer size (bytes): ", headers_byte_size,
+                             " isn't large enough for length: ", data.length,
+                             " and offset: ", data.offset);
+    }
+
+    if (!full_validation || BinaryViewArray::OptedOutOfViewValidation(data)) {
+      return Status::OK();
+    }
+
+    auto* headers = data.GetValues<StringHeader>(1);
+    std::string_view buffer_containing_previous_view;
+
+    auto IsSubrangeOf = [](std::string_view super, std::string_view sub) {
+      return super.data() <= sub.data() &&
+             super.data() + super.size() <= sub.data() + sub.size();
+    };
+
+    std::vector<std::string_view> buffers;
+    for (auto it = data.buffers.begin() + 2; it != data.buffers.end(); ++it) {
+      buffers.emplace_back(**it);
+    }
+
+    auto CheckViews = [&](auto in_a_buffer, auto check_previous_buffer) {
+      if constexpr (check_previous_buffer) {
+        buffer_containing_previous_view = buffers.front();
+      }
+
+      for (int64_t i = 0; i < data.length; ++i) {
+        if (headers[i].IsInline()) continue;
+
+        std::string_view view{headers[i]};
+
+        if constexpr (check_previous_buffer) {
+          if (ARROW_PREDICT_TRUE(IsSubrangeOf(buffer_containing_previous_view, view))) {
+            // Fast path: for most string view arrays, we'll have runs
+            // of views into the same buffer.
+            continue;
+          }
+        }
+
+        if (!in_a_buffer(view)) {
+          return Status::Invalid(
+              "String view at slot ", i,
+              " views memory not resident in any buffer managed by the array");
+        }
+      }
+      return Status::OK();
+    };
+
+    if (buffers.empty()) {
+      // there are no character buffers; the only way this array
+      // can be valid is if all views are inline
+      return CheckViews([](std::string_view) { return std::false_type{}; },
+                        /*check_previous_buffer=*/std::false_type{});
+    }
+
+    // Simplest check for view-in-buffer: loop through buffers and check each one.
+    auto Linear = [&](std::string_view view) {
+      for (std::string_view buffer : buffers) {
+        if (IsSubrangeOf(buffer, view)) {
+          buffer_containing_previous_view = buffer;
+          return true;
+        }
+      }
+      return false;
+    };
+
+    if (buffers.size() <= 32) {
+      // If there are few buffers to search through, sorting/binary search is not
+      // worthwhile. TODO(bkietz) benchmark this and get a less magic number here.
+      return CheckViews(Linear,
+                        /*check_previous_buffer=*/std::true_type{});
+    }
+
+    auto DataPtrLess = [](std::string_view l, std::string_view r) {
+      return l.data() < r.data();
+    };
+
+    std::sort(buffers.begin(), buffers.end(), DataPtrLess);
+    bool non_overlapping =
+        buffers.end() !=
+        std::adjacent_find(buffers.begin(), buffers.end(),
+                           [](std::string_view before, std::string_view after) {
+                             return before.data() + before.size() <= after.data();
+                           });
+    if (ARROW_PREDICT_FALSE(!non_overlapping)) {
+      // Using a binary search with overlapping buffers would not *uniquely* identify
+      // a potentially-containing buffer. Moreover this should be a fairly rare case
+      // so optimizing for it seems premature.
+      return CheckViews(Linear,
+                        /*check_previous_buffer=*/std::true_type{});
+    }
+
+    // More sophisticated check for view-in-buffer: binary search through the buffers.
+    return CheckViews(
+        [&](std::string_view view) {
+          // Find the first buffer whose data starts after the data in view-
+          // only buffers *before* this could contain view. Since we've additionally
+          // checked that the buffers do not overlap, only the buffer *immediately before*
+          // this could contain view.
+          auto one_past_potential_super =
+              std::upper_bound(buffers.begin(), buffers.end(), view, DataPtrLess);
+
+          if (one_past_potential_super == buffers.begin()) return false;
+
+          auto potential_super = *(one_past_potential_super - 1);
+          if (!IsSubrangeOf(potential_super, view)) return false;
+
+          buffer_containing_previous_view = potential_super;
+          return true;
+        },
+        /*check_previous_buffer=*/std::true_type{});
+  }
+
   template <typename ListType>
   Status ValidateListLike(const ListType& type) {
     const ArrayData& values = *data.child_data[0];
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 8ccc645046..68250f0288 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -727,19 +727,13 @@ class ScalarEqualsVisitor {
   Status Visit(const DoubleScalar& left) { return CompareFloating(left); }
 
   template <typename T>
-  typename std::enable_if<std::is_base_of<BaseBinaryScalar, T>::value, Status>::type
+  enable_if_t<std::is_base_of<BaseBinaryScalar, T>::value, Status>
   Visit(const T& left) {
     const auto& right = checked_cast<const BaseBinaryScalar&>(right_);
     result_ = internal::SharedPtrEquals(left.value, right.value);
     return Status::OK();
   }
 
-  Status Visit(const BinaryViewScalar& left) {
-    const auto& right = checked_cast<const BinaryViewScalar&>(right_);
-    result_ = left.value == right.value;
-    return Status::OK();
-  }
-
   Status Visit(const Decimal128Scalar& left) {
     const auto& right = checked_cast<const Decimal128Scalar&>(right_);
     result_ = left.value == right.value;
diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
index 744f188908..523e20c4a7 100644
--- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
@@ -796,6 +796,9 @@ TEST(MakeStruct, Array) {
   EXPECT_THAT(MakeStructor({i32, str}, {"i", "s"}),
               ResultWith(Datum(*StructArray::Make({i32, str}, field_names))));
 
+  EXPECT_THAT(*MakeScalar("aa"), testing::Eq(StringScalar("aa")));
+  EXPECT_EQ(*MakeStructor({i32, MakeScalar("aa")}, {"i", "s"})->type(),
+            StructType({field("i", i32->type()), field("s", str->type())}));
   // Scalars are broadcast to the length of the arrays
   EXPECT_THAT(MakeStructor({i32, MakeScalar("aa")}, {"i", "s"}),
               ResultWith(Datum(*StructArray::Make({i32, str}, field_names))));
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 2498e7f562..b390a36b4c 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -47,7 +47,6 @@ namespace compute {
 template <typename TestType>
 class BaseTestStringKernels : public ::testing::Test {
  protected:
-  using OffsetType = typename TypeTraits<TestType>::OffsetType;
   using ScalarType = typename TypeTraits<TestType>::ScalarType;
 
   void CheckUnary(std::string func_name, std::string json_input,
@@ -97,7 +96,14 @@ class BaseTestStringKernels : public ::testing::Test {
   }
 
   std::shared_ptr<DataType> offset_type() {
-    return TypeTraits<OffsetType>::type_singleton();
+    if constexpr (is_binary_view_like_type<TestType>::value) {
+      // Views do not have offsets, but Functions like binary_length
+      // will return the length as uint32
+      return uint32();
+    } else {
+      using OffsetType = typename TypeTraits<TestType>::OffsetType;
+      return TypeTraits<OffsetType>::type_singleton();
+    }
   }
 
   template <typename CType = const char*>
diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc
index f2d4c29f0e..f9637a2f71 100644
--- a/cpp/src/arrow/compute/kernels/vector_hash.cc
+++ b/cpp/src/arrow/compute/kernels/vector_hash.cc
@@ -30,6 +30,7 @@
 #include "arrow/compute/kernels/common.h"
 #include "arrow/result.h"
 #include "arrow/util/hashing.h"
+#include "arrow/util/unreachable.h"
 
 namespace arrow {
 
@@ -261,7 +262,7 @@ class HashKernel : public KernelState {
 // Base class for all "regular" hash kernel implementations
 // (NullType has a separate implementation)
 
-template <typename Type, typename Scalar, typename Action,
+template <typename Type, typename Action, typename Scalar = typename Type::c_type,
           bool with_error_status = Action::with_error_status>
 class RegularHashKernel : public HashKernel {
  public:
@@ -501,39 +502,13 @@ class DictionaryHashKernel : public HashKernel {
 };
 
 // ----------------------------------------------------------------------
-
-template <typename Type, typename Action, typename Enable = void>
-struct HashKernelTraits {};
-
-template <typename Type, typename Action>
-struct HashKernelTraits<Type, Action, enable_if_null<Type>> {
-  using HashKernel = NullHashKernel<Action>;
-};
-
-template <typename Type, typename Action>
-struct HashKernelTraits<Type, Action, enable_if_has_c_type<Type>> {
-  using HashKernel = RegularHashKernel<Type, typename Type::c_type, Action>;
-};
-
-template <typename Type, typename Action>
-struct HashKernelTraits<Type, Action, enable_if_has_string_view<Type>> {
-  using HashKernel = RegularHashKernel<Type, std::string_view, Action>;
-};
-
-template <typename Type, typename Action>
-Result<std::unique_ptr<HashKernel>> HashInitImpl(KernelContext* ctx,
-                                                 const KernelInitArgs& args) {
-  using HashKernelType = typename HashKernelTraits<Type, Action>::HashKernel;
-  auto result = std::make_unique<HashKernelType>(args.inputs[0].GetSharedPtr(),
-                                                 args.options, ctx->memory_pool());
-  RETURN_NOT_OK(result->Reset());
-  return std::move(result);
-}
-
-template <typename Type, typename Action>
+template <typename HashKernel>
 Result<std::unique_ptr<KernelState>> HashInit(KernelContext* ctx,
                                               const KernelInitArgs& args) {
-  return HashInitImpl<Type, Action>(ctx, args);
+  auto result = std::make_unique<HashKernel>(args.inputs[0].GetSharedPtr(), args.options,
+                                             ctx->memory_pool());
+  RETURN_NOT_OK(result->Reset());
+  return std::move(result);
 }
 
 template <typename Action>
@@ -542,22 +517,22 @@ KernelInit GetHashInit(Type::type type_id) {
   // representation
   switch (type_id) {
     case Type::NA:
-      return HashInit<NullType, Action>;
+      return HashInit<NullHashKernel<Action>>;
     case Type::BOOL:
-      return HashInit<BooleanType, Action>;
+      return HashInit<RegularHashKernel<BooleanType, Action>>;
     case Type::INT8:
     case Type::UINT8:
-      return HashInit<UInt8Type, Action>;
+      return HashInit<RegularHashKernel<UInt8Type, Action>>;
     case Type::INT16:
     case Type::UINT16:
-      return HashInit<UInt16Type, Action>;
+      return HashInit<RegularHashKernel<UInt16Type, Action>>;
     case Type::INT32:
     case Type::UINT32:
     case Type::FLOAT:
     case Type::DATE32:
     case Type::TIME32:
     case Type::INTERVAL_MONTHS:
-      return HashInit<UInt32Type, Action>;
+      return HashInit<RegularHashKernel<UInt32Type, Action>>;
     case Type::INT64:
     case Type::UINT64:
     case Type::DOUBLE:
@@ -566,22 +541,23 @@ KernelInit GetHashInit(Type::type type_id) {
     case Type::TIMESTAMP:
     case Type::DURATION:
     case Type::INTERVAL_DAY_TIME:
-      return HashInit<UInt64Type, Action>;
+      return HashInit<RegularHashKernel<UInt64Type, Action>>;
     case Type::BINARY:
     case Type::STRING:
-      return HashInit<BinaryType, Action>;
+    case Type::BINARY_VIEW:
+    case Type::STRING_VIEW:
+      return HashInit<RegularHashKernel<BinaryType, Action, std::string_view>>;
     case Type::LARGE_BINARY:
     case Type::LARGE_STRING:
-      return HashInit<LargeBinaryType, Action>;
+      return HashInit<RegularHashKernel<LargeBinaryType, Action, std::string_view>>;
     case Type::FIXED_SIZE_BINARY:
     case Type::DECIMAL128:
     case Type::DECIMAL256:
-      return HashInit<FixedSizeBinaryType, Action>;
+      return HashInit<RegularHashKernel<FixedSizeBinaryType, Action, std::string_view>>;
     case Type::INTERVAL_MONTH_DAY_NANO:
-      return HashInit<MonthDayNanoIntervalType, Action>;
+      return HashInit<RegularHashKernel<MonthDayNanoIntervalType, Action>>;
     default:
-      DCHECK(false);
-      return nullptr;
+      Unreachable("non hashable type");
   }
 }
 
@@ -591,31 +567,11 @@ template <typename Action>
 Result<std::unique_ptr<KernelState>> DictionaryHashInit(KernelContext* ctx,
                                                         const KernelInitArgs& args) {
   const auto& dict_type = checked_cast<const DictionaryType&>(*args.inputs[0].type);
-  Result<std::unique_ptr<HashKernel>> indices_hasher;
-  switch (dict_type.index_type()->id()) {
-    case Type::INT8:
-    case Type::UINT8:
-      indices_hasher = HashInitImpl<UInt8Type, Action>(ctx, args);
-      break;
-    case Type::INT16:
-    case Type::UINT16:
-      indices_hasher = HashInitImpl<UInt16Type, Action>(ctx, args);
-      break;
-    case Type::INT32:
-    case Type::UINT32:
-      indices_hasher = HashInitImpl<UInt32Type, Action>(ctx, args);
-      break;
-    case Type::INT64:
-    case Type::UINT64:
-      indices_hasher = HashInitImpl<UInt64Type, Action>(ctx, args);
-      break;
-    default:
-      DCHECK(false) << "Unsupported dictionary index type";
-      break;
-  }
-  RETURN_NOT_OK(indices_hasher);
-  return std::make_unique<DictionaryHashKernel>(std::move(indices_hasher.ValueOrDie()),
-                                                dict_type.value_type());
+  ARROW_ASSIGN_OR_RAISE(auto indices_hasher,
+                        GetHashInit<Action>(dict_type.index_type()->id())(ctx, args));
+  return std::make_unique<DictionaryHashKernel>(
+      checked_pointer_cast<HashKernel>(std::move(indices_hasher)),
+      dict_type.value_type());
 }
 
 Status HashExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc
index bfe8a49a9e..aca767907c 100644
--- a/cpp/src/arrow/scalar.cc
+++ b/cpp/src/arrow/scalar.cc
@@ -226,13 +226,11 @@ struct ScalarValidateImpl {
 
   Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
 
-  Status Visit(const BinaryViewScalar& s) {
-    return Status::NotImplemented("Binary view");
-  }
+  Status Visit(const BinaryViewScalar& s) { return ValidateBinaryScalar(s); }
 
-  Status Visit(const StringViewScalar& s) {
-    return Status::NotImplemented("String view");
-  }
+  Status Visit(const StringViewScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const LargeBinaryScalar& s) { return ValidateBinaryScalar(s); }
 
   Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
 
@@ -499,14 +497,8 @@ Status Scalar::ValidateFull() const {
   return ScalarValidateImpl(/*full_validation=*/true).Validate(*this);
 }
 
-BinaryScalar::BinaryScalar(std::string s)
-    : BinaryScalar(Buffer::FromString(std::move(s))) {}
-
-LargeBinaryScalar::LargeBinaryScalar(std::string s)
-    : LargeBinaryScalar(Buffer::FromString(std::move(s))) {}
-
-LargeStringScalar::LargeStringScalar(std::string s)
-    : LargeStringScalar(Buffer::FromString(std::move(s))) {}
+BaseBinaryScalar::BaseBinaryScalar(std::string s, std::shared_ptr<DataType> type)
+    : BaseBinaryScalar(Buffer::FromString(std::move(s)), std::move(type)) {}
 
 FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::shared_ptr<Buffer> value,
                                              std::shared_ptr<DataType> type,
diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h
index 9f41ad0975..6042f0b434 100644
--- a/cpp/src/arrow/scalar.h
+++ b/cpp/src/arrow/scalar.h
@@ -253,6 +253,8 @@ struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase {
 
   BaseBinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type)
       : internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {}
+
+  BaseBinaryScalar(std::string s, std::shared_ptr<DataType> type);
 };
 
 struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar {
@@ -262,7 +264,7 @@ struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar {
   explicit BinaryScalar(std::shared_ptr<Buffer> value)
       : BinaryScalar(std::move(value), binary()) {}
 
-  explicit BinaryScalar(std::string s);
+  explicit BinaryScalar(std::string s) : BaseBinaryScalar(std::move(s), binary()) {}
 
   BinaryScalar() : BinaryScalar(binary()) {}
 };
@@ -274,6 +276,8 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar {
   explicit StringScalar(std::shared_ptr<Buffer> value)
       : StringScalar(std::move(value), utf8()) {}
 
+  explicit StringScalar(std::string s) : BinaryScalar(std::move(s), utf8()) {}
+
   StringScalar() : StringScalar(utf8()) {}
 };
 
@@ -284,6 +288,9 @@ struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar {
   explicit BinaryViewScalar(std::shared_ptr<Buffer> value)
       : BinaryViewScalar(std::move(value), binary_view()) {}
 
+  explicit BinaryViewScalar(std::string s)
+      : BaseBinaryScalar(std::move(s), binary_view()) {}
+
   BinaryViewScalar() : BinaryViewScalar(binary_view()) {}
 
   std::string_view view() const override { return std::string_view(*this->value); }
@@ -296,6 +303,9 @@ struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar {
   explicit StringViewScalar(std::shared_ptr<Buffer> value)
       : StringViewScalar(std::move(value), utf8_view()) {}
 
+  explicit StringViewScalar(std::string s)
+      : BinaryViewScalar(std::move(s), utf8_view()) {}
+
   StringViewScalar() : StringViewScalar(utf8_view()) {}
 };
 
@@ -309,7 +319,8 @@ struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar {
   explicit LargeBinaryScalar(std::shared_ptr<Buffer> value)
       : LargeBinaryScalar(std::move(value), large_binary()) {}
 
-  explicit LargeBinaryScalar(std::string s);
+  explicit LargeBinaryScalar(std::string s)
+      : BaseBinaryScalar(std::move(s), large_binary()) {}
 
   LargeBinaryScalar() : LargeBinaryScalar(large_binary()) {}
 };
@@ -321,7 +332,8 @@ struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar {
   explicit LargeStringScalar(std::shared_ptr<Buffer> value)
       : LargeStringScalar(std::move(value), large_utf8()) {}
 
-  explicit LargeStringScalar(std::string s);
+  explicit LargeStringScalar(std::string s)
+      : LargeBinaryScalar(std::move(s), large_utf8()) {}
 
   LargeStringScalar() : LargeStringScalar(large_utf8()) {}
 };
diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h
index fc319a6d10..4d29706829 100644
--- a/cpp/src/arrow/testing/gtest_util.h
+++ b/cpp/src/arrow/testing/gtest_util.h
@@ -177,12 +177,16 @@ using BaseBinaryArrowTypes =
     ::testing::Types<BinaryType, LargeBinaryType, StringType, LargeStringType>;
 
 using BaseBinaryOrBinaryViewLikeArrowTypes =
-    ::testing::Types<BinaryType, LargeBinaryType, StringType, LargeStringType>;
+    ::testing::Types<BinaryType, LargeBinaryType, BinaryViewType, StringType,
+                     LargeStringType, StringViewType>;
 
 using BinaryArrowTypes = ::testing::Types<BinaryType, LargeBinaryType>;
 
 using StringArrowTypes = ::testing::Types<StringType, LargeStringType>;
 
+using StringOrStringViewArrowTypes =
+    ::testing::Types<StringType, LargeStringType, StringViewType>;
+
 using ListArrowTypes = ::testing::Types<ListType, LargeListType>;
 
 using UnionArrowTypes = ::testing::Types<SparseUnionType, DenseUnionType>;
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index f4e082b3f6..faa2eb2af0 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -114,8 +114,14 @@ struct ARROW_EXPORT DataTypeLayout {
   std::vector<BufferSpec> buffers;
   /// Whether this type expects an associated dictionary array.
   bool has_dictionary = false;
+  /// If this is provided, the number of buffers expected is only lower-bounded by
+  /// buffers.size(). Buffers beyond this lower bound are expected to conform to
+  /// variadic_spec.
+  std::optional<BufferSpec> variadic_spec;
 
-  explicit DataTypeLayout(std::vector<BufferSpec> v) : buffers(std::move(v)) {}
+  explicit DataTypeLayout(std::vector<BufferSpec> buffers,
+                          std::optional<BufferSpec> variadic_spec = {})
+      : buffers(std::move(buffers)), variadic_spec(variadic_spec) {}
 };
 
 /// \brief Base class for all data types
@@ -701,7 +707,8 @@ class ARROW_EXPORT BinaryViewType : public DataType {
 
   DataTypeLayout layout() const override {
     return DataTypeLayout(
-        {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(StringHeader))});
+        {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(StringHeader))},
+        DataTypeLayout::VariableWidth());
   }
 
   std::string ToString() const override;


[arrow] 09/15: fixes in substrait, rename in LICENSE, owning scalars

Posted by bk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 94fcb95927b1edc0c9805b674e4df6c5a31f9277
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Tue Nov 15 14:36:15 2022 -0500

    fixes in substrait, rename in LICENSE, owning scalars
---
 LICENSE.txt                                        |  2 +-
 cpp/src/arrow/array/builder_binary.h               |  3 +-
 .../arrow/engine/substrait/expression_internal.cc  |  9 ++++
 cpp/src/arrow/engine/substrait/type_internal.cc    |  7 +++
 cpp/src/arrow/scalar.cc                            |  9 ----
 cpp/src/arrow/scalar.h                             | 32 ++++--------
 cpp/src/arrow/util/string_header.h                 | 60 ++++++++++------------
 7 files changed, 55 insertions(+), 67 deletions(-)

diff --git a/LICENSE.txt b/LICENSE.txt
index d282bfe7b3..02ac840980 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -2049,7 +2049,7 @@ License: http://www.apache.org/licenses/LICENSE-2.0
 
 This project includes code from Velox.
 
- * cpp/src/arrow/util/bytes_header.h
+ * cpp/src/arrow/util/string_header.h
 
 is based on Velox's
 
diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index ca5209b81e..b9d926cb16 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -504,8 +504,7 @@ class ARROW_EXPORT StringHeapBuilder {
   /// UnsafeAppend operations without the need to allocate more memory
   Status Reserve(int64_t num_bytes) {
     if (num_bytes > current_remaining_bytes_) {
-      current_remaining_bytes_ =
-          num_bytes > kDefaultBlocksize ? num_bytes : kDefaultBlocksize;
+      current_remaining_bytes_ = num_bytes > blocksize_ ? num_bytes : blocksize_;
       ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> new_block,
                             AllocateBuffer(current_remaining_bytes_, pool_));
       current_out_buffer_ = new_block->mutable_data();
diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc
index b8b545febc..19ec354cf6 100644
--- a/cpp/src/arrow/engine/substrait/expression_internal.cc
+++ b/cpp/src/arrow/engine/substrait/expression_internal.cc
@@ -606,6 +606,15 @@ struct ScalarToProtoImpl {
                       s);
   }
 
+  Status Visit(const StringViewScalar& s) {
+    return FromBuffer([](Lit* lit, std::string&& s) { lit->set_string(std::move(s)); },
+                      s);
+  }
+  Status Visit(const BinaryViewScalar& s) {
+    return FromBuffer([](Lit* lit, std::string&& s) { lit->set_binary(std::move(s)); },
+                      s);
+  }
+
   Status Visit(const FixedSizeBinaryScalar& s) {
     return FromBuffer(
         [](Lit* lit, std::string&& s) { lit->set_fixed_binary(std::move(s)); }, s);
diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc
index 16032df67d..e6b0b41a14 100644
--- a/cpp/src/arrow/engine/substrait/type_internal.cc
+++ b/cpp/src/arrow/engine/substrait/type_internal.cc
@@ -256,6 +256,13 @@ struct DataTypeToProtoImpl {
     return SetWith(&::substrait::Type::set_allocated_binary);
   }
 
+  Status Visit(const StringViewType& t) {
+    return SetWith(&::substrait::Type::set_allocated_string);
+  }
+  Status Visit(const BinaryViewType& t) {
+    return SetWith(&::substrait::Type::set_allocated_binary);
+  }
+
   Status Visit(const FixedSizeBinaryType& t) {
     SetWithThen(&::substrait::Type::set_allocated_fixed_binary)
         ->set_length(t.byte_width());
diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc
index d139845bd7..bfe8a49a9e 100644
--- a/cpp/src/arrow/scalar.cc
+++ b/cpp/src/arrow/scalar.cc
@@ -70,12 +70,6 @@ struct ScalarHashImpl {
 
   Status Visit(const BaseBinaryScalar& s) { return BufferHash(*s.value); }
 
-  Status Visit(const BinaryViewScalar& s) {
-    const StringHeader& v = s.value;
-    hash_ ^= internal::ComputeStringHash<1>(v.data(), v.size());
-    return Status::OK();
-  }
-
   template <typename T>
   Status Visit(const TemporalScalar<T>& s) {
     return ValueHash(s);
@@ -508,9 +502,6 @@ Status Scalar::ValidateFull() const {
 BinaryScalar::BinaryScalar(std::string s)
     : BinaryScalar(Buffer::FromString(std::move(s))) {}
 
-StringScalar::StringScalar(std::string s)
-    : StringScalar(Buffer::FromString(std::move(s))) {}
-
 LargeBinaryScalar::LargeBinaryScalar(std::string s)
     : LargeBinaryScalar(Buffer::FromString(std::move(s))) {}
 
diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h
index 9b7f604132..9f41ad0975 100644
--- a/cpp/src/arrow/scalar.h
+++ b/cpp/src/arrow/scalar.h
@@ -251,7 +251,6 @@ struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase {
     return value ? std::string_view(*value) : std::string_view();
   }
 
- protected:
   BaseBinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type)
       : internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {}
 };
@@ -260,9 +259,6 @@ struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar {
   using BaseBinaryScalar::BaseBinaryScalar;
   using TypeClass = BinaryType;
 
-  BinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type)
-      : BaseBinaryScalar(std::move(value), std::move(type)) {}
-
   explicit BinaryScalar(std::shared_ptr<Buffer> value)
       : BinaryScalar(std::move(value), binary()) {}
 
@@ -278,37 +274,29 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar {
   explicit StringScalar(std::shared_ptr<Buffer> value)
       : StringScalar(std::move(value), utf8()) {}
 
-  explicit StringScalar(std::string s);
-
   StringScalar() : StringScalar(utf8()) {}
 };
 
-struct ARROW_EXPORT BinaryViewScalar : public internal::PrimitiveScalarBase {
-  using internal::PrimitiveScalarBase::PrimitiveScalarBase;
+struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar {
+  using BaseBinaryScalar::BaseBinaryScalar;
   using TypeClass = BinaryViewType;
 
-  explicit BinaryViewScalar(StringHeader value, std::shared_ptr<DataType> type)
-      : internal::PrimitiveScalarBase(std::move(type), true), value(value) {}
-
-  explicit BinaryViewScalar(StringHeader value)
-      : BinaryViewScalar(value, binary_view()) {}
-
-  BinaryViewScalar() : internal::PrimitiveScalarBase(binary_view(), false) {}
-
-  void* mutable_data() override { return reinterpret_cast<void*>(&this->value); }
+  explicit BinaryViewScalar(std::shared_ptr<Buffer> value)
+      : BinaryViewScalar(std::move(value), binary_view()) {}
 
-  std::string_view view() const override { return std::string_view(this->value); }
+  BinaryViewScalar() : BinaryViewScalar(binary_view()) {}
 
-  StringHeader value;
+  std::string_view view() const override { return std::string_view(*this->value); }
 };
 
 struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar {
+  using BinaryViewScalar::BinaryViewScalar;
   using TypeClass = StringViewType;
 
-  explicit StringViewScalar(StringHeader value)
-      : BinaryViewScalar(std::move(value), utf8_view()) {}
+  explicit StringViewScalar(std::shared_ptr<Buffer> value)
+      : StringViewScalar(std::move(value), utf8_view()) {}
 
-  StringViewScalar() : BinaryViewScalar(utf8_view()) {}
+  StringViewScalar() : StringViewScalar(utf8_view()) {}
 };
 
 struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar {
diff --git a/cpp/src/arrow/util/string_header.h b/cpp/src/arrow/util/string_header.h
index 29f378a580..8ba18a8366 100644
--- a/cpp/src/arrow/util/string_header.h
+++ b/cpp/src/arrow/util/string_header.h
@@ -33,6 +33,7 @@
 
 #pragma once
 
+#include <array>
 #include <cassert>
 #include <cstdint>
 #include <cstring>
@@ -69,35 +70,27 @@ struct StringHeader {
   static constexpr size_t kPrefixSize = 4;
   static constexpr size_t kInlineSize = 12;
 
-  StringHeader() {
-    static_assert(sizeof(StringHeader) == 16, "struct expected by exactly 16 bytes");
-    ;
-    memset(this, 0, sizeof(StringHeader));
-  }
+  StringHeader() = default;
 
-  explicit StringHeader(uint32_t size) : size_(size) {
-    memset(prefix_, 0, kPrefixSize);
-    value_.data = nullptr;
+  static StringHeader makeInline(uint32_t size, char** data) {
+    assert(size <= kInlineSize);
+    StringHeader s;
+    s.size_ = size;
+    *data = const_cast<char*>(s.data());
+    return s;
   }
 
-  StringHeader(const char* data, size_t len) : size_(len) {
+  StringHeader(const char* data, size_t len) : size_(static_cast<uint32_t>(len)) {
+    if (size_ == 0) return;
+
     // TODO: better option than assert?
-    assert(data || size_ == 0);
+    assert(data);
     if (IsInline()) {
-      // Zero the inline part.
-      // this makes sure that inline strings can be compared for equality with 2
-      // int64 compares.
-      memset(prefix_, 0, kPrefixSize);
-      if (size_ == 0) {
-        return;
-      }
-      // small string: inlined. Zero the last 8 bytes first to allow for whole
-      // word comparison.
-      value_.data = nullptr;
-      memcpy(prefix_, data, size_);
+      // small string: inlined. Bytes beyond size_ are already 0
+      memcpy(prefix_.data(), data, size_);
     } else {
       // large string: store pointer
-      memcpy(prefix_, data, kPrefixSize);
+      memcpy(prefix_.data(), data, kPrefixSize);
       value_.data = data;
     }
   }
@@ -112,19 +105,20 @@ struct StringHeader {
   //   StringHeader bh = "literal";
   //   std::optional<BytesView> obh = "literal";
   //
-  /* implicit */ StringHeader(const char* data) : StringHeader(data, strlen(data)) {}
+  // NOLINTNEXTLINE runtime/explicit
+  StringHeader(const char* data) : StringHeader(data, strlen(data)) {}
 
   explicit StringHeader(const std::string& value)
       : StringHeader(value.data(), value.size()) {}
 
-  explicit StringHeader(const std::string_view& value)
+  explicit StringHeader(std::string_view value)
       : StringHeader(value.data(), value.size()) {}
 
   bool IsInline() const { return IsInline(size_); }
 
   static constexpr bool IsInline(uint32_t size) { return size <= kInlineSize; }
 
-  const char* data() const { return IsInline() ? prefix_ : value_.data; }
+  const char* data() const { return IsInline() ? prefix_.data() : value_.data; }
 
   size_t size() const { return size_; }
 
@@ -160,7 +154,7 @@ struct StringHeader {
     if (PrefixAsInt() != other.PrefixAsInt()) {
       // The result is decided on prefix. The shorter will be less
       // because the prefix is padded with zeros.
-      return memcmp(prefix_, other.prefix_, kPrefixSize);
+      return memcmp(prefix_.data(), other.prefix_.data(), kPrefixSize);
     }
     int32_t size = std::min(size_, other.size_) - kPrefixSize;
     if (size <= 0) {
@@ -168,7 +162,7 @@ struct StringHeader {
       return size_ - other.size_;
     }
     if (static_cast<uint32_t>(size) <= kInlineSize && IsInline() && other.IsInline()) {
-      int32_t result = memcmp(value_.inlined, other.value_.inlined, size);
+      int32_t result = memcmp(value_.inlined.data(), other.value_.inlined.data(), size);
       return (result != 0) ? result : size_ - other.size_;
     }
     int32_t result = memcmp(data() + kPrefixSize, other.data() + kPrefixSize, size);
@@ -183,9 +177,7 @@ struct StringHeader {
 
   bool operator>=(const StringHeader& other) const { return Compare(other) >= 0; }
 
-  operator std::string() const { return std::string(data(), size()); }
-
-  std::string GetString() const { return *this; }
+  std::string GetString() const { return std::string(data(), size()); }
 
   explicit operator std::string_view() const { return std::string_view(data(), size()); }
 
@@ -208,12 +200,14 @@ struct StringHeader {
 
   // We rely on all members being laid out top to bottom . C++
   // guarantees this.
-  uint32_t size_;
-  char prefix_[4];
+  uint32_t size_ = 0;
+  std::array<char, 4> prefix_ = {0};
   union {
-    char inlined[8];
+    std::array<char, 8> inlined = {0};
     const char* data;
   } value_;
 };
 
+static_assert(sizeof(StringHeader) == 16, "struct expected by exactly 16 bytes");
+
 }  // namespace arrow


[arrow] 04/15: add StringView/BinaryView to AllTypeIds

Posted by bk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 5c8a6ecb8c8747ec425a019cfa5c6084be04da0e
Author: Tobias Zagorni <to...@zagorni.eu>
AuthorDate: Tue Oct 18 17:18:02 2022 +0200

    add StringView/BinaryView to AllTypeIds
---
 cpp/src/arrow/type.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index b976260ccd..becd0c0c62 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -129,6 +129,8 @@ std::vector<Type::type> AllTypeIds() {
           Type::BINARY,
           Type::LARGE_STRING,
           Type::LARGE_BINARY,
+          Type::STRING_VIEW,
+          Type::BINARY_VIEW,
           Type::FIXED_SIZE_BINARY,
           Type::STRUCT,
           Type::LIST,


[arrow] 13/15: wrote <=, needed >=

Posted by bk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 4072a6b2ab76e36ce724e16eef8c6a2aa491dda3
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Sat Nov 19 21:28:48 2022 -0500

    wrote <=, needed >=
---
 cpp/src/arrow/array/concatenate_test.cc | 2 +-
 cpp/src/arrow/array/validate.cc         | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc
index 1bc0c65bec..59d7885784 100644
--- a/cpp/src/arrow/array/concatenate_test.cc
+++ b/cpp/src/arrow/array/concatenate_test.cc
@@ -91,7 +91,7 @@ class ConcatenateTest : public ::testing::Test {
       for (auto null_probability : this->null_probabilities_) {
         std::shared_ptr<Array> array;
         factory(size, null_probability, &array);
-          ASSERT_OK(array->ValidateFull());
+        ASSERT_OK(array->ValidateFull());
         auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front());
         auto slices = this->Slices(array, offsets);
         ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices));
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index 53d74ba148..53836efd97 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -615,7 +615,7 @@ struct ValidateArrayImpl {
 
     auto IsSubrangeOf = [](std::string_view super, std::string_view sub) {
       return super.data() <= sub.data() &&
-             super.data() + super.size() <= sub.data() + sub.size();
+             super.data() + super.size() >= sub.data() + sub.size();
     };
 
     std::vector<std::string_view> buffers;
@@ -643,7 +643,7 @@ struct ValidateArrayImpl {
 
         if (!in_a_buffer(view)) {
           return Status::Invalid(
-              "String view at slot ", i,
+              "String view at slot ", i, " @", (std::uintptr_t)view.data(),
               " views memory not resident in any buffer managed by the array");
         }
       }


[arrow] 10/15: delete potentially internal viewing members for rvalues

Posted by bk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 34efa83a8bccefb34085eb1daaee6ceead81b575
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Fri Nov 18 09:16:11 2022 -0500

    delete potentially internal viewing members for rvalues
---
 cpp/src/arrow/util/string_header.h | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/util/string_header.h b/cpp/src/arrow/util/string_header.h
index 8ba18a8366..e3e9d9d69c 100644
--- a/cpp/src/arrow/util/string_header.h
+++ b/cpp/src/arrow/util/string_header.h
@@ -83,7 +83,7 @@ struct StringHeader {
   StringHeader(const char* data, size_t len) : size_(static_cast<uint32_t>(len)) {
     if (size_ == 0) return;
 
-    // TODO: better option than assert?
+    // TODO(bkietz) better option than assert?
     assert(data);
     if (IsInline()) {
       // small string: inlined. Bytes beyond size_ are already 0
@@ -118,7 +118,8 @@ struct StringHeader {
 
   static constexpr bool IsInline(uint32_t size) { return size <= kInlineSize; }
 
-  const char* data() const { return IsInline() ? prefix_.data() : value_.data; }
+  const char* data() const& { return IsInline() ? prefix_.data() : value_.data; }
+  const char* data() && = delete;
 
   size_t size() const { return size_; }
 
@@ -179,11 +180,14 @@ struct StringHeader {
 
   std::string GetString() const { return std::string(data(), size()); }
 
-  explicit operator std::string_view() const { return std::string_view(data(), size()); }
+  explicit operator std::string_view() const& { return std::string_view(data(), size()); }
+  operator std::string_view() && = delete;
 
-  const char* begin() const { return data(); }
+  const char* begin() const& { return data(); }
+  const char* end() const& { return data() + size(); }
 
-  const char* end() const { return data() + size(); }
+  const char* begin() && = delete;
+  const char* end() && = delete;
 
   bool empty() const { return size() == 0; }
 


[arrow] 07/15: fix formatting

Posted by bk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 2f619ba07daf5ab8ab18e20bf38617f51d1291bb
Author: Tobias Zagorni <to...@zagorni.eu>
AuthorDate: Tue Oct 18 17:25:06 2022 +0200

    fix formatting
---
 cpp/src/arrow/type_traits.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index dcd7c36ba2..7ba04d35e2 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -829,8 +829,8 @@ using enable_if_has_c_type = enable_if_t<has_c_type<T>::value, R>;
 template <typename T>
 using has_string_view =
     std::integral_constant<bool, std::is_same<BinaryType, T>::value ||
-                           std::is_same<BinaryViewType, T>::value ||
-                           std::is_same<LargeBinaryType, T>::value ||
+                                     std::is_same<BinaryViewType, T>::value ||
+                                     std::is_same<LargeBinaryType, T>::value ||
                                      std::is_same<StringType, T>::value ||
                                      std::is_same<StringViewType, T>::value ||
                                      std::is_same<LargeStringType, T>::value ||


[arrow] 05/15: implement inline visitor for StringView/BinaryView

Posted by bk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 2aaccd1b8c4ca9b7e52864bd50a8c0370672827a
Author: Tobias Zagorni <to...@zagorni.eu>
AuthorDate: Tue Oct 18 17:21:14 2022 +0200

    implement inline visitor for StringView/BinaryView
---
 cpp/src/arrow/visit_data_inline.h | 53 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/cpp/src/arrow/visit_data_inline.h b/cpp/src/arrow/visit_data_inline.h
index 7d37698f14..b6996d188c 100644
--- a/cpp/src/arrow/visit_data_inline.h
+++ b/cpp/src/arrow/visit_data_inline.h
@@ -155,6 +155,59 @@ struct ArraySpanInlineVisitor<T, enable_if_base_binary<T>> {
   }
 };
 
+// BinaryView, StringView...
+template <typename T>
+struct ArraySpanInlineVisitor<T, enable_if_binary_view_like<T>> {
+  using c_type = std::string_view;
+
+  template <typename ValidFunc, typename NullFunc>
+  static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func,
+                            NullFunc&& null_func) {
+    if (arr.length == 0) {
+      return Status::OK();
+    }
+    const StringHeader* headers;
+    if (arr.buffers[1].data == NULLPTR) {
+      headers = NULLPTR;
+    } else {
+      // GetValues applies the array offset here, so the indices passed to
+      // the visitor below are relative to the start of this (sliced) span.
+      headers = arr.GetValues<StringHeader>(1);
+    }
+    return VisitBitBlocks(
+        arr.buffers[0].data, arr.offset, arr.length,
+        [&](int64_t (index)) {
+          return valid_func(static_cast<std::string_view>(headers[index]));
+        },
+        [&]() {
+          return null_func();
+        });
+  }
+
+  template <typename ValidFunc, typename NullFunc>
+  static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func,
+                        NullFunc&& null_func) {
+    if (arr.length == 0) {
+      return;
+    }
+    const StringHeader* headers;
+    if (arr.buffers[1].data == NULLPTR) {
+      headers = NULLPTR;
+    } else {
+      // GetValues applies the array offset here, so the indices passed to
+      // the visitor below are relative to the start of this (sliced) span.
+      headers = arr.GetValues<StringHeader>(1);
+    }
+
+    VisitBitBlocksVoid(
+        arr.buffers[0].data, arr.offset, arr.length,
+        [&](int64_t (index)) {
+          valid_func(static_cast<std::string_view>(headers[index]));
+        },
+        std::forward<NullFunc>(null_func));
+  }
+};
+
 // FixedSizeBinary, Decimal128
 template <typename T>
 struct ArraySpanInlineVisitor<T, enable_if_fixed_size_binary<T>> {


[arrow] 02/15: BinaryViewBuilder: fix duplicate values in null bitmap

Posted by bk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 20d89459f1c3f328f564ad484cc7868be1aff0b0
Author: Tobias Zagorni <to...@zagorni.eu>
AuthorDate: Tue Oct 18 17:14:16 2022 +0200

    BinaryViewBuilder: fix duplicate values in null bitmap
---
 cpp/src/arrow/array/builder_binary.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index c716e6d225..ca5209b81e 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -563,7 +563,6 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
       ARROW_ASSIGN_OR_RAISE(value, data_heap_builder_.Append(value, length));
     }
     UnsafeAppend(StringHeader(value, length));
-    UnsafeAppendToBitmap(true);
     return Status::OK();
   }
 


[arrow] 01/15: Draft basic scaffolding for Binary/StringView types and get compiling

Posted by bk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 8e1c1442feebe9af2db607e50abd4b9bb900e3fb
Author: Wes McKinney <we...@apache.org>
AuthorDate: Fri Sep 9 16:35:27 2022 -0500

    Draft basic scaffolding for Binary/StringView types and get compiling
---
 LICENSE.txt                                        |  16 +-
 cpp/src/arrow/array/array_base.cc                  |   4 +
 cpp/src/arrow/array/array_binary.cc                |  12 +
 cpp/src/arrow/array/array_binary.h                 |  58 +++++
 cpp/src/arrow/array/builder_binary.cc              |  86 +++++++
 cpp/src/arrow/array/builder_binary.h               | 248 +++++++++++++++++++++
 cpp/src/arrow/array/builder_dict.cc                |   6 +
 cpp/src/arrow/array/builder_dict.h                 |  10 +
 cpp/src/arrow/array/concatenate.cc                 |   4 +
 cpp/src/arrow/array/util.cc                        |  13 ++
 cpp/src/arrow/array/validate.cc                    |  20 +-
 cpp/src/arrow/compare.cc                           |  13 +-
 cpp/src/arrow/ipc/feather.cc                       |   4 +-
 cpp/src/arrow/ipc/metadata_internal.cc             |  10 +
 cpp/src/arrow/ipc/reader.cc                        |   5 +
 cpp/src/arrow/ipc/writer.cc                        |   4 +
 cpp/src/arrow/json/test_common.h                   |  10 +-
 cpp/src/arrow/scalar.cc                            |  14 ++
 cpp/src/arrow/scalar.h                             |  29 +++
 cpp/src/arrow/testing/json_internal.cc             |  10 +-
 cpp/src/arrow/type.cc                              |  16 +-
 cpp/src/arrow/type.h                               |  46 ++++
 cpp/src/arrow/type_fwd.h                           |  21 ++
 cpp/src/arrow/type_test.cc                         |  12 +
 cpp/src/arrow/type_traits.h                        |  57 ++++-
 cpp/src/arrow/util/string_header.h                 | 219 ++++++++++++++++++
 cpp/src/arrow/visitor.cc                           |   8 +-
 cpp/src/arrow/visitor.h                            |   6 +
 cpp/src/arrow/visitor_generate.h                   |   2 +
 cpp/src/parquet/column_writer.cc                   |   1 +
 python/pyarrow/src/arrow/python/arrow_to_pandas.cc |  38 +---
 python/pyarrow/src/arrow/python/python_to_arrow.cc |  23 +-
 32 files changed, 974 insertions(+), 51 deletions(-)

diff --git a/LICENSE.txt b/LICENSE.txt
index 86cfaf546c..d282bfe7b3 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1965,7 +1965,7 @@ This project includes code from the autobrew project.
 The following files are based on code from the autobrew project:
 * r/tools/autobrew
 * dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb
-* dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb 
+* dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb
 
 Copyright (c) 2019, Jeroen Ooms
 License: MIT
@@ -2047,6 +2047,20 @@ License: http://www.apache.org/licenses/LICENSE-2.0
 
 --------------------------------------------------------------------------------
 
+This project includes code from Velox.
+
+ * cpp/src/arrow/util/bytes_header.h
+
+is based on Velox's
+
+ * velox/type/StringView.h
+
+Copyright: Copyright (c) Facebook, Inc. and its affiliates.
+Home page: https://github.com/facebookincubator/velox
+License: http://www.apache.org/licenses/LICENSE-2.0
+
+--------------------------------------------------------------------------------
+
 The file cpp/src/arrow/vendored/musl/strptime.c has the following license
 
 Copyright © 2005-2020 Rich Felker, et al.
diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc
index 5d27b2aedf..de9ab2e985 100644
--- a/cpp/src/arrow/array/array_base.cc
+++ b/cpp/src/arrow/array/array_base.cc
@@ -82,6 +82,10 @@ struct ScalarFromArraySlotImpl {
     return Finish(a.GetString(index_));
   }
 
+  Status Visit(const BinaryViewArray& a) {
+    return Status::NotImplemented("ScalarFromArraySlot -> BinaryView");
+  }
+
   Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); }
 
   Status Visit(const DayTimeIntervalArray& a) { return Finish(a.Value(index_)); }
diff --git a/cpp/src/arrow/array/array_binary.cc b/cpp/src/arrow/array/array_binary.cc
index 9466b5a48f..cfc467160a 100644
--- a/cpp/src/arrow/array/array_binary.cc
+++ b/cpp/src/arrow/array/array_binary.cc
@@ -89,6 +89,18 @@ LargeStringArray::LargeStringArray(int64_t length,
 
 Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }
 
+BinaryViewArray::BinaryViewArray(const std::shared_ptr<ArrayData>& data) {
+  ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW);
+  SetData(data);
+}
+
+StringViewArray::StringViewArray(const std::shared_ptr<ArrayData>& data) {
+  ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW);
+  SetData(data);
+}
+
+Status StringViewArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }
+
 FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data) {
   SetData(data);
 }
diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h
index 7e58a96ff8..03ee77fab8 100644
--- a/cpp/src/arrow/array/array_binary.h
+++ b/cpp/src/arrow/array/array_binary.h
@@ -22,6 +22,7 @@
 
 #include <cstdint>
 #include <memory>
+#include <optional>
 #include <string>
 #include <string_view>
 #include <vector>
@@ -217,6 +218,63 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
   Status ValidateUTF8() const;
 };
 
+// ----------------------------------------------------------------------
+// BinaryView and StringView
+
+/// Concrete Array class for variable-size binary view data using the
+/// StringHeader struct to reference in-line or out-of-line string values
+class ARROW_EXPORT BinaryViewArray : public PrimitiveArray {
+ public:
+  using TypeClass = BinaryViewType;
+  using IteratorType = stl::ArrayIterator<BinaryViewArray>;
+
+  explicit BinaryViewArray(const std::shared_ptr<ArrayData>& data);
+
+  BinaryViewArray(int64_t length, const std::shared_ptr<Buffer>& data,
+                  const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+                  int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+      : PrimitiveArray(binary_view(), length, data, null_bitmap, null_count, offset) {}
+
+  const StringHeader* raw_values() const {
+    return reinterpret_cast<const StringHeader*>(raw_values_) + data_->offset;
+  }
+
+  StringHeader Value(int64_t i) const { return raw_values()[i]; }
+
+  // For API compatibility with BinaryArray etc.
+  std::string_view GetView(int64_t i) const { return std::string_view(Value(i)); }
+
+  // EXPERIMENTAL
+  std::optional<std::string_view> operator[](int64_t i) const {
+    return *IteratorType(*this, i);
+  }
+
+  IteratorType begin() const { return IteratorType(*this); }
+  IteratorType end() const { return IteratorType(*this, length()); }
+
+ protected:
+  using PrimitiveArray::PrimitiveArray;
+};
+
+/// Concrete Array class for variable-size string view (utf-8) data using
+/// StringHeader to reference in-line or out-of-line string values
+class ARROW_EXPORT StringViewArray : public BinaryViewArray {
+ public:
+  using TypeClass = StringViewType;
+
+  explicit StringViewArray(const std::shared_ptr<ArrayData>& data);
+
+  StringViewArray(int64_t length, const std::shared_ptr<Buffer>& data,
+                  const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+                  int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+      : BinaryViewArray(utf8_view(), length, data, null_bitmap, null_count, offset) {}
+
+  /// \brief Validate that this array contains only valid UTF8 entries
+  ///
+  /// This check is also implied by ValidateFull()
+  Status ValidateUTF8() const;
+};
+
 // ----------------------------------------------------------------------
 // Fixed width binary
 
diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc
index 571f450aab..e0a7bc1193 100644
--- a/cpp/src/arrow/array/builder_binary.cc
+++ b/cpp/src/arrow/array/builder_binary.cc
@@ -40,6 +40,92 @@ namespace arrow {
 
 using internal::checked_cast;
 
+// ----------------------------------------------------------------------
+// Binary/StringView
+
+Status BinaryViewBuilder::AppendValues(const std::vector<std::string>& values,
+                                       const uint8_t* valid_bytes) {
+  // We only need to allocate memory for the out-of-line strings
+  std::size_t out_of_line_total = std::accumulate(
+      values.begin(), values.end(), 0ULL, [](uint64_t sum, const std::string& str) {
+        size_t length = str.size();
+        return sum + (length > StringHeader::kInlineSize ? length : 0);
+      });
+  RETURN_NOT_OK(Reserve(values.size()));
+  RETURN_NOT_OK(ReserveData(out_of_line_total));
+
+  if (valid_bytes != nullptr) {
+    for (std::size_t i = 0; i < values.size(); ++i) {
+      if (valid_bytes[i]) {
+        UnsafeAppend(values[i]);
+      } else {
+        UnsafeAppendNull();
+      }
+    }
+  } else {
+    for (std::size_t i = 0; i < values.size(); ++i) {
+      UnsafeAppend(values[i]);
+    }
+  }
+  UnsafeAppendToBitmap(valid_bytes, values.size());
+  return Status::OK();
+}
+
+Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offset,
+                                           int64_t length) {
+  auto bitmap = array.GetValues<uint8_t>(0, 0);
+  auto values = array.GetValues<StringHeader>(1) + offset;
+
+  int64_t out_of_line_total = 0;
+  for (int64_t i = 0; i < length; i++) {
+    if (!values[i].IsInline()) {
+      out_of_line_total += static_cast<int64_t>(values[i].size());
+    }
+  }
+  RETURN_NOT_OK(Reserve(length));
+  RETURN_NOT_OK(ReserveData(out_of_line_total));
+  for (int64_t i = 0; i < length; i++) {
+    if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) {
+      if (values[i].IsInline()) {
+        UnsafeAppend(values[i]);
+      } else {
+        UnsafeAppend(values[i].data(), values[i].size());
+      }
+    } else {
+      UnsafeAppendNull();
+    }
+  }
+  return Status::OK();
+}
+
+Status BinaryViewBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  ARROW_ASSIGN_OR_RAISE(auto null_bitmap, null_bitmap_builder_.FinishWithLength(length_));
+  ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
+  BufferVector buffers = {null_bitmap, data};
+  for (auto&& buffer : data_heap_builder_.Finish()) {
+    buffers.push_back(std::move(buffer));
+  }
+  *out = ArrayData::Make(type(), length_, std::move(buffers), null_count_);
+  capacity_ = length_ = null_count_ = 0;
+  Reset();
+  return Status::OK();
+}
+
+Status BinaryViewBuilder::ReserveData(int64_t length) {
+  if (ARROW_PREDICT_FALSE(length > ValueSizeLimit())) {
+    return Status::CapacityError(
+        "BinaryView or StringView elements cannot reference "
+        "strings larger than 4GB");
+  }
+  return data_heap_builder_.Reserve(length);
+}
+
+void BinaryViewBuilder::Reset() {
+  ArrayBuilder::Reset();
+  data_builder_.Reset();
+  data_heap_builder_.Reset();
+}
+
 // ----------------------------------------------------------------------
 // Fixed width binary
 
diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index 25183ca169..c716e6d225 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -459,6 +459,254 @@ class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
   std::shared_ptr<DataType> type() const override { return large_utf8(); }
 };
 
+// ----------------------------------------------------------------------
+// BinaryViewBuilder, StringViewBuilder
+//
+// The builders permit two styles of use: one where appended data is
+// accumulated in a third buffer that is appended to the resulting ArrayData,
+// and one where only the StringHeaders are appended. If you only want to
+// append StringHeaders, then use the Append(const StringHeader&) methods
+
+namespace internal {
+
+// Because we construct StringHeader objects incrementally, resizing buffers is
+// not an option as memory addresses for out-of-line strings will change. Thus,
+// we allocate medium-sized memory chunks and accumulate data in those, which
+// may result in some waste if there are many large-ish strings. If a string
+// comes along that does not fit into a block, we allocate a new block and
+// write into that.
+//
+// Later we can implement optimizations to continue filling underfull blocks
+// after encountering a large string that required allocating a new block.
+class ARROW_EXPORT StringHeapBuilder {
+ public:
+  static constexpr int64_t kDefaultBlocksize = 1 << 20;  // 1MB
+
+  StringHeapBuilder(MemoryPool* pool, int64_t blocksize = kDefaultBlocksize)
+      : pool_(pool), blocksize_(blocksize) {}
+
+  const uint8_t* UnsafeAppend(const uint8_t* data, int64_t num_bytes) {
+    memcpy(current_out_buffer_, data, static_cast<size_t>(num_bytes));
+    const uint8_t* result = current_out_buffer_;
+    current_out_buffer_ += num_bytes;
+    current_remaining_bytes_ -= num_bytes;
+    return result;
+  }
+
+  Result<const uint8_t*> Append(const uint8_t* data, int64_t num_bytes) {
+    if (num_bytes > current_remaining_bytes_) {
+      ARROW_RETURN_NOT_OK(Reserve(num_bytes));
+    }
+    return UnsafeAppend(data, num_bytes);
+  }
+
+  /// \brief Ensure that the indicated number of bytes can be appended via
+  /// UnsafeAppend operations without the need to allocate more memory
+  Status Reserve(int64_t num_bytes) {
+    if (num_bytes > current_remaining_bytes_) {
+      current_remaining_bytes_ =
+          num_bytes > kDefaultBlocksize ? num_bytes : kDefaultBlocksize;
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> new_block,
+                            AllocateBuffer(current_remaining_bytes_, pool_));
+      current_out_buffer_ = new_block->mutable_data();
+      blocks_.emplace_back(std::move(new_block));
+    }
+    return Status::OK();
+  }
+
+  void Reset() {
+    current_out_buffer_ = nullptr;
+    current_remaining_bytes_ = 0;
+    blocks_.clear();
+  }
+
+  int64_t current_remaining_bytes() const { return current_remaining_bytes_; }
+
+  std::vector<std::shared_ptr<Buffer>> Finish() {
+    current_out_buffer_ = nullptr;
+    current_remaining_bytes_ = 0;
+    return std::move(blocks_);
+  }
+
+ private:
+  MemoryPool* pool_;
+  const int64_t blocksize_;
+  std::vector<std::shared_ptr<Buffer>> blocks_;
+
+  uint8_t* current_out_buffer_ = nullptr;
+  int64_t current_remaining_bytes_ = 0;
+};
+
+}  // namespace internal
+
+class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
+ public:
+  using TypeClass = BinaryViewType;
+
+  BinaryViewBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+      : BinaryViewBuilder(pool) {}
+
+  int64_t current_block_bytes_remaining() const {
+    return data_heap_builder_.current_remaining_bytes();
+  }
+
+  Status Append(const uint8_t* value, int64_t length) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    if (length > static_cast<int64_t>(StringHeader::kInlineSize)) {
+      // String is stored out-of-line
+      if (ARROW_PREDICT_FALSE(length > ValueSizeLimit())) {
+        return Status::CapacityError(
+            "BinaryView or StringView elements cannot reference "
+            "strings larger than 4GB");
+      }
+      // Overwrite 'value' since we will use that for the StringHeader value below
+      ARROW_ASSIGN_OR_RAISE(value, data_heap_builder_.Append(value, length));
+    }
+    UnsafeAppend(StringHeader(value, length));
+    UnsafeAppendToBitmap(true);
+    return Status::OK();
+  }
+
+  Status Append(const char* value, int64_t length) {
+    return Append(reinterpret_cast<const uint8_t*>(value), length);
+  }
+
+  Status Append(std::string_view value) {
+    return Append(value.data(), static_cast<int64_t>(value.size()));
+  }
+
+  Status Append(StringHeader value) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppend(value);
+    UnsafeAppendToBitmap(true);
+    return Status::OK();
+  }
+
+  /// \brief Append without checking capacity
+  ///
+  /// Builder should have been presized using Reserve() and ReserveData(),
+  /// respectively, and the value must not be larger than 4GB
+  void UnsafeAppend(const uint8_t* value, int64_t length) {
+    if (length > static_cast<int64_t>(StringHeader::kInlineSize)) {
+      // String is stored out-of-line
+      // Overwrite 'value' since we will use that for the StringHeader value below
+      value = data_heap_builder_.UnsafeAppend(value, length);
+    }
+    UnsafeAppend(StringHeader(value, length));
+    UnsafeAppendToBitmap(true);
+  }
+
+  void UnsafeAppend(const char* value, int64_t length) {
+    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
+  }
+
+  void UnsafeAppend(const std::string& value) {
+    UnsafeAppend(value.c_str(), static_cast<int64_t>(value.size()));
+  }
+
+  void UnsafeAppend(std::string_view value) {
+    UnsafeAppend(value.data(), static_cast<int64_t>(value.size()));
+  }
+
+  void UnsafeAppend(StringHeader value) {
+    data_builder_.UnsafeAppend(value);
+    UnsafeAppendToBitmap(true);
+  }
+
+  /// \brief Ensures there is enough allocated available capacity in the
+  /// out-of-line data heap to append the indicated number of bytes without
+  /// additional allocations
+  Status ReserveData(int64_t length);
+
+  Status AppendNulls(int64_t length) final {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(length, StringHeader());  // zero
+    UnsafeSetNull(length);
+    return Status::OK();
+  }
+
+  /// \brief Append a single null element
+  Status AppendNull() final {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    data_builder_.UnsafeAppend(StringHeader());  // zero
+    UnsafeAppendToBitmap(false);
+    return Status::OK();
+  }
+
+  /// \brief Append an empty element (length-0 inline string)
+  Status AppendEmptyValue() final {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    data_builder_.UnsafeAppend(StringHeader(""));  // zero
+    UnsafeAppendToBitmap(true);
+    return Status::OK();
+  }
+
+  /// \brief Append several empty elements
+  Status AppendEmptyValues(int64_t length) final {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(length, StringHeader(""));
+    UnsafeSetNotNull(length);
+    return Status::OK();
+  }
+
+  void UnsafeAppendNull() {
+    data_builder_.UnsafeAppend(StringHeader());
+    UnsafeAppendToBitmap(false);
+  }
+
+  void UnsafeAppendEmptyValue() {
+    data_builder_.UnsafeAppend(StringHeader(""));
+    UnsafeAppendToBitmap(true);
+  }
+
+  /// \brief Append a sequence of strings in one shot.
+  ///
+  /// \param[in] values a vector of strings
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value
+  /// \return Status
+  Status AppendValues(const std::vector<std::string>& values,
+                      const uint8_t* valid_bytes = NULLPTR);
+
+  /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. Copies
+  /// the underlying out-of-line string memory to avoid memory lifetime issues
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
+                          int64_t length) override;
+
+  void Reset() override;
+
+  Status Resize(int64_t capacity) override {
+    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
+    capacity = std::max(capacity, kMinBuilderCapacity);
+    ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
+    return ArrayBuilder::Resize(capacity);
+  }
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  std::shared_ptr<DataType> type() const override { return binary_view(); }
+
+ protected:
+  explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool())
+      : ArrayBuilder(pool), data_builder_(pool), data_heap_builder_(pool) {}
+
+  static constexpr int64_t ValueSizeLimit() {
+    return std::numeric_limits<uint32_t>::max();
+  }
+
+  TypedBufferBuilder<StringHeader> data_builder_;
+
+  // Accumulates out-of-line data in fixed-size chunks which are then attached
+  // to the resulting ArrayData
+  internal::StringHeapBuilder data_heap_builder_;
+};
+
+class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder {
+ public:
+  using BinaryViewBuilder::BinaryViewBuilder;
+  std::shared_ptr<DataType> type() const override { return utf8_view(); }
+};
+
 // ----------------------------------------------------------------------
 // FixedSizeBinaryBuilder
 
diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc
index 061fb60041..c99a6facee 100644
--- a/cpp/src/arrow/array/builder_dict.cc
+++ b/cpp/src/arrow/array/builder_dict.cc
@@ -193,6 +193,12 @@ Status DictionaryMemoTable::GetOrInsert(const BinaryType*, std::string_view valu
   return impl_->GetOrInsert<BinaryType>(value, out);
 }
 
+Status DictionaryMemoTable::GetOrInsert(const BinaryViewType*, std::string_view value,
+                                        int32_t* out) {
+  // Create BinaryArray dictionary for now
+  return impl_->GetOrInsert<BinaryType>(value, out);
+}
+
 Status DictionaryMemoTable::GetOrInsert(const LargeBinaryType*, std::string_view value,
                                         int32_t* out) {
   return impl_->GetOrInsert<LargeBinaryType>(value, out);
diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h
index cb0aaf3099..0cc82930a1 100644
--- a/cpp/src/arrow/array/builder_dict.h
+++ b/cpp/src/arrow/array/builder_dict.h
@@ -60,6 +60,12 @@ struct DictionaryValue<T, enable_if_base_binary<T>> {
                                 BinaryType, LargeBinaryType>::type;
 };
 
+template <typename T>
+struct DictionaryValue<T, enable_if_binary_view_like<T>> {
+  using type = std::string_view;
+  using PhysicalType = BinaryViewType;
+};
+
 template <typename T>
 struct DictionaryValue<T, enable_if_fixed_size_binary<T>> {
   using type = std::string_view;
@@ -115,6 +121,10 @@ class ARROW_EXPORT DictionaryMemoTable {
   Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out);
   Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out);
 
+  // TODO: Consider working StringHeader throughout the hashing machinery to
+  // benefit from faster comparisons, reduced need to allocate memory
+  Status GetOrInsert(const BinaryViewType*, std::string_view value, int32_t* out);
+
   class DictionaryMemoTableImpl;
   std::unique_ptr<DictionaryMemoTableImpl> impl_;
 };
diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc
index aab734284f..3dd0ccea93 100644
--- a/cpp/src/arrow/array/concatenate.cc
+++ b/cpp/src/arrow/array/concatenate.cc
@@ -227,6 +227,10 @@ class ConcatenateImpl {
     return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]);
   }
 
+  Status Visit(const BinaryViewType&) {
+    return Status::NotImplemented("binary / string view");
+  }
+
   Status Visit(const ListType&) {
     std::vector<Range> value_ranges;
     ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t)));
diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc
index c0cdcab730..ac9d76d469 100644
--- a/cpp/src/arrow/array/util.cc
+++ b/cpp/src/arrow/array/util.cc
@@ -264,6 +264,14 @@ class ArrayDataEndianSwapper {
     return Status::OK();
   }
 
+  // Endian swapping of view arrays is not supported yet: report NotImplemented
+  // for BinaryViewType and every type derived from it (e.g. StringViewType).
+  // Constrained with the same trait as RepeatedArrayFactory's view overload.
+  template <typename T>
+  enable_if_binary_view_like<T, Status> Visit(const T&) {
+    return Status::NotImplemented("Binary / string view");
+  }
+
   Status Visit(const ListType& type) {
     RETURN_NOT_OK(SwapOffsets<int32_t>(1));
     return Status::OK();
@@ -596,6 +604,11 @@ class RepeatedArrayFactory {
     return Status::OK();
   }
 
+  template <typename T>
+  enable_if_binary_view_like<T, Status> Visit(const T&) {
+    return Status::NotImplemented("binary / string view");
+  }
+
   template <typename T>
   enable_if_var_size_list<T, Status> Visit(const T& type) {
     using ScalarType = typename TypeTraits<T>::ScalarType;
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index 56470ac74b..cddb086005 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -47,6 +47,19 @@ struct UTF8DataValidator {
     return Status::NotImplemented("");
   }
 
+  // Checks every StringHeader slot of the array in order; stops at the first bad one.
+  Status Visit(const StringViewType&) {
+    util::InitializeUTF8();
+    const StringHeader* headers = data.GetValues<StringHeader>(1);
+    for (int64_t index = 0; index < data.length; ++index) {
+      const StringHeader& header = headers[index];
+      const auto* bytes = reinterpret_cast<const uint8_t*>(header.data());
+      if (ARROW_PREDICT_TRUE(util::ValidateUTF8(bytes, header.size()))) continue;
+      return Status::Invalid("Invalid UTF8 sequence at string index ", index);
+    }
+    return Status::OK();
+  }
+
   template <typename StringType>
   enable_if_string<StringType, Status> Visit(const StringType&) {
     util::InitializeUTF8();
@@ -247,6 +260,10 @@ struct ValidateArrayImpl {
 
   Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); }
 
+  Status Visit(const BinaryViewType& type) {
+    return Status::NotImplemented("binary / string view");
+  }
+
   Status Visit(const ListType& type) { return ValidateListLike(type); }
 
   Status Visit(const LargeListType& type) { return ValidateListLike(type); }
@@ -716,7 +733,8 @@ Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.d
 
 ARROW_EXPORT
 Status ValidateUTF8(const ArrayData& data) {
-  DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::LARGE_STRING);
+  DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::STRING_VIEW ||
+         data.type->id() == Type::LARGE_STRING);
   UTF8DataValidator validator{data};
   return VisitTypeInline(*data.type, &validator);
 }
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index baadd10cca..8ccc645046 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -259,6 +259,11 @@ class RangeDataEqualsImpl {
   // Also matches StringType
   Status Visit(const BinaryType& type) { return CompareBinary(type); }
 
+  // Also matches StringViewType
+  Status Visit(const BinaryViewType& type) {
+    return Status::NotImplemented("Binary / string view");
+  }
+
   // Also matches LargeStringType
   Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
 
@@ -577,7 +582,7 @@ class TypeEqualsVisitor {
 
   template <typename T>
   enable_if_t<is_null_type<T>::value || is_primitive_ctype<T>::value ||
-                  is_base_binary_type<T>::value,
+                  is_base_binary_type<T>::value || is_binary_view_like_type<T>::value,
               Status>
   Visit(const T&) {
     result_ = true;
@@ -729,6 +734,12 @@ class ScalarEqualsVisitor {
     return Status::OK();
   }
 
+  Status Visit(const BinaryViewScalar& left) {
+    const auto& right = checked_cast<const BinaryViewScalar&>(right_);
+    result_ = left.value == right.value;
+    return Status::OK();
+  }
+
   Status Visit(const Decimal128Scalar& left) {
     const auto& right = checked_cast<const Decimal128Scalar&>(right_);
     result_ = left.value == right.value;
diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc
index b6d3a3d7d8..1ef076fac4 100644
--- a/cpp/src/arrow/ipc/feather.cc
+++ b/cpp/src/arrow/ipc/feather.cc
@@ -536,8 +536,8 @@ struct ArrayWriterV1 {
       is_nested_type<T>::value || is_null_type<T>::value || is_decimal_type<T>::value ||
           std::is_same<DictionaryType, T>::value || is_duration_type<T>::value ||
           is_interval_type<T>::value || is_fixed_size_binary_type<T>::value ||
-          std::is_same<Date64Type, T>::value || std::is_same<Time64Type, T>::value ||
-          std::is_same<ExtensionType, T>::value,
+          is_binary_view_like_type<T>::value || std::is_same<Date64Type, T>::value ||
+          std::is_same<Time64Type, T>::value || std::is_same<ExtensionType, T>::value,
       Status>::type
   Visit(const T& type) {
     return Status::NotImplemented(type.ToString());
diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc
index 2e450b9d46..367b31d5dd 100644
--- a/cpp/src/arrow/ipc/metadata_internal.cc
+++ b/cpp/src/arrow/ipc/metadata_internal.cc
@@ -523,6 +523,16 @@ class FieldToFlatbufferVisitor {
     return Status::OK();
   }
 
+  Status Visit(const BinaryViewType& type) {
+    // BinaryView will be written to IPC as a normal binary array
+    return Visit(BinaryType());
+  }
+
+  Status Visit(const StringViewType& type) {
+    // StringView will be written to IPC as a normal UTF8 string array
+    return Visit(StringType());
+  }
+
   Status Visit(const LargeBinaryType& type) {
     fb_type_ = flatbuf::Type::LargeBinary;
     type_offset_ = flatbuf::CreateLargeBinary(fbb_).Union();
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index a1b17afaaf..843d5917b3 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -348,6 +348,11 @@ class ArrayLoader {
     return LoadBinary<T>(type.id());
   }
 
+  Status Visit(const BinaryViewType& type) {
+    DCHECK(false);
+    return Status::NotImplemented("Reading IPC format to binary view is not supported");
+  }
+
   Status Visit(const FixedSizeBinaryType& type) {
     out_->buffers.resize(2);
     RETURN_NOT_OK(LoadCommon(type.id()));
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index b89604e6fe..d68da651f3 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -388,6 +388,10 @@ class RecordBatchSerializer {
     return Status::OK();
   }
 
+  Status Visit(const BinaryViewArray& array) {
+    return Status::NotImplemented("Binary / string view type");
+  }
+
   Status Visit(const FixedSizeListArray& array) {
     --max_recursion_depth_;
     auto size = array.list_type()->list_size();
diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h
index c01036047c..86a03c82ab 100644
--- a/cpp/src/arrow/json/test_common.h
+++ b/cpp/src/arrow/json/test_common.h
@@ -110,8 +110,7 @@ struct GenerateImpl {
     return OK(writer.Double(val));
   }
 
-  template <typename T>
-  enable_if_base_binary<T, Status> Visit(const T&) {
+  Status GenerateAscii(const DataType&) {
     auto size = std::poisson_distribution<>{4}(e);
     std::uniform_int_distribution<uint16_t> gen_char(32, 126);  // FIXME generate UTF8
     std::string s(size, '\0');
@@ -119,6 +118,13 @@ struct GenerateImpl {
     return OK(writer.String(s.c_str()));
   }
 
+  template <typename T>
+  enable_if_base_binary<T, Status> Visit(const T& t) {
+    return GenerateAscii(t);
+  }
+
+  Status Visit(const BinaryViewType& t) { return GenerateAscii(t); }
+
   template <typename T>
   enable_if_list_like<T, Status> Visit(const T& t) {
     auto size = std::poisson_distribution<>{4}(e);
diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc
index 0ca08d7a82..d139845bd7 100644
--- a/cpp/src/arrow/scalar.cc
+++ b/cpp/src/arrow/scalar.cc
@@ -70,6 +70,12 @@ struct ScalarHashImpl {
 
   Status Visit(const BaseBinaryScalar& s) { return BufferHash(*s.value); }
 
+  Status Visit(const BinaryViewScalar& s) {
+    const StringHeader& v = s.value;
+    hash_ ^= internal::ComputeStringHash<1>(v.data(), v.size());
+    return Status::OK();
+  }
+
   template <typename T>
   Status Visit(const TemporalScalar<T>& s) {
     return ValueHash(s);
@@ -226,6 +232,14 @@ struct ScalarValidateImpl {
 
   Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
 
+  Status Visit(const BinaryViewScalar& s) {
+    return Status::NotImplemented("Binary view");
+  }
+
+  Status Visit(const StringViewScalar& s) {
+    return Status::NotImplemented("String view");
+  }
+
   Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
 
   template <typename ScalarType>
diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h
index cf852dff36..9b7f604132 100644
--- a/cpp/src/arrow/scalar.h
+++ b/cpp/src/arrow/scalar.h
@@ -37,6 +37,7 @@
 #include "arrow/type_traits.h"
 #include "arrow/util/compare.h"
 #include "arrow/util/decimal.h"
+#include "arrow/util/string_header.h"
 #include "arrow/util/visibility.h"
 #include "arrow/visit_type_inline.h"
 
@@ -282,6 +283,34 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar {
   StringScalar() : StringScalar(utf8()) {}
 };
 
+struct ARROW_EXPORT BinaryViewScalar : public internal::PrimitiveScalarBase {
+  using internal::PrimitiveScalarBase::PrimitiveScalarBase;
+  using TypeClass = BinaryViewType;
+
+  explicit BinaryViewScalar(StringHeader value, std::shared_ptr<DataType> type)
+      : internal::PrimitiveScalarBase(std::move(type), true), value(value) {}
+
+  explicit BinaryViewScalar(StringHeader value)
+      : BinaryViewScalar(value, binary_view()) {}
+
+  BinaryViewScalar() : internal::PrimitiveScalarBase(binary_view(), false) {}
+
+  void* mutable_data() override { return reinterpret_cast<void*>(&this->value); }
+
+  std::string_view view() const override { return std::string_view(this->value); }
+
+  StringHeader value;
+};
+
+struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar {
+  using TypeClass = StringViewType;
+
+  explicit StringViewScalar(StringHeader value)
+      : BinaryViewScalar(std::move(value), utf8_view()) {}
+
+  StringViewScalar() : BinaryViewScalar(utf8_view()) {}
+};
+
 struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar {
   using BaseBinaryScalar::BaseBinaryScalar;
   using TypeClass = LargeBinaryType;
diff --git a/cpp/src/arrow/testing/json_internal.cc b/cpp/src/arrow/testing/json_internal.cc
index c1d45aa2e0..a296e0fba7 100644
--- a/cpp/src/arrow/testing/json_internal.cc
+++ b/cpp/src/arrow/testing/json_internal.cc
@@ -227,8 +227,8 @@ class SchemaWriter {
 
   template <typename T>
   enable_if_t<is_null_type<T>::value || is_primitive_ctype<T>::value ||
-              is_base_binary_type<T>::value || is_base_list_type<T>::value ||
-              is_struct_type<T>::value>
+              is_base_binary_type<T>::value || is_binary_view_like_type<T>::value ||
+              is_base_list_type<T>::value || is_struct_type<T>::value>
   WriteTypeMetadata(const T& type) {}
 
   void WriteTypeMetadata(const MapType& type) {
@@ -386,6 +386,8 @@ class SchemaWriter {
   Status Visit(const TimeType& type) { return WritePrimitive("time", type); }
   Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); }
   Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); }
+  Status Visit(const StringViewType& type) { return WritePrimitive("utf8_view", type); }
+  Status Visit(const BinaryViewType& type) { return WritePrimitive("binary_view", type); }
   Status Visit(const LargeStringType& type) { return WriteVarBytes("largeutf8", type); }
   Status Visit(const LargeBinaryType& type) { return WriteVarBytes("largebinary", type); }
   Status Visit(const FixedSizeBinaryType& type) {
@@ -1320,6 +1322,10 @@ class ArrayReader {
     return FinishBuilder(&builder);
   }
 
+  Status Visit(const BinaryViewType& type) {
+    return Status::NotImplemented("Binary / string view");
+  }
+
   Status Visit(const DayTimeIntervalType& type) {
     DayTimeIntervalBuilder builder(pool_);
 
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index ea9525404c..b976260ccd 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -59,10 +59,14 @@ constexpr Type::type FixedSizeListType::type_id;
 
 constexpr Type::type BinaryType::type_id;
 
+constexpr Type::type BinaryViewType::type_id;
+
 constexpr Type::type LargeBinaryType::type_id;
 
 constexpr Type::type StringType::type_id;
 
+constexpr Type::type StringViewType::type_id;
+
 constexpr Type::type LargeStringType::type_id;
 
 constexpr Type::type FixedSizeBinaryType::type_id;
@@ -188,7 +192,9 @@ std::string ToString(Type::type id) {
     TO_STRING_CASE(INTERVAL_MONTHS)
     TO_STRING_CASE(DURATION)
     TO_STRING_CASE(STRING)
+    TO_STRING_CASE(STRING_VIEW)
     TO_STRING_CASE(BINARY)
+    TO_STRING_CASE(BINARY_VIEW)
     TO_STRING_CASE(LARGE_STRING)
     TO_STRING_CASE(LARGE_BINARY)
     TO_STRING_CASE(FIXED_SIZE_BINARY)
@@ -564,10 +570,14 @@ std::string FixedSizeListType::ToString() const {
 
 std::string BinaryType::ToString() const { return "binary"; }
 
+std::string BinaryViewType::ToString() const { return "binary_view"; }
+
 std::string LargeBinaryType::ToString() const { return "large_binary"; }
 
 std::string StringType::ToString() const { return "string"; }
 
+std::string StringViewType::ToString() const { return "string_view"; }
+
 std::string LargeStringType::ToString() const { return "large_string"; }
 
 int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); }
@@ -2114,8 +2124,10 @@ PARAMETER_LESS_FINGERPRINT(HalfFloat)
 PARAMETER_LESS_FINGERPRINT(Float)
 PARAMETER_LESS_FINGERPRINT(Double)
 PARAMETER_LESS_FINGERPRINT(Binary)
+PARAMETER_LESS_FINGERPRINT(BinaryView)
 PARAMETER_LESS_FINGERPRINT(LargeBinary)
 PARAMETER_LESS_FINGERPRINT(String)
+PARAMETER_LESS_FINGERPRINT(StringView)
 PARAMETER_LESS_FINGERPRINT(LargeString)
 PARAMETER_LESS_FINGERPRINT(Date32)
 PARAMETER_LESS_FINGERPRINT(Date64)
@@ -2283,8 +2295,10 @@ TYPE_FACTORY(float16, HalfFloatType)
 TYPE_FACTORY(float32, FloatType)
 TYPE_FACTORY(float64, DoubleType)
 TYPE_FACTORY(utf8, StringType)
+TYPE_FACTORY(utf8_view, StringViewType)
 TYPE_FACTORY(large_utf8, LargeStringType)
 TYPE_FACTORY(binary, BinaryType)
+TYPE_FACTORY(binary_view, BinaryViewType)
 TYPE_FACTORY(large_binary, LargeBinaryType)
 TYPE_FACTORY(date64, Date64Type)
 TYPE_FACTORY(date32, Date32Type)
@@ -2532,7 +2546,7 @@ void InitStaticData() {
   // * Time32
   // * Time64
   // * Timestamp
-  g_primitive_types = {null(), boolean(), date32(), date64()};
+  g_primitive_types = {null(), boolean(), date32(), date64(), binary_view(), utf8_view()};
   Extend(g_numeric_types, &g_primitive_types);
   Extend(g_base_binary_types, &g_primitive_types);
 }
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 415aaacf1c..f4e082b3f6 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -33,6 +33,7 @@
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/endian.h"
 #include "arrow/util/macros.h"
+#include "arrow/util/string_header.h"
 #include "arrow/util/visibility.h"
 #include "arrow/visitor.h"  // IWYU pragma: keep
 
@@ -686,6 +687,33 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType {
   explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {}
 };
 
+/// \brief Concrete type class for variable-size binary view data; layout() is a
+/// bitmap plus a buffer of fixed-width StringHeader structs
+class ARROW_EXPORT BinaryViewType : public DataType {
+ public:
+  static constexpr Type::type type_id = Type::BINARY_VIEW;
+  static constexpr bool is_utf8 = false;
+  using PhysicalType = BinaryViewType;
+
+  static constexpr const char* type_name() { return "binary_view"; }
+
+  BinaryViewType() : BinaryViewType(Type::BINARY_VIEW) {}
+
+  DataTypeLayout layout() const override {
+    return DataTypeLayout(
+        {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(StringHeader))});
+  }
+
+  std::string ToString() const override;
+  std::string name() const override { return "binary_view"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+
+  // Allow subclasses like StringViewType to change the logical type.
+  explicit BinaryViewType(Type::type logical_type) : DataType(logical_type) {}
+};
+
 /// \brief Concrete type class for large variable-size binary data
 class ARROW_EXPORT LargeBinaryType : public BaseBinaryType {
  public:
@@ -732,6 +760,24 @@ class ARROW_EXPORT StringType : public BinaryType {
   std::string ComputeFingerprint() const override;
 };
 
+/// \brief Concrete type class for variable-size string view data, utf8-encoded
+class ARROW_EXPORT StringViewType : public BinaryViewType {
+ public:
+  static constexpr Type::type type_id = Type::STRING_VIEW;
+  static constexpr bool is_utf8 = true;
+  using PhysicalType = BinaryViewType;
+
+  static constexpr const char* type_name() { return "utf8_view"; }
+
+  StringViewType() : BinaryViewType(Type::STRING_VIEW) {}
+
+  std::string ToString() const override;
+  std::string name() const override { return "utf8_view"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
 /// \brief Concrete type class for large variable-size string data, utf8-encoded
 class ARROW_EXPORT LargeStringType : public LargeBinaryType {
  public:
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
index ba0e635f73..1066d50321 100644
--- a/cpp/src/arrow/type_fwd.h
+++ b/cpp/src/arrow/type_fwd.h
@@ -108,6 +108,11 @@ class BinaryArray;
 class BinaryBuilder;
 struct BinaryScalar;
 
+class BinaryViewType;
+class BinaryViewArray;
+class BinaryViewBuilder;
+struct BinaryViewScalar;
+
 class LargeBinaryType;
 class LargeBinaryArray;
 class LargeBinaryBuilder;
@@ -123,6 +128,11 @@ class StringArray;
 class StringBuilder;
 struct StringScalar;
 
+class StringViewType;
+class StringViewArray;
+class StringViewBuilder;
+struct StringViewScalar;
+
 class LargeStringType;
 class LargeStringArray;
 class LargeStringBuilder;
@@ -405,6 +415,13 @@ struct Type {
     /// Calendar interval type with three fields.
     INTERVAL_MONTH_DAY_NANO,
 
+    /// String (UTF8) view type with 4-byte prefix and inline small string
+    /// optimization
+    STRING_VIEW,
+
+    /// Bytes view type with 4-byte prefix and inline small string optimization
+    BINARY_VIEW,
+
     // Leave this at the end
     MAX_ID
   };
@@ -446,10 +463,14 @@ ARROW_EXPORT const std::shared_ptr<DataType>& float32();
 ARROW_EXPORT const std::shared_ptr<DataType>& float64();
 /// \brief Return a StringType instance
 ARROW_EXPORT const std::shared_ptr<DataType>& utf8();
+/// \brief Return a StringViewType instance
+ARROW_EXPORT const std::shared_ptr<DataType>& utf8_view();
 /// \brief Return a LargeStringType instance
 ARROW_EXPORT const std::shared_ptr<DataType>& large_utf8();
 /// \brief Return a BinaryType instance
 ARROW_EXPORT const std::shared_ptr<DataType>& binary();
+/// \brief Return a BinaryViewType instance
+ARROW_EXPORT const std::shared_ptr<DataType>& binary_view();
 /// \brief Return a LargeBinaryType instance
 ARROW_EXPORT const std::shared_ptr<DataType>& large_binary();
 /// \brief Return a Date32Type instance
diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc
index 954ad63c8a..ad0804be8b 100644
--- a/cpp/src/arrow/type_test.cc
+++ b/cpp/src/arrow/type_test.cc
@@ -1189,9 +1189,21 @@ TEST(TestBinaryType, ToString) {
 TEST(TestStringType, ToString) {
   StringType str;
   ASSERT_EQ(str.id(), Type::STRING);
+  ASSERT_EQ(str.name(), std::string("utf8"));
+  ASSERT_EQ(str.type_name(), std::string("utf8"));
   ASSERT_EQ(str.ToString(), std::string("string"));
 }
 
+TEST(TestBinaryViewType, ToString) {
+  BinaryViewType t1;
+  BinaryViewType e1;
+  StringViewType t2;
+  AssertTypeEqual(t1, e1);
+  AssertTypeNotEqual(t1, t2);
+  ASSERT_EQ(t1.id(), Type::BINARY_VIEW);
+  ASSERT_EQ(t1.ToString(), std::string("binary_view"));
+}
+
 TEST(TestLargeBinaryTypes, ToString) {
   BinaryType bt1;
   LargeBinaryType t1;
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index 5873969066..dcd7c36ba2 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -341,6 +341,16 @@ struct TypeTraits<BinaryType> {
   static inline std::shared_ptr<DataType> type_singleton() { return binary(); }
 };
 
+template <>
+struct TypeTraits<BinaryViewType> {
+  using ArrayType = BinaryViewArray;
+  using BuilderType = BinaryViewBuilder;
+  using ScalarType = BinaryViewScalar;
+  using CType = StringHeader;
+  constexpr static bool is_parameter_free = true;
+  static inline std::shared_ptr<DataType> type_singleton() { return binary_view(); }
+};
+
 template <>
 struct TypeTraits<LargeBinaryType> {
   using ArrayType = LargeBinaryArray;
@@ -371,6 +381,16 @@ struct TypeTraits<StringType> {
   static inline std::shared_ptr<DataType> type_singleton() { return utf8(); }
 };
 
+template <>
+struct TypeTraits<StringViewType> {
+  using ArrayType = StringViewArray;
+  using BuilderType = StringViewBuilder;
+  using ScalarType = StringViewScalar;
+  using CType = StringHeader;
+  constexpr static bool is_parameter_free = true;
+  static inline std::shared_ptr<DataType> type_singleton() { return utf8_view(); }
+};
+
 template <>
 struct TypeTraits<LargeStringType> {
   using ArrayType = LargeStringArray;
@@ -390,6 +410,11 @@ struct CTypeTraits<std::string> : public TypeTraits<StringType> {
   using ArrowType = StringType;
 };
 
+template <>
+struct CTypeTraits<StringHeader> : public TypeTraits<BinaryViewType> {
+  using ArrowType = BinaryViewType;
+};
+
 template <>
 struct CTypeTraits<const char*> : public CTypeTraits<std::string> {};
 
@@ -605,9 +630,28 @@ using is_string_type =
 template <typename T, typename R = void>
 using enable_if_string = enable_if_t<is_string_type<T>::value, R>;
 
+template <typename T>
+using is_binary_view_like_type = std::is_base_of<BinaryViewType, T>;
+
+template <typename T>
+using is_binary_view_type = std::is_same<BinaryViewType, T>;
+
+template <typename T>
+using is_string_view_type = std::is_same<StringViewType, T>;
+
+template <typename T, typename R = void>
+using enable_if_binary_view_like = enable_if_t<is_binary_view_like_type<T>::value, R>;
+
+template <typename T, typename R = void>
+using enable_if_binary_view = enable_if_t<is_binary_view_type<T>::value, R>;
+
+template <typename T, typename R = void>
+using enable_if_string_view = enable_if_t<is_string_view_type<T>::value, R>;
+
 template <typename T>
 using is_string_like_type =
-    std::integral_constant<bool, is_base_binary_type<T>::value && T::is_utf8>;
+    std::integral_constant<bool, (is_base_binary_type<T>::value && T::is_utf8) ||
+                                     is_string_view_type<T>::value>;
 
 template <typename T, typename R = void>
 using enable_if_string_like = enable_if_t<is_string_like_type<T>::value, R>;
@@ -630,10 +674,9 @@ template <typename T, typename R = void>
 using enable_if_fixed_width_type = enable_if_t<is_fixed_width_type<T>::value, R>;
 
 template <typename T>
-using is_binary_like_type =
-    std::integral_constant<bool, (is_base_binary_type<T>::value &&
-                                  !is_string_like_type<T>::value) ||
-                                     is_fixed_size_binary_type<T>::value>;
+using is_binary_like_type = std::integral_constant<
+    bool, (is_base_binary_type<T>::value && !is_string_like_type<T>::value) ||
+              is_binary_view_type<T>::value || is_fixed_size_binary_type<T>::value>;
 
 template <typename T, typename R = void>
 using enable_if_binary_like = enable_if_t<is_binary_like_type<T>::value, R>;
@@ -786,8 +829,10 @@ using enable_if_has_c_type = enable_if_t<has_c_type<T>::value, R>;
 template <typename T>
 using has_string_view =
     std::integral_constant<bool, std::is_same<BinaryType, T>::value ||
-                                     std::is_same<LargeBinaryType, T>::value ||
+                           std::is_same<BinaryViewType, T>::value ||
+                           std::is_same<LargeBinaryType, T>::value ||
                                      std::is_same<StringType, T>::value ||
+                                     std::is_same<StringViewType, T>::value ||
                                      std::is_same<LargeStringType, T>::value ||
                                      std::is_same<FixedSizeBinaryType, T>::value>;
 
diff --git a/cpp/src/arrow/util/string_header.h b/cpp/src/arrow/util/string_header.h
new file mode 100644
index 0000000000..29f378a580
--- /dev/null
+++ b/cpp/src/arrow/util/string_header.h
@@ -0,0 +1,219 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+#include <string>
+#include <string_view>
+
+namespace arrow {
+
+// Variable length string or binary with 4 byte prefix and inline optimization
+// for small values (12 bytes or fewer). This is similar to std::string_view
+// except that the referenced data is limited in size to UINT32_MAX and up to the
+// first four bytes of the string are copied into the struct. The prefix allows
+// failing comparisons early and can reduce the CPU cache working set when
+// dealing with short strings.
+//
+// Short string   |----|----|--------|
+//                 ^    ^      ^
+//                 |    |      |
+//                 size prefix remaining in-line portion
+//
+// Long string    |----|----|--------|
+//                 ^    ^      ^
+//                 |    |      |
+//                 size prefix pointer to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+struct StringHeader {
+ public:
+  using value_type = char;
+
+  static constexpr size_t kPrefixSize = 4;
+  static constexpr size_t kInlineSize = 12;
+
+  StringHeader() {
+    static_assert(sizeof(StringHeader) == 16, "struct expected to be exactly 16 bytes");
+    memset(static_cast<void*>(this), 0, sizeof(StringHeader));
+  }
+
+  explicit StringHeader(uint32_t size) : size_(size) {
+    memset(prefix_, 0, kPrefixSize);
+    value_.data = nullptr;
+  }
+
+  // Views `len` bytes at `data`: values of up to kInlineSize bytes are copied
+  // inline; longer ones keep a prefix and the pointer (data must outlive this).
+  StringHeader(const char* data, size_t len) : size_(static_cast<uint32_t>(len)) {
+    // TODO: better option than assert?
+    assert((data || len == 0) && len <= UINT32_MAX);
+    // Zero prefix and payload first, even for empty values, so that no byte is
+    // left uninitialized and inline strings compare with 2 int64 compares.
+    memset(prefix_, 0, kPrefixSize);
+    memset(&value_, 0, sizeof(value_));
+    if (size_ == 0) return;
+    if (IsInline()) {
+      // small string: fill prefix_ and value_.inlined without overrunning either
+      memcpy(prefix_, data, size_ < kPrefixSize ? size_ : kPrefixSize);
+      if (size_ > kPrefixSize) {
+        memcpy(value_.inlined, data + kPrefixSize, size_ - kPrefixSize);
+      }
+    } else {
+      // large string: store pointer
+      memcpy(prefix_, data, kPrefixSize);
+      value_.data = data;
+    }
+  }
+
+  StringHeader(const uint8_t* data, int64_t len)
+      : StringHeader(reinterpret_cast<const char*>(data), static_cast<size_t>(len)) {}
+
+  // Making StringHeader implicitly constructible/convertible from char* and
+  // string literals, in order to allow for a more flexible API and optional
+  // interoperability. E.g:
+  //
+  //   StringHeader bh = "literal";
+  //   std::optional<StringHeader> obh = "literal";
+  //
+  /* implicit */ StringHeader(const char* data) : StringHeader(data, strlen(data)) {}
+
+  explicit StringHeader(const std::string& value)
+      : StringHeader(value.data(), value.size()) {}
+
+  explicit StringHeader(const std::string_view& value)
+      : StringHeader(value.data(), value.size()) {}
+
+  bool IsInline() const { return IsInline(size_); }
+
+  static constexpr bool IsInline(uint32_t size) { return size <= kInlineSize; }
+
+  const char* data() const { return IsInline() ? prefix_ : value_.data; }
+
+  size_t size() const { return size_; }
+
+  size_t capacity() const { return size_; }
+
+  friend std::ostream& operator<<(std::ostream& os, const StringHeader& header) {
+    os.write(header.data(), header.size());
+    return os;
+  }
+
+  bool operator==(const StringHeader& other) const {
+    // Compare lengths and first 4 characters.
+    if (SizeAndPrefixAsInt64() != other.SizeAndPrefixAsInt64()) {
+      return false;
+    }
+    if (IsInline()) {
+      // The inline part is zeroed at construction, so we can compare
+      // a word at a time if data extends past 'prefix_'.
+      return size_ <= kPrefixSize || InlinedAsInt64() == other.InlinedAsInt64();
+    }
+    // Sizes are equal and this is not inline, therefore both are out
+    // of line and have kPrefixSize first in common.
+    return memcmp(value_.data + kPrefixSize, other.value_.data + kPrefixSize,
+                  size_ - kPrefixSize) == 0;
+  }
+
+  bool operator!=(const StringHeader& other) const { return !(*this == other); }
+
+  // Returns 0, if this == other
+  //       < 0, if this < other
+  //       > 0, if this > other
+  int32_t Compare(const StringHeader& other) const {
+    if (PrefixAsInt() != other.PrefixAsInt()) {
+      // The result is decided on prefix. The shorter will be less
+      // because the prefix is padded with zeros.
+      return memcmp(prefix_, other.prefix_, kPrefixSize);
+    }
+    // Prefixes match: compare the bytes after them, up to the shorter length.
+    // Sizes are unsigned, so compare them directly instead of subtracting.
+    const uint32_t common = size_ < other.size_ ? size_ : other.size_;
+    if (common > kPrefixSize) {
+      const int result = memcmp(data() + kPrefixSize, other.data() + kPrefixSize,
+                                common - kPrefixSize);
+      if (result != 0) return result;
+    }
+    // Equal so far: the shorter value orders first.
+    if (size_ == other.size_) return 0;
+    return size_ < other.size_ ? -1 : 1;
+  }
+
+  bool operator<(const StringHeader& other) const { return Compare(other) < 0; }
+
+  bool operator<=(const StringHeader& other) const { return Compare(other) <= 0; }
+
+  bool operator>(const StringHeader& other) const { return Compare(other) > 0; }
+
+  bool operator>=(const StringHeader& other) const { return Compare(other) >= 0; }
+
+  operator std::string() const { return std::string(data(), size()); }
+
+  std::string GetString() const { return *this; }
+
+  explicit operator std::string_view() const { return std::string_view(data(), size()); }
+
+  const char* begin() const { return data(); }
+
+  const char* end() const { return data() + size(); }
+
+  bool empty() const { return size() == 0; }
+
+ private:
+  inline int64_t SizeAndPrefixAsInt64() const {
+    return reinterpret_cast<const int64_t*>(this)[0];
+  }
+
+  inline int64_t InlinedAsInt64() const {
+    return reinterpret_cast<const int64_t*>(this)[1];
+  }
+
+  int32_t PrefixAsInt() const { return *reinterpret_cast<const int32_t*>(&prefix_); }
+
+  // We rely on all members being laid out top to bottom. C++
+  // guarantees this.
+  uint32_t size_;
+  char prefix_[4];
+  union {
+    char inlined[8];
+    const char* data;
+  } value_;
+};
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc
index d22efc942e..03381a08a7 100644
--- a/cpp/src/arrow/visitor.cc
+++ b/cpp/src/arrow/visitor.cc
@@ -45,8 +45,10 @@ ARRAY_VISITOR_DEFAULT(UInt64Array)
 ARRAY_VISITOR_DEFAULT(HalfFloatArray)
 ARRAY_VISITOR_DEFAULT(FloatArray)
 ARRAY_VISITOR_DEFAULT(DoubleArray)
-ARRAY_VISITOR_DEFAULT(BinaryArray)
 ARRAY_VISITOR_DEFAULT(StringArray)
+ARRAY_VISITOR_DEFAULT(StringViewArray)
+ARRAY_VISITOR_DEFAULT(BinaryArray)
+ARRAY_VISITOR_DEFAULT(BinaryViewArray)
 ARRAY_VISITOR_DEFAULT(LargeBinaryArray)
 ARRAY_VISITOR_DEFAULT(LargeStringArray)
 ARRAY_VISITOR_DEFAULT(FixedSizeBinaryArray)
@@ -95,7 +97,9 @@ TYPE_VISITOR_DEFAULT(HalfFloatType)
 TYPE_VISITOR_DEFAULT(FloatType)
 TYPE_VISITOR_DEFAULT(DoubleType)
 TYPE_VISITOR_DEFAULT(StringType)
+TYPE_VISITOR_DEFAULT(StringViewType)
 TYPE_VISITOR_DEFAULT(BinaryType)
+TYPE_VISITOR_DEFAULT(BinaryViewType)
 TYPE_VISITOR_DEFAULT(LargeStringType)
 TYPE_VISITOR_DEFAULT(LargeBinaryType)
 TYPE_VISITOR_DEFAULT(FixedSizeBinaryType)
@@ -145,7 +149,9 @@ SCALAR_VISITOR_DEFAULT(HalfFloatScalar)
 SCALAR_VISITOR_DEFAULT(FloatScalar)
 SCALAR_VISITOR_DEFAULT(DoubleScalar)
 SCALAR_VISITOR_DEFAULT(StringScalar)
+SCALAR_VISITOR_DEFAULT(StringViewScalar)
 SCALAR_VISITOR_DEFAULT(BinaryScalar)
+SCALAR_VISITOR_DEFAULT(BinaryViewScalar)
 SCALAR_VISITOR_DEFAULT(LargeStringScalar)
 SCALAR_VISITOR_DEFAULT(LargeBinaryScalar)
 SCALAR_VISITOR_DEFAULT(FixedSizeBinaryScalar)
diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h
index 7f83c9ebab..58330de9d0 100644
--- a/cpp/src/arrow/visitor.h
+++ b/cpp/src/arrow/visitor.h
@@ -45,7 +45,9 @@ class ARROW_EXPORT ArrayVisitor {
   virtual Status Visit(const FloatArray& array);
   virtual Status Visit(const DoubleArray& array);
   virtual Status Visit(const StringArray& array);
+  virtual Status Visit(const StringViewArray& array);
   virtual Status Visit(const BinaryArray& array);
+  virtual Status Visit(const BinaryViewArray& array);
   virtual Status Visit(const LargeStringArray& array);
   virtual Status Visit(const LargeBinaryArray& array);
   virtual Status Visit(const FixedSizeBinaryArray& array);
@@ -93,7 +95,9 @@ class ARROW_EXPORT TypeVisitor {
   virtual Status Visit(const FloatType& type);
   virtual Status Visit(const DoubleType& type);
   virtual Status Visit(const StringType& type);
+  virtual Status Visit(const StringViewType& type);
   virtual Status Visit(const BinaryType& type);
+  virtual Status Visit(const BinaryViewType& type);
   virtual Status Visit(const LargeStringType& type);
   virtual Status Visit(const LargeBinaryType& type);
   virtual Status Visit(const FixedSizeBinaryType& type);
@@ -141,7 +145,9 @@ class ARROW_EXPORT ScalarVisitor {
   virtual Status Visit(const FloatScalar& scalar);
   virtual Status Visit(const DoubleScalar& scalar);
   virtual Status Visit(const StringScalar& scalar);
+  virtual Status Visit(const StringViewScalar& scalar);
   virtual Status Visit(const BinaryScalar& scalar);
+  virtual Status Visit(const BinaryViewScalar& scalar);
   virtual Status Visit(const LargeStringScalar& scalar);
   virtual Status Visit(const LargeBinaryScalar& scalar);
   virtual Status Visit(const FixedSizeBinaryScalar& scalar);
diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h
index 265c76197a..2c267576ca 100644
--- a/cpp/src/arrow/visitor_generate.h
+++ b/cpp/src/arrow/visitor_generate.h
@@ -40,7 +40,9 @@ namespace arrow {
   ACTION(Boolean);                              \
   ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \
   ACTION(String);                               \
+  ACTION(StringView);                           \
   ACTION(Binary);                               \
+  ACTION(BinaryView);                           \
   ACTION(LargeString);                          \
   ACTION(LargeBinary);                          \
   ACTION(FixedSizeBinary);                      \
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index f7898c02d4..e62e34abb0 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -129,6 +129,7 @@ struct ValueBufferSlicer {
   NOT_IMPLEMENTED_VISIT(FixedSizeList);
   NOT_IMPLEMENTED_VISIT(Dictionary);
   NOT_IMPLEMENTED_VISIT(Extension);
+  NOT_IMPLEMENTED_VISIT(BinaryView);
 
 #undef NOT_IMPLEMENTED_VISIT
 
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index f3cee6c65e..7e48f09889 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -116,39 +116,21 @@ void BufferCapsule_Destructor(PyObject* capsule) {
 using internal::arrow_traits;
 using internal::npy_traits;
 
-template <typename T>
+template <typename T, typename Enable = void>
 struct WrapBytes {};
 
-template <>
-struct WrapBytes<StringType> {
-  static inline PyObject* Wrap(const char* data, int64_t length) {
-    return PyUnicode_FromStringAndSize(data, length);
-  }
-};
-
-template <>
-struct WrapBytes<LargeStringType> {
+// Specialization selected for string and string-view types: Wrap() builds
+// the Python object with PyUnicode_FromStringAndSize.
+template <typename T>
+struct WrapBytes<T, enable_if_t<is_string_type<T>::value ||
+                                is_string_view_type<T>::value>> {
   static inline PyObject* Wrap(const char* data, int64_t length) {
     return PyUnicode_FromStringAndSize(data, length);
   }
 };
 
-template <>
-struct WrapBytes<BinaryType> {
-  static inline PyObject* Wrap(const char* data, int64_t length) {
-    return PyBytes_FromStringAndSize(data, length);
-  }
-};
-
-template <>
-struct WrapBytes<LargeBinaryType> {
-  static inline PyObject* Wrap(const char* data, int64_t length) {
-    return PyBytes_FromStringAndSize(data, length);
-  }
-};
-
-template <>
-struct WrapBytes<FixedSizeBinaryType> {
+template <typename T>
+struct WrapBytes<T, enable_if_t<is_binary_type<T>::value ||
+                                is_binary_view_type<T>::value ||
+                                is_fixed_size_binary_type<T>::value>> {
   static inline PyObject* Wrap(const char* data, int64_t length) {
     return PyBytes_FromStringAndSize(data, length);
   }
@@ -1026,7 +1008,9 @@ struct ObjectWriterVisitor {
   }
 
   template <typename Type>
-  enable_if_t<is_base_binary_type<Type>::value || is_fixed_size_binary_type<Type>::value,
+  enable_if_t<is_base_binary_type<Type>::value ||
+              is_binary_view_like_type<Type>::value ||
+              is_fixed_size_binary_type<Type>::value,
               Status>
   Visit(const Type& type) {
     auto WrapValue = [](const std::string_view& view, PyObject** out) {
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index 9e7f07ef81..3ffff8cf19 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -479,13 +479,17 @@ class PyValue {
 
   // The binary-like intermediate representation is PyBytesView because it keeps temporary
   // python objects alive (non-contiguous memoryview) and stores whether the original
-  // object was unicode encoded or not, which is used for unicode -> bytes coersion if
+  // object was unicode encoded or not, which is used for unicode -> bytes coercion if
   // there is a non-unicode object observed.
 
   static Status Convert(const BaseBinaryType*, const O&, I obj, PyBytesView& view) {
     return view.ParseString(obj);
   }
 
+  // BinaryViewType values are parsed into the PyBytesView exactly like the
+  // BaseBinaryType overload: by delegating to PyBytesView::ParseString.
+  static Status Convert(const BinaryViewType*, const O&, I obj, PyBytesView& view) {
+    Status parsed = view.ParseString(obj);
+    return parsed;
+  }
+
   static Status Convert(const FixedSizeBinaryType* type, const O&, I obj,
                         PyBytesView& view) {
     ARROW_RETURN_NOT_OK(view.ParseString(obj));
@@ -672,12 +676,9 @@ class PyPrimitiveConverter<T, enable_if_t<std::is_same<T, FixedSizeBinaryType>::
   PyBytesView view_;
 };
 
-template <typename T>
-class PyPrimitiveConverter<T, enable_if_base_binary<T>>
-    : public PrimitiveConverter<T, PyConverter> {
+template <typename T, typename OffsetType>
+class PyBinaryConverter : public PrimitiveConverter<T, PyConverter> {
  public:
-  using OffsetType = typename T::offset_type;
-
   Status Append(PyObject* value) override {
     if (PyValue::IsNull(this->options_, value)) {
       this->primitive_builder_->UnsafeAppendNull();
@@ -701,7 +702,7 @@ class PyPrimitiveConverter<T, enable_if_base_binary<T>>
   Result<std::shared_ptr<Array>> ToArray() override {
     ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter<T, PyConverter>::ToArray()));
     if (observed_binary_) {
-      // if we saw any non-unicode, cast results to BinaryArray
+      // if we saw any non-unicode, cast results to BinaryArray/BinaryViewArray
       auto binary_type = TypeTraits<typename T::PhysicalType>::type_singleton();
       return array->View(binary_type);
     } else {
@@ -714,6 +715,14 @@ class PyPrimitiveConverter<T, enable_if_base_binary<T>>
   bool observed_binary_ = false;
 };
 
+// Converter selected by enable_if_base_binary<T>: a PyBinaryConverter whose
+// OffsetType is the type's own T::offset_type.
+template <typename T>
+class PyPrimitiveConverter<T, enable_if_base_binary<T>>
+    : public PyBinaryConverter<T, typename T::offset_type> {};
+
+// Converter selected by enable_if_binary_view_like<T>: a PyBinaryConverter
+// instantiated with int64_t as its OffsetType.
+template <typename T>
+class PyPrimitiveConverter<T, enable_if_binary_view_like<T>>
+    : public PyBinaryConverter<T, int64_t> {};
+
 template <typename U>
 class PyDictionaryConverter<U, enable_if_has_c_type<U>>
     : public DictionaryConverter<U, PyConverter> {