You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2024/01/29 16:27:43 UTC

(arrow) branch main updated: GH-39740: [C++] Fix filter and take kernel for month_day_nano intervals (#39795)

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 800254fb16 GH-39740: [C++] Fix filter and take kernel for month_day_nano intervals (#39795)
800254fb16 is described below

commit 800254fb16f23af57916768124fb90e0050a8335
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Mon Jan 29 17:27:36 2024 +0100

    GH-39740: [C++] Fix filter and take kernel for month_day_nano intervals (#39795)
    
    ### Rationale for this change
    
    The filter and take functions were not correctly supported on month_day_nano intervals.
    
    ### What changes are included in this PR?
    
    * Expand the primitive filter implementation to handle all possible fixed-width primitive types (including fixed-size binary)
    * Expand the take filter implementation to handle all well-known fixed-width primitive types (including month_day_nano, decimal128 and decimal256)
    * Add benchmarks for taking and filtering fixed-size binary
    
    These changes allow for very significant performance improvements filtering and taking fixed-size binary data:
    ```
    ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    Non-regressions: (90)
    ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                                                  benchmark           baseline          contender  change %                                                                                                                                                                                                                                                                   counters
              FilterFixedSizeBinaryFilterNoNulls/524288/0/8      1.716 GiB/sec     33.814 GiB/sec  1870.862      {'family_index': 0, 'per_family_instance_index': 0, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/0/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 2462, 'byte_width': 8.0, 'data null%': 0.0, 'mask null%': 0.0, 'select%': 99.9}
       TakeFixedSizeBinaryRandomIndicesWithNulls/524288/1/8 380.056M items/sec   7.098G items/sec  1767.491                                {'family_index': 3, 'per_family_instance_index': 6, 'run_name': 'TakeFixedSizeBinaryRandomIndicesWithNulls/524288/1/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 505, 'byte_width': 8.0, 'null_percent': 100.0}
              FilterFixedSizeBinaryFilterNoNulls/524288/0/9      1.916 GiB/sec     33.721 GiB/sec  1659.766      {'family_index': 0, 'per_family_instance_index': 1, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/0/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 2750, 'byte_width': 9.0, 'data null%': 0.0, 'mask null%': 0.0, 'select%': 99.9}
              FilterFixedSizeBinaryFilterNoNulls/524288/9/8    917.713 MiB/sec      9.193 GiB/sec   925.719    {'family_index': 0, 'per_family_instance_index': 18, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/9/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1271, 'byte_width': 8.0, 'data null%': 10.0, 'mask null%': 0.0, 'select%': 99.9}
             FilterFixedSizeBinaryFilterNoNulls/524288/12/8      1.004 GiB/sec      9.374 GiB/sec   833.673   {'family_index': 0, 'per_family_instance_index': 24, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/12/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1440, 'byte_width': 8.0, 'data null%': 90.0, 'mask null%': 0.0, 'select%': 99.9}
              FilterFixedSizeBinaryFilterNoNulls/524288/3/8      1.625 GiB/sec     15.009 GiB/sec   823.442      {'family_index': 0, 'per_family_instance_index': 6, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/3/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 2328, 'byte_width': 8.0, 'data null%': 0.1, 'mask null%': 0.0, 'select%': 99.9}
              FilterFixedSizeBinaryFilterNoNulls/524288/9/9   1021.638 MiB/sec      9.126 GiB/sec   814.670    {'family_index': 0, 'per_family_instance_index': 19, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/9/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1428, 'byte_width': 9.0, 'data null%': 10.0, 'mask null%': 0.0, 'select%': 99.9}
              FilterFixedSizeBinaryFilterNoNulls/524288/6/8      1.235 GiB/sec     10.814 GiB/sec   775.869     {'family_index': 0, 'per_family_instance_index': 12, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/6/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1762, 'byte_width': 8.0, 'data null%': 1.0, 'mask null%': 0.0, 'select%': 99.9}
             FilterFixedSizeBinaryFilterNoNulls/524288/12/9      1.123 GiB/sec      9.120 GiB/sec   712.196   {'family_index': 0, 'per_family_instance_index': 25, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/12/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1598, 'byte_width': 9.0, 'data null%': 90.0, 'mask null%': 0.0, 'select%': 99.9}
              FilterFixedSizeBinaryFilterNoNulls/524288/6/9      1.370 GiB/sec     10.499 GiB/sec   666.348     {'family_index': 0, 'per_family_instance_index': 13, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/6/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1958, 'byte_width': 9.0, 'data null%': 1.0, 'mask null%': 0.0, 'select%': 99.9}
              FilterFixedSizeBinaryFilterNoNulls/524288/3/9      1.814 GiB/sec     13.394 GiB/sec   638.343      {'family_index': 0, 'per_family_instance_index': 7, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/3/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 2600, 'byte_width': 9.0, 'data null%': 0.1, 'mask null%': 0.0, 'select%': 99.9}
              FilterFixedSizeBinaryFilterNoNulls/524288/2/8     12.155 GiB/sec     77.799 GiB/sec   540.051      {'family_index': 0, 'per_family_instance_index': 4, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/2/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 17222, 'byte_width': 8.0, 'data null%': 0.0, 'mask null%': 0.0, 'select%': 1.0}
              FilterFixedSizeBinaryFilterNoNulls/524288/2/9     13.507 GiB/sec     84.361 GiB/sec   524.592      {'family_index': 0, 'per_family_instance_index': 5, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/2/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 19469, 'byte_width': 9.0, 'data null%': 0.0, 'mask null%': 0.0, 'select%': 1.0}
             TakeFixedSizeBinaryMonotonicIndices/524288/1/8 194.493M items/sec 732.378M items/sec   276.557                                      {'family_index': 4, 'per_family_instance_index': 6, 'run_name': 'TakeFixedSizeBinaryMonotonicIndices/524288/1/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 259, 'byte_width': 8.0, 'null_percent': 100.0}
         TakeFixedSizeBinaryRandomIndicesNoNulls/524288/1/8 200.981M items/sec 747.628M items/sec   271.989                                  {'family_index': 2, 'per_family_instance_index': 6, 'run_name': 'TakeFixedSizeBinaryRandomIndicesNoNulls/524288/1/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 268, 'byte_width': 8.0, 'null_percent': 100.0}
            FilterFixedSizeBinaryFilterWithNulls/524288/0/8    947.631 MiB/sec      3.318 GiB/sec   258.565    {'family_index': 1, 'per_family_instance_index': 0, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/0/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1329, 'byte_width': 8.0, 'data null%': 0.0, 'mask null%': 5.0, 'select%': 99.9}
            FilterFixedSizeBinaryFilterWithNulls/524288/3/8    911.406 MiB/sec      3.121 GiB/sec   250.677    {'family_index': 1, 'per_family_instance_index': 6, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/3/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1275, 'byte_width': 8.0, 'data null%': 0.1, 'mask null%': 5.0, 'select%': 99.9}
              FilterFixedSizeBinaryFilterNoNulls/524288/1/8      1.045 GiB/sec      3.535 GiB/sec   238.406      {'family_index': 0, 'per_family_instance_index': 2, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/1/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1496, 'byte_width': 8.0, 'data null%': 0.0, 'mask null%': 0.0, 'select%': 50.0}
            FilterFixedSizeBinaryFilterWithNulls/524288/6/8    899.161 MiB/sec      2.915 GiB/sec   232.029   {'family_index': 1, 'per_family_instance_index': 12, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/6/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1260, 'byte_width': 8.0, 'data null%': 1.0, 'mask null%': 5.0, 'select%': 99.9}
            FilterFixedSizeBinaryFilterWithNulls/524288/9/8    829.852 MiB/sec      2.617 GiB/sec   222.914  {'family_index': 1, 'per_family_instance_index': 18, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/9/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1157, 'byte_width': 8.0, 'data null%': 10.0, 'mask null%': 5.0, 'select%': 99.9}
         TakeFixedSizeBinaryRandomIndicesNoNulls/524288/0/8 234.268M items/sec 752.809M items/sec   221.345                                    {'family_index': 2, 'per_family_instance_index': 8, 'run_name': 'TakeFixedSizeBinaryRandomIndicesNoNulls/524288/0/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 312, 'byte_width': 8.0, 'null_percent': 0.0}
              FilterFixedSizeBinaryFilterNoNulls/524288/1/9      1.171 GiB/sec      3.711 GiB/sec   216.957      {'family_index': 0, 'per_family_instance_index': 3, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/1/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1674, 'byte_width': 9.0, 'data null%': 0.0, 'mask null%': 0.0, 'select%': 50.0}
             TakeFixedSizeBinaryMonotonicIndices/524288/0/8 249.393M items/sec 787.274M items/sec   215.676                                        {'family_index': 4, 'per_family_instance_index': 8, 'run_name': 'TakeFixedSizeBinaryMonotonicIndices/524288/0/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 333, 'byte_width': 8.0, 'null_percent': 0.0}
       TakeFixedSizeBinaryRandomIndicesWithNulls/524288/0/8 234.268M items/sec 736.727M items/sec   214.481                                  {'family_index': 3, 'per_family_instance_index': 8, 'run_name': 'TakeFixedSizeBinaryRandomIndicesWithNulls/524288/0/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 313, 'byte_width': 8.0, 'null_percent': 0.0}
          TakeFixedSizeBinaryMonotonicIndices/524288/1000/8 134.852M items/sec 423.748M items/sec   214.231                                     {'family_index': 4, 'per_family_instance_index': 0, 'run_name': 'TakeFixedSizeBinaryMonotonicIndices/524288/1000/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 202, 'byte_width': 8.0, 'null_percent': 0.1}
           FilterFixedSizeBinaryFilterWithNulls/524288/12/8    913.734 MiB/sec      2.599 GiB/sec   191.245 {'family_index': 1, 'per_family_instance_index': 24, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/12/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1292, 'byte_width': 8.0, 'data null%': 90.0, 'mask null%': 5.0, 'select%': 99.9}
      TakeFixedSizeBinaryRandomIndicesNoNulls/524288/1000/8 138.218M items/sec 309.307M items/sec   123.783                                 {'family_index': 2, 'per_family_instance_index': 0, 'run_name': 'TakeFixedSizeBinaryRandomIndicesNoNulls/524288/1000/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 184, 'byte_width': 8.0, 'null_percent': 0.1}
    TakeFixedSizeBinaryRandomIndicesWithNulls/524288/1000/8 132.755M items/sec 293.027M items/sec   120.727                               {'family_index': 3, 'per_family_instance_index': 0, 'run_name': 'TakeFixedSizeBinaryRandomIndicesWithNulls/524288/1000/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 179, 'byte_width': 8.0, 'null_percent': 0.1}
        TakeFixedSizeBinaryRandomIndicesNoNulls/524288/10/8 125.492M items/sec 272.996M items/sec   117.540                                  {'family_index': 2, 'per_family_instance_index': 2, 'run_name': 'TakeFixedSizeBinaryRandomIndicesNoNulls/524288/10/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 174, 'byte_width': 8.0, 'null_percent': 10.0}
            FilterFixedSizeBinaryFilterWithNulls/524288/9/9    926.938 MiB/sec      1.904 GiB/sec   110.379  {'family_index': 1, 'per_family_instance_index': 19, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/9/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1295, 'byte_width': 9.0, 'data null%': 10.0, 'mask null%': 5.0, 'select%': 99.9}
            TakeFixedSizeBinaryMonotonicIndices/524288/10/8 158.754M items/sec 331.106M items/sec   108.565                                      {'family_index': 4, 'per_family_instance_index': 2, 'run_name': 'TakeFixedSizeBinaryMonotonicIndices/524288/10/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 167, 'byte_width': 8.0, 'null_percent': 10.0}
            FilterFixedSizeBinaryFilterWithNulls/524288/0/9      1.031 GiB/sec      2.129 GiB/sec   106.621    {'family_index': 1, 'per_family_instance_index': 1, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/0/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1477, 'byte_width': 9.0, 'data null%': 0.0, 'mask null%': 5.0, 'select%': 99.9}
            FilterFixedSizeBinaryFilterWithNulls/524288/3/9   1020.776 MiB/sec      2.056 GiB/sec   106.293    {'family_index': 1, 'per_family_instance_index': 7, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/3/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1430, 'byte_width': 9.0, 'data null%': 0.1, 'mask null%': 5.0, 'select%': 99.9}
            FilterFixedSizeBinaryFilterWithNulls/524288/4/8    890.785 MiB/sec      1.768 GiB/sec   103.293    {'family_index': 1, 'per_family_instance_index': 8, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/4/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1242, 'byte_width': 8.0, 'data null%': 0.1, 'mask null%': 5.0, 'select%': 50.0}
            FilterFixedSizeBinaryFilterWithNulls/524288/6/9   1005.839 MiB/sec      1.984 GiB/sec   102.023   {'family_index': 1, 'per_family_instance_index': 13, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/6/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1407, 'byte_width': 9.0, 'data null%': 1.0, 'mask null%': 5.0, 'select%': 99.9}
            FilterFixedSizeBinaryFilterWithNulls/524288/1/8    916.810 MiB/sec      1.762 GiB/sec    96.757    {'family_index': 1, 'per_family_instance_index': 2, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/1/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1270, 'byte_width': 8.0, 'data null%': 0.0, 'mask null%': 5.0, 'select%': 50.0}
            FilterFixedSizeBinaryFilterWithNulls/524288/7/8    890.211 MiB/sec      1.694 GiB/sec    94.853   {'family_index': 1, 'per_family_instance_index': 14, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/7/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1235, 'byte_width': 8.0, 'data null%': 1.0, 'mask null%': 5.0, 'select%': 50.0}
         TakeFixedSizeBinaryRandomIndicesNoNulls/524288/2/8  95.788M items/sec 184.004M items/sec    92.095                                   {'family_index': 2, 'per_family_instance_index': 4, 'run_name': 'TakeFixedSizeBinaryRandomIndicesNoNulls/524288/2/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 124, 'byte_width': 8.0, 'null_percent': 50.0}
           FilterFixedSizeBinaryFilterWithNulls/524288/10/8    862.497 MiB/sec      1.616 GiB/sec    91.823 {'family_index': 1, 'per_family_instance_index': 20, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/10/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1200, 'byte_width': 8.0, 'data null%': 10.0, 'mask null%': 5.0, 'select%': 50.0}
           FilterFixedSizeBinaryFilterWithNulls/524288/12/9      1.005 GiB/sec      1.904 GiB/sec    89.431 {'family_index': 1, 'per_family_instance_index': 25, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/12/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1442, 'byte_width': 9.0, 'data null%': 90.0, 'mask null%': 5.0, 'select%': 99.9}
             TakeFixedSizeBinaryMonotonicIndices/524288/2/8 123.065M items/sec 228.755M items/sec    85.881                                       {'family_index': 4, 'per_family_instance_index': 4, 'run_name': 'TakeFixedSizeBinaryMonotonicIndices/524288/2/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 164, 'byte_width': 8.0, 'null_percent': 50.0}
             FilterFixedSizeBinaryFilterNoNulls/524288/10/8    930.637 MiB/sec      1.669 GiB/sec    83.659   {'family_index': 0, 'per_family_instance_index': 20, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/10/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1293, 'byte_width': 8.0, 'data null%': 10.0, 'mask null%': 0.0, 'select%': 50.0}
              FilterFixedSizeBinaryFilterNoNulls/524288/4/8      1.034 GiB/sec      1.871 GiB/sec    81.019      {'family_index': 0, 'per_family_instance_index': 8, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/4/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1482, 'byte_width': 8.0, 'data null%': 0.1, 'mask null%': 0.0, 'select%': 50.0}
              FilterFixedSizeBinaryFilterNoNulls/524288/7/8   1004.789 MiB/sec      1.772 GiB/sec    80.538     {'family_index': 0, 'per_family_instance_index': 14, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/7/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1404, 'byte_width': 8.0, 'data null%': 1.0, 'mask null%': 0.0, 'select%': 50.0}
           FilterFixedSizeBinaryFilterWithNulls/524288/13/8    920.819 MiB/sec      1.616 GiB/sec    79.686 {'family_index': 1, 'per_family_instance_index': 26, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/13/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1285, 'byte_width': 8.0, 'data null%': 90.0, 'mask null%': 5.0, 'select%': 50.0}
             FilterFixedSizeBinaryFilterNoNulls/524288/13/8    974.713 MiB/sec      1.669 GiB/sec    75.388   {'family_index': 0, 'per_family_instance_index': 26, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/13/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1363, 'byte_width': 8.0, 'data null%': 90.0, 'mask null%': 0.0, 'select%': 50.0}
      TakeFixedSizeBinaryRandomIndicesWithNulls/524288/10/8 107.165M items/sec 187.372M items/sec    74.845                                {'family_index': 3, 'per_family_instance_index': 2, 'run_name': 'TakeFixedSizeBinaryRandomIndicesWithNulls/524288/10/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 143, 'byte_width': 8.0, 'null_percent': 10.0}
       TakeFixedSizeBinaryRandomIndicesWithNulls/524288/2/8  72.662M items/sec 114.781M items/sec    57.965                                  {'family_index': 3, 'per_family_instance_index': 4, 'run_name': 'TakeFixedSizeBinaryRandomIndicesWithNulls/524288/2/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 96, 'byte_width': 8.0, 'null_percent': 50.0}
           FilterFixedSizeBinaryFilterWithNulls/524288/10/9    976.180 MiB/sec      1.480 GiB/sec    55.260 {'family_index': 1, 'per_family_instance_index': 21, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/10/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1358, 'byte_width': 9.0, 'data null%': 10.0, 'mask null%': 5.0, 'select%': 50.0}
             FilterFixedSizeBinaryFilterNoNulls/524288/10/9      1.023 GiB/sec      1.581 GiB/sec    54.502   {'family_index': 0, 'per_family_instance_index': 21, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/10/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1466, 'byte_width': 9.0, 'data null%': 10.0, 'mask null%': 0.0, 'select%': 50.0}
            FilterFixedSizeBinaryFilterWithNulls/524288/4/9    992.477 MiB/sec      1.453 GiB/sec    49.957    {'family_index': 1, 'per_family_instance_index': 9, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/4/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1400, 'byte_width': 9.0, 'data null%': 0.1, 'mask null%': 5.0, 'select%': 50.0}
            FilterFixedSizeBinaryFilterWithNulls/524288/7/9    997.679 MiB/sec      1.450 GiB/sec    48.846   {'family_index': 1, 'per_family_instance_index': 15, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/7/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1389, 'byte_width': 9.0, 'data null%': 1.0, 'mask null%': 5.0, 'select%': 50.0}
             FilterFixedSizeBinaryFilterNoNulls/524288/13/9      1.071 GiB/sec      1.581 GiB/sec    47.526   {'family_index': 0, 'per_family_instance_index': 27, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/13/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1538, 'byte_width': 9.0, 'data null%': 90.0, 'mask null%': 0.0, 'select%': 50.0}
           FilterFixedSizeBinaryFilterWithNulls/524288/13/9      1.008 GiB/sec      1.485 GiB/sec    47.328 {'family_index': 1, 'per_family_instance_index': 27, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/13/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1446, 'byte_width': 9.0, 'data null%': 90.0, 'mask null%': 5.0, 'select%': 50.0}
            FilterFixedSizeBinaryFilterWithNulls/524288/1/9      1.003 GiB/sec      1.452 GiB/sec    44.708    {'family_index': 1, 'per_family_instance_index': 3, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/1/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1437, 'byte_width': 9.0, 'data null%': 0.0, 'mask null%': 5.0, 'select%': 50.0}
              FilterFixedSizeBinaryFilterNoNulls/524288/7/9      1.105 GiB/sec      1.568 GiB/sec    41.954     {'family_index': 0, 'per_family_instance_index': 15, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/7/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1587, 'byte_width': 9.0, 'data null%': 1.0, 'mask null%': 0.0, 'select%': 50.0}
              FilterFixedSizeBinaryFilterNoNulls/524288/4/9      1.163 GiB/sec      1.613 GiB/sec    38.639      {'family_index': 0, 'per_family_instance_index': 9, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/4/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1662, 'byte_width': 9.0, 'data null%': 0.1, 'mask null%': 0.0, 'select%': 50.0}
           FilterFixedSizeBinaryFilterWithNulls/524288/14/9      8.884 GiB/sec     12.117 GiB/sec    36.381 {'family_index': 1, 'per_family_instance_index': 29, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/14/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 12508, 'byte_width': 9.0, 'data null%': 90.0, 'mask null%': 5.0, 'select%': 1.0}
           FilterFixedSizeBinaryFilterWithNulls/524288/11/9      8.886 GiB/sec     12.075 GiB/sec    35.892 {'family_index': 1, 'per_family_instance_index': 23, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/11/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 12716, 'byte_width': 9.0, 'data null%': 10.0, 'mask null%': 5.0, 'select%': 1.0}
          TakeFixedSizeBinaryMonotonicIndices/524288/1000/9 134.765M items/sec 182.868M items/sec    35.694                                     {'family_index': 4, 'per_family_instance_index': 1, 'run_name': 'TakeFixedSizeBinaryMonotonicIndices/524288/1000/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 206, 'byte_width': 9.0, 'null_percent': 0.1}
              FilterFixedSizeBinaryFilterNoNulls/524288/5/8     11.393 GiB/sec     15.091 GiB/sec    32.453     {'family_index': 0, 'per_family_instance_index': 10, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/5/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 16510, 'byte_width': 8.0, 'data null%': 0.1, 'mask null%': 0.0, 'select%': 1.0}
              FilterFixedSizeBinaryFilterNoNulls/524288/8/8     11.573 GiB/sec     15.102 GiB/sec    30.496     {'family_index': 0, 'per_family_instance_index': 16, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/8/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 16684, 'byte_width': 8.0, 'data null%': 1.0, 'mask null%': 0.0, 'select%': 1.0}
           FilterFixedSizeBinaryFilterWithNulls/524288/11/8      7.740 GiB/sec     10.059 GiB/sec    29.956 {'family_index': 1, 'per_family_instance_index': 22, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/11/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 10972, 'byte_width': 8.0, 'data null%': 10.0, 'mask null%': 5.0, 'select%': 1.0}
           FilterFixedSizeBinaryFilterWithNulls/524288/14/8      7.733 GiB/sec      9.915 GiB/sec    28.213 {'family_index': 1, 'per_family_instance_index': 28, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/14/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 10991, 'byte_width': 8.0, 'data null%': 90.0, 'mask null%': 5.0, 'select%': 1.0}
            FilterFixedSizeBinaryFilterWithNulls/524288/5/8      7.682 GiB/sec      9.765 GiB/sec    27.109   {'family_index': 1, 'per_family_instance_index': 10, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/5/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 10991, 'byte_width': 8.0, 'data null%': 0.1, 'mask null%': 5.0, 'select%': 1.0}
            FilterFixedSizeBinaryFilterWithNulls/524288/8/9      8.856 GiB/sec     11.180 GiB/sec    26.241   {'family_index': 1, 'per_family_instance_index': 17, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/8/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 12571, 'byte_width': 9.0, 'data null%': 1.0, 'mask null%': 5.0, 'select%': 1.0}
            FilterFixedSizeBinaryFilterWithNulls/524288/8/8      7.735 GiB/sec      9.710 GiB/sec    25.530   {'family_index': 1, 'per_family_instance_index': 16, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/8/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 11069, 'byte_width': 8.0, 'data null%': 1.0, 'mask null%': 5.0, 'select%': 1.0}
            TakeFixedSizeBinaryMonotonicIndices/524288/10/9 128.606M items/sec 160.249M items/sec    24.604                                      {'family_index': 4, 'per_family_instance_index': 3, 'run_name': 'TakeFixedSizeBinaryMonotonicIndices/524288/10/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 209, 'byte_width': 9.0, 'null_percent': 10.0}
             FilterFixedSizeBinaryFilterNoNulls/524288/11/8     12.033 GiB/sec     14.737 GiB/sec    22.478   {'family_index': 0, 'per_family_instance_index': 22, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/11/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 17220, 'byte_width': 8.0, 'data null%': 10.0, 'mask null%': 0.0, 'select%': 1.0}
             FilterFixedSizeBinaryFilterNoNulls/524288/14/8     12.141 GiB/sec     14.761 GiB/sec    21.579   {'family_index': 0, 'per_family_instance_index': 28, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/14/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 17343, 'byte_width': 8.0, 'data null%': 90.0, 'mask null%': 0.0, 'select%': 1.0}
            FilterFixedSizeBinaryFilterWithNulls/524288/5/9      8.825 GiB/sec     10.633 GiB/sec    20.489   {'family_index': 1, 'per_family_instance_index': 11, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/5/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 12543, 'byte_width': 9.0, 'data null%': 0.1, 'mask null%': 5.0, 'select%': 1.0}
            FilterFixedSizeBinaryFilterWithNulls/524288/2/8      8.300 GiB/sec      9.969 GiB/sec    20.117    {'family_index': 1, 'per_family_instance_index': 4, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/2/8', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 11819, 'byte_width': 8.0, 'data null%': 0.0, 'mask null%': 5.0, 'select%': 1.0}
              FilterFixedSizeBinaryFilterNoNulls/524288/5/9     12.954 GiB/sec     15.192 GiB/sec    17.273     {'family_index': 0, 'per_family_instance_index': 11, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/5/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 18572, 'byte_width': 9.0, 'data null%': 0.1, 'mask null%': 0.0, 'select%': 1.0}
              FilterFixedSizeBinaryFilterNoNulls/524288/8/9     13.181 GiB/sec     15.222 GiB/sec    15.490     {'family_index': 0, 'per_family_instance_index': 17, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/8/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 18904, 'byte_width': 9.0, 'data null%': 1.0, 'mask null%': 0.0, 'select%': 1.0}
            FilterFixedSizeBinaryFilterWithNulls/524288/2/9      9.344 GiB/sec     10.632 GiB/sec    13.784    {'family_index': 1, 'per_family_instance_index': 5, 'run_name': 'FilterFixedSizeBinaryFilterWithNulls/524288/2/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 13291, 'byte_width': 9.0, 'data null%': 0.0, 'mask null%': 5.0, 'select%': 1.0}
             FilterFixedSizeBinaryFilterNoNulls/524288/11/9     13.566 GiB/sec     14.894 GiB/sec     9.789   {'family_index': 0, 'per_family_instance_index': 23, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/11/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 19349, 'byte_width': 9.0, 'data null%': 10.0, 'mask null%': 0.0, 'select%': 1.0}
             FilterFixedSizeBinaryFilterNoNulls/524288/14/9     13.603 GiB/sec     14.863 GiB/sec     9.265   {'family_index': 0, 'per_family_instance_index': 29, 'run_name': 'FilterFixedSizeBinaryFilterNoNulls/524288/14/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 19490, 'byte_width': 9.0, 'data null%': 90.0, 'mask null%': 0.0, 'select%': 1.0}
        TakeFixedSizeBinaryRandomIndicesNoNulls/524288/10/9 124.390M items/sec 133.566M items/sec     7.377                                  {'family_index': 2, 'per_family_instance_index': 3, 'run_name': 'TakeFixedSizeBinaryRandomIndicesNoNulls/524288/10/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 164, 'byte_width': 9.0, 'null_percent': 10.0}
             TakeFixedSizeBinaryMonotonicIndices/524288/2/9 116.792M items/sec 124.182M items/sec     6.328                                       {'family_index': 4, 'per_family_instance_index': 5, 'run_name': 'TakeFixedSizeBinaryMonotonicIndices/524288/2/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 161, 'byte_width': 9.0, 'null_percent': 50.0}
      TakeFixedSizeBinaryRandomIndicesNoNulls/524288/1000/9 135.860M items/sec 142.524M items/sec     4.905                                 {'family_index': 2, 'per_family_instance_index': 1, 'run_name': 'TakeFixedSizeBinaryRandomIndicesNoNulls/524288/1000/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 180, 'byte_width': 9.0, 'null_percent': 0.1}
    TakeFixedSizeBinaryRandomIndicesWithNulls/524288/1000/9 131.123M items/sec 137.400M items/sec     4.788                               {'family_index': 3, 'per_family_instance_index': 1, 'run_name': 'TakeFixedSizeBinaryRandomIndicesWithNulls/524288/1000/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 176, 'byte_width': 9.0, 'null_percent': 0.1}
         TakeFixedSizeBinaryRandomIndicesNoNulls/524288/0/9 220.634M items/sec 230.872M items/sec     4.640                                    {'family_index': 2, 'per_family_instance_index': 9, 'run_name': 'TakeFixedSizeBinaryRandomIndicesNoNulls/524288/0/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 295, 'byte_width': 9.0, 'null_percent': 0.0}
         TakeFixedSizeBinaryRandomIndicesNoNulls/524288/2/9  97.425M items/sec 101.477M items/sec     4.159                                   {'family_index': 2, 'per_family_instance_index': 5, 'run_name': 'TakeFixedSizeBinaryRandomIndicesNoNulls/524288/2/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 130, 'byte_width': 9.0, 'null_percent': 50.0}
      TakeFixedSizeBinaryRandomIndicesWithNulls/524288/10/9 104.830M items/sec 108.346M items/sec     3.354                                {'family_index': 3, 'per_family_instance_index': 3, 'run_name': 'TakeFixedSizeBinaryRandomIndicesWithNulls/524288/10/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 100, 'byte_width': 9.0, 'null_percent': 10.0}
       TakeFixedSizeBinaryRandomIndicesWithNulls/524288/1/9 378.858M items/sec 387.322M items/sec     2.234                                {'family_index': 3, 'per_family_instance_index': 7, 'run_name': 'TakeFixedSizeBinaryRandomIndicesWithNulls/524288/1/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 506, 'byte_width': 9.0, 'null_percent': 100.0}
       TakeFixedSizeBinaryRandomIndicesWithNulls/524288/0/9 221.900M items/sec 226.450M items/sec     2.050                                  {'family_index': 3, 'per_family_instance_index': 9, 'run_name': 'TakeFixedSizeBinaryRandomIndicesWithNulls/524288/0/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 295, 'byte_width': 9.0, 'null_percent': 0.0}
             TakeFixedSizeBinaryMonotonicIndices/524288/0/9 248.664M items/sec 253.037M items/sec     1.758                                        {'family_index': 4, 'per_family_instance_index': 9, 'run_name': 'TakeFixedSizeBinaryMonotonicIndices/524288/0/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 332, 'byte_width': 9.0, 'null_percent': 0.0}
         TakeFixedSizeBinaryRandomIndicesNoNulls/524288/1/9 197.730M items/sec 201.173M items/sec     1.741                                  {'family_index': 2, 'per_family_instance_index': 7, 'run_name': 'TakeFixedSizeBinaryRandomIndicesNoNulls/524288/1/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 264, 'byte_width': 9.0, 'null_percent': 100.0}
       TakeFixedSizeBinaryRandomIndicesWithNulls/524288/2/9  73.196M items/sec  74.167M items/sec     1.327                                  {'family_index': 3, 'per_family_instance_index': 5, 'run_name': 'TakeFixedSizeBinaryRandomIndicesWithNulls/524288/2/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 96, 'byte_width': 9.0, 'null_percent': 50.0}
             TakeFixedSizeBinaryMonotonicIndices/524288/1/9 192.545M items/sec 188.138M items/sec    -2.289                                      {'family_index': 4, 'per_family_instance_index': 7, 'run_name': 'TakeFixedSizeBinaryMonotonicIndices/524288/1/9', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 257, 'byte_width': 9.0, 'null_percent': 100.0}
    
    ```
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    No.
    
    * Closes: #39740
    
    Authored-by: Antoine Pitrou <an...@python.org>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 .../compute/kernels/vector_selection_benchmark.cc  |  72 ++++++++-
 .../kernels/vector_selection_filter_internal.cc    | 167 ++++++++++++---------
 .../compute/kernels/vector_selection_internal.cc   |  22 ++-
 .../compute/kernels/vector_selection_internal.h    |   2 +-
 .../kernels/vector_selection_take_internal.cc      |  76 +++++++---
 .../arrow/compute/kernels/vector_selection_test.cc | 150 +++++++++++++-----
 6 files changed, 348 insertions(+), 141 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc
index 25e30e65a3..e65d5dbcab 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc
@@ -128,6 +128,13 @@ struct TakeBenchmark {
     Bench(values);
   }
 
+  void FixedSizeBinary() {
+    const int32_t byte_width = static_cast<int32_t>(state.range(2));
+    auto values = rand.FixedSizeBinary(args.size, byte_width, args.null_proportion);
+    Bench(values);
+    state.counters["byte_width"] = byte_width;
+  }
+
   void String() {
     int32_t string_min_length = 0, string_max_length = 32;
     auto values = std::static_pointer_cast<StringArray>(rand.String(
@@ -149,6 +156,7 @@ struct TakeBenchmark {
     for (auto _ : state) {
       ABORT_NOT_OK(Take(values, indices).status());
     }
+    state.SetItemsProcessed(state.iterations() * values->length());
   }
 };
 
@@ -166,8 +174,7 @@ struct FilterBenchmark {
 
   void Int64() {
     const int64_t array_size = args.size / sizeof(int64_t);
-    auto values = std::static_pointer_cast<NumericArray<Int64Type>>(
-        rand.Int64(array_size, -100, 100, args.values_null_proportion));
+    auto values = rand.Int64(array_size, -100, 100, args.values_null_proportion);
     Bench(values);
   }
 
@@ -181,6 +188,15 @@ struct FilterBenchmark {
     Bench(values);
   }
 
+  void FixedSizeBinary() {
+    const int32_t byte_width = static_cast<int32_t>(state.range(2));
+    const int64_t array_size = args.size / byte_width;
+    auto values =
+        rand.FixedSizeBinary(array_size, byte_width, args.values_null_proportion);
+    Bench(values);
+    state.counters["byte_width"] = byte_width;
+  }
+
   void String() {
     int32_t string_min_length = 0, string_max_length = 32;
     int32_t string_mean_length = (string_max_length + string_min_length) / 2;
@@ -202,6 +218,7 @@ struct FilterBenchmark {
     for (auto _ : state) {
       ABORT_NOT_OK(Filter(values, filter).status());
     }
+    state.SetItemsProcessed(state.iterations() * values->length());
   }
 
   void BenchRecordBatch() {
@@ -236,6 +253,7 @@ struct FilterBenchmark {
     for (auto _ : state) {
       ABORT_NOT_OK(Filter(batch, filter).status());
     }
+    state.SetItemsProcessed(state.iterations() * num_rows);
   }
 };
 
@@ -255,6 +273,14 @@ static void FilterFSLInt64FilterWithNulls(benchmark::State& state) {
   FilterBenchmark(state, true).FSLInt64();
 }
 
+static void FilterFixedSizeBinaryFilterNoNulls(benchmark::State& state) {
+  FilterBenchmark(state, false).FixedSizeBinary();
+}
+
+static void FilterFixedSizeBinaryFilterWithNulls(benchmark::State& state) {
+  FilterBenchmark(state, true).FixedSizeBinary();
+}
+
 static void FilterStringFilterNoNulls(benchmark::State& state) {
   FilterBenchmark(state, false).String();
 }
@@ -283,6 +309,19 @@ static void TakeInt64MonotonicIndices(benchmark::State& state) {
   TakeBenchmark(state, /*indices_with_nulls=*/false, /*monotonic=*/true).Int64();
 }
 
+static void TakeFixedSizeBinaryRandomIndicesNoNulls(benchmark::State& state) {
+  TakeBenchmark(state, false).FixedSizeBinary();
+}
+
+static void TakeFixedSizeBinaryRandomIndicesWithNulls(benchmark::State& state) {
+  TakeBenchmark(state, true).FixedSizeBinary();
+}
+
+static void TakeFixedSizeBinaryMonotonicIndices(benchmark::State& state) {
+  TakeBenchmark(state, /*indices_with_nulls=*/false, /*monotonic=*/true)
+      .FixedSizeBinary();
+}
+
 static void TakeFSLInt64RandomIndicesNoNulls(benchmark::State& state) {
   TakeBenchmark(state, false).FSLInt64();
 }
@@ -315,8 +354,22 @@ void FilterSetArgs(benchmark::internal::Benchmark* bench) {
   }
 }
 
+void FilterFSBSetArgs(benchmark::internal::Benchmark* bench) {
+  for (int64_t size : g_data_sizes) {
+    for (int i = 0; i < static_cast<int>(g_filter_params.size()); ++i) {
+      // FixedSizeBinary of primitive sizes (powers of two up to 32)
+      // have a faster path.
+      for (int32_t byte_width : {8, 9}) {
+        bench->Args({static_cast<ArgsType>(size), i, byte_width});
+      }
+    }
+  }
+}
+
 BENCHMARK(FilterInt64FilterNoNulls)->Apply(FilterSetArgs);
 BENCHMARK(FilterInt64FilterWithNulls)->Apply(FilterSetArgs);
+BENCHMARK(FilterFixedSizeBinaryFilterNoNulls)->Apply(FilterFSBSetArgs);
+BENCHMARK(FilterFixedSizeBinaryFilterWithNulls)->Apply(FilterFSBSetArgs);
 BENCHMARK(FilterFSLInt64FilterNoNulls)->Apply(FilterSetArgs);
 BENCHMARK(FilterFSLInt64FilterWithNulls)->Apply(FilterSetArgs);
 BENCHMARK(FilterStringFilterNoNulls)->Apply(FilterSetArgs);
@@ -340,9 +393,24 @@ void TakeSetArgs(benchmark::internal::Benchmark* bench) {
   }
 }
 
+void TakeFSBSetArgs(benchmark::internal::Benchmark* bench) {
+  for (int64_t size : g_data_sizes) {
+    for (auto nulls : std::vector<ArgsType>({1000, 10, 2, 1, 0})) {
+      // FixedSizeBinary of primitive sizes (powers of two up to 32)
+      // have a faster path.
+      for (int32_t byte_width : {8, 9}) {
+        bench->Args({static_cast<ArgsType>(size), nulls, byte_width});
+      }
+    }
+  }
+}
+
 BENCHMARK(TakeInt64RandomIndicesNoNulls)->Apply(TakeSetArgs);
 BENCHMARK(TakeInt64RandomIndicesWithNulls)->Apply(TakeSetArgs);
 BENCHMARK(TakeInt64MonotonicIndices)->Apply(TakeSetArgs);
+BENCHMARK(TakeFixedSizeBinaryRandomIndicesNoNulls)->Apply(TakeFSBSetArgs);
+BENCHMARK(TakeFixedSizeBinaryRandomIndicesWithNulls)->Apply(TakeFSBSetArgs);
+BENCHMARK(TakeFixedSizeBinaryMonotonicIndices)->Apply(TakeFSBSetArgs);
 BENCHMARK(TakeFSLInt64RandomIndicesNoNulls)->Apply(TakeSetArgs);
 BENCHMARK(TakeFSLInt64RandomIndicesWithNulls)->Apply(TakeSetArgs);
 BENCHMARK(TakeFSLInt64MonotonicIndices)->Apply(TakeSetArgs);
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc
index a25b04ae4f..8825d697fd 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc
@@ -146,36 +146,40 @@ class DropNullCounter {
 
 /// \brief The Filter implementation for primitive (fixed-width) types does not
 /// use the logical Arrow type but rather the physical C type. This way we only
-/// generate one take function for each byte width. We use the same
-/// implementation here for boolean and fixed-byte-size inputs with some
-/// template specialization.
-template <typename ArrowType>
+/// generate one take function for each byte width.
+///
+/// We use compile-time specialization for two variations:
+/// - operating on boolean data (using kIsBoolean = true)
+/// - operating on fixed-width data of arbitrary width (using kByteWidth = -1),
+///   with the actual width only known at runtime
+template <int32_t kByteWidth, bool kIsBoolean = false>
 class PrimitiveFilterImpl {
  public:
-  using T = typename std::conditional<std::is_same<ArrowType, BooleanType>::value,
-                                      uint8_t, typename ArrowType::c_type>::type;
-
   PrimitiveFilterImpl(const ArraySpan& values, const ArraySpan& filter,
                       FilterOptions::NullSelectionBehavior null_selection,
                       ArrayData* out_arr)
-      : values_is_valid_(values.buffers[0].data),
-        values_data_(reinterpret_cast<const T*>(values.buffers[1].data)),
+      : byte_width_(values.type->byte_width()),
+        values_is_valid_(values.buffers[0].data),
+        values_data_(values.buffers[1].data),
         values_null_count_(values.null_count),
         values_offset_(values.offset),
         values_length_(values.length),
         filter_(filter),
         null_selection_(null_selection) {
-    if (values.type->id() != Type::BOOL) {
+    if constexpr (kByteWidth >= 0 && !kIsBoolean) {
+      DCHECK_EQ(kByteWidth, byte_width_);
+    }
+    if constexpr (!kIsBoolean) {
       // No offset applied for boolean because it's a bitmap
-      values_data_ += values.offset;
+      values_data_ += values.offset * byte_width();
     }
 
     if (out_arr->buffers[0] != nullptr) {
       // May be unallocated if neither filter nor values contain nulls
       out_is_valid_ = out_arr->buffers[0]->mutable_data();
     }
-    out_data_ = reinterpret_cast<T*>(out_arr->buffers[1]->mutable_data());
-    out_offset_ = out_arr->offset;
+    out_data_ = out_arr->buffers[1]->mutable_data();
+    DCHECK_EQ(out_arr->offset, 0);
     out_length_ = out_arr->length;
     out_position_ = 0;
   }
@@ -201,14 +205,11 @@ class PrimitiveFilterImpl {
           [&](int64_t position, int64_t segment_length, bool filter_valid) {
             if (filter_valid) {
               CopyBitmap(values_is_valid_, values_offset_ + position, segment_length,
-                         out_is_valid_, out_offset_ + out_position_);
+                         out_is_valid_, out_position_);
               WriteValueSegment(position, segment_length);
             } else {
-              bit_util::SetBitsTo(out_is_valid_, out_offset_ + out_position_,
-                                  segment_length, false);
-              memset(out_data_ + out_offset_ + out_position_, 0,
-                     segment_length * sizeof(T));
-              out_position_ += segment_length;
+              bit_util::SetBitsTo(out_is_valid_, out_position_, segment_length, false);
+              WriteNullSegment(segment_length);
             }
             return true;
           });
@@ -218,7 +219,7 @@ class PrimitiveFilterImpl {
     if (out_is_valid_) {
       // Set all to valid, so only if nulls are produced by EMIT_NULL, we need
       // to set out_is_valid[i] to false.
-      bit_util::SetBitsTo(out_is_valid_, out_offset_, out_length_, true);
+      bit_util::SetBitsTo(out_is_valid_, 0, out_length_, true);
     }
     return VisitPlainxREEFilterOutputSegments(
         filter_, /*filter_may_have_nulls=*/true, null_selection_,
@@ -226,11 +227,8 @@ class PrimitiveFilterImpl {
           if (filter_valid) {
             WriteValueSegment(position, segment_length);
           } else {
-            bit_util::SetBitsTo(out_is_valid_, out_offset_ + out_position_,
-                                segment_length, false);
-            memset(out_data_ + out_offset_ + out_position_, 0,
-                   segment_length * sizeof(T));
-            out_position_ += segment_length;
+            bit_util::SetBitsTo(out_is_valid_, out_position_, segment_length, false);
+            WriteNullSegment(segment_length);
           }
           return true;
         });
@@ -260,13 +258,13 @@ class PrimitiveFilterImpl {
                                                  values_length_);
 
     auto WriteNotNull = [&](int64_t index) {
-      bit_util::SetBit(out_is_valid_, out_offset_ + out_position_);
+      bit_util::SetBit(out_is_valid_, out_position_);
       // Increments out_position_
       WriteValue(index);
     };
 
     auto WriteMaybeNull = [&](int64_t index) {
-      bit_util::SetBitTo(out_is_valid_, out_offset_ + out_position_,
+      bit_util::SetBitTo(out_is_valid_, out_position_,
                          bit_util::GetBit(values_is_valid_, values_offset_ + index));
       // Increments out_position_
       WriteValue(index);
@@ -279,15 +277,14 @@ class PrimitiveFilterImpl {
       BitBlockCount data_block = data_counter.NextWord();
       if (filter_block.AllSet() && data_block.AllSet()) {
         // Fastest path: all values in block are included and not null
-        bit_util::SetBitsTo(out_is_valid_, out_offset_ + out_position_,
-                            filter_block.length, true);
+        bit_util::SetBitsTo(out_is_valid_, out_position_, filter_block.length, true);
         WriteValueSegment(in_position, filter_block.length);
         in_position += filter_block.length;
       } else if (filter_block.AllSet()) {
         // Faster: all values are selected, but some values are null
         // Batch copy bits from values validity bitmap to output validity bitmap
         CopyBitmap(values_is_valid_, values_offset_ + in_position, filter_block.length,
-                   out_is_valid_, out_offset_ + out_position_);
+                   out_is_valid_, out_position_);
         WriteValueSegment(in_position, filter_block.length);
         in_position += filter_block.length;
       } else if (filter_block.NoneSet() && null_selection_ == FilterOptions::DROP) {
@@ -326,7 +323,7 @@ class PrimitiveFilterImpl {
                 WriteNotNull(in_position);
               } else if (!is_valid) {
                 // Filter slot is null, so we have a null in the output
-                bit_util::ClearBit(out_is_valid_, out_offset_ + out_position_);
+                bit_util::ClearBit(out_is_valid_, out_position_);
                 WriteNull();
               }
               ++in_position;
@@ -362,7 +359,7 @@ class PrimitiveFilterImpl {
                 WriteMaybeNull(in_position);
               } else if (!is_valid) {
                 // Filter slot is null, so we have a null in the output
-                bit_util::ClearBit(out_is_valid_, out_offset_ + out_position_);
+                bit_util::ClearBit(out_is_valid_, out_position_);
                 WriteNull();
               }
               ++in_position;
@@ -376,54 +373,72 @@ class PrimitiveFilterImpl {
   // Write the next out_position given the selected in_position for the input
   // data and advance out_position
   void WriteValue(int64_t in_position) {
-    out_data_[out_offset_ + out_position_++] = values_data_[in_position];
+    if constexpr (kIsBoolean) {
+      bit_util::SetBitTo(out_data_, out_position_,
+                         bit_util::GetBit(values_data_, values_offset_ + in_position));
+    } else {
+      memcpy(out_data_ + out_position_ * byte_width(),
+             values_data_ + in_position * byte_width(), byte_width());
+    }
+    ++out_position_;
   }
 
   void WriteValueSegment(int64_t in_start, int64_t length) {
-    std::memcpy(out_data_ + out_position_, values_data_ + in_start, length * sizeof(T));
+    if constexpr (kIsBoolean) {
+      CopyBitmap(values_data_, values_offset_ + in_start, length, out_data_,
+                 out_position_);
+    } else {
+      memcpy(out_data_ + out_position_ * byte_width(),
+             values_data_ + in_start * byte_width(), length * byte_width());
+    }
     out_position_ += length;
   }
 
   void WriteNull() {
-    // Zero the memory
-    out_data_[out_offset_ + out_position_++] = T{};
+    if constexpr (kIsBoolean) {
+      // Zero the bit
+      bit_util::ClearBit(out_data_, out_position_);
+    } else {
+      // Zero the memory
+      memset(out_data_ + out_position_ * byte_width(), 0, byte_width());
+    }
+    ++out_position_;
+  }
+
+  void WriteNullSegment(int64_t length) {
+    if constexpr (kIsBoolean) {
+      // Zero the bits
+      bit_util::SetBitsTo(out_data_, out_position_, length, false);
+    } else {
+      // Zero the memory
+      memset(out_data_ + out_position_ * byte_width(), 0, length * byte_width());
+    }
+    out_position_ += length;
+  }
+
+  constexpr int32_t byte_width() const {
+    if constexpr (kByteWidth >= 0) {
+      return kByteWidth;
+    } else {
+      return byte_width_;
+    }
   }
 
  private:
+  int32_t byte_width_;
   const uint8_t* values_is_valid_;
-  const T* values_data_;
+  const uint8_t* values_data_;
   int64_t values_null_count_;
   int64_t values_offset_;
   int64_t values_length_;
   const ArraySpan& filter_;
   FilterOptions::NullSelectionBehavior null_selection_;
   uint8_t* out_is_valid_ = NULLPTR;
-  T* out_data_;
-  int64_t out_offset_;
+  uint8_t* out_data_;
   int64_t out_length_;
   int64_t out_position_;
 };
 
-template <>
-inline void PrimitiveFilterImpl<BooleanType>::WriteValue(int64_t in_position) {
-  bit_util::SetBitTo(out_data_, out_offset_ + out_position_++,
-                     bit_util::GetBit(values_data_, values_offset_ + in_position));
-}
-
-template <>
-inline void PrimitiveFilterImpl<BooleanType>::WriteValueSegment(int64_t in_start,
-                                                                int64_t length) {
-  CopyBitmap(values_data_, values_offset_ + in_start, length, out_data_,
-             out_offset_ + out_position_);
-  out_position_ += length;
-}
-
-template <>
-inline void PrimitiveFilterImpl<BooleanType>::WriteNull() {
-  // Zero the bit
-  bit_util::ClearBit(out_data_, out_offset_ + out_position_++);
-}
-
 Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
   const ArraySpan& values = batch[0].array;
   const ArraySpan& filter = batch[1].array;
@@ -459,22 +474,32 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult
 
   switch (bit_width) {
     case 1:
-      PrimitiveFilterImpl<BooleanType>(values, filter, null_selection, out_arr).Exec();
+      PrimitiveFilterImpl<1, /*kIsBoolean=*/true>(values, filter, null_selection, out_arr)
+          .Exec();
       break;
     case 8:
-      PrimitiveFilterImpl<UInt8Type>(values, filter, null_selection, out_arr).Exec();
+      PrimitiveFilterImpl<1>(values, filter, null_selection, out_arr).Exec();
       break;
     case 16:
-      PrimitiveFilterImpl<UInt16Type>(values, filter, null_selection, out_arr).Exec();
+      PrimitiveFilterImpl<2>(values, filter, null_selection, out_arr).Exec();
       break;
     case 32:
-      PrimitiveFilterImpl<UInt32Type>(values, filter, null_selection, out_arr).Exec();
+      PrimitiveFilterImpl<4>(values, filter, null_selection, out_arr).Exec();
       break;
     case 64:
-      PrimitiveFilterImpl<UInt64Type>(values, filter, null_selection, out_arr).Exec();
+      PrimitiveFilterImpl<8>(values, filter, null_selection, out_arr).Exec();
+      break;
+    case 128:
+      // For INTERVAL_MONTH_DAY_NANO, DECIMAL128
+      PrimitiveFilterImpl<16>(values, filter, null_selection, out_arr).Exec();
+      break;
+    case 256:
+      // For DECIMAL256
+      PrimitiveFilterImpl<32>(values, filter, null_selection, out_arr).Exec();
       break;
     default:
-      DCHECK(false) << "Invalid values bit width";
+      // Non-specializing on byte width
+      PrimitiveFilterImpl<-1>(values, filter, null_selection, out_arr).Exec();
       break;
   }
   return Status::OK();
@@ -1050,10 +1075,10 @@ void PopulateFilterKernels(std::vector<SelectionKernelData>* out) {
       {InputType(match::Primitive()), plain_filter, PrimitiveFilterExec},
       {InputType(match::BinaryLike()), plain_filter, BinaryFilterExec},
       {InputType(match::LargeBinaryLike()), plain_filter, BinaryFilterExec},
-      {InputType(Type::FIXED_SIZE_BINARY), plain_filter, FSBFilterExec},
       {InputType(null()), plain_filter, NullFilterExec},
-      {InputType(Type::DECIMAL128), plain_filter, FSBFilterExec},
-      {InputType(Type::DECIMAL256), plain_filter, FSBFilterExec},
+      {InputType(Type::FIXED_SIZE_BINARY), plain_filter, PrimitiveFilterExec},
+      {InputType(Type::DECIMAL128), plain_filter, PrimitiveFilterExec},
+      {InputType(Type::DECIMAL256), plain_filter, PrimitiveFilterExec},
       {InputType(Type::DICTIONARY), plain_filter, DictionaryFilterExec},
       {InputType(Type::EXTENSION), plain_filter, ExtensionFilterExec},
       {InputType(Type::LIST), plain_filter, ListFilterExec},
@@ -1068,10 +1093,10 @@ void PopulateFilterKernels(std::vector<SelectionKernelData>* out) {
       {InputType(match::Primitive()), ree_filter, PrimitiveFilterExec},
       {InputType(match::BinaryLike()), ree_filter, BinaryFilterExec},
       {InputType(match::LargeBinaryLike()), ree_filter, BinaryFilterExec},
-      {InputType(Type::FIXED_SIZE_BINARY), ree_filter, FSBFilterExec},
       {InputType(null()), ree_filter, NullFilterExec},
-      {InputType(Type::DECIMAL128), ree_filter, FSBFilterExec},
-      {InputType(Type::DECIMAL256), ree_filter, FSBFilterExec},
+      {InputType(Type::FIXED_SIZE_BINARY), ree_filter, PrimitiveFilterExec},
+      {InputType(Type::DECIMAL128), ree_filter, PrimitiveFilterExec},
+      {InputType(Type::DECIMAL256), ree_filter, PrimitiveFilterExec},
       {InputType(Type::DICTIONARY), ree_filter, DictionaryFilterExec},
       {InputType(Type::EXTENSION), ree_filter, ExtensionFilterExec},
       {InputType(Type::LIST), ree_filter, ListFilterExec},
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc
index 98eb37e9c5..a0fe2808e3 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc
@@ -77,7 +77,8 @@ Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit
   if (bit_width == 1) {
     ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(length));
   } else {
-    ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->Allocate(length * bit_width / 8));
+    ARROW_ASSIGN_OR_RAISE(out->buffers[1],
+                          ctx->Allocate(bit_util::BytesForBits(length * bit_width)));
   }
   return Status::OK();
 }
@@ -899,10 +900,6 @@ Status FilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
 
 }  // namespace
 
-Status FSBFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
-  return FilterExec<FSBSelectionImpl>(ctx, batch, out);
-}
-
 Status ListFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
   return FilterExec<ListSelectionImpl<ListType>>(ctx, batch, out);
 }
@@ -946,7 +943,20 @@ Status LargeVarBinaryTakeExec(KernelContext* ctx, const ExecSpan& batch,
 }
 
 Status FSBTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
-  return TakeExec<FSBSelectionImpl>(ctx, batch, out);
+  const ArraySpan& values = batch[0].array;
+  const auto byte_width = values.type->byte_width();
+  // Use primitive Take implementation (presumably faster) for some byte widths
+  switch (byte_width) {
+    case 1:
+    case 2:
+    case 4:
+    case 8:
+    case 16:
+    case 32:
+      return PrimitiveTakeExec(ctx, batch, out);
+    default:
+      return TakeExec<FSBSelectionImpl>(ctx, batch, out);
+  }
 }
 
 Status ListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.h b/cpp/src/arrow/compute/kernels/vector_selection_internal.h
index b9eba6ea66..95f3e51cd6 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_internal.h
+++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.h
@@ -70,7 +70,6 @@ void VisitPlainxREEFilterOutputSegments(
     FilterOptions::NullSelectionBehavior null_selection,
     const EmitREEFilterSegment& emit_segment);
 
-Status FSBFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status ListFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status LargeListFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status FSLFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
@@ -79,6 +78,7 @@ Status MapFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
 
 Status VarBinaryTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status LargeVarBinaryTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
+Status PrimitiveTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status FSBTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status ListTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status LargeListTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
index 612de8505d..89b3f7d0d3 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
@@ -334,11 +334,15 @@ using TakeState = OptionsWrapper<TakeOptions>;
 /// only generate one take function for each byte width.
 ///
 /// This function assumes that the indices have been boundschecked.
-template <typename IndexCType, typename ValueCType>
+template <typename IndexCType, typename ValueWidthConstant>
 struct PrimitiveTakeImpl {
+  static constexpr int kValueWidth = ValueWidthConstant::value;
+
   static void Exec(const ArraySpan& values, const ArraySpan& indices,
                    ArrayData* out_arr) {
-    const auto* values_data = values.GetValues<ValueCType>(1);
+    DCHECK_EQ(values.type->byte_width(), kValueWidth);
+    const auto* values_data =
+        values.GetValues<uint8_t>(1, 0) + kValueWidth * values.offset;
     const uint8_t* values_is_valid = values.buffers[0].data;
     auto values_offset = values.offset;
 
@@ -346,9 +350,10 @@ struct PrimitiveTakeImpl {
     const uint8_t* indices_is_valid = indices.buffers[0].data;
     auto indices_offset = indices.offset;
 
-    auto out = out_arr->GetMutableValues<ValueCType>(1);
+    auto out = out_arr->GetMutableValues<uint8_t>(1, 0) + kValueWidth * out_arr->offset;
     auto out_is_valid = out_arr->buffers[0]->mutable_data();
     auto out_offset = out_arr->offset;
+    DCHECK_EQ(out_offset, 0);
 
     // If either the values or indices have nulls, we preemptively zero out the
     // out validity bitmap so that we don't have to use ClearBit in each
@@ -357,6 +362,19 @@ struct PrimitiveTakeImpl {
       bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false);
     }
 
+    auto WriteValue = [&](int64_t position) {
+      memcpy(out + position * kValueWidth,
+             values_data + indices_data[position] * kValueWidth, kValueWidth);
+    };
+
+    auto WriteZero = [&](int64_t position) {
+      memset(out + position * kValueWidth, 0, kValueWidth);
+    };
+
+    auto WriteZeroSegment = [&](int64_t position, int64_t length) {
+      memset(out + position * kValueWidth, 0, kValueWidth * length);
+    };
+
     OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset,
                                                 indices.length);
     int64_t position = 0;
@@ -370,7 +388,7 @@ struct PrimitiveTakeImpl {
           // Fastest path: neither values nor index nulls
           bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true);
           for (int64_t i = 0; i < block.length; ++i) {
-            out[position] = values_data[indices_data[position]];
+            WriteValue(position);
             ++position;
           }
         } else if (block.popcount > 0) {
@@ -379,14 +397,14 @@ struct PrimitiveTakeImpl {
             if (bit_util::GetBit(indices_is_valid, indices_offset + position)) {
               // index is not null
               bit_util::SetBit(out_is_valid, out_offset + position);
-              out[position] = values_data[indices_data[position]];
+              WriteValue(position);
             } else {
-              out[position] = ValueCType{};
+              WriteZero(position);
             }
             ++position;
           }
         } else {
-          memset(out + position, 0, sizeof(ValueCType) * block.length);
+          WriteZeroSegment(position, block.length);
           position += block.length;
         }
       } else {
@@ -397,11 +415,11 @@ struct PrimitiveTakeImpl {
             if (bit_util::GetBit(values_is_valid,
                                  values_offset + indices_data[position])) {
               // value is not null
-              out[position] = values_data[indices_data[position]];
+              WriteValue(position);
               bit_util::SetBit(out_is_valid, out_offset + position);
               ++valid_count;
             } else {
-              out[position] = ValueCType{};
+              WriteZero(position);
             }
             ++position;
           }
@@ -414,16 +432,16 @@ struct PrimitiveTakeImpl {
                 bit_util::GetBit(values_is_valid,
                                  values_offset + indices_data[position])) {
               // index is not null && value is not null
-              out[position] = values_data[indices_data[position]];
+              WriteValue(position);
               bit_util::SetBit(out_is_valid, out_offset + position);
               ++valid_count;
             } else {
-              out[position] = ValueCType{};
+              WriteZero(position);
             }
             ++position;
           }
         } else {
-          memset(out + position, 0, sizeof(ValueCType) * block.length);
+          WriteZeroSegment(position, block.length);
           position += block.length;
         }
       }
@@ -554,6 +572,8 @@ void TakeIndexDispatch(const ArraySpan& values, const ArraySpan& indices,
   }
 }
 
+}  // namespace
+
 Status PrimitiveTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
   const ArraySpan& values = batch[0].array;
   const ArraySpan& indices = batch[1].array;
@@ -577,24 +597,40 @@ Status PrimitiveTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
       TakeIndexDispatch<BooleanTakeImpl>(values, indices, out_arr);
       break;
     case 8:
-      TakeIndexDispatch<PrimitiveTakeImpl, int8_t>(values, indices, out_arr);
+      TakeIndexDispatch<PrimitiveTakeImpl, std::integral_constant<int, 1>>(
+          values, indices, out_arr);
       break;
     case 16:
-      TakeIndexDispatch<PrimitiveTakeImpl, int16_t>(values, indices, out_arr);
+      TakeIndexDispatch<PrimitiveTakeImpl, std::integral_constant<int, 2>>(
+          values, indices, out_arr);
       break;
     case 32:
-      TakeIndexDispatch<PrimitiveTakeImpl, int32_t>(values, indices, out_arr);
+      TakeIndexDispatch<PrimitiveTakeImpl, std::integral_constant<int, 4>>(
+          values, indices, out_arr);
       break;
     case 64:
-      TakeIndexDispatch<PrimitiveTakeImpl, int64_t>(values, indices, out_arr);
+      TakeIndexDispatch<PrimitiveTakeImpl, std::integral_constant<int, 8>>(
+          values, indices, out_arr);
       break;
-    default:
-      DCHECK(false) << "Invalid values byte width";
+    case 128:
+      // For INTERVAL_MONTH_DAY_NANO, DECIMAL128
+      TakeIndexDispatch<PrimitiveTakeImpl, std::integral_constant<int, 16>>(
+          values, indices, out_arr);
+      break;
+    case 256:
+      // For DECIMAL256
+      TakeIndexDispatch<PrimitiveTakeImpl, std::integral_constant<int, 32>>(
+          values, indices, out_arr);
       break;
+    default:
+      return Status::NotImplemented("Unsupported primitive type for take: ",
+                                    *values.type);
   }
   return Status::OK();
 }
 
+namespace {
+
 // ----------------------------------------------------------------------
 // Null take
 
@@ -836,8 +872,8 @@ void PopulateTakeKernels(std::vector<SelectionKernelData>* out) {
       {InputType(match::LargeBinaryLike()), take_indices, LargeVarBinaryTakeExec},
       {InputType(Type::FIXED_SIZE_BINARY), take_indices, FSBTakeExec},
       {InputType(null()), take_indices, NullTakeExec},
-      {InputType(Type::DECIMAL128), take_indices, FSBTakeExec},
-      {InputType(Type::DECIMAL256), take_indices, FSBTakeExec},
+      {InputType(Type::DECIMAL128), take_indices, PrimitiveTakeExec},
+      {InputType(Type::DECIMAL256), take_indices, PrimitiveTakeExec},
       {InputType(Type::DICTIONARY), take_indices, DictionaryTake},
       {InputType(Type::EXTENSION), take_indices, ExtensionTake},
       {InputType(Type::LIST), take_indices, ListTakeExec},
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc
index bdf9f5454f..ec94b328ea 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc
@@ -309,6 +309,33 @@ class TestFilterKernel : public ::testing::Test {
     AssertFilter(values_array, ree_filter, expected_array);
   }
 
+  void TestNumericBasics(const std::shared_ptr<DataType>& type) {
+    ARROW_SCOPED_TRACE("type = ", *type);
+    AssertFilter(type, "[]", "[]", "[]");
+
+    AssertFilter(type, "[9]", "[0]", "[]");
+    AssertFilter(type, "[9]", "[1]", "[9]");
+    AssertFilter(type, "[9]", "[null]", "[null]");
+    AssertFilter(type, "[null]", "[0]", "[]");
+    AssertFilter(type, "[null]", "[1]", "[null]");
+    AssertFilter(type, "[null]", "[null]", "[null]");
+
+    AssertFilter(type, "[7, 8, 9]", "[0, 1, 0]", "[8]");
+    AssertFilter(type, "[7, 8, 9]", "[1, 0, 1]", "[7, 9]");
+    AssertFilter(type, "[null, 8, 9]", "[0, 1, 0]", "[8]");
+    AssertFilter(type, "[7, 8, 9]", "[null, 1, 0]", "[null, 8]");
+    AssertFilter(type, "[7, 8, 9]", "[1, null, 1]", "[7, null, 9]");
+
+    AssertFilter(ArrayFromJSON(type, "[7, 8, 9]"),
+                 ArrayFromJSON(boolean(), "[0, 1, 1, 1, 0, 1]")->Slice(3, 3),
+                 ArrayFromJSON(type, "[7, 9]"));
+
+    ASSERT_RAISES(Invalid, Filter(ArrayFromJSON(type, "[7, 8, 9]"),
+                                  ArrayFromJSON(boolean(), "[]"), emit_null_));
+    ASSERT_RAISES(Invalid, Filter(ArrayFromJSON(type, "[7, 8, 9]"),
+                                  ArrayFromJSON(boolean(), "[]"), drop_));
+  }
+
   const FilterOptions emit_null_, drop_;
 };
 
@@ -342,6 +369,33 @@ void ValidateFilter(const std::shared_ptr<Array>& values,
                     /*verbose=*/true);
 }
 
+TEST_F(TestFilterKernel, Temporal) {
+  this->TestNumericBasics(time32(TimeUnit::MILLI));
+  this->TestNumericBasics(time64(TimeUnit::MICRO));
+  this->TestNumericBasics(timestamp(TimeUnit::NANO, "Europe/Paris"));
+  this->TestNumericBasics(duration(TimeUnit::SECOND));
+  this->TestNumericBasics(date32());
+  this->AssertFilter(date64(), "[0, 86400000, null]", "[null, 1, 0]", "[null, 86400000]");
+}
+
+TEST_F(TestFilterKernel, Duration) {
+  for (auto type : DurationTypes()) {
+    this->TestNumericBasics(type);
+  }
+}
+
+TEST_F(TestFilterKernel, Interval) {
+  this->TestNumericBasics(month_interval());
+
+  auto type = day_time_interval();
+  this->AssertFilter(type, "[[1, -600], [2, 3000], null]", "[null, 1, 0]",
+                     "[null, [2, 3000]]");
+  type = month_day_nano_interval();
+  this->AssertFilter(type,
+                     "[[1, -2, 34567890123456789], [2, 3, -34567890123456789], null]",
+                     "[null, 1, 0]", "[null, [2, 3, -34567890123456789]]");
+}
+
 class TestFilterKernelWithNull : public TestFilterKernel {
  protected:
   void AssertFilter(const std::string& values, const std::string& filter,
@@ -401,30 +455,7 @@ class TestFilterKernelWithNumeric : public TestFilterKernel {
 
 TYPED_TEST_SUITE(TestFilterKernelWithNumeric, NumericArrowTypes);
 TYPED_TEST(TestFilterKernelWithNumeric, FilterNumeric) {
-  auto type = this->type_singleton();
-  this->AssertFilter(type, "[]", "[]", "[]");
-
-  this->AssertFilter(type, "[9]", "[0]", "[]");
-  this->AssertFilter(type, "[9]", "[1]", "[9]");
-  this->AssertFilter(type, "[9]", "[null]", "[null]");
-  this->AssertFilter(type, "[null]", "[0]", "[]");
-  this->AssertFilter(type, "[null]", "[1]", "[null]");
-  this->AssertFilter(type, "[null]", "[null]", "[null]");
-
-  this->AssertFilter(type, "[7, 8, 9]", "[0, 1, 0]", "[8]");
-  this->AssertFilter(type, "[7, 8, 9]", "[1, 0, 1]", "[7, 9]");
-  this->AssertFilter(type, "[null, 8, 9]", "[0, 1, 0]", "[8]");
-  this->AssertFilter(type, "[7, 8, 9]", "[null, 1, 0]", "[null, 8]");
-  this->AssertFilter(type, "[7, 8, 9]", "[1, null, 1]", "[7, null, 9]");
-
-  this->AssertFilter(ArrayFromJSON(type, "[7, 8, 9]"),
-                     ArrayFromJSON(boolean(), "[0, 1, 1, 1, 0, 1]")->Slice(3, 3),
-                     ArrayFromJSON(type, "[7, 9]"));
-
-  ASSERT_RAISES(Invalid, Filter(ArrayFromJSON(type, "[7, 8, 9]"),
-                                ArrayFromJSON(boolean(), "[]"), this->emit_null_));
-  ASSERT_RAISES(Invalid, Filter(ArrayFromJSON(type, "[7, 8, 9]"),
-                                ArrayFromJSON(boolean(), "[]"), this->drop_));
+  this->TestNumericBasics(this->type_singleton());
 }
 
 template <typename CType>
@@ -588,7 +619,7 @@ TYPED_TEST(TestFilterKernelWithDecimal, FilterNumeric) {
                                 ArrayFromJSON(boolean(), "[]"), this->drop_));
 }
 
-TEST(TestFilterKernel, NoValidityBitmapButUnknownNullCount) {
+TEST_F(TestFilterKernel, NoValidityBitmapButUnknownNullCount) {
   auto values = ArrayFromJSON(int32(), "[1, 2, 3, 4]");
   auto filter = ArrayFromJSON(boolean(), "[true, true, false, true]");
 
@@ -1136,6 +1167,20 @@ class TestTakeKernel : public ::testing::Test {
     TestNoValidityBitmapButUnknownNullCount(ArrayFromJSON(type, values),
                                             ArrayFromJSON(int16(), indices));
   }
+
+  void TestNumericBasics(const std::shared_ptr<DataType>& type) {
+    ARROW_SCOPED_TRACE("type = ", *type);
+    CheckTake(type, "[7, 8, 9]", "[]", "[]");
+    CheckTake(type, "[7, 8, 9]", "[0, 1, 0]", "[7, 8, 7]");
+    CheckTake(type, "[null, 8, 9]", "[0, 1, 0]", "[null, 8, null]");
+    CheckTake(type, "[7, 8, 9]", "[null, 1, 0]", "[null, 8, 7]");
+    CheckTake(type, "[null, 8, 9]", "[]", "[]");
+    CheckTake(type, "[7, 8, 9]", "[0, 0, 0, 0, 0, 0, 2]", "[7, 7, 7, 7, 7, 7, 9]");
+
+    std::shared_ptr<Array> arr;
+    ASSERT_RAISES(IndexError, TakeJSON(type, "[7, 8, 9]", int8(), "[0, 9, 0]", &arr));
+    ASSERT_RAISES(IndexError, TakeJSON(type, "[7, 8, 9]", int8(), "[0, -1, 0]", &arr));
+  }
 };
 
 template <typename ArrowType>
@@ -1201,6 +1246,34 @@ TEST_F(TestTakeKernel, TakeBoolean) {
                 TakeJSON(boolean(), "[true, false, true]", int8(), "[0, -1, 0]", &arr));
 }
 
+TEST_F(TestTakeKernel, Temporal) {
+  this->TestNumericBasics(time32(TimeUnit::MILLI));
+  this->TestNumericBasics(time64(TimeUnit::MICRO));
+  this->TestNumericBasics(timestamp(TimeUnit::NANO, "Europe/Paris"));
+  this->TestNumericBasics(duration(TimeUnit::SECOND));
+  this->TestNumericBasics(date32());
+  CheckTake(date64(), "[0, 86400000, null]", "[null, 1, 1, 0]",
+            "[null, 86400000, 86400000, 0]");
+}
+
+TEST_F(TestTakeKernel, Duration) {
+  for (auto type : DurationTypes()) {
+    this->TestNumericBasics(type);
+  }
+}
+
+TEST_F(TestTakeKernel, Interval) {
+  this->TestNumericBasics(month_interval());
+
+  auto type = day_time_interval();
+  CheckTake(type, "[[1, -600], [2, 3000], null]", "[0, null, 2, 1]",
+            "[[1, -600], null, null, [2, 3000]]");
+  type = month_day_nano_interval();
+  CheckTake(type, "[[1, -2, 34567890123456789], [2, 3, -34567890123456789], null]",
+            "[0, null, 2, 1]",
+            "[[1, -2, 34567890123456789], null, null, [2, 3, -34567890123456789]]");
+}
+
 template <typename ArrowType>
 class TestTakeKernelWithNumeric : public TestTakeKernelTyped<ArrowType> {
  protected:
@@ -1216,18 +1289,7 @@ class TestTakeKernelWithNumeric : public TestTakeKernelTyped<ArrowType> {
 
 TYPED_TEST_SUITE(TestTakeKernelWithNumeric, NumericArrowTypes);
 TYPED_TEST(TestTakeKernelWithNumeric, TakeNumeric) {
-  this->AssertTake("[7, 8, 9]", "[]", "[]");
-  this->AssertTake("[7, 8, 9]", "[0, 1, 0]", "[7, 8, 7]");
-  this->AssertTake("[null, 8, 9]", "[0, 1, 0]", "[null, 8, null]");
-  this->AssertTake("[7, 8, 9]", "[null, 1, 0]", "[null, 8, 7]");
-  this->AssertTake("[null, 8, 9]", "[]", "[]");
-  this->AssertTake("[7, 8, 9]", "[0, 0, 0, 0, 0, 0, 2]", "[7, 7, 7, 7, 7, 7, 9]");
-
-  std::shared_ptr<Array> arr;
-  ASSERT_RAISES(IndexError,
-                TakeJSON(this->type_singleton(), "[7, 8, 9]", int8(), "[0, 9, 0]", &arr));
-  ASSERT_RAISES(IndexError, TakeJSON(this->type_singleton(), "[7, 8, 9]", int8(),
-                                     "[0, -1, 0]", &arr));
+  this->TestNumericBasics(this->type_singleton());
 }
 
 template <typename TypeClass>
@@ -1816,6 +1878,7 @@ TEST(TestTakeMetaFunction, ArityChecking) {
 template <typename Unused = void>
 struct FilterRandomTest {
   static void Test(const std::shared_ptr<DataType>& type) {
+    ARROW_SCOPED_TRACE("type = ", *type);
     auto rand = random::RandomArrayGenerator(kRandomSeed);
     const int64_t length = static_cast<int64_t>(1ULL << 10);
     for (auto null_probability : {0.0, 0.01, 0.1, 0.999, 1.0}) {
@@ -1856,6 +1919,7 @@ void CheckTakeRandom(const std::shared_ptr<Array>& values, int64_t indices_lengt
 template <typename ValuesType>
 struct TakeRandomTest {
   static void Test(const std::shared_ptr<DataType>& type) {
+    ARROW_SCOPED_TRACE("type = ", *type);
     auto rand = random::RandomArrayGenerator(kRandomSeed);
     const int64_t values_length = 64 * 16 + 1;
     const int64_t indices_length = 64 * 4 + 1;
@@ -1897,8 +1961,10 @@ TEST(TestFilter, RandomString) {
 }
 
 TEST(TestFilter, RandomFixedSizeBinary) {
-  FilterRandomTest<>::Test(fixed_size_binary(0));
-  FilterRandomTest<>::Test(fixed_size_binary(16));
+  // FixedSizeBinary filter is special-cased for some widths
+  for (int32_t width : {0, 1, 16, 32, 35}) {
+    FilterRandomTest<>::Test(fixed_size_binary(width));
+  }
 }
 
 TEST(TestTake, PrimitiveRandom) { TestRandomPrimitiveCTypes<TakeRandomTest>(); }
@@ -1911,8 +1977,10 @@ TEST(TestTake, RandomString) {
 }
 
 TEST(TestTake, RandomFixedSizeBinary) {
-  TakeRandomTest<FixedSizeBinaryType>::Test(fixed_size_binary(0));
-  TakeRandomTest<FixedSizeBinaryType>::Test(fixed_size_binary(16));
+  // FixedSizeBinary take is special-cased for some widths
+  for (int32_t width : {0, 1, 16, 32, 35}) {
+    TakeRandomTest<FixedSizeBinaryType>::Test(fixed_size_binary(width));
+  }
 }
 
 // ----------------------------------------------------------------------