You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/06/09 16:16:26 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #7356: ARROW-9034: [C++] Implement "BinaryBitBlockCounter", add single-word functions to BitBlockCounter

pitrou commented on a change in pull request #7356:
URL: https://github.com/apache/arrow/pull/7356#discussion_r436753353



##########
File path: cpp/src/arrow/util/bit_block_counter.h
##########
@@ -19,37 +19,108 @@
 
 #include <cstdint>
 
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/ubsan.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
 namespace internal {
 
-/// \brief A class that scans through a true/false bitmap to yield blocks of up
-/// to 256 bits at a time along with their popcount. This is used to accelerate
-/// processing of mostly-not-null array data.
+/// \brief Return value from bit block counters: the total number of bits and
+/// the number of set bits.
+struct BitBlockCount {
+  int16_t length;
+  int16_t popcount;
+};
+
+/// \brief A class that scans through a true/false bitmap to compute popcounts
+/// 64 or 256 bits at a time. This is used to accelerate processing of
+/// mostly-not-null array data.
 class ARROW_EXPORT BitBlockCounter {
  public:
-  struct Block {
-    int16_t length;
-    int16_t popcount;
-  };
-
-  static constexpr int16_t kTargetBlockLength = 256;
-
   BitBlockCounter(const uint8_t* bitmap, int64_t start_offset, int64_t length)
       : bitmap_(bitmap + start_offset / 8),
         bits_remaining_(length),
         offset_(start_offset % 8) {}
 
-  /// \brief Return the next run of available bits, up to 256. The returned
-  /// pair contains the size of run and the number of true values
-  Block NextBlock();
+  /// \brief Return the next run of available bits, usually 256. The returned
+  /// pair contains the size of run and the number of true values. When the
+  /// offset is greater than zero, the last block may be longer than 256.

Review comment:
       "Longer than 256" (or "longer than 64") doesn't sound like a great idea, if the calling code can rely for the length limit to optimize a bit better.

##########
File path: cpp/src/arrow/util/bit_block_counter.h
##########
@@ -19,37 +19,108 @@
 
 #include <cstdint>
 
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/ubsan.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
 namespace internal {
 
-/// \brief A class that scans through a true/false bitmap to yield blocks of up
-/// to 256 bits at a time along with their popcount. This is used to accelerate
-/// processing of mostly-not-null array data.
+/// \brief Return value from bit block counters: the total number of bits and
+/// the number of set bits.
+struct BitBlockCount {
+  int16_t length;
+  int16_t popcount;
+};
+
+/// \brief A class that scans through a true/false bitmap to compute popcounts
+/// 64 or 256 bits at a time. This is used to accelerate processing of
+/// mostly-not-null array data.
 class ARROW_EXPORT BitBlockCounter {
  public:
-  struct Block {
-    int16_t length;
-    int16_t popcount;
-  };
-
-  static constexpr int16_t kTargetBlockLength = 256;
-
   BitBlockCounter(const uint8_t* bitmap, int64_t start_offset, int64_t length)
       : bitmap_(bitmap + start_offset / 8),
         bits_remaining_(length),
         offset_(start_offset % 8) {}
 
-  /// \brief Return the next run of available bits, up to 256. The returned
-  /// pair contains the size of run and the number of true values
-  Block NextBlock();
+  /// \brief Return the next run of available bits, usually 256. The returned
+  /// pair contains the size of run and the number of true values. When the
+  /// offset is greater than zero, the last block may be longer than 256.
+  BitBlockCount NextFourWords();
+
+  /// \brief Return the next run of available bits, usually 64. The returned
+  /// pair contains the size of run and the number of true values. When the
+  /// offset is greater than zero, the last block may be longer than 64.
+  BitBlockCount NextWord();
+
+  /// \brief Inlineable implementation of NextWord

Review comment:
       Is it worth keeping both inlined and non-inlined versions or was is just for the sake of benchmarking this PR?

##########
File path: cpp/src/arrow/util/bit_block_counter.h
##########
@@ -19,37 +19,108 @@
 
 #include <cstdint>
 
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/ubsan.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
 namespace internal {
 
-/// \brief A class that scans through a true/false bitmap to yield blocks of up
-/// to 256 bits at a time along with their popcount. This is used to accelerate
-/// processing of mostly-not-null array data.
+/// \brief Return value from bit block counters: the total number of bits and
+/// the number of set bits.
+struct BitBlockCount {
+  int16_t length;
+  int16_t popcount;
+};
+
+/// \brief A class that scans through a true/false bitmap to compute popcounts
+/// 64 or 256 bits at a time. This is used to accelerate processing of
+/// mostly-not-null array data.
 class ARROW_EXPORT BitBlockCounter {
  public:
-  struct Block {
-    int16_t length;
-    int16_t popcount;
-  };
-
-  static constexpr int16_t kTargetBlockLength = 256;
-
   BitBlockCounter(const uint8_t* bitmap, int64_t start_offset, int64_t length)
       : bitmap_(bitmap + start_offset / 8),
         bits_remaining_(length),
         offset_(start_offset % 8) {}
 
-  /// \brief Return the next run of available bits, up to 256. The returned
-  /// pair contains the size of run and the number of true values
-  Block NextBlock();
+  /// \brief Return the next run of available bits, usually 256. The returned
+  /// pair contains the size of run and the number of true values. When the
+  /// offset is greater than zero, the last block may be longer than 256.
+  BitBlockCount NextFourWords();
+
+  /// \brief Return the next run of available bits, usually 64. The returned
+  /// pair contains the size of run and the number of true values. When the
+  /// offset is greater than zero, the last block may be longer than 64.
+  BitBlockCount NextWord();
+
+  /// \brief Inlineable implementation of NextWord

Review comment:
       Agreed. I suppose most uses will be less trivial than a simple sum.

##########
File path: cpp/src/arrow/util/bit_block_counter.h
##########
@@ -19,37 +19,108 @@
 
 #include <cstdint>
 
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/ubsan.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
 namespace internal {
 
-/// \brief A class that scans through a true/false bitmap to yield blocks of up
-/// to 256 bits at a time along with their popcount. This is used to accelerate
-/// processing of mostly-not-null array data.
+/// \brief Return value from bit block counters: the total number of bits and
+/// the number of set bits.
+struct BitBlockCount {
+  int16_t length;
+  int16_t popcount;
+};
+
+/// \brief A class that scans through a true/false bitmap to compute popcounts
+/// 64 or 256 bits at a time. This is used to accelerate processing of
+/// mostly-not-null array data.
 class ARROW_EXPORT BitBlockCounter {
  public:
-  struct Block {
-    int16_t length;
-    int16_t popcount;
-  };
-
-  static constexpr int16_t kTargetBlockLength = 256;
-
   BitBlockCounter(const uint8_t* bitmap, int64_t start_offset, int64_t length)
       : bitmap_(bitmap + start_offset / 8),
         bits_remaining_(length),
         offset_(start_offset % 8) {}
 
-  /// \brief Return the next run of available bits, up to 256. The returned
-  /// pair contains the size of run and the number of true values
-  Block NextBlock();
+  /// \brief Return the next run of available bits, usually 256. The returned
+  /// pair contains the size of run and the number of true values. When the
+  /// offset is greater than zero, the last block may be longer than 256.

Review comment:
       That would sound better to me, yes.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org