You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/03/02 00:50:27 UTC

[GitHub] [arrow] save-buffer commented on a change in pull request #12537: ARROW-3998: [C++] Add TPC-H Generator

save-buffer commented on a change in pull request #12537:
URL: https://github.com/apache/arrow/pull/12537#discussion_r817259916



##########
File path: cpp/src/arrow/compute/exec/tpch_node.cc
##########
@@ -0,0 +1,3738 @@
+#include "arrow/compute/exec/tpch_node.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/future.h"
+#include "arrow/util/unreachable.h"
+
+#include <algorithm>
+#include <bitset>
+#include <cstring>
+#include <random>
+#include <vector>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <unordered_set>
+
+namespace arrow
+{
+    using internal::checked_cast;
+
+    namespace compute
+    {
+        class TpchText // Text/comment generator; only declarations are visible here, definitions live elsewhere in the file.
+        {
+        public:
+            Status Init(); // NOTE(review): presumably fills text_ before first use (definition not shown) — confirm.
+            Result<Datum> GenerateComments( // Yields a Datum on success or an error Status through Result.
+                size_t num_comments, // presumably how many comment strings to produce — confirm against definition
+                size_t min_length, // presumably the lower bound on each comment's length — confirm
+                size_t max_length, // presumably the upper bound on each comment's length — confirm
+                random::pcg32_fast &rng); // caller-supplied generator, taken by mutable reference (separate from rng_)
+
+        private:
+            void GenerateWord(size_t &offset, const char **words, size_t num_choices); // `words` is an array of C strings; presumably `num_choices` is its length — confirm
+            void GenerateNoun(size_t &offset); // All helpers below take `offset` by mutable reference;
+            void GenerateVerb(size_t &offset); // NOTE(review): presumably it is a write position that each
+            void GenerateAdjective(size_t &offset); // call advances — confirm against the definitions.
+            void GenerateAdverb(size_t &offset);
+            void GeneratePreposition(size_t &offset);
+            void GenerateAuxiliary(size_t &offset);
+            void GenerateTerminator(size_t &offset);
+
+            void GenerateNounPhrase(size_t &offset); // Phrase-level helpers; same `offset` convention as above.
+            void GenerateVerbPhrase(size_t &offset);
+            void GeneratePrepositionalPhrase(size_t &offset);
+
+            void GenerateSentence(size_t &offset); // Sentence-level helper; same `offset` convention as above.
+
+            std::unique_ptr<Buffer> text_; // Exclusively owned buffer; NOTE(review): presumably holds the generated text — confirm.
+            random::pcg32_fast rng_; // Generator owned by this object.
+            static constexpr size_t kTextBytes = 300 * 1024 * 1024; // 300 MiB (314,572,800 bytes); presumably the size of text_ — confirm in Init()
+        };
+
+        class TpchTableGenerator // Abstract base for table generators: subclasses implement Init, StartProducing and schema.
+        {
+        public:
+            using OutputBatchCallback = std::function<void(ExecBatch)>; // NOTE(review): <functional> is not among the visible includes — confirm it is not relied on transitively.
+            using FinishedCallback = std::function<void(int64_t)>; // presumably receives the total number of batches — confirm
+            using GenerateFn = std::function<Status(size_t)>; // presumably the argument is a thread index — confirm
+            using ScheduleCallback = std::function<Status(GenerateFn)>; // Accepts a GenerateFn and reports a Status.
+            using AbortCallback = std::function<void()>; // Invoked by Abort() below.
+
+            virtual Status Init( // Pure virtual; implemented by each concrete generator.
+                std::vector<std::string> columns, // taken by value; presumably the column names to produce — confirm
+                int scale_factor, // presumably the TPC-H scale factor — confirm
+                int64_t batch_size) = 0; // presumably rows per output batch — confirm
+
+            virtual Status StartProducing( // Pure virtual; implemented by each concrete generator.
+                size_t num_threads,
+                OutputBatchCallback output_callback,
+                FinishedCallback finished_callback,
+                ScheduleCallback schedule_callback) = 0;
+
+            void Abort(AbortCallback abort_callback) // Marks the generator done and runs abort_callback only if this call made the false->true transition.
+            {
+                bool expected = false;
+                if(done_.compare_exchange_strong(expected, true)) // succeeds only when done_ was still false, so the callback is skipped if done_ was already set
+                {
+                    abort_callback();
+                }
+            }
+
+            virtual std::shared_ptr<Schema> schema() const = 0; // Pure virtual accessor for the generator's Schema.
+
+            virtual ~TpchTableGenerator() = default; // Virtual so derived generators can be destroyed through a base pointer.
+
+        protected:
+            std::atomic<bool> done_ = { false }; // Starts false; set to true by Abort(). NOTE(review): <atomic> is not among the visible includes — confirm.
+            std::atomic<int64_t> batches_generated_ = { 0 }; // Starts at 0; not modified anywhere in the visible part of this class.
+        };
+
+        int GetNumDigits(int64_t x)
+        {
+            // This if statement chain is for MAXIMUM SPEED
+            /*
+              .,
+              .      _,'f----.._
+              |\ ,-'"/  |     ,'
+              |,_  ,--.      /
+              /,-. ,'`.     (_
+              f  o|  o|__     "`-.
+              ,-._.,--'_ `.   _.,-`
+              `"' ___.,'` j,-'
+              `-.__.,--'
+             */
+            // Source: https://stackoverflow.com/questions/1068849/how-do-i-determine-the-number-of-digits-of-an-integer-in-c

Review comment:
       It's actually not really a copy: I added the `DCHECK`, made it work with 64-bit ints, made it return -1 in the unreachable scenario, and used different variable and function names. I was mainly citing it as a source to justify why this is the fastest way.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org