You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by GitBox <gi...@apache.org> on 2021/09/13 04:01:16 UTC

[GitHub] [incubator-doris] zuochunwei opened a new pull request #6625: hll optimize

zuochunwei opened a new pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625


   1. 通过三目表达式替换std::max,std::max很重
   2. 通过数组替代std::set,std::set基于红黑树,遍历会沿着链域走,cache命中性不好
   3. 优化了serialize函数,通过减少分支,提升num_non_zero_registers计算速度,通过_registers的序列化更快
   4. encode_fixedXX_le/decode_fixedXX_le避免使用重的memcpy
   5. 在美团优选测试发现,性能提升较为明显


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] yangzhg commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
yangzhg commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r708967291



##########
File path: be/src/olap/hll.cpp
##########
@@ -363,53 +427,53 @@ int64_t HyperLogLog::estimate_cardinality() const {
         // there are relatively large fluctuations, we fixed the problem refer to redis.
         double bias = 5.9119 * 1.0e-18 * (estimate * estimate * estimate * estimate) -
                       1.4253 * 1.0e-12 * (estimate * estimate * estimate) +
-                      1.2940 * 1.0e-7 * (estimate * estimate) - 5.2921 * 1.0e-3 * estimate +
-                      83.3216;
+                      1.2940 * 1.0e-7 * (estimate * estimate) -
+                      5.2921 * 1.0e-3 * estimate + 83.3216;
         estimate -= estimate * (bias / 100);
     }
     return (int64_t)(estimate + 0.5);
 }
 
 void HllSetResolver::parse() {
     // skip LengthValueType
-    char* pdata = _buf_ref;
+    char*  pdata = _buf_ref;
     _set_type = (HllDataType)pdata[0];
     char* sparse_data = NULL;
     switch (_set_type) {
-    case HLL_DATA_EXPLICIT:
-        // first byte : type
-        // second~five byte : hash values's number
-        // five byte later : hash value
-        _explicit_num = (ExplicitLengthValueType)(pdata[sizeof(SetTypeValueType)]);
-        _explicit_value =
-                (uint64_t*)(pdata + sizeof(SetTypeValueType) + sizeof(ExplicitLengthValueType));
-        break;
-    case HLL_DATA_SPARSE:
-        // first byte : type
-        // second ~(2^HLL_COLUMN_PRECISION)/8 byte : bitmap mark which is not zero
-        // 2^HLL_COLUMN_PRECISION)/8 + 1以后value
-        _sparse_count = (SparseLengthValueType*)(pdata + sizeof(SetTypeValueType));
-        sparse_data = pdata + sizeof(SetTypeValueType) + sizeof(SparseLengthValueType);
-        for (int i = 0; i < *_sparse_count; i++) {
-            SparseIndexType* index = (SparseIndexType*)sparse_data;
-            sparse_data += sizeof(SparseIndexType);
-            SparseValueType* value = (SparseValueType*)sparse_data;
-            _sparse_map[*index] = *value;
-            sparse_data += sizeof(SetTypeValueType);
-        }
-        break;
-    case HLL_DATA_FULL:
-        // first byte : type
-        // second byte later : hll register value
-        _full_value_position = pdata + sizeof(SetTypeValueType);
-        break;
-    default:
-        // HLL_DATA_EMPTY
-        break;
+        case HLL_DATA_EXPLICIT:

Review comment:
       we don't indent with case statement




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] imay commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
imay commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r707145902



##########
File path: be/src/util/coding.h
##########
@@ -27,37 +27,37 @@ inline void encode_fixed8(uint8_t* buf, uint8_t val) {
 
 inline void encode_fixed16_le(uint8_t* buf, uint16_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint16_t*)buf = val;
 #else
     uint16_t res = bswap_16(val);
-    memcpy(buf, &res, sizeof(res));
+    *(uint16_t*)buf = res;
 #endif
 }
 
 inline void encode_fixed32_le(uint8_t* buf, uint32_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint32_t*)buf = val;
 #else
     uint32_t res = bswap_32(val);
-    memcpy(buf, &res, sizeof(res));
+    *(uint32_t*)buf = res;
 #endif
 }
 
 inline void encode_fixed64_le(uint8_t* buf, uint64_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint64_t*)buf = val;
 #else
     uint64_t res = gbswap_64(val);
-    memcpy(buf, &res, sizeof(res));
+    *(uint64_t*)buf = res;
 #endif
 }
 
 inline void encode_fixed128_le(uint8_t* buf, uint128_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint128_t*)buf = val;

Review comment:
       better to use memcpy. If buf address is not aligned, this will cause BE crash.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] github-actions[bot] commented on pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#issuecomment-927416153






-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] zuochunwei commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
zuochunwei commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r708987284



##########
File path: be/src/olap/hll.cpp
##########
@@ -363,53 +427,53 @@ int64_t HyperLogLog::estimate_cardinality() const {
         // there are relatively large fluctuations, we fixed the problem refer to redis.
         double bias = 5.9119 * 1.0e-18 * (estimate * estimate * estimate * estimate) -
                       1.4253 * 1.0e-12 * (estimate * estimate * estimate) +
-                      1.2940 * 1.0e-7 * (estimate * estimate) - 5.2921 * 1.0e-3 * estimate +
-                      83.3216;
+                      1.2940 * 1.0e-7 * (estimate * estimate) -
+                      5.2921 * 1.0e-3 * estimate + 83.3216;
         estimate -= estimate * (bias / 100);
     }
     return (int64_t)(estimate + 0.5);
 }
 
 void HllSetResolver::parse() {
     // skip LengthValueType
-    char* pdata = _buf_ref;
+    char*  pdata = _buf_ref;

Review comment:
       ok




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] zuochunwei commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
zuochunwei commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r708987987



##########
File path: be/src/olap/hll.h
##########
@@ -20,9 +20,8 @@
 
 #include <math.h>
 #include <stdio.h>
-
-#include <map>
 #include <set>
+#include <map>

Review comment:
       maybe forget to delete it

##########
File path: be/src/olap/hll.h
##########
@@ -124,33 +125,36 @@ class HyperLogLog {
 
     // Check if input slice is a valid serialized binary of HyperLogLog.
     // This function only check the encoded type in slice, whose complex
-    // function is O(1).
+    // function is O(1). 
     static bool is_valid(const Slice& slice);
 
     // only for debug
     std::string to_string() {
         switch (_type) {
-        case HLL_DATA_EMPTY:
-            return {};
-        case HLL_DATA_EXPLICIT:
-        case HLL_DATA_SPARSE:
-        case HLL_DATA_FULL: {
-            std::string str{"hash set size: "};
-            str.append(std::to_string(_hash_set.size()));
-            str.append("\ncardinality:\t");
-            str.append(std::to_string(estimate_cardinality()));
-            str.append("\ntype:\t");
-            str.append(std::to_string(_type));
-            return str;
-        }
-        default:
-            return {};
+            case HLL_DATA_EMPTY:

Review comment:
       ok




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] imay commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
imay commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r707273803



##########
File path: be/src/util/coding.h
##########
@@ -27,37 +27,37 @@ inline void encode_fixed8(uint8_t* buf, uint8_t val) {
 
 inline void encode_fixed16_le(uint8_t* buf, uint16_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint16_t*)buf = val;
 #else
     uint16_t res = bswap_16(val);
-    memcpy(buf, &res, sizeof(res));
+    *(uint16_t*)buf = res;
 #endif
 }
 
 inline void encode_fixed32_le(uint8_t* buf, uint32_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint32_t*)buf = val;
 #else
     uint32_t res = bswap_32(val);
-    memcpy(buf, &res, sizeof(res));
+    *(uint32_t*)buf = res;
 #endif
 }
 
 inline void encode_fixed64_le(uint8_t* buf, uint64_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint64_t*)buf = val;
 #else
     uint64_t res = gbswap_64(val);
-    memcpy(buf, &res, sizeof(res));
+    *(uint64_t*)buf = res;
 #endif
 }
 
 inline void encode_fixed128_le(uint8_t* buf, uint128_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint128_t*)buf = val;

Review comment:
       GCC will generate SIMD instruments, which needs address aligment. If address is not alignment, progress will throw illegal instruction exception




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] yangzhg commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
yangzhg commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r708970875



##########
File path: be/src/olap/hll.h
##########
@@ -162,35 +166,60 @@ class HyperLogLog {
     void _convert_explicit_to_register();
 
     // update one hash value into this registers
-    void _update_registers(uint64_t hash_value) {
+    inline void _update_registers(uint64_t hash_value) {
         // Use the lower bits to index into the number of streams and then
         // find the first 1 bit after the index bits.
         int idx = hash_value % HLL_REGISTERS_COUNT;
         hash_value >>= HLL_COLUMN_PRECISION;
         // make sure max first_one_bit is HLL_ZERO_COUNT_BITS + 1
         hash_value |= ((uint64_t)1 << HLL_ZERO_COUNT_BITS);
         uint8_t first_one_bit = __builtin_ctzl(hash_value) + 1;
-        _registers[idx] = std::max((uint8_t)_registers[idx], first_one_bit);
+        _registers[idx] = (_registers[idx] > first_one_bit ? _registers[idx] : first_one_bit);
     }
 
     // absorb other registers into this registers
-    void _merge_registers(const uint8_t* other_registers) {
+    void _merge_registers(const uint8_t* other) {
         for (int i = 0; i < HLL_REGISTERS_COUNT; ++i) {
-            _registers[i] = std::max(_registers[i], other_registers[i]);
+             _registers[i] = (_registers[i] < other[i] ? other[i] : _registers[i]);

Review comment:
       ```suggestion
                _registers[i] = _registers[i] < other[i] ? other[i] : _registers[i];
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] zuochunwei commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
zuochunwei commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r709023209



##########
File path: be/src/util/coding.h
##########
@@ -27,37 +27,37 @@ inline void encode_fixed8(uint8_t* buf, uint8_t val) {
 
 inline void encode_fixed16_le(uint8_t* buf, uint16_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint16_t*)buf = val;
 #else
     uint16_t res = bswap_16(val);
-    memcpy(buf, &res, sizeof(res));
+    *(uint16_t*)buf = res;
 #endif
 }
 
 inline void encode_fixed32_le(uint8_t* buf, uint32_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint32_t*)buf = val;
 #else
     uint32_t res = bswap_32(val);
-    memcpy(buf, &res, sizeof(res));
+    *(uint32_t*)buf = res;
 #endif
 }
 
 inline void encode_fixed64_le(uint8_t* buf, uint64_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint64_t*)buf = val;
 #else
     uint64_t res = gbswap_64(val);
-    memcpy(buf, &res, sizeof(res));
+    *(uint64_t*)buf = res;
 #endif
 }
 
 inline void encode_fixed128_le(uint8_t* buf, uint128_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint128_t*)buf = val;

Review comment:
       got,thank u




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] morningman merged pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
morningman merged pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625


   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] zuochunwei commented on pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
zuochunwei commented on pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#issuecomment-919035962


   > You should re-format files, and use whitespace instead of tab.
   > you can try `clang-format -i -style=file /path/to/file` in youre code root
   
   ok


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] yangzhg commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
yangzhg commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r708967621



##########
File path: be/src/olap/hll.cpp
##########
@@ -363,53 +427,53 @@ int64_t HyperLogLog::estimate_cardinality() const {
         // there are relatively large fluctuations, we fixed the problem refer to redis.
         double bias = 5.9119 * 1.0e-18 * (estimate * estimate * estimate * estimate) -
                       1.4253 * 1.0e-12 * (estimate * estimate * estimate) +
-                      1.2940 * 1.0e-7 * (estimate * estimate) - 5.2921 * 1.0e-3 * estimate +
-                      83.3216;
+                      1.2940 * 1.0e-7 * (estimate * estimate) -
+                      5.2921 * 1.0e-3 * estimate + 83.3216;
         estimate -= estimate * (bias / 100);
     }
     return (int64_t)(estimate + 0.5);
 }
 
 void HllSetResolver::parse() {
     // skip LengthValueType
-    char* pdata = _buf_ref;
+    char*  pdata = _buf_ref;

Review comment:
       ```suggestion
       char* pdata = _buf_ref;
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] zuochunwei commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
zuochunwei commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r707238823



##########
File path: be/src/util/coding.h
##########
@@ -27,37 +27,37 @@ inline void encode_fixed8(uint8_t* buf, uint8_t val) {
 
 inline void encode_fixed16_le(uint8_t* buf, uint16_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint16_t*)buf = val;
 #else
     uint16_t res = bswap_16(val);
-    memcpy(buf, &res, sizeof(res));
+    *(uint16_t*)buf = res;
 #endif
 }
 
 inline void encode_fixed32_le(uint8_t* buf, uint32_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint32_t*)buf = val;
 #else
     uint32_t res = bswap_32(val);
-    memcpy(buf, &res, sizeof(res));
+    *(uint32_t*)buf = res;
 #endif
 }
 
 inline void encode_fixed64_le(uint8_t* buf, uint64_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint64_t*)buf = val;
 #else
     uint64_t res = gbswap_64(val);
-    memcpy(buf, &res, sizeof(res));
+    *(uint64_t*)buf = res;
 #endif
 }
 
 inline void encode_fixed128_le(uint8_t* buf, uint128_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint128_t*)buf = val;

Review comment:
       architecture dependent, x86 will work well,other architecture may cause bus error




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] zuochunwei commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
zuochunwei commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r708992058



##########
File path: be/src/olap/hll.h
##########
@@ -162,35 +166,60 @@ class HyperLogLog {
     void _convert_explicit_to_register();
 
     // update one hash value into this registers
-    void _update_registers(uint64_t hash_value) {
+    inline void _update_registers(uint64_t hash_value) {
         // Use the lower bits to index into the number of streams and then
         // find the first 1 bit after the index bits.
         int idx = hash_value % HLL_REGISTERS_COUNT;
         hash_value >>= HLL_COLUMN_PRECISION;
         // make sure max first_one_bit is HLL_ZERO_COUNT_BITS + 1
         hash_value |= ((uint64_t)1 << HLL_ZERO_COUNT_BITS);
         uint8_t first_one_bit = __builtin_ctzl(hash_value) + 1;
-        _registers[idx] = std::max((uint8_t)_registers[idx], first_one_bit);
+        _registers[idx] = (_registers[idx] > first_one_bit ? _registers[idx] : first_one_bit);

Review comment:
       i think use () is more clear:)
   

##########
File path: be/src/olap/hll.h
##########
@@ -162,35 +166,60 @@ class HyperLogLog {
     void _convert_explicit_to_register();
 
     // update one hash value into this registers
-    void _update_registers(uint64_t hash_value) {
+    inline void _update_registers(uint64_t hash_value) {
         // Use the lower bits to index into the number of streams and then
         // find the first 1 bit after the index bits.
         int idx = hash_value % HLL_REGISTERS_COUNT;
         hash_value >>= HLL_COLUMN_PRECISION;
         // make sure max first_one_bit is HLL_ZERO_COUNT_BITS + 1
         hash_value |= ((uint64_t)1 << HLL_ZERO_COUNT_BITS);
         uint8_t first_one_bit = __builtin_ctzl(hash_value) + 1;
-        _registers[idx] = std::max((uint8_t)_registers[idx], first_one_bit);
+        _registers[idx] = (_registers[idx] > first_one_bit ? _registers[idx] : first_one_bit);
     }
 
     // absorb other registers into this registers
-    void _merge_registers(const uint8_t* other_registers) {
+    void _merge_registers(const uint8_t* other) {
         for (int i = 0; i < HLL_REGISTERS_COUNT; ++i) {
-            _registers[i] = std::max(_registers[i], other_registers[i]);
+             _registers[i] = (_registers[i] < other[i] ? other[i] : _registers[i]);

Review comment:
       i think use () is more clear:)
   
   




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] zuochunwei commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
zuochunwei commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r707238823



##########
File path: be/src/util/coding.h
##########
@@ -27,37 +27,37 @@ inline void encode_fixed8(uint8_t* buf, uint8_t val) {
 
 inline void encode_fixed16_le(uint8_t* buf, uint16_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint16_t*)buf = val;
 #else
     uint16_t res = bswap_16(val);
-    memcpy(buf, &res, sizeof(res));
+    *(uint16_t*)buf = res;
 #endif
 }
 
 inline void encode_fixed32_le(uint8_t* buf, uint32_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint32_t*)buf = val;
 #else
     uint32_t res = bswap_32(val);
-    memcpy(buf, &res, sizeof(res));
+    *(uint32_t*)buf = res;
 #endif
 }
 
 inline void encode_fixed64_le(uint8_t* buf, uint64_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint64_t*)buf = val;
 #else
     uint64_t res = gbswap_64(val);
-    memcpy(buf, &res, sizeof(res));
+    *(uint64_t*)buf = res;
 #endif
 }
 
 inline void encode_fixed128_le(uint8_t* buf, uint128_t val) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-    memcpy(buf, &val, sizeof(val));
+    *(uint128_t*)buf = val;

Review comment:
       ok, only x86 no align requirement




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] yangzhg commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
yangzhg commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r708969975



##########
File path: be/src/olap/hll.h
##########
@@ -124,33 +125,36 @@ class HyperLogLog {
 
     // Check if input slice is a valid serialized binary of HyperLogLog.
     // This function only check the encoded type in slice, whose complex
-    // function is O(1).
+    // function is O(1). 
     static bool is_valid(const Slice& slice);
 
     // only for debug
     std::string to_string() {
         switch (_type) {
-        case HLL_DATA_EMPTY:
-            return {};
-        case HLL_DATA_EXPLICIT:
-        case HLL_DATA_SPARSE:
-        case HLL_DATA_FULL: {
-            std::string str{"hash set size: "};
-            str.append(std::to_string(_hash_set.size()));
-            str.append("\ncardinality:\t");
-            str.append(std::to_string(estimate_cardinality()));
-            str.append("\ntype:\t");
-            str.append(std::to_string(_type));
-            return str;
-        }
-        default:
-            return {};
+            case HLL_DATA_EMPTY:

Review comment:
       case statement does not need to be indented




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] yangzhg commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
yangzhg commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r708969034



##########
File path: be/src/olap/hll.h
##########
@@ -20,9 +20,8 @@
 
 #include <math.h>
 #include <stdio.h>
-
-#include <map>
 #include <set>
+#include <map>

Review comment:
       Unless there are special reasons, the header files are in alphabetical order




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] zuochunwei commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
zuochunwei commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r708986529



##########
File path: be/src/olap/hll.cpp
##########
@@ -100,36 +102,72 @@ void HyperLogLog::merge(const HyperLogLog& other) {
     }
     case HLL_DATA_EXPLICIT: {
         switch (other._type) {
-        case HLL_DATA_EXPLICIT:
+        case HLL_DATA_EXPLICIT: {
             // Merge other's explicit values first, then check if the number is exceed
             // HLL_EXPLICIT_INT64_NUM. This is OK because the max value is 2 * 160.
-            _hash_set.insert(other._hash_set.begin(), other._hash_set.end());
-            if (_hash_set.size() > HLL_EXPLICIT_INT64_NUM) {
-                _convert_explicit_to_register();
-                _type = HLL_DATA_FULL;
+            if (other._explicit_data_num > HLL_EXPLICIT_INT64_NUM / 2) { //merge
+                uint64_t explicit_data[HLL_EXPLICIT_INT64_NUM * 2];
+                memcpy(explicit_data, _explicit_data, sizeof(*_explicit_data) * _explicit_data_num);
+                uint32_t explicit_data_num = _explicit_data_num;
+                _explicit_data_num = 0;
+
+                // merge _explicit_data and other's _explicit_data to _explicit_data
+                uint32_t i = 0, j = 0, k = 0;
+                while (i < explicit_data_num || j < other._explicit_data_num) {
+                    if (i == explicit_data_num) {
+                        uint32_t n = other._explicit_data_num - j;
+                        memcpy(_explicit_data + k, other._explicit_data + j, n * sizeof(*_explicit_data));
+                        k += n; break;
+                    } else if (j == other._explicit_data_num) {
+                        uint32_t n = explicit_data_num - i;
+                        memcpy(_explicit_data + k, explicit_data + i, n * sizeof(*_explicit_data));
+                        k += n; break;
+                    } else {
+                        if (explicit_data[i] < other._explicit_data[j]) { 
+                            _explicit_data[k++] = explicit_data[i++];
+                        } else if (explicit_data[i] > other._explicit_data[j]) {
+                            _explicit_data[k++] = other._explicit_data[j++];
+                        } else {
+                            _explicit_data[k++] = explicit_data[i++]; j++;
+                        }
+                    }
+                }
+                _explicit_data_num = k;
+            } else { //依次插入 

Review comment:
       ok

##########
File path: be/src/olap/hll.cpp
##########
@@ -363,53 +427,53 @@ int64_t HyperLogLog::estimate_cardinality() const {
         // there are relatively large fluctuations, we fixed the problem refer to redis.
         double bias = 5.9119 * 1.0e-18 * (estimate * estimate * estimate * estimate) -
                       1.4253 * 1.0e-12 * (estimate * estimate * estimate) +
-                      1.2940 * 1.0e-7 * (estimate * estimate) - 5.2921 * 1.0e-3 * estimate +
-                      83.3216;
+                      1.2940 * 1.0e-7 * (estimate * estimate) -
+                      5.2921 * 1.0e-3 * estimate + 83.3216;
         estimate -= estimate * (bias / 100);
     }
     return (int64_t)(estimate + 0.5);
 }
 
 void HllSetResolver::parse() {
     // skip LengthValueType
-    char* pdata = _buf_ref;
+    char*  pdata = _buf_ref;
     _set_type = (HllDataType)pdata[0];
     char* sparse_data = NULL;
     switch (_set_type) {
-    case HLL_DATA_EXPLICIT:
-        // first byte : type
-        // second~five byte : hash values's number
-        // five byte later : hash value
-        _explicit_num = (ExplicitLengthValueType)(pdata[sizeof(SetTypeValueType)]);
-        _explicit_value =
-                (uint64_t*)(pdata + sizeof(SetTypeValueType) + sizeof(ExplicitLengthValueType));
-        break;
-    case HLL_DATA_SPARSE:
-        // first byte : type
-        // second ~(2^HLL_COLUMN_PRECISION)/8 byte : bitmap mark which is not zero
-        // 2^HLL_COLUMN_PRECISION)/8 + 1以后value
-        _sparse_count = (SparseLengthValueType*)(pdata + sizeof(SetTypeValueType));
-        sparse_data = pdata + sizeof(SetTypeValueType) + sizeof(SparseLengthValueType);
-        for (int i = 0; i < *_sparse_count; i++) {
-            SparseIndexType* index = (SparseIndexType*)sparse_data;
-            sparse_data += sizeof(SparseIndexType);
-            SparseValueType* value = (SparseValueType*)sparse_data;
-            _sparse_map[*index] = *value;
-            sparse_data += sizeof(SetTypeValueType);
-        }
-        break;
-    case HLL_DATA_FULL:
-        // first byte : type
-        // second byte later : hll register value
-        _full_value_position = pdata + sizeof(SetTypeValueType);
-        break;
-    default:
-        // HLL_DATA_EMPTY
-        break;
+        case HLL_DATA_EXPLICIT:

Review comment:
       ok
   




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] yangzhg commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
yangzhg commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r708970470



##########
File path: be/src/olap/hll.h
##########
@@ -162,35 +166,60 @@ class HyperLogLog {
     void _convert_explicit_to_register();
 
     // update one hash value into this registers
-    void _update_registers(uint64_t hash_value) {
+    inline void _update_registers(uint64_t hash_value) {
         // Use the lower bits to index into the number of streams and then
         // find the first 1 bit after the index bits.
         int idx = hash_value % HLL_REGISTERS_COUNT;
         hash_value >>= HLL_COLUMN_PRECISION;
         // make sure max first_one_bit is HLL_ZERO_COUNT_BITS + 1
         hash_value |= ((uint64_t)1 << HLL_ZERO_COUNT_BITS);
         uint8_t first_one_bit = __builtin_ctzl(hash_value) + 1;
-        _registers[idx] = std::max((uint8_t)_registers[idx], first_one_bit);
+        _registers[idx] = (_registers[idx] > first_one_bit ? _registers[idx] : first_one_bit);

Review comment:
       ```suggestion
           _registers[idx] = _registers[idx] > first_one_bit ? _registers[idx] : first_one_bit;
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] yangzhg commented on a change in pull request #6625: hll optimize

Posted by GitBox <gi...@apache.org>.
yangzhg commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r708964143



##########
File path: be/src/olap/hll.cpp
##########
@@ -100,36 +102,72 @@ void HyperLogLog::merge(const HyperLogLog& other) {
     }
     case HLL_DATA_EXPLICIT: {
         switch (other._type) {
-        case HLL_DATA_EXPLICIT:
+        case HLL_DATA_EXPLICIT: {
             // Merge other's explicit values first, then check if the number is exceed
             // HLL_EXPLICIT_INT64_NUM. This is OK because the max value is 2 * 160.
-            _hash_set.insert(other._hash_set.begin(), other._hash_set.end());
-            if (_hash_set.size() > HLL_EXPLICIT_INT64_NUM) {
-                _convert_explicit_to_register();
-                _type = HLL_DATA_FULL;
+            if (other._explicit_data_num > HLL_EXPLICIT_INT64_NUM / 2) { //merge
+                uint64_t explicit_data[HLL_EXPLICIT_INT64_NUM * 2];
+                memcpy(explicit_data, _explicit_data, sizeof(*_explicit_data) * _explicit_data_num);
+                uint32_t explicit_data_num = _explicit_data_num;
+                _explicit_data_num = 0;
+
+                // merge _explicit_data and other's _explicit_data to _explicit_data
+                uint32_t i = 0, j = 0, k = 0;
+                while (i < explicit_data_num || j < other._explicit_data_num) {
+                    if (i == explicit_data_num) {
+                        uint32_t n = other._explicit_data_num - j;
+                        memcpy(_explicit_data + k, other._explicit_data + j, n * sizeof(*_explicit_data));
+                        k += n; break;
+                    } else if (j == other._explicit_data_num) {
+                        uint32_t n = explicit_data_num - i;
+                        memcpy(_explicit_data + k, explicit_data + i, n * sizeof(*_explicit_data));
+                        k += n; break;
+                    } else {
+                        if (explicit_data[i] < other._explicit_data[j]) { 
+                            _explicit_data[k++] = explicit_data[i++];
+                        } else if (explicit_data[i] > other._explicit_data[j]) {
+                            _explicit_data[k++] = other._explicit_data[j++];
+                        } else {
+                            _explicit_data[k++] = explicit_data[i++]; j++;
+                        }
+                    }
+                }
+                _explicit_data_num = k;
+            } else { //依次插入 

Review comment:
       It's better to use English




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org