You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2021/04/02 20:57:44 UTC

[couchdb] 01/01: Fix collation issue for older versions of libicu library

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch fix-centos-7-icu-collation-issue
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 642757fb4d3d9705c6590df1f3dbb4beca948503
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Fri Apr 2 16:46:46 2021 -0400

    Fix collation issue for older versions of libicu library
    
    Previously, mango tests with objects as keys were failing on CentOS 6 and
    CentOS 7. The reason for the failures was that old libicu collation algorithms
    didn't consider the `<<255,255,255,255>>` as the highest sortable string as
    CouchDB intends it to be. Later versions of libicu, going back at least to
    version 59, do treat it that way; see
    https://www.unicode.org/reports/tr35/tr35-collation.html#tailored_noncharacter_weights.
    However, as long as we support CentOS 7 we can fix the issue by explicitly
    checking for the highest marker.
---
 .../priv/couch_ejson_compare/couch_ejson_compare.c | 40 ++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/src/couch/priv/couch_ejson_compare/couch_ejson_compare.c b/src/couch/priv/couch_ejson_compare/couch_ejson_compare.c
index ad3d0cd..ca47853 100644
--- a/src/couch/priv/couch_ejson_compare/couch_ejson_compare.c
+++ b/src/couch/priv/couch_ejson_compare/couch_ejson_compare.c
@@ -13,6 +13,7 @@
  */
 
 #include <stdio.h>
+#include <string.h>
 #include <assert.h>
 #include "erl_nif.h"
 #include "unicode/ucol.h"
@@ -65,6 +66,11 @@ static __inline int compare_lists(int, ctx_t*, ERL_NIF_TERM, ERL_NIF_TERM);
 static __inline int compare_props(int, ctx_t*, ERL_NIF_TERM, ERL_NIF_TERM);
 static __inline UCollator* get_collator();
 
+/* Sentinel that CouchDB treats as the highest sortable string. It must
+ * match the <<255,255,255,255>> in src/mango/src/mango_idx_view.hrl#L13
+ * and in src/couch_mrview/src/couch_mrview_util.erl#L40. */
+static const unsigned char max_utf8_marker[]  = {255, 255, 255, 255};
+
 
 UCollator*
 get_collator()
@@ -357,12 +363,46 @@ compare_props(int depth, ctx_t* ctx, ERL_NIF_TERM a, ERL_NIF_TERM b)
 
 
 int
+is_max_utf8_marker(ErlNifBinary bin)
+{
+    /* Returns 1 only when bin is byte-for-byte the max_utf8_marker
+     * sentinel and 0 otherwise. The size is compared first so memcmp
+     * never reads past the end of bin.data. */
+    if (bin.size != sizeof(max_utf8_marker)) {
+        return 0;
+    }
+    return memcmp(bin.data, max_utf8_marker, sizeof(max_utf8_marker)) == 0;
+}
+
+
+int
 compare_strings(ctx_t* ctx, ErlNifBinary a, ErlNifBinary b)
 {
     UErrorCode status = U_ZERO_ERROR;
     UCharIterator iterA, iterB;
     int result;
 
+    /* libicu versions earlier than 59 (at least) don't consider the
+     * {255,255,255,255} to be the highest sortable string as CouchDB expects.
+     * While we are still shipping CentOS 7 packages with libicu 50, we should
+     * explicitly check for the marker; later on we can remove the max
+     * logic */
+
+    int a_is_max = is_max_utf8_marker(a);
+    int b_is_max = is_max_utf8_marker(b);
+
+    if(a_is_max && b_is_max) {
+        return 0;
+    }
+
+    if(a_is_max) {
+        return 1;
+    }
+
+    if(b_is_max) {
+        return -1;
+    }
+
     uiter_setUTF8(&iterA, (const char *) a.data, (uint32_t) a.size);
     uiter_setUTF8(&iterB, (const char *) b.data, (uint32_t) b.size);