You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2021/04/05 20:11:01 UTC

[couchdb] branch 3.x-fix-centos-7-icu-collation-issue created (now f2dd976)

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a change to branch 3.x-fix-centos-7-icu-collation-issue
in repository https://gitbox.apache.org/repos/asf/couchdb.git.


      at f2dd976  Fix collation issue for older versions of libicu library

This branch includes the following new commits:

     new f2dd976  Fix collation issue for older versions of libicu library

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


[couchdb] 01/01: Fix collation issue for older versions of libicu library

Posted by va...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch 3.x-fix-centos-7-icu-collation-issue
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit f2dd97675828b9f22bf116020c61d8bd10eddcfc
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Fri Apr 2 16:46:46 2021 -0400

    Fix collation issue for older versions of libicu library
    
    Previously, mango tests with objects as keys were failing on CentOS 6 and
    CentOS 7. The reason for the failures was that old libicu collation algorithms
    didn't consider the `<<255,255,255,255>>` as the highest sortable string as
    CouchDB intends it to be. Later versions of libicu, going back at least to
    version 59, started to do that; see
    https://www.unicode.org/reports/tr35/tr35-collation.html#tailored_noncharacter_weights.
    However, as long as we support CentOS 7 we can fix the issue by explicitly
    checking for the highest marker.
---
 .../priv/couch_ejson_compare/couch_ejson_compare.c | 40 ++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/src/couch/priv/couch_ejson_compare/couch_ejson_compare.c b/src/couch/priv/couch_ejson_compare/couch_ejson_compare.c
index ad3d0cd..49d6cd8 100644
--- a/src/couch/priv/couch_ejson_compare/couch_ejson_compare.c
+++ b/src/couch/priv/couch_ejson_compare/couch_ejson_compare.c
@@ -13,6 +13,7 @@
  */
 
 #include <stdio.h>
+#include <string.h>
 #include <assert.h>
 #include "erl_nif.h"
 #include "unicode/ucol.h"
@@ -65,6 +66,11 @@ static __inline int compare_lists(int, ctx_t*, ERL_NIF_TERM, ERL_NIF_TERM);
 static __inline int compare_props(int, ctx_t*, ERL_NIF_TERM, ERL_NIF_TERM);
 static __inline UCollator* get_collator();
 
+/* Should match the <<255,255,255,255>> in:
+ *  - src/mango/src/mango_idx_view.hrl#L13
+ *  - src/couch_mrview/src/couch_mrview_util.erl#L40 */
+static const unsigned char max_utf8_marker[]  = {255, 255, 255, 255};
+
 
 UCollator*
 get_collator()
@@ -357,12 +363,46 @@ compare_props(int depth, ctx_t* ctx, ERL_NIF_TERM a, ERL_NIF_TERM b)
 
 
 int
+is_max_utf8_marker(ErlNifBinary bin)
+{
+    if (bin.size == sizeof(max_utf8_marker)) {
+        if(memcmp(bin.data, max_utf8_marker, sizeof(max_utf8_marker)) == 0) {
+            return 1;
+        }
+        return 0;
+    }
+    return 0;
+}
+
+
+int
 compare_strings(ctx_t* ctx, ErlNifBinary a, ErlNifBinary b)
 {
     UErrorCode status = U_ZERO_ERROR;
     UCharIterator iterA, iterB;
     int result;
 
+    /* libicu versions earlier than 59 (at least) don't consider the
+     * {255,255,255,255} to be the highest sortable string as CouchDB expects.
+     * While we are still shipping CentOS 7 packages with libicu 50, we should
+     * explicitly check for the marker, later on we can remove the max
+     * logic */
+
+    int a_is_max = is_max_utf8_marker(a);
+    int b_is_max = is_max_utf8_marker(b);
+
+    if(a_is_max && b_is_max) {
+        return 0;
+    }
+
+    if(a_is_max) {
+        return 1;
+    }
+
+    if(b_is_max) {
+        return -1;
+    }
+
     uiter_setUTF8(&iterA, (const char *) a.data, (uint32_t) a.size);
     uiter_setUTF8(&iterB, (const char *) b.data, (uint32_t) b.size);