You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2021/04/02 20:57:43 UTC

[couchdb] branch fix-centos-7-icu-collation-issue created (now 642757f)

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a change to branch fix-centos-7-icu-collation-issue
in repository https://gitbox.apache.org/repos/asf/couchdb.git.


      at 642757f  Fix collation issue for older versions of libicu library

This branch includes the following new commits:

     new 642757f  Fix collation issue for older versions of libicu library

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


[couchdb] 01/01: Fix collation issue for older versions of libicu library

Posted by va...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch fix-centos-7-icu-collation-issue
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 642757fb4d3d9705c6590df1f3dbb4beca948503
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Fri Apr 2 16:46:46 2021 -0400

    Fix collation issue for older versions of libicu library
    
    Previously, mango tests with objects as keys were failing on CentOS 6 and
    CentOS 7. The reason for the failures was that old libicu collation algorithms
    didn't consider the `<<255,255,255,255>>` as the highest sortable string as
    CouchDB intends it to be. Later versions of libicu, going back at least as
    far as version 59, started to do that; see
    https://www.unicode.org/reports/tr35/tr35-collation.html#tailored_noncharacter_weights.
    However, as long as we support CentOS 7 we can fix the issue by explicitly
    checking for the highest marker.
---
 .../priv/couch_ejson_compare/couch_ejson_compare.c | 40 ++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/src/couch/priv/couch_ejson_compare/couch_ejson_compare.c b/src/couch/priv/couch_ejson_compare/couch_ejson_compare.c
index ad3d0cd..ca47853 100644
--- a/src/couch/priv/couch_ejson_compare/couch_ejson_compare.c
+++ b/src/couch/priv/couch_ejson_compare/couch_ejson_compare.c
@@ -13,6 +13,7 @@
  */
 
 #include <stdio.h>
+#include <string.h>
 #include <assert.h>
 #include "erl_nif.h"
 #include "unicode/ucol.h"
@@ -65,6 +66,11 @@ static __inline int compare_lists(int, ctx_t*, ERL_NIF_TERM, ERL_NIF_TERM);
 static __inline int compare_props(int, ctx_t*, ERL_NIF_TERM, ERL_NIF_TERM);
 static __inline UCollator* get_collator();
 
+/* Should match the <<255,255,255,255>> in:
+ *  - src/mango/src/mango_idx_view.hrl#L13
+ *  - src/couch_mrview/src/couch_mrview_util.erl#L40 */
+static const unsigned char max_utf8_marker[]  = {255, 255, 255, 255};
+
 
 UCollator*
 get_collator()
@@ -357,12 +363,46 @@ compare_props(int depth, ctx_t* ctx, ERL_NIF_TERM a, ERL_NIF_TERM b)
 
 
 int
+is_max_utf8_marker(ErlNifBinary bin)
+{
+    if (bin.size == sizeof(max_utf8_marker)) {
+        if(memcmp(bin.data, max_utf8_marker, sizeof(max_utf8_marker)) == 0) {
+            return 1;
+        }
+        return 0;
+    }
+    return 0;
+}
+
+
+int
 compare_strings(ctx_t* ctx, ErlNifBinary a, ErlNifBinary b)
 {
     UErrorCode status = U_ZERO_ERROR;
     UCharIterator iterA, iterB;
     int result;
 
+    /* libicu versions earlier than 59 (at least) don't consider the
+     * {255,255,255,255} to be the highest sortable string as CouchDB expects.
+     * While we are still shipping CentOS 7 packages with libicu 50, we should
+     * explicitly check for the marker; later on we can remove the max
+     * logic */
+
+    int a_is_max = is_max_utf8_marker(a);
+    int b_is_max = is_max_utf8_marker(b);
+
+    if(a_is_max && b_is_max) {
+        return 0;
+    }
+
+    if(a_is_max) {
+        return 1;
+    }
+
+    if(b_is_max) {
+        return -1;
+    }
+
     uiter_setUTF8(&iterA, (const char *) a.data, (uint32_t) a.size);
     uiter_setUTF8(&iterB, (const char *) b.data, (uint32_t) b.size);