You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2023/05/30 22:30:41 UTC

[couchdb] branch qjs updated (cd18fe6a0 -> 96ea57f54)

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a change to branch qjs
in repository https://gitbox.apache.org/repos/asf/couchdb.git


 discard cd18fe6a0 Add QuickJS as a Javascript engine option
     add c742d9c7c Fix purge infos replicating to the wrong shards during shard splitting.
     new 96ea57f54 Add QuickJS as a Javascript engine option

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version.  This situation occurs
when a user --force pushes a change and generates a repository
containing something like this:

 * -- * -- B -- O -- O -- O   (cd18fe6a0)
            \
             N -- N -- N   refs/heads/qjs (96ea57f54)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/couch/src/couch_db_split.erl          |  47 ++++++++++---
 src/mem3/src/mem3_rep.erl                 |  67 +++++++++++++-----
 src/mem3/src/mem3_reshard_job.erl         |  13 ++++
 src/mem3/test/eunit/mem3_reshard_test.erl | 108 +++++++++++++++++++++++++++++-
 4 files changed, 204 insertions(+), 31 deletions(-)


[couchdb] 01/01: Add QuickJS as a Javascript engine option

Posted by va...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch qjs
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 96ea57f5491993093ed79cb17fd89f12654b72e0
Author: Nick Vatamaniuc <va...@apache.org>
AuthorDate: Sat Nov 26 00:14:01 2022 -0500

    Add QuickJS as a Javascript engine option
    
    https://bellard.org/quickjs
    https://fuchsia.googlesource.com/third_party/quickjs/
    
    Some benefits over SM:
    
     * Small. We're using 6 or so C files vs 700+ SM91 C++ files.
    
     * Built with Apache CouchDB as opposed to having to maintain a separate SM
       package, like for RHEL9, for instance, where they dropped support for SM
       already. (see https://github.com/apache/couchdb/issues/4154).
    
     * Embedding friendly. Designed from ground-up for embedding. SM has been
       updating the C++ API such that we have to keep copy-pasting new versions of
       our C++ code every year or so. (see
       https://github.com/apache/couchdb/pull/4305).
    
     * Easy to modify to accept Spidermonkey 1.8.5 top level functions for
       map/reduce code so we don't have to parse the JS, AST-transform it, and
       then re-compile it.
    
     * Configurable runtime feature set - can disable workers, promises and other
       API and features which may not work well in a backend JS environment. Some
       users may want more, some may want to disable even Date(time) features to
       hedge against Spectre-style attacks (spectreattack.com).
    
     * Allows granular time (reduction) tracking if we wanted to provide a runtime
       allowance for each function.
    
     * Better sandboxing. Creating a whole JSRuntime takes only 300 microseconds, so
       we can afford to do that on reset. JSRuntimes cannot share JS data or objects
       between them.
    
     * Seems to be faster in preliminary benchmarking with small
       concurrent VDU and view builds:
         https://gist.github.com/nickva/ed239651114794ebb138b1f16c5f6758
       Results seem promising:
         - 4x faster than SM 1.8.5
         - 5x faster than SM 91
         - 6x reduced memory usage per couchjs process (5MB vs 30MB)
    
     * Allows compiling JS bytecode ahead of time into a C array of bytes.
    
    QuickJS can be built alongside spidermonkey and toggled on/off at runtime:
    
    ```
    ./configure --dev --js-engine=quickjs
    ```
    
    This makes it the default engine. But spidermonkey can still be set in the
    config option.
    
    ```
    [couchdb]
    js_engine = spidermonkey | quickjs
    ```
    
    Only tested on MacOS and Linux. All `make check` tests pass there.
---
 LICENSE                                            |    26 +
 build-aux/Jenkinsfile.pr                           |     6 +-
 configure                                          |    25 +-
 rebar.config.script                                |     1 +
 rel/overlay/etc/default.ini                        |     7 +
 rel/reltool.config                                 |     2 +
 share/server/dispatch-quickjs.js                   |   196 +
 share/server/dreyfus.js                            |     2 +-
 share/server/render.js                             |     2 +-
 share/server/util.js                               |    11 +-
 share/server/views.js                              |     2 +-
 src/chttpd/src/chttpd_node.erl                     |    16 +-
 src/couch/rebar.config.script                      |    13 +
 src/couch/src/couch.app.src                        |     3 +-
 src/couch/src/couch_proc_manager.erl               |     9 +
 src/couch/src/couch_server.erl                     |     4 +
 src/couch_quickjs/.gitignore                       |    14 +
 src/couch_quickjs/build_js.escript                 |   115 +
 src/couch_quickjs/c_src/.gitignore                 |     5 +
 src/couch_quickjs/c_src/couchjs.c                  |   413 +
 .../patches/01-spidermonkey-185-mode.patch         |    27 +
 .../02-getpropertyvalue-unitialized-read.patch     |    11 +
 .../03-freebsd-extra-clang-patch-makefile.patch    |    25 +
 .../patches/04-freebsd-patch-quickjs-libc.c.patch  |    18 +
 ...ck-overflow-is-array-proxy-cve-2023-31922.patch |    16 +
 src/couch_quickjs/priv/.gitignore                  |     2 +
 src/couch_quickjs/quickjs/Changelog                |   148 +
 src/couch_quickjs/quickjs/LICENSE                  |    22 +
 src/couch_quickjs/quickjs/Makefile                 |   479 +
 src/couch_quickjs/quickjs/VERSION                  |     1 +
 src/couch_quickjs/quickjs/cutils.c                 |   631 +
 src/couch_quickjs/quickjs/cutils.h                 |   297 +
 src/couch_quickjs/quickjs/libbf.c                  |  8466 +++
 src/couch_quickjs/quickjs/libbf.h                  |   535 +
 src/couch_quickjs/quickjs/libregexp-opcode.h       |    58 +
 src/couch_quickjs/quickjs/libregexp.c              |  2610 +
 src/couch_quickjs/quickjs/libregexp.h              |    92 +
 src/couch_quickjs/quickjs/libunicode-table.h       |  4449 ++
 src/couch_quickjs/quickjs/libunicode.c             |  1556 +
 src/couch_quickjs/quickjs/libunicode.h             |   124 +
 src/couch_quickjs/quickjs/list.h                   |   100 +
 src/couch_quickjs/quickjs/qjsc.c                   |   762 +
 src/couch_quickjs/quickjs/quickjs-atom.h           |   273 +
 src/couch_quickjs/quickjs/quickjs-libc.c           |  3933 ++
 src/couch_quickjs/quickjs/quickjs-libc.h           |    59 +
 src/couch_quickjs/quickjs/quickjs-opcode.h         |   365 +
 src/couch_quickjs/quickjs/quickjs.c                | 54083 +++++++++++++++++++
 src/couch_quickjs/quickjs/quickjs.h                |  1049 +
 src/couch_quickjs/quickjs/run-test262.c            |  2107 +
 src/couch_quickjs/quickjs/test262.conf             |   209 +
 src/couch_quickjs/quickjs/test262_errors.txt       |    35 +
 src/couch_quickjs/quickjs/tests/test262.patch      |    71 +
 src/couch_quickjs/rebar.config.script              |    59 +
 src/couch_quickjs/src/couch_quickjs.app.src        |    18 +
 src/couch_quickjs/src/couch_quickjs.erl            |    52 +
 src/couch_quickjs/test/couch_quickjs_tests.erl     |    71 +
 src/couch_quickjs/update_and_apply_patches.sh      |    53 +
 test/elixir/test/view_errors_test.exs              |     7 +-
 test/elixir/test/view_sandboxing_test.exs          |    20 +-
 59 files changed, 83750 insertions(+), 15 deletions(-)

diff --git a/LICENSE b/LICENSE
index 4a0a1ab3a..9b553bd74 100644
--- a/LICENSE
+++ b/LICENSE
@@ -2293,3 +2293,29 @@ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+For the QuickJS component couch_js/quickjs/quickjs:
+
+QuickJS Javascript Engine
+
+Copyright (c) 2017-2021 Fabrice Bellard
+Copyright (c) 2017-2021 Charlie Gordon
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
diff --git a/build-aux/Jenkinsfile.pr b/build-aux/Jenkinsfile.pr
index 8b114803f..bbacf21ad 100644
--- a/build-aux/Jenkinsfile.pr
+++ b/build-aux/Jenkinsfile.pr
@@ -20,7 +20,7 @@ mkdir build
 cd build
 tar -xf ${WORKSPACE}/apache-couchdb-*.tar.gz
 cd apache-couchdb-*
-./configure --enable-nouveau
+./configure --enable-nouveau --js-engine=${JS_ENGINE}
 make check || (make build-report && false)
 '''
 
@@ -253,6 +253,10 @@ pipeline {
             name 'SM_VSN'
             values '78'
           }
+          axis {
+            name 'JS_ENGINE'
+            values 'quickjs' 'spidermonkey'
+          }
         }
 
         stages {
diff --git a/configure b/configure
index 8b1e43d6f..2f8a7c9d9 100755
--- a/configure
+++ b/configure
@@ -36,6 +36,7 @@ run_erlang() {
 }
 
 COUCHDB_USER="$(whoami 2>/dev/null || echo couchdb)"
+JS_ENGINE=${JS_ENGINE:-"spidermonkey"}
 SM_VSN=${SM_VSN:-"91"}
 ARCH="$(uname -m)"
 ERLANG_VER="$(run_erlang 'io:put_chars(erlang:system_info(otp_release)).')"
@@ -66,6 +67,7 @@ Options:
   --generate-tls-dev-cert     generate a cert for TLS distribution (To enable TLS, change the vm.args file.)
   --rebar3=PATH               use rebar3 by specified path
   --erlfmt=PATH               use erlfmt by specified path
+  --js-engine=ENGINE          default js engine: spidermonkey or quickjs
 EOF
 }
 
@@ -221,6 +223,24 @@ parse_opts() {
                 exit 1
                 ;;
 
+            --js-engine)
+                if [ -n "$2" ]; then
+                    eval JS_ENGINE=$2
+                    shift 2
+                    continue
+                else
+                    printf 'ERROR: "--js-engine" requires a non-empty argument.\n' >&2
+                    exit 1
+                fi
+                ;;
+            --js-engine=?*)
+                eval JS_ENGINE=${1#*=}
+                ;;
+            --js-engine=)
+                printf 'ERROR: "--js-engine" requires a non-empty argument.\n' >&2
+                exit 1
+                ;;
+
             --generate-tls-dev-cert)
                 echo "WARNING: To enable TLS distribution, don't forget to customize vm.args file."
                 generate_tls_dev_cert
@@ -250,7 +270,7 @@ if [ "${ARCH}" = "aarch64" ] && [ "${SM_VSN}" = "60" ]; then
   exit 1
 fi
 
-if [ "${ERLANG_OS}" = "unix" ]; then
+if [ "${JS_ENGINE}" = "spidermonkey" ] && [ "${ERLANG_OS}" = "unix" ]; then
     case "${SM_VSN}" in
         1.8.5)
             SM_HEADERS="js"
@@ -292,6 +312,7 @@ cat > rel/couchdb.config << EOF
 {log_file, "$LOG_FILE"}.
 {fauxton_root, "./share/www"}.
 {user, "$COUCHDB_USER"}.
+{js_engine, "$JS_ENGINE"}.
 {spidermonkey_version, "$SM_VSN"}.
 {node_name, "-name couchdb@127.0.0.1"}.
 {cluster_port, 5984}.
@@ -321,12 +342,14 @@ with_docs = $WITH_DOCS
 with_nouveau = $WITH_NOUVEAU
 
 user = $COUCHDB_USER
+js_engine = $JS_ENGINE
 spidermonkey_version = $SM_VSN
 EOF
 
 cat > $rootdir/config.erl << EOF
 {with_proper, $WITH_PROPER}.
 {erlang_md5, $ERLANG_MD5}.
+{js_engine, "$JS_ENGINE"}.
 {spidermonkey_version, "$SM_VSN"}.
 EOF
 
diff --git a/rebar.config.script b/rebar.config.script
index 0fa463b09..fba0144aa 100644
--- a/rebar.config.script
+++ b/rebar.config.script
@@ -114,6 +114,7 @@ SubDirs = [
     "src/b64url",
     "src/exxhash",
     "src/ets_lru",
+    "src/couch_quickjs",
     "src/chttpd",
     "src/couch",
     "src/couch_event",
diff --git a/rel/overlay/etc/default.ini b/rel/overlay/etc/default.ini
index 4f2c44d95..16b42c56d 100644
--- a/rel/overlay/etc/default.ini
+++ b/rel/overlay/etc/default.ini
@@ -100,6 +100,9 @@ view_index_dir = {{view_index_dir}}
 ; checksums can be read and verified.
 ;write_xxhash_checksums = false
 
+; Javascript engine. The choices are: spidermonkey and quickjs
+;js_engine = spidermonkey
+
 [purge]
 ; Allowed maximum number of documents in one purge request
 ;max_document_id_number = 100
@@ -890,3 +893,7 @@ port = {{prometheus_port}}
 
 [nouveau]
 enable = {{with_nouveau}}
+[quickjs]
+; Memory limit in bytes. Default is undefined and so the built-in C default
+; of 64MB is used
+;memory_limit_bytes = 67108864
diff --git a/rel/reltool.config b/rel/reltool.config
index d84ef597c..07e311715 100644
--- a/rel/reltool.config
+++ b/rel/reltool.config
@@ -29,6 +29,7 @@
         b64url,
         exxhash,
         bear,
+        couch_quickjs,
         chttpd,
         config,
         couch,
@@ -93,6 +94,7 @@
     {app, b64url, [{incl_cond, include}]},
     {app, exxhash, [{incl_cond, include}]},
     {app, bear, [{incl_cond, include}]},
+    {app, couch_quickjs, [{incl_cond, include}]},
     {app, chttpd, [{incl_cond, include}]},
     {app, config, [{incl_cond, include}]},
     {app, couch, [{incl_cond, include}]},
diff --git a/share/server/dispatch-quickjs.js b/share/server/dispatch-quickjs.js
new file mode 100644
index 000000000..fc159f4c9
--- /dev/null
+++ b/share/server/dispatch-quickjs.js
@@ -0,0 +1,196 @@
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+function create_sandbox() {
+  var sandbox = {};
+  sandbox.emit = Views.emit;
+  sandbox.sum = Views.sum;
+  sandbox.log = log;
+  sandbox.toJSON = JSON.stringify;
+  sandbox.JSON = JSON;
+  sandbox.provides = Mime.provides;
+  sandbox.registerType = Mime.registerType;
+  sandbox.start = Render.start;
+  sandbox.send = Render.send;
+  sandbox.getRow = Render.getRow;
+  sandbox.isArray = isArray;
+  return sandbox;
+};
+
+function create_filter_sandbox() {
+  var sandbox = create_sandbox();
+  sandbox.emit = Filter.emit;
+  return sandbox;
+};
+
+function create_dreyfus_sandbox() {
+  var sandbox = create_sandbox();
+  sandbox.index = Dreyfus.index;
+  return sandbox;
+};
+
+function create_nouveau_sandbox() {
+  var sandbox = create_sandbox();
+  sandbox.index = Nouveau.index;
+  return sandbox;
+};
+
+function seal(obj, flag) {
+  Object.freeze(obj);
+};
+
+// This is a copy from loop.js
+var DDoc = (function() {
+  var ddoc_dispatch = {
+    "lists"     : Render.list,
+    "shows"    : Render.show,
+    "filters"   : Filter.filter,
+    "views"     : Filter.filter_view,
+    "updates"  : Render.update,
+    "validate_doc_update" : Validate.validate,
+    "rewrites"  : Render.rewrite
+  };
+  var ddocs = {};
+  return {
+    ddoc : function() {
+      var args = [];
+      for (var i=0; i < arguments.length; i++) {
+        args.push(arguments[i]);
+      };
+      var ddocId = args.shift();
+      if (ddocId == "new") {
+        // get the real ddocId.
+        ddocId = args.shift();
+        // store the ddoc, functions are lazily compiled.
+        ddocs[ddocId] = args.shift();
+        print("true");
+      } else {
+        // Couch makes sure we know this ddoc already.
+        var ddoc = ddocs[ddocId];
+        if (!ddoc) throw(["fatal", "query_protocol_error", "uncached design doc: "+ddocId]);
+        var funPath = args.shift();
+        var cmd = funPath[0];
+        // the first member of the fun path determines the type of operation
+        var funArgs = args.shift();
+        if (ddoc_dispatch[cmd]) {
+          // get the function, call the command with it
+          var point = ddoc;
+          for (var i=0; i < funPath.length; i++) {
+            if (i+1 == funPath.length) {
+              var fun = point[funPath[i]];
+              if (!fun) {
+                throw(["error","not_found",
+                       "missing " + funPath[0] + " function " + funPath[i] +
+                       " on design doc " + ddocId]);
+              }
+              if (typeof fun != "function") {
+                // For filter_view we want the emit() function
+                // to be overridden and just toggle a flag instead of
+                // accumulating rows
+                var sandbox = (cmd === "views") ? create_filter_sandbox() : create_sandbox();
+                fun = Couch.compileFunction(fun, ddoc, funPath.join('.'), sandbox);
+                // cache the compiled fun on the ddoc
+                point[funPath[i]] = fun;
+              };
+            } else {
+              point = point[funPath[i]];
+            }
+          };
+
+          // run the correct responder with the cmd body
+          ddoc_dispatch[cmd].apply(null, [fun, ddoc, funArgs]);
+        } else {
+          // unknown command, quit and hope the restarted version is better
+          throw(["fatal", "unknown_command", "unknown ddoc command '" + cmd + "'"]);
+        }
+      }
+    }
+  };
+})();
+
+// This mostly a copy from loop.js handleError
+function handleError(e) {
+    if (e === null) {
+      // internal error, another possibility when out of memory
+      // nothing to do except rethrow and let main.c catch it and exit(1)
+      throw(null);
+    }
+    const type = e[0];
+    if (type == "fatal") {
+      e[0] = "error"; // we tell the client it was a fatal error by dying
+      respond(e);
+      return false;
+    } else if (type == "error") {
+      respond(e);
+      return true;
+    } else if (e.name == "InternalError") {
+      // If the internal error is caught by handleViewError it will be
+      // re-thrown as a ["fatal", ...] error, and we already handle that above.
+      // Here we handle the case when the error is thrown outside of
+      // handleViewError, for instance when serializing the rows to be sent
+      // back to the user
+      respond(["error", e.name, e.message]);
+      return false;
+    } else if (e.error && e.reason) {
+      // compatibility with old error format
+      respond(["error", e.error, e.reason]);
+      return true;
+    } else if (e.name) {
+      respond(["error", e.name, e]);
+      return true;
+    } else {
+      respond(["error","unnamed_error", e.stack]);
+      return true;
+    }
+  };
+
+globalThis.dispatch = function(line) {
+  const cmd = JSON.parse(line);
+  State.line_length = line.length;
+  try {
+    switch (cmd.shift()) {
+    case "ddoc":
+      DDoc.ddoc.apply(null, cmd);
+      break;
+    case "reset":
+      State.reset.apply(null, cmd);
+      break;
+    case "add_fun":
+      State.addFun.apply(null, cmd);
+      break;
+    case "add_lib":
+      State.addLib.apply(null, cmd);
+      break;
+    case "map_doc":
+      Views.mapDoc.apply(null, cmd);
+      break;
+    case "index_doc":
+      Dreyfus.indexDoc.apply(null, cmd);
+      break;
+    case "nouveau_index_doc":
+      Nouveau.indexDoc.apply(null, cmd);
+      break;
+    case "reduce":
+      Views.reduce.apply(null, cmd);
+      break;
+    case "rereduce":
+      Views.rereduce.apply(null, cmd);
+      break;
+    default:
+      // unknown command, quit and hope the restarted version is better
+      throw(["fatal", "unknown_command", "unknown command '" + cmdkey + "'"]);
+    }
+  } catch(e) {
+      return handleError(e);
+  };
+  return true;
+};
diff --git a/share/server/dreyfus.js b/share/server/dreyfus.js
index 1d8a029d4..3aa72493f 100644
--- a/share/server/dreyfus.js
+++ b/share/server/dreyfus.js
@@ -20,7 +20,7 @@ var Dreyfus = (function() {
     } else if (err[0] == "fatal") {
       throw(err);
     }
-    var message = "function raised exception " + err.toSource();
+    var message = "function raised exception " + errstr(err);
     if (doc) message += " with doc._id " + doc._id;
     log(message);
   };
diff --git a/share/server/render.js b/share/server/render.js
index 078a6491b..bc4c2ca4a 100644
--- a/share/server/render.js
+++ b/share/server/render.js
@@ -347,7 +347,7 @@ var Render = (function() {
       throw(e);
     } else {
       var logMessage = "function raised error: " +
-                        e.toSource() + " \n" +
+                        errstr(e) + " \n" +
                        "stacktrace: " + e.stack;
       log(logMessage);
       throw(["error", errType || "render_error", logMessage]);
diff --git a/share/server/util.js b/share/server/util.js
index c207d0ab9..a459bf2e8 100644
--- a/share/server/util.js
+++ b/share/server/util.js
@@ -81,7 +81,7 @@ var Couch = {
           throw [
             "error",
             "compilation_error",
-            "Module require('" +name+ "') raised error " + e.toSource()
+            "Module require('" +name+ "') raised error " + errstr(e)
           ];
         }
         ddoc._module_cache[newModule.id] = newModule.exports;
@@ -106,7 +106,7 @@ var Couch = {
       throw([
         "error",
         "compilation_error",
-        err.toSource() + " (" + source + ")"
+        errstr(err) + " (" + source + ")"
       ]);
     };
     if (typeof(functionObject) == "function") {
@@ -126,13 +126,18 @@ var Couch = {
   }
 };
 
+function errstr(e) {
+  // toSource() is a Spidermonkey "special"
+  return (e.toSource ? e.toSource() : e.toString());
+};
+
 // prints the object as JSON, and rescues and logs any JSON.stringify() related errors
 function respond(obj) {
   try {
     print(JSON.stringify(obj));
   } catch(e) {
     log("Error converting object to JSON: " + e.toString());
-    log("error on obj: "+ obj.toSource());
+    log("error on obj: "+ obj.toString());
   }
 };
 
diff --git a/share/server/views.js b/share/server/views.js
index 57cdaf3a9..4fe3b7528 100644
--- a/share/server/views.js
+++ b/share/server/views.js
@@ -76,7 +76,7 @@ var Views = (function() {
     } else if (err.name == "InternalError") {
       throw(["fatal", err.name, err.message]);
     }
-    var message = "function raised exception " + err.toSource();
+    var message = "function raised exception " + errstr(err);
     if (doc) message += " with doc._id " + doc._id;
     log(message);
   };
diff --git a/src/chttpd/src/chttpd_node.erl b/src/chttpd/src/chttpd_node.erl
index ef586e174..a1c2d275f 100644
--- a/src/chttpd/src/chttpd_node.erl
+++ b/src/chttpd/src/chttpd_node.erl
@@ -45,6 +45,17 @@ handle_node_req(#httpd{method = 'GET', path_parts = [_, _Node, <<"_versions">>]}
     UcaVer = couch_ejson_compare:get_uca_version(),
     ColVer = couch_ejson_compare:get_collator_version(),
     Hashes = crypto:supports(hashs),
+    EngineName = couch_server:get_js_engine(),
+    JsEngine =
+        case EngineName of
+            <<"spidermonkey">> ->
+                #{
+                    name => EngineName,
+                    version => couch_server:get_spidermonkey_version()
+                };
+            _Other ->
+                #{name => EngineName}
+        end,
     send_json(Req, 200, #{
         erlang => #{
             version => ?l2b(?COUCHDB_ERLANG_VERSION),
@@ -56,10 +67,7 @@ handle_node_req(#httpd{method = 'GET', path_parts = [_, _Node, <<"_versions">>]}
             collation_algorithm_version => couch_util:version_to_binary(UcaVer),
             collator_version => couch_util:version_to_binary(ColVer)
         },
-        javascript_engine => #{
-            name => <<"spidermonkey">>,
-            version => couch_server:get_spidermonkey_version()
-        }
+        javascript_engine => JsEngine
     });
 handle_node_req(#httpd{path_parts = [_, _Node, <<"_versions">>]} = Req) ->
     send_method_not_allowed(Req, "GET");
diff --git a/src/couch/rebar.config.script b/src/couch/rebar.config.script
index f1682f6b7..378838129 100644
--- a/src/couch/rebar.config.script
+++ b/src/couch/rebar.config.script
@@ -56,6 +56,18 @@ CouchConfig = case filelib:is_file(os:getenv("COUCHDB_CONFIG")) of
         []
 end.
 
+JsEngine = case lists:keyfind(js_engine, 1, CouchConfig) of
+    {_, "spidermonkey"} ->
+        "spidermonkey";
+    {_, "quickjs"} ->
+        "quickjs";
+    {_, InvalidJsEngine} ->
+        io:format(standard_error, "Unsupported default JS engine ~p~n", [InvalidJsEngine]),
+        erlang:halt(1);
+    false ->
+        "spidermonkey"
+end.
+
 SMVsn = case lists:keyfind(spidermonkey_version, 1, CouchConfig) of
     {_, "1.8.5"} ->
         "1.8.5";
@@ -243,6 +255,7 @@ AddConfig = [
     {erl_opts, PlatformDefines ++ [
         {d, 'COUCHDB_VERSION', Version},
         {d, 'COUCHDB_GIT_SHA', GitSha},
+        {d, 'COUCHDB_JS_ENGINE', JsEngine},
         {d, 'COUCHDB_SPIDERMONKEY_VERSION', SMVsn},
         {i, "../"}
     ] ++ MD5Config ++ ProperConfig},
diff --git a/src/couch/src/couch.app.src b/src/couch/src/couch.app.src
index ef4e5e956..c9076d22a 100644
--- a/src/couch/src/couch.app.src
+++ b/src/couch/src/couch.app.src
@@ -48,7 +48,8 @@
         ioq,
         couch_stats,
         hyper,
-        couch_dist
+        couch_dist,
+        couch_quickjs
     ]},
     {env, [
         {httpd_global_handlers, [
diff --git a/src/couch/src/couch_proc_manager.erl b/src/couch/src/couch_proc_manager.erl
index 623734e6e..493836acf 100644
--- a/src/couch/src/couch_proc_manager.erl
+++ b/src/couch/src/couch_proc_manager.erl
@@ -145,6 +145,7 @@ init([]) ->
     ets:insert(?SERVERS, get_servers_from_env("COUCHDB_NATIVE_QUERY_SERVER_")),
     ets:insert(?SERVERS, [{"QUERY", {mango_native_proc, start_link, []}}]),
     maybe_configure_erlang_native_servers(),
+    configure_js_engine(couch_server:get_js_engine()),
 
     {ok, #state{
         config = get_proc_config(),
@@ -540,6 +541,14 @@ maybe_configure_erlang_native_servers() ->
             ok
     end.
 
+configure_js_engine(<<"quickjs">>) ->
+    ets:insert(?SERVERS, [
+        {"JAVASCRIPT", couch_quickjs:mainjs_cmd()},
+        {"COFFEESCRIPT", couch_quickjs:coffee_cmd()}
+    ]);
+configure_js_engine(<<"spidermonkey">>) ->
+    ok.
+
 new_proc_int(From, Lang) when is_binary(Lang) ->
     LangStr = binary_to_list(Lang),
     case get_query_server(LangStr) of
diff --git a/src/couch/src/couch_server.erl b/src/couch/src/couch_server.erl
index 7dbbe4af1..5c6eb587b 100644
--- a/src/couch/src/couch_server.erl
+++ b/src/couch/src/couch_server.erl
@@ -29,6 +29,7 @@
 -export([lock/2, unlock/1]).
 -export([db_updated/1]).
 -export([num_servers/0, couch_server/1, couch_dbs_pid_to_name/1, couch_dbs/1]).
+-export([get_js_engine/0]).
 -export([aggregate_queue_len/0, get_spidermonkey_version/0]).
 -export([names/0]).
 -export([try_lock/2, unlock/2]).
@@ -91,6 +92,9 @@ get_stats() ->
         lists:foldl(Fun, {0, 0}, lists:seq(1, num_servers())),
     [{start_time, ?l2b(Time)}, {dbs_open, Open}].
 
+get_js_engine() ->
+    list_to_binary(config:get("couchdb", "js_engine", ?COUCHDB_JS_ENGINE)).
+
 get_spidermonkey_version() -> list_to_binary(?COUCHDB_SPIDERMONKEY_VERSION).
 
 sup_start_link(N) ->
diff --git a/src/couch_quickjs/.gitignore b/src/couch_quickjs/.gitignore
new file mode 100644
index 000000000..41c1192cb
--- /dev/null
+++ b/src/couch_quickjs/.gitignore
@@ -0,0 +1,14 @@
+/quickjs/examples/hello
+/quickjs/examples/hello_module
+/quickjs/examples/test_fib
+/quickjs/hello.c
+/quickjs/libquickjs.a
+/quickjs/libquickjs.lto.a
+/quickjs/qjs
+/quickjs/qjsc
+/quickjs/qjscalc
+/quickjs/qjscalc.c
+/quickjs/repl.c
+/quickjs/run-test262
+/quickjs/test_fib.c
+compile_commands.json
diff --git a/src/couch_quickjs/build_js.escript b/src/couch_quickjs/build_js.escript
new file mode 100644
index 000000000..49b298fe0
--- /dev/null
+++ b/src/couch_quickjs/build_js.escript
@@ -0,0 +1,115 @@
+%% -*- tab-width: 4;erlang-indent-level: 4;indent-tabs-mode: nil -*-
+%% ex: ft=erlang ts=4 sw=4 et
+
+%% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+%% use this file except in compliance with the License. You may obtain a copy of
+%% the License at
+%%
+%%   http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+%% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+%% License for the specific language governing permissions and limitations under
+%% the License.
+%%
+%%
+
+%% Utility script to compile query server .js files into C arrays. Main JS and
+%% Coffeescript files are treated separately. First each is bundled into a
+%% single .js file. Then the file is passed to qjsc to produce the bytecode
+%% array.
+%%
+
+-export([main/1]).
+
+-mode(compile).
+
+main(["compile"]) ->
+    concat_js_files("bundle_mainjs.js", "bundle_coffee.js"),
+    Changed1 = compile_bytecode("bundle_mainjs.js", "couchjs_mainjs_bytecode.c"),
+    Changed2 = compile_bytecode("bundle_coffee.js", "couchjs_coffee_bytecode.c"),
+    case Changed1 orelse Changed2 of
+        true ->
+            % A stupid hack. The compile step is often too quick and
+            % generates .o timestamps with the same 1 second timestamp
+            % as the .c file. During dev work, it means it will
+            % re-compile and re-link everything uncessarily at least
+            % one more time as the port compiler compares timestamps
+            % with the >= operator.
+            timer:sleep(1000),
+            ok;
+        false ->
+            ok
+    end;
+main(["clean"]) ->
+    rm("priv/bundle_*.js"),
+    rm("c_src/couchjs_*_bytecode.c");
+main(Arg) ->
+    io:format(standard_error, "Expected a 'compile' or 'clean' arg. Got:~p", [Arg]),
+    halt(1).
+
+concat_js_files(JsScript, CoffeeScript) ->
+    Prefix = "../../share/server/",
+    JsFiles = [
+        "rewrite_fun.js",
+        "dreyfus.js",
+        "nouveau.js",
+        "filter.js",
+        "mimeparse.js",
+        "render.js",
+        "state.js",
+        "util.js",
+        "validate.js",
+        "views.js"
+    ],
+    Main = JsFiles ++ ["dispatch-quickjs.js"],
+    Coffee = JsFiles ++ ["coffee-script.js", "dispatch-quickjs.js"],
+    concat([Prefix ++ File || File <- Main], "priv/" ++ JsScript),
+    concat([Prefix ++ File || File <- Coffee], "priv/" ++ CoffeeScript),
+    ok.
+
+compile_bytecode(Js, CBytecode) ->
+    % cp_if_different/2 is used to to avoid triggering a re-compile if nothing changed
+    Tmp = CBytecode ++ ".tmp",
+    os:cmd("quickjs/qjsc -c -N bytecode -o c_src/" ++ Tmp ++ " priv/" ++ Js),
+    Changed = cp_if_different("c_src/" ++ Tmp, "c_src/" ++ CBytecode),
+    rm("c_src/" ++ Tmp),
+    Changed.
+
+cp_if_different(From, To) ->
+    Bin = fread(From),
+    case filelib:is_file(To) of
+        true ->
+            case fread(To) of
+                Bin ->
+                    false;
+                <<_/binary>> ->
+                    ok = fwrite(To, Bin),
+                    true
+            end;
+        false ->
+            ok = fwrite(To, Bin),
+            true
+    end.
+
+concat(Sources, Target) ->
+    SourceBins = [fread(P) || P <- Sources],
+    TargetBin =  iolist_to_binary(["(function () {\n"] ++ SourceBins ++ ["})();\n"]),
+    fwrite(Target, TargetBin).
+
+fread(Path) ->
+    {ok, Bin} = file:read_file(Path),
+    Bin.
+
+fwrite(Path, Bin) when is_binary(Bin) ->
+    ok = file:write_file(Path, Bin).
+
+rm(Path) ->
+    Fun = fun(F) ->
+        case filelib:is_file(F) of
+            true -> ok = file:delete(F);
+            false -> ok
+        end
+    end,
+    lists:foreach(Fun, filelib:wildcard(Path)).
diff --git a/src/couch_quickjs/c_src/.gitignore b/src/couch_quickjs/c_src/.gitignore
new file mode 100644
index 000000000..934c75cc0
--- /dev/null
+++ b/src/couch_quickjs/c_src/.gitignore
@@ -0,0 +1,5 @@
+/couchjs_mainjs*.c
+/couchjs_coffee*.c
+/couchjs_mainjs*.d
+/couchjs_coffee*.d
+
diff --git a/src/couch_quickjs/c_src/couchjs.c b/src/couch_quickjs/c_src/couchjs.c
new file mode 100644
index 000000000..984e2360f
--- /dev/null
+++ b/src/couch_quickjs/c_src/couchjs.c
@@ -0,0 +1,413 @@
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include "quickjs.h"
+#include "quickjs-libc.h"
+
+#define DEFAULT_STACK_SIZE (64L * 1024L * 1024L)
+#define BUF_SIZE 1024
+
+#define USAGE "couchjs [-V|-M memorylimit|-h] <script.js>\n"
+
+#define BAIL(error) {fprintf(stderr, "%s:%d %s\n", __FILE__, __LINE__, error);\
+  exit(EXIT_FAILURE);\
+}
+#define BAILJS(cx, error) {fprintf(stderr, "%s:%d %s\n", __FILE__, __LINE__, error);\
+  js_std_dump_error(cx);\
+  exit(EXIT_FAILURE);\
+}
+
+// These are auto-generated by qjsc
+extern const uint32_t bytecode_size;
+extern const uint8_t bytecode[];
+
+typedef struct {
+    int             stack_size;
+} couch_args;
+
+typedef enum {CMD_EMPTY, CMD_DDOC, CMD_RESET, CMD_LIST, CMD_VIEW} CmdType;
+
+static void parse_args(int argc, const char* argv[], couch_args* args)
+{
+    int i = 1;
+    while(i < argc) {
+        if (strncmp("-h", argv[i], 2) == 0) {
+            fprintf(stderr, USAGE);
+            exit(0);
+        } else if (strncmp("-M", argv[i], 2) == 0) {
+            args->stack_size = atoi(argv[++i]);
+            if (args->stack_size <= 1L * 1024L * 1024L) {
+              BAIL("Invalid stack size");
+            }
+        } else if (strncmp("-V", argv[i], 2) == 0) {
+            fprintf(stderr, "quickjs\n");
+            exit(0);
+        } else {
+            break;
+        }
+        i++;
+    }
+}
+
+// Parse the command type. We only care about resets, ddoc operations and
+// making sure wires were not crossed and we ended up in a list streaming
+// sub-command state somehow. See protocol description at:
+//   https://docs.couchdb.org/en/stable/query-server/protocol.html
+//
+static CmdType parse_command(char* str, size_t len) {
+  if (len == 0) {
+    return CMD_EMPTY;
+  }
+  if (len >= 8  && strncmp("[\"reset\"", str, 8) == 0) {
+    return CMD_RESET;
+  }
+  if (len >= 7  && strncmp("[\"ddoc\"", str, 7) == 0) {
+    return CMD_DDOC;
+  }
+  if (len >= 11 && strncmp("[\"list_row\"", str, 11) == 0) {
+    return CMD_LIST;
+  }
+  if (len >= 11 && strncmp("[\"list_end\"", str, 11) == 0) {
+    return CMD_LIST;
+  }
+  return CMD_VIEW;
+}
+
+static void add_cx_methods(JSContext* cx) {
+  //TODO: configure some features with env vars or command line switches
+  JS_AddIntrinsicBaseObjects(cx);
+  JS_AddIntrinsicEval(cx);
+  JS_AddIntrinsicJSON(cx);
+  JS_AddIntrinsicRegExp(cx);
+  JS_AddIntrinsicMapSet(cx);
+  JS_AddIntrinsicDate(cx);
+}
+
+// Creates a new JSContext with only the provided sandbox function
+// in its global. Make sure to free the context when done with it.
+//
+static JSContext* make_sandbox(JSContext* cx, JSValue sbox) {
+   JSContext *cx1 = JS_NewContextRaw(JS_GetRuntime(cx));
+   if(!cx1) {
+     return NULL;
+   }
+   add_cx_methods(cx1);
+   JSValue global = JS_GetGlobalObject(cx1);
+
+   int i;
+   JSPropertyEnum *tab;
+   uint32_t tablen;
+   JSValue prop_val;
+
+   int prop_flags = JS_GPN_STRING_MASK | JS_GPN_ENUM_ONLY;
+   if(JS_GetOwnPropertyNames(cx, &tab, &tablen, sbox, prop_flags) < 0){
+     JS_FreeContext(cx1);
+     return NULL;
+   }
+   for(i=0; i < tablen; i++) {
+     prop_val = JS_GetProperty(cx, sbox, tab[i].atom);
+     if (JS_IsException(prop_val)) {
+       goto exception;
+     }
+     JS_SetProperty(cx1, global, tab[i].atom, prop_val);
+   }
+
+   for(i=0; i < tablen; i++) {
+     JS_FreeAtom(cx, tab[i].atom);
+   }
+
+   js_free(cx, tab);
+   JS_FreeValue(cx1, global);
+   return cx1;
+
+exception:
+  for(i = 0; i < tablen; i++) {
+    JS_FreeAtom(cx, tab[i].atom);
+  }
+  js_free(cx, tab);
+  JS_FreeValue(cx1, global);
+  JS_FreeContext(cx1);
+  return NULL;
+}
+
+// This is mostly for test compatibility between engines, and
+// some anti-footgun help for user code. For real sandboxing we rely on
+// destroying and re-creating the whole JSRuntime instance.
+//
+static JSValue js_evalcx(JSContext* cx, JSValueConst this_val, int argc, JSValueConst *argv)
+{
+    size_t strlen;
+    const char *str;
+    const char *name;
+
+    if (argc != 3) {
+      return JS_EXCEPTION;
+    }
+
+    if(!JS_IsObject(argv[1])) {
+      return JS_EXCEPTION;
+    }
+    JSValue sbox = argv[1];
+
+    str = JS_ToCStringLen(cx, &strlen, argv[0]);
+    if(!str) {
+      return JS_EXCEPTION;
+    }
+
+    name = JS_ToCString(cx, argv[2]);
+    if(!name) {
+      JS_FreeCString(cx, str);
+      return JS_EXCEPTION;
+    }
+
+    JSContext *cx1 = make_sandbox(cx, sbox);
+    if(!cx1) {
+      JS_FreeCString(cx, str);
+      JS_FreeCString(cx, name);
+      return JS_EXCEPTION;
+    }
+
+    int flags = JS_EVAL_TYPE_GLOBAL | JS_EVAL_FLAG_BACKTRACE_BARRIER;
+    JSValue res = JS_Eval(cx1, str, strlen, name, flags);
+
+    JS_FreeCString(cx, str);
+    JS_FreeCString(cx, name);
+    JS_FreeContext(cx1);
+    return res;
+}
+
+// Once list/show features are gone, could avoid this too and just have the
+// dispatch return the list of rows as a response. That way JS can entirely
+// avoid any IO logic, only take rows and return rows in a simple
+// request/response manner.
+//
+static JSValue js_print(JSContext* cx, JSValueConst this_val, int argc, JSValueConst *argv)
+{
+    const char *str;
+    if (argc == 1) {
+      str = JS_ToCString(cx, argv[0]);
+      if (!str) {
+        return JS_UNDEFINED;
+      }
+      fputs(str, stdout);
+      JS_FreeCString(cx, str);
+    } else if (argc > 1) {
+      return JS_EXCEPTION;
+    }
+    fputc('\n', stdout);
+    fflush(stdout);
+    return JS_UNDEFINED;
+}
+
+//TODO: remove when lists/show are gone. The only reason to have this function
+//is to support getRow() for lists.
+//
+static JSValue js_readline(JSContext* cx, JSValueConst this_val, int argc, JSValueConst *argv)
+{
+    if (argc != 0) return JS_EXCEPTION;
+
+    JSValue res;
+    size_t linemax = BUF_SIZE;
+    char* line = malloc(linemax);
+    if(!line) {
+      BAIL("Could not allocate line buffer for list sub-command");
+    }
+    int len;
+    if ((len = getline(&line, &linemax, stdin)) != -1) {
+      if (line[len - 1] != '\n') {
+        BAIL("list getline() didn't end in newline");
+      }
+      line[--len] = '\0'; // don't care about the last \n so shorten the string
+      switch (parse_command(line, len)) {
+        case CMD_LIST:
+          res = JS_NewStringLen(cx, (const char*)line, len);
+          break;
+        case CMD_EMPTY:
+          res = JS_NewString(cx, "");
+          break;
+        default:
+          BAIL("unexpected command during list subcommand mode");
+      }
+      free(line);
+      return res;
+    } else {
+      free(line);
+      return JS_EXCEPTION;
+    }
+}
+
+// TODO: This may not be needed. Mainly for SM API compat to minimize main.js differences
+//
+static JSValue js_gc(JSContext* cx, JSValueConst this_val, int argc, JSValueConst *argv)
+{
+    if (argc != 0) {
+      return JS_EXCEPTION;
+    }
+    JS_RunGC(JS_GetRuntime(cx));
+    return JS_UNDEFINED;
+}
+
+static void free_cx(JSContext* cx) {
+  if (cx == NULL) {
+    return;
+  }
+  JSRuntime* rt = JS_GetRuntime(cx);
+  if (rt == NULL) {
+    BAIL("JSRuntime is unexpectedly NULL");
+  }
+  JS_FreeContext(cx);
+  JS_FreeRuntime(rt);
+}
+
+static JSContext* new_cx(const couch_args* args) {
+  JSRuntime* rt;
+  JSContext* cx;
+
+  rt = JS_NewRuntime();
+  if (rt == NULL) {
+    BAIL("Could not create JSRuntime");
+  }
+
+  JS_SetMemoryLimit(rt, args->stack_size);
+  JS_SetMaxStackSize(rt, args->stack_size);
+
+  cx = JS_NewContextRaw(rt);
+  if (cx == NULL) {
+    BAIL("Could not create JSContext");
+  }
+
+  add_cx_methods(cx);
+  return cx;
+}
+
+// This is what we rely on for sandboxing. On a reset command, blow away
+// the whole runtime instance and re-create it by re-evaluating the bytecode again
+// in a new instance.
+//
+static JSContext* reset_cx(const couch_args* args, JSContext *cx) {
+  JSValue global, obj, val;
+
+  free_cx(cx);
+  cx = new_cx(args);
+
+  global = JS_GetGlobalObject(cx);
+  JS_SetPropertyStr(cx, global, "print",    JS_NewCFunction(cx, js_print,   "print",    1));
+  JS_SetPropertyStr(cx, global, "readline", JS_NewCFunction(cx, js_readline,"readline", 0));
+  JS_SetPropertyStr(cx, global, "gc",       JS_NewCFunction(cx, js_gc,      "gc",       0));
+  JS_SetPropertyStr(cx, global, "evalcx",   JS_NewCFunction(cx, js_evalcx,  "evalcx",   3));
+
+  obj = JS_ReadObject(cx, bytecode, bytecode_size,  JS_READ_OBJ_BYTECODE);
+  if (JS_IsException(obj)) {
+    BAILJS(cx, "Error reading bytecode");
+  }
+  val = JS_EvalFunction(cx, obj); // this calls auto-frees obj
+  if (JS_IsException(val)) {
+    BAILJS(cx, "Error evaluating bytecode");
+  }
+  JS_FreeValue(cx, val);
+  JS_FreeValue(cx, global);
+  return cx;
+}
+
+// Dispatch a single command line to the engine. If it weren't for list functions we could have
+// made it return the responses as a result too.
+//
+// The result is a boolean value indicating whether to continue processing or stop and exit.
+//
+static bool dispatch(JSContext* cx, char* str, size_t len) {
+  JSValue global = JS_GetGlobalObject(cx);
+
+  JSValue fun = JS_GetPropertyStr(cx, global, "dispatch");
+  if (JS_IsException(fun)) {
+    BAILJS(cx, "Could not find main dispatch function");
+  }
+  if (!JS_IsFunction(cx, fun)) {
+    BAIL("dispatch is not a function");
+  }
+
+  JSValue argv[] = {JS_NewStringLen(cx, str, len)};
+  JSValue jres = JS_Call(cx, fun, global, 1, argv);
+  if (JS_IsException(jres)) {
+    BAILJS(cx, "couchjs internal error");
+  }
+  if (!JS_IsBool(jres)) {
+    BAIL("dispatch didn't return boolean value");
+  }
+  bool res = JS_VALUE_GET_BOOL(jres);
+
+  JS_FreeValue(cx, jres);
+  JS_FreeValue(cx, argv[0]);
+  JS_FreeValue(cx, fun);
+  JS_FreeValue(cx, global);
+
+  return res;
+}
+
+int main(int argc, const char* argv[])
+{
+    JSContext* view_cx = NULL;
+    JSContext* ddoc_cx = NULL;
+
+    couch_args args = {.stack_size = DEFAULT_STACK_SIZE};
+    parse_args(argc, argv, &args);
+    //load_bytecode(&args);
+
+    size_t linemax = BUF_SIZE;
+    char* line = malloc(linemax);
+    if (!line) {
+      BAIL("Could not allocate line buffer");
+    }
+
+    int len;
+    bool do_continue = true;
+    while (do_continue && (len = getline(&line, &linemax, stdin)) != -1) {
+      if (line[len - 1] != '\n') {
+        BAIL("getline() didn't end in newline");
+      }
+      line[--len] = '\0'; // don't care about the last \n so shorten the string
+      switch (parse_command(line, len)) {
+          case CMD_RESET:
+            view_cx = reset_cx(&args, view_cx);
+            do_continue = dispatch(view_cx, line, len);
+            break;
+          case CMD_DDOC:
+            if (ddoc_cx == NULL) {
+              ddoc_cx = reset_cx(&args, NULL);
+            }
+            do_continue = dispatch(ddoc_cx, line, len);
+            break;
+          case CMD_VIEW:
+            if (view_cx == NULL) {
+              view_cx = reset_cx(&args, NULL);
+            }
+            do_continue = dispatch(view_cx, line, len);
+            break;
+         case CMD_EMPTY:
+            do_continue = false;
+            break;
+         case CMD_LIST:
+            BAIL("unexpected list subcommand in the main command loop");
+      }
+    }
+
+    free_cx(view_cx);
+    free_cx(ddoc_cx);
+    free(line);
+
+    return EXIT_SUCCESS;
+}
+
diff --git a/src/couch_quickjs/patches/01-spidermonkey-185-mode.patch b/src/couch_quickjs/patches/01-spidermonkey-185-mode.patch
new file mode 100644
index 000000000..abcb364d5
--- /dev/null
+++ b/src/couch_quickjs/patches/01-spidermonkey-185-mode.patch
@@ -0,0 +1,27 @@
+--- quickjs-master/quickjs.c	2022-03-06 13:00:24.000000000 -0500
++++ quickjs/quickjs.c	2023-03-20 22:52:22.000000000 -0400
+@@ -28692,10 +28692,24 @@
+     if (s->token.val == TOK_FUNCTION ||
+         (token_is_pseudo_keyword(s, JS_ATOM_async) &&
+          peek_token(s, TRUE) == TOK_FUNCTION)) {
++
++        if (peek_token(s, TRUE) == '(') {
++           /* Spidermonkey 1.8.5 mode: accept top function statements as expressions */
++           if (js_parse_expr(s))
++               return -1;
++           if (s->cur_func->eval_ret_idx >= 0) {
++               /* store the expression value so that it can be returned by eval() */
++               emit_op(s, OP_put_loc);
++               emit_u16(s, s->cur_func->eval_ret_idx);
++           } else {
++               emit_op(s, OP_drop); /* drop the result */
++           }
++        } else {
+         if (js_parse_function_decl(s, JS_PARSE_FUNC_STATEMENT,
+                                    JS_FUNC_NORMAL, JS_ATOM_NULL,
+                                    s->token.ptr, s->token.line_num))
+             return -1;
++        }
+     } else if (s->token.val == TOK_EXPORT && fd->module) {
+         if (js_parse_export(s))
+             return -1;
diff --git a/src/couch_quickjs/patches/02-getpropertyvalue-unitialized-read.patch b/src/couch_quickjs/patches/02-getpropertyvalue-unitialized-read.patch
new file mode 100644
index 000000000..2baab3048
--- /dev/null
+++ b/src/couch_quickjs/patches/02-getpropertyvalue-unitialized-read.patch
@@ -0,0 +1,11 @@
+--- quickjs/quickjs.c
++++ quickjs/quickjs.c
+@@ -7855,6 +7855,8 @@ static JSValue JS_GetPropertyValue(JSContext *ctx, JSValueConst this_obj,
+         uint32_t idx, len;
+         /* fast path for array access */
+         p = JS_VALUE_GET_OBJ(this_obj);
++        if (unlikely(!p->fast_array))
++            goto slow_path;
+         idx = JS_VALUE_GET_INT(prop);
+         len = (uint32_t)p->u.array.count;
+         if (unlikely(idx >= len))
diff --git a/src/couch_quickjs/patches/03-freebsd-extra-clang-patch-makefile.patch b/src/couch_quickjs/patches/03-freebsd-extra-clang-patch-makefile.patch
new file mode 100644
index 000000000..284f6fcf0
--- /dev/null
+++ b/src/couch_quickjs/patches/03-freebsd-extra-clang-patch-makefile.patch
@@ -0,0 +1,25 @@
+--- quickjs/Makefile.orig	2019-09-18 18:34:20 UTC
++++ quickjs/Makefile
+@@ -25,6 +25,9 @@
+ ifeq ($(shell uname -s),Darwin)
+ CONFIG_DARWIN=y
+ endif
++ifeq ($(shell uname -s),FreeBSD)
++CONFIG_FREEBSD=y
++endif
+ # Windows cross compilation from Linux
+ #CONFIG_WIN32=y
+ # use link time optimization (smaller and faster executables but slower build)
+@@ -38,6 +41,12 @@ ifdef CONFIG_DARWIN
+ # use clang instead of gcc
+ CONFIG_CLANG=y
+ CONFIG_DEFAULT_AR=y
++endif
++ifdef CONFIG_FREEBSD
++# use clang instead of gcc
++CONFIG_CLANG=y
++CONFIG_DEFAULT_AR=y
++CONFIG_LTO=
+ endif
+ 
+ # installation directory
diff --git a/src/couch_quickjs/patches/04-freebsd-patch-quickjs-libc.c.patch b/src/couch_quickjs/patches/04-freebsd-patch-quickjs-libc.c.patch
new file mode 100644
index 000000000..0f91084d7
--- /dev/null
+++ b/src/couch_quickjs/patches/04-freebsd-patch-quickjs-libc.c.patch
@@ -0,0 +1,18 @@
+--- quickjs/quickjs-libc.c
++++ quickjs/quickjs-libc.c
+@@ -47,8 +47,14 @@
+ #include <sys/ioctl.h>
+ #include <sys/wait.h>
+ 
+-#if defined(__APPLE__)
++#if defined(__FreeBSD__)
++extern char **environ;
++#endif
++
++#if defined(__APPLE__) || defined(__FreeBSD__)
+ typedef sig_t sighandler_t;
++#endif
++#if defined(__APPLE__)
+ #if !defined(environ)
+ #include <crt_externs.h>
+ #define environ (*_NSGetEnviron())
diff --git a/src/couch_quickjs/patches/05-stack-overflow-is-array-proxy-cve-2023-31922.patch b/src/couch_quickjs/patches/05-stack-overflow-is-array-proxy-cve-2023-31922.patch
new file mode 100644
index 000000000..e14eae750
--- /dev/null
+++ b/src/couch_quickjs/patches/05-stack-overflow-is-array-proxy-cve-2023-31922.patch
@@ -0,0 +1,16 @@
+
+--- quickjs-master/quickjs.c
++++ quickjs/quickjs.c
+@@ -45243,6 +45243,12 @@ static int js_proxy_isArray(JSContext *ctx, JSValueConst obj)
+     JSProxyData *s = JS_GetOpaque(obj, JS_CLASS_PROXY);
+     if (!s)
+         return FALSE;
++
++    if (js_check_stack_overflow(ctx->rt, 0)) {
++        JS_ThrowStackOverflow(ctx);
++        return -1;
++    }
++
+     if (s->is_revoked) {
+         JS_ThrowTypeErrorRevokedProxy(ctx);
+         return -1;
diff --git a/src/couch_quickjs/priv/.gitignore b/src/couch_quickjs/priv/.gitignore
new file mode 100644
index 000000000..e476b42e2
--- /dev/null
+++ b/src/couch_quickjs/priv/.gitignore
@@ -0,0 +1,2 @@
+bundle_*.js
+couchjs_*
diff --git a/src/couch_quickjs/quickjs/Changelog b/src/couch_quickjs/quickjs/Changelog
new file mode 100644
index 000000000..c09af91cb
--- /dev/null
+++ b/src/couch_quickjs/quickjs/Changelog
@@ -0,0 +1,148 @@
+2021-03-27:
+
+- faster Array.prototype.push and Array.prototype.unshift
+- added JS_UpdateStackTop()
+- fixed Windows console
+- misc bug fixes
+
+2020-11-08:
+
+- improved function parameter initializers
+- added std.setenv(), std.unsetenv() and std.getenviron()
+- added JS_EvalThis()
+- misc bug fixes
+
+2020-09-06:
+
+- added logical assignment operators
+- added IsHTMLDDA support
+- faster for-of loops
+- os.Worker now takes a module filename as parameter
+- qjsc: added -D option to compile dynamically loaded modules or workers
+- misc bug fixes
+
+2020-07-05:
+
+- modified JS_GetPrototype() to return a live value
+- REPL: support unicode characters larger than 16 bits
+- added os.Worker
+- improved object serialization
+- added std.parseExtJSON
+- misc bug fixes
+
+2020-04-12:
+
+- added cross realm support
+- added AggregateError and Promise.any
+- added env, uid and gid options in os.exec()
+- misc bug fixes
+
+2020-03-16:
+
+- reworked error handling in std and os libraries: suppressed I/O
+  exceptions in std FILE functions and return a positive errno value
+  when it is explicit
+- output exception messages to stderr
+- added std.loadFile(), std.strerror(), std.FILE.prototype.tello()
+- added JS_GetRuntimeOpaque(), JS_SetRuntimeOpaque(), JS_NewUint32()
+- updated to Unicode 13.0.0
+- misc bug fixes
+
+2020-01-19:
+
+- keep CONFIG_BIGNUM in the makefile
+- added os.chdir()
+- qjs: added -I option
+- more memory checks in the bignum operations
+- modified operator overloading semantics to be closer to the TC39
+  proposal
+- suppressed "use bigint" mode. Simplified "use math" mode
+- BigDecimal: changed suffix from 'd' to 'm'
+- misc bug fixes
+
+2020-01-05:
+
+- always compile the bignum code. Added '--bignum' option to qjs.
+- added BigDecimal
+- added String.prototype.replaceAll
+- misc bug fixes
+
+2019-12-21:
+
+- added nullish coalescing operator (ES2020)
+- added optional chaining (ES2020)
+- removed recursions in garbage collector
+- test stack overflow in the parser
+- improved backtrace logic
+- added JS_SetHostPromiseRejectionTracker()
+- allow exotic constructors
+- improved c++ compatibility
+- misc bug fixes
+
+2019-10-27:
+
+- added example of C class in a module (examples/test_point.js)
+- added JS_GetTypedArrayBuffer()
+- misc bug fixes
+
+2019-09-18:
+
+- added os.exec and other system calls
+- exported JS_ValueToAtom()
+- qjsc: added 'qjsc_' prefix to the generated C identifiers
+- added cross-compilation support
+- misc bug fixes
+
+2019-09-01:
+
+- added globalThis
+- documented JS_EVAL_FLAG_COMPILE_ONLY
+- added import.meta.url and import.meta.main
+- added 'debugger' statement
+- misc bug fixes
+
+2019-08-18:
+
+- added os.realpath, os.getcwd, os.mkdir, os.stat, os.lstat,
+  os.readlink, os.readdir, os.utimes, std.popen
+- module autodetection
+- added import.meta
+- misc bug fixes
+
+2019-08-10:
+
+- added public class fields and private class fields, methods and
+  accessors (TC39 proposal)
+- changed JS_ToCStringLen() prototype
+- qjsc: handle '-' in module names and modules with the same filename
+- added std.urlGet
+- exported JS_GetOwnPropertyNames() and JS_GetOwnProperty()
+- exported some bigint C functions
+- added support for eshost in run-test262
+- misc bug fixes
+
+2019-07-28:
+
+- added dynamic import
+- added Promise.allSettled
+- added String.prototype.matchAll
+- added Object.fromEntries
+- reduced number of ticks in await
+- added BigInt support in Atomics
+- exported JS_NewPromiseCapability()
+- misc async function and async generator fixes
+- enabled hashbang support by default
+
+2019-07-21:
+
+- updated test262 tests
+- updated to Unicode version 12.1.0
+- fixed missing Date object in qjsc
+- fixed multi-context creation
+- misc ES2020 related fixes
+- simplified power and division operators in bignum extension
+- fixed several crash conditions
+
+2019-07-09:
+
+- first public release
diff --git a/src/couch_quickjs/quickjs/LICENSE b/src/couch_quickjs/quickjs/LICENSE
new file mode 100644
index 000000000..2c8fdebaf
--- /dev/null
+++ b/src/couch_quickjs/quickjs/LICENSE
@@ -0,0 +1,22 @@
+QuickJS Javascript Engine
+ 
+Copyright (c) 2017-2021 Fabrice Bellard
+Copyright (c) 2017-2021 Charlie Gordon
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/src/couch_quickjs/quickjs/Makefile b/src/couch_quickjs/quickjs/Makefile
new file mode 100644
index 000000000..91b53d544
--- /dev/null
+++ b/src/couch_quickjs/quickjs/Makefile
@@ -0,0 +1,479 @@
+#
+# QuickJS Javascript Engine
+# 
+# Copyright (c) 2017-2021 Fabrice Bellard
+# Copyright (c) 2017-2021 Charlie Gordon
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+ifeq ($(shell uname -s),Darwin)
+CONFIG_DARWIN=y
+endif
+ifeq ($(shell uname -s),FreeBSD)
+CONFIG_FREEBSD=y
+endif
+# Windows cross compilation from Linux
+#CONFIG_WIN32=y
+# use link time optimization (smaller and faster executables but slower build)
+CONFIG_LTO=y
+# consider warnings as errors (for development)
+#CONFIG_WERROR=y
+# force 32 bit build for some utilities
+#CONFIG_M32=y
+
+ifdef CONFIG_DARWIN
+# use clang instead of gcc
+CONFIG_CLANG=y
+CONFIG_DEFAULT_AR=y
+endif
+ifdef CONFIG_FREEBSD
+# use clang instead of gcc
+CONFIG_CLANG=y
+CONFIG_DEFAULT_AR=y
+CONFIG_LTO=
+endif
+
+# installation directory
+prefix=/usr/local
+
+# use the gprof profiler
+#CONFIG_PROFILE=y
+# use address sanitizer
+#CONFIG_ASAN=y
+# include the code for BigInt/BigFloat/BigDecimal and math mode
+CONFIG_BIGNUM=y
+
+OBJDIR=.obj
+
+ifdef CONFIG_WIN32
+  ifdef CONFIG_M32
+    CROSS_PREFIX=i686-w64-mingw32-
+  else
+    CROSS_PREFIX=x86_64-w64-mingw32-
+  endif
+  EXE=.exe
+else
+  CROSS_PREFIX=
+  EXE=
+endif
+ifdef CONFIG_CLANG
+  HOST_CC=clang
+  CC=$(CROSS_PREFIX)clang
+  CFLAGS=-g -Wall -MMD -MF $(OBJDIR)/$(@F).d
+  CFLAGS += -Wextra
+  CFLAGS += -Wno-sign-compare
+  CFLAGS += -Wno-missing-field-initializers
+  CFLAGS += -Wundef -Wuninitialized
+  CFLAGS += -Wunused -Wno-unused-parameter
+  CFLAGS += -Wwrite-strings
+  CFLAGS += -Wchar-subscripts -funsigned-char
+  CFLAGS += -MMD -MF $(OBJDIR)/$(@F).d
+  ifdef CONFIG_DEFAULT_AR
+    AR=$(CROSS_PREFIX)ar
+  else
+    ifdef CONFIG_LTO
+      AR=$(CROSS_PREFIX)llvm-ar
+    else
+      AR=$(CROSS_PREFIX)ar
+    endif
+  endif
+else
+  HOST_CC=gcc
+  CC=$(CROSS_PREFIX)gcc
+  CFLAGS=-g -Wall -MMD -MF $(OBJDIR)/$(@F).d
+  CFLAGS += -Wno-array-bounds -Wno-format-truncation
+  ifdef CONFIG_LTO
+    AR=$(CROSS_PREFIX)gcc-ar
+  else
+    AR=$(CROSS_PREFIX)ar
+  endif
+endif
+STRIP=$(CROSS_PREFIX)strip
+ifdef CONFIG_WERROR
+CFLAGS+=-Werror
+endif
+DEFINES:=-D_GNU_SOURCE -DCONFIG_VERSION=\"$(shell cat VERSION)\"
+ifdef CONFIG_BIGNUM
+DEFINES+=-DCONFIG_BIGNUM
+endif
+ifdef CONFIG_WIN32
+DEFINES+=-D__USE_MINGW_ANSI_STDIO # for standard snprintf behavior
+endif
+
+CFLAGS+=$(DEFINES)
+CFLAGS_DEBUG=$(CFLAGS) -O0
+CFLAGS_SMALL=$(CFLAGS) -Os
+CFLAGS_OPT=$(CFLAGS) -O2
+CFLAGS_NOLTO:=$(CFLAGS_OPT)
+LDFLAGS=-g
+ifdef CONFIG_LTO
+CFLAGS_SMALL+=-flto
+CFLAGS_OPT+=-flto
+LDFLAGS+=-flto
+endif
+ifdef CONFIG_PROFILE
+CFLAGS+=-p
+LDFLAGS+=-p
+endif
+ifdef CONFIG_ASAN
+CFLAGS+=-fsanitize=address -fno-omit-frame-pointer
+LDFLAGS+=-fsanitize=address -fno-omit-frame-pointer
+endif
+ifdef CONFIG_WIN32
+LDEXPORT=
+else
+LDEXPORT=-rdynamic
+endif
+
+PROGS=qjs$(EXE) qjsc$(EXE) run-test262
+ifneq ($(CROSS_PREFIX),)
+QJSC_CC=gcc
+QJSC=./host-qjsc
+PROGS+=$(QJSC)
+else
+QJSC_CC=$(CC)
+QJSC=./qjsc$(EXE)
+endif
+ifndef CONFIG_WIN32
+PROGS+=qjscalc
+endif
+ifdef CONFIG_M32
+PROGS+=qjs32 qjs32_s
+endif
+PROGS+=libquickjs.a
+ifdef CONFIG_LTO
+PROGS+=libquickjs.lto.a
+endif
+
+# examples
+ifeq ($(CROSS_PREFIX),)
+ifdef CONFIG_ASAN
+PROGS+=
+else
+PROGS+=examples/hello examples/hello_module examples/test_fib
+ifndef CONFIG_DARWIN
+PROGS+=examples/fib.so examples/point.so
+endif
+endif
+endif
+
+all: $(OBJDIR) $(OBJDIR)/quickjs.check.o $(OBJDIR)/qjs.check.o $(PROGS)
+
+QJS_LIB_OBJS=$(OBJDIR)/quickjs.o $(OBJDIR)/libregexp.o $(OBJDIR)/libunicode.o $(OBJDIR)/cutils.o $(OBJDIR)/quickjs-libc.o
+
+QJS_OBJS=$(OBJDIR)/qjs.o $(OBJDIR)/repl.o $(QJS_LIB_OBJS)
+ifdef CONFIG_BIGNUM
+QJS_LIB_OBJS+=$(OBJDIR)/libbf.o 
+QJS_OBJS+=$(OBJDIR)/qjscalc.o
+endif
+
+HOST_LIBS=-lm -ldl -lpthread
+LIBS=-lm
+ifndef CONFIG_WIN32
+LIBS+=-ldl -lpthread
+endif
+LIBS+=$(EXTRA_LIBS)
+
+$(OBJDIR):
+	mkdir -p $(OBJDIR) $(OBJDIR)/examples $(OBJDIR)/tests
+
+qjs$(EXE): $(QJS_OBJS)
+	$(CC) $(LDFLAGS) $(LDEXPORT) -o $@ $^ $(LIBS)
+
+qjs-debug$(EXE): $(patsubst %.o, %.debug.o, $(QJS_OBJS))
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+
+qjsc$(EXE): $(OBJDIR)/qjsc.o $(QJS_LIB_OBJS)
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+
+ifneq ($(CROSS_PREFIX),)
+
+$(QJSC): $(OBJDIR)/qjsc.host.o \
+    $(patsubst %.o, %.host.o, $(QJS_LIB_OBJS))
+	$(HOST_CC) $(LDFLAGS) -o $@ $^ $(HOST_LIBS)
+
+endif #CROSS_PREFIX
+
+QJSC_DEFINES:=-DCONFIG_CC=\"$(QJSC_CC)\" -DCONFIG_PREFIX=\"$(prefix)\"
+ifdef CONFIG_LTO
+QJSC_DEFINES+=-DCONFIG_LTO
+endif
+QJSC_HOST_DEFINES:=-DCONFIG_CC=\"$(HOST_CC)\" -DCONFIG_PREFIX=\"$(prefix)\"
+
+$(OBJDIR)/qjsc.o: CFLAGS+=$(QJSC_DEFINES)
+$(OBJDIR)/qjsc.host.o: CFLAGS+=$(QJSC_HOST_DEFINES)
+
+qjs32: $(patsubst %.o, %.m32.o, $(QJS_OBJS))
+	$(CC) -m32 $(LDFLAGS) $(LDEXPORT) -o $@ $^ $(LIBS)
+
+qjs32_s: $(patsubst %.o, %.m32s.o, $(QJS_OBJS))
+	$(CC) -m32 $(LDFLAGS) -o $@ $^ $(LIBS)
+	@size $@
+
+qjscalc: qjs
+	ln -sf $< $@
+
+ifdef CONFIG_LTO
+LTOEXT=.lto
+else
+LTOEXT=
+endif
+
+libquickjs$(LTOEXT).a: $(QJS_LIB_OBJS)
+	$(AR) rcs $@ $^
+
+ifdef CONFIG_LTO
+libquickjs.a: $(patsubst %.o, %.nolto.o, $(QJS_LIB_OBJS))
+	$(AR) rcs $@ $^
+endif # CONFIG_LTO
+
+repl.c: $(QJSC) repl.js
+	$(QJSC) -c -o $@ -m repl.js
+
+qjscalc.c: $(QJSC) qjscalc.js
+	$(QJSC) -fbignum -c -o $@ qjscalc.js
+
+ifneq ($(wildcard unicode/UnicodeData.txt),)
+$(OBJDIR)/libunicode.o $(OBJDIR)/libunicode.m32.o $(OBJDIR)/libunicode.m32s.o \
+    $(OBJDIR)/libunicode.nolto.o: libunicode-table.h
+
+libunicode-table.h: unicode_gen
+	./unicode_gen unicode $@
+endif
+
+run-test262: $(OBJDIR)/run-test262.o $(QJS_LIB_OBJS)
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+
+run-test262-debug: $(patsubst %.o, %.debug.o, $(OBJDIR)/run-test262.o $(QJS_LIB_OBJS))
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+
+run-test262-32: $(patsubst %.o, %.m32.o, $(OBJDIR)/run-test262.o $(QJS_LIB_OBJS))
+	$(CC) -m32 $(LDFLAGS) -o $@ $^ $(LIBS)
+
+# object suffix order: nolto, [m32|m32s]
+
+$(OBJDIR)/%.o: %.c | $(OBJDIR)
+	$(CC) $(CFLAGS_OPT) -c -o $@ $<
+
+$(OBJDIR)/%.host.o: %.c | $(OBJDIR)
+	$(HOST_CC) $(CFLAGS_OPT) -c -o $@ $<
+
+$(OBJDIR)/%.pic.o: %.c | $(OBJDIR)
+	$(CC) $(CFLAGS_OPT) -fPIC -DJS_SHARED_LIBRARY -c -o $@ $<
+
+$(OBJDIR)/%.nolto.o: %.c | $(OBJDIR)
+	$(CC) $(CFLAGS_NOLTO) -c -o $@ $<
+
+$(OBJDIR)/%.m32.o: %.c | $(OBJDIR)
+	$(CC) -m32 $(CFLAGS_OPT) -c -o $@ $<
+
+$(OBJDIR)/%.m32s.o: %.c | $(OBJDIR)
+	$(CC) -m32 $(CFLAGS_SMALL) -c -o $@ $<
+
+$(OBJDIR)/%.debug.o: %.c | $(OBJDIR)
+	$(CC) $(CFLAGS_DEBUG) -c -o $@ $<
+
+$(OBJDIR)/%.check.o: %.c | $(OBJDIR)
+	$(CC) $(CFLAGS) -DCONFIG_CHECK_JSVALUE -c -o $@ $<
+
+regexp_test: libregexp.c libunicode.c cutils.c
+	$(CC) $(LDFLAGS) $(CFLAGS) -DTEST -o $@ libregexp.c libunicode.c cutils.c $(LIBS)
+
+unicode_gen: $(OBJDIR)/unicode_gen.host.o $(OBJDIR)/cutils.host.o libunicode.c unicode_gen_def.h
+	$(HOST_CC) $(LDFLAGS) $(CFLAGS) -o $@ $(OBJDIR)/unicode_gen.host.o $(OBJDIR)/cutils.host.o
+
+clean:
+	rm -f repl.c qjscalc.c out.c
+	rm -f *.a *.o *.d *~ unicode_gen regexp_test $(PROGS)
+	rm -f hello.c test_fib.c
+	rm -f examples/*.so tests/*.so
+	rm -rf $(OBJDIR)/ *.dSYM/ qjs-debug
+	rm -rf run-test262-debug run-test262-32
+
+install: all
+	mkdir -p "$(DESTDIR)$(prefix)/bin"
+	$(STRIP) qjs qjsc
+	install -m755 qjs qjsc "$(DESTDIR)$(prefix)/bin"
+	ln -sf qjs "$(DESTDIR)$(prefix)/bin/qjscalc"
+	mkdir -p "$(DESTDIR)$(prefix)/lib/quickjs"
+	install -m644 libquickjs.a "$(DESTDIR)$(prefix)/lib/quickjs"
+ifdef CONFIG_LTO
+	install -m644 libquickjs.lto.a "$(DESTDIR)$(prefix)/lib/quickjs"
+endif
+	mkdir -p "$(DESTDIR)$(prefix)/include/quickjs"
+	install -m644 quickjs.h quickjs-libc.h "$(DESTDIR)$(prefix)/include/quickjs"
+
+###############################################################################
+# examples
+
+# example of static JS compilation
+HELLO_SRCS=examples/hello.js
+HELLO_OPTS=-fno-string-normalize -fno-map -fno-promise -fno-typedarray \
+           -fno-typedarray -fno-regexp -fno-json -fno-eval -fno-proxy \
+           -fno-date -fno-module-loader
+ifdef CONFIG_BIGNUM
+HELLO_OPTS+=-fno-bigint
+endif
+
+hello.c: $(QJSC) $(HELLO_SRCS)
+	$(QJSC) -e $(HELLO_OPTS) -o $@ $(HELLO_SRCS)
+
+ifdef CONFIG_M32
+examples/hello: $(OBJDIR)/hello.m32s.o $(patsubst %.o, %.m32s.o, $(QJS_LIB_OBJS))
+	$(CC) -m32 $(LDFLAGS) -o $@ $^ $(LIBS)
+else
+examples/hello: $(OBJDIR)/hello.o $(QJS_LIB_OBJS)
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+endif
+
+# example of static JS compilation with modules
+HELLO_MODULE_SRCS=examples/hello_module.js
+HELLO_MODULE_OPTS=-fno-string-normalize -fno-map -fno-promise -fno-typedarray \
+           -fno-typedarray -fno-regexp -fno-json -fno-eval -fno-proxy \
+           -fno-date -m
+examples/hello_module: $(QJSC) libquickjs$(LTOEXT).a $(HELLO_MODULE_SRCS)
+	$(QJSC) $(HELLO_MODULE_OPTS) -o $@ $(HELLO_MODULE_SRCS)
+
+# use of an external C module (static compilation)
+
+test_fib.c: $(QJSC) examples/test_fib.js
+	$(QJSC) -e -M examples/fib.so,fib -m -o $@ examples/test_fib.js
+
+examples/test_fib: $(OBJDIR)/test_fib.o $(OBJDIR)/examples/fib.o libquickjs$(LTOEXT).a
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+
+examples/fib.so: $(OBJDIR)/examples/fib.pic.o
+	$(CC) $(LDFLAGS) -shared -o $@ $^
+
+examples/point.so: $(OBJDIR)/examples/point.pic.o
+	$(CC) $(LDFLAGS) -shared -o $@ $^
+
+###############################################################################
+# documentation
+
+DOCS=doc/quickjs.pdf doc/quickjs.html doc/jsbignum.pdf doc/jsbignum.html 
+
+build_doc: $(DOCS)
+
+clean_doc: 
+	rm -f $(DOCS)
+
+doc/%.pdf: doc/%.texi
+	texi2pdf --clean -o $@ -q $<
+
+doc/%.html.pre: doc/%.texi
+	makeinfo --html --no-headers --no-split --number-sections -o $@ $<
+
+doc/%.html: doc/%.html.pre
+	sed -e 's|</style>|</style>\n<meta name="viewport" content="width=device-width, initial-scale=1.0">|' < $< > $@
+
+###############################################################################
+# tests
+
+ifndef CONFIG_DARWIN
+test: tests/bjson.so examples/point.so
+endif
+ifdef CONFIG_M32
+test: qjs32
+endif
+
+test: qjs
+	./qjs tests/test_closure.js
+	./qjs tests/test_language.js
+	./qjs tests/test_builtin.js
+	./qjs tests/test_loop.js
+	./qjs tests/test_std.js
+	./qjs tests/test_worker.js
+ifndef CONFIG_DARWIN
+ifdef CONFIG_BIGNUM
+	./qjs --bignum tests/test_bjson.js
+else
+	./qjs tests/test_bjson.js
+endif
+	./qjs examples/test_point.js
+endif
+ifdef CONFIG_BIGNUM
+	./qjs --bignum tests/test_op_overloading.js
+	./qjs --bignum tests/test_bignum.js
+	./qjs --qjscalc tests/test_qjscalc.js
+endif
+ifdef CONFIG_M32
+	./qjs32 tests/test_closure.js
+	./qjs32 tests/test_language.js
+	./qjs32 tests/test_builtin.js
+	./qjs32 tests/test_loop.js
+	./qjs32 tests/test_std.js
+	./qjs32 tests/test_worker.js
+ifdef CONFIG_BIGNUM
+	./qjs32 --bignum tests/test_op_overloading.js
+	./qjs32 --bignum tests/test_bignum.js
+	./qjs32 --qjscalc tests/test_qjscalc.js
+endif
+endif
+
+stats: qjs qjs32
+	./qjs -qd
+	./qjs32 -qd
+
+microbench: qjs
+	./qjs tests/microbench.js
+
+microbench-32: qjs32
+	./qjs32 tests/microbench.js
+
+# ES5 tests (obsolete)
+test2o: run-test262
+	time ./run-test262 -m -c test262o.conf
+
+test2o-32: run-test262-32
+	time ./run-test262-32 -m -c test262o.conf
+
+test2o-update: run-test262
+	./run-test262 -u -c test262o.conf
+
+# Test262 tests
+test2-default: run-test262
+	time ./run-test262 -m -c test262.conf
+
+test2: run-test262
+	time ./run-test262 -m -c test262.conf -a
+
+test2-32: run-test262-32
+	time ./run-test262-32 -m -c test262.conf -a
+
+test2-update: run-test262
+	./run-test262 -u -c test262.conf -a
+
+test2-check: run-test262
+	time ./run-test262 -m -c test262.conf -E -a
+
+testall: all test microbench test2o test2
+
+testall-32: all test-32 microbench-32 test2o-32 test2-32
+
+testall-complete: testall testall-32
+
+bench-v8: qjs
+	make -C tests/bench-v8
+	./qjs -d tests/bench-v8/combined.js
+
+tests/bjson.so: $(OBJDIR)/tests/bjson.pic.o
+	$(CC) $(LDFLAGS) -shared -o $@ $^ $(LIBS)
+
+-include $(wildcard $(OBJDIR)/*.d)
diff --git a/src/couch_quickjs/quickjs/VERSION b/src/couch_quickjs/quickjs/VERSION
new file mode 100644
index 000000000..22ffec184
--- /dev/null
+++ b/src/couch_quickjs/quickjs/VERSION
@@ -0,0 +1 @@
+2021-03-27
diff --git a/src/couch_quickjs/quickjs/cutils.c b/src/couch_quickjs/quickjs/cutils.c
new file mode 100644
index 000000000..a02fb7688
--- /dev/null
+++ b/src/couch_quickjs/quickjs/cutils.c
@@ -0,0 +1,631 @@
+/*
+ * C utilities
+ * 
+ * Copyright (c) 2017 Fabrice Bellard
+ * Copyright (c) 2018 Charlie Gordon
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+
+#include "cutils.h"
+
+void pstrcpy(char *buf, int buf_size, const char *str)
+{
+    int c;
+    char *q = buf;
+
+    if (buf_size <= 0)
+        return;
+
+    for(;;) {
+        c = *str++;
+        if (c == 0 || q >= buf + buf_size - 1)
+            break;
+        *q++ = c;
+    }
+    *q = '\0';
+}
+
+/* strcat and truncate. */
+char *pstrcat(char *buf, int buf_size, const char *s)
+{
+    int len;
+    len = strlen(buf);
+    if (len < buf_size)
+        pstrcpy(buf + len, buf_size - len, s);
+    return buf;
+}
+
+int strstart(const char *str, const char *val, const char **ptr)
+{
+    const char *p, *q;
+    p = str;
+    q = val;
+    while (*q != '\0') {
+        if (*p != *q)
+            return 0;
+        p++;
+        q++;
+    }
+    if (ptr)
+        *ptr = p;
+    return 1;
+}
+
+int has_suffix(const char *str, const char *suffix)
+{
+    size_t len = strlen(str);
+    size_t slen = strlen(suffix);
+    return (len >= slen && !memcmp(str + len - slen, suffix, slen));
+}
+
+/* Dynamic buffer package */
+
+static void *dbuf_default_realloc(void *opaque, void *ptr, size_t size)
+{
+    return realloc(ptr, size);
+}
+
+void dbuf_init2(DynBuf *s, void *opaque, DynBufReallocFunc *realloc_func)
+{
+    memset(s, 0, sizeof(*s));
+    if (!realloc_func)
+        realloc_func = dbuf_default_realloc;
+    s->opaque = opaque;
+    s->realloc_func = realloc_func;
+}
+
+void dbuf_init(DynBuf *s)
+{
+    dbuf_init2(s, NULL, NULL);
+}
+
+/* return < 0 if error */
+int dbuf_realloc(DynBuf *s, size_t new_size)
+{
+    size_t size;
+    uint8_t *new_buf;
+    if (new_size > s->allocated_size) {
+        if (s->error)
+            return -1;
+        size = s->allocated_size * 3 / 2;
+        if (size > new_size)
+            new_size = size;
+        new_buf = s->realloc_func(s->opaque, s->buf, new_size);
+        if (!new_buf) {
+            s->error = TRUE;
+            return -1;
+        }
+        s->buf = new_buf;
+        s->allocated_size = new_size;
+    }
+    return 0;
+}
+
+int dbuf_write(DynBuf *s, size_t offset, const uint8_t *data, size_t len)
+{
+    size_t end;
+    end = offset + len;
+    if (dbuf_realloc(s, end))
+        return -1;
+    memcpy(s->buf + offset, data, len);
+    if (end > s->size)
+        s->size = end;
+    return 0;
+}
+
+int dbuf_put(DynBuf *s, const uint8_t *data, size_t len)
+{
+    if (unlikely((s->size + len) > s->allocated_size)) {
+        if (dbuf_realloc(s, s->size + len))
+            return -1;
+    }
+    memcpy(s->buf + s->size, data, len);
+    s->size += len;
+    return 0;
+}
+
+int dbuf_put_self(DynBuf *s, size_t offset, size_t len)
+{
+    if (unlikely((s->size + len) > s->allocated_size)) {
+        if (dbuf_realloc(s, s->size + len))
+            return -1;
+    }
+    memcpy(s->buf + s->size, s->buf + offset, len);
+    s->size += len;
+    return 0;
+}
+
+int dbuf_putc(DynBuf *s, uint8_t c)
+{
+    return dbuf_put(s, &c, 1);
+}
+
+int dbuf_putstr(DynBuf *s, const char *str)
+{
+    return dbuf_put(s, (const uint8_t *)str, strlen(str));
+}
+
+int __attribute__((format(printf, 2, 3))) dbuf_printf(DynBuf *s,
+                                                      const char *fmt, ...)
+{
+    va_list ap;
+    char buf[128];
+    int len;
+    
+    va_start(ap, fmt);
+    len = vsnprintf(buf, sizeof(buf), fmt, ap);
+    va_end(ap);
+    if (len < sizeof(buf)) {
+        /* fast case */
+        return dbuf_put(s, (uint8_t *)buf, len);
+    } else {
+        if (dbuf_realloc(s, s->size + len + 1))
+            return -1;
+        va_start(ap, fmt);
+        vsnprintf((char *)(s->buf + s->size), s->allocated_size - s->size,
+                  fmt, ap);
+        va_end(ap);
+        s->size += len;
+    }
+    return 0;
+}
+
+void dbuf_free(DynBuf *s)
+{
+    /* we test s->buf as a fail safe to avoid crashing if dbuf_free()
+       is called twice */
+    if (s->buf) {
+        s->realloc_func(s->opaque, s->buf, 0);
+    }
+    memset(s, 0, sizeof(*s));
+}
+
+/* Note: at most 31 bits are encoded. At most UTF8_CHAR_LEN_MAX bytes
+   are output. */
+int unicode_to_utf8(uint8_t *buf, unsigned int c)
+{
+    uint8_t *q = buf;
+
+    if (c < 0x80) {
+        *q++ = c;
+    } else {
+        if (c < 0x800) {
+            *q++ = (c >> 6) | 0xc0;
+        } else {
+            if (c < 0x10000) {
+                *q++ = (c >> 12) | 0xe0;
+            } else {
+                if (c < 0x00200000) {
+                    *q++ = (c >> 18) | 0xf0;
+                } else {
+                    if (c < 0x04000000) {
+                        *q++ = (c >> 24) | 0xf8;
+                    } else if (c < 0x80000000) {
+                        *q++ = (c >> 30) | 0xfc;
+                        *q++ = ((c >> 24) & 0x3f) | 0x80;
+                    } else {
+                        return 0;
+                    }
+                    *q++ = ((c >> 18) & 0x3f) | 0x80;
+                }
+                *q++ = ((c >> 12) & 0x3f) | 0x80;
+            }
+            *q++ = ((c >> 6) & 0x3f) | 0x80;
+        }
+        *q++ = (c & 0x3f) | 0x80;
+    }
+    return q - buf;
+}
+
+static const unsigned int utf8_min_code[5] = {
+    0x80, 0x800, 0x10000, 0x00200000, 0x04000000,
+};
+
+static const unsigned char utf8_first_code_mask[5] = {
+    0x1f, 0xf, 0x7, 0x3, 0x1,
+};
+
+/* return -1 if error. *pp is not updated in this case. max_len must
+   be >= 1. The maximum length for a UTF8 byte sequence is 6 bytes. */
+int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp)
+{
+    int l, c, b, i;
+
+    c = *p++;
+    if (c < 0x80) {
+        *pp = p;
+        return c;
+    }
+    switch(c) {
+    case 0xc0: case 0xc1: case 0xc2: case 0xc3:
+    case 0xc4: case 0xc5: case 0xc6: case 0xc7:
+    case 0xc8: case 0xc9: case 0xca: case 0xcb:
+    case 0xcc: case 0xcd: case 0xce: case 0xcf:
+    case 0xd0: case 0xd1: case 0xd2: case 0xd3:
+    case 0xd4: case 0xd5: case 0xd6: case 0xd7:
+    case 0xd8: case 0xd9: case 0xda: case 0xdb:
+    case 0xdc: case 0xdd: case 0xde: case 0xdf:
+        l = 1;
+        break;
+    case 0xe0: case 0xe1: case 0xe2: case 0xe3:
+    case 0xe4: case 0xe5: case 0xe6: case 0xe7:
+    case 0xe8: case 0xe9: case 0xea: case 0xeb:
+    case 0xec: case 0xed: case 0xee: case 0xef:
+        l = 2;
+        break;
+    case 0xf0: case 0xf1: case 0xf2: case 0xf3:
+    case 0xf4: case 0xf5: case 0xf6: case 0xf7:
+        l = 3;
+        break;
+    case 0xf8: case 0xf9: case 0xfa: case 0xfb:
+        l = 4;
+        break;
+    case 0xfc: case 0xfd:
+        l = 5;
+        break;
+    default:
+        return -1;
+    }
+    /* check that we have enough characters */
+    if (l > (max_len - 1))
+        return -1;
+    c &= utf8_first_code_mask[l - 1];
+    for(i = 0; i < l; i++) {
+        b = *p++;
+        if (b < 0x80 || b >= 0xc0)
+            return -1;
+        c = (c << 6) | (b & 0x3f);
+    }
+    if (c < utf8_min_code[l - 1])
+        return -1;
+    *pp = p;
+    return c;
+}
+
+#if 0
+
+#if defined(EMSCRIPTEN) || defined(__ANDROID__)
+
+static void *rqsort_arg;
+static int (*rqsort_cmp)(const void *, const void *, void *);
+
+static int rqsort_cmp2(const void *p1, const void *p2)
+{
+    return rqsort_cmp(p1, p2, rqsort_arg);
+}
+
+/* not reentrant, but not needed with emscripten */
+void rqsort(void *base, size_t nmemb, size_t size,
+            int (*cmp)(const void *, const void *, void *),
+            void *arg)
+{
+    rqsort_arg = arg;
+    rqsort_cmp = cmp;
+    qsort(base, nmemb, size, rqsort_cmp2);
+}
+
+#endif
+
+#else
+
+typedef void (*exchange_f)(void *a, void *b, size_t size);
+typedef int (*cmp_f)(const void *, const void *, void *opaque);
+
+static void exchange_bytes(void *a, void *b, size_t size) {
+    uint8_t *ap = (uint8_t *)a;
+    uint8_t *bp = (uint8_t *)b;
+
+    while (size-- != 0) {
+        uint8_t t = *ap;
+        *ap++ = *bp;
+        *bp++ = t;
+    }
+}
+
+static void exchange_one_byte(void *a, void *b, size_t size) {
+    uint8_t *ap = (uint8_t *)a;
+    uint8_t *bp = (uint8_t *)b;
+    uint8_t t = *ap;
+    *ap = *bp;
+    *bp = t;
+}
+
+static void exchange_int16s(void *a, void *b, size_t size) {
+    uint16_t *ap = (uint16_t *)a;
+    uint16_t *bp = (uint16_t *)b;
+
+    for (size /= sizeof(uint16_t); size-- != 0;) {
+        uint16_t t = *ap;
+        *ap++ = *bp;
+        *bp++ = t;
+    }
+}
+
+static void exchange_one_int16(void *a, void *b, size_t size) {
+    uint16_t *ap = (uint16_t *)a;
+    uint16_t *bp = (uint16_t *)b;
+    uint16_t t = *ap;
+    *ap = *bp;
+    *bp = t;
+}
+
+static void exchange_int32s(void *a, void *b, size_t size) {
+    uint32_t *ap = (uint32_t *)a;
+    uint32_t *bp = (uint32_t *)b;
+
+    for (size /= sizeof(uint32_t); size-- != 0;) {
+        uint32_t t = *ap;
+        *ap++ = *bp;
+        *bp++ = t;
+    }
+}
+
+static void exchange_one_int32(void *a, void *b, size_t size) {
+    uint32_t *ap = (uint32_t *)a;
+    uint32_t *bp = (uint32_t *)b;
+    uint32_t t = *ap;
+    *ap = *bp;
+    *bp = t;
+}
+
+static void exchange_int64s(void *a, void *b, size_t size) {
+    uint64_t *ap = (uint64_t *)a;
+    uint64_t *bp = (uint64_t *)b;
+
+    for (size /= sizeof(uint64_t); size-- != 0;) {
+        uint64_t t = *ap;
+        *ap++ = *bp;
+        *bp++ = t;
+    }
+}
+
+static void exchange_one_int64(void *a, void *b, size_t size) {
+    uint64_t *ap = (uint64_t *)a;
+    uint64_t *bp = (uint64_t *)b;
+    uint64_t t = *ap;
+    *ap = *bp;
+    *bp = t;
+}
+
+static void exchange_int128s(void *a, void *b, size_t size) {
+    uint64_t *ap = (uint64_t *)a;
+    uint64_t *bp = (uint64_t *)b;
+
+    for (size /= sizeof(uint64_t) * 2; size-- != 0; ap += 2, bp += 2) {
+        uint64_t t = ap[0];
+        uint64_t u = ap[1];
+        ap[0] = bp[0];
+        ap[1] = bp[1];
+        bp[0] = t;
+        bp[1] = u;
+    }
+}
+
+static void exchange_one_int128(void *a, void *b, size_t size) {
+    uint64_t *ap = (uint64_t *)a;
+    uint64_t *bp = (uint64_t *)b;
+    uint64_t t = ap[0];
+    uint64_t u = ap[1];
+    ap[0] = bp[0];
+    ap[1] = bp[1];
+    bp[0] = t;
+    bp[1] = u;
+}
+
+static inline exchange_f exchange_func(const void *base, size_t size) {
+    switch (((uintptr_t)base | (uintptr_t)size) & 15) {
+    case 0:
+        if (size == sizeof(uint64_t) * 2)
+            return exchange_one_int128;
+        else
+            return exchange_int128s;
+    case 8:
+        if (size == sizeof(uint64_t))
+            return exchange_one_int64;
+        else
+            return exchange_int64s;
+    case 4:
+    case 12:
+        if (size == sizeof(uint32_t))
+            return exchange_one_int32;
+        else
+            return exchange_int32s;
+    case 2:
+    case 6:
+    case 10:
+    case 14:
+        if (size == sizeof(uint16_t))
+            return exchange_one_int16;
+        else
+            return exchange_int16s;
+    default:
+        if (size == 1)
+            return exchange_one_byte;
+        else
+            return exchange_bytes;
+    }
+}
+
+static void heapsortx(void *base, size_t nmemb, size_t size, cmp_f cmp, void *opaque)
+{
+    uint8_t *basep = (uint8_t *)base;
+    size_t i, n, c, r;
+    exchange_f swap = exchange_func(base, size);
+
+    if (nmemb > 1) {
+        i = (nmemb / 2) * size;
+        n = nmemb * size;
+
+        while (i > 0) {
+            i -= size;
+            for (r = i; (c = r * 2 + size) < n; r = c) {
+                if (c < n - size && cmp(basep + c, basep + c + size, opaque) <= 0)
+                    c += size;
+                if (cmp(basep + r, basep + c, opaque) > 0)
+                    break;
+                swap(basep + r, basep + c, size);
+            }
+        }
+        for (i = n - size; i > 0; i -= size) {
+            swap(basep, basep + i, size);
+
+            for (r = 0; (c = r * 2 + size) < i; r = c) {
+                if (c < i - size && cmp(basep + c, basep + c + size, opaque) <= 0)
+                    c += size;
+                if (cmp(basep + r, basep + c, opaque) > 0)
+                    break;
+                swap(basep + r, basep + c, size);
+            }
+        }
+    }
+}
+
+static inline void *med3(void *a, void *b, void *c, cmp_f cmp, void *opaque)
+{
+    return cmp(a, b, opaque) < 0 ?
+        (cmp(b, c, opaque) < 0 ? b : (cmp(a, c, opaque) < 0 ? c : a )) :
+        (cmp(b, c, opaque) > 0 ? b : (cmp(a, c, opaque) < 0 ? a : c ));
+}
+
+/* pointer based version with local stack and insertion sort threshold */
+void rqsort(void *base, size_t nmemb, size_t size, cmp_f cmp, void *opaque)
+{
+    struct { uint8_t *base; size_t count; int depth; } stack[50], *sp = stack;
+    uint8_t *ptr, *pi, *pj, *plt, *pgt, *top, *m;
+    size_t m4, i, lt, gt, span, span2;
+    int c, depth;
+    exchange_f swap = exchange_func(base, size);
+    exchange_f swap_block = exchange_func(base, size | 128);
+
+    if (nmemb < 2 || size <= 0)
+        return;
+
+    sp->base = (uint8_t *)base;
+    sp->count = nmemb;
+    sp->depth = 0;
+    sp++;
+
+    while (sp > stack) {
+        sp--;
+        ptr = sp->base;
+        nmemb = sp->count;
+        depth = sp->depth;
+
+        while (nmemb > 6) {
+            if (++depth > 50) {
+                /* depth check to ensure worst case logarithmic time */
+                heapsortx(ptr, nmemb, size, cmp, opaque);
+                nmemb = 0;
+                break;
+            }
+            /* select median of 3 from 1/4, 1/2, 3/4 positions */
+            /* should use median of 5 or 9? */
+            m4 = (nmemb >> 2) * size;
+            m = med3(ptr + m4, ptr + 2 * m4, ptr + 3 * m4, cmp, opaque);
+            swap(ptr, m, size);  /* move the pivot to the start of the array */
+            i = lt = 1;
+            pi = plt = ptr + size;
+            gt = nmemb;
+            pj = pgt = top = ptr + nmemb * size;
+            for (;;) {
+                while (pi < pj && (c = cmp(ptr, pi, opaque)) >= 0) {
+                    if (c == 0) {
+                        swap(plt, pi, size);
+                        lt++;
+                        plt += size;
+                    }
+                    i++;
+                    pi += size;
+                }
+                while (pi < (pj -= size) && (c = cmp(ptr, pj, opaque)) <= 0) {
+                    if (c == 0) {
+                        gt--;
+                        pgt -= size;
+                        swap(pgt, pj, size);
+                    }
+                }
+                if (pi >= pj)
+                    break;
+                swap(pi, pj, size);
+                i++;
+                pi += size;
+            }
+            /* array has 4 parts:
+             * from 0 to lt excluded: elements identical to pivot
+             * from lt to pi excluded: elements smaller than pivot
+             * from pi to gt excluded: elements greater than pivot
+             * from gt to n excluded: elements identical to pivot
+             */
+            /* move elements identical to pivot in the middle of the array: */
+            /* swap values in ranges [0..lt[ and [i-lt..i[
+               swapping the smallest span between lt and i-lt is sufficient
+             */
+            span = plt - ptr;
+            span2 = pi - plt;
+            lt = i - lt;
+            if (span > span2)
+                span = span2;
+            swap_block(ptr, pi - span, span);
+            /* swap values in ranges [gt..top[ and [i..top-(top-gt)[
+               swapping the smallest span between top-gt and gt-i is sufficient
+             */
+            span = top - pgt;
+            span2 = pgt - pi;
+            pgt = top - span2;
+            gt = nmemb - (gt - i);
+            if (span > span2)
+                span = span2;
+            swap_block(pi, top - span, span);
+
+            /* now array has 3 parts:
+             * from 0 to lt excluded: elements smaller than pivot
+             * from lt to gt excluded: elements identical to pivot
+             * from gt to n excluded: elements greater than pivot
+             */
+            /* stack the larger segment and keep processing the smaller one
+               to minimize stack use for pathological distributions */
+            if (lt > nmemb - gt) {
+                sp->base = ptr;
+                sp->count = lt;
+                sp->depth = depth;
+                sp++;
+                ptr = pgt;
+                nmemb -= gt;
+            } else {
+                sp->base = pgt;
+                sp->count = nmemb - gt;
+                sp->depth = depth;
+                sp++;
+                nmemb = lt;
+            }
+        }
+        /* Use insertion sort for small fragments */
+        for (pi = ptr + size, top = ptr + nmemb * size; pi < top; pi += size) {
+            for (pj = pi; pj > ptr && cmp(pj - size, pj, opaque) > 0; pj -= size)
+                swap(pj, pj - size, size);
+        }
+    }
+}
+
+#endif
diff --git a/src/couch_quickjs/quickjs/cutils.h b/src/couch_quickjs/quickjs/cutils.h
new file mode 100644
index 000000000..31f7cd84a
--- /dev/null
+++ b/src/couch_quickjs/quickjs/cutils.h
@@ -0,0 +1,297 @@
+/*
+ * C utilities
+ * 
+ * Copyright (c) 2017 Fabrice Bellard
+ * Copyright (c) 2018 Charlie Gordon
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef CUTILS_H
+#define CUTILS_H
+
+#include <stdlib.h>
+#include <inttypes.h>
+
+/* set if CPU is big endian */
+#undef WORDS_BIGENDIAN
+
+#define likely(x)       __builtin_expect(!!(x), 1)
+#define unlikely(x)     __builtin_expect(!!(x), 0)
+#define force_inline inline __attribute__((always_inline))
+#define no_inline __attribute__((noinline))
+#define __maybe_unused __attribute__((unused))
+
+#define xglue(x, y) x ## y
+#define glue(x, y) xglue(x, y)
+#define stringify(s)    tostring(s)
+#define tostring(s)     #s
+
+#ifndef offsetof
+#define offsetof(type, field) ((size_t) &((type *)0)->field)
+#endif
+#ifndef countof
+#define countof(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+typedef int BOOL;
+
+#ifndef FALSE
+enum {
+    FALSE = 0,
+    TRUE = 1,
+};
+#endif
+
+void pstrcpy(char *buf, int buf_size, const char *str);
+char *pstrcat(char *buf, int buf_size, const char *s);
+int strstart(const char *str, const char *val, const char **ptr);
+int has_suffix(const char *str, const char *suffix);
+
+static inline int max_int(int a, int b)
+{
+    if (a > b)
+        return a;
+    else
+        return b;
+}
+
+static inline int min_int(int a, int b)
+{
+    if (a < b)
+        return a;
+    else
+        return b;
+}
+
+static inline uint32_t max_uint32(uint32_t a, uint32_t b)
+{
+    if (a > b)
+        return a;
+    else
+        return b;
+}
+
+static inline uint32_t min_uint32(uint32_t a, uint32_t b)
+{
+    if (a < b)
+        return a;
+    else
+        return b;
+}
+
+static inline int64_t max_int64(int64_t a, int64_t b)
+{
+    if (a > b)
+        return a;
+    else
+        return b;
+}
+
+static inline int64_t min_int64(int64_t a, int64_t b)
+{
+    if (a < b)
+        return a;
+    else
+        return b;
+}
+
+/* WARNING: undefined if a = 0 */
+static inline int clz32(unsigned int a)
+{
+    return __builtin_clz(a);
+}
+
+/* WARNING: undefined if a = 0 */
+static inline int clz64(uint64_t a)
+{
+    return __builtin_clzll(a);
+}
+
+/* WARNING: undefined if a = 0 */
+static inline int ctz32(unsigned int a)
+{
+    return __builtin_ctz(a);
+}
+
+/* WARNING: undefined if a = 0 */
+static inline int ctz64(uint64_t a)
+{
+    return __builtin_ctzll(a);
+}
+
+struct __attribute__((packed)) packed_u64 {
+    uint64_t v;
+};
+
+struct __attribute__((packed)) packed_u32 {
+    uint32_t v;
+};
+
+struct __attribute__((packed)) packed_u16 {
+    uint16_t v;
+};
+
+static inline uint64_t get_u64(const uint8_t *tab)
+{
+    return ((const struct packed_u64 *)tab)->v;
+}
+
+static inline int64_t get_i64(const uint8_t *tab)
+{
+    return (int64_t)((const struct packed_u64 *)tab)->v;
+}
+
+static inline void put_u64(uint8_t *tab, uint64_t val)
+{
+    ((struct packed_u64 *)tab)->v = val;
+}
+
+static inline uint32_t get_u32(const uint8_t *tab)
+{
+    return ((const struct packed_u32 *)tab)->v;
+}
+
+static inline int32_t get_i32(const uint8_t *tab)
+{
+    return (int32_t)((const struct packed_u32 *)tab)->v;
+}
+
+static inline void put_u32(uint8_t *tab, uint32_t val)
+{
+    ((struct packed_u32 *)tab)->v = val;
+}
+
+static inline uint32_t get_u16(const uint8_t *tab)
+{
+    return ((const struct packed_u16 *)tab)->v;
+}
+
+static inline int32_t get_i16(const uint8_t *tab)
+{
+    return (int16_t)((const struct packed_u16 *)tab)->v;
+}
+
+static inline void put_u16(uint8_t *tab, uint16_t val)
+{
+    ((struct packed_u16 *)tab)->v = val;
+}
+
+static inline uint32_t get_u8(const uint8_t *tab)
+{
+    return *tab;
+}
+
+static inline int32_t get_i8(const uint8_t *tab)
+{
+    return (int8_t)*tab;
+}
+
+static inline void put_u8(uint8_t *tab, uint8_t val)
+{
+    *tab = val;
+}
+
+static inline uint16_t bswap16(uint16_t x)
+{
+    return (x >> 8) | (x << 8);
+}
+
+static inline uint32_t bswap32(uint32_t v)
+{
+    return ((v & 0xff000000) >> 24) | ((v & 0x00ff0000) >>  8) |
+        ((v & 0x0000ff00) <<  8) | ((v & 0x000000ff) << 24);
+}
+
+static inline uint64_t bswap64(uint64_t v)
+{
+    return ((v & ((uint64_t)0xff << (7 * 8))) >> (7 * 8)) | 
+        ((v & ((uint64_t)0xff << (6 * 8))) >> (5 * 8)) | 
+        ((v & ((uint64_t)0xff << (5 * 8))) >> (3 * 8)) | 
+        ((v & ((uint64_t)0xff << (4 * 8))) >> (1 * 8)) | 
+        ((v & ((uint64_t)0xff << (3 * 8))) << (1 * 8)) | 
+        ((v & ((uint64_t)0xff << (2 * 8))) << (3 * 8)) | 
+        ((v & ((uint64_t)0xff << (1 * 8))) << (5 * 8)) | 
+        ((v & ((uint64_t)0xff << (0 * 8))) << (7 * 8));
+}
+
+/* XXX: should take an extra argument to pass slack information to the caller */
+typedef void *DynBufReallocFunc(void *opaque, void *ptr, size_t size);
+
+typedef struct DynBuf {
+    uint8_t *buf;
+    size_t size;
+    size_t allocated_size;
+    BOOL error; /* true if a memory allocation error occurred */
+    DynBufReallocFunc *realloc_func;
+    void *opaque; /* for realloc_func */
+} DynBuf;
+
+void dbuf_init(DynBuf *s);
+void dbuf_init2(DynBuf *s, void *opaque, DynBufReallocFunc *realloc_func);
+int dbuf_realloc(DynBuf *s, size_t new_size);
+int dbuf_write(DynBuf *s, size_t offset, const uint8_t *data, size_t len);
+int dbuf_put(DynBuf *s, const uint8_t *data, size_t len);
+int dbuf_put_self(DynBuf *s, size_t offset, size_t len);
+int dbuf_putc(DynBuf *s, uint8_t c);
+int dbuf_putstr(DynBuf *s, const char *str);
+static inline int dbuf_put_u16(DynBuf *s, uint16_t val)
+{
+    return dbuf_put(s, (uint8_t *)&val, 2);
+}
+static inline int dbuf_put_u32(DynBuf *s, uint32_t val)
+{
+    return dbuf_put(s, (uint8_t *)&val, 4);
+}
+static inline int dbuf_put_u64(DynBuf *s, uint64_t val)
+{
+    return dbuf_put(s, (uint8_t *)&val, 8);
+}
+int __attribute__((format(printf, 2, 3))) dbuf_printf(DynBuf *s,
+                                                      const char *fmt, ...);
+void dbuf_free(DynBuf *s);
+static inline BOOL dbuf_error(DynBuf *s) {
+    return s->error;
+}
+static inline void dbuf_set_error(DynBuf *s)
+{
+    s->error = TRUE;
+}
+
+#define UTF8_CHAR_LEN_MAX 6
+
+int unicode_to_utf8(uint8_t *buf, unsigned int c);
+int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp);
+
+static inline int from_hex(int c)
+{
+    if (c >= '0' && c <= '9')
+        return c - '0';
+    else if (c >= 'A' && c <= 'F')
+        return c - 'A' + 10;
+    else if (c >= 'a' && c <= 'f')
+        return c - 'a' + 10;
+    else
+        return -1;
+}
+
+void rqsort(void *base, size_t nmemb, size_t size,
+            int (*cmp)(const void *, const void *, void *),
+            void *arg);
+
+#endif  /* CUTILS_H */
diff --git a/src/couch_quickjs/quickjs/libbf.c b/src/couch_quickjs/quickjs/libbf.c
new file mode 100644
index 000000000..fe1628e79
--- /dev/null
+++ b/src/couch_quickjs/quickjs/libbf.c
@@ -0,0 +1,8466 @@
+/*
+ * Tiny arbitrary precision floating point library
+ * 
+ * Copyright (c) 2017-2021 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+
+#ifdef __AVX2__
+#include <immintrin.h>
+#endif
+
+#include "cutils.h"
+#include "libbf.h"
+
+/* enable it to check the multiplication result */
+//#define USE_MUL_CHECK
+/* enable it to use FFT/NTT multiplication */
+#define USE_FFT_MUL
+/* enable decimal floating point support */
+#define USE_BF_DEC
+
+//#define inline __attribute__((always_inline))
+
+#ifdef __AVX2__
+#define FFT_MUL_THRESHOLD 100 /* in limbs of the smallest factor */
+#else
+#define FFT_MUL_THRESHOLD 100 /* in limbs of the smallest factor */
+#endif
+
+/* XXX: adjust */
+#define DIVNORM_LARGE_THRESHOLD 50
+#define UDIV1NORM_THRESHOLD 3
+
+#if LIMB_BITS == 64
+#define FMT_LIMB1 "%" PRIx64 
+#define FMT_LIMB "%016" PRIx64 
+#define PRId_LIMB PRId64
+#define PRIu_LIMB PRIu64
+
+#else
+
+#define FMT_LIMB1 "%x"
+#define FMT_LIMB "%08x"
+#define PRId_LIMB "d"
+#define PRIu_LIMB "u"
+
+#endif
+
+typedef intptr_t mp_size_t;
+
+typedef int bf_op2_func_t(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
+                          bf_flags_t flags);
+
+#ifdef USE_FFT_MUL
+
+#define FFT_MUL_R_OVERLAP_A (1 << 0)
+#define FFT_MUL_R_OVERLAP_B (1 << 1)
+#define FFT_MUL_R_NORESIZE  (1 << 2)
+
+static no_inline int fft_mul(bf_context_t *s,
+                             bf_t *res, limb_t *a_tab, limb_t a_len,
+                             limb_t *b_tab, limb_t b_len, int mul_flags);
+static void fft_clear_cache(bf_context_t *s);
+#endif
+#ifdef USE_BF_DEC
+static limb_t get_digit(const limb_t *tab, limb_t len, slimb_t pos);
+#endif
+
+
+/* count leading zeros */
+static inline int clz(limb_t a)
+{
+    if (a == 0) {
+        return LIMB_BITS;
+    } else {
+#if LIMB_BITS == 64
+        return clz64(a);
+#else
+        return clz32(a);
+#endif
+    }
+}
+
+static inline int ctz(limb_t a)
+{
+    if (a == 0) {
+        return LIMB_BITS;
+    } else {
+#if LIMB_BITS == 64
+        return ctz64(a);
+#else
+        return ctz32(a);
+#endif
+    }
+}
+
+static inline int ceil_log2(limb_t a)
+{
+    if (a <= 1)
+        return 0;
+    else
+        return LIMB_BITS - clz(a - 1);
+}
+
+/* b must be >= 1 */
+static inline slimb_t ceil_div(slimb_t a, slimb_t b)
+{
+    if (a >= 0)
+        return (a + b - 1) / b;
+    else
+        return a / b;
+}
+
+/* b must be >= 1 */
+static inline slimb_t floor_div(slimb_t a, slimb_t b)
+{
+    if (a >= 0) {
+        return a / b;
+    } else {
+        return (a - b + 1) / b;
+    }
+}
+
+/* return r = a modulo b (0 <= r <= b - 1). b must be >= 1 */
+static inline limb_t smod(slimb_t a, slimb_t b)
+{
+    a = a % (slimb_t)b;
+    if (a < 0)
+        a += b;
+    return a;
+}
+
+/* signed addition with saturation */
+static inline slimb_t sat_add(slimb_t a, slimb_t b)
+{
+    slimb_t r;
+    r = a + b;
+    /* overflow ? */
+    if (((a ^ r) & (b ^ r)) < 0)
+        r = (a >> (LIMB_BITS - 1)) ^ (((limb_t)1 << (LIMB_BITS - 1)) - 1);
+    return r;
+}
+
+#define malloc(s) malloc_is_forbidden(s)
+#define free(p) free_is_forbidden(p)
+#define realloc(p, s) realloc_is_forbidden(p, s)
+
+void bf_context_init(bf_context_t *s, bf_realloc_func_t *realloc_func,
+                     void *realloc_opaque)
+{
+    memset(s, 0, sizeof(*s));
+    s->realloc_func = realloc_func;
+    s->realloc_opaque = realloc_opaque;
+}
+
+void bf_context_end(bf_context_t *s)
+{
+    bf_clear_cache(s);
+}
+
+void bf_init(bf_context_t *s, bf_t *r)
+{
+    r->ctx = s;
+    r->sign = 0;
+    r->expn = BF_EXP_ZERO;
+    r->len = 0;
+    r->tab = NULL;
+}
+
+/* return 0 if OK, -1 if alloc error */
+int bf_resize(bf_t *r, limb_t len)
+{
+    limb_t *tab;
+    
+    if (len != r->len) {
+        tab = bf_realloc(r->ctx, r->tab, len * sizeof(limb_t));
+        if (!tab && len != 0)
+            return -1;
+        r->tab = tab;
+        r->len = len;
+    }
+    return 0;
+}
+
/* Set r to the unsigned 64-bit integer 'a' (exact).
   return 0 or BF_ST_MEM_ERROR (r becomes NaN on allocation failure) */
int bf_set_ui(bf_t *r, uint64_t a)
{
    r->sign = 0;
    if (a == 0) {
        r->expn = BF_EXP_ZERO;
        bf_resize(r, 0); /* cannot fail */
    } 
#if LIMB_BITS == 32
    else if (a <= 0xffffffff)
#else
    else
#endif
    {
        int shift;
        if (bf_resize(r, 1))
            goto fail;
        /* normalize so that the most significant bit of tab[0] is 1 */
        shift = clz(a);
        r->tab[0] = a << shift;
        r->expn = LIMB_BITS - shift;
    }
#if LIMB_BITS == 32
    else {
        /* value needs two 32-bit limbs */
        uint32_t a1, a0;
        int shift;
        if (bf_resize(r, 2))
            goto fail;
        a0 = a;
        a1 = a >> 32;
        shift = clz(a1);
        r->tab[0] = a0 << shift;
        r->tab[1] = (a1 << shift) | (a0 >> (LIMB_BITS - shift));
        r->expn = 2 * LIMB_BITS - shift;
    }
#endif
    return 0;
 fail:
    bf_set_nan(r);
    return BF_ST_MEM_ERROR;
}
+
+/* return 0 or BF_ST_MEM_ERROR */
+int bf_set_si(bf_t *r, int64_t a)
+{
+    int ret;
+
+    if (a < 0) {
+        ret = bf_set_ui(r, -a);
+        r->sign = 1;
+    } else {
+        ret = bf_set_ui(r, a);
+    }
+    return ret;
+}
+
+void bf_set_nan(bf_t *r)
+{
+    bf_resize(r, 0); /* cannot fail */
+    r->expn = BF_EXP_NAN;
+    r->sign = 0;
+}
+
+void bf_set_zero(bf_t *r, int is_neg)
+{
+    bf_resize(r, 0); /* cannot fail */
+    r->expn = BF_EXP_ZERO;
+    r->sign = is_neg;
+}
+
+void bf_set_inf(bf_t *r, int is_neg)
+{
+    bf_resize(r, 0); /* cannot fail */
+    r->expn = BF_EXP_INF;
+    r->sign = is_neg;
+}
+
+/* return 0 or BF_ST_MEM_ERROR */
+int bf_set(bf_t *r, const bf_t *a)
+{
+    if (r == a)
+        return 0;
+    if (bf_resize(r, a->len)) {
+        bf_set_nan(r);
+        return BF_ST_MEM_ERROR;
+    }
+    r->sign = a->sign;
+    r->expn = a->expn;
+    memcpy(r->tab, a->tab, a->len * sizeof(limb_t));
+    return 0;
+}
+
+/* equivalent to bf_set(r, a); bf_delete(a) */
+void bf_move(bf_t *r, bf_t *a)
+{
+    bf_context_t *s = r->ctx;
+    if (r == a)
+        return;
+    bf_free(s, r->tab);
+    *r = *a;
+}
+
+static limb_t get_limbz(const bf_t *a, limb_t idx)
+{
+    if (idx >= a->len)
+        return 0;
+    else
+        return a->tab[idx];
+}
+
+/* get LIMB_BITS at bit position 'pos' in tab */
+static inline limb_t get_bits(const limb_t *tab, limb_t len, slimb_t pos)
+{
+    limb_t i, a0, a1;
+    int p;
+
+    i = pos >> LIMB_LOG2_BITS;
+    p = pos & (LIMB_BITS - 1);
+    if (i < len)
+        a0 = tab[i];
+    else
+        a0 = 0;
+    if (p == 0) {
+        return a0;
+    } else {
+        i++;
+        if (i < len)
+            a1 = tab[i];
+        else
+            a1 = 0;
+        return (a0 >> p) | (a1 << (LIMB_BITS - p));
+    }
+}
+
+static inline limb_t get_bit(const limb_t *tab, limb_t len, slimb_t pos)
+{
+    slimb_t i;
+    i = pos >> LIMB_LOG2_BITS;
+    if (i < 0 || i >= len)
+        return 0;
+    return (tab[i] >> (pos & (LIMB_BITS - 1))) & 1;
+}
+
+static inline limb_t limb_mask(int start, int last)
+{
+    limb_t v;
+    int n;
+    n = last - start + 1;
+    if (n == LIMB_BITS)
+        v = -1;
+    else
+        v = (((limb_t)1 << n) - 1) << start;
+    return v;
+}
+
+static limb_t mp_scan_nz(const limb_t *tab, mp_size_t n)
+{
+    mp_size_t i;
+    for(i = 0; i < n; i++) {
+        if (tab[i] != 0)
+            return 1;
+    }
+    return 0;
+}
+
+/* return != 0 if one bit between 0 and bit_pos inclusive is not zero. */
+static inline limb_t scan_bit_nz(const bf_t *r, slimb_t bit_pos)
+{
+    slimb_t pos;
+    limb_t v;
+    
+    pos = bit_pos >> LIMB_LOG2_BITS;
+    if (pos < 0)
+        return 0;
+    v = r->tab[pos] & limb_mask(0, bit_pos & (LIMB_BITS - 1));
+    if (v != 0)
+        return 1;
+    pos--;
+    while (pos >= 0) {
+        if (r->tab[pos] != 0)
+            return 1;
+        pos--;
+    }
+    return 0;
+}
+
/* return the addend for rounding. Note that prec can be <= 0 (for
   BF_FLAG_RADPNT_PREC) */
/* 'r' holds 'l' significand limbs; 'prec' counts the kept bits from
   the most significant bit. Sets BF_ST_INEXACT in *pret when a
   discarded bit is non-zero (except in BF_RNDF mode). */
static int bf_get_rnd_add(int *pret, const bf_t *r, limb_t l,
                          slimb_t prec, int rnd_mode)
{
    int add_one, inexact;
    limb_t bit1, bit0; /* bit1 = round bit, bit0 = sticky information */
    
    if (rnd_mode == BF_RNDF) {
        bit0 = 1; /* faithful rounding does not honor the INEXACT flag */
    } else {
        /* starting limb for bit 'prec + 1' */
        /* OR of all discarded bits strictly below the round bit */
        bit0 = scan_bit_nz(r, l * LIMB_BITS - 1 - bf_max(0, prec + 1));
    }

    /* get the bit at 'prec' */
    bit1 = get_bit(r->tab, l, l * LIMB_BITS - 1 - prec);
    inexact = (bit1 | bit0) != 0;
    
    add_one = 0;
    switch(rnd_mode) {
    case BF_RNDZ:
        /* truncate: never round up */
        break;
    case BF_RNDN:
        if (bit1) {
            if (bit0) {
                add_one = 1;
            } else {
                /* round to even */
                add_one =
                    get_bit(r->tab, l, l * LIMB_BITS - 1 - (prec - 1));
            }
        }
        break;
    case BF_RNDD:
    case BF_RNDU:
        /* round away from zero only when moving in the requested
           direction */
        if (r->sign == (rnd_mode == BF_RNDD))
            add_one = inexact;
        break;
    case BF_RNDA:
        add_one = inexact;
        break;
    case BF_RNDNA:
    case BF_RNDF:
        add_one = bit1;
        break;
    default:
        abort();
    }
    
    if (inexact)
        *pret |= BF_ST_INEXACT;
    return add_one;
}
+
/* Handle overflow: depending on the rounding mode, set r to infinity
   or to the largest finite number of 'prec' bits, and return the
   resulting status flags. */
static int bf_set_overflow(bf_t *r, int sign, limb_t prec, bf_flags_t flags)
{
    slimb_t i, l, e_max;
    int rnd_mode;
    
    rnd_mode = flags & BF_RND_MASK;
    if (prec == BF_PREC_INF ||
        rnd_mode == BF_RNDN ||
        rnd_mode == BF_RNDNA ||
        rnd_mode == BF_RNDA ||
        (rnd_mode == BF_RNDD && sign == 1) ||
        (rnd_mode == BF_RNDU && sign == 0)) {
        bf_set_inf(r, sign);
    } else {
        /* set to maximum finite number */
        l = (prec + LIMB_BITS - 1) / LIMB_BITS;
        if (bf_resize(r, l)) {
            bf_set_nan(r);
            return BF_ST_MEM_ERROR;
        }
        /* lowest limb: only the top (prec mod LIMB_BITS) bits are set */
        r->tab[0] = limb_mask((-prec) & (LIMB_BITS - 1),
                              LIMB_BITS - 1);
        for(i = 1; i < l; i++)
            r->tab[i] = (limb_t)-1;
        e_max = (limb_t)1 << (bf_get_exp_bits(flags) - 1);
        r->expn = e_max;
        r->sign = sign;
    }
    return BF_ST_OVERFLOW | BF_ST_INEXACT;
}
+
/* round to prec1 bits assuming 'r' is non zero and finite. 'r' is
   assumed to have length 'l' (1 <= l <= r->len). Note: 'prec1' can be
   infinite (BF_PREC_INF). 'ret' is 0 or BF_ST_INEXACT if the result
   is known to be inexact. Can fail with BF_ST_MEM_ERROR in case of
   overflow not returning infinity. */
static int __bf_round(bf_t *r, limb_t prec1, bf_flags_t flags, limb_t l,
                      int ret)
{
    limb_t v, a;
    int shift, add_one, rnd_mode;
    slimb_t i, bit_pos, pos, e_min, e_max, e_range, prec;

    /* e_min and e_max are computed to match the IEEE 754 conventions */
    e_range = (limb_t)1 << (bf_get_exp_bits(flags) - 1);
    e_min = -e_range + 3;
    e_max = e_range;
    
    if (flags & BF_FLAG_RADPNT_PREC) {
        /* 'prec' is the precision after the radix point */
        if (prec1 != BF_PREC_INF)
            prec = r->expn + prec1;
        else
            prec = prec1;
    } else if (unlikely(r->expn < e_min) && (flags & BF_FLAG_SUBNORMAL)) {
        /* restrict the precision in case of potentially subnormal
           result */
        assert(prec1 != BF_PREC_INF);
        prec = prec1 - (e_min - r->expn);
    } else {
        prec = prec1;
    }

    /* round to prec bits */
    rnd_mode = flags & BF_RND_MASK;
    add_one = bf_get_rnd_add(&ret, r, l, prec, rnd_mode);
    
    if (prec <= 0) {
        if (add_one) {
            /* result rounds up to 1 ulp at the requested precision */
            bf_resize(r, 1); /* cannot fail */
            r->tab[0] = (limb_t)1 << (LIMB_BITS - 1);
            r->expn += 1 - prec;
            ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT;
            return ret;
        } else {
            goto underflow;
        }
    } else if (add_one) {
        limb_t carry;
        
        /* add one starting at digit 'prec - 1' */
        bit_pos = l * LIMB_BITS - 1 - (prec - 1);
        pos = bit_pos >> LIMB_LOG2_BITS;
        carry = (limb_t)1 << (bit_pos & (LIMB_BITS - 1));
        
        for(i = pos; i < l; i++) {
            v = r->tab[i] + carry;
            carry = (v < carry);
            r->tab[i] = v;
            if (carry == 0)
                break;
        }
        if (carry) {
            /* carry out of the top limb: shift right by one digit and
               renormalize the exponent */
            v = 1;
            for(i = l - 1; i >= pos; i--) {
                a = r->tab[i];
                r->tab[i] = (a >> 1) | (v << (LIMB_BITS - 1));
                v = a;
            }
            r->expn++;
        }
    }
    
    /* check underflow */
    if (unlikely(r->expn < e_min)) {
        if (flags & BF_FLAG_SUBNORMAL) {
            /* if inexact, also set the underflow flag */
            if (ret & BF_ST_INEXACT)
                ret |= BF_ST_UNDERFLOW;
        } else {
        underflow:
            ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT;
            bf_set_zero(r, r->sign);
            return ret;
        }
    }
    
    /* check overflow */
    if (unlikely(r->expn > e_max))
        return bf_set_overflow(r, r->sign, prec1, flags);
    
    /* keep the bits starting at 'prec - 1' (clear the discarded ones) */
    bit_pos = l * LIMB_BITS - 1 - (prec - 1);
    i = bit_pos >> LIMB_LOG2_BITS;
    if (i >= 0) {
        shift = bit_pos & (LIMB_BITS - 1);
        if (shift != 0)
            r->tab[i] &= limb_mask(shift, LIMB_BITS - 1);
    } else {
        i = 0;
    }
    /* remove trailing zeros */
    /* r is non zero at this point, so a non-zero limb is always found */
    while (r->tab[i] == 0)
        i++;
    if (i > 0) {
        l -= i;
        memmove(r->tab, r->tab + i, l * sizeof(limb_t));
    }
    bf_resize(r, l); /* cannot fail */
    return ret;
}
+
/* 'r' must be a finite number. */
/* Normalize r so that the most significant bit of the top limb is 1
   (adjusting the exponent), detect zero, then round to prec1 bits. */
int bf_normalize_and_round(bf_t *r, limb_t prec1, bf_flags_t flags)
{
    limb_t l, v, a;
    int shift, ret;
    slimb_t i;
    
    //    bf_print_str("bf_renorm", r);
    /* strip high zero limbs */
    l = r->len;
    while (l > 0 && r->tab[l - 1] == 0)
        l--;
    if (l == 0) {
        /* zero */
        r->expn = BF_EXP_ZERO;
        bf_resize(r, 0); /* cannot fail */
        ret = 0;
    } else {
        r->expn -= (r->len - l) * LIMB_BITS;
        /* shift to have the MSB set to '1' */
        v = r->tab[l - 1];
        shift = clz(v);
        if (shift != 0) {
            v = 0;
            for(i = 0; i < l; i++) {
                a = r->tab[i];
                r->tab[i] = (a << shift) | (v >> (LIMB_BITS - shift));
                v = a;
            }
            r->expn -= shift;
        }
        ret = __bf_round(r, prec1, flags, l, 0);
    }
    //    bf_print_str("r_final", r);
    return ret;
}
+
/* return true if rounding can be done at precision 'prec' assuming
   the exact result r is such that |r-a| <= 2^(EXP(a)-k). */
/* XXX: check the case where the exponent would be incremented by the
   rounding */
/* i.e. decides whether an approximation accurate to k bits rounds the
   same way as the exact value would. */
int bf_can_round(const bf_t *a, slimb_t prec, bf_rnd_t rnd_mode, slimb_t k)
{
    BOOL is_rndn;
    slimb_t bit_pos, n;
    limb_t bit;
    
    if (a->expn == BF_EXP_INF || a->expn == BF_EXP_NAN)
        return FALSE;
    if (rnd_mode == BF_RNDF) {
        return (k >= (prec + 1));
    }
    if (a->expn == BF_EXP_ZERO)
        return FALSE;
    is_rndn = (rnd_mode == BF_RNDN || rnd_mode == BF_RNDNA);
    if (k < (prec + 2))
        return FALSE;
    bit_pos = a->len * LIMB_BITS - 1 - prec;
    n = k - prec;
    /* bit pattern for RNDN or RNDNA: 0111.. or 1000...
       for other rounding modes: 000... or 111... 
    */
    bit = get_bit(a->tab, a->len, bit_pos);
    bit_pos--;
    n--;
    bit ^= is_rndn;
    /* XXX: slow, but a few iterations on average */
    /* rounding is safe as soon as one bit breaks the ambiguous pattern */
    while (n != 0) {
        if (get_bit(a->tab, a->len, bit_pos) != bit)
            return TRUE;
        bit_pos--;
        n--;
    }
    return FALSE;
}
+
+/* Cannot fail with BF_ST_MEM_ERROR. */
+int bf_round(bf_t *r, limb_t prec, bf_flags_t flags)
+{
+    if (r->len == 0)
+        return 0;
+    return __bf_round(r, prec, flags, r->len, 0);
+}
+
+/* for debugging */
+static __maybe_unused void dump_limbs(const char *str, const limb_t *tab, limb_t n)
+{
+    limb_t i;
+    printf("%s: len=%" PRId_LIMB "\n", str, n);
+    for(i = 0; i < n; i++) {
+        printf("%" PRId_LIMB ": " FMT_LIMB "\n",
+               i, tab[i]);
+    }
+}
+
+void mp_print_str(const char *str, const limb_t *tab, limb_t n)
+{
+    slimb_t i;
+    printf("%s= 0x", str);
+    for(i = n - 1; i >= 0; i--) {
+        if (i != (n - 1))
+            printf("_");
+        printf(FMT_LIMB, tab[i]);
+    }
+    printf("\n");
+}
+
+static __maybe_unused void mp_print_str_h(const char *str,
+                                          const limb_t *tab, limb_t n,
+                                          limb_t high)
+{
+    slimb_t i;
+    printf("%s= 0x", str);
+    printf(FMT_LIMB, high);
+    for(i = n - 1; i >= 0; i--) {
+        printf("_");
+        printf(FMT_LIMB, tab[i]);
+    }
+    printf("\n");
+}
+
+/* for debugging */
+void bf_print_str(const char *str, const bf_t *a)
+{
+    slimb_t i;
+    printf("%s=", str);
+
+    if (a->expn == BF_EXP_NAN) {
+        printf("NaN");
+    } else {
+        if (a->sign)
+            putchar('-');
+        if (a->expn == BF_EXP_ZERO) {
+            putchar('0');
+        } else if (a->expn == BF_EXP_INF) {
+            printf("Inf");
+        } else {
+            printf("0x0.");
+            for(i = a->len - 1; i >= 0; i--)
+                printf(FMT_LIMB, a->tab[i]);
+            printf("p%" PRId_LIMB, a->expn);
+        }
+    }
+    printf("\n");
+}
+
/* compare the absolute value of 'a' and 'b'. Return < 0 if a < b, 0
   if a = b and > 0 otherwise. */
int bf_cmpu(const bf_t *a, const bf_t *b)
{
    slimb_t i;
    limb_t len, v1, v2;
    
    if (a->expn != b->expn) {
        /* exponents differ: the larger exponent wins */
        if (a->expn < b->expn)
            return -1;
        else
            return 1;
    }
    /* same exponent: compare the significands most significant limb
       first, aligned on the longer operand. For the shorter operand's
       missing low limbs the index below wraps around (unsigned) to a
       huge value, and get_limbz then returns 0 — i.e. the shorter
       significand is implicitly zero-extended at the low end. */
    len = bf_max(a->len, b->len);
    for(i = len - 1; i >= 0; i--) {
        v1 = get_limbz(a, a->len - len + i);
        v2 = get_limbz(b, b->len - len + i);
        if (v1 != v2) {
            if (v1 < v2)
                return -1;
            else
                return 1;
        }
    }
    return 0;
}
+
+/* Full order: -0 < 0, NaN == NaN and NaN is larger than all other numbers */
+int bf_cmp_full(const bf_t *a, const bf_t *b)
+{
+    int res;
+    
+    if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
+        if (a->expn == b->expn)
+            res = 0;
+        else if (a->expn == BF_EXP_NAN)
+            res = 1;
+        else
+            res = -1;
+    } else if (a->sign != b->sign) {
+        res = 1 - 2 * a->sign;
+    } else {
+        res = bf_cmpu(a, b);
+        if (a->sign)
+            res = -res;
+    }
+    return res;
+}
+
+/* Standard floating point comparison: return 2 if one of the operands
+   is NaN (unordered) or -1, 0, 1 depending on the ordering assuming
+   -0 == +0 */
+int bf_cmp(const bf_t *a, const bf_t *b)
+{
+    int res;
+    
+    if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
+        res = 2;
+    } else if (a->sign != b->sign) {
+        if (a->expn == BF_EXP_ZERO && b->expn == BF_EXP_ZERO)
+            res = 0;
+        else
+            res = 1 - 2 * a->sign;
+    } else {
+        res = bf_cmpu(a, b);
+        if (a->sign)
+            res = -res;
+    }
+    return res;
+}
+
/* Compute the number of bits 'n' matching the pattern:
   a= X1000..0
   b= X0111..1
              
   When computing a-b, the result will have at least n leading zero
   bits.

   Precondition: a > b and a.expn - b.expn = 0 or 1
*/
static limb_t count_cancelled_bits(const bf_t *a, const bf_t *b)
{
    slimb_t bit_offset, b_offset, n;
    int p, p1;
    limb_t v1, v2, mask;

    /* b_offset aligns b's bit positions with a's, accounting for the
       exponent difference */
    bit_offset = a->len * LIMB_BITS - 1;
    b_offset = (b->len - a->len) * LIMB_BITS - (LIMB_BITS - 1) +
        a->expn - b->expn;
    n = 0;

    /* first search the equals bits */
    /* terminates because a > b guarantees a differing bit exists */
    for(;;) {
        v1 = get_limbz(a, bit_offset >> LIMB_LOG2_BITS);
        v2 = get_bits(b->tab, b->len, bit_offset + b_offset);
        //        printf("v1=" FMT_LIMB " v2=" FMT_LIMB "\n", v1, v2);
        if (v1 != v2)
            break;
        n += LIMB_BITS;
        bit_offset -= LIMB_BITS;
    }
    /* find the position of the first different bit */
    p = clz(v1 ^ v2) + 1;
    n += p;
    /* then search for '0' in a and '1' in b */
    p = LIMB_BITS - p;
    if (p > 0) {
        /* search in the trailing p bits of v1 and v2 */
        mask = limb_mask(0, p - 1);
        p1 = bf_min(clz(v1 & mask), clz((~v2) & mask)) - (LIMB_BITS - p);
        n += p1;
        if (p1 != p)
            goto done;
    }
    /* the whole trailing part of the current limb matched: continue
       limb by limb */
    bit_offset -= LIMB_BITS;
    for(;;) {
        v1 = get_limbz(a, bit_offset >> LIMB_LOG2_BITS);
        v2 = get_bits(b->tab, b->len, bit_offset + b_offset);
        //        printf("v1=" FMT_LIMB " v2=" FMT_LIMB "\n", v1, v2);
        if (v1 != 0 || v2 != -1) {
            /* different: count the matching bits */
            p1 = bf_min(clz(v1), clz(~v2));
            n += p1;
            break;
        }
        n += LIMB_BITS;
        bit_offset -= LIMB_BITS;
    }
 done:
    return n;
}
+
/* r = a + (-1)^b_neg * b rounded to 'prec' bits. Handles all special
   values (NaN, infinities, zeros) and only computes the limbs needed
   for correct rounding. Returns the status flags. */
static int bf_add_internal(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
                           bf_flags_t flags, int b_neg)
{
    const bf_t *tmp;
    int is_sub, ret, cmp_res, a_sign, b_sign;

    a_sign = a->sign;
    b_sign = b->sign ^ b_neg;
    is_sub = a_sign ^ b_sign;
    cmp_res = bf_cmpu(a, b);
    if (cmp_res < 0) {
        /* swap the operands so that abs(a) >= abs(b) */
        tmp = a;
        a = b;
        b = tmp;
        a_sign = b_sign; /* b_sign is never used later */
    }
    /* abs(a) >= abs(b) */
    if (cmp_res == 0 && is_sub && a->expn < BF_EXP_INF) {
        /* zero result */
        bf_set_zero(r, (flags & BF_RND_MASK) == BF_RNDD);
        ret = 0;
    } else if (a->len == 0 || b->len == 0) {
        /* at least one operand is a special value (0, inf or NaN) */
        ret = 0;
        if (a->expn >= BF_EXP_INF) {
            if (a->expn == BF_EXP_NAN) {
                /* at least one operand is NaN */
                bf_set_nan(r);
            } else if (b->expn == BF_EXP_INF && is_sub) {
                /* infinities with different signs */
                bf_set_nan(r);
                ret = BF_ST_INVALID_OP;
            } else {
                bf_set_inf(r, a_sign);
            }
        } else {
            /* at least one zero and not subtract */
            bf_set(r, a);
            r->sign = a_sign;
            goto renorm;
        }
    } else {
        slimb_t d, a_offset, b_bit_offset, i, cancelled_bits;
        limb_t carry, v1, v2, u, r_len, carry1, precl, tot_len, z, sub_mask;

        r->sign = a_sign;
        r->expn = a->expn;
        d = a->expn - b->expn; /* exponent difference, >= 0 after swap */
        /* must add more precision for the leading cancelled bits in
           subtraction */
        if (is_sub) {
            if (d <= 1)
                cancelled_bits = count_cancelled_bits(a, b);
            else
                cancelled_bits = 1;
        } else {
            cancelled_bits = 0;
        }
        
        /* add two extra bits for rounding */
        precl = (cancelled_bits + prec + 2 + LIMB_BITS - 1) / LIMB_BITS;
        tot_len = bf_max(a->len, b->len + (d + LIMB_BITS - 1) / LIMB_BITS);
        r_len = bf_min(precl, tot_len);
        if (bf_resize(r, r_len))
            goto fail;
        a_offset = a->len - r_len;
        b_bit_offset = (b->len - r_len) * LIMB_BITS + d;

        /* compute the bits before for the rounding */
        /* subtraction is done as a + not(b) + 1 (two's complement):
           'carry' starts at 1 and b's limbs are XORed with sub_mask */
        carry = is_sub;
        z = 0;
        sub_mask = -is_sub;
        i = r_len - tot_len;
        while (i < 0) {
            slimb_t ap, bp;
            BOOL inflag;
            
            ap = a_offset + i;
            bp = b_bit_offset + i * LIMB_BITS;
            inflag = FALSE;
            if (ap >= 0 && ap < a->len) {
                v1 = a->tab[ap];
                inflag = TRUE;
            } else {
                v1 = 0;
            }
            if (bp + LIMB_BITS > 0 && bp < (slimb_t)(b->len * LIMB_BITS)) {
                v2 = get_bits(b->tab, b->len, bp);
                inflag = TRUE;
            } else {
                v2 = 0;
            }
            if (!inflag) {
                /* outside 'a' and 'b': go directly to the next value
                   inside a or b so that the running time does not
                   depend on the exponent difference */
                i = 0;
                if (ap < 0)
                    i = bf_min(i, -a_offset);
                /* b_bit_offset + i * LIMB_BITS + LIMB_BITS >= 1
                   equivalent to 
                   i >= ceil(-b_bit_offset + 1 - LIMB_BITS) / LIMB_BITS)
                */
                if (bp + LIMB_BITS <= 0)
                    i = bf_min(i, (-b_bit_offset) >> LIMB_LOG2_BITS);
            } else {
                i++;
            }
            v2 ^= sub_mask;
            u = v1 + v2;
            carry1 = u < v1;
            u += carry;
            carry = (u < carry) | carry1;
            z |= u; /* z accumulates the 'sticky' low bits */
        }
        /* and the result */
        for(i = 0; i < r_len; i++) {
            v1 = get_limbz(a, a_offset + i);
            v2 = get_bits(b->tab, b->len, b_bit_offset + i * LIMB_BITS);
            v2 ^= sub_mask;
            u = v1 + v2;
            carry1 = u < v1;
            u += carry;
            carry = (u < carry) | carry1;
            r->tab[i] = u;
        }
        /* set the extra bits for the rounding */
        r->tab[0] |= (z != 0);

        /* carry is only possible in add case */
        if (!is_sub && carry) {
            if (bf_resize(r, r_len + 1))
                goto fail;
            r->tab[r_len] = 1;
            r->expn += LIMB_BITS;
        }
    renorm:
        ret = bf_normalize_and_round(r, prec, flags);
    }
    return ret;
 fail:
    bf_set_nan(r);
    return BF_ST_MEM_ERROR;
}
+
+static int __bf_add(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
+                     bf_flags_t flags)
+{
+    return bf_add_internal(r, a, b, prec, flags, 0);
+}
+
+static int __bf_sub(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
+                     bf_flags_t flags)
+{
+    return bf_add_internal(r, a, b, prec, flags, 1);
+}
+
+limb_t mp_add(limb_t *res, const limb_t *op1, const limb_t *op2, 
+              limb_t n, limb_t carry)
+{
+    slimb_t i;
+    limb_t k, a, v, k1;
+    
+    k = carry;
+    for(i=0;i<n;i++) {
+        v = op1[i];
+        a = v + op2[i];
+        k1 = a < v;
+        a = a + k;
+        k = (a < k) | k1;
+        res[i] = a;
+    }
+    return k;
+}
+
+limb_t mp_add_ui(limb_t *tab, limb_t b, size_t n)
+{
+    size_t i;
+    limb_t k, a;
+
+    k=b;
+    for(i=0;i<n;i++) {
+        if (k == 0)
+            break;
+        a = tab[i] + k;
+        k = (a < k);
+        tab[i] = a;
+    }
+    return k;
+}
+
+limb_t mp_sub(limb_t *res, const limb_t *op1, const limb_t *op2, 
+              mp_size_t n, limb_t carry)
+{
+    int i;
+    limb_t k, a, v, k1;
+    
+    k = carry;
+    for(i=0;i<n;i++) {
+        v = op1[i];
+        a = v - op2[i];
+        k1 = a > v;
+        v = a - k;
+        k = (v > a) | k1;
+        res[i] = v;
+    }
+    return k;
+}
+
+/* compute 0 - op2 */
+static limb_t mp_neg(limb_t *res, const limb_t *op2, mp_size_t n, limb_t carry)
+{
+    int i;
+    limb_t k, a, v, k1;
+    
+    k = carry;
+    for(i=0;i<n;i++) {
+        v = 0;
+        a = v - op2[i];
+        k1 = a > v;
+        v = a - k;
+        k = (v > a) | k1;
+        res[i] = v;
+    }
+    return k;
+}
+
+limb_t mp_sub_ui(limb_t *tab, limb_t b, mp_size_t n)
+{
+    mp_size_t i;
+    limb_t k, a, v;
+    
+    k=b;
+    for(i=0;i<n;i++) {
+        v = tab[i];
+        a = v - k;
+        k = a > v;
+        tab[i] = a;
+        if (k == 0)
+            break;
+    }
+    return k;
+}
+
+/* r = (a + high*B^n) >> shift. Return the remainder r (0 <= r < 2^shift). 
+   1 <= shift <= LIMB_BITS - 1 */
+static limb_t mp_shr(limb_t *tab_r, const limb_t *tab, mp_size_t n, 
+                     int shift, limb_t high)
+{
+    mp_size_t i;
+    limb_t l, a;
+
+    assert(shift >= 1 && shift < LIMB_BITS);
+    l = high;
+    for(i = n - 1; i >= 0; i--) {
+        a = tab[i];
+        tab_r[i] = (a >> shift) | (l << (LIMB_BITS - shift));
+        l = a;
+    }
+    return l & (((limb_t)1 << shift) - 1);
+}
+
+/* tabr[] = taba[] * b + l. Return the high carry */
+static limb_t mp_mul1(limb_t *tabr, const limb_t *taba, limb_t n, 
+                      limb_t b, limb_t l)
+{
+    limb_t i;
+    dlimb_t t;
+
+    for(i = 0; i < n; i++) {
+        t = (dlimb_t)taba[i] * (dlimb_t)b + l;
+        tabr[i] = t;
+        l = t >> LIMB_BITS;
+    }
+    return l;
+}
+
+/* tabr[] += taba[] * b, return the high word. */
+static limb_t mp_add_mul1(limb_t *tabr, const limb_t *taba, limb_t n,
+                          limb_t b)
+{
+    limb_t i, l;
+    dlimb_t t;
+    
+    l = 0;
+    for(i = 0; i < n; i++) {
+        t = (dlimb_t)taba[i] * (dlimb_t)b + l + tabr[i];
+        tabr[i] = t;
+        l = t >> LIMB_BITS;
+    }
+    return l;
+}
+
+/* size of the result : op1_size + op2_size. */
+static void mp_mul_basecase(limb_t *result, 
+                            const limb_t *op1, limb_t op1_size, 
+                            const limb_t *op2, limb_t op2_size) 
+{
+    limb_t i, r;
+    
+    result[op1_size] = mp_mul1(result, op1, op1_size, op2[0], 0);
+    for(i=1;i<op2_size;i++) {
+        r = mp_add_mul1(result + i, op1, op1_size, op2[i]);
+        result[i + op1_size] = r;
+    }
+}
+
/* multiply op1 (op1_size limbs) by op2 (op2_size limbs) into 'result'
   (op1_size + op2_size limbs).
   return 0 if OK, -1 if memory error */
/* XXX: change API so that result can be allocated */
int mp_mul(bf_context_t *s, limb_t *result, 
           const limb_t *op1, limb_t op1_size, 
           const limb_t *op2, limb_t op2_size) 
{
#ifdef USE_FFT_MUL
    if (unlikely(bf_min(op1_size, op2_size) >= FFT_MUL_THRESHOLD)) {
        bf_t r_s, *r = &r_s;
        r->tab = result;
        /* XXX: optimize memory usage in API */
        /* fft_mul writes directly into 'result' (no resize) */
        if (fft_mul(s, r, (limb_t *)op1, op1_size,
                    (limb_t *)op2, op2_size, FFT_MUL_R_NORESIZE))
            return -1;
    } else
#endif
    {
        mp_mul_basecase(result, op1, op1_size, op2, op2_size);
    }
    return 0;
}
+
/* tabr[] -= taba[] * b. Return the value to substract to the high
   word. */
static limb_t mp_sub_mul1(limb_t *tabr, const limb_t *taba, limb_t n,
                          limb_t b)
{
    limb_t i, l; /* l = running borrow */
    dlimb_t t;
    
    l = 0;
    for(i = 0; i < n; i++) {
        t = tabr[i] - (dlimb_t)taba[i] * (dlimb_t)b - l;
        tabr[i] = t;
        /* the high limb of t is the negated borrow (two's complement);
           negate it back so l stays a positive borrow count */
        l = -(t >> LIMB_BITS);
    }
    return l;
}
+
/* WARNING: d must be >= 2^(LIMB_BITS-1) */
/* Precompute the fixed-point inverse used by udiv1norm:
   floor((B^2 - 1) / d) - B with B = 2^LIMB_BITS.
   a1:a0 below encodes B^2 - d*B - 1 = (B - d - 1)*B + (B - 1). */
static inline limb_t udiv1norm_init(limb_t d)
{
    limb_t a0, a1;
    a1 = -d - 1;
    a0 = -1;
    return (((dlimb_t)a1 << LIMB_BITS) | a0) / d;
}
+
/* return the quotient and the remainder in '*pr'of 'a1*2^LIMB_BITS+a0
   / d' with 0 <= a1 < d. 'd_inv' is the precomputed inverse from
   udiv1norm_init(d). */
static inline limb_t udiv1norm(limb_t *pr, limb_t a1, limb_t a0,
                                limb_t d, limb_t d_inv)
{
    limb_t n1m, n_adj, q, r, ah;
    dlimb_t a;
    /* n1m = sign extension of a0's top bit (all ones or all zeros) */
    n1m = ((slimb_t)a0 >> (LIMB_BITS - 1));
    n_adj = a0 + (n1m & d);
    /* first quotient estimate via the precomputed inverse */
    a = (dlimb_t)d_inv * (a1 - n1m) + n_adj;
    q = (a >> LIMB_BITS) + a1;
    /* compute a - q * d and update q so that the remainder is
       between 0 and d - 1 */
    a = ((dlimb_t)a1 << LIMB_BITS) | a0;
    a = a - (dlimb_t)q * d - d;
    ah = a >> LIMB_BITS;
    q += 1 + ah;
    r = (limb_t)a + (ah & d);
    *pr = r;
    return q;
}
+
+/* b must be >= 1 << (LIMB_BITS - 1) */
+static limb_t mp_div1norm(limb_t *tabr, const limb_t *taba, limb_t n,
+                          limb_t b, limb_t r)
+{
+    slimb_t i;
+
+    if (n >= UDIV1NORM_THRESHOLD) {
+        limb_t b_inv;
+        b_inv = udiv1norm_init(b);
+        for(i = n - 1; i >= 0; i--) {
+            tabr[i] = udiv1norm(&r, r, taba[i], b, b_inv);
+        }
+    } else {
+        dlimb_t a1;
+        for(i = n - 1; i >= 0; i--) {
+            a1 = ((dlimb_t)r << LIMB_BITS) | taba[i];
+            tabr[i] = a1 / b;
+            r = a1 % b;
+        }
+    }
+    return r;
+}
+
+static int mp_divnorm_large(bf_context_t *s, 
+                            limb_t *tabq, limb_t *taba, limb_t na, 
+                            const limb_t *tabb, limb_t nb);
+
/* base case division: divides taba[0..na-1] by tabb[0..nb-1]. tabb[nb
   - 1] must be >= 1 << (LIMB_BITS - 1). na - nb must be >= 0. 'taba'
   is modified and contains the remainder (nb limbs). tabq[0..na-nb]
   contains the quotient with tabq[na - nb] <= 1. */
static int mp_divnorm(bf_context_t *s, limb_t *tabq, limb_t *taba, limb_t na, 
                      const limb_t *tabb, limb_t nb)
{
    limb_t r, a, c, q, v, b1, b1_inv, n, dummy_r;
    slimb_t i, j;

    b1 = tabb[nb - 1];
    if (nb == 1) {
        /* single limb divisor */
        taba[0] = mp_div1norm(tabq, taba, na, b1, 0);
        return 0;
    }
    n = na - nb;
    if (bf_min(n, nb) >= DIVNORM_LARGE_THRESHOLD) {
        return mp_divnorm_large(s, tabq, taba, na, tabb, nb);
    }
    
    if (n >= UDIV1NORM_THRESHOLD)
        b1_inv = udiv1norm_init(b1);
    else
        b1_inv = 0;

    /* first iteration: the quotient is only 0 or 1 */
    q = 1;
    for(j = nb - 1; j >= 0; j--) {
        if (taba[n + j] != tabb[j]) {
            if (taba[n + j] < tabb[j])
                q = 0;
            break;
        }
    }
    tabq[n] = q;
    if (q) {
        mp_sub(taba + n, taba + n, tabb, nb, 0);
    }
    
    /* schoolbook long division: estimate each quotient limb from the
       two top remainder limbs and the top divisor limb, then correct */
    for(i = n - 1; i >= 0; i--) {
        if (unlikely(taba[i + nb] >= b1)) {
            q = -1; /* estimate saturates to B - 1 */
        } else if (b1_inv) {
            q = udiv1norm(&dummy_r, taba[i + nb], taba[i + nb - 1], b1, b1_inv);
        } else {
            dlimb_t al;
            al = ((dlimb_t)taba[i + nb] << LIMB_BITS) | taba[i + nb - 1];
            q = al / b1;
            r = al % b1;
        }
        r = mp_sub_mul1(taba + i, tabb, nb, q);

        v = taba[i + nb];
        a = v - r;
        c = (a > v);
        taba[i + nb] = a;

        if (c != 0) {
            /* negative result */
            /* the estimate was too large: add the divisor back until
               the partial remainder becomes non-negative */
            for(;;) {
                q--;
                c = mp_add(taba + i, taba + i, tabb, nb, 0);
                /* propagate carry and test if positive result */
                if (c != 0) {
                    if (++taba[i + nb] == 0) {
                        break;
                    }
                }
            }
        }
        tabq[i] = q;
    }
    return 0;
}
+
/* compute r=B^(2*n)/a such as a*r < B^(2*n) < a*r + 2 with n >= 1. 'a'
   has n limbs with a[n-1] >= B/2 and 'r' has n+1 limbs with r[n] = 1.
   
   See Modern Computer Arithmetic by Richard P. Brent and Paul
   Zimmermann, algorithm 3.5 */
int mp_recip(bf_context_t *s, limb_t *tabr, const limb_t *taba, limb_t n)
{
    mp_size_t l, h, k, i;
    limb_t *tabxh, *tabt, c, *tabu;
    
    if (n <= 2) {
        /* base case: compute the reciprocal by direct division */
        /* return ceil(B^(2*n)/a) - 1 */
        /* XXX: could avoid allocation */
        tabu = bf_malloc(s, sizeof(limb_t) * (2 * n + 1));
        tabt = bf_malloc(s, sizeof(limb_t) * (n + 2));
        if (!tabt || !tabu)
            goto fail;
        /* build B^(2*n) and divide it by a */
        for(i = 0; i < 2 * n; i++)
            tabu[i] = 0;
        tabu[2 * n] = 1;
        if (mp_divnorm(s, tabt, tabu, 2 * n + 1, taba, n))
            goto fail;
        for(i = 0; i < n + 1; i++)
            tabr[i] = tabt[i];
        if (mp_scan_nz(tabu, n) == 0) {
            /* only happens for a=B^n/2 */
            mp_sub_ui(tabr, 1, n + 1);
        }
    } else {
        /* recursive case: compute a half precision reciprocal and
           refine it with one Newton iteration */
        l = (n - 1) / 2;
        h = n - l;
        /* n=2p  -> l=p-1, h = p + 1, k = p + 3
           n=2p+1-> l=p,  h = p + 1; k = p + 2
        */
        tabt = bf_malloc(s, sizeof(limb_t) * (n + h + 1));
        tabu = bf_malloc(s, sizeof(limb_t) * (n + 2 * h - l + 2));
        if (!tabt || !tabu)
            goto fail;
        /* the high part of the result is computed in place */
        tabxh = tabr + l;
        if (mp_recip(s, tabxh, taba + l, h))
            goto fail;
        if (mp_mul(s, tabt, taba, n, tabxh, h + 1)) /* n + h + 1 limbs */
            goto fail;
        /* correct the estimate downward while a * xh exceeds B^(n+h) */
        while (tabt[n + h] != 0) {
            mp_sub_ui(tabxh, 1, h + 1);
            c = mp_sub(tabt, tabt, taba, n, 0);
            mp_sub_ui(tabt + n, c, h + 1);
        }
        /* T = B^(n+h) - T */
        mp_neg(tabt, tabt, n + h + 1, 0);
        tabt[n + h]++;
        if (mp_mul(s, tabu, tabt + l, n + h + 1 - l, tabxh, h + 1))
            goto fail;
        /* n + 2*h - l + 2 limbs */
        k = 2 * h - l;
        for(i = 0; i < l; i++)
            tabr[i] = tabu[i + k];
        mp_add(tabr + l, tabr + l, tabu + 2 * h, h, 0);
    }
    bf_free(s, tabt);
    bf_free(s, tabu);
    return 0;
 fail:
    bf_free(s, tabt);
    bf_free(s, tabu);
    return -1;
}
+
+/* return -1, 0 or 1 */
+static int mp_cmp(const limb_t *taba, const limb_t *tabb, mp_size_t n)
+{
+    mp_size_t i;
+    for(i = n - 1; i >= 0; i--) {
+        if (taba[i] != tabb[i]) {
+            if (taba[i] < tabb[i])
+                return -1;
+            else
+                return 1;
+        }
+    }
+    return 0;
+}
+
+//#define DEBUG_DIVNORM_LARGE
+//#define DEBUG_DIVNORM_LARGE2
+
+/* subquadratic divnorm */
+static int mp_divnorm_large(bf_context_t *s, 
+                            limb_t *tabq, limb_t *taba, limb_t na, 
+                            const limb_t *tabb, limb_t nb)
+{
+    limb_t *tabb_inv, nq, *tabt, i, n;
+    nq = na - nb;
+#ifdef DEBUG_DIVNORM_LARGE
+    printf("na=%d nb=%d nq=%d\n", (int)na, (int)nb, (int)nq);
+    mp_print_str("a", taba, na);
+    mp_print_str("b", tabb, nb);
+#endif
+    assert(nq >= 1);
+    n = nq;
+    if (nq < nb)
+        n++; 
+    tabb_inv = bf_malloc(s, sizeof(limb_t) * (n + 1));
+    tabt = bf_malloc(s, sizeof(limb_t) * 2 * (n + 1));
+    if (!tabb_inv || !tabt)
+        goto fail;
+
+    if (n >= nb) {
+        for(i = 0; i < n - nb; i++)
+            tabt[i] = 0;
+        for(i = 0; i < nb; i++)
+            tabt[i + n - nb] = tabb[i];
+    } else {
+        /* truncate B: need to increment it so that the approximate
+           inverse is smaller that the exact inverse */
+        for(i = 0; i < n; i++)
+            tabt[i] = tabb[i + nb - n];
+        if (mp_add_ui(tabt, 1, n)) {
+            /* tabt = B^n : tabb_inv = B^n */
+            memset(tabb_inv, 0, n * sizeof(limb_t));
+            tabb_inv[n] = 1;
+            goto recip_done;
+        }
+    }
+    if (mp_recip(s, tabb_inv, tabt, n))
+        goto fail;
+ recip_done:
+    /* Q=A*B^-1 */
+    if (mp_mul(s, tabt, tabb_inv, n + 1, taba + na - (n + 1), n + 1))
+        goto fail;
+    
+    for(i = 0; i < nq + 1; i++)
+        tabq[i] = tabt[i + 2 * (n + 1) - (nq + 1)];
+#ifdef DEBUG_DIVNORM_LARGE
+    mp_print_str("q", tabq, nq + 1);
+#endif
+
+    bf_free(s, tabt);
+    bf_free(s, tabb_inv);
+    tabb_inv = NULL;
+    
+    /* R=A-B*Q */
+    tabt = bf_malloc(s, sizeof(limb_t) * (na + 1));
+    if (!tabt)
+        goto fail;
+    if (mp_mul(s, tabt, tabq, nq + 1, tabb, nb))
+        goto fail;
+    /* we add one more limb for the result */
+    mp_sub(taba, taba, tabt, nb + 1, 0);
+    bf_free(s, tabt);
+    /* the approximated quotient is smaller than than the exact one,
+       hence we may have to increment it */
+#ifdef DEBUG_DIVNORM_LARGE2
+    int cnt = 0;
+    static int cnt_max;
+#endif
+    for(;;) {
+        if (taba[nb] == 0 && mp_cmp(taba, tabb, nb) < 0)
+            break;
+        taba[nb] -= mp_sub(taba, taba, tabb, nb, 0);
+        mp_add_ui(tabq, 1, nq + 1);
+#ifdef DEBUG_DIVNORM_LARGE2
+        cnt++;
+#endif
+    }
+#ifdef DEBUG_DIVNORM_LARGE2
+    if (cnt > cnt_max) {
+        cnt_max = cnt;
+        printf("\ncnt=%d nq=%d nb=%d\n", cnt_max, (int)nq, (int)nb);
+    }
+#endif
+    return 0;
+ fail:
+    bf_free(s, tabb_inv);
+    bf_free(s, tabt);
+    return -1;
+}
+
+int bf_mul(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
+           bf_flags_t flags)
+{
+    int ret, r_sign;
+
+    if (a->len < b->len) {
+        const bf_t *tmp = a;
+        a = b;
+        b = tmp;
+    }
+    r_sign = a->sign ^ b->sign;
+    /* here b->len <= a->len */
+    if (b->len == 0) {
+        if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
+            bf_set_nan(r);
+            ret = 0;
+        } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_INF) {
+            if ((a->expn == BF_EXP_INF && b->expn == BF_EXP_ZERO) ||
+                (a->expn == BF_EXP_ZERO && b->expn == BF_EXP_INF)) {
+                bf_set_nan(r);
+                ret = BF_ST_INVALID_OP;
+            } else {
+                bf_set_inf(r, r_sign);
+                ret = 0;
+            }
+        } else {
+            bf_set_zero(r, r_sign);
+            ret = 0;
+        }
+    } else {
+        bf_t tmp, *r1 = NULL;
+        limb_t a_len, b_len, precl;
+        limb_t *a_tab, *b_tab;
+            
+        a_len = a->len;
+        b_len = b->len;
+        
+        if ((flags & BF_RND_MASK) == BF_RNDF) {
+            /* faithful rounding does not require using the full inputs */
+            precl = (prec + 2 + LIMB_BITS - 1) / LIMB_BITS;
+            a_len = bf_min(a_len, precl);
+            b_len = bf_min(b_len, precl);
+        }
+        a_tab = a->tab + a->len - a_len;
+        b_tab = b->tab + b->len - b_len;
+        
+#ifdef USE_FFT_MUL
+        if (b_len >= FFT_MUL_THRESHOLD) {
+            int mul_flags = 0;
+            if (r == a)
+                mul_flags |= FFT_MUL_R_OVERLAP_A;
+            if (r == b)
+                mul_flags |= FFT_MUL_R_OVERLAP_B;
+            if (fft_mul(r->ctx, r, a_tab, a_len, b_tab, b_len, mul_flags))
+                goto fail;
+        } else
+#endif
+        {
+            if (r == a || r == b) {
+                bf_init(r->ctx, &tmp);
+                r1 = r;
+                r = &tmp;
+            }
+            if (bf_resize(r, a_len + b_len)) {
+            fail:
+                bf_set_nan(r);
+                ret = BF_ST_MEM_ERROR;
+                goto done;
+            }
+            mp_mul_basecase(r->tab, a_tab, a_len, b_tab, b_len);
+        }
+        r->sign = r_sign;
+        r->expn = a->expn + b->expn;
+        ret = bf_normalize_and_round(r, prec, flags);
+    done:
+        if (r == &tmp)
+            bf_move(r1, &tmp);
+    }
+    return ret;
+}
+
+/* multiply 'r' by 2^e */
+int bf_mul_2exp(bf_t *r, slimb_t e, limb_t prec, bf_flags_t flags)
+{
+    slimb_t e_max;
+    if (r->len == 0)
+        return 0;
+    e_max = ((limb_t)1 << BF_EXT_EXP_BITS_MAX) - 1;
+    e = bf_max(e, -e_max);
+    e = bf_min(e, e_max);
+    r->expn += e;
+    return __bf_round(r, prec, flags, r->len, 0);
+}
+
+/* Return e such as a=m*2^e with m odd integer. return 0 if a is zero,
+   Infinite or Nan. */
+slimb_t bf_get_exp_min(const bf_t *a)
+{
+    slimb_t i;
+    limb_t v;
+    int k;
+    
+    for(i = 0; i < a->len; i++) {
+        v = a->tab[i];
+        if (v != 0) {
+            k = ctz(v);
+            return a->expn - (a->len - i) * LIMB_BITS + k;
+        }
+    }
+    return 0;
+}
+
+/* a and b must be finite numbers with a >= 0 and b > 0. 'q' is the
+   integer defined as floor(a/b) and r = a - q * b. */
+static void bf_tdivremu(bf_t *q, bf_t *r,
+                        const bf_t *a, const bf_t *b)
+{
+    if (bf_cmpu(a, b) < 0) {
+        bf_set_ui(q, 0);
+        bf_set(r, a);
+    } else {
+        bf_div(q, a, b, bf_max(a->expn - b->expn + 1, 2), BF_RNDZ);
+        bf_rint(q, BF_RNDZ);
+        bf_mul(r, q, b, BF_PREC_INF, BF_RNDZ);
+        bf_sub(r, a, r, BF_PREC_INF, BF_RNDZ);
+    }
+}
+
+static int __bf_div(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
+                    bf_flags_t flags)
+{
+    bf_context_t *s = r->ctx;
+    int ret, r_sign;
+    limb_t n, nb, precl;
+    
+    r_sign = a->sign ^ b->sign;
+    if (a->expn >= BF_EXP_INF || b->expn >= BF_EXP_INF) {
+        if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
+            bf_set_nan(r);
+            return 0;
+        } else if (a->expn == BF_EXP_INF && b->expn == BF_EXP_INF) {
+            bf_set_nan(r);
+            return BF_ST_INVALID_OP;
+        } else if (a->expn == BF_EXP_INF) {
+            bf_set_inf(r, r_sign);
+            return 0;
+        } else {
+            bf_set_zero(r, r_sign);
+            return 0;
+        }
+    } else if (a->expn == BF_EXP_ZERO) {
+        if (b->expn == BF_EXP_ZERO) {
+            bf_set_nan(r);
+            return BF_ST_INVALID_OP;
+        } else {
+            bf_set_zero(r, r_sign);
+            return 0;
+        }
+    } else if (b->expn == BF_EXP_ZERO) {
+        bf_set_inf(r, r_sign);
+        return BF_ST_DIVIDE_ZERO;
+    }
+
+    /* number of limbs of the quotient (2 extra bits for rounding) */
+    precl = (prec + 2 + LIMB_BITS - 1) / LIMB_BITS;
+    nb = b->len;
+    n = bf_max(a->len, precl);
+    
+    {
+        limb_t *taba, na;
+        slimb_t d;
+        
+        na = n + nb;
+        taba = bf_malloc(s, (na + 1) * sizeof(limb_t));
+        if (!taba)
+            goto fail;
+        d = na - a->len;
+        memset(taba, 0, d * sizeof(limb_t));
+        memcpy(taba + d, a->tab, a->len * sizeof(limb_t));
+        if (bf_resize(r, n + 1))
+            goto fail1;
+        if (mp_divnorm(s, r->tab, taba, na, b->tab, nb)) {
+        fail1:
+            bf_free(s, taba);
+            goto fail;
+        }
+        /* see if non zero remainder */
+        if (mp_scan_nz(taba, nb))
+            r->tab[0] |= 1;
+        bf_free(r->ctx, taba);
+        r->expn = a->expn - b->expn + LIMB_BITS;
+        r->sign = r_sign;
+        ret = bf_normalize_and_round(r, prec, flags);
+    }
+    return ret;
+ fail:
+    bf_set_nan(r);
+    return BF_ST_MEM_ERROR;
+}
+
+/* division and remainder. 
+   
+   rnd_mode is the rounding mode for the quotient. The additional
+   rounding mode BF_RND_EUCLIDIAN is supported.
+
+   'q' is an integer. 'r' is rounded with prec and flags (prec can be
+   BF_PREC_INF).
+*/
+int bf_divrem(bf_t *q, bf_t *r, const bf_t *a, const bf_t *b,
+              limb_t prec, bf_flags_t flags, int rnd_mode)
+{
+    bf_t a1_s, *a1 = &a1_s;
+    bf_t b1_s, *b1 = &b1_s;
+    int q_sign, ret;
+    BOOL is_ceil, is_rndn;
+    
+    assert(q != a && q != b);
+    assert(r != a && r != b);
+    assert(q != r);
+    
+    if (a->len == 0 || b->len == 0) {
+        bf_set_zero(q, 0);
+        if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
+            bf_set_nan(r);
+            return 0;
+        } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_ZERO) {
+            bf_set_nan(r);
+            return BF_ST_INVALID_OP;
+        } else {
+            bf_set(r, a);
+            return bf_round(r, prec, flags);
+        }
+    }
+
+    q_sign = a->sign ^ b->sign;
+    is_rndn = (rnd_mode == BF_RNDN || rnd_mode == BF_RNDNA);
+    switch(rnd_mode) {
+    default:
+    case BF_RNDZ:
+    case BF_RNDN:
+    case BF_RNDNA:
+        is_ceil = FALSE;
+        break;
+    case BF_RNDD:
+        is_ceil = q_sign;
+        break;
+    case BF_RNDU:
+        is_ceil = q_sign ^ 1;
+        break;
+    case BF_RNDA:
+        is_ceil = TRUE;
+        break;
+    case BF_DIVREM_EUCLIDIAN:
+        is_ceil = a->sign;
+        break;
+    }
+
+    a1->expn = a->expn;
+    a1->tab = a->tab;
+    a1->len = a->len;
+    a1->sign = 0;
+    
+    b1->expn = b->expn;
+    b1->tab = b->tab;
+    b1->len = b->len;
+    b1->sign = 0;
+
+    /* XXX: could improve to avoid having a large 'q' */
+    bf_tdivremu(q, r, a1, b1);
+    if (bf_is_nan(q) || bf_is_nan(r))
+        goto fail;
+
+    if (r->len != 0) {
+        if (is_rndn) {
+            int res;
+            b1->expn--;
+            res = bf_cmpu(r, b1);
+            b1->expn++;
+            if (res > 0 ||
+                (res == 0 &&
+                 (rnd_mode == BF_RNDNA ||
+                  get_bit(q->tab, q->len, q->len * LIMB_BITS - q->expn)))) {
+                goto do_sub_r;
+            }
+        } else if (is_ceil) {
+        do_sub_r:
+            ret = bf_add_si(q, q, 1, BF_PREC_INF, BF_RNDZ);
+            ret |= bf_sub(r, r, b1, BF_PREC_INF, BF_RNDZ);
+            if (ret & BF_ST_MEM_ERROR)
+                goto fail;
+        }
+    }
+
+    r->sign ^= a->sign;
+    q->sign = q_sign;
+    return bf_round(r, prec, flags);
+ fail:
+    bf_set_nan(q);
+    bf_set_nan(r);
+    return BF_ST_MEM_ERROR;
+}
+
+int bf_rem(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
+           bf_flags_t flags, int rnd_mode)
+{
+    bf_t q_s, *q = &q_s;
+    int ret;
+    
+    bf_init(r->ctx, q);
+    ret = bf_divrem(q, r, a, b, prec, flags, rnd_mode);
+    bf_delete(q);
+    return ret;
+}
+
+static inline int bf_get_limb(slimb_t *pres, const bf_t *a, int flags)
+{
+#if LIMB_BITS == 32
+    return bf_get_int32(pres, a, flags);
+#else
+    return bf_get_int64(pres, a, flags);
+#endif
+}
+
+int bf_remquo(slimb_t *pq, bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
+              bf_flags_t flags, int rnd_mode)
+{
+    bf_t q_s, *q = &q_s;
+    int ret;
+    
+    bf_init(r->ctx, q);
+    ret = bf_divrem(q, r, a, b, prec, flags, rnd_mode);
+    bf_get_limb(pq, q, BF_GET_INT_MOD);
+    bf_delete(q);
+    return ret;
+}
+
+static __maybe_unused inline limb_t mul_mod(limb_t a, limb_t b, limb_t m)
+{
+    dlimb_t t;
+    t = (dlimb_t)a * (dlimb_t)b;
+    return t % m;
+}
+
+#if defined(USE_MUL_CHECK)
+static limb_t mp_mod1(const limb_t *tab, limb_t n, limb_t m, limb_t r)
+{
+    slimb_t i;
+    dlimb_t t;
+
+    for(i = n - 1; i >= 0; i--) {
+        t = ((dlimb_t)r << LIMB_BITS) | tab[i];
+        r = t % m;
+    }
+    return r;
+}
+#endif
+
+static const uint16_t sqrt_table[192] = {
+128,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,144,145,146,147,148,149,150,150,151,152,153,154,155,155,156,157,158,159,160,160,161,162,163,163,164,165,166,167,167,168,169,170,170,171,172,173,173,174,175,176,176,177,178,178,179,180,181,181,182,183,183,184,185,185,186,187,187,188,189,189,190,191,192,192,193,193,194,195,195,196,197,197,198,199,199,200,201,201,202,203,203,204,204,205,206,206,207,208,208,209,209,210,211,211,212,212,213,214,214,215,215,216,217,217,218, [...]
+};
+
+/* a >= 2^(LIMB_BITS - 2).  Return (s, r) with s=floor(sqrt(a)) and
+   r=a-s^2. 0 <= r <= 2 * s */
+static limb_t mp_sqrtrem1(limb_t *pr, limb_t a)
+{
+    limb_t s1, r1, s, r, q, u, num;
+    
+    /* use a table for the 16 -> 8 bit sqrt */
+    s1 = sqrt_table[(a >> (LIMB_BITS - 8)) - 64];
+    r1 = (a >> (LIMB_BITS - 16)) - s1 * s1;
+    if (r1 > 2 * s1) {
+        r1 -= 2 * s1 + 1;
+        s1++;
+    }
+    
+    /* one iteration to get a 32 -> 16 bit sqrt */
+    num = (r1 << 8) | ((a >> (LIMB_BITS - 32 + 8)) & 0xff);
+    q = num / (2 * s1); /* q <= 2^8 */
+    u = num % (2 * s1);
+    s = (s1 << 8) + q;
+    r = (u << 8) | ((a >> (LIMB_BITS - 32)) & 0xff);
+    r -= q * q;
+    if ((slimb_t)r < 0) {
+        s--;
+        r += 2 * s + 1;
+    }
+
+#if LIMB_BITS == 64
+    s1 = s;
+    r1 = r;
+    /* one more iteration for 64 -> 32 bit sqrt */
+    num = (r1 << 16) | ((a >> (LIMB_BITS - 64 + 16)) & 0xffff);
+    q = num / (2 * s1); /* q <= 2^16 */
+    u = num % (2 * s1);
+    s = (s1 << 16) + q;
+    r = (u << 16) | ((a >> (LIMB_BITS - 64)) & 0xffff);
+    r -= q * q;
+    if ((slimb_t)r < 0) {
+        s--;
+        r += 2 * s + 1;
+    }
+#endif
+    *pr = r;
+    return s;
+}
+
+/* return floor(sqrt(a)) */
+limb_t bf_isqrt(limb_t a)
+{
+    limb_t s, r;
+    int k;
+
+    if (a == 0)
+        return 0;
+    k = clz(a) & ~1;
+    s = mp_sqrtrem1(&r, a << k);
+    s >>= (k >> 1);
+    return s;
+}
+
+static limb_t mp_sqrtrem2(limb_t *tabs, limb_t *taba)
+{
+    limb_t s1, r1, s, q, u, a0, a1;
+    dlimb_t r, num;
+    int l;
+
+    a0 = taba[0];
+    a1 = taba[1];
+    s1 = mp_sqrtrem1(&r1, a1);
+    l = LIMB_BITS / 2;
+    num = ((dlimb_t)r1 << l) | (a0 >> l);
+    q = num / (2 * s1);
+    u = num % (2 * s1);
+    s = (s1 << l) + q;
+    r = ((dlimb_t)u << l) | (a0 & (((limb_t)1 << l) - 1));
+    if (unlikely((q >> l) != 0))
+        r -= (dlimb_t)1 << LIMB_BITS; /* special case when q=2^l */
+    else
+        r -= q * q;
+    if ((slimb_t)(r >> LIMB_BITS) < 0) {
+        s--;
+        r += 2 * (dlimb_t)s + 1;
+    }
+    tabs[0] = s;
+    taba[0] = r;
+    return r >> LIMB_BITS;
+}
+
+//#define DEBUG_SQRTREM
+
+/* tmp_buf must contain (n / 2 + 1 limbs). *prh contains the highest
+   limb of the remainder. */
+static int mp_sqrtrem_rec(bf_context_t *s, limb_t *tabs, limb_t *taba, limb_t n,
+                          limb_t *tmp_buf, limb_t *prh)
+{
+    limb_t l, h, rh, ql, qh, c, i;
+    
+    if (n == 1) {
+        *prh = mp_sqrtrem2(tabs, taba);
+        return 0;
+    }
+#ifdef DEBUG_SQRTREM
+    mp_print_str("a", taba, 2 * n);
+#endif
+    l = n / 2;
+    h = n - l;
+    if (mp_sqrtrem_rec(s, tabs + l, taba + 2 * l, h, tmp_buf, &qh))
+        return -1;
+#ifdef DEBUG_SQRTREM
+    mp_print_str("s1", tabs + l, h);
+    mp_print_str_h("r1", taba + 2 * l, h, qh);
+    mp_print_str_h("r2", taba + l, n, qh);
+#endif
+    
+    /* the remainder is in taba + 2 * l. Its high bit is in qh */
+    if (qh) {
+        mp_sub(taba + 2 * l, taba + 2 * l, tabs + l, h, 0);
+    }
+    /* instead of dividing by 2*s, divide by s (which is normalized)
+       and update q and r */
+    if (mp_divnorm(s, tmp_buf, taba + l, n, tabs + l, h))
+        return -1;
+    qh += tmp_buf[l];
+    for(i = 0; i < l; i++)
+        tabs[i] = tmp_buf[i];
+    ql = mp_shr(tabs, tabs, l, 1, qh & 1);
+    qh = qh >> 1; /* 0 or 1 */
+    if (ql)
+        rh = mp_add(taba + l, taba + l, tabs + l, h, 0);
+    else
+        rh = 0;
+#ifdef DEBUG_SQRTREM
+    mp_print_str_h("q", tabs, l, qh);
+    mp_print_str_h("u", taba + l, h, rh);
+#endif
+    
+    mp_add_ui(tabs + l, qh, h);
+#ifdef DEBUG_SQRTREM
+    mp_print_str_h("s2", tabs, n, sh);
+#endif
+    
+    /* q = qh, tabs[l - 1 ... 0], r = taba[n - 1 ... l] */
+    /* subtract q^2. if qh = 1 then q = B^l, so we can take shortcuts */
+    if (qh) {
+        c = qh;
+    } else {
+        if (mp_mul(s, taba + n, tabs, l, tabs, l))
+            return -1;
+        c = mp_sub(taba, taba, taba + n, 2 * l, 0);
+    }
+    rh -= mp_sub_ui(taba + 2 * l, c, n - 2 * l);
+    if ((slimb_t)rh < 0) {
+        mp_sub_ui(tabs, 1, n);
+        rh += mp_add_mul1(taba, tabs, n, 2);
+        rh += mp_add_ui(taba, 1, n);
+    }
+    *prh = rh;
+    return 0;
+}
+
+/* 'taba' has 2*n limbs with n >= 1 and taba[2*n-1] >= 2 ^ (LIMB_BITS
+   - 2). Return (s, r) with s=floor(sqrt(a)) and r=a-s^2. 0 <= r <= 2
+   * s. tabs has n limbs. r is returned in the lower n limbs of
+   taba. Its r[n] is the returned value of the function. */
+/* Algorithm from the article "Karatsuba Square Root" by Paul Zimmermann and
+   inspirated from its GMP implementation */
+int mp_sqrtrem(bf_context_t *s, limb_t *tabs, limb_t *taba, limb_t n)
+{
+    limb_t tmp_buf1[8];
+    limb_t *tmp_buf;
+    mp_size_t n2;
+    int ret;
+    n2 = n / 2 + 1;
+    if (n2 <= countof(tmp_buf1)) {
+        tmp_buf = tmp_buf1;
+    } else {
+        tmp_buf = bf_malloc(s, sizeof(limb_t) * n2);
+        if (!tmp_buf)
+            return -1;
+    }
+    ret = mp_sqrtrem_rec(s, tabs, taba, n, tmp_buf, taba + n);
+    if (tmp_buf != tmp_buf1)
+        bf_free(s, tmp_buf);
+    return ret;
+}
+
+/* Integer square root with remainder. 'a' must be an integer. r =
+   floor(sqrt(a)) and rem = a - r^2.  BF_ST_INEXACT is set if the result
+   is inexact. 'rem' can be NULL if the remainder is not needed. */
+int bf_sqrtrem(bf_t *r, bf_t *rem1, const bf_t *a)
+{
+    int ret;
+    
+    if (a->len == 0) {
+        if (a->expn == BF_EXP_NAN) {
+            bf_set_nan(r);
+        } else if (a->expn == BF_EXP_INF && a->sign) {
+            goto invalid_op;
+        } else {
+            bf_set(r, a);
+        }
+        if (rem1)
+            bf_set_ui(rem1, 0);
+        ret = 0;
+    } else if (a->sign) {
+ invalid_op:
+        bf_set_nan(r);
+        if (rem1)
+            bf_set_ui(rem1, 0);
+        ret = BF_ST_INVALID_OP;
+    } else {
+        bf_t rem_s, *rem;
+        
+        bf_sqrt(r, a, (a->expn + 1) / 2, BF_RNDZ);
+        bf_rint(r, BF_RNDZ);
+        /* see if the result is exact by computing the remainder */
+        if (rem1) {
+            rem = rem1;
+        } else {
+            rem = &rem_s;
+            bf_init(r->ctx, rem);
+        }
+        /* XXX: could avoid recomputing the remainder */
+        bf_mul(rem, r, r, BF_PREC_INF, BF_RNDZ);
+        bf_neg(rem);
+        bf_add(rem, rem, a, BF_PREC_INF, BF_RNDZ);
+        if (bf_is_nan(rem)) {
+            ret = BF_ST_MEM_ERROR;
+            goto done;
+        }
+        if (rem->len != 0) {
+            ret = BF_ST_INEXACT;
+        } else {
+            ret = 0;
+        }
+    done:
+        if (!rem1)
+            bf_delete(rem);
+    }
+    return ret;
+}
+
+int bf_sqrt(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
+{
+    bf_context_t *s = a->ctx;
+    int ret;
+
+    assert(r != a);
+
+    if (a->len == 0) {
+        if (a->expn == BF_EXP_NAN) {
+            bf_set_nan(r);
+        } else if (a->expn == BF_EXP_INF && a->sign) {
+            goto invalid_op;
+        } else {
+            bf_set(r, a);
+        }
+        ret = 0;
+    } else if (a->sign) {
+ invalid_op:
+        bf_set_nan(r);
+        ret = BF_ST_INVALID_OP;
+    } else {
+        limb_t *a1;
+        slimb_t n, n1;
+        limb_t res;
+        
+        /* convert the mantissa to an integer with at least 2 *
+           prec + 4 bits */
+        n = (2 * (prec + 2) + 2 * LIMB_BITS - 1) / (2 * LIMB_BITS);
+        if (bf_resize(r, n))
+            goto fail;
+        a1 = bf_malloc(s, sizeof(limb_t) * 2 * n);
+        if (!a1)
+            goto fail;
+        n1 = bf_min(2 * n, a->len);
+        memset(a1, 0, (2 * n - n1) * sizeof(limb_t));
+        memcpy(a1 + 2 * n - n1, a->tab + a->len - n1, n1 * sizeof(limb_t));
+        if (a->expn & 1) {
+            res = mp_shr(a1, a1, 2 * n, 1, 0);
+        } else {
+            res = 0;
+        }
+        if (mp_sqrtrem(s, r->tab, a1, n)) {
+            bf_free(s, a1);
+            goto fail;
+        }
+        if (!res) {
+            res = mp_scan_nz(a1, n + 1);
+        }
+        bf_free(s, a1);
+        if (!res) {
+            res = mp_scan_nz(a->tab, a->len - n1);
+        }
+        if (res != 0)
+            r->tab[0] |= 1;
+        r->sign = 0;
+        r->expn = (a->expn + 1) >> 1;
+        ret = bf_round(r, prec, flags);
+    }
+    return ret;
+ fail:
+    bf_set_nan(r);
+    return BF_ST_MEM_ERROR;
+}
+
+static no_inline int bf_op2(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
+                            bf_flags_t flags, bf_op2_func_t *func)
+{
+    bf_t tmp;
+    int ret;
+    
+    if (r == a || r == b) {
+        bf_init(r->ctx, &tmp);
+        ret = func(&tmp, a, b, prec, flags);
+        bf_move(r, &tmp);
+    } else {
+        ret = func(r, a, b, prec, flags);
+    }
+    return ret;
+}
+
+int bf_add(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
+            bf_flags_t flags)
+{
+    return bf_op2(r, a, b, prec, flags, __bf_add);
+}
+
+int bf_sub(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
+            bf_flags_t flags)
+{
+    return bf_op2(r, a, b, prec, flags, __bf_sub);
+}
+
+int bf_div(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
+           bf_flags_t flags)
+{
+    return bf_op2(r, a, b, prec, flags, __bf_div);
+}
+
+int bf_mul_ui(bf_t *r, const bf_t *a, uint64_t b1, limb_t prec,
+               bf_flags_t flags)
+{
+    bf_t b;
+    int ret;
+    bf_init(r->ctx, &b);
+    ret = bf_set_ui(&b, b1);
+    ret |= bf_mul(r, a, &b, prec, flags);
+    bf_delete(&b);
+    return ret;
+}
+
+int bf_mul_si(bf_t *r, const bf_t *a, int64_t b1, limb_t prec,
+               bf_flags_t flags)
+{
+    bf_t b;
+    int ret;
+    bf_init(r->ctx, &b);
+    ret = bf_set_si(&b, b1);
+    ret |= bf_mul(r, a, &b, prec, flags);
+    bf_delete(&b);
+    return ret;
+}
+
+int bf_add_si(bf_t *r, const bf_t *a, int64_t b1, limb_t prec,
+              bf_flags_t flags)
+{
+    bf_t b;
+    int ret;
+    
+    bf_init(r->ctx, &b);
+    ret = bf_set_si(&b, b1);
+    ret |= bf_add(r, a, &b, prec, flags);
+    bf_delete(&b);
+    return ret;
+}
+
+static int bf_pow_ui(bf_t *r, const bf_t *a, limb_t b, limb_t prec,
+                     bf_flags_t flags)
+{
+    int ret, n_bits, i;
+    
+    assert(r != a);
+    if (b == 0)
+        return bf_set_ui(r, 1);
+    ret = bf_set(r, a);
+    n_bits = LIMB_BITS - clz(b);
+    for(i = n_bits - 2; i >= 0; i--) {
+        ret |= bf_mul(r, r, r, prec, flags);
+        if ((b >> i) & 1)
+            ret |= bf_mul(r, r, a, prec, flags);
+    }
+    return ret;
+}
+
+static int bf_pow_ui_ui(bf_t *r, limb_t a1, limb_t b,
+                        limb_t prec, bf_flags_t flags)
+{
+    bf_t a;
+    int ret;
+    
+    if (a1 == 10 && b <= LIMB_DIGITS) {
+        /* use precomputed powers. We do not round at this point
+           because we expect the caller to do it */
+        ret = bf_set_ui(r, mp_pow_dec[b]);
+    } else {
+        bf_init(r->ctx, &a);
+        ret = bf_set_ui(&a, a1);
+        ret |= bf_pow_ui(r, &a, b, prec, flags);
+        bf_delete(&a);
+    }
+    return ret;
+}
+
+/* convert to integer (infinite precision) */
+int bf_rint(bf_t *r, int rnd_mode)
+{
+    return bf_round(r, 0, rnd_mode | BF_FLAG_RADPNT_PREC);
+}
+
+/* logical operations */
+#define BF_LOGIC_OR  0
+#define BF_LOGIC_XOR 1
+#define BF_LOGIC_AND 2
+
+static inline limb_t bf_logic_op1(limb_t a, limb_t b, int op)
+{
+    switch(op) {
+    case BF_LOGIC_OR:
+        return a | b;
+    case BF_LOGIC_XOR:
+        return a ^ b;
+    default:
+    case BF_LOGIC_AND:
+        return a & b;
+    }
+}
+
+static int bf_logic_op(bf_t *r, const bf_t *a1, const bf_t *b1, int op)
+{
+    bf_t b1_s, a1_s, *a, *b;
+    limb_t a_sign, b_sign, r_sign;
+    slimb_t l, i, a_bit_offset, b_bit_offset;
+    limb_t v1, v2, v1_mask, v2_mask, r_mask;
+    int ret;
+    
+    assert(r != a1 && r != b1);
+
+    if (a1->expn <= 0)
+        a_sign = 0; /* minus zero is considered as positive */
+    else
+        a_sign = a1->sign;
+
+    if (b1->expn <= 0)
+        b_sign = 0; /* minus zero is considered as positive */
+    else
+        b_sign = b1->sign;
+    
+    if (a_sign) {
+        a = &a1_s;
+        bf_init(r->ctx, a);
+        if (bf_add_si(a, a1, 1, BF_PREC_INF, BF_RNDZ)) {
+            b = NULL;
+            goto fail;
+        }
+    } else {
+        a = (bf_t *)a1;
+    }
+
+    if (b_sign) {
+        b = &b1_s;
+        bf_init(r->ctx, b);
+        if (bf_add_si(b, b1, 1, BF_PREC_INF, BF_RNDZ))
+            goto fail;
+    } else {
+        b = (bf_t *)b1;
+    }
+    
+    r_sign = bf_logic_op1(a_sign, b_sign, op);
+    if (op == BF_LOGIC_AND && r_sign == 0) {
+        /* no need to compute extra zeros for and */
+        if (a_sign == 0 && b_sign == 0)
+            l = bf_min(a->expn, b->expn);
+        else if (a_sign == 0)
+            l = a->expn;
+        else
+            l = b->expn;
+    } else {
+        l = bf_max(a->expn, b->expn);
+    }
+    /* Note: a or b can be zero */
+    l = (bf_max(l, 1) + LIMB_BITS - 1) / LIMB_BITS;
+    if (bf_resize(r, l))
+        goto fail;
+    a_bit_offset = a->len * LIMB_BITS - a->expn;
+    b_bit_offset = b->len * LIMB_BITS - b->expn;
+    v1_mask = -a_sign;
+    v2_mask = -b_sign;
+    r_mask = -r_sign;
+    for(i = 0; i < l; i++) {
+        v1 = get_bits(a->tab, a->len, a_bit_offset + i * LIMB_BITS) ^ v1_mask;
+        v2 = get_bits(b->tab, b->len, b_bit_offset + i * LIMB_BITS) ^ v2_mask;
+        r->tab[i] = bf_logic_op1(v1, v2, op) ^ r_mask;
+    }
+    r->expn = l * LIMB_BITS;
+    r->sign = r_sign;
+    bf_normalize_and_round(r, BF_PREC_INF, BF_RNDZ); /* cannot fail */
+    if (r_sign) {
+        if (bf_add_si(r, r, -1, BF_PREC_INF, BF_RNDZ))
+            goto fail;
+    }
+    ret = 0;
+ done:
+    if (a == &a1_s)
+        bf_delete(a);
+    if (b == &b1_s)
+        bf_delete(b);
+    return ret;
+ fail:
+    bf_set_nan(r);
+    ret = BF_ST_MEM_ERROR;
+    goto done;
+}
+
+/* 'a' and 'b' must be integers. Return 0 or BF_ST_MEM_ERROR. */
+int bf_logic_or(bf_t *r, const bf_t *a, const bf_t *b)
+{
+    return bf_logic_op(r, a, b, BF_LOGIC_OR);
+}
+
+/* 'a' and 'b' must be integers. Return 0 or BF_ST_MEM_ERROR. */
+int bf_logic_xor(bf_t *r, const bf_t *a, const bf_t *b)
+{
+    return bf_logic_op(r, a, b, BF_LOGIC_XOR);
+}
+
+/* 'a' and 'b' must be integers. Return 0 or BF_ST_MEM_ERROR. */
+int bf_logic_and(bf_t *r, const bf_t *a, const bf_t *b)
+{
+    return bf_logic_op(r, a, b, BF_LOGIC_AND);
+}
+
+/* conversion between fixed size types */
+
+typedef union {
+    double d;
+    uint64_t u;
+} Float64Union;
+
+int bf_get_float64(const bf_t *a, double *pres, bf_rnd_t rnd_mode)
+{
+    Float64Union u;
+    int e, ret;
+    uint64_t m;
+    
+    ret = 0;
+    if (a->expn == BF_EXP_NAN) {
+        u.u = 0x7ff8000000000000; /* quiet nan */
+    } else {
+        bf_t b_s, *b = &b_s;
+        
+        bf_init(a->ctx, b);
+        bf_set(b, a);
+        if (bf_is_finite(b)) {
+            ret = bf_round(b, 53, rnd_mode | BF_FLAG_SUBNORMAL | bf_set_exp_bits(11));
+        }
+        if (b->expn == BF_EXP_INF) {
+            e = (1 << 11) - 1;
+            m = 0;
+        } else if (b->expn == BF_EXP_ZERO) {
+            e = 0;
+            m = 0;
+        } else {
+            e = b->expn + 1023 - 1;
+#if LIMB_BITS == 32
+            if (b->len == 2) {
+                m = ((uint64_t)b->tab[1] << 32) | b->tab[0];
+            } else {
+                m = ((uint64_t)b->tab[0] << 32);
+            }
+#else
+            m = b->tab[0];
+#endif
+            if (e <= 0) {
+                /* subnormal */
+                m = m >> (12 - e);
+                e = 0;
+            } else {
+                m = (m << 1) >> 12;
+            }
+        }
+        u.u = m | ((uint64_t)e << 52) | ((uint64_t)b->sign << 63);
+        bf_delete(b);
+    }
+    *pres = u.d;
+    return ret;
+}
+
+int bf_set_float64(bf_t *a, double d)
+{
+    Float64Union u;
+    uint64_t m;
+    int shift, e, sgn;
+    
+    u.d = d;
+    sgn = u.u >> 63;
+    e = (u.u >> 52) & ((1 << 11) - 1);
+    m = u.u & (((uint64_t)1 << 52) - 1);
+    if (e == ((1 << 11) - 1)) {
+        if (m != 0) {
+            bf_set_nan(a);
+        } else {
+            bf_set_inf(a, sgn);
+        }
+    } else if (e == 0) {
+        if (m == 0) {
+            bf_set_zero(a, sgn);
+        } else {
+            /* subnormal number */
+            m <<= 12;
+            shift = clz64(m);
+            m <<= shift;
+            e = -shift;
+            goto norm;
+        }
+    } else {
+        m = (m << 11) | ((uint64_t)1 << 63);
+    norm:
+        a->expn = e - 1023 + 1;
+#if LIMB_BITS == 32
+        if (bf_resize(a, 2))
+            goto fail;
+        a->tab[0] = m;
+        a->tab[1] = m >> 32;
+#else
+        if (bf_resize(a, 1))
+            goto fail;
+        a->tab[0] = m;
+#endif
+        a->sign = sgn;
+    }
+    return 0;
+fail:
+    bf_set_nan(a);
+    return BF_ST_MEM_ERROR;
+}
+
/* Convert 'a' to a 32 bit signed integer in *pres, truncating toward
   zero (the rounding mode is always BF_RNDZ). Return BF_ST_INVALID_OP
   if there is an overflow and 0 otherwise. With BF_GET_INT_MOD, the
   low 32 bits of the integer part are returned instead of
   saturating. */
int bf_get_int32(int *pres, const bf_t *a, int flags)
{
    uint32_t v;
    int ret;
    if (a->expn >= BF_EXP_INF) {
        /* NaN or +/-infinity */
        ret = BF_ST_INVALID_OP;
        if (flags & BF_GET_INT_MOD) {
            v = 0;
        } else if (a->expn == BF_EXP_INF) {
            /* +inf -> INT32_MAX, -inf -> INT32_MIN */
            v = (uint32_t)INT32_MAX + a->sign;
        } else {
            v = INT32_MAX;
        }
    } else if (a->expn <= 0) {
        /* |a| < 1 truncates to 0 */
        v = 0;
        ret = 0;
    } else if (a->expn <= 31) {
        /* integer part fits in 31 bits plus sign: take the top
           'expn' bits of the most significant limb */
        v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn);
        if (a->sign)
            v = -v;
        ret = 0;
    } else if (!(flags & BF_GET_INT_MOD)) {
        ret = BF_ST_INVALID_OP;
        if (a->sign) {
            /* INT32_MIN has magnitude 2^31 and is still representable */
            v = (uint32_t)INT32_MAX + 1;
            if (a->expn == 32 && 
                (a->tab[a->len - 1] >> (LIMB_BITS - 32)) == v) {
                ret = 0;
            }
        } else {
            v = INT32_MAX;
        }
    } else {
        /* BF_GET_INT_MOD: value modulo 2^32 */
        v = get_bits(a->tab, a->len, a->len * LIMB_BITS - a->expn); 
        if (a->sign)
            v = -v;
        ret = 0;
    }
    *pres = v;
    return ret;
}
+
/* Convert 'a' to a 64 bit signed integer in *pres, truncating toward
   zero (the rounding mode is always BF_RNDZ). Return BF_ST_INVALID_OP
   if there is an overflow and 0 otherwise. With BF_GET_INT_MOD, the
   low 64 bits of the integer part are returned instead of
   saturating. */
int bf_get_int64(int64_t *pres, const bf_t *a, int flags)
{
    uint64_t v;
    int ret;
    if (a->expn >= BF_EXP_INF) {
        /* NaN or +/-infinity */
        ret = BF_ST_INVALID_OP;
        if (flags & BF_GET_INT_MOD) {
            v = 0;
        } else if (a->expn == BF_EXP_INF) {
            /* +inf -> INT64_MAX, -inf -> INT64_MIN */
            v = (uint64_t)INT64_MAX + a->sign;
        } else {
            v = INT64_MAX;
        }
    } else if (a->expn <= 0) {
        /* |a| < 1 truncates to 0 */
        v = 0;
        ret = 0;
    } else if (a->expn <= 63) {
        /* integer part fits in 63 bits plus sign */
#if LIMB_BITS == 32
        if (a->expn <= 32)
            v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn);
        else
            v = (((uint64_t)a->tab[a->len - 1] << 32) |
                 get_limbz(a, a->len - 2)) >> (64 - a->expn);
#else
        v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn);
#endif
        if (a->sign)
            v = -v;
        ret = 0;
    } else if (!(flags & BF_GET_INT_MOD)) {
        ret = BF_ST_INVALID_OP;
        if (a->sign) {
            uint64_t v1;
            /* INT64_MIN has magnitude 2^63 and is still representable */
            v = (uint64_t)INT64_MAX + 1;
            if (a->expn == 64) {
                v1 = a->tab[a->len - 1];
#if LIMB_BITS == 32
                v1 = (v1 << 32) | get_limbz(a, a->len - 2);
#endif
                if (v1 == v)
                    ret = 0;
            }
        } else {
            v = INT64_MAX;
        }
    } else {
        /* BF_GET_INT_MOD: value modulo 2^64 */
        slimb_t bit_pos = a->len * LIMB_BITS - a->expn;
        v = get_bits(a->tab, a->len, bit_pos); 
#if LIMB_BITS == 32
        v |= (uint64_t)get_bits(a->tab, a->len, bit_pos + 32) << 32;
#endif
        if (a->sign)
            v = -v;
        ret = 0;
    }
    *pres = v;
    return ret;
}
+
/* Convert 'a' to a 64 bit unsigned integer in *pres, truncating
   toward zero (the rounding mode is always BF_RNDZ). Return
   BF_ST_INVALID_OP if there is an overflow (including NaN or a
   negative value) and 0 otherwise. */
int bf_get_uint64(uint64_t *pres, const bf_t *a)
{
    uint64_t v;
    int ret;
    if (a->expn == BF_EXP_NAN) {
        goto overflow;
    } else if (a->expn <= 0) {
        /* |a| < 1 truncates to 0 */
        v = 0;
        ret = 0;
    } else if (a->sign) {
        /* negative values cannot be represented */
        v = 0;
        ret = BF_ST_INVALID_OP;
    } else if (a->expn <= 64) {
        /* integer part fits in 64 bits */
#if LIMB_BITS == 32
        if (a->expn <= 32)
            v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn);
        else
            v = (((uint64_t)a->tab[a->len - 1] << 32) |
                 get_limbz(a, a->len - 2)) >> (64 - a->expn);
#else
        v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn);
#endif
        ret = 0;
    } else {
    overflow:
        /* too large (or NaN/+inf): saturate to UINT64_MAX */
        v = UINT64_MAX;
        ret = BF_ST_INVALID_OP;
    }
    *pres = v;
    return ret;
}
+
+/* base conversion from radix */
+
/* digits_per_limb_table[r - 2] = number of base 'r' digits that fit
   in one limb, i.e. floor(LIMB_BITS / log2(r)), for r in [2, 36]. */
static const uint8_t digits_per_limb_table[BF_RADIX_MAX - 1] = {
#if LIMB_BITS == 32
32,20,16,13,12,11,10,10, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
#else
64,40,32,27,24,22,21,20,19,18,17,17,16,16,16,15,15,15,14,14,14,14,13,13,13,13,13,13,13,12,12,12,12,12,12,
#endif
};
+
+static limb_t get_limb_radix(int radix)
+{
+    int i, k;
+    limb_t radixl;
+    
+    k = digits_per_limb_table[radix - 2];
+    radixl = radix;
+    for(i = 1; i < k; i++)
+        radixl *= radix;
+    return radixl;
+}
+
/* Convert the n limbs tab[0..n-1] (least significant first, each limb
   being one digit in base 'radix' -- the caller passes the limb radix
   radixl) into the integer 'r', using a divide and conquer split.
   pow_tab[level] lazily caches radix^n2 for each recursion level.
   return != 0 if error */
static int bf_integer_from_radix_rec(bf_t *r, const limb_t *tab,
                                     limb_t n, int level, limb_t n0,
                                     limb_t radix, bf_t *pow_tab)
{
    int ret;
    if (n == 1) {
        ret = bf_set_ui(r, tab[0]);
    } else {
        bf_t T_s, *T = &T_s, *B;
        limb_t n1, n2;
        
        /* split point derived from the original size 'n0' and the
           depth so every call at the same level uses the same n2
           (and hence can share pow_tab[level]) */
        n2 = (((n0 * 2) >> (level + 1)) + 1) / 2;
        n1 = n - n2;
        //        printf("level=%d n0=%ld n1=%ld n2=%ld\n", level, n0, n1, n2);
        B = &pow_tab[level];
        if (B->len == 0) {
            /* lazily compute radix^n2 for this level */
            ret = bf_pow_ui_ui(B, radix, n2, BF_PREC_INF, BF_RNDZ);
            if (ret)
                return ret;
        }
        /* r = high_part * radix^n2 + low_part */
        ret = bf_integer_from_radix_rec(r, tab + n2, n1, level + 1, n0,
                                        radix, pow_tab);
        if (ret)
            return ret;
        ret = bf_mul(r, r, B, BF_PREC_INF, BF_RNDZ);
        if (ret)
            return ret;
        bf_init(r->ctx, T);
        ret = bf_integer_from_radix_rec(T, tab, n2, level + 1, n0,
                                        radix, pow_tab);
        if (!ret)
            ret = bf_add(r, r, T, BF_PREC_INF, BF_RNDZ);
        bf_delete(T);
    }
    return ret;
    //    bf_print_str("  r=", r);
}
+
/* Convert the n limbs of 'tab' (least significant first, each limb
   holding digits_per_limb base 'radix' digits) into the integer 'r'.
   return 0 if OK != 0 if memory error */
static int bf_integer_from_radix(bf_t *r, const limb_t *tab,
                                 limb_t n, limb_t radix)
{
    bf_context_t *s = r->ctx;
    int pow_tab_len, i, ret;
    limb_t radixl;
    bf_t *pow_tab;
    
    radixl = get_limb_radix(radix);
    /* one cached power per recursion level of the divide and conquer */
    pow_tab_len = ceil_log2(n) + 2; /* XXX: check */
    pow_tab = bf_malloc(s, sizeof(pow_tab[0]) * pow_tab_len);
    if (!pow_tab)
        return -1;
    for(i = 0; i < pow_tab_len; i++)
        bf_init(r->ctx, &pow_tab[i]);
    ret = bf_integer_from_radix_rec(r, tab, n, 0, n, radixl, pow_tab);
    for(i = 0; i < pow_tab_len; i++) {
        bf_delete(&pow_tab[i]);
    }
    bf_free(s, pow_tab);
    return ret;
}
+
/* compute and round r = T * radix^expn to 'prec' bits. With
   BF_PREC_INF the result must be representable exactly; otherwise a
   Ziv iteration increases the working precision until the result can
   be correctly rounded. */
int bf_mul_pow_radix(bf_t *r, const bf_t *T, limb_t radix,
                     slimb_t expn, limb_t prec, bf_flags_t flags)
{
    int ret, expn_sign, overflow;
    slimb_t e, extra_bits, prec1, ziv_extra_bits;
    bf_t B_s, *B = &B_s;

    if (T->len == 0) {
        /* zero, NaN or infinity: radix^expn cannot change the value */
        return bf_set(r, T);
    } else if (expn == 0) {
        ret = bf_set(r, T);
        ret |= bf_round(r, prec, flags);
        return ret;
    }

    e = expn;
    expn_sign = 0;
    if (e < 0) {
        e = -e;
        expn_sign = 1;
    }
    bf_init(r->ctx, B);
    if (prec == BF_PREC_INF) {
        /* infinite precision: only used if the result is known to be exact */
        ret = bf_pow_ui_ui(B, radix, e, BF_PREC_INF, BF_RNDN);
        if (expn_sign) {
            ret |= bf_div(r, T, B, T->len * LIMB_BITS, BF_RNDN);
        } else {
            ret |= bf_mul(r, T, B, BF_PREC_INF, BF_RNDN);
        }
    } else {
        ziv_extra_bits = 16;
        for(;;) {
            prec1 = prec + ziv_extra_bits;
            /* XXX: correct overflow/underflow handling */
            /* XXX: rigorous error analysis needed */
            extra_bits = ceil_log2(e) * 2 + 1;
            ret = bf_pow_ui_ui(B, radix, e, prec1 + extra_bits, BF_RNDN | BF_FLAG_EXT_EXP);
            overflow = !bf_is_finite(B);
            /* XXX: if bf_pow_ui_ui returns an exact result, can stop
               after the next operation */
            if (expn_sign)
                ret |= bf_div(r, T, B, prec1 + extra_bits, BF_RNDN | BF_FLAG_EXT_EXP);
            else
                ret |= bf_mul(r, T, B, prec1 + extra_bits, BF_RNDN | BF_FLAG_EXT_EXP);
            if (ret & BF_ST_MEM_ERROR)
                break;
            if ((ret & BF_ST_INEXACT) &&
                !bf_can_round(r, prec, flags & BF_RND_MASK, prec1) &&
                !overflow) {
                /* add more precision and retry (Ziv iteration) */
                ziv_extra_bits = ziv_extra_bits  + (ziv_extra_bits / 2);
            } else {
                /* XXX: need to use __bf_round() to pass the inexact
                   flag for the subnormal case */
                ret = bf_round(r, prec, flags) | (ret & BF_ST_INEXACT);
                break;
            }
        }
    }
    bf_delete(B);
    return ret;
}
+
/* Return the numeric value (0..35) of the ASCII digit or letter 'c',
   or 36 if 'c' is not a valid digit in any radix up to 36. */
static inline int to_digit(int c)
{
    if ('0' <= c && c <= '9')
        return c - '0';
    if ('A' <= c && c <= 'Z')
        return 10 + (c - 'A');
    if ('a' <= c && c <= 'z')
        return 10 + (c - 'a');
    return 36; /* not a digit */
}
+
/* add a limb at 'pos' and decrement pos. new space is created if
   needed. Return 0 if OK, -1 if memory error */
static int bf_add_limb(bf_t *a, slimb_t *ppos, limb_t v)
{
    slimb_t pos;
    pos = *ppos;
    if (unlikely(pos < 0)) {
        limb_t new_size, d, *new_tab;
        /* grow by at least 50% to amortize reallocation cost */
        new_size = bf_max(a->len + 1, a->len * 3 / 2);
        new_tab = bf_realloc(a->ctx, a->tab, sizeof(limb_t) * new_size);
        if (!new_tab)
            return -1;
        a->tab = new_tab;
        /* limbs are filled from the top down, so shift the existing
           limbs to the high end and continue writing below them */
        d = new_size - a->len;
        memmove(a->tab + d, a->tab, a->len * sizeof(limb_t));
        a->len = new_size;
        pos += d;
    }
    a->tab[pos--] = v;
    *ppos = pos;
    return 0;
}
+
/* ASCII-only lower-casing (locale independent, unlike tolower()). */
static int bf_tolower(int c)
{
    return (c >= 'A' && c <= 'Z') ? c + ('a' - 'A') : c;
}
+
/* Return 1 if 'str' starts with 'val' (case insensitive; 'val' must
   be lower case). On match, if ptr != NULL, *ptr points just past the
   matched prefix in 'str'. Return 0 otherwise. */
static int strcasestart(const char *str, const char *val, const char **ptr)
{
    const char *s = str;

    for (; *val != '\0'; s++, val++) {
        if (bf_tolower(*s) != *val)
            return 0;
    }
    if (ptr)
        *ptr = s;
    return 1;
}
+
/* Common parser behind bf_atof()/bf_atof2()/bfdec_atof(). Parses an
   optional sign, "nan"/"inf", an optional 0x/0o/0b radix prefix,
   mantissa digits with at most one decimal point, and an optional
   decimal/binary exponent. If is_dec is true, 'r' is actually a
   bfdec_t and radix must be 10. If pnext != NULL, *pnext receives a
   pointer past the consumed text. Returns the rounding status. */
static int bf_atof_internal(bf_t *r, slimb_t *pexponent,
                            const char *str, const char **pnext, int radix,
                            limb_t prec, bf_flags_t flags, BOOL is_dec)
{
    const char *p, *p_start;
    int is_neg, radix_bits, exp_is_neg, ret, digits_per_limb, shift;
    limb_t cur_limb;
    slimb_t pos, expn, int_len, digit_count;
    BOOL has_decpt, is_bin_exp;
    bf_t a_s, *a;
    
    *pexponent = 0;
    p = str;
    if (!(flags & BF_ATOF_NO_NAN_INF) && radix <= 16 &&
        strcasestart(p, "nan", &p)) {
        bf_set_nan(r);
        ret = 0;
        goto done;
    }
    is_neg = 0;
    
    if (p[0] == '+') {
        p++;
        p_start = p;
    } else if (p[0] == '-') {
        is_neg = 1;
        p++;
        p_start = p;
    } else {
        p_start = p;
    }
    /* optional 0x/0o/0b radix prefix */
    if (p[0] == '0') {
        if ((p[1] == 'x' || p[1] == 'X') &&
            (radix == 0 || radix == 16) &&
            !(flags & BF_ATOF_NO_HEX)) {
            radix = 16;
            p += 2;
        } else if ((p[1] == 'o' || p[1] == 'O') &&
                   radix == 0 && (flags & BF_ATOF_BIN_OCT)) {
            p += 2;
            radix = 8;
        } else if ((p[1] == 'b' || p[1] == 'B') &&
                   radix == 0 && (flags & BF_ATOF_BIN_OCT)) {
            p += 2;
            radix = 2;
        } else {
            goto no_prefix;
        }
        /* there must be a digit after the prefix */
        if (to_digit((uint8_t)*p) >= radix) {
            bf_set_nan(r);
            ret = 0;
            goto done;
        }
    no_prefix: ;
    } else {
        if (!(flags & BF_ATOF_NO_NAN_INF) && radix <= 16 &&
            strcasestart(p, "inf", &p)) {
            bf_set_inf(r, is_neg);
            ret = 0;
            goto done;
        }
    }
    
    if (radix == 0)
        radix = 10;
    if (is_dec) {
        /* decimal float: digits go directly into the bfdec_t 'r' */
        assert(radix == 10);
        radix_bits = 0;
        a = r;
    } else if ((radix & (radix - 1)) != 0) {
        radix_bits = 0; /* base is not a power of two */
        a = &a_s;
        bf_init(r->ctx, a);
    } else {
        radix_bits = ceil_log2(radix);
        a = r;
    }

    /* skip leading zeros */
    /* XXX: could also skip zeros after the decimal point */
    while (*p == '0')
        p++;

    if (radix_bits) {
        shift = digits_per_limb = LIMB_BITS;
    } else {
        radix_bits = 0;
        shift = digits_per_limb = digits_per_limb_table[radix - 2];
    }
    cur_limb = 0;
    /* NOTE(review): bf_resize() return value is unchecked here --
       presumably a 1 limb resize cannot fail in practice; verify */
    bf_resize(a, 1);
    pos = 0;
    has_decpt = FALSE;
    int_len = digit_count = 0;
    /* accumulate the mantissa digits, packing them into limbs written
       from 'pos' downward (most significant limb first) */
    for(;;) {
        limb_t c;
        if (*p == '.' && (p > p_start || to_digit(p[1]) < radix)) {
            if (has_decpt)
                break;
            has_decpt = TRUE;
            int_len = digit_count;
            p++;
        }
        c = to_digit(*p);
        if (c >= radix)
            break;
        digit_count++;
        p++;
        if (radix_bits) {
            /* power of two radix: shift the digit bits into place */
            shift -= radix_bits;
            if (shift <= 0) {
                cur_limb |= c >> (-shift);
                if (bf_add_limb(a, &pos, cur_limb))
                    goto mem_error;
                if (shift < 0)
                    cur_limb = c << (LIMB_BITS + shift);
                else
                    cur_limb = 0;
                shift += LIMB_BITS;
            } else {
                cur_limb |= c << shift;
            }
        } else {
            /* general radix: accumulate digits_per_limb digits per limb */
            cur_limb = cur_limb * radix + c;
            shift--;
            if (shift == 0) {
                if (bf_add_limb(a, &pos, cur_limb))
                    goto mem_error;
                shift = digits_per_limb;
                cur_limb = 0;
            }
        }
    }
    if (!has_decpt)
        int_len = digit_count;

    /* add the last limb and pad with zeros */
    if (shift != digits_per_limb) {
        if (radix_bits == 0) {
            while (shift != 0) {
                cur_limb *= radix;
                shift--;
            }
        }
        if (bf_add_limb(a, &pos, cur_limb)) {
        mem_error:
            ret = BF_ST_MEM_ERROR;
            if (!radix_bits)
                bf_delete(a);
            bf_set_nan(r);
            goto done;
        }
    }
            
    /* reset the next limbs to zero (we prefer to reallocate in the
       renormalization) */
    memset(a->tab, 0, (pos + 1) * sizeof(limb_t));

    if (p == p_start) {
        /* no digit was consumed: not a number */
        ret = 0;
        if (!radix_bits)
            bf_delete(a);
        bf_set_nan(r);
        goto done;
    }

    /* parse the exponent, if any */
    expn = 0;
    is_bin_exp = FALSE;
    if (((radix == 10 && (*p == 'e' || *p == 'E')) ||
         (radix != 10 && (*p == '@' ||
                          (radix_bits && (*p == 'p' || *p == 'P'))))) &&
        p > p_start) {
        is_bin_exp = (*p == 'p' || *p == 'P');
        p++;
        exp_is_neg = 0;
        if (*p == '+') {
            p++;
        } else if (*p == '-') {
            exp_is_neg = 1;
            p++;
        }
        for(;;) {
            int c;
            c = to_digit(*p);
            if (c >= 10)
                break;
            if (unlikely(expn > ((BF_RAW_EXP_MAX - 2 - 9) / 10))) {
                /* exponent overflow */
                if (exp_is_neg) {
                    bf_set_zero(r, is_neg);
                    ret = BF_ST_UNDERFLOW | BF_ST_INEXACT;
                } else {
                    bf_set_inf(r, is_neg);
                    ret = BF_ST_OVERFLOW | BF_ST_INEXACT;
                }
                goto done;
            }
            p++;
            expn = expn * 10 + c;
        }
        if (exp_is_neg)
            expn = -expn;
    }
    if (is_dec) {
        a->expn = expn + int_len;
        a->sign = is_neg;
        ret = bfdec_normalize_and_round((bfdec_t *)a, prec, flags);
    } else if (radix_bits) {
        /* XXX: may overflow */
        if (!is_bin_exp)
            expn *= radix_bits; 
        a->expn = expn + (int_len * radix_bits);
        a->sign = is_neg;
        ret = bf_normalize_and_round(a, prec, flags);
    } else {
        /* non power of two radix: convert the digit limbs to binary,
           then scale by radix^expn */
        limb_t l;
        pos++;
        l = a->len - pos; /* number of limbs */
        if (l == 0) {
            bf_set_zero(r, is_neg);
            ret = 0;
        } else {
            bf_t T_s, *T = &T_s;

            expn -= l * digits_per_limb - int_len;
            bf_init(r->ctx, T);
            if (bf_integer_from_radix(T, a->tab + pos, l, radix)) {
                bf_set_nan(r);
                ret = BF_ST_MEM_ERROR;
            } else {
                T->sign = is_neg;
                if (flags & BF_ATOF_EXPONENT) {
                    /* return the exponent */
                    *pexponent = expn;
                    ret = bf_set(r, T);
                } else {
                    ret = bf_mul_pow_radix(r, T, radix, expn, prec, flags);
                }
            }
            bf_delete(T);
        }
        bf_delete(a);
    }
 done:
    if (pnext)
        *pnext = p;
    return ret;
}
+
/* 
   Return (status, n, exp). 'status' is the floating point status. 'n'
   is the parsed number. 

   If (flags & BF_ATOF_EXPONENT) and if the radix is not a power of
   two, the parsed number is equal to r * radix^(*pexponent).
   Otherwise *pexponent = 0.
*/
int bf_atof2(bf_t *r, slimb_t *pexponent,
             const char *str, const char **pnext, int radix,
             limb_t prec, bf_flags_t flags)
{
    return bf_atof_internal(r, pexponent, str, pnext, radix, prec, flags,
                            FALSE);
}
+
+int bf_atof(bf_t *r, const char *str, const char **pnext, int radix,
+            limb_t prec, bf_flags_t flags)
+{
+    slimb_t dummy_exp;
+    return bf_atof_internal(r, &dummy_exp, str, pnext, radix, prec, flags, FALSE);
+}
+
+/* base conversion to radix */
+
/* RADIXL_10 = largest power of 10 fitting in a limb:
   10^19 for 64 bit limbs, 10^9 for 32 bit limbs */
#if LIMB_BITS == 64
#define RADIXL_10 UINT64_C(10000000000000000000)
#else
#define RADIXL_10 UINT64_C(1000000000)
#endif
+
/* inv_log2_radix[r - 2] ~= (1 / log2(r)) / 2 as a fixed point
   fraction stored in 32 bit words, most significant word first
   (e.g. entry 0 for r = 2 is 0x8000... = 0.5). Consumed by
   bf_mul_log2_radix() with is_inv = 1. */
static const uint32_t inv_log2_radix[BF_RADIX_MAX - 1][LIMB_BITS / 32 + 1] = {
#if LIMB_BITS == 32
{ 0x80000000, 0x00000000,},
{ 0x50c24e60, 0xd4d4f4a7,},
{ 0x40000000, 0x00000000,},
{ 0x372068d2, 0x0a1ee5ca,},
{ 0x3184648d, 0xb8153e7a,},
{ 0x2d983275, 0x9d5369c4,},
{ 0x2aaaaaaa, 0xaaaaaaab,},
{ 0x28612730, 0x6a6a7a54,},
{ 0x268826a1, 0x3ef3fde6,},
{ 0x25001383, 0xbac8a744,},
{ 0x23b46706, 0x82c0c709,},
{ 0x229729f1, 0xb2c83ded,},
{ 0x219e7ffd, 0xa5ad572b,},
{ 0x20c33b88, 0xda7c29ab,},
{ 0x20000000, 0x00000000,},
{ 0x1f50b57e, 0xac5884b3,},
{ 0x1eb22cc6, 0x8aa6e26f,},
{ 0x1e21e118, 0x0c5daab2,},
{ 0x1d9dcd21, 0x439834e4,},
{ 0x1d244c78, 0x367a0d65,},
{ 0x1cb40589, 0xac173e0c,},
{ 0x1c4bd95b, 0xa8d72b0d,},
{ 0x1bead768, 0x98f8ce4c,},
{ 0x1b903469, 0x050f72e5,},
{ 0x1b3b433f, 0x2eb06f15,},
{ 0x1aeb6f75, 0x9c46fc38,},
{ 0x1aa038eb, 0x0e3bfd17,},
{ 0x1a593062, 0xb38d8c56,},
{ 0x1a15f4c3, 0x2b95a2e6,},
{ 0x19d630dc, 0xcc7ddef9,},
{ 0x19999999, 0x9999999a,},
{ 0x195fec80, 0x8a609431,},
{ 0x1928ee7b, 0x0b4f22f9,},
{ 0x18f46acf, 0x8c06e318,},
{ 0x18c23246, 0xdc0a9f3d,},
#else
{ 0x80000000, 0x00000000, 0x00000000,},
{ 0x50c24e60, 0xd4d4f4a7, 0x021f57bc,},
{ 0x40000000, 0x00000000, 0x00000000,},
{ 0x372068d2, 0x0a1ee5ca, 0x19ea911b,},
{ 0x3184648d, 0xb8153e7a, 0x7fc2d2e1,},
{ 0x2d983275, 0x9d5369c4, 0x4dec1661,},
{ 0x2aaaaaaa, 0xaaaaaaaa, 0xaaaaaaab,},
{ 0x28612730, 0x6a6a7a53, 0x810fabde,},
{ 0x268826a1, 0x3ef3fde6, 0x23e2566b,},
{ 0x25001383, 0xbac8a744, 0x385a3349,},
{ 0x23b46706, 0x82c0c709, 0x3f891718,},
{ 0x229729f1, 0xb2c83ded, 0x15fba800,},
{ 0x219e7ffd, 0xa5ad572a, 0xe169744b,},
{ 0x20c33b88, 0xda7c29aa, 0x9bddee52,},
{ 0x20000000, 0x00000000, 0x00000000,},
{ 0x1f50b57e, 0xac5884b3, 0x70e28eee,},
{ 0x1eb22cc6, 0x8aa6e26f, 0x06d1a2a2,},
{ 0x1e21e118, 0x0c5daab1, 0x81b4f4bf,},
{ 0x1d9dcd21, 0x439834e3, 0x81667575,},
{ 0x1d244c78, 0x367a0d64, 0xc8204d6d,},
{ 0x1cb40589, 0xac173e0c, 0x3b7b16ba,},
{ 0x1c4bd95b, 0xa8d72b0d, 0x5879f25a,},
{ 0x1bead768, 0x98f8ce4c, 0x66cc2858,},
{ 0x1b903469, 0x050f72e5, 0x0cf5488e,},
{ 0x1b3b433f, 0x2eb06f14, 0x8c89719c,},
{ 0x1aeb6f75, 0x9c46fc37, 0xab5fc7e9,},
{ 0x1aa038eb, 0x0e3bfd17, 0x1bd62080,},
{ 0x1a593062, 0xb38d8c56, 0x7998ab45,},
{ 0x1a15f4c3, 0x2b95a2e6, 0x46aed6a0,},
{ 0x19d630dc, 0xcc7ddef9, 0x5aadd61b,},
{ 0x19999999, 0x99999999, 0x9999999a,},
{ 0x195fec80, 0x8a609430, 0xe1106014,},
{ 0x1928ee7b, 0x0b4f22f9, 0x5f69791d,},
{ 0x18f46acf, 0x8c06e318, 0x4d2aeb2c,},
{ 0x18c23246, 0xdc0a9f3d, 0x3fe16970,},
#endif
};
+
/* log2_radix[r - 2] ~= log2(r) * 2^(LIMB_BITS - 3), i.e. log2(r) as
   a fixed point value with 3 integer bits (log2(36) < 8). Consumed by
   bf_mul_log2_radix() with is_inv = 0. */
static const limb_t log2_radix[BF_RADIX_MAX - 1] = {
#if LIMB_BITS == 32
0x20000000,
0x32b80347,
0x40000000,
0x4a4d3c26,
0x52b80347,
0x59d5d9fd,
0x60000000,
0x6570068e,
0x6a4d3c26,
0x6eb3a9f0,
0x72b80347,
0x766a008e,
0x79d5d9fd,
0x7d053f6d,
0x80000000,
0x82cc7edf,
0x8570068e,
0x87ef05ae,
0x8a4d3c26,
0x8c8ddd45,
0x8eb3a9f0,
0x90c10501,
0x92b80347,
0x949a784c,
0x966a008e,
0x982809d6,
0x99d5d9fd,
0x9b74948f,
0x9d053f6d,
0x9e88c6b3,
0xa0000000,
0xa16bad37,
0xa2cc7edf,
0xa4231623,
0xa570068e,
#else
0x2000000000000000,
0x32b803473f7ad0f4,
0x4000000000000000,
0x4a4d3c25e68dc57f,
0x52b803473f7ad0f4,
0x59d5d9fd5010b366,
0x6000000000000000,
0x6570068e7ef5a1e8,
0x6a4d3c25e68dc57f,
0x6eb3a9f01975077f,
0x72b803473f7ad0f4,
0x766a008e4788cbcd,
0x79d5d9fd5010b366,
0x7d053f6d26089673,
0x8000000000000000,
0x82cc7edf592262d0,
0x8570068e7ef5a1e8,
0x87ef05ae409a0289,
0x8a4d3c25e68dc57f,
0x8c8ddd448f8b845a,
0x8eb3a9f01975077f,
0x90c10500d63aa659,
0x92b803473f7ad0f4,
0x949a784bcd1b8afe,
0x966a008e4788cbcd,
0x982809d5be7072dc,
0x99d5d9fd5010b366,
0x9b74948f5532da4b,
0x9d053f6d26089673,
0x9e88c6b3626a72aa,
0xa000000000000000,
0xa16bad3758efd873,
0xa2cc7edf592262d0,
0xa4231623369e78e6,
0xa570068e7ef5a1e8,
#endif
};
+
/* compute floor(a*b) or ceil(a*b) with b = log2(radix) or
   b=1/log2(radix). For is_inv = 0, strict accuracy is not guaranteed
   when radix is not a power of two. Uses the fixed point constants
   from log2_radix[] / inv_log2_radix[]. */
slimb_t bf_mul_log2_radix(slimb_t a1, unsigned int radix, int is_inv,
                          int is_ceil1)
{
    int is_neg;
    limb_t a;
    BOOL is_ceil;

    is_ceil = is_ceil1;
    a = a1;
    if (a1 < 0) {
        a = -a;
        is_neg = 1;
    } else {
        is_neg = 0;
    }
    /* ceil(-x) == -floor(x), so flip the rounding direction when the
       argument is negative */
    is_ceil ^= is_neg;
    if ((radix & (radix - 1)) == 0) {
        int radix_bits;
        /* radix is a power of two */
        radix_bits = ceil_log2(radix);
        if (is_inv) {
            if (is_ceil)
                a += radix_bits - 1;
            a = a / radix_bits;
        } else {
            a = a * radix_bits;
        }
    } else {
        const uint32_t *tab;
        limb_t b0, b1;
        dlimb_t t;
        
        if (is_inv) {
            /* multiply by the multi word fixed point approximation of
               1/log2(radix) (stored pre-divided by 2, hence the final
               shift by LIMB_BITS - 1) */
            tab = inv_log2_radix[radix - 2];
#if LIMB_BITS == 32
            b1 = tab[0];
            b0 = tab[1];
#else
            b1 = ((limb_t)tab[0] << 32) | tab[1];
            b0 = (limb_t)tab[2] << 32;
#endif
            t = (dlimb_t)b0 * (dlimb_t)a;
            t = (dlimb_t)b1 * (dlimb_t)a + (t >> LIMB_BITS);
            a = t >> (LIMB_BITS - 1);
        } else {
            /* multiply by log2(radix) in 3.(LIMB_BITS-3) fixed point */
            b0 = log2_radix[radix - 2];
            t = (dlimb_t)b0 * (dlimb_t)a;
            a = t >> (LIMB_BITS - 3);
        }
        /* a = floor(result) and 'result' cannot be an integer */
        a += is_ceil;
    }
    if (is_neg)
        a = -a;
    return a;
}
+
/* 'n' is the number of output limbs. Split the integer 'a' into n
   base 'radixl' digits stored in out[0..n-1], least significant
   first. pow_tab caches, per recursion level, radixl^n2 and its
   reciprocal (two consecutive entries). Return 0 if OK, -1 if memory
   error. */
static int bf_integer_to_radix_rec(bf_t *pow_tab,
                                   limb_t *out, const bf_t *a, limb_t n,
                                   int level, limb_t n0, limb_t radixl,
                                   unsigned int radixl_bits)
{
    limb_t n1, n2, q_prec;
    int ret;
    
    assert(n >= 1);
    if (n == 1) {
        out[0] = get_bits(a->tab, a->len, a->len * LIMB_BITS - a->expn);
    } else if (n == 2) {
        dlimb_t t;
        slimb_t pos;
        pos = a->len * LIMB_BITS - a->expn;
        t = ((dlimb_t)get_bits(a->tab, a->len, pos + LIMB_BITS) << LIMB_BITS) |
            get_bits(a->tab, a->len, pos);
        if (likely(radixl == RADIXL_10)) {
            /* use division by a constant when possible */
            out[0] = t % RADIXL_10;
            out[1] = t / RADIXL_10;
        } else {
            out[0] = t % radixl;
            out[1] = t / radixl;
        }
    } else {
        bf_t Q, R, *B, *B_inv;
        int q_add;
        bf_init(a->ctx, &Q);
        bf_init(a->ctx, &R);
        /* split point derived from n0 and the depth so all calls at
           the same level share the same cached power */
        n2 = (((n0 * 2) >> (level + 1)) + 1) / 2;
        n1 = n - n2;
        B = &pow_tab[2 * level];
        B_inv = &pow_tab[2 * level + 1];
        ret = 0;
        if (B->len == 0) {
            /* compute BASE^n2 */
            ret |= bf_pow_ui_ui(B, radixl, n2, BF_PREC_INF, BF_RNDZ);
            /* we use enough bits for the maximum possible 'n1' value,
               i.e. n2 + 1 */
            ret |= bf_set_ui(&R, 1);
            ret |= bf_div(B_inv, &R, B, (n2 + 1) * radixl_bits + 2, BF_RNDN);
        }
        //        printf("%d: n1=% " PRId64 " n2=%" PRId64 "\n", level, n1, n2);
        /* Q ~= a / B via the cached reciprocal, R = a - Q * B */
        q_prec = n1 * radixl_bits;
        ret |= bf_mul(&Q, a, B_inv, q_prec, BF_RNDN);
        ret |= bf_rint(&Q, BF_RNDZ);
        
        ret |= bf_mul(&R, &Q, B, BF_PREC_INF, BF_RNDZ);
        ret |= bf_sub(&R, a, &R, BF_PREC_INF, BF_RNDZ);

        if (ret & BF_ST_MEM_ERROR)
            goto fail;
        /* adjust Q by +/-1 steps until 0 <= R < B */
        q_add = 0;
        while (R.sign && R.len != 0) {
            if (bf_add(&R, &R, B, BF_PREC_INF, BF_RNDZ))
                goto fail;
            q_add--;
        }
        while (bf_cmpu(&R, B) >= 0) {
            if (bf_sub(&R, &R, B, BF_PREC_INF, BF_RNDZ))
                goto fail;
            q_add++;
        }
        if (q_add != 0) {
            if (bf_add_si(&Q, &Q, q_add, BF_PREC_INF, BF_RNDZ))
                goto fail;
        }
        if (bf_integer_to_radix_rec(pow_tab, out + n2, &Q, n1, level + 1, n0,
                                    radixl, radixl_bits))
            goto fail;
        if (bf_integer_to_radix_rec(pow_tab, out, &R, n2, level + 1, n0,
                                    radixl, radixl_bits)) {
        fail:
            bf_delete(&Q);
            bf_delete(&R);
            return -1;
        }
        bf_delete(&Q);
        bf_delete(&R);
    }
    return 0;
}
+
/* Convert the integer 'a' to base 'radixl' digits stored in the limb
   array of 'r' (which must already be sized to the wanted number of
   digits). return 0 if OK != 0 if memory error */
static int bf_integer_to_radix(bf_t *r, const bf_t *a, limb_t radixl)
{
    bf_context_t *s = r->ctx;
    limb_t r_len;
    bf_t *pow_tab;
    int i, pow_tab_len, ret;
    
    r_len = r->len;
    /* two cached entries (power and reciprocal) per recursion level */
    pow_tab_len = (ceil_log2(r_len) + 2) * 2; /* XXX: check */
    pow_tab = bf_malloc(s, sizeof(pow_tab[0]) * pow_tab_len);
    if (!pow_tab)
        return -1;
    for(i = 0; i < pow_tab_len; i++)
        bf_init(r->ctx, &pow_tab[i]);

    ret = bf_integer_to_radix_rec(pow_tab, r->tab, a, r_len, 0, r_len, radixl,
                                  ceil_log2(radixl));

    for(i = 0; i < pow_tab_len; i++) {
        bf_delete(&pow_tab[i]);
    }
    bf_free(s, pow_tab);
    return ret;
}
+
/* a must be >= 0. 'P' is the wanted number of digits in radix
   'radix'. 'r' is the mantissa represented as an integer. *pE
   contains the exponent. Return != 0 if memory error. Internally a
   Ziv iteration raises the binary working precision until the scaled
   mantissa can be correctly rounded to an integer. */
static int bf_convert_to_radix(bf_t *r, slimb_t *pE,
                               const bf_t *a, int radix,
                               limb_t P, bf_rnd_t rnd_mode,
                               BOOL is_fixed_exponent)
{
    slimb_t E, e, prec, extra_bits, ziv_extra_bits, prec0;
    bf_t B_s, *B = &B_s;
    int e_sign, ret, res;
    
    if (a->len == 0) {
        /* zero case */
        *pE = 0;
        return bf_set(r, a);
    }

    if (is_fixed_exponent) {
        E = *pE;
    } else {
        /* compute the new exponent: estimate of 1 + floor(log_radix(a)),
           corrected at the end of the loop if one too small */
        E = 1 + bf_mul_log2_radix(a->expn - 1, radix, TRUE, FALSE);
    }
    //    bf_print_str("a", a);
    //    printf("E=%ld P=%ld radix=%d\n", E, P, radix);
    
    for(;;) {
        /* r = round(a * radix^(P - E)) so that r has exactly P digits */
        e = P - E;
        e_sign = 0;
        if (e < 0) {
            e = -e;
            e_sign = 1;
        }
        /* Note: precision for log2(radix) is not critical here */
        prec0 = bf_mul_log2_radix(P, radix, FALSE, TRUE);
        ziv_extra_bits = 16;
        for(;;) {
            prec = prec0 + ziv_extra_bits;
            /* XXX: rigorous error analysis needed */
            extra_bits = ceil_log2(e) * 2 + 1;
            ret = bf_pow_ui_ui(r, radix, e, prec + extra_bits,
                               BF_RNDN | BF_FLAG_EXT_EXP);
            if (!e_sign)
                ret |= bf_mul(r, r, a, prec + extra_bits,
                              BF_RNDN | BF_FLAG_EXT_EXP);
            else
                ret |= bf_div(r, a, r, prec + extra_bits,
                              BF_RNDN | BF_FLAG_EXT_EXP);
            if (ret & BF_ST_MEM_ERROR)
                return BF_ST_MEM_ERROR;
            /* if the result is not exact, check that it can be safely
               rounded to an integer */
            if ((ret & BF_ST_INEXACT) &&
                !bf_can_round(r, r->expn, rnd_mode, prec)) {
                /* add more precision and retry (Ziv iteration) */
                ziv_extra_bits = ziv_extra_bits  + (ziv_extra_bits / 2);
                continue;
            } else {
                ret = bf_rint(r, rnd_mode);
                if (ret & BF_ST_MEM_ERROR)
                    return BF_ST_MEM_ERROR;
                break;
            }
        }
        if (is_fixed_exponent)
            break;
        /* check that the result is < B^P */
        /* XXX: do a fast approximate test first ? */
        bf_init(r->ctx, B);
        ret = bf_pow_ui_ui(B, radix, P, BF_PREC_INF, BF_RNDZ);
        if (ret) {
            bf_delete(B);
            return ret;
        }
        res = bf_cmpu(r, B);
        bf_delete(B);
        if (res < 0)
            break;
        /* try a larger exponent */
        E++;
    }
    *pE = E;
    return 0;
}
+
+static void limb_to_a(char *buf, limb_t n, unsigned int radix, int len)
+{
+    int digit, i;
+
+    if (radix == 10) {
+        /* specific case with constant divisor */
+        for(i = len - 1; i >= 0; i--) {
+            digit = (limb_t)n % 10;
+            n = (limb_t)n / 10;
+            buf[i] = digit + '0';
+        }
+    } else {
+        for(i = len - 1; i >= 0; i--) {
+            digit = (limb_t)n % radix;
+            n = (limb_t)n / radix;
+            if (digit < 10)
+                digit += '0';
+            else
+                digit += 'a' - 10;
+            buf[i] = digit;
+        }
+    }
+}
+
+/* for power of 2 radixes */
+static void limb_to_a2(char *buf, limb_t n, unsigned int radix_bits, int len)
+{
+    int digit, i;
+    unsigned int mask;
+
+    mask = (1 << radix_bits) - 1;
+    for(i = len - 1; i >= 0; i--) {
+        digit = n & mask;
+        n >>= radix_bits;
+        if (digit < 10)
+            digit += '0';
+        else
+            digit += 'a' - 10;
+        buf[i] = digit;
+    }
+}
+
/* 'a' must be an integer if the is_dec = FALSE or if the radix is not
   a power of two. A dot is added before the 'dot_pos' digit. dot_pos
   = n_digits does not display the dot. 0 <= dot_pos <=
   n_digits. n_digits >= 1. Digits are appended to the dynamic buffer
   's'. */
static void output_digits(DynBuf *s, const bf_t *a1, int radix, limb_t n_digits,
                          limb_t dot_pos, BOOL is_dec)
{
    limb_t i, v, l;
    slimb_t pos, pos_incr;
    int digits_per_limb, buf_pos, radix_bits, first_buf_pos;
    char buf[65];
    bf_t a_s, *a;

    if (is_dec) {
        /* bfdec_t: each limb already holds LIMB_DIGITS decimal digits */
        digits_per_limb = LIMB_DIGITS;
        a = (bf_t *)a1;
        radix_bits = 0;
        pos = a->len;
        pos_incr = 1;
        first_buf_pos = 0;
    } else if ((radix & (radix - 1)) == 0) {
        /* power of two radix: digits are read straight from the bits */
        a = (bf_t *)a1;
        radix_bits = ceil_log2(radix);
        digits_per_limb = LIMB_BITS / radix_bits;
        pos_incr = digits_per_limb * radix_bits;
        /* digits are aligned relative to the radix point */
        pos = a->len * LIMB_BITS + smod(-a->expn, radix_bits);
        first_buf_pos = 0;
    } else {
        limb_t n, radixl;

        /* general radix: first convert the integer 'a1' into limbs of
           base radixl in the temporary 'a' */
        digits_per_limb = digits_per_limb_table[radix - 2];
        radixl = get_limb_radix(radix);
        a = &a_s;
        bf_init(a1->ctx, a);
        n = (n_digits + digits_per_limb - 1) / digits_per_limb;
        if (bf_resize(a, n)) {
            dbuf_set_error(s);
            goto done;
        }
        if (bf_integer_to_radix(a, a1, radixl)) {
            dbuf_set_error(s);
            goto done;
        }
        radix_bits = 0;
        pos = n;
        pos_incr = 1;
        /* skip the leading zero padding of the most significant limb */
        first_buf_pos = pos * digits_per_limb - n_digits;
    }
    /* emit digits limb by limb via 'buf', inserting the dot when
       crossing 'dot_pos' */
    buf_pos = digits_per_limb;
    i = 0;
    while (i < n_digits) {
        if (buf_pos == digits_per_limb) {
            pos -= pos_incr;
            if (radix_bits == 0) {
                v = get_limbz(a, pos);
                limb_to_a(buf, v, radix, digits_per_limb);
            } else {
                v = get_bits(a->tab, a->len, pos);
                limb_to_a2(buf, v, radix_bits, digits_per_limb);
            }
            buf_pos = first_buf_pos;
            first_buf_pos = 0;
        }
        if (i < dot_pos) {
            l = dot_pos;
        } else {
            if (i == dot_pos)
                dbuf_putc(s, '.');
            l = n_digits;
        }
        /* copy as many digits as remain in 'buf' or before the next stop */
        l = bf_min(digits_per_limb - buf_pos, l - i);
        dbuf_put(s, (uint8_t *)(buf + buf_pos), l);
        buf_pos += l;
        i += l;
    }
 done:
    if (a != a1)
        bf_delete(a);
}
+
+/* DynBuf realloc callback that routes DynBuf allocations through the
+   bf context allocator ('opaque' is the bf_context_t). */
+static void *bf_dbuf_realloc(void *opaque, void *ptr, size_t size)
+{
+    bf_context_t *s = opaque;
+    return bf_realloc(s, ptr, size);
+}
+
+/* Convert 'a2' to a string in base 'radix'. A trailing '\0' is added
+   and *plen (if non-NULL) receives the length in bytes not counting
+   it. The string is allocated with the bf context allocator; NULL is
+   returned on memory error (then *plen is set to 0). 'is_dec'
+   selects the decimal (bfdec_t) representation. The output format is
+   selected by BF_FTOA_FORMAT_* in 'flags' (fractional, fixed number
+   of significant digits, or minimal free-form), with optional
+   "0x"/"0o"/"0b" prefix and JS-style quirks. */
+static char *bf_ftoa_internal(size_t *plen, const bf_t *a2, int radix,
+                              limb_t prec, bf_flags_t flags, BOOL is_dec)
+{
+    bf_context_t *ctx = a2->ctx;
+    DynBuf s_s, *s = &s_s;
+    int radix_bits;
+    
+    //    bf_print_str("ftoa", a2);
+    //    printf("radix=%d\n", radix);
+    dbuf_init2(s, ctx, bf_dbuf_realloc);
+    if (a2->expn == BF_EXP_NAN) {
+        dbuf_putstr(s, "NaN");
+    } else {
+        if (a2->sign)
+            dbuf_putc(s, '-');
+        if (a2->expn == BF_EXP_INF) {
+            if (flags & BF_FTOA_JS_QUIRKS)
+                dbuf_putstr(s, "Infinity");
+            else
+                dbuf_putstr(s, "Inf");
+        } else {
+            int fmt, ret;
+            slimb_t n_digits, n, i, n_max, n1;
+            bf_t a1_s, *a1 = &a1_s;
+
+            /* radix_bits = log2(radix) for power-of-two radixes,
+               0 otherwise */
+            if ((radix & (radix - 1)) != 0)
+                radix_bits = 0;
+            else
+                radix_bits = ceil_log2(radix);
+
+            fmt = flags & BF_FTOA_FORMAT_MASK;
+            bf_init(ctx, a1);
+            if (fmt == BF_FTOA_FORMAT_FRAC) {
+                /* 'prec' digits after the radix point */
+                if (is_dec || radix_bits != 0) {
+                    /* rounding is exact in the output radix */
+                    if (bf_set(a1, a2))
+                        goto fail1;
+#ifdef USE_BF_DEC
+                    if (is_dec) {
+                        if (bfdec_round((bfdec_t *)a1, prec, (flags & BF_RND_MASK) | BF_FLAG_RADPNT_PREC) & BF_ST_MEM_ERROR)
+                            goto fail1;
+                        n = a1->expn;
+                    } else
+#endif
+                    {
+                        if (bf_round(a1, prec * radix_bits, (flags & BF_RND_MASK) | BF_FLAG_RADPNT_PREC) & BF_ST_MEM_ERROR)
+                            goto fail1;
+                        n = ceil_div(a1->expn, radix_bits);
+                    }
+                    if (flags & BF_FTOA_ADD_PREFIX) {
+                        if (radix == 16)
+                            dbuf_putstr(s, "0x");
+                        else if (radix == 8)
+                            dbuf_putstr(s, "0o");
+                        else if (radix == 2)
+                            dbuf_putstr(s, "0b");
+                    }
+                    if (a1->expn == BF_EXP_ZERO) {
+                        dbuf_putstr(s, "0");
+                        if (prec > 0) {
+                            dbuf_putstr(s, ".");
+                            for(i = 0; i < prec; i++) {
+                                dbuf_putc(s, '0');
+                            }
+                        }
+                    } else {
+                        /* n = number of digits before the radix point */
+                        n_digits = prec + n;
+                        if (n <= 0) {
+                            /* 0.x */
+                            dbuf_putstr(s, "0.");
+                            for(i = 0; i < -n; i++) {
+                                dbuf_putc(s, '0');
+                            }
+                            if (n_digits > 0) {
+                                output_digits(s, a1, radix, n_digits, n_digits, is_dec);
+                            }
+                        } else {
+                            output_digits(s, a1, radix, n_digits, n, is_dec);
+                        }
+                    }
+                } else {
+                    size_t pos, start;
+                    bf_t a_s, *a = &a_s;
+
+                    /* make a positive number */
+                    a->tab = a2->tab;
+                    a->len = a2->len;
+                    a->expn = a2->expn;
+                    a->sign = 0;
+                    
+                    /* one more digit for the rounding */
+                    n = 1 + bf_mul_log2_radix(bf_max(a->expn, 0), radix, TRUE, TRUE);
+                    n_digits = n + prec;
+                    n1 = n;
+                    if (bf_convert_to_radix(a1, &n1, a, radix, n_digits,
+                                            flags & BF_RND_MASK, TRUE))
+                        goto fail1;
+                    start = s->size;
+                    output_digits(s, a1, radix, n_digits, n, is_dec);
+                    /* remove leading zeros because we allocated one more digit */
+                    pos = start;
+                    while ((pos + 1) < s->size && s->buf[pos] == '0' &&
+                           s->buf[pos + 1] != '.')
+                        pos++;
+                    if (pos > start) {
+                        memmove(s->buf + start, s->buf + pos, s->size - pos);
+                        s->size -= (pos - start);
+                    }
+                }
+            } else {
+                /* fixed number of significant digits or free format */
+#ifdef USE_BF_DEC
+                if (is_dec) {
+                    if (bf_set(a1, a2))
+                        goto fail1;
+                    if (fmt == BF_FTOA_FORMAT_FIXED) {
+                        n_digits = prec;
+                        n_max = n_digits;
+                        if (bfdec_round((bfdec_t *)a1, prec, (flags & BF_RND_MASK)) & BF_ST_MEM_ERROR)
+                            goto fail1;
+                    } else {
+                        /* prec is ignored */
+                        prec = n_digits = a1->len * LIMB_DIGITS;
+                        /* remove the trailing zero digits */
+                        while (n_digits > 1 &&
+                               get_digit(a1->tab, a1->len, prec - n_digits) == 0) {
+                            n_digits--;
+                        }
+                        n_max = n_digits + 4;
+                    }
+                    n = a1->expn;
+                } else
+#endif
+                if (radix_bits != 0) {
+                    if (bf_set(a1, a2))
+                        goto fail1;
+                    if (fmt == BF_FTOA_FORMAT_FIXED) {
+                        slimb_t prec_bits;
+                        n_digits = prec;
+                        n_max = n_digits;
+                        /* align to the radix point */
+                        prec_bits = prec * radix_bits -
+                            smod(-a1->expn, radix_bits);
+                        if (bf_round(a1, prec_bits,
+                                     (flags & BF_RND_MASK)) & BF_ST_MEM_ERROR)
+                            goto fail1;
+                    } else {
+                        limb_t digit_mask;
+                        slimb_t pos;
+                        /* position of the digit before the most
+                           significant digit in bits */
+                        pos = a1->len * LIMB_BITS +
+                            smod(-a1->expn, radix_bits);
+                        n_digits = ceil_div(pos, radix_bits);
+                        /* remove the trailing zero digits */
+                        digit_mask = ((limb_t)1 << radix_bits) - 1;
+                        while (n_digits > 1 &&
+                               (get_bits(a1->tab, a1->len, pos - n_digits * radix_bits) & digit_mask) == 0) {
+                            n_digits--;
+                        }
+                        n_max = n_digits + 4;
+                    }
+                    n = ceil_div(a1->expn, radix_bits);
+                } else {
+                    bf_t a_s, *a = &a_s;
+                    
+                    /* make a positive number */
+                    a->tab = a2->tab;
+                    a->len = a2->len;
+                    a->expn = a2->expn;
+                    a->sign = 0;
+                    
+                    if (fmt == BF_FTOA_FORMAT_FIXED) {
+                        n_digits = prec;
+                        n_max = n_digits;
+                    } else {
+                        slimb_t n_digits_max, n_digits_min;
+                        
+                        assert(prec != BF_PREC_INF);
+                        n_digits = 1 + bf_mul_log2_radix(prec, radix, TRUE, TRUE);
+                        /* max number of digits for non exponential
+                           notation. The rational is to have the same rule
+                           as JS i.e. n_max = 21 for 64 bit float in base 10. */
+                        n_max = n_digits + 4;
+                        if (fmt == BF_FTOA_FORMAT_FREE_MIN) {
+                            bf_t b_s, *b = &b_s;
+                            
+                            /* find the minimum number of digits by
+                               dichotomy. */
+                            /* XXX: inefficient */
+                            n_digits_max = n_digits;
+                            n_digits_min = 1;
+                            bf_init(ctx, b);
+                            while (n_digits_min < n_digits_max) {
+                                n_digits = (n_digits_min + n_digits_max) / 2;
+                                if (bf_convert_to_radix(a1, &n, a, radix, n_digits,
+                                                        flags & BF_RND_MASK, FALSE)) {
+                                    bf_delete(b);
+                                    goto fail1;
+                                }
+                                /* convert back to a number and compare */
+                                ret = bf_mul_pow_radix(b, a1, radix, n - n_digits,
+                                                       prec,
+                                                       (flags & ~BF_RND_MASK) |
+                                                       BF_RNDN);
+                                if (ret & BF_ST_MEM_ERROR) {
+                                    bf_delete(b);
+                                    goto fail1;
+                                }
+                                if (bf_cmpu(b, a) == 0) {
+                                    n_digits_max = n_digits;
+                                } else {
+                                    n_digits_min = n_digits + 1;
+                                }
+                            }
+                            bf_delete(b);
+                            n_digits = n_digits_max;
+                        }
+                    }
+                    if (bf_convert_to_radix(a1, &n, a, radix, n_digits,
+                                            flags & BF_RND_MASK, FALSE)) {
+                    fail1:
+                        bf_delete(a1);
+                        goto fail;
+                    }
+                }
+                if (a1->expn == BF_EXP_ZERO &&
+                    fmt != BF_FTOA_FORMAT_FIXED &&
+                    !(flags & BF_FTOA_FORCE_EXP)) {
+                    /* just output zero */
+                    dbuf_putstr(s, "0");
+                } else {
+                    if (flags & BF_FTOA_ADD_PREFIX) {
+                        if (radix == 16)
+                            dbuf_putstr(s, "0x");
+                        else if (radix == 8)
+                            dbuf_putstr(s, "0o");
+                        else if (radix == 2)
+                            dbuf_putstr(s, "0b");
+                    }
+                    if (a1->expn == BF_EXP_ZERO)
+                        n = 1;
+                    if ((flags & BF_FTOA_FORCE_EXP) ||
+                        n <= -6 || n > n_max) {
+                        const char *fmt;
+                        /* exponential notation */
+                        output_digits(s, a1, radix, n_digits, 1, is_dec);
+                        if (radix_bits != 0 && radix <= 16) {
+                            /* 'p' exponent (in bits) for binary radixes */
+                            if (flags & BF_FTOA_JS_QUIRKS)
+                                fmt = "p%+" PRId_LIMB;
+                            else
+                                fmt = "p%" PRId_LIMB;
+                            dbuf_printf(s, fmt, (n - 1) * radix_bits);
+                        } else {
+                            /* 'e' exponent for radix <= 10, '@' otherwise
+                               to avoid ambiguity with digits */
+                            if (flags & BF_FTOA_JS_QUIRKS)
+                                fmt = "%c%+" PRId_LIMB;
+                            else
+                                fmt = "%c%" PRId_LIMB;
+                            dbuf_printf(s, fmt,
+                                        radix <= 10 ? 'e' : '@', n - 1);
+                        }
+                    } else if (n <= 0) {
+                        /* 0.x */
+                        dbuf_putstr(s, "0.");
+                        for(i = 0; i < -n; i++) {
+                            dbuf_putc(s, '0');
+                        }
+                        output_digits(s, a1, radix, n_digits, n_digits, is_dec);
+                    } else {
+                        if (n_digits <= n) {
+                            /* no dot */
+                            output_digits(s, a1, radix, n_digits, n_digits, is_dec);
+                            for(i = 0; i < (n - n_digits); i++)
+                                dbuf_putc(s, '0');
+                        } else {
+                            output_digits(s, a1, radix, n_digits, n, is_dec);
+                        }
+                    }
+                }
+            }
+            bf_delete(a1);
+        }
+    }
+    dbuf_putc(s, '\0');
+    if (dbuf_error(s))
+        goto fail;
+    if (plen)
+        *plen = s->size - 1;
+    return (char *)s->buf;
+ fail:
+    bf_free(ctx, s->buf);
+    if (plen)
+        *plen = 0;
+    return NULL;
+}
+
+/* Public binary-float-to-string conversion: thin wrapper over
+   bf_ftoa_internal() with is_dec = FALSE. See bf_ftoa_internal() for
+   the returned string's ownership and error semantics. */
+char *bf_ftoa(size_t *plen, const bf_t *a, int radix, limb_t prec,
+              bf_flags_t flags)
+{
+    return bf_ftoa_internal(plen, a, radix, prec, flags, FALSE);
+}
+
+/***************************************************************/
+/* transcendental functions */
+
+/* Binary splitting evaluation of the log(2) series over the term
+   range [n1, n2). On return T/(P? see caller) accumulates the
+   partial numerator, P the partial product and Q the partial
+   denominator; P is only needed by the recursion itself, so the
+   top-level call passes need_P = FALSE.
+   Note: the algorithm is from MPFR */
+static void bf_const_log2_rec(bf_t *T, bf_t *P, bf_t *Q, limb_t n1,
+                              limb_t n2, BOOL need_P)
+{
+    bf_context_t *s = T->ctx;
+    if ((n2 - n1) == 1) {
+        /* leaf: single series term */
+        if (n1 == 0) {
+            bf_set_ui(P, 3);
+        } else {
+            bf_set_ui(P, n1);
+            P->sign = 1;
+        }
+        bf_set_ui(Q, 2 * n1 + 1);
+        Q->expn += 2;
+        bf_set(T, P);
+    } else {
+        /* split at the midpoint and combine:
+           T = T1*Q2 + T2*P1, P = P1*P2, Q = Q1*Q2 */
+        limb_t m;
+        bf_t T1_s, *T1 = &T1_s;
+        bf_t P1_s, *P1 = &P1_s;
+        bf_t Q1_s, *Q1 = &Q1_s;
+        
+        m = n1 + ((n2 - n1) >> 1);
+        bf_const_log2_rec(T, P, Q, n1, m, TRUE);
+        bf_init(s, T1);
+        bf_init(s, P1);
+        bf_init(s, Q1);
+        bf_const_log2_rec(T1, P1, Q1, m, n2, need_P);
+        bf_mul(T, T, Q1, BF_PREC_INF, BF_RNDZ);
+        bf_mul(T1, T1, P, BF_PREC_INF, BF_RNDZ);
+        bf_add(T, T, T1, BF_PREC_INF, BF_RNDZ);
+        if (need_P)
+            bf_mul(P, P, P1, BF_PREC_INF, BF_RNDZ);
+        bf_mul(Q, Q, Q1, BF_PREC_INF, BF_RNDZ);
+        bf_delete(T1);
+        bf_delete(P1);
+        bf_delete(Q1);
+    }
+}
+
+/* compute log(2) with faithful rounding at precision 'prec' */
+static void bf_const_log2_internal(bf_t *T, limb_t prec)
+{
+    limb_t w, N;
+    bf_t P_s, *P = &P_s;
+    bf_t Q_s, *Q = &Q_s;
+
+    /* working precision with guard bits; the series needs about
+       w/3 terms */
+    w = prec + 15;
+    N = w / 3 + 1;
+    bf_init(T->ctx, P);
+    bf_init(T->ctx, Q);
+    bf_const_log2_rec(T, P, Q, 0, N, FALSE);
+    bf_div(T, T, Q, prec, BF_RNDN);
+    bf_delete(P);
+    bf_delete(Q);
+}
+
+/* PI constant */
+
+/* Chudnovsky series constants: pi = C^(3/2) /
+   (12 * sum_k (-1)^k (6k)!(A + B*k) / ((3k)!(k!)^3 C^(3k))).
+   Each term contributes about CHUD_BITS_PER_TERM bits. */
+#define CHUD_A 13591409
+#define CHUD_B 545140134
+#define CHUD_C 640320
+#define CHUD_BITS_PER_TERM 47
+
+/* Binary splitting of the Chudnovsky series over term indexes
+   [a, b). P, Q, G accumulate the partial numerator, denominator and
+   term product; G is only needed by the recursion itself
+   (need_g = 0 at the top level). */
+static void chud_bs(bf_t *P, bf_t *Q, bf_t *G, int64_t a, int64_t b, int need_g,
+                    limb_t prec)
+{
+    bf_context_t *s = P->ctx;
+    int64_t c;
+
+    if (a == (b - 1)) {
+        /* leaf: single term at index b */
+        bf_t T0, T1;
+        
+        bf_init(s, &T0);
+        bf_init(s, &T1);
+        bf_set_ui(G, 2 * b - 1);
+        bf_mul_ui(G, G, 6 * b - 1, prec, BF_RNDN);
+        bf_mul_ui(G, G, 6 * b - 5, prec, BF_RNDN);
+        bf_set_ui(&T0, CHUD_B);
+        bf_mul_ui(&T0, &T0, b, prec, BF_RNDN);
+        bf_set_ui(&T1, CHUD_A);
+        bf_add(&T0, &T0, &T1, prec, BF_RNDN);
+        bf_mul(P, G, &T0, prec, BF_RNDN);
+        /* alternating sign of the series */
+        P->sign = b & 1;
+
+        bf_set_ui(Q, b);
+        bf_mul_ui(Q, Q, b, prec, BF_RNDN);
+        bf_mul_ui(Q, Q, b, prec, BF_RNDN);
+        bf_mul_ui(Q, Q, (uint64_t)CHUD_C * CHUD_C * CHUD_C / 24, prec, BF_RNDN);
+        bf_delete(&T0);
+        bf_delete(&T1);
+    } else {
+        bf_t P2, Q2, G2;
+        
+        bf_init(s, &P2);
+        bf_init(s, &Q2);
+        bf_init(s, &G2);
+
+        c = (a + b) / 2;
+        chud_bs(P, Q, G, a, c, 1, prec);
+        chud_bs(&P2, &Q2, &G2, c, b, need_g, prec);
+        
+        /* Q = Q1 * Q2 */
+        /* G = G1 * G2 */
+        /* P = P1 * Q2 + P2 * G1 */
+        bf_mul(&P2, &P2, G, prec, BF_RNDN);
+        if (!need_g)
+            bf_set_ui(G, 0);
+        bf_mul(P, P, &Q2, prec, BF_RNDN);
+        bf_add(P, P, &P2, prec, BF_RNDN);
+        bf_delete(&P2);
+
+        bf_mul(Q, Q, &Q2, prec, BF_RNDN);
+        bf_delete(&Q2);
+        if (need_g)
+            bf_mul(G, G, &G2, prec, BF_RNDN);
+        bf_delete(&G2);
+    }
+}
+
+/* compute Pi with faithful rounding at precision 'prec' using the
+   Chudnovsky formula */
+static void bf_const_pi_internal(bf_t *Q, limb_t prec)
+{
+    bf_context_t *s = Q->ctx;
+    int64_t n, prec1;
+    bf_t P, G;
+
+    /* number of series terms */
+    n = prec / CHUD_BITS_PER_TERM + 1;
+    /* XXX: precision analysis */
+    prec1 = prec + 32;
+
+    bf_init(s, &P);
+    bf_init(s, &G);
+
+    /* exact binary splitting sum, then the final floating point
+       combination: pi = Q*C^(3/2) / (12*(A*Q + P)) */
+    chud_bs(&P, Q, &G, 0, n, 0, BF_PREC_INF);
+    
+    bf_mul_ui(&G, Q, CHUD_A, prec1, BF_RNDN);
+    bf_add(&P, &G, &P, prec1, BF_RNDN);
+    bf_div(Q, Q, &P, prec1, BF_RNDF);
+ 
+    bf_set_ui(&P, CHUD_C);
+    bf_sqrt(&G, &P, prec1, BF_RNDF);
+    bf_mul_ui(&G, &G, (uint64_t)CHUD_C / 12, prec1, BF_RNDF);
+    bf_mul(Q, Q, &G, prec, BF_RNDN);
+    bf_delete(&P);
+    bf_delete(&G);
+}
+
+/* Return in 'T' the cached constant computed by 'func', correctly
+   rounded to 'prec' with Ziv's strategy. The cache 'c' is grown
+   (never shrunk) when a higher precision is requested. 'sign'
+   selects the sign of the result. Returns the bf_round() status. */
+static int bf_const_get(bf_t *T, limb_t prec, bf_flags_t flags,
+                        BFConstCache *c,
+                        void (*func)(bf_t *res, limb_t prec), int sign)
+{
+    limb_t ziv_extra_bits, prec1;
+
+    ziv_extra_bits = 32;
+    for(;;) {
+        prec1 = prec + ziv_extra_bits;
+        if (c->prec < prec1) {
+            /* cache miss: (re)compute at the higher precision */
+            if (c->val.len == 0)
+                bf_init(T->ctx, &c->val);
+            func(&c->val, prec1);
+            c->prec = prec1;
+        } else {
+            prec1 = c->prec;
+        }
+        bf_set(T, &c->val);
+        T->sign = sign;
+        if (!bf_can_round(T, prec, flags & BF_RND_MASK, prec1)) {
+            /* add more precision and retry */
+            ziv_extra_bits = ziv_extra_bits  + (ziv_extra_bits / 2);
+        } else {
+            break;
+        }
+    }
+    return bf_round(T, prec, flags);
+}
+
+/* Release a constant cache entry and reset it to the empty state. */
+static void bf_const_free(BFConstCache *c)
+{
+    bf_delete(&c->val);
+    memset(c, 0, sizeof(*c));
+}
+
+/* Correctly rounded log(2) at precision 'prec', served from the
+   per-context cache. */
+int bf_const_log2(bf_t *T, limb_t prec, bf_flags_t flags)
+{
+    bf_context_t *s = T->ctx;
+    return bf_const_get(T, prec, flags, &s->log2_cache, bf_const_log2_internal, 0);
+}
+
+/* return rounded pi * (1 - 2 * sign) */
+static int bf_const_pi_signed(bf_t *T, int sign, limb_t prec, bf_flags_t flags)
+{
+    bf_context_t *s = T->ctx;
+    return bf_const_get(T, prec, flags, &s->pi_cache, bf_const_pi_internal,
+                        sign);
+}
+
+/* Correctly rounded pi at precision 'prec' (cached). */
+int bf_const_pi(bf_t *T, limb_t prec, bf_flags_t flags)
+{
+    return bf_const_pi_signed(T, 0, prec, flags);
+}
+
+/* Free all per-context caches (FFT twiddle tables and the log(2)/pi
+   constant caches). Safe to call at any time; the caches are rebuilt
+   on demand. */
+void bf_clear_cache(bf_context_t *s)
+{
+#ifdef USE_FFT_MUL
+    fft_clear_cache(s);
+#endif
+    bf_const_free(&s->log2_cache);
+    bf_const_free(&s->pi_cache);
+}
+
+/* ZivFunc should compute the result 'r' with faithful rounding at
+   precision 'prec'. For efficiency purposes, the final bf_round()
+   does not need to be done in the function. */
+typedef int ZivFunc(bf_t *r, const bf_t *a, limb_t prec, void *opaque);
+
+/* Ziv's rounding loop: repeatedly evaluate 'f' with increasing guard
+   bits until the faithfully rounded result can be correctly rounded
+   to 'prec' in the requested mode. BF_RNDF requests only faithful
+   rounding, so a single evaluation suffices. Returns the combined
+   status flags. */
+static int bf_ziv_rounding(bf_t *r, const bf_t *a,
+                           limb_t prec, bf_flags_t flags,
+                           ZivFunc *f, void *opaque)
+{
+    int rnd_mode, ret;
+    slimb_t prec1, ziv_extra_bits;
+    
+    rnd_mode = flags & BF_RND_MASK;
+    if (rnd_mode == BF_RNDF) {
+        /* no need to iterate */
+        f(r, a, prec, opaque);
+        ret = 0;
+    } else {
+        ziv_extra_bits = 32;
+        for(;;) {
+            prec1 = prec + ziv_extra_bits;
+            ret = f(r, a, prec1, opaque);
+            if (ret & (BF_ST_OVERFLOW | BF_ST_UNDERFLOW | BF_ST_MEM_ERROR)) {
+                /* overflow or underflow should never happen because
+                   it indicates the rounding cannot be done correctly,
+                   but we do not catch all the cases */
+                return ret;
+            }
+            /* if the result is exact, we can stop */
+            if (!(ret & BF_ST_INEXACT)) {
+                ret = 0;
+                break;
+            }
+            if (bf_can_round(r, prec, rnd_mode, prec1)) {
+                ret = BF_ST_INEXACT;
+                break;
+            }
+            /* double the guard bits and retry */
+            ziv_extra_bits = ziv_extra_bits * 2;
+            //            printf("ziv_extra_bits=%" PRId64 "\n", (int64_t)ziv_extra_bits);
+        }
+    }
+    if (r->len == 0)
+        return ret;
+    else
+        return __bf_round(r, prec, flags, r->len, ret);
+}
+
+/* r += (1 - 2*e_sign) * 2^e, i.e. add or subtract the tiny value
+   2^e; used to force correct rounding and inexact flags for the
+   small-argument shortcuts of exp/log-style functions. */
+static int bf_add_epsilon(bf_t *r, const bf_t *a, slimb_t e, int e_sign,
+                          limb_t prec, int flags)
+{
+    bf_t T_s, *T = &T_s;
+    int ret;
+    /* small argument case: result = 1 + epsilon * sign(x) */
+    bf_init(a->ctx, T);
+    bf_set_ui(T, 1);
+    T->sign = e_sign;
+    T->expn += e;
+    ret = bf_add(r, r, T, prec, flags);
+    bf_delete(T);
+    return ret;
+}
+
+/* Compute the exponential using faithful rounding at precision 'prec'.
+   Strategy: reduce the argument modulo log(2), halve it K times,
+   evaluate a degree-l Taylor polynomial with Horner's scheme, then
+   square K times and rescale by 2^n.
+   Note: the algorithm is from MPFR */
+static int bf_exp_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
+{
+    bf_context_t *s = r->ctx;
+    bf_t T_s, *T = &T_s;
+    slimb_t n, K, l, i, prec1;
+    
+    assert(r != a);
+
+    /* argument reduction:
+       T = a - n*log(2) with 0 <= T < log(2) and n integer.
+    */
+    bf_init(s, T);
+    if (a->expn <= -1) {
+        /* 0 <= abs(a) <= 0.5 */
+        if (a->sign)
+            n = -1;
+        else
+            n = 0;
+    } else {
+        bf_const_log2(T, LIMB_BITS, BF_RNDZ);
+        bf_div(T, a, T, LIMB_BITS, BF_RNDD);
+        bf_get_limb(&n, T, 0);
+    }
+
+    /* K halvings balance against a Taylor order of l terms */
+    K = bf_isqrt((prec + 1) / 2);
+    l = (prec - 1) / K + 1;
+    /* XXX: precision analysis ? */
+    prec1 = prec + (K + 2 * l + 18) + K + 8;
+    if (a->expn > 0)
+        prec1 += a->expn;
+    //    printf("n=%ld K=%ld prec1=%ld\n", n, K, prec1);
+
+    bf_const_log2(T, prec1, BF_RNDF);
+    bf_mul_si(T, T, n, prec1, BF_RNDN);
+    bf_sub(T, a, T, prec1, BF_RNDN);
+
+    /* reduce the range of T */
+    bf_mul_2exp(T, -K, BF_PREC_INF, BF_RNDZ);
+    
+    /* Taylor expansion around zero :
+     1 + x + x^2/2 + ... + x^n/n! 
+     = (1 + x * (1 + x/2 * (1 + ... (x/n))))
+    */
+    {
+        bf_t U_s, *U = &U_s;
+        
+        bf_init(s, U);
+        bf_set_ui(r, 1);
+        for(i = l ; i >= 1; i--) {
+            bf_set_ui(U, i);
+            bf_div(U, T, U, prec1, BF_RNDN);
+            bf_mul(r, r, U, prec1, BF_RNDN);
+            bf_add_si(r, r, 1, prec1, BF_RNDN);
+        }
+        bf_delete(U);
+    }
+    bf_delete(T);
+    
+    /* undo the range reduction */
+    for(i = 0; i < K; i++) {
+        bf_mul(r, r, r, prec1, BF_RNDN | BF_FLAG_EXT_EXP);
+    }
+
+    /* undo the argument reduction */
+    bf_mul_2exp(r, n, BF_PREC_INF, BF_RNDZ | BF_FLAG_EXT_EXP);
+
+    return BF_ST_INEXACT;
+}
+
+/* crude overflow and underflow tests for exp(a). a_low <= a <= a_high.
+   Returns 0 if exp(a) must be computed normally, otherwise sets 'r'
+   to the overflow/underflow result and returns the status flags. */
+static int check_exp_underflow_overflow(bf_context_t *s, bf_t *r,
+                                        const bf_t *a_low, const bf_t *a_high,
+                                        limb_t prec, bf_flags_t flags)
+{
+    bf_t T_s, *T = &T_s;
+    bf_t log2_s, *log2 = &log2_s;
+    slimb_t e_min, e_max;
+    
+    /* |a| <= 1: neither overflow nor underflow is possible */
+    if (a_high->expn <= 0)
+        return 0;
+
+    e_max = (limb_t)1 << (bf_get_exp_bits(flags) - 1);
+    e_min = -e_max + 3;
+    if (flags & BF_FLAG_SUBNORMAL)
+        e_min -= (prec - 1);
+    
+    bf_init(s, T);
+    bf_init(s, log2);
+    bf_const_log2(log2, LIMB_BITS, BF_RNDU);
+    bf_mul_ui(T, log2, e_max, LIMB_BITS, BF_RNDU);
+    /* a_low > e_max * log(2) implies exp(a) > e_max */
+    if (bf_cmp_lt(T, a_low) > 0) {
+        /* overflow */
+        bf_delete(T);
+        bf_delete(log2);
+        return bf_set_overflow(r, 0, prec, flags);
+    }
+    /* a_high < (e_min - 2) * log(2) implies exp(a) < (e_min - 2) */
+    bf_const_log2(log2, LIMB_BITS, BF_RNDD);
+    bf_mul_si(T, log2, e_min - 2, LIMB_BITS, BF_RNDD);
+    if (bf_cmp_lt(a_high, T)) {
+        int rnd_mode = flags & BF_RND_MASK;
+        
+        /* underflow */
+        bf_delete(T);
+        bf_delete(log2);
+        if (rnd_mode == BF_RNDU) {
+            /* set the smallest value */
+            bf_set_ui(r, 1);
+            r->expn = e_min;
+        } else {
+            bf_set_zero(r, 0);
+        }
+        return BF_ST_UNDERFLOW | BF_ST_INEXACT;
+    }
+    bf_delete(log2);
+    bf_delete(T);
+    return 0;
+}
+
+/* r = exp(a), correctly rounded to 'prec' in the mode selected by
+   'flags'. Handles NaN/Inf/zero specials, detects overflow/underflow
+   before computing, and shortcuts tiny arguments where
+   exp(a) ~ 1 + a. */
+int bf_exp(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
+{
+    bf_context_t *s = r->ctx;
+    int ret;
+    assert(r != a);
+    if (a->len == 0) {
+        /* special values: exp(NaN)=NaN, exp(+Inf)=+Inf,
+           exp(-Inf)=0, exp(0)=1 */
+        if (a->expn == BF_EXP_NAN) {
+            bf_set_nan(r);
+        } else if (a->expn == BF_EXP_INF) {
+            if (a->sign)
+                bf_set_zero(r, 0);
+            else
+                bf_set_inf(r, 0);
+        } else {
+            bf_set_ui(r, 1);
+        }
+        return 0;
+    }
+
+    ret = check_exp_underflow_overflow(s, r, a, a, prec, flags);
+    if (ret)
+        return ret;
+    if (a->expn < 0 && (-a->expn) >= (prec + 2)) { 
+        /* small argument case: result = 1 + epsilon * sign(x) */
+        bf_set_ui(r, 1);
+        return bf_add_epsilon(r, r, -(prec + 2), a->sign, prec, flags);
+    }
+                         
+    return bf_ziv_rounding(r, a, prec, flags, bf_exp_internal, NULL);
+}
+
+/* Compute log(a) with faithful rounding at precision 'prec'.
+   Strategy: scale a into [2/3, 4/3] (reduction 1), apply K steps of
+   T <- T/(1+sqrt(1+T)) (reduction 2), sum the atanh-style Taylor
+   series, then undo both reductions. */
+static int bf_log_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
+{
+    bf_context_t *s = r->ctx;
+    bf_t T_s, *T = &T_s;
+    bf_t U_s, *U = &U_s;
+    bf_t V_s, *V = &V_s;
+    slimb_t n, prec1, l, i, K;
+    
+    assert(r != a);
+
+    bf_init(s, T);
+    /* argument reduction 1 */
+    /* T=a*2^n with 2/3 <= T <= 4/3 */
+    {
+        bf_t U_s, *U = &U_s;
+        bf_set(T, a);
+        n = T->expn;
+        T->expn = 0;
+        /* U= ~ 2/3 */
+        bf_init(s, U);
+        bf_set_ui(U, 0xaaaaaaaa); 
+        U->expn = 0;
+        if (bf_cmp_lt(T, U)) {
+            T->expn++;
+            n--;
+        }
+        bf_delete(U);
+    }
+    //    printf("n=%ld\n", n);
+    //    bf_print_str("T", T);
+
+    /* XXX: precision analysis */
+    /* number of iterations for argument reduction 2 */
+    K = bf_isqrt((prec + 1) / 2); 
+    /* order of Taylor expansion */
+    l = prec / (2 * K) + 1; 
+    /* precision of the intermediate computations */
+    prec1 = prec + K + 2 * l + 32;
+
+    bf_init(s, U);
+    bf_init(s, V);
+    
+    /* Note: cancellation occurs here, so we use more precision (XXX:
+       reduce the precision by computing the exact cancellation) */
+    bf_add_si(T, T, -1, BF_PREC_INF, BF_RNDN); 
+
+    /* argument reduction 2 */
+    for(i = 0; i < K; i++) {
+        /* T = T / (1 + sqrt(1 + T)) */
+        bf_add_si(U, T, 1, prec1, BF_RNDN);
+        bf_sqrt(V, U, prec1, BF_RNDF);
+        bf_add_si(U, V, 1, prec1, BF_RNDN);
+        bf_div(T, T, U, prec1, BF_RNDN);
+    }
+
+    {
+        bf_t Y_s, *Y = &Y_s;
+        bf_t Y2_s, *Y2 = &Y2_s;
+        bf_init(s, Y);
+        bf_init(s, Y2);
+
+        /* compute ln(1+x) = ln((1+y)/(1-y)) with y=x/(2+x)
+           = y + y^3/3 + ... + y^(2*l + 1) / (2*l+1) 
+           with Y=Y^2
+           = y*(1+Y/3+Y^2/5+...) = y*(1+Y*(1/3+Y*(1/5 + ...)))
+        */
+        bf_add_si(Y, T, 2, prec1, BF_RNDN);
+        bf_div(Y, T, Y, prec1, BF_RNDN);
+
+        bf_mul(Y2, Y, Y, prec1, BF_RNDN);
+        bf_set_ui(r, 0);
+        /* Horner evaluation of the odd series */
+        for(i = l; i >= 1; i--) {
+            bf_set_ui(U, 1);
+            bf_set_ui(V, 2 * i + 1);
+            bf_div(U, U, V, prec1, BF_RNDN);
+            bf_add(r, r, U, prec1, BF_RNDN);
+            bf_mul(r, r, Y2, prec1, BF_RNDN);
+        }
+        bf_add_si(r, r, 1, prec1, BF_RNDN);
+        bf_mul(r, r, Y, prec1, BF_RNDN);
+        bf_delete(Y);
+        bf_delete(Y2);
+    }
+    bf_delete(V);
+    bf_delete(U);
+
+    /* multiplication by 2 for the Taylor expansion and undo the
+       argument reduction 2 */
+    bf_mul_2exp(r, K + 1, BF_PREC_INF, BF_RNDZ);
+    
+    /* undo the argument reduction 1 */
+    bf_const_log2(T, prec1, BF_RNDF);
+    bf_mul_si(T, T, n, prec1, BF_RNDN);
+    bf_add(r, r, T, prec1, BF_RNDN);
+    
+    bf_delete(T);
+    return BF_ST_INEXACT;
+}
+
+/* r = log(a), correctly rounded. Specials: log(NaN)=NaN,
+   log(-Inf)=NaN (invalid), log(+Inf)=+Inf, log(0)=-Inf,
+   log(negative)=NaN (invalid), log(1)=0 exactly. */
+int bf_log(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
+{
+    bf_context_t *s = r->ctx;
+    bf_t T_s, *T = &T_s;
+    
+    assert(r != a);
+    if (a->len == 0) {
+        if (a->expn == BF_EXP_NAN) {
+            bf_set_nan(r);
+            return 0;
+        } else if (a->expn == BF_EXP_INF) {
+            if (a->sign) {
+                bf_set_nan(r);
+                return BF_ST_INVALID_OP;
+            } else {
+                bf_set_inf(r, 0);
+                return 0;
+            }
+        } else {
+            /* log(0) = -Inf */
+            bf_set_inf(r, 1);
+            return 0;
+        }
+    }
+    if (a->sign) {
+        bf_set_nan(r);
+        return BF_ST_INVALID_OP;
+    }
+    /* log(1) = 0 exactly; avoids an unnecessary Ziv iteration */
+    bf_init(s, T);
+    bf_set_ui(T, 1);
+    if (bf_cmp_eq(a, T)) {
+        bf_set_zero(r, 0);
+        bf_delete(T);
+        return 0;
+    }
+    bf_delete(T);
+
+    return bf_ziv_rounding(r, a, prec, flags, bf_log_internal, NULL);
+}
+
+/* General power: r = x^y computed as exp(y*log(x)) with faithful
+   rounding. Preconditions: x and y finite and x > 0 ('y' is passed
+   through 'opaque' to fit the ZivFunc signature). */
+static int bf_pow_generic(bf_t *r, const bf_t *x, limb_t prec, void *opaque)
+{
+    bf_context_t *s = r->ctx;
+    const bf_t *y = opaque;
+    bf_t T_s, *T = &T_s;
+    limb_t prec1;
+
+    bf_init(s, T);
+    /* XXX: proof for the added precision */
+    prec1 = prec + 32;
+    bf_log(T, x, prec1, BF_RNDF | BF_FLAG_EXT_EXP);
+    bf_mul(T, T, y, prec1, BF_RNDF | BF_FLAG_EXT_EXP);
+    if (bf_is_nan(T))
+        bf_set_nan(r);
+    else
+        bf_exp_internal(r, T, prec1, NULL); /* no overflow/underlow test needed */
+    bf_delete(T);
+    return BF_ST_INEXACT;
+}
+
+/* Integer power: r = x^y by binary exponentiation (bf_pow_ui),
+   inverting the result for negative y. Preconditions: x and y
+   finite, x > 0, y integer and y fits on one limb. */
+static int bf_pow_int(bf_t *r, const bf_t *x, limb_t prec, void *opaque)
+{
+    bf_context_t *s = r->ctx;
+    const bf_t *y = opaque;
+    bf_t T_s, *T = &T_s;
+    limb_t prec1;
+    int ret;
+    slimb_t y1;
+    
+    bf_get_limb(&y1, y, 0);
+    if (y1 < 0)
+        y1 = -y1;
+    /* XXX: proof for the added precision */
+    prec1 = prec + ceil_log2(y1) * 2 + 8;
+    /* NOTE(review): y1 was already made non-negative above, so the
+       'y1 < 0 ? -y1 : y1' below is redundant (harmless; the sign is
+       handled via y->sign) */
+    ret = bf_pow_ui(r, x, y1 < 0 ? -y1 : y1, prec1, BF_RNDN | BF_FLAG_EXT_EXP);
+    if (y->sign) {
+        /* negative exponent: r = 1 / x^|y| */
+        bf_init(s, T);
+        bf_set_ui(T, 1);
+        ret |= bf_div(r, T, r, prec1, BF_RNDN | BF_FLAG_EXT_EXP);
+        bf_delete(T);
+    }
+    return ret;
+}
+
+/* x must be a finite non zero float. Return TRUE if there is a
+   floating point number r such as x=r^(2^n) and return this floating
+   point number 'r'. Otherwise return FALSE and r is undefined. */
+static BOOL check_exact_power2n(bf_t *r, const bf_t *x, slimb_t n)
+{
+    bf_context_t *s = r->ctx;
+    bf_t T_s, *T = &T_s;
+    slimb_t e, i, er;
+    limb_t v;
+    
+    /* x = m*2^e with m odd integer */
+    e = bf_get_exp_min(x);
+    /* fast check on the exponent */
+    if (n > (LIMB_BITS - 1)) {
+        if (e != 0)
+            return FALSE;
+        er = 0;
+    } else {
+        if ((e & (((limb_t)1 << n) - 1)) != 0)
+            return FALSE;
+        er = e >> n;
+    }
+    /* every perfect odd square = 1 modulo 8 */
+    v = get_bits(x->tab, x->len, x->len * LIMB_BITS - x->expn + e);
+    if ((v & 7) != 1)
+        return FALSE;
+
+    bf_init(s, T);
+    bf_set(T, x);
+    T->expn -= e;
+    /* take n successive square roots; each must be exact */
+    for(i = 0; i < n; i++) {
+        if (i != 0)
+            bf_set(T, r);
+        if (bf_sqrtrem(r, NULL, T) != 0) {
+            /* fix: T (whose tab was allocated by bf_set) was leaked here */
+            bf_delete(T);
+            return FALSE;
+        }
+    }
+    /* fix: T was also leaked on the success path */
+    bf_delete(T);
+    r->expn += er;
+    return TRUE;
+}
+
+/* prec = BF_PREC_INF is accepted for x and y integers and y >= 0 */
+/* Power function: r = x^y with IEEE-754 (and optional JS, via
+   BF_POW_JS_QUIRKS) special-case semantics. Uses exact integer
+   exponentiation when the result can be exact, otherwise falls back
+   to exp(y*ln(x)) with Ziv rounding. Returns a BF_ST_* status mask. */
+int bf_pow(bf_t *r, const bf_t *x, const bf_t *y, limb_t prec, bf_flags_t flags)
+{
+    bf_context_t *s = r->ctx;
+    bf_t T_s, *T = &T_s;
+    bf_t ytmp_s;
+    BOOL y_is_int, y_is_odd;
+    int r_sign, ret, rnd_mode;
+    slimb_t y_emin;
+    
+    /* x or y is zero, infinity or NaN: resolve all special cases here */
+    if (x->len == 0 || y->len == 0) {
+        if (y->expn == BF_EXP_ZERO) {
+            /* pow(x, 0) = 1 */
+            bf_set_ui(r, 1);
+        } else if (x->expn == BF_EXP_NAN) {
+            bf_set_nan(r);
+        } else {
+            int cmp_x_abs_1;
+            bf_set_ui(r, 1);
+            cmp_x_abs_1 = bf_cmpu(x, r);
+            if (cmp_x_abs_1 == 0 && (flags & BF_POW_JS_QUIRKS) &&
+                (y->expn >= BF_EXP_INF)) {
+                /* JS: pow(+/-1, +/-inf or NaN) = NaN */
+                bf_set_nan(r);
+            } else if (cmp_x_abs_1 == 0 &&
+                       (!x->sign || y->expn != BF_EXP_NAN)) {
+                /* pow(1, y) = 1 even if y = NaN */
+                /* pow(-1, +/-inf) = 1 */
+            } else if (y->expn == BF_EXP_NAN) {
+                bf_set_nan(r);
+            } else if (y->expn == BF_EXP_INF) {
+                /* |x|<1 with y=+inf, or |x|>1 with y=-inf -> 0 */
+                if (y->sign == (cmp_x_abs_1 > 0)) {
+                    bf_set_zero(r, 0);
+                } else {
+                    bf_set_inf(r, 0);
+                }
+            } else {
+                y_emin = bf_get_exp_min(y);
+                y_is_odd = (y_emin == 0);
+                if (y->sign == (x->expn == BF_EXP_ZERO)) {
+                    bf_set_inf(r, y_is_odd & x->sign);
+                    if (y->sign) {
+                        /* pow(0, y) with y < 0 */
+                        return BF_ST_DIVIDE_ZERO;
+                    }
+                } else {
+                    bf_set_zero(r, y_is_odd & x->sign);
+                }
+            }
+        }
+        return 0;
+    }
+    bf_init(s, T);
+    bf_set(T, x);
+    y_emin = bf_get_exp_min(y);
+    y_is_int = (y_emin >= 0);
+    rnd_mode = flags & BF_RND_MASK;
+    if (x->sign) {
+        /* x < 0 is only defined for integer y */
+        if (!y_is_int) {
+            bf_set_nan(r);
+            bf_delete(T);
+            return BF_ST_INVALID_OP;
+        }
+        y_is_odd = (y_emin == 0);
+        r_sign = y_is_odd;
+        /* change the directed rounding mode if the sign of the result
+           is changed */
+        if (r_sign && (rnd_mode == BF_RNDD || rnd_mode == BF_RNDU))
+            flags ^= 1;
+        bf_neg(T);
+    } else {
+        r_sign = 0;
+    }
+
+    bf_set_ui(r, 1);
+    if (bf_cmp_eq(T, r)) {
+        /* abs(x) = 1: nothing more to do */
+        ret = 0;
+    } else {
+        /* check the overflow/underflow cases */
+        {
+            bf_t al_s, *al = &al_s;
+            bf_t ah_s, *ah = &ah_s;
+            limb_t precl = LIMB_BITS;
+            
+            bf_init(s, al);
+            bf_init(s, ah);
+            /* compute bounds of log(abs(x)) * y with a low precision */
+            /* XXX: compute bf_log() once */
+            /* XXX: add a fast test before this slow test */
+            bf_log(al, T, precl, BF_RNDD);
+            bf_log(ah, T, precl, BF_RNDU);
+            bf_mul(al, al, y, precl, BF_RNDD ^ y->sign);
+            bf_mul(ah, ah, y, precl, BF_RNDU ^ y->sign);
+            ret = check_exp_underflow_overflow(s, r, al, ah, prec, flags);
+            bf_delete(al);
+            bf_delete(ah);
+            if (ret)
+                goto done;
+        }
+        
+        if (y_is_int) {
+            slimb_t T_bits, e;
+        int_pow:
+            T_bits = T->expn - bf_get_exp_min(T);
+            if (T_bits == 1) {
+                /* pow(2^b, y) = 2^(b*y) */
+                bf_mul_si(T, y, T->expn - 1, LIMB_BITS, BF_RNDZ);
+                bf_get_limb(&e, T, 0);
+                bf_set_ui(r, 1);
+                ret = bf_mul_2exp(r, e, prec, flags);
+            } else if (prec == BF_PREC_INF) {
+                slimb_t y1;
+                /* specific case for infinite precision (integer case) */
+                bf_get_limb(&y1, y, 0);
+                assert(!y->sign);
+                /* x must be an integer, so abs(x) >= 2 */
+                if (y1 >= ((slimb_t)1 << BF_EXP_BITS_MAX)) {
+                    bf_delete(T);
+                    return bf_set_overflow(r, 0, BF_PREC_INF, flags);
+                }
+                ret = bf_pow_ui(r, T, y1, BF_PREC_INF, BF_RNDZ);
+            } else {
+                if (y->expn <= 31) {
+                    /* small enough power: use exponentiation in all cases */
+                } else if (y->sign) {
+                    /* cannot be exact */
+                    goto general_case;
+                } else {
+                    if (rnd_mode == BF_RNDF)
+                        goto general_case; /* no need to track exact results */
+                    /* see if the result has a chance to be exact:
+                       if x=a*2^b (a odd), x^y=a^y*2^(b*y)
+                       x^y needs a precision of at least floor_log2(a)*y bits
+                    */
+                    bf_mul_si(r, y, T_bits - 1, LIMB_BITS, BF_RNDZ);
+                    bf_get_limb(&e, r, 0);
+                    if (prec < e)
+                        goto general_case;
+                }
+                ret = bf_ziv_rounding(r, T, prec, flags, bf_pow_int, (void *)y);
+            }
+        } else {
+            if (rnd_mode != BF_RNDF) {
+                bf_t *y1;
+                /* if x is an exact 2^n-th power, reduce x^y to an
+                   integer power so the result can be tracked as exact */
+                if (y_emin < 0 && check_exact_power2n(r, T, -y_emin)) {
+                    /* the problem is reduced to a power to an integer */
+#if 0
+                    printf("\nn=%" PRId64 "\n", -(int64_t)y_emin);
+                    bf_print_str("T", T);
+                    bf_print_str("r", r);
+#endif
+                    bf_set(T, r);
+                    y1 = &ytmp_s;
+                    y1->tab = y->tab;
+                    y1->len = y->len;
+                    y1->sign = y->sign;
+                    y1->expn = y->expn - y_emin;
+                    y = y1;
+                    goto int_pow;
+                }
+            }
+        general_case:
+            ret = bf_ziv_rounding(r, T, prec, flags, bf_pow_generic, (void *)y);
+        }
+    }
+ done:
+    bf_delete(T);
+    r->sign = r_sign;
+    return ret;
+}
+
+/* compute sqrt(-2*x-x^2) to get |sin(x)| from cos(x) - 1. */
+/* With c1 = cos(a)-1 (c1 <= 0), -2*c1 - c1^2 = 1 - cos(a)^2 = sin(a)^2,
+   so r = |sin(a)| without the cancellation of computing 1-cos(a)^2. */
+static void bf_sqrt_sin(bf_t *r, const bf_t *x, limb_t prec1)
+{
+    bf_context_t *s = r->ctx;
+    bf_t T_s, *T = &T_s;
+    bf_init(s, T);
+    bf_set(T, x);
+    bf_mul(r, T, T, prec1, BF_RNDN);
+    bf_mul_2exp(T, 1, BF_PREC_INF, BF_RNDZ);
+    bf_add(T, T, r, prec1, BF_RNDN);
+    bf_neg(T);
+    bf_sqrt(r, T, prec1, BF_RNDF);
+    bf_delete(T);
+}
+
+/* Simultaneous sine and cosine of 'a'. Either 's' (sin) or 'c' (cos)
+   may be NULL to skip that output. Argument is reduced modulo pi/2
+   (quadrant kept in 'mod'), cosm1 = cos(x)-1 is evaluated by Taylor
+   series with K halvings of the argument, then sin/cos are rebuilt.
+   Always returns BF_ST_INEXACT. */
+static int bf_sincos(bf_t *s, bf_t *c, const bf_t *a, limb_t prec)
+{
+    bf_context_t *s1 = a->ctx;
+    bf_t T_s, *T = &T_s;
+    bf_t U_s, *U = &U_s;
+    bf_t r_s, *r = &r_s;
+    slimb_t K, prec1, i, l, mod, prec2;
+    int is_neg;
+    
+    assert(c != a && s != a);
+
+    bf_init(s1, T);
+    bf_init(s1, U);
+    bf_init(s1, r);
+    
+    /* XXX: precision analysis */
+    K = bf_isqrt(prec / 2);
+    l = prec / (2 * K) + 1;
+    prec1 = prec + 2 * K + l + 8;
+    
+    /* after the modulo reduction, -pi/4 <= T <= pi/4 */
+    if (a->expn <= -1) {
+        /* abs(a) <= 0.25: no modulo reduction needed */
+        bf_set(T, a);
+        mod = 0;
+    } else {
+        slimb_t cancel;
+        cancel = 0;
+        /* retry the reduction with more bits of pi until enough
+           significant bits of the remainder survive the cancellation */
+        for(;;) {
+            prec2 = prec1 + a->expn + cancel;
+            bf_const_pi(U, prec2, BF_RNDF);
+            bf_mul_2exp(U, -1, BF_PREC_INF, BF_RNDZ);
+            bf_remquo(&mod, T, a, U, prec2, BF_RNDN, BF_RNDN);
+            //            printf("T.expn=%ld prec2=%ld\n", T->expn, prec2);
+            if (mod == 0 || (T->expn != BF_EXP_ZERO &&
+                             (T->expn + prec2) >= (prec1 - 1)))
+                break;
+            /* increase the number of bits until the precision is good enough */
+            cancel = bf_max(-T->expn, (cancel + 1) * 3 / 2);
+        }
+        /* quadrant of the reduced argument */
+        mod &= 3;
+    }
+    
+    is_neg = T->sign;
+        
+    /* compute cosm1(x) = cos(x) - 1 */
+    bf_mul(T, T, T, prec1, BF_RNDN);
+    bf_mul_2exp(T, -2 * K, BF_PREC_INF, BF_RNDZ);
+    
+    /* Taylor expansion:
+       -x^2/2 + x^4/4! - x^6/6! + ...
+    */
+    bf_set_ui(r, 1);
+    for(i = l ; i >= 1; i--) {
+        bf_set_ui(U, 2 * i - 1);
+        bf_mul_ui(U, U, 2 * i, BF_PREC_INF, BF_RNDZ);
+        bf_div(U, T, U, prec1, BF_RNDN);
+        bf_mul(r, r, U, prec1, BF_RNDN);
+        bf_neg(r);
+        if (i != 1)
+            bf_add_si(r, r, 1, prec1, BF_RNDN);
+    }
+    bf_delete(U);
+
+    /* undo argument reduction:
+       cosm1(2*x)= 2*(2*cosm1(x)+cosm1(x)^2)
+    */
+    for(i = 0; i < K; i++) {
+        bf_mul(T, r, r, prec1, BF_RNDN);
+        bf_mul_2exp(r, 1, BF_PREC_INF, BF_RNDZ);
+        bf_add(r, r, T, prec1, BF_RNDN);
+        bf_mul_2exp(r, 1, BF_PREC_INF, BF_RNDZ);
+    }
+    bf_delete(T);
+
+    if (c) {
+        /* cos from cosm1 (even quadrant) or from |sin| (odd quadrant) */
+        if ((mod & 1) == 0) {
+            bf_add_si(c, r, 1, prec1, BF_RNDN);
+        } else {
+            bf_sqrt_sin(c, r, prec1);
+            c->sign = is_neg ^ 1;
+        }
+        c->sign ^= mod >> 1;
+    }
+    if (s) {
+        /* sin from |sin| (even quadrant) or from cosm1 (odd quadrant) */
+        if ((mod & 1) == 0) {
+            bf_sqrt_sin(s, r, prec1);
+            s->sign = is_neg;
+        } else {
+            bf_add_si(s, r, 1, prec1, BF_RNDN);
+        }
+        s->sign ^= mod >> 1;
+    }
+    bf_delete(r);
+    return BF_ST_INEXACT;
+}
+
+/* Ziv-rounding callback for bf_cos: cosine only. */
+static int bf_cos_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
+{
+    return bf_sincos(NULL, r, a, prec);
+}
+
+/* Cosine: r = cos(a). cos(NaN) = cos(+/-inf) = NaN (the latter with
+   BF_ST_INVALID_OP); cos(+/-0) = 1 exactly. */
+int bf_cos(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
+{
+    if (a->len == 0) {
+        if (a->expn == BF_EXP_NAN) {
+            bf_set_nan(r);
+            return 0;
+        } else if (a->expn == BF_EXP_INF) {
+            bf_set_nan(r);
+            return BF_ST_INVALID_OP;
+        } else {
+            bf_set_ui(r, 1);
+            return 0;
+        }
+    }
+
+    /* small argument case: result = 1+r(x) with r(x) = -x^2/2 +
+       O(X^4). We assume r(x) < 2^(2*EXP(x) - 1). */
+    if (a->expn < 0) {
+        slimb_t e;
+        e = 2 * a->expn - 1;
+        if (e < -(prec + 2)) {
+            /* the correction is below the rounding error: round 1+eps */
+            bf_set_ui(r, 1);
+            return bf_add_epsilon(r, r, e, 1, prec, flags);
+        }
+    }
+    
+    return bf_ziv_rounding(r, a, prec, flags, bf_cos_internal, NULL);
+}
+
+/* Ziv-rounding callback for bf_sin: sine only. */
+static int bf_sin_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
+{
+    return bf_sincos(r, NULL, a, prec);
+}
+
+/* Sine: r = sin(a). sin(NaN) = sin(+/-inf) = NaN (the latter with
+   BF_ST_INVALID_OP); sin(+/-0) = +/-0 exactly (sign preserved). */
+int bf_sin(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
+{
+    if (a->len == 0) {
+        if (a->expn == BF_EXP_NAN) {
+            bf_set_nan(r);
+            return 0;
+        } else if (a->expn == BF_EXP_INF) {
+            bf_set_nan(r);
+            return BF_ST_INVALID_OP;
+        } else {
+            bf_set_zero(r, a->sign);
+            return 0;
+        }
+    }
+
+    /* small argument case: result = x+r(x) with r(x) = -x^3/6 +
+       O(X^5). We assume r(x) < 2^(3*EXP(x) - 2). */
+    if (a->expn < 0) {
+        slimb_t e;
+        e = sat_add(2 * a->expn, a->expn - 2);
+        if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) {
+            /* the correction is below the rounding error: round x-eps */
+            bf_set(r, a);
+            return bf_add_epsilon(r, r, e, 1 - a->sign, prec, flags);
+        }
+    }
+
+    return bf_ziv_rounding(r, a, prec, flags, bf_sin_internal, NULL);
+}
+
+/* Ziv-rounding callback for bf_tan: tan(a) = sin(a)/cos(a) computed
+   with 8 guard bits. Always reports inexact. */
+static int bf_tan_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
+{
+    bf_context_t *s = r->ctx;
+    bf_t T_s, *T = &T_s;
+    limb_t prec1;
+    
+    /* XXX: precision analysis */
+    prec1 = prec + 8;
+    bf_init(s, T);
+    bf_sincos(r, T, a, prec1);
+    bf_div(r, r, T, prec1, BF_RNDF);
+    bf_delete(T);
+    return BF_ST_INEXACT;
+}
+
+/* Tangent: r = tan(a). tan(NaN) = tan(+/-inf) = NaN (the latter with
+   BF_ST_INVALID_OP); tan(+/-0) = +/-0 exactly. */
+int bf_tan(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
+{
+    assert(r != a);
+    if (a->len == 0) {
+        if (a->expn == BF_EXP_NAN) {
+            bf_set_nan(r);
+            return 0;
+        } else if (a->expn == BF_EXP_INF) {
+            bf_set_nan(r);
+            return BF_ST_INVALID_OP;
+        } else {
+            bf_set_zero(r, a->sign);
+            return 0;
+        }
+    }
+
+    /* small argument case: result = x+r(x) with r(x) = x^3/3 +
+       O(X^5). We assume r(x) < 2^(3*EXP(x) - 1). */
+    if (a->expn < 0) {
+        slimb_t e;
+        e = sat_add(2 * a->expn, a->expn - 1);
+        if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) {
+            /* the correction is below the rounding error: round x+eps */
+            bf_set(r, a);
+            return bf_add_epsilon(r, r, e, a->sign, prec, flags);
+        }
+    }
+            
+    return bf_ziv_rounding(r, a, prec, flags, bf_tan_internal, NULL);
+}
+
+/* if add_pi2 is true, add pi/2 to the result (used for acos(x) to
+   avoid cancellation) */
+/* Ziv-rounding callback for bf_atan/bf_asin/bf_acos. Inverts |a|>=1
+   to work on [0,1], applies K halving steps via
+   atan(x) = 2*atan(x/(1+sqrt(1+x^2))), sums the Taylor series, then
+   undoes both reductions. Always reports inexact. */
+static int bf_atan_internal(bf_t *r, const bf_t *a, limb_t prec,
+                            void *opaque)
+{
+    bf_context_t *s = r->ctx;
+    BOOL add_pi2 = (BOOL)(intptr_t)opaque;
+    bf_t T_s, *T = &T_s;
+    bf_t U_s, *U = &U_s;
+    bf_t V_s, *V = &V_s;
+    bf_t X2_s, *X2 = &X2_s;
+    int cmp_1;
+    slimb_t prec1, i, K, l;
+    
+    /* XXX: precision analysis */
+    K = bf_isqrt((prec + 1) / 2);
+    l = prec / (2 * K) + 1;
+    prec1 = prec + K + 2 * l + 32;
+    //    printf("prec=%d K=%d l=%d prec1=%d\n", (int)prec, (int)K, (int)l, (int)prec1);
+    
+    bf_init(s, T);
+    cmp_1 = (a->expn >= 1); /* a >= 1 */
+    if (cmp_1) {
+        /* atan(a) = sign(a)*pi/2 - atan(1/a): work on 1/a */
+        bf_set_ui(T, 1);
+        bf_div(T, T, a, prec1, BF_RNDN);
+    } else {
+        bf_set(T, a);
+    }
+
+    /* abs(T) <= 1 */
+
+    /* argument reduction */
+
+    bf_init(s, U);
+    bf_init(s, V);
+    bf_init(s, X2);
+    for(i = 0; i < K; i++) {
+        /* T = T / (1 + sqrt(1 + T^2)) */
+        bf_mul(U, T, T, prec1, BF_RNDN);
+        bf_add_si(U, U, 1, prec1, BF_RNDN);
+        bf_sqrt(V, U, prec1, BF_RNDN);
+        bf_add_si(V, V, 1, prec1, BF_RNDN);
+        bf_div(T, T, V, prec1, BF_RNDN);
+    }
+
+    /* Taylor series: 
+       x - x^3/3 + ... + (-1)^ l * y^(2*l + 1) / (2*l+1) 
+    */
+    bf_mul(X2, T, T, prec1, BF_RNDN);
+    bf_set_ui(r, 0);
+    for(i = l; i >= 1; i--) {
+        bf_set_si(U, 1);
+        bf_set_ui(V, 2 * i + 1);
+        bf_div(U, U, V, prec1, BF_RNDN);
+        bf_neg(r);
+        bf_add(r, r, U, prec1, BF_RNDN);
+        bf_mul(r, r, X2, prec1, BF_RNDN);
+    }
+    bf_neg(r);
+    bf_add_si(r, r, 1, prec1, BF_RNDN);
+    bf_mul(r, r, T, prec1, BF_RNDN);
+
+    /* undo the argument reduction */
+    bf_mul_2exp(r, K, BF_PREC_INF, BF_RNDZ);
+    
+    bf_delete(U);
+    bf_delete(V);
+    bf_delete(X2);
+
+    i = add_pi2;
+    if (cmp_1 > 0) {
+        /* undo the inversion : r = sign(a)*PI/2 - r */
+        bf_neg(r);
+        i += 1 - 2 * a->sign;
+    }
+    /* add i*(pi/2) with -1 <= i <= 2 */
+    if (i != 0) {
+        bf_const_pi(T, prec1, BF_RNDF);
+        if (i != 2)
+            bf_mul_2exp(T, -1, BF_PREC_INF, BF_RNDZ);
+        T->sign = (i < 0);
+        bf_add(r, T, r, prec1, BF_RNDN);
+    }
+    
+    bf_delete(T);
+    return BF_ST_INEXACT;
+}
+
+/* Arc tangent: r = atan(a). atan(NaN) = NaN; atan(+/-inf) = +/-pi/2;
+   atan(+/-0) = +/-0 exactly; atan(+/-1) = +/-pi/4 shortcut. */
+int bf_atan(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
+{
+    bf_context_t *s = r->ctx;
+    bf_t T_s, *T = &T_s;
+    int res;
+    
+    if (a->len == 0) {
+        if (a->expn == BF_EXP_NAN) {
+            bf_set_nan(r);
+            return 0;
+        } else if (a->expn == BF_EXP_INF)  {
+            /* -PI/2 or PI/2 */
+            bf_const_pi_signed(r, a->sign, prec, flags);
+            bf_mul_2exp(r, -1, BF_PREC_INF, BF_RNDZ);
+            return BF_ST_INEXACT;
+        } else {
+            bf_set_zero(r, a->sign);
+            return 0;
+        }
+    }
+    
+    bf_init(s, T);
+    bf_set_ui(T, 1);
+    res = bf_cmpu(a, T);
+    bf_delete(T);
+    if (res == 0) {
+        /* short cut: abs(a) == 1 -> +/-pi/4 */
+        bf_const_pi_signed(r, a->sign, prec, flags);
+        bf_mul_2exp(r, -2, BF_PREC_INF, BF_RNDZ);
+        return BF_ST_INEXACT;
+    }
+
+    /* small argument case: result = x+r(x) with r(x) = -x^3/3 +
+       O(X^5). We assume r(x) < 2^(3*EXP(x) - 1). */
+    if (a->expn < 0) {
+        slimb_t e;
+        e = sat_add(2 * a->expn, a->expn - 1);
+        if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) {
+            /* the correction is below the rounding error: round x-eps */
+            bf_set(r, a);
+            return bf_add_epsilon(r, r, e, 1 - a->sign, prec, flags);
+        }
+    }
+    
+    return bf_ziv_rounding(r, a, prec, flags, bf_atan_internal, (void *)FALSE);
+}
+
+/* Ziv-rounding callback for bf_atan2: atan2(y, x) = atan(y/x), with
+   a pi correction when x < 0 to select the proper quadrant. */
+static int bf_atan2_internal(bf_t *r, const bf_t *y, limb_t prec, void *opaque)
+{
+    bf_context_t *s = r->ctx;
+    const bf_t *x = opaque;
+    bf_t T_s, *T = &T_s;
+    limb_t prec1;
+    int ret;
+    
+    if (y->expn == BF_EXP_NAN || x->expn == BF_EXP_NAN) {
+        bf_set_nan(r);
+        return 0;
+    }
+
+    /* compute atan(y/x) assumming inf/inf = 1 and 0/0 = 0 */
+    bf_init(s, T);
+    prec1 = prec + 32;
+    if (y->expn == BF_EXP_INF && x->expn == BF_EXP_INF) {
+        bf_set_ui(T, 1);
+        T->sign = y->sign ^ x->sign;
+    } else if (y->expn == BF_EXP_ZERO && x->expn == BF_EXP_ZERO) {
+        bf_set_zero(T, y->sign ^ x->sign);
+    } else {
+        bf_div(T, y, x, prec1, BF_RNDF);
+    }
+    ret = bf_atan(r, T, prec1, BF_RNDF);
+
+    if (x->sign) {
+        /* if x < 0 (it includes -0), return sign(y)*pi + atan(y/x) */
+        bf_const_pi(T, prec1, BF_RNDF);
+        T->sign = y->sign;
+        bf_add(r, r, T, prec1, BF_RNDN);
+        ret |= BF_ST_INEXACT;
+    }
+
+    bf_delete(T);
+    return ret;
+}
+
+/* Two-argument arc tangent: r = atan2(y, x), correctly rounded. */
+int bf_atan2(bf_t *r, const bf_t *y, const bf_t *x,
+             limb_t prec, bf_flags_t flags)
+{
+    return bf_ziv_rounding(r, y, prec, flags, bf_atan2_internal, (void *)x);
+}
+
+/* Ziv-rounding callback shared by bf_asin (opaque = FALSE) and
+   bf_acos (opaque = TRUE), via the identities below. */
+static int bf_asin_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
+{
+    bf_context_t *s = r->ctx;
+    BOOL is_acos = (BOOL)(intptr_t)opaque;
+    bf_t T_s, *T = &T_s;
+    limb_t prec1, prec2;
+    
+    /* asin(x) = atan(x/sqrt(1-x^2)) 
+       acos(x) = pi/2 - asin(x) */
+    prec1 = prec + 8;
+    /* increase the precision in x^2 to compensate the cancellation in
+       (1-x^2) if x is close to 1 */
+    /* XXX: use less precision when possible */
+    if (a->expn >= 0)
+        prec2 = BF_PREC_INF;
+    else
+        prec2 = prec1;
+    bf_init(s, T);
+    bf_mul(T, a, a, prec2, BF_RNDN);
+    bf_neg(T);
+    bf_add_si(T, T, 1, prec2, BF_RNDN);
+
+    bf_sqrt(r, T, prec1, BF_RNDN);
+    bf_div(T, a, r, prec1, BF_RNDN);
+    if (is_acos)
+        bf_neg(T);
+    /* add_pi2 = is_acos turns -asin(x) into pi/2 - asin(x) */
+    bf_atan_internal(r, T, prec1, (void *)(intptr_t)is_acos);
+    bf_delete(T);
+    return BF_ST_INEXACT;
+}
+
+/* Arc sine: r = asin(a). Domain is [-1, 1]; |a| > 1, +/-inf and NaN
+   yield NaN (BF_ST_INVALID_OP except for NaN input); asin(+/-0) = +/-0. */
+int bf_asin(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
+{
+    bf_context_t *s = r->ctx;
+    bf_t T_s, *T = &T_s;
+    int res;
+
+    if (a->len == 0) {
+        if (a->expn == BF_EXP_NAN) {
+            bf_set_nan(r);
+            return 0;
+        } else if (a->expn == BF_EXP_INF) {
+            bf_set_nan(r);
+            return BF_ST_INVALID_OP;
+        } else {
+            bf_set_zero(r, a->sign);
+            return 0;
+        }
+    }
+    bf_init(s, T);
+    bf_set_ui(T, 1);
+    res = bf_cmpu(a, T);
+    bf_delete(T);
+    if (res > 0) {
+        /* |a| > 1: outside the domain */
+        bf_set_nan(r);
+        return BF_ST_INVALID_OP;
+    }
+    
+    /* small argument case: result = x+r(x) with r(x) = x^3/6 +
+       O(X^5). We assume r(x) < 2^(3*EXP(x) - 2). */
+    if (a->expn < 0) {
+        slimb_t e;
+        e = sat_add(2 * a->expn, a->expn - 2);
+        if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) {
+            /* the correction is below the rounding error: round x+eps */
+            bf_set(r, a);
+            return bf_add_epsilon(r, r, e, a->sign, prec, flags);
+        }
+    }
+
+    return bf_ziv_rounding(r, a, prec, flags, bf_asin_internal, (void *)FALSE);
+}
+
+/* Arc cosine: r = acos(a). Domain is [-1, 1]; |a| > 1, +/-inf and NaN
+   yield NaN; acos(+/-0) = pi/2; acos(1) = +0 exactly. */
+int bf_acos(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
+{
+    bf_context_t *s = r->ctx;
+    bf_t T_s, *T = &T_s;
+    int res;
+
+    if (a->len == 0) {
+        if (a->expn == BF_EXP_NAN) {
+            bf_set_nan(r);
+            return 0;
+        } else if (a->expn == BF_EXP_INF) {
+            bf_set_nan(r);
+            return BF_ST_INVALID_OP;
+        } else {
+            /* acos(+/-0) = pi/2 */
+            bf_const_pi(r, prec, flags);
+            bf_mul_2exp(r, -1, BF_PREC_INF, BF_RNDZ);
+            return BF_ST_INEXACT;
+        }
+    }
+    bf_init(s, T);
+    bf_set_ui(T, 1);
+    res = bf_cmpu(a, T);
+    bf_delete(T);
+    if (res > 0) {
+        /* |a| > 1: outside the domain */
+        bf_set_nan(r);
+        return BF_ST_INVALID_OP;
+    } else if (res == 0 && a->sign == 0) {
+        /* acos(1) = +0, exact */
+        bf_set_zero(r, 0);
+        return 0;
+    }
+    
+    return bf_ziv_rounding(r, a, prec, flags, bf_asin_internal, (void *)TRUE);
+}
+
+/***************************************************************/
+/* decimal floating point numbers */
+
+#ifdef USE_BF_DEC
+
+/* Double-limb add: (r1,r0) += (a1,a0). Carry out of the low limb is
+   detected by the unsigned wraparound test 'r0 < __t'. */
+#define adddq(r1, r0, a1, a0)                   \
+    do {                                        \
+        limb_t __t = r0;                        \
+        r0 += (a0);                             \
+        r1 += (a1) + (r0 < __t);                \
+    } while (0)
+
+/* Double-limb subtract: (r1,r0) -= (a1,a0), borrow via 'r0 > __t'. */
+#define subdq(r1, r0, a1, a0)                   \
+    do {                                        \
+        limb_t __t = r0;                        \
+        r0 -= (a0);                             \
+        r1 -= (a1) + (r0 > __t);                \
+    } while (0)
+
+#if LIMB_BITS == 64
+
+/* Note: we assume __int128 is available */
+/* Full 64x64 -> 128 bit multiply: (r1,r0) = a * b. */
+#define muldq(r1, r0, a, b)                     \
+    do {                                        \
+        unsigned __int128 __t;                          \
+        __t = (unsigned __int128)(a) * (unsigned __int128)(b);  \
+        r0 = __t;                               \
+        r1 = __t >> 64;                         \
+    } while (0)
+
+/* 128 / 64 bit divide: q = (a1,a0) / b, r = (a1,a0) % b. */
+#define divdq(q, r, a1, a0, b)                  \
+    do {                                        \
+        unsigned __int128 __t;                  \
+        limb_t __b = (b);                       \
+        __t = ((unsigned __int128)(a1) << 64) | (a0);   \
+        q = __t / __b;                                  \
+        r = __t % __b;                                  \
+    } while (0)
+
+#else
+
+/* 32-bit limbs: the same operations fit in a plain uint64_t. */
+#define muldq(r1, r0, a, b)                     \
+    do {                                        \
+        uint64_t __t;                          \
+        __t = (uint64_t)(a) * (uint64_t)(b);  \
+        r0 = __t;                               \
+        r1 = __t >> 32;                         \
+    } while (0)
+
+#define divdq(q, r, a1, a0, b)                  \
+    do {                                        \
+        uint64_t __t;                  \
+        limb_t __b = (b);                       \
+        __t = ((uint64_t)(a1) << 32) | (a0);   \
+        q = __t / __b;                                  \
+        r = __t % __b;                                  \
+    } while (0)
+
+#endif /* LIMB_BITS != 64 */
+
+/* Funnel shift right: low bits of the 128/64-bit pair (high,low)
+   shifted right by 'shift'. Requires 0 <= shift < LIMB_BITS (the
+   shift-by-zero case is special-cased to avoid UB in 'high << LIMB_BITS'). */
+static inline __maybe_unused limb_t shrd(limb_t low, limb_t high, long shift)
+{
+    if (shift != 0)
+        low = (low >> shift) | (high << (LIMB_BITS - shift));
+    return low;
+}
+
+/* Funnel shift left: high bits of (a1,a0) shifted left by 'shift'.
+   Same 0 <= shift < LIMB_BITS constraint as shrd(). */
+static inline __maybe_unused limb_t shld(limb_t a1, limb_t a0, long shift)
+{
+    if (shift != 0)
+        return (a1 << shift) | (a0 >> (LIMB_BITS - shift));
+    else
+        return a1;
+}
+
+#if LIMB_DIGITS == 19
+
+/* WARNING: hardcoded for b = 1e19. It is assumed that:
+   0 <= a1 < 2^63 */
+/* Division of (a1,a0) by the decimal base using a precomputed
+   reciprocal (the magic constant approximates 2^127/1e19), followed
+   by at most a few correction steps. q = quotient, r = remainder. */
+#define divdq_base(q, r, a1, a0)\
+do {\
+    uint64_t __a0, __a1, __t0, __t1, __b = BF_DEC_BASE; \
+    __a0 = a0;\
+    __a1 = a1;\
+    __t0 = __a1;\
+    __t0 = shld(__t0, __a0, 1);\
+    muldq(q, __t1, __t0, UINT64_C(17014118346046923173)); \
+    muldq(__t1, __t0, q, __b);\
+    subdq(__a1, __a0, __t1, __t0);\
+    subdq(__a1, __a0, 1, __b * 2);    \
+    __t0 = (slimb_t)__a1 >> 1; \
+    q += 2 + __t0;\
+    adddq(__a1, __a0, 0, __b & __t0);\
+    q += __a1;                  \
+    __a0 += __b & __a1;           \
+    r = __a0;\
+} while(0)
+
+#elif LIMB_DIGITS == 9
+
+/* WARNING: hardcoded for b = 1e9. It is assumed that:
+   0 <= a1 < 2^29 */
+/* Same reciprocal trick for 32-bit limbs (2305843009 ~= 2^61/1e9),
+   with a single conditional correction of the quotient. */
+#define divdq_base(q, r, a1, a0)\
+do {\
+    uint32_t __t0, __t1, __b = BF_DEC_BASE; \
+    __t0 = a1;\
+    __t1 = a0;\
+    __t0 = (__t0 << 3) | (__t1 >> (32 - 3));    \
+    muldq(q, __t1, __t0, 2305843009U);\
+    r = a0 - q * __b;\
+    __t1 = (r >= __b);\
+    q += __t1;\
+    if (__t1)\
+        r -= __b;\
+} while(0)
+
+#endif
+
+/* fast integer division by a fixed constant */
+
+/* Precomputed state for dividing by a fixed constant with a multiply
+   and two shifts instead of a hardware divide. */
+typedef struct FastDivData {
+    limb_t m1; /* multiplier */
+    int8_t shift1;
+    int8_t shift2;
+} FastDivData;
+
+/* From "Division by Invariant Integers using Multiplication" by
+   Torborn Granlund and Peter L. Montgomery */
+/* d must be != 0 */
+/* NOTE(review): uses clz64 and a 64-bit shift count, so this init
+   routine appears hardcoded for 64-bit limbs; 32-bit builds rely on
+   the precomputed mp_pow_div table below — confirm before reuse. */
+static inline __maybe_unused void fast_udiv_init(FastDivData *s, limb_t d)
+{
+    int l;
+    limb_t q, r, m1;
+    if (d == 1)
+        l = 0;
+    else
+        l = 64 - clz64(d - 1);
+    /* m1 = floor(2^(64+l) / d) + 1 */
+    divdq(q, r, ((limb_t)1 << l) - d, 0, d);
+    (void)r;
+    m1 = q + 1;
+    //    printf("d=%lu l=%d m1=0x%016lx\n", d, l, m1);
+    s->m1 = m1;
+    s->shift1 = l;
+    if (s->shift1 > 1)
+        s->shift1 = 1;
+    s->shift2 = l - 1;
+    if (s->shift2 < 0)
+        s->shift2 = 0;
+}
+
+/* Divide 'a' by the constant captured in 's' (Granlund-Montgomery
+   multiply-and-shift sequence). */
+static inline limb_t fast_udiv(limb_t a, const FastDivData *s)
+{
+    limb_t t0, t1;
+    muldq(t1, t0, s->m1, a);
+    t0 = (a - t1) >> s->shift1;
+    return (t1 + t0) >> s->shift2;
+}
+
+/* contains 10^i */
+/* Powers of ten up to the decimal limb base (10^LIMB_DIGITS). */
+const limb_t mp_pow_dec[LIMB_DIGITS + 1] = {
+    1U,
+    10U,
+    100U,
+    1000U,
+    10000U,
+    100000U,
+    1000000U,
+    10000000U,
+    100000000U,
+    1000000000U,
+#if LIMB_BITS == 64
+    10000000000U,
+    100000000000U,
+    1000000000000U,
+    10000000000000U,
+    100000000000000U,
+    1000000000000000U,
+    10000000000000000U,
+    100000000000000000U,
+    1000000000000000000U,
+    10000000000000000000U,
+#endif
+};
+
+/* precomputed from fast_udiv_init(10^i) */
+static const FastDivData mp_pow_div[LIMB_DIGITS + 1] = {
+#if LIMB_BITS == 32
+    { 0x00000001, 0, 0 },
+    { 0x9999999a, 1, 3 },
+    { 0x47ae147b, 1, 6 },
+    { 0x0624dd30, 1, 9 },
+    { 0xa36e2eb2, 1, 13 },
+    { 0x4f8b588f, 1, 16 },
+    { 0x0c6f7a0c, 1, 19 },
+    { 0xad7f29ac, 1, 23 },
+    { 0x5798ee24, 1, 26 },
+    { 0x12e0be83, 1, 29 },
+#else
+    { 0x0000000000000001, 0, 0 },
+    { 0x999999999999999a, 1, 3 },
+    { 0x47ae147ae147ae15, 1, 6 },
+    { 0x0624dd2f1a9fbe77, 1, 9 },
+    { 0xa36e2eb1c432ca58, 1, 13 },
+    { 0x4f8b588e368f0847, 1, 16 },
+    { 0x0c6f7a0b5ed8d36c, 1, 19 },
+    { 0xad7f29abcaf48579, 1, 23 },
+    { 0x5798ee2308c39dfa, 1, 26 },
+    { 0x12e0be826d694b2f, 1, 29 },
+    { 0xb7cdfd9d7bdbab7e, 1, 33 },
+    { 0x5fd7fe17964955fe, 1, 36 },
+    { 0x19799812dea11198, 1, 39 },
+    { 0xc25c268497681c27, 1, 43 },
+    { 0x6849b86a12b9b01f, 1, 46 },
+    { 0x203af9ee756159b3, 1, 49 },
+    { 0xcd2b297d889bc2b7, 1, 53 },
+    { 0x70ef54646d496893, 1, 56 },
+    { 0x2725dd1d243aba0f, 1, 59 },
+    { 0xd83c94fb6d2ac34d, 1, 63 },
+#endif
+};
+
+/* divide by 10^shift with 0 <= shift <= LIMB_DIGITS */
+static inline limb_t fast_shr_dec(limb_t a, int shift)
+{
+    return fast_udiv(a, &mp_pow_div[shift]);
+}
+
+/* division and remainder by 10^shift */
+#define fast_shr_rem_dec(q, r, a, shift) q = fast_shr_dec(a, shift), r = a - q * mp_pow_dec[shift]
+    
+/* Decimal limb vector add: res[] = op1[] + op2[] + carry, limbs in
+   base BF_DEC_BASE. Returns the carry out (0 or 1). */
+limb_t mp_add_dec(limb_t *res, const limb_t *op1, const limb_t *op2, 
+                  mp_size_t n, limb_t carry)
+{
+    limb_t base = BF_DEC_BASE;
+    mp_size_t i;
+    limb_t k, a, v;
+
+    k=carry;
+    for(i=0;i<n;i++) {
+        /* XXX: reuse the trick in add_mod */
+        /* compute (v + op2[i] + k) - base; unsigned wraparound in
+           'a <= v' detects whether the sum reached the base (carry) */
+        v = op1[i];
+        a = v + op2[i] + k - base;
+        k = a <= v;
+        if (!k) 
+            a += base;
+        res[i]=a;
+    }
+    return k;
+}
+
+/* Add the single decimal limb 'b' (0 <= b < base) into tab[] in
+   place, propagating the carry only as far as needed. Returns the
+   final carry out (non-zero only if it propagated past tab[n-1]). */
+limb_t mp_add_ui_dec(limb_t *tab, limb_t b, mp_size_t n)
+{
+    limb_t base = BF_DEC_BASE;
+    mp_size_t i;
+    limb_t k, a, v;
+
+    k=b;
+    for(i=0;i<n;i++) {
+        v = tab[i];
+        a = v + k - base;
+        k = a <= v;
+        if (!k) 
+            a += base;
+        tab[i] = a;
+        if (k == 0)
+            break;  /* carry exhausted: higher limbs unchanged */
+    }
+    return k;
+}
+
+/* Decimal limb vector subtract: res[] = op1[] - op2[] - carry, limbs
+   in base BF_DEC_BASE. Returns the borrow out (0 or 1). */
+limb_t mp_sub_dec(limb_t *res, const limb_t *op1, const limb_t *op2, 
+                  mp_size_t n, limb_t carry)
+{
+    limb_t base = BF_DEC_BASE;
+    mp_size_t i;
+    limb_t k, v, a;
+
+    k=carry;
+    for(i=0;i<n;i++) {
+        v = op1[i];
+        a = v - op2[i] - k;
+        k = a > v;      /* unsigned wraparound => borrow */
+        if (k)
+            a += base;
+        res[i] = a;
+    }
+    return k;
+}
+
+/* Subtract the single decimal limb 'b' from tab[] in place,
+   propagating the borrow only as far as needed. Returns the final
+   borrow out. */
+limb_t mp_sub_ui_dec(limb_t *tab, limb_t b, mp_size_t n)
+{
+    limb_t base = BF_DEC_BASE;
+    mp_size_t i;
+    limb_t k, v, a;
+    
+    k=b;
+    for(i=0;i<n;i++) {
+        v = tab[i];
+        a = v - k;
+        k = a > v;      /* unsigned wraparound => borrow */
+        if (k)
+            a += base;
+        tab[i]=a;
+        if (k == 0)
+            break;  /* borrow exhausted: higher limbs unchanged */
+    }
+    return k;
+}
+
+/* taba[] = taba[] * b + l. 0 <= b, l <= base - 1. Return the high carry */
+/* Each step forms the double-limb product plus incoming carry, then
+   splits it into a new carry (quotient by base) and a result limb. */
+limb_t mp_mul1_dec(limb_t *tabr, const limb_t *taba, mp_size_t n, 
+                   limb_t b, limb_t l)
+{
+    mp_size_t i;
+    limb_t t0, t1, r;
+
+    for(i = 0; i < n; i++) {
+        muldq(t1, t0, taba[i], b);
+        adddq(t1, t0, 0, l);
+        divdq_base(l, r, t1, t0);
+        tabr[i] = r;
+    }
+    return l;
+}
+
+/* tabr[] += taba[] * b. 0 <= b <= base - 1. Return the value to add
+   to the high word */
+limb_t mp_add_mul1_dec(limb_t *tabr, const limb_t *taba, mp_size_t n,
+                       limb_t b)
+{
+    mp_size_t i;
+    limb_t l, t0, t1, r;
+
+    l = 0;
+    for(i = 0; i < n; i++) {
+        /* (t1,t0) = taba[i]*b + carry + tabr[i]; split by base */
+        muldq(t1, t0, taba[i], b);
+        adddq(t1, t0, 0, l);
+        adddq(t1, t0, 0, tabr[i]);
+        divdq_base(l, r, t1, t0);
+        tabr[i] = r;
+    }
+    return l;
+}
+
+/* tabr[] -= taba[] * b. 0 <= b <= base - 1. Return the value to
+   substract to the high word. */
+limb_t mp_sub_mul1_dec(limb_t *tabr, const limb_t *taba, mp_size_t n,
+                       limb_t b)
+{
+    limb_t base = BF_DEC_BASE;
+    mp_size_t i;
+    limb_t l, t0, t1, r, a, v, c;
+
+    /* XXX: optimize */
+    l = 0;
+    for(i = 0; i < n; i++) {
+        /* (t1,t0) = taba[i]*b + carry; l = high part, r = low digit */
+        muldq(t1, t0, taba[i], b);
+        adddq(t1, t0, 0, l);
+        divdq_base(l, r, t1, t0);
+        v = tabr[i];
+        a = v - r;
+        c = a > v;      /* unsigned wraparound => borrow */
+        if (c)
+            a += base;
+        /* never bigger than base because r = 0 when l = base - 1 */
+        l += c;
+        tabr[i] = a;
+    }
+    return l;
+}
+
+/* size of the result : op1_size + op2_size. */
+/* Schoolbook multiplication of decimal limb vectors: one mp_mul1_dec
+   for the first row, then accumulate rows with mp_add_mul1_dec. */
+void mp_mul_basecase_dec(limb_t *result, 
+                         const limb_t *op1, mp_size_t op1_size, 
+                         const limb_t *op2, mp_size_t op2_size) 
+{
+    mp_size_t i;
+    limb_t r;
+    
+    result[op1_size] = mp_mul1_dec(result, op1, op1_size, op2[0], 0);
+
+    for(i=1;i<op2_size;i++) {
+        r = mp_add_mul1_dec(result + i, op1, op1_size, op2[i]);
+        result[i + op1_size] = r;
+    }
+}
+
+/* taba[] = (taba[] + r*base^na) / b. 0 <= b < base. 0 <= r <
+   b. Return the remainder. */
+limb_t mp_div1_dec(limb_t *tabr, const limb_t *taba, mp_size_t na, 
+                   limb_t b, limb_t r)
+{
+    limb_t base = BF_DEC_BASE;
+    mp_size_t i;
+    limb_t t0, t1, q;
+    int shift;
+
+#if (BF_DEC_BASE % 2) == 0
+    /* fast path: division by 2 done as a halving with carry of base/2 */
+    if (b == 2) {
+        limb_t base_div2;
+        /* Note: only works if base is even */
+        base_div2 = base >> 1;
+        if (r)
+            r = base_div2;
+        for(i = na - 1; i >= 0; i--) {
+            t0 = taba[i];
+            tabr[i] = (t0 >> 1) + r;
+            r = 0;
+            if (t0 & 1)
+                r = base_div2;
+        }
+        if (r)
+            r = 1;
+    } else 
+#endif
+    /* long operands: amortize the cost of computing the reciprocal of b
+       and divide by multiplication (udiv1norm) */
+    if (na >= UDIV1NORM_THRESHOLD) {
+        shift = clz(b);
+        if (shift == 0) {
+            /* normalized case: b >= 2^(LIMB_BITS-1) */
+            limb_t b_inv;
+            b_inv = udiv1norm_init(b);
+            for(i = na - 1; i >= 0; i--) {
+                muldq(t1, t0, r, base);
+                adddq(t1, t0, 0, taba[i]);
+                q = udiv1norm(&r, t1, t0, b, b_inv);
+                tabr[i] = q;
+            }
+        } else {
+            /* pre-shift b (and each dividend) left so b is normalized */
+            limb_t b_inv;
+            b <<= shift;
+            b_inv = udiv1norm_init(b);
+            for(i = na - 1; i >= 0; i--) {
+                muldq(t1, t0, r, base);
+                adddq(t1, t0, 0, taba[i]);
+                t1 = (t1 << shift) | (t0 >> (LIMB_BITS - shift));
+                t0 <<= shift;
+                q = udiv1norm(&r, t1, t0, b, b_inv);
+                r >>= shift;
+                tabr[i] = q;
+            }
+        }
+    } else {
+        /* generic double-limb by single-limb division */
+        for(i = na - 1; i >= 0; i--) {
+            muldq(t1, t0, r, base);
+            adddq(t1, t0, 0, taba[i]);
+            divdq(q, r, t1, t0, b);
+            tabr[i] = q;
+        }
+    }
+    return r;
+}
+
+static __maybe_unused void mp_print_str_dec(const char *str,
+                                       const limb_t *tab, slimb_t n)
+{
+    /* Debug helper: dump an n-limb decimal bignum, most significant
+       limb first, limbs separated by '_' and zero-padded to
+       LIMB_DIGITS digits. */
+    slimb_t idx;
+
+    printf("%s=", str);
+    for(idx = n - 1; idx >= 0; idx--) {
+        if (idx < n - 1)
+            printf("_");
+        printf("%0*" PRIu_LIMB, LIMB_DIGITS, tab[idx]);
+    }
+    printf("\n");
+}
+
+static __maybe_unused void mp_print_str_h_dec(const char *str,
+                                              const limb_t *tab, slimb_t n,
+                                              limb_t high)
+{
+    /* Debug helper: like mp_print_str_dec() but with an extra high
+       limb printed before the n limbs of tab[]. */
+    slimb_t idx;
+
+    printf("%s=%0*" PRIu_LIMB, str, LIMB_DIGITS, high);
+    for(idx = n - 1; idx >= 0; idx--)
+        printf("_%0*" PRIu_LIMB, LIMB_DIGITS, tab[idx]);
+    printf("\n");
+}
+
+//#define DEBUG_DIV_SLOW
+
+#define DIV_STATIC_ALLOC_LEN 16
+
+/* return q = a / b and r = a % b. 
+
+   taba[na] must be allocated if tabb1[nb - 1] < B / 2.  tabb1[nb - 1]
+   must be != zero. na must be >= nb. 's' can be NULL if tabb1[nb - 1]
+   >= B / 2.
+
+   The remainder is returned in taba and contains nb limbs. tabq
+   contains na - nb + 1 limbs. No overlap is permitted.
+
+   Running time of the standard method: (na - nb + 1) * nb
+   Return 0 if OK, -1 if memory alloc error
+*/
+/* XXX: optimize */
+static int mp_div_dec(bf_context_t *s, limb_t *tabq,
+                      limb_t *taba, mp_size_t na, 
+                      const limb_t *tabb1, mp_size_t nb)
+{
+    limb_t base = BF_DEC_BASE;
+    limb_t r, mult, t0, t1, a, c, q, v, *tabb;
+    mp_size_t i, j;
+    limb_t static_tabb[DIV_STATIC_ALLOC_LEN];
+    
+#ifdef DEBUG_DIV_SLOW
+    mp_print_str_dec("a", taba, na);
+    mp_print_str_dec("b", tabb1, nb);
+#endif
+
+    /* normalize tabb */
+    r = tabb1[nb - 1];
+    assert(r != 0);
+    i = na - nb;
+    if (r >= BF_DEC_BASE / 2) {
+        /* divisor already normalized (top limb >= base/2): no scaling
+           needed. Compute the top quotient limb (0 or 1) by direct
+           comparison of the top nb limbs of taba with tabb. */
+        mult = 1;
+        tabb = (limb_t *)tabb1;
+        q = 1;
+        for(j = nb - 1; j >= 0; j--) {
+            if (taba[i + j] != tabb[j]) {
+                if (taba[i + j] < tabb[j])
+                    q = 0;
+                break;
+            }
+        }
+        tabq[i] = q;
+        if (q) {
+            mp_sub_dec(taba + i, taba + i, tabb, nb, 0);
+        }
+        i--;
+    } else {
+        /* scale both operands by 'mult' so the divisor's top limb is
+           >= base/2 (Knuth Algorithm D normalization); taba gains an
+           extra limb at taba[na] */
+        mult = base / (r + 1);
+        if (likely(nb <= DIV_STATIC_ALLOC_LEN)) {
+            tabb = static_tabb;
+        } else {
+            tabb = bf_malloc(s, sizeof(limb_t) * nb);
+            if (!tabb)
+                return -1;
+        }
+        mp_mul1_dec(tabb, tabb1, nb, mult, 0);
+        taba[na] = mp_mul1_dec(taba, taba, na, mult, 0);
+    }
+
+#ifdef DEBUG_DIV_SLOW
+    printf("mult=" FMT_LIMB "\n", mult);
+    mp_print_str_dec("a_norm", taba, na + 1);
+    mp_print_str_dec("b_norm", tabb, nb);
+#endif
+
+    for(; i >= 0; i--) {
+        /* estimate the quotient limb q from the two top limbs of the
+           current remainder and the top limb of the divisor */
+        if (unlikely(taba[i + nb] >= tabb[nb - 1])) {
+            /* XXX: check if it is really possible */
+            q = base - 1;
+        } else {
+            muldq(t1, t0, taba[i + nb], base);
+            adddq(t1, t0, 0, taba[i + nb - 1]);
+            divdq(q, r, t1, t0, tabb[nb - 1]);
+        }
+        //        printf("i=%d q1=%ld\n", i, q);
+
+        r = mp_sub_mul1_dec(taba + i, tabb, nb, q);
+        //        mp_dump("r1", taba + i, nb, bd);
+        //        printf("r2=%ld\n", r);
+
+        /* subtract the borrow r from the top remainder limb */
+        v = taba[i + nb];
+        a = v - r;
+        c = a > v;
+        if (c)
+            a += base;
+        taba[i + nb] = a;
+
+        if (c != 0) {
+            /* negative result: q was overestimated, add the divisor
+               back until the remainder becomes non-negative */
+            for(;;) {
+                q--;
+                c = mp_add_dec(taba + i, taba + i, tabb, nb, 0);
+                /* propagate carry and test if positive result */
+                if (c != 0) {
+                    if (++taba[i + nb] == base) {
+                        break;
+                    }
+                }
+            }
+        }
+        tabq[i] = q;
+    }
+
+#ifdef DEBUG_DIV_SLOW
+    mp_print_str_dec("q", tabq, na - nb + 1);
+    mp_print_str_dec("r", taba, nb);
+#endif
+
+    /* remove the normalization */
+    if (mult != 1) {
+        mp_div1_dec(taba, taba, nb, mult, 0);
+        if (unlikely(tabb != static_tabb))
+            bf_free(s, tabb);
+    }
+    return 0;
+}
+
+/* divide by 10^shift */
+static limb_t mp_shr_dec(limb_t *tab_r, const limb_t *tab, mp_size_t n, 
+                         limb_t shift, limb_t high)
+{
+    mp_size_t i;
+    limb_t l, a, q, r;
+
+    assert(shift >= 1 && shift < LIMB_DIGITS);
+    /* l holds the digits shifted in from above (initially 'high') */
+    l = high;
+    for(i = n - 1; i >= 0; i--) {
+        a = tab[i];
+        /* q = a / 10^shift, r = a % 10^shift */
+        fast_shr_rem_dec(q, r, a, shift);
+        tab_r[i] = q + l * mp_pow_dec[LIMB_DIGITS - shift];
+        l = r;
+    }
+    /* return the 'shift' digits shifted out at the bottom */
+    return l;
+}
+
+/* multiply by 10^shift */
+static limb_t mp_shl_dec(limb_t *tab_r, const limb_t *tab, mp_size_t n, 
+                         limb_t shift, limb_t low)
+{
+    mp_size_t i;
+    limb_t l, a, q, r;
+
+    assert(shift >= 1 && shift < LIMB_DIGITS);
+    /* l holds the digits shifted in from below (initially 'low') */
+    l = low;
+    for(i = 0; i < n; i++) {
+        a = tab[i];
+        /* q = top 'shift' digits of a, r = remaining low digits */
+        fast_shr_rem_dec(q, r, a, LIMB_DIGITS - shift);
+        tab_r[i] = r * mp_pow_dec[shift] + l;
+        l = q;
+    }
+    /* return the 'shift' digits shifted out at the top */
+    return l;
+}
+
+static limb_t mp_sqrtrem2_dec(limb_t *tabs, limb_t *taba)
+{
+    /* Square root of a 2-limb decimal number: convert to binary, use
+       the binary mp_sqrtrem2(), then convert the remainder back to
+       decimal. Returns the high limb of the remainder. */
+    int k;
+    dlimb_t a, b, r;
+    limb_t taba1[2], s, r0, r1;
+
+    /* convert to binary and normalize */
+    a = (dlimb_t)taba[1] * BF_DEC_BASE + taba[0];
+    k = clz(a >> LIMB_BITS) & ~1; /* even shift so sqrt(b) = sqrt(a) << (k/2) */
+    b = a << k;
+    taba1[0] = b;
+    taba1[1] = b >> LIMB_BITS;
+    mp_sqrtrem2(&s, taba1);
+    s >>= (k >> 1);
+    /* convert the remainder back to decimal */
+    r = a - (dlimb_t)s * (dlimb_t)s;
+    divdq_base(r1, r0, r >> LIMB_BITS, r);
+    taba[0] = r0;
+    tabs[0] = s;
+    return r1;
+}
+
+//#define DEBUG_SQRTREM_DEC
+
+/* tmp_buf must contain (n / 2 + 1 limbs) */
+static limb_t mp_sqrtrem_rec_dec(limb_t *tabs, limb_t *taba, limb_t n,
+                                 limb_t *tmp_buf)
+{
+    /* Recursive square root on decimal limbs: compute the sqrt of the
+       high part, then refine with one division and a correction of the
+       remainder (same scheme as the binary mp_sqrtrem_rec). */
+    limb_t l, h, rh, ql, qh, c, i;
+    
+    if (n == 1)
+        return mp_sqrtrem2_dec(tabs, taba);
+#ifdef DEBUG_SQRTREM_DEC
+    mp_print_str_dec("a", taba, 2 * n);
+#endif
+    l = n / 2;
+    h = n - l;
+    /* sqrt of the high 2*h limbs; remainder left in taba + 2*l */
+    qh = mp_sqrtrem_rec_dec(tabs + l, taba + 2 * l, h, tmp_buf);
+#ifdef DEBUG_SQRTREM_DEC
+    mp_print_str_dec("s1", tabs + l, h);
+    mp_print_str_h_dec("r1", taba + 2 * l, h, qh);
+    mp_print_str_h_dec("r2", taba + l, n, qh);
+#endif
+    
+    /* the remainder is in taba + 2 * l. Its high bit is in qh */
+    if (qh) {
+        mp_sub_dec(taba + 2 * l, taba + 2 * l, tabs + l, h, 0);
+    }
+    /* instead of dividing by 2*s, divide by s (which is normalized)
+       and update q and r */
+    /* NOTE(review): the NULL context looks safe because the divisor is
+       normalized, so mp_div_dec() takes the allocation-free path */
+    mp_div_dec(NULL, tmp_buf, taba + l, n, tabs + l, h);
+    qh += tmp_buf[l];
+    for(i = 0; i < l; i++)
+        tabs[i] = tmp_buf[i];
+    /* halve the quotient (account for dividing by s instead of 2*s) */
+    ql = mp_div1_dec(tabs, tabs, l, 2, qh & 1);
+    qh = qh >> 1; /* 0 or 1 */
+    if (ql)
+        rh = mp_add_dec(taba + l, taba + l, tabs + l, h, 0);
+    else
+        rh = 0;
+#ifdef DEBUG_SQRTREM_DEC
+    mp_print_str_h_dec("q", tabs, l, qh);
+    mp_print_str_h_dec("u", taba + l, h, rh);
+#endif
+    
+    mp_add_ui_dec(tabs + l, qh, h);
+#ifdef DEBUG_SQRTREM_DEC
+    mp_print_str_dec("s2", tabs, n);
+#endif
+    
+    /* q = qh, tabs[l - 1 ... 0], r = taba[n - 1 ... l] */
+    /* subtract q^2. if qh = 1 then q = B^l, so we can take shortcuts */
+    if (qh) {
+        c = qh;
+    } else {
+        mp_mul_basecase_dec(taba + n, tabs, l, tabs, l);
+        c = mp_sub_dec(taba, taba, taba + n, 2 * l, 0);
+    }
+    rh -= mp_sub_ui_dec(taba + 2 * l, c, n - 2 * l);
+    if ((slimb_t)rh < 0) {
+        /* remainder went negative: decrement s and fix r += 2*s + 1 */
+        mp_sub_ui_dec(tabs, 1, n);
+        rh += mp_add_mul1_dec(taba, tabs, n, 2);
+        rh += mp_add_ui_dec(taba, 1, n);
+    }
+    return rh;
+}
+
+/* 'taba' has 2*n limbs with n >= 1 and taba[2*n-1] >= B/4. Return (s,
+   r) with s=floor(sqrt(a)) and r=a-s^2. 0 <= r <= 2 * s. tabs has n
+   limbs. r is returned in the lower n limbs of taba; its top limb r[n]
+   is the returned value of the function. */
+int mp_sqrtrem_dec(bf_context_t *s, limb_t *tabs, limb_t *taba, limb_t n)
+{
+    /* Entry point: allocates the scratch buffer (n/2 + 1 limbs) needed
+       by the recursion, on the stack when small enough.
+       Returns 0 on success, -1 on memory allocation failure. */
+    limb_t tmp_buf1[8];
+    limb_t *tmp_buf;
+    mp_size_t n2;
+    n2 = n / 2 + 1;
+    if (n2 <= countof(tmp_buf1)) {
+        tmp_buf = tmp_buf1;
+    } else {
+        tmp_buf = bf_malloc(s, sizeof(limb_t) * n2);
+        if (!tmp_buf)
+            return -1;
+    }
+    taba[n] = mp_sqrtrem_rec_dec(tabs, taba, n, tmp_buf);
+    if (tmp_buf != tmp_buf1)
+        bf_free(s, tmp_buf);
+    return 0;
+}
+
+/* return the number of leading zero digits, from 0 to LIMB_DIGITS */
+static int clz_dec(limb_t a)
+{
+    /* Number of leading zero decimal digits of 'a' when written with
+       exactly LIMB_DIGITS digits, i.e. LIMB_DIGITS minus the number of
+       significant digits of 'a'. Equivalent to the fully unrolled
+       clz()-based binary search: 'a' has exactly d digits iff
+       mp_pow_dec[d - 1] <= a < mp_pow_dec[d]. */
+    int d;
+
+    if (a == 0)
+        return LIMB_DIGITS;
+    for(d = 1; d < LIMB_DIGITS; d++) {
+        if (a < mp_pow_dec[d])
+            return LIMB_DIGITS - d;
+    }
+    /* a >= 10^(LIMB_DIGITS - 1): no leading zero digit */
+    return 0;
+}
+
+/* for debugging */
+void bfdec_print_str(const char *str, const bfdec_t *a)
+{
+    /* Debug dump of a bfdec_t as "<str>=<value>"; finite non-zero
+       values are printed as 0.<all limbs>e<expn>. */
+    slimb_t k;
+
+    printf("%s=", str);
+    if (a->expn == BF_EXP_NAN) {
+        printf("NaN");
+    } else {
+        if (a->sign)
+            putchar('-');
+        if (a->expn == BF_EXP_ZERO) {
+            putchar('0');
+        } else if (a->expn == BF_EXP_INF) {
+            printf("Inf");
+        } else {
+            printf("0.");
+            for(k = a->len - 1; k >= 0; k--)
+                printf("%0*" PRIu_LIMB, LIMB_DIGITS, a->tab[k]);
+            printf("e%" PRId_LIMB, a->expn);
+        }
+    }
+    printf("\n");
+}
+
+/* return != 0 if one digit between 0 and bit_pos inclusive is not zero. */
+static inline limb_t scan_digit_nz(const bfdec_t *r, slimb_t bit_pos)
+{
+    /* Return non-zero iff at least one decimal digit of r at a
+       position <= bit_pos is non-zero (position 0 is the least
+       significant digit of r->tab[0]). Used as the "sticky" test when
+       rounding. */
+    slimb_t idx;
+    limb_t quo, rem;
+    int d;
+
+    if (bit_pos < 0)
+        return 0;
+    idx = (limb_t)bit_pos / LIMB_DIGITS;
+    d = (limb_t)bit_pos % LIMB_DIGITS;
+    /* low (d + 1) digits of the limb containing bit_pos */
+    fast_shr_rem_dec(quo, rem, r->tab[idx], d + 1);
+    (void)quo;
+    if (rem != 0)
+        return 1;
+    /* all lower limbs must be zero as well */
+    for(idx = idx - 1; idx >= 0; idx--) {
+        if (r->tab[idx] != 0)
+            return 1;
+    }
+    return 0;
+}
+
+static limb_t get_digit(const limb_t *tab, limb_t len, slimb_t pos)
+{
+    /* Return the decimal digit of the number tab[0..len-1] at digit
+       position 'pos' (0 = least significant digit of tab[0]); any
+       position outside the stored limbs reads as 0. */
+    slimb_t limb_idx;
+    int d;
+
+    limb_idx = floor_div(pos, LIMB_DIGITS);
+    if (limb_idx < 0 || limb_idx >= len)
+        return 0;
+    d = pos - limb_idx * LIMB_DIGITS;
+    return fast_shr_dec(tab[limb_idx], d) % 10;
+}
+
+#if 0
+/* dead code kept for reference: multi-digit variant of get_digit(),
+   reading LIMB_DIGITS consecutive digits starting at position 'pos' */
+static limb_t get_digits(const limb_t *tab, limb_t len, slimb_t pos)
+{
+    limb_t a0, a1;
+    int shift;
+    slimb_t i;
+    
+    i = floor_div(pos, LIMB_DIGITS);
+    shift = pos - i * LIMB_DIGITS;
+    if (i >= 0 && i < len)
+        a0 = tab[i];
+    else
+        a0 = 0;
+    if (shift == 0) {
+        return a0;
+    } else {
+        i++;
+        if (i >= 0 && i < len)
+            a1 = tab[i];
+        else
+            a1 = 0;
+        return fast_shr_dec(a0, shift) +
+            fast_urem(a1, &mp_pow_div[LIMB_DIGITS - shift]) *
+            mp_pow_dec[shift];
+    }
+}
+#endif
+
+/* return the addend for rounding. Note that prec can be <= 0 for bf_rint() */
+static int bfdec_get_rnd_add(int *pret, const bfdec_t *r, limb_t l,
+                             slimb_t prec, int rnd_mode)
+{
+    int add_one, inexact;
+    limb_t digit1, digit0;
+    
+    //    bfdec_print_str("get_rnd_add", r);
+    if (rnd_mode == BF_RNDF) {
+        digit0 = 1; /* faithful rounding does not honor the INEXACT flag */
+    } else {
+        /* starting limb for bit 'prec + 1' */
+        /* digit0 = "sticky" flag: any non-zero digit below position prec */
+        digit0 = scan_digit_nz(r, l * LIMB_DIGITS - 1 - bf_max(0, prec + 1));
+    }
+
+    /* get the digit at 'prec' */
+    /* digit1 = the first discarded digit (the rounding digit) */
+    digit1 = get_digit(r->tab, l, l * LIMB_DIGITS - 1 - prec);
+    inexact = (digit1 | digit0) != 0;
+    
+    add_one = 0;
+    switch(rnd_mode) {
+    case BF_RNDZ:
+        break;
+    case BF_RNDN:
+        if (digit1 == 5) {
+            if (digit0) {
+                add_one = 1;
+            } else {
+                /* round to even */
+                add_one =
+                    get_digit(r->tab, l, l * LIMB_DIGITS - 1 - (prec - 1)) & 1;
+            }
+        } else if (digit1 > 5) {
+            add_one = 1;
+        }
+        break;
+    case BF_RNDD:
+    case BF_RNDU:
+        /* round away from zero only when the sign matches the direction */
+        if (r->sign == (rnd_mode == BF_RNDD))
+            add_one = inexact;
+        break;
+    case BF_RNDNA:
+    case BF_RNDF:
+        add_one = (digit1 >= 5);
+        break;
+    case BF_RNDA:
+        add_one = inexact;
+        break;
+    default:
+        abort();
+    }
+    
+    if (inexact)
+        *pret |= BF_ST_INEXACT;
+    return add_one;
+}
+
+/* round to prec1 bits assuming 'r' is non zero and finite. 'r' is
+   assumed to have length 'l' (1 <= l <= r->len). prec1 can be
+   BF_PREC_INF. BF_FLAG_SUBNORMAL is not supported. Cannot fail with
+   BF_ST_MEM_ERROR.
+ */
+static int __bfdec_round(bfdec_t *r, limb_t prec1, bf_flags_t flags, limb_t l)
+{
+    int shift, add_one, rnd_mode, ret;
+    slimb_t i, bit_pos, pos, e_min, e_max, e_range, prec;
+
+    /* XXX: align to IEEE 754 2008 for decimal numbers ? */
+    e_range = (limb_t)1 << (bf_get_exp_bits(flags) - 1);
+    e_min = -e_range + 3;
+    e_max = e_range;
+    
+    if (flags & BF_FLAG_RADPNT_PREC) {
+        /* 'prec' is the precision after the decimal point */
+        if (prec1 != BF_PREC_INF)
+            prec = r->expn + prec1;
+        else
+            prec = prec1;
+    } else if (unlikely(r->expn < e_min) && (flags & BF_FLAG_SUBNORMAL)) {
+        /* restrict the precision in case of potentially subnormal
+           result */
+        assert(prec1 != BF_PREC_INF);
+        prec = prec1 - (e_min - r->expn);
+    } else {
+        prec = prec1;
+    }
+    
+    /* round to prec bits */
+    rnd_mode = flags & BF_RND_MASK;
+    ret = 0;
+    add_one = bfdec_get_rnd_add(&ret, r, l, prec, rnd_mode);
+    
+    if (prec <= 0) {
+        /* all significant digits are discarded */
+        if (add_one) {
+            /* result rounds up to one unit in the last kept place */
+            bfdec_resize(r, 1); /* cannot fail because r is non zero */
+            r->tab[0] = BF_DEC_BASE / 10;
+            r->expn += 1 - prec;
+            ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT;
+            return ret;
+        } else {
+            goto underflow;
+        }
+    } else if (add_one) {
+        limb_t carry;
+        
+        /* add one starting at digit 'prec - 1' */
+        bit_pos = l * LIMB_DIGITS - 1 - (prec - 1);
+        pos = bit_pos / LIMB_DIGITS;
+        carry = mp_pow_dec[bit_pos % LIMB_DIGITS];
+        carry = mp_add_ui_dec(r->tab + pos, carry, l - pos);
+        if (carry) {
+            /* shift right by one digit */
+            mp_shr_dec(r->tab + pos, r->tab + pos, l - pos, 1, 1);
+            r->expn++;
+        }
+    }
+    
+    /* check underflow */
+    if (unlikely(r->expn < e_min)) {
+        if (flags & BF_FLAG_SUBNORMAL) {
+            /* if inexact, also set the underflow flag */
+            if (ret & BF_ST_INEXACT)
+                ret |= BF_ST_UNDERFLOW;
+        } else {
+        underflow:
+            bfdec_set_zero(r, r->sign);
+            ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT;
+            return ret;
+        }
+    }
+    
+    /* check overflow */
+    if (unlikely(r->expn > e_max)) {
+        bfdec_set_inf(r, r->sign);
+        ret |= BF_ST_OVERFLOW | BF_ST_INEXACT;
+        return ret;
+    }
+    
+    /* keep the bits starting at 'prec - 1' */
+    bit_pos = l * LIMB_DIGITS - 1 - (prec - 1);
+    i = floor_div(bit_pos, LIMB_DIGITS);
+    if (i >= 0) {
+        /* clear the discarded low digits of the partial limb */
+        shift = smod(bit_pos, LIMB_DIGITS);
+        if (shift != 0) {
+            r->tab[i] = fast_shr_dec(r->tab[i], shift) *
+                mp_pow_dec[shift];
+        }
+    } else {
+        i = 0;
+    }
+    /* remove trailing zeros */
+    while (r->tab[i] == 0)
+        i++;
+    if (i > 0) {
+        l -= i;
+        memmove(r->tab, r->tab + i, l * sizeof(limb_t));
+    }
+    bfdec_resize(r, l); /* cannot fail */
+    return ret;
+}
+
+/* Cannot fail with BF_ST_MEM_ERROR. */
+int bfdec_round(bfdec_t *r, limb_t prec, bf_flags_t flags)
+{
+    /* Rounding a number with no stored limbs (zero/Inf/NaN) is a
+       no-op; otherwise delegate to the full rounding routine. */
+    if (r->len != 0)
+        return __bfdec_round(r, prec, flags, r->len);
+    return 0;
+}
+
+/* 'r' must be a finite number. Cannot fail with BF_ST_MEM_ERROR.  */
+int bfdec_normalize_and_round(bfdec_t *r, limb_t prec1, bf_flags_t flags)
+{
+    limb_t l, v;
+    int shift, ret;
+    
+    //    bfdec_print_str("bf_renorm", r);
+    /* strip high zero limbs */
+    l = r->len;
+    while (l > 0 && r->tab[l - 1] == 0)
+        l--;
+    if (l == 0) {
+        /* zero */
+        r->expn = BF_EXP_ZERO;
+        bfdec_resize(r, 0); /* cannot fail */
+        ret = 0;
+    } else {
+        r->expn -= (r->len - l) * LIMB_DIGITS;
+        /* shift to have the MSB set to '1' */
+        /* i.e. shift left so the top limb has no leading zero digit */
+        v = r->tab[l - 1];
+        shift = clz_dec(v);
+        if (shift != 0) {
+            mp_shl_dec(r->tab, r->tab, l, shift, 0);
+            r->expn -= shift;
+        }
+        ret = __bfdec_round(r, prec1, flags, l);
+    }
+    //    bf_print_str("r_final", r);
+    return ret;
+}
+
+int bfdec_set_ui(bfdec_t *r, uint64_t v)
+{
+    /* Set r to the unsigned 64-bit integer v (exact), then normalize.
+       Returns the status flags (BF_ST_MEM_ERROR on allocation failure). */
+#if LIMB_BITS == 32
+    /* with 32-bit limbs, v can need up to 3 decimal limbs */
+    if (v >= BF_DEC_BASE * BF_DEC_BASE) {
+        if (bfdec_resize(r, 3))
+            goto fail;
+        r->tab[0] = v % BF_DEC_BASE;
+        v /= BF_DEC_BASE;
+        r->tab[1] = v % BF_DEC_BASE;
+        r->tab[2] = v / BF_DEC_BASE;
+        r->expn = 3 * LIMB_DIGITS;
+    } else
+#endif
+    if (v >= BF_DEC_BASE) {
+        if (bfdec_resize(r, 2))
+            goto fail;
+        r->tab[0] = v % BF_DEC_BASE;
+        r->tab[1] = v / BF_DEC_BASE;
+        r->expn = 2 * LIMB_DIGITS;
+    } else {
+        if (bfdec_resize(r, 1))
+            goto fail;
+        r->tab[0] = v;
+        r->expn = LIMB_DIGITS;
+    }
+    r->sign = 0;
+    return bfdec_normalize_and_round(r, BF_PREC_INF, 0);
+ fail:
+    bfdec_set_nan(r);
+    return BF_ST_MEM_ERROR;
+}
+
+int bfdec_set_si(bfdec_t *r, int64_t v)
+{
+    /* Set r to the signed 64-bit integer v (exact), then normalize.
+       Returns the bfdec_set_ui() status flags. */
+    int ret;
+    if (v < 0) {
+        /* negate in unsigned arithmetic: -(uint64_t)v is well defined
+           even for INT64_MIN, whereas -v would be signed overflow (UB) */
+        ret = bfdec_set_ui(r, -(uint64_t)v);
+        r->sign = 1;
+    } else {
+        ret = bfdec_set_ui(r, v);
+    }
+    return ret;
+}
+
+static int bfdec_add_internal(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, bf_flags_t flags, int b_neg)
+{
+    /* Compute r = a + (b_neg ? -b : b), rounded to 'prec' with 'flags'.
+       Handles NaN/Inf/zero specially, otherwise aligns b to a's
+       exponent and performs a limb add/sub with carry propagation. */
+    bf_context_t *s = r->ctx;
+    int is_sub, cmp_res, a_sign, b_sign, ret;
+
+    a_sign = a->sign;
+    b_sign = b->sign ^ b_neg;
+    is_sub = a_sign ^ b_sign;
+    cmp_res = bfdec_cmpu(a, b);
+    if (cmp_res < 0) {
+        /* swap so that abs(a) >= abs(b) below */
+        const bfdec_t *tmp;
+        tmp = a;
+        a = b;
+        b = tmp;
+        a_sign = b_sign; /* b_sign is never used later */
+    }
+    /* abs(a) >= abs(b) */
+    if (cmp_res == 0 && is_sub && a->expn < BF_EXP_INF) {
+        /* zero result */
+        /* sign of an exact zero difference depends on the rounding mode */
+        bfdec_set_zero(r, (flags & BF_RND_MASK) == BF_RNDD);
+        ret = 0;
+    } else if (a->len == 0 || b->len == 0) {
+        /* at least one special operand (NaN/Inf/zero) */
+        ret = 0;
+        if (a->expn >= BF_EXP_INF) {
+            if (a->expn == BF_EXP_NAN) {
+                /* at least one operand is NaN */
+                bfdec_set_nan(r);
+                ret = 0;
+            } else if (b->expn == BF_EXP_INF && is_sub) {
+                /* infinities with different signs */
+                bfdec_set_nan(r);
+                ret = BF_ST_INVALID_OP;
+            } else {
+                bfdec_set_inf(r, a_sign);
+            }
+        } else {
+            /* at least one zero and not subtract */
+            if (bfdec_set(r, a))
+                return BF_ST_MEM_ERROR;
+            r->sign = a_sign;
+            goto renorm;
+        }
+    } else {
+        slimb_t d, a_offset, b_offset, i, r_len;
+        limb_t carry;
+        limb_t *b1_tab;
+        int b_shift;
+        mp_size_t b1_len;
+        
+        /* d = exponent difference, >= 0 since abs(a) >= abs(b) */
+        d = a->expn - b->expn;
+
+        /* XXX: not efficient in time and memory if the precision is
+           not infinite */
+        r_len = bf_max(a->len, b->len + (d + LIMB_DIGITS - 1) / LIMB_DIGITS);
+        if (bfdec_resize(r, r_len))
+            goto fail;
+        r->sign = a_sign;
+        r->expn = a->expn;
+
+        /* copy a into r, zero-extended at the bottom */
+        a_offset = r_len - a->len;
+        for(i = 0; i < a_offset; i++)
+            r->tab[i] = 0;
+        for(i = 0; i < a->len; i++)
+            r->tab[a_offset + i] = a->tab[i];
+        
+        /* align b on a limb boundary relative to a's exponent */
+        b_shift = d % LIMB_DIGITS;
+        if (b_shift == 0) {
+            b1_len = b->len;
+            b1_tab = (limb_t *)b->tab;
+        } else {
+            b1_len = b->len + 1;
+            b1_tab = bf_malloc(s, sizeof(limb_t) * b1_len);
+            if (!b1_tab)
+                goto fail;
+            b1_tab[0] = mp_shr_dec(b1_tab + 1, b->tab, b->len, b_shift, 0) *
+                mp_pow_dec[LIMB_DIGITS - b_shift];
+        }
+        b_offset = r_len - (b->len + (d + LIMB_DIGITS - 1) / LIMB_DIGITS);
+        
+        if (is_sub) {
+            carry = mp_sub_dec(r->tab + b_offset, r->tab + b_offset,
+                               b1_tab, b1_len, 0);
+            if (carry != 0) {
+                carry = mp_sub_ui_dec(r->tab + b_offset + b1_len, carry,
+                                      r_len - (b_offset + b1_len));
+                assert(carry == 0);
+            }
+        } else {
+            carry = mp_add_dec(r->tab + b_offset, r->tab + b_offset,
+                               b1_tab, b1_len, 0);
+            if (carry != 0) {
+                carry = mp_add_ui_dec(r->tab + b_offset + b1_len, carry,
+                                      r_len - (b_offset + b1_len));
+            }
+            if (carry != 0) {
+                /* carry out of the top limb: grow r by one limb */
+                if (bfdec_resize(r, r_len + 1)) {
+                    if (b_shift != 0)
+                        bf_free(s, b1_tab);
+                    goto fail;
+                }
+                r->tab[r_len] = 1;
+                r->expn += LIMB_DIGITS;
+            }
+        }
+        if (b_shift != 0)
+            bf_free(s, b1_tab);
+    renorm:
+        ret = bfdec_normalize_and_round(r, prec, flags);
+    }
+    return ret;
+ fail:
+    bfdec_set_nan(r);
+    return BF_ST_MEM_ERROR;
+}
+
+static int __bfdec_add(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
+                     bf_flags_t flags)
+{
+    /* addition wrapper: b is used with its own sign */
+    return bfdec_add_internal(r, a, b, prec, flags, 0);
+}
+
+static int __bfdec_sub(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
+                     bf_flags_t flags)
+{
+    /* subtraction wrapper: b's sign is flipped before adding */
+    return bfdec_add_internal(r, a, b, prec, flags, 1);
+}
+
+int bfdec_add(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
+              bf_flags_t flags)
+{
+    /* bf_op2() handles the case where r aliases a or b */
+    return bf_op2((bf_t *)r, (bf_t *)a, (bf_t *)b, prec, flags,
+                  (bf_op2_func_t *)__bfdec_add);
+}
+
+int bfdec_sub(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
+              bf_flags_t flags)
+{
+    /* bf_op2() handles the case where r aliases a or b */
+    return bf_op2((bf_t *)r, (bf_t *)a, (bf_t *)b, prec, flags,
+                  (bf_op2_func_t *)__bfdec_sub);
+}
+
+int bfdec_mul(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
+              bf_flags_t flags)
+{
+    /* r = a * b rounded to 'prec' with 'flags'. Handles NaN/Inf/zero
+       specially; otherwise schoolbook multiplication of the limbs. */
+    int ret, r_sign;
+
+    if (a->len < b->len) {
+        const bfdec_t *tmp = a;
+        a = b;
+        b = tmp;
+    }
+    r_sign = a->sign ^ b->sign;
+    /* here b->len <= a->len */
+    if (b->len == 0) {
+        /* b (and possibly a) is NaN, Inf or zero */
+        if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
+            bfdec_set_nan(r);
+            ret = 0;
+        } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_INF) {
+            /* Inf * 0 is invalid */
+            if ((a->expn == BF_EXP_INF && b->expn == BF_EXP_ZERO) ||
+                (a->expn == BF_EXP_ZERO && b->expn == BF_EXP_INF)) {
+                bfdec_set_nan(r);
+                ret = BF_ST_INVALID_OP;
+            } else {
+                bfdec_set_inf(r, r_sign);
+                ret = 0;
+            }
+        } else {
+            bfdec_set_zero(r, r_sign);
+            ret = 0;
+        }
+    } else {
+        bfdec_t tmp, *r1 = NULL;
+        limb_t a_len, b_len;
+        limb_t *a_tab, *b_tab;
+            
+        a_len = a->len;
+        b_len = b->len;
+        a_tab = a->tab;
+        b_tab = b->tab;
+        
+        /* if r aliases an operand, compute into a temporary instead */
+        if (r == a || r == b) {
+            bfdec_init(r->ctx, &tmp);
+            r1 = r;
+            r = &tmp;
+        }
+        if (bfdec_resize(r, a_len + b_len)) {
+            bfdec_set_nan(r);
+            ret = BF_ST_MEM_ERROR;
+            goto done;
+        }
+        mp_mul_basecase_dec(r->tab, a_tab, a_len, b_tab, b_len);
+        r->sign = r_sign;
+        r->expn = a->expn + b->expn;
+        ret = bfdec_normalize_and_round(r, prec, flags);
+    done:
+        /* move the temporary result into the aliased destination */
+        if (r == &tmp)
+            bfdec_move(r1, &tmp);
+    }
+    return ret;
+}
+
+/* Decimal multiplication by a native integer: r = a * b1.
+   Converts b1 to a temporary bfdec_t, multiplies, and ORs the status of
+   the conversion and the multiplication into the returned value. */
+int bfdec_mul_si(bfdec_t *r, const bfdec_t *a, int64_t b1, limb_t prec,
+                 bf_flags_t flags)
+{
+    bfdec_t b;
+    int ret;
+    bfdec_init(r->ctx, &b);
+    ret = bfdec_set_si(&b, b1);
+    ret |= bfdec_mul(r, a, &b, prec, flags);
+    bfdec_delete(&b);
+    return ret;
+}
+
+/* Decimal addition of a native integer: r = a + b1.
+   Same pattern as bfdec_mul_si: convert b1 to a temporary bfdec_t,
+   add, and OR the two statuses together for the return value. */
+int bfdec_add_si(bfdec_t *r, const bfdec_t *a, int64_t b1, limb_t prec,
+                 bf_flags_t flags)
+{
+    bfdec_t b;
+    int ret;
+    
+    bfdec_init(r->ctx, &b);
+    ret = bfdec_set_si(&b, b1);
+    ret |= bfdec_add(r, a, &b, prec, flags);
+    bfdec_delete(&b);
+    return ret;
+}
+
+/* Decimal division kernel: r = a / b, rounded to 'prec' according to
+   'flags'.  Precision modes:
+     - prec == BF_PREC_INF: the quotient must be exact, otherwise NaN and
+       BF_ST_INVALID_OP are returned;
+     - flags & BF_FLAG_RADPNT_PREC: 'prec' counts digits after the
+       decimal point;
+     - otherwise 'prec' counts significant digits.
+   Returns 0 or a BF_ST_* status (BF_ST_INVALID_OP, BF_ST_DIVIDE_ZERO,
+   BF_ST_MEM_ERROR, or rounding status bits). */
+static int __bfdec_div(bfdec_t *r, const bfdec_t *a, const bfdec_t *b,
+                       limb_t prec, bf_flags_t flags)
+{
+    int ret, r_sign;
+    limb_t n, nb, precl;
+    
+    r_sign = a->sign ^ b->sign;
+    /* special values: NaN, Inf and zero operands */
+    if (a->expn >= BF_EXP_INF || b->expn >= BF_EXP_INF) {
+        if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
+            /* NaN propagates */
+            bfdec_set_nan(r);
+            return 0;
+        } else if (a->expn == BF_EXP_INF && b->expn == BF_EXP_INF) {
+            /* Inf / Inf is an invalid operation */
+            bfdec_set_nan(r);
+            return BF_ST_INVALID_OP;
+        } else if (a->expn == BF_EXP_INF) {
+            /* Inf / finite -> signed Inf */
+            bfdec_set_inf(r, r_sign);
+            return 0;
+        } else {
+            /* finite / Inf -> signed zero */
+            bfdec_set_zero(r, r_sign);
+            return 0;
+        }
+    } else if (a->expn == BF_EXP_ZERO) {
+        if (b->expn == BF_EXP_ZERO) {
+            /* 0 / 0 is an invalid operation */
+            bfdec_set_nan(r);
+            return BF_ST_INVALID_OP;
+        } else {
+            /* 0 / nonzero -> signed zero */
+            bfdec_set_zero(r, r_sign);
+            return 0;
+        }
+    } else if (b->expn == BF_EXP_ZERO) {
+        /* nonzero / 0 -> signed Inf with the divide-by-zero flag */
+        bfdec_set_inf(r, r_sign);
+        return BF_ST_DIVIDE_ZERO;
+    }
+
+    /* precl = number of quotient limbs needed for the requested precision */
+    nb = b->len;
+    if (prec == BF_PREC_INF) {
+        /* infinite precision: return BF_ST_INVALID_OP if not an exact
+           result */
+        /* XXX: check */
+        precl = nb + 1;
+    } else if (flags & BF_FLAG_RADPNT_PREC) {
+        /* number of digits after the decimal point */
+        /* XXX: check (2 extra digits for rounding + 2 digits) */
+        precl = (bf_max(a->expn - b->expn, 0) + 2 +
+                 prec + 2 + LIMB_DIGITS - 1) / LIMB_DIGITS;
+    } else {
+        /* number of limbs of the quotient (2 extra digits for rounding) */
+        precl = (prec + 2 + LIMB_DIGITS - 1) / LIMB_DIGITS;
+    }
+    n = bf_max(a->len, precl);
+    
+    {
+        limb_t *taba, na, i;
+        slimb_t d;
+        
+        /* build the dividend, scaled up by padding 'd' zero limbs at the
+           low end so the quotient has n + 1 limbs */
+        na = n + nb;
+        taba = bf_malloc(r->ctx, (na + 1) * sizeof(limb_t));
+        if (!taba)
+            goto fail;
+        d = na - a->len;
+        memset(taba, 0, d * sizeof(limb_t));
+        memcpy(taba + d, a->tab, a->len * sizeof(limb_t));
+        if (bfdec_resize(r, n + 1))
+            goto fail1;   /* free taba before reporting the memory error */
+        if (mp_div_dec(r->ctx, r->tab, taba, na, b->tab, nb)) {
+        fail1:
+            bf_free(r->ctx, taba);
+            goto fail;
+        }
+        /* see if non zero remainder */
+        /* mp_div_dec leaves the remainder in the low nb limbs of taba */
+        for(i = 0; i < nb; i++) {
+            if (taba[i] != 0)
+                break;
+        }
+        bf_free(r->ctx, taba);
+        if (i != nb) {
+            if (prec == BF_PREC_INF) {
+                /* inexact result is forbidden in exact mode */
+                bfdec_set_nan(r);
+                return BF_ST_INVALID_OP;
+            } else {
+                /* sticky bit: force the quotient odd so rounding sees
+                   that digits were discarded */
+                r->tab[0] |= 1;
+            }
+        }
+        /* exponents subtract under division; + LIMB_DIGITS accounts for
+           the extra quotient limb */
+        r->expn = a->expn - b->expn + LIMB_DIGITS;
+        r->sign = r_sign;
+        ret = bfdec_normalize_and_round(r, prec, flags);
+    }
+    return ret;
+ fail:
+    bfdec_set_nan(r);
+    return BF_ST_MEM_ERROR;
+}
+
+/* Public decimal division: r = a / b, rounded to 'prec' with 'flags'.
+   Same bf_op2 dispatch pattern as bfdec_add/bfdec_sub, with the
+   __bfdec_div kernel (relies on bfdec_t / bf_t layout compatibility). */
+int bfdec_div(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
+              bf_flags_t flags)
+{
+    return bf_op2((bf_t *)r, (bf_t *)a, (bf_t *)b, prec, flags,
+                  (bf_op2_func_t *)__bfdec_div);
+}
+
+/* a and b must be finite numbers with a >= 0 and b > 0. 'q' is the
+   integer defined as floor(a/b) and r = a - q * b. */
+/* NOTE: the context argument 's' is not used in this function.
+   If a < b the answer is immediate (q = 0, r = a); otherwise q is
+   obtained by truncating division with 0 digits after the decimal
+   point, and r is computed exactly (BF_PREC_INF). */
+static void bfdec_tdivremu(bf_context_t *s, bfdec_t *q, bfdec_t *r,
+                           const bfdec_t *a, const bfdec_t *b)
+{
+    if (bfdec_cmpu(a, b) < 0) {
+        bfdec_set_ui(q, 0);
+        bfdec_set(r, a);
+    } else {
+        /* q = trunc(a / b): round toward zero, keep 0 fractional digits */
+        bfdec_div(q, a, b, 0, BF_RNDZ | BF_FLAG_RADPNT_PREC);
+        /* r = a - q * b, both steps exact */
+        bfdec_mul(r, q, b, BF_PREC_INF, BF_RNDZ);
+        bfdec_sub(r, a, r, BF_PREC_INF, BF_RNDZ);
+    }
+}
+
+/* division and remainder. 
+   
+   rnd_mode is the rounding mode for the quotient. The additional
+   rounding mode BF_RND_EUCLIDIAN is supported.
+
+   'q' is an integer. 'r' is rounded with prec and flags (prec can be
+   BF_PREC_INF).
+*/
+int bfdec_divrem(bfdec_t *q, bfdec_t *r, const bfdec_t *a, const bfdec_t *b,
... 74432 lines suppressed ...