You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2023/05/06 06:28:55 UTC

[couchdb] branch try-xxhash-for-couch-file created (now 52b69f12e)

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a change to branch try-xxhash-for-couch-file
in repository https://gitbox.apache.org/repos/asf/couchdb.git


      at 52b69f12e Use xxHash for couch_file checksums

This branch includes the following new commits:

     new 52b69f12e Use xxHash for couch_file checksums

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[couchdb] 01/01: Use xxHash for couch_file checksums

Posted by va...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch try-xxhash-for-couch-file
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 52b69f12e597a69843457888f6660a3bef8934f7
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Sat May 6 02:20:43 2023 -0400

    Use xxHash for couch_file checksums
    
    Check xxhash first, since it's faster [1], and if that fails, check the slower
    md5 version.
    
    Bump a stats counter to indicate if there are still any md5 checksums found
    during normal cluster operation.
    
    [1]
    
    Comparison of hashing a 4KB block (microseconds per call, averaged over 1,000,000 calls; the first run measures the empty-loop overhead).
    ```
    (node1@127.0.0.1)20> f(T), {T, ok} = timer:tc(fun() -> lists:foreach(fun (_) -> do_nothing_overhead end, lists:seq(1, 1000000)) end), (T/1000000.0).
    0.167425
    (node1@127.0.0.1)21> f(T), {T, ok} = timer:tc(fun() -> lists:foreach(fun (_) -> exxhash:xxhash128(B) end, lists:seq(1, 1000000)) end), (T/1000000).
    0.770687
    (node1@127.0.0.1)22> f(T), {T, ok} = timer:tc(fun() -> lists:foreach(fun (_) -> crypto:hash(md5, B) end, lists:seq(1, 1000000)) end), (T/1000000).
    6.205445
    ```
---
 src/couch/priv/stats_descriptions.cfg |  4 ++
 src/couch/src/couch_file.erl          | 72 +++++++++++++++++++++--------------
 2 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/src/couch/priv/stats_descriptions.cfg b/src/couch/priv/stats_descriptions.cfg
index 7c8fd94cb..2dae98954 100644
--- a/src/couch/priv/stats_descriptions.cfg
+++ b/src/couch/priv/stats_descriptions.cfg
@@ -298,6 +298,10 @@
     {type, counter},
     {desc, <<"number of the attempts to read beyond set limit">>}
 ]}.
+{[couch_file, old_digests], [
+    {type, counter},
+    {desc, <<"number of old digests found in couch_file instances">>}
+]}.
 {[mango, unindexed_queries], [
     {type, counter},
     {desc, <<"number of mango queries that could not use an index">>}
diff --git a/src/couch/src/couch_file.erl b/src/couch/src/couch_file.erl
index 514d4e3d9..afa848efa 100644
--- a/src/couch/src/couch_file.erl
+++ b/src/couch/src/couch_file.erl
@@ -142,8 +142,8 @@ assemble_file_chunk(Bin) ->
     [<<0:1/integer, (iolist_size(Bin)):31/integer>>, Bin].
 
 assemble_file_chunk_and_checksum(Bin) ->
-    Md5 = couch_hash:md5_hash(Bin),
-    [<<1:1/integer, (iolist_size(Bin)):31/integer>>, Md5, Bin].
+    Digest = exxhash:xxhash128(Bin),
+    [<<1:1/integer, (iolist_size(Bin)):31/integer>>, Digest, Bin].
 
 %%----------------------------------------------------------------------
 %% Purpose: Reads a term from a file that was written with append_term
@@ -169,8 +169,8 @@ pread_binary(Fd, Pos) ->
 
 pread_iolist(Fd, Pos) ->
     case ioq:call(Fd, {pread_iolist, Pos}, erlang:get(io_priority)) of
-        {ok, IoList, Md5} ->
-            {ok, verify_md5(Fd, Pos, IoList, Md5)};
+        {ok, IoList, Digest} ->
+            {ok, verify_digest(Fd, Pos, IoList, Digest)};
         Error ->
             Error
     end.
@@ -191,13 +191,13 @@ pread_binaries(Fd, PosList) ->
 
 pread_iolists(Fd, PosList) ->
     case ioq:call(Fd, {pread_iolists, PosList}, erlang:get(io_priority)) of
-        {ok, DataMd5s} ->
+        {ok, DataAndDigests} ->
             Data = lists:zipwith(
-                fun(Pos, {IoList, Md5}) ->
-                    verify_md5(Fd, Pos, IoList, Md5)
+                fun(Pos, {IoList, Digest}) ->
+                    verify_digest(Fd, Pos, IoList, Digest)
                 end,
                 PosList,
-                DataMd5s
+                DataAndDigests
             ),
             {ok, Data};
         Error ->
@@ -400,9 +400,9 @@ read_header(Fd) ->
 
 write_header(Fd, Data) ->
     Bin = term_to_binary(Data),
-    Md5 = couch_hash:md5_hash(Bin),
+    Digest = exxhash:xxhash128(Bin),
     % now we assemble the final header binary and write to disk
-    FinalBin = <<Md5/binary, Bin/binary>>,
+    FinalBin = <<Digest/binary, Bin/binary>>,
     ioq:call(Fd, {write_header, FinalBin}, erlang:get(io_priority)).
 
 init_status_error(ReturnPid, Ref, Error) ->
@@ -504,11 +504,11 @@ handle_call({pread_iolist, Pos}, _From, File) ->
     update_read_timestamp(),
     {LenIolist, NextPos} = read_raw_iolist_int(File, Pos, 4),
     case iolist_to_binary(LenIolist) of
-        % an MD5-prefixed term
+        % a digest-prefixed term
         <<1:1/integer, Len:31/integer>> ->
-            {Md5AndIoList, _} = read_raw_iolist_int(File, NextPos, Len + 16),
-            {Md5, IoList} = extract_md5(Md5AndIoList),
-            {reply, {ok, IoList, Md5}, File};
+            {DigestAndIoList, _} = read_raw_iolist_int(File, NextPos, Len + 16),
+            {Digest, IoList} = extract_digest(DigestAndIoList),
+            {reply, {ok, IoList, Digest}, File};
         <<0:1/integer, Len:31/integer>> ->
             {Iolist, _} = read_raw_iolist_int(File, NextPos, Len),
             {reply, {ok, Iolist, <<>>}, File}
@@ -520,7 +520,7 @@ handle_call({pread_iolists, PosL}, _From, File) ->
     LocNums2 = lists:map(
         fun({LenIoList, NextPos}) ->
             case iolist_to_binary(LenIoList) of
-                % an MD5-prefixed term
+                % a digest-prefixed term
                 <<1:1/integer, Len:31/integer>> ->
                     {NextPos, Len + 16};
                 <<0:1/integer, Len:31/integer>> ->
@@ -534,8 +534,8 @@ handle_call({pread_iolists, PosL}, _From, File) ->
         fun({LenIoList, _}, {IoList, _}) ->
             case iolist_to_binary(LenIoList) of
                 <<1:1/integer, _:31/integer>> ->
-                    {Md5, IoList} = extract_md5(IoList),
-                    {IoList, Md5};
+                    {Digest, IoList} = extract_digest(IoList),
+                    {IoList, Digest};
                 <<0:1/integer, _:31/integer>> ->
                     {IoList, <<>>}
             end
@@ -674,9 +674,15 @@ load_header(Fd, Pos, HeaderLen, RestBlock) ->
                 {ok, Missing} = file:pread(Fd, ReadStart, ReadLen),
                 <<RestBlock/binary, Missing/binary>>
         end,
-    <<Md5Sig:16/binary, HeaderBin/binary>> =
+    <<Digest:16/binary, HeaderBin/binary>> =
         iolist_to_binary(remove_block_prefixes(?PREFIX_SIZE, RawBin)),
-    Md5Sig = couch_hash:md5_hash(HeaderBin),
+    case exxhash:xxhash128(HeaderBin) of
+        Digest ->
+            ok;
+        <<_/binary>> ->
+            couch_stats:increment_counter([couch_file, old_digests]),
+            Digest = couch_hash:md5_hash(HeaderBin)
+    end,
     {ok, HeaderBin}.
 
 %% Read multiple block locations using a single file:pread/2.
@@ -779,10 +785,10 @@ get_pread_locnum(File, Pos, Len) ->
             {Pos, TotalBytes}
     end.
 
--spec extract_md5(iolist()) -> {binary(), iolist()}.
-extract_md5(FullIoList) ->
-    {Md5List, IoList} = split_iolist(FullIoList, 16, []),
-    {iolist_to_binary(Md5List), IoList}.
+-spec extract_digest(iolist()) -> {binary(), iolist()}.
+extract_digest(FullIoList) ->
+    {DigestList, IoList} = split_iolist(FullIoList, 16, []),
+    {iolist_to_binary(DigestList), IoList}.
 
 calculate_total_read_len(0, FinalLen) ->
     calculate_total_read_len(1, FinalLen) + 1;
@@ -852,15 +858,23 @@ monitored_by_pids() ->
     {monitored_by, PidsAndRefs} = process_info(self(), monitored_by),
     lists:filter(fun is_pid/1, PidsAndRefs).
 
-verify_md5(_Fd, _Pos, IoList, <<>>) ->
+verify_digest(_Fd, _Pos, IoList, <<>>) ->
     IoList;
-verify_md5(Fd, Pos, IoList, Md5) ->
-    case couch_hash:md5_hash(IoList) of
-        Md5 -> IoList;
-        _ -> report_md5_error(Fd, Pos)
+verify_digest(Fd, Pos, IoList, Digest) ->
+    case exxhash:xxhash128(iolist_to_binary(IoList)) of
+        Digest ->
+            IoList;
+        <<_/binary>> ->
+            case couch_hash:md5_hash(IoList) of
+                Digest ->
+                    couch_stats:increment_counter([couch_file, old_digests]),
+                    IoList;
+                _ ->
+                    report_digest_error(Fd, Pos)
+            end
     end.
 
-report_md5_error(Fd, Pos) ->
+report_digest_error(Fd, Pos) ->
     couch_log:emergency("File corruption in ~p at position ~B", [Fd, Pos]),
     exit({file_corruption, <<"file corruption">>}).