You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2023/05/06 06:28:56 UTC

[couchdb] 01/01: Use xxHash for couch_file checksums

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch try-xxhash-for-couch-file
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 52b69f12e597a69843457888f6660a3bef8934f7
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Sat May 6 02:20:43 2023 -0400

    Use xxHash for couch_file checksums
    
    Check xxhash first, since it's faster [1], and if that fails, check the slower
    md5 version.
    
    Bump a stats counter to indicate if there are still any md5 checksums found
    during normal cluster operation.
    
    [1]
    
    Comparison of hashing a 4KB block (units are microseconds).
    ```
    (node1@127.0.0.1)20> f(T), {T, ok} = timer:tc(fun() -> lists:foreach(fun (_) -> do_nothing_overhead end, lists:seq(1, 1000000)) end), (T/1000000.0).
    0.167425
    (node1@127.0.0.1)21> f(T), {T, ok} = timer:tc(fun() -> lists:foreach(fun (_) -> exxhash:xxhash128(B) end, lists:seq(1, 1000000)) end), (T/1000000).
    0.770687
    (node1@127.0.0.1)22> f(T), {T, ok} = timer:tc(fun() -> lists:foreach(fun (_) -> crypto:hash(md5, B) end, lists:seq(1, 1000000)) end), (T/1000000).
    6.205445
    ```
---
 src/couch/priv/stats_descriptions.cfg |  4 ++
 src/couch/src/couch_file.erl          | 72 +++++++++++++++++++++--------------
 2 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/src/couch/priv/stats_descriptions.cfg b/src/couch/priv/stats_descriptions.cfg
index 7c8fd94cb..2dae98954 100644
--- a/src/couch/priv/stats_descriptions.cfg
+++ b/src/couch/priv/stats_descriptions.cfg
@@ -298,6 +298,10 @@
     {type, counter},
     {desc, <<"number of the attempts to read beyond set limit">>}
 ]}.
+{[couch_file, old_digests], [
+    {type, counter},
+    {desc, <<"number of old digests found in couch_file instances">>}
+]}.
 {[mango, unindexed_queries], [
     {type, counter},
     {desc, <<"number of mango queries that could not use an index">>}
diff --git a/src/couch/src/couch_file.erl b/src/couch/src/couch_file.erl
index 514d4e3d9..afa848efa 100644
--- a/src/couch/src/couch_file.erl
+++ b/src/couch/src/couch_file.erl
@@ -142,8 +142,8 @@ assemble_file_chunk(Bin) ->
     [<<0:1/integer, (iolist_size(Bin)):31/integer>>, Bin].
 
 assemble_file_chunk_and_checksum(Bin) ->
-    Md5 = couch_hash:md5_hash(Bin),
-    [<<1:1/integer, (iolist_size(Bin)):31/integer>>, Md5, Bin].
+    Digest = exxhash:xxhash128(Bin),
+    [<<1:1/integer, (iolist_size(Bin)):31/integer>>, Digest, Bin].
 
 %%----------------------------------------------------------------------
 %% Purpose: Reads a term from a file that was written with append_term
@@ -169,8 +169,8 @@ pread_binary(Fd, Pos) ->
 
 pread_iolist(Fd, Pos) ->
     case ioq:call(Fd, {pread_iolist, Pos}, erlang:get(io_priority)) of
-        {ok, IoList, Md5} ->
-            {ok, verify_md5(Fd, Pos, IoList, Md5)};
+        {ok, IoList, Digest} ->
+            {ok, verify_digest(Fd, Pos, IoList, Digest)};
         Error ->
             Error
     end.
@@ -191,13 +191,13 @@ pread_binaries(Fd, PosList) ->
 
 pread_iolists(Fd, PosList) ->
     case ioq:call(Fd, {pread_iolists, PosList}, erlang:get(io_priority)) of
-        {ok, DataMd5s} ->
+        {ok, DataAndDigests} ->
             Data = lists:zipwith(
-                fun(Pos, {IoList, Md5}) ->
-                    verify_md5(Fd, Pos, IoList, Md5)
+                fun(Pos, {IoList, Digest}) ->
+                    verify_digest(Fd, Pos, IoList, Digest)
                 end,
                 PosList,
-                DataMd5s
+                DataAndDigests
             ),
             {ok, Data};
         Error ->
@@ -400,9 +400,9 @@ read_header(Fd) ->
 
 write_header(Fd, Data) ->
     Bin = term_to_binary(Data),
-    Md5 = couch_hash:md5_hash(Bin),
+    Digest = exxhash:xxhash128(Bin),
     % now we assemble the final header binary and write to disk
-    FinalBin = <<Md5/binary, Bin/binary>>,
+    FinalBin = <<Digest/binary, Bin/binary>>,
     ioq:call(Fd, {write_header, FinalBin}, erlang:get(io_priority)).
 
 init_status_error(ReturnPid, Ref, Error) ->
@@ -504,11 +504,11 @@ handle_call({pread_iolist, Pos}, _From, File) ->
     update_read_timestamp(),
     {LenIolist, NextPos} = read_raw_iolist_int(File, Pos, 4),
     case iolist_to_binary(LenIolist) of
-        % an MD5-prefixed term
+        % an digest-prefixed term
         <<1:1/integer, Len:31/integer>> ->
-            {Md5AndIoList, _} = read_raw_iolist_int(File, NextPos, Len + 16),
-            {Md5, IoList} = extract_md5(Md5AndIoList),
-            {reply, {ok, IoList, Md5}, File};
+            {DigestAndIoList, _} = read_raw_iolist_int(File, NextPos, Len + 16),
+            {Digest, IoList} = extract_digest(DigestAndIoList),
+            {reply, {ok, IoList, Digest}, File};
         <<0:1/integer, Len:31/integer>> ->
             {Iolist, _} = read_raw_iolist_int(File, NextPos, Len),
             {reply, {ok, Iolist, <<>>}, File}
@@ -520,7 +520,7 @@ handle_call({pread_iolists, PosL}, _From, File) ->
     LocNums2 = lists:map(
         fun({LenIoList, NextPos}) ->
             case iolist_to_binary(LenIoList) of
-                % an MD5-prefixed term
+                % a digest-prefixed term
                 <<1:1/integer, Len:31/integer>> ->
                     {NextPos, Len + 16};
                 <<0:1/integer, Len:31/integer>> ->
@@ -534,8 +534,8 @@ handle_call({pread_iolists, PosL}, _From, File) ->
         fun({LenIoList, _}, {IoList, _}) ->
             case iolist_to_binary(LenIoList) of
                 <<1:1/integer, _:31/integer>> ->
-                    {Md5, IoList} = extract_md5(IoList),
-                    {IoList, Md5};
+                    {Digest, IoList} = extract_digest(IoList),
+                    {IoList, Digest};
                 <<0:1/integer, _:31/integer>> ->
                     {IoList, <<>>}
             end
@@ -674,9 +674,15 @@ load_header(Fd, Pos, HeaderLen, RestBlock) ->
                 {ok, Missing} = file:pread(Fd, ReadStart, ReadLen),
                 <<RestBlock/binary, Missing/binary>>
         end,
-    <<Md5Sig:16/binary, HeaderBin/binary>> =
+    <<Digest:16/binary, HeaderBin/binary>> =
         iolist_to_binary(remove_block_prefixes(?PREFIX_SIZE, RawBin)),
-    Md5Sig = couch_hash:md5_hash(HeaderBin),
+    case exxhash:xxhash128(HeaderBin) of
+        Digest ->
+            ok;
+        <<_/binary>> ->
+            couch_stats:increment_counter([couch_file, old_digests]),
+            Digest = couch_hash:md5_hash(HeaderBin)
+    end,
     {ok, HeaderBin}.
 
 %% Read multiple block locations using a single file:pread/2.
@@ -779,10 +785,10 @@ get_pread_locnum(File, Pos, Len) ->
             {Pos, TotalBytes}
     end.
 
--spec extract_md5(iolist()) -> {binary(), iolist()}.
-extract_md5(FullIoList) ->
-    {Md5List, IoList} = split_iolist(FullIoList, 16, []),
-    {iolist_to_binary(Md5List), IoList}.
+-spec extract_digest(iolist()) -> {binary(), iolist()}.
+extract_digest(FullIoList) ->
+    {DigestList, IoList} = split_iolist(FullIoList, 16, []),
+    {iolist_to_binary(DigestList), IoList}.
 
 calculate_total_read_len(0, FinalLen) ->
     calculate_total_read_len(1, FinalLen) + 1;
@@ -852,15 +858,23 @@ monitored_by_pids() ->
     {monitored_by, PidsAndRefs} = process_info(self(), monitored_by),
     lists:filter(fun is_pid/1, PidsAndRefs).
 
-verify_md5(_Fd, _Pos, IoList, <<>>) ->
+verify_digest(_Fd, _Pos, IoList, <<>>) ->
     IoList;
-verify_md5(Fd, Pos, IoList, Md5) ->
-    case couch_hash:md5_hash(IoList) of
-        Md5 -> IoList;
-        _ -> report_md5_error(Fd, Pos)
+verify_digest(Fd, Pos, IoList, Digest) ->
+    case exxhash:xxhash128(iolist_to_binary(IoList)) of
+        Digest ->
+            IoList;
+        <<_/binary>> ->
+            case couch_hash:md5_hash(IoList) of
+                Digest ->
+                    couch_stats:increment_counter([couch_file, old_digests]),
+                    IoList;
+                _ ->
+                    report_digest_error(Fd, Pos)
+            end
     end.
 
-report_md5_error(Fd, Pos) ->
+report_digest_error(Fd, Pos) ->
     couch_log:emergency("File corruption in ~p at position ~B", [Fd, Pos]),
     exit({file_corruption, <<"file corruption">>}).