You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@couchdb.apache.org by "nickva (via GitHub)" <gi...@apache.org> on 2023/05/06 06:29:44 UTC

[GitHub] [couchdb] nickva opened a new pull request, #4578: Use xxHash for couch_file checksums

nickva opened a new pull request, #4578:
URL: https://github.com/apache/couchdb/pull/4578

   Check xxhash first, since it's faster [1], and if that fails, check the slower md5 version.
   
   Bump a stats counter to indicate if there are still any md5 checksums found during normal cluster operation.
   
   [1] Comparison of hashing a 4KB block (units are microseconds).
   ```
   (node1@127.0.0.1)20> f(T), {T, ok} = timer:tc(fun() -> lists:foreach(fun (_) -> do_nothing_overhead end, lists:seq(1, 1000000)) end), (T/1000000.0).
   0.167425
   (node1@127.0.0.1)21> f(T), {T, ok} = timer:tc(fun() -> lists:foreach(fun (_) -> exxhash:xxhash128(B) end, lists:seq(1, 1000000)) end), (T/1000000).
   0.770687
   (node1@127.0.0.1)22> f(T), {T, ok} = timer:tc(fun() -> lists:foreach(fun (_) -> crypto:hash(md5, B) end, lists:seq(1, 1000000)) end), (T/1000000).
   6.205445
   ```
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] rnewson commented on a diff in pull request #4578: Use xxHash for couch_file checksums

Posted by "rnewson (via GitHub)" <gi...@apache.org>.
rnewson commented on code in PR #4578:
URL: https://github.com/apache/couchdb/pull/4578#discussion_r1187251702


##########
src/couch/src/couch_file.erl:
##########
@@ -906,6 +922,35 @@ reset_eof(#file{} = File) ->
     {ok, Eof} = file:position(File#file.fd, eof),
     File#file{eof = Eof}.
 
+write_checksum(Bin) when is_binary(Bin) ->

Review Comment:
   I'd like to see an assertion that the result of this function is 16 bytes / 128 bits as this is assumed everywhere else, and as a guard against doing something silly in the future without realising.
   
   ```
   -spec write_checksum(binary()) -> <<_:128>>.
   write_checksum(Bin) when is_binary(Bin) ->
   Result = case write_xxhash_checksums() of
       true -> exxhash:xxhash128(Bin);
       false -> couch_hash:md5_hash(Bin)
   end,
   <<_:128>> = Result.
   ```
   
   that sort of thing.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] rnewson commented on a diff in pull request #4578: Use xxHash for couch_file checksums

Posted by "rnewson (via GitHub)" <gi...@apache.org>.
rnewson commented on code in PR #4578:
URL: https://github.com/apache/couchdb/pull/4578#discussion_r1193044167


##########
src/couch/test/eunit/couch_file_tests.erl:
##########
@@ -551,3 +551,136 @@ fake_fsync_fd() ->
         {'$gen_call', From, sync} ->
             gen:reply(From, {error, eio})
     end.
+
+checksum_test_() ->
+    {
+        foreach,
+        fun setup_checksum/0,
+        fun teardown_checksum/1,
+        [
+            ?TDEF_FE(t_write_read_xxhash_checksums),
+            ?TDEF_FE(t_downgrade_xxhash_checksums),
+            ?TDEF_FE(t_read_legacy_checksums_after_upgrade)
+        ]
+    }.
+
+setup_checksum() ->
+    Path = ?tempfile(),
+    Ctx = test_util:start_couch(),
+    config:set("couchdb", "write_xxhash_checksums", "false", _Persist = false),
+    {Ctx, Path}.
+
+teardown_checksum({Ctx, Path}) ->
+    file:delete(Path),
+    meck:unload(),
+    test_util:stop_couch(Ctx),
+    persistent_term:erase({couch_file, write_xxhash_checksums}).
+
+t_write_read_xxhash_checksums({_Ctx, Path}) ->
+    enable_xxhash(),
+
+    {ok, Fd} = couch_file:open(Path, [create]),
+    Header = header,
+    ok = couch_file:write_header(Fd, Header),
+    Bin = <<"bin">>,
+    Chunk = couch_file:assemble_file_chunk_and_checksum(Bin),
+    {ok, Pos, _} = couch_file:append_raw_chunk(Fd, Chunk),
+    couch_file:close(Fd),
+
+    {ok, Fd1} = couch_file:open(Path, []),
+    {ok, Header1} = couch_file:read_header(Fd1),
+    ?assertEqual(Header, Header1),
+    {ok, Bin1} = couch_file:pread_binary(Fd1, Pos),
+    ?assertEqual(Bin, Bin1),
+    ?assertEqual(0, legacy_stats()),
+    couch_file:close(Fd1).
+
+t_downgrade_xxhash_checksums({_Ctx, Path}) ->
+    % We're in the future and writting xxhash checkums by default
+    enable_xxhash(),
+    {ok, Fd} = couch_file:open(Path, [create]),
+    Header = header,
+    ok = couch_file:write_header(Fd, Header),
+    Bin = <<"bin">>,
+    Chunk = couch_file:assemble_file_chunk_and_checksum(Bin),
+    {ok, Pos, _} = couch_file:append_raw_chunk(Fd, Chunk),
+    couch_file:close(Fd),
+
+    % The future was broken, we travel back, but still know how to
+    % interpret future checksums without crashing
+    disable_xxhash(),
+    {ok, Fd1} = couch_file:open(Path, []),
+    {ok, Header1} = couch_file:read_header(Fd1),
+    ?assertEqual(Header, Header1),
+    {ok, Bin1} = couch_file:pread_binary(Fd1, Pos),
+    ?assertEqual(Bin, Bin1),
+
+    % We'll write some legacy checksums to the file and then ensure
+    % we can read both legacy and the new ones
+    OtherBin = <<"otherbin">>,
+    OtherChunk = couch_file:assemble_file_chunk_and_checksum(OtherBin),
+    {ok, OtherPos, _} = couch_file:append_raw_chunk(Fd1, OtherChunk),
+    couch_file:close(Fd1),
+
+    {ok, Fd2} = couch_file:open(Path, []),
+    {ok, Header2} = couch_file:read_header(Fd2),
+    ?assertEqual(Header, Header2),
+    {ok, Bin2} = couch_file:pread_binary(Fd2, Pos),
+    {ok, OtherBin1} = couch_file:pread_binary(Fd2, OtherPos),
+    ?assertEqual(Bin, Bin2),
+    ?assertEqual(OtherBin, OtherBin1),
+    couch_file:close(Fd2).
+
+t_read_legacy_checksums_after_upgrade({_Ctx, Path}) ->
+    % We're in the past and writting legacy checkums by default
+    disable_xxhash(),
+    {ok, Fd} = couch_file:open(Path, [create]),
+    Header = header,
+    ok = couch_file:write_header(Fd, Header),
+    Bin = <<"bin">>,
+    Chunk = couch_file:assemble_file_chunk_and_checksum(Bin),
+    {ok, Pos, _} = couch_file:append_raw_chunk(Fd, Chunk),
+    couch_file:close(Fd),
+
+    % We upgrade and xxhash checksums are not the default, but we can
+    % still read legacy checksums.
+    enable_xxhash(),
+    {ok, Fd1} = couch_file:open(Path, []),
+    {ok, Header1} = couch_file:read_header(Fd1),
+    ?assertEqual(Header, Header1),
+    {ok, Bin1} = couch_file:pread_binary(Fd1, Pos),
+    ?assertEqual(Bin, Bin1),
+    % one header, one chunk
+    ?assertEqual(2, legacy_stats()),
+
+    % We'll write some new checksums to the file and then ensure
+    % we can read both legacy and the new ones
+    OtherBin = <<"otherbin">>,
+    OtherChunk = couch_file:assemble_file_chunk_and_checksum(OtherBin),
+    {ok, OtherPos, _} = couch_file:append_raw_chunk(Fd1, OtherChunk),
+    couch_file:close(Fd1),
+
+    couch_stats:decrement_counter([couchdb, legacy_checksums], legacy_stats()),
+    {ok, Fd2} = couch_file:open(Path, []),
+    {ok, Header2} = couch_file:read_header(Fd2),
+    ?assertEqual(Header, Header2),
+    {ok, Bin2} = couch_file:pread_binary(Fd2, Pos),
+    {ok, OtherBin1} = couch_file:pread_binary(Fd2, OtherPos),
+    ?assertEqual(Bin, Bin2),
+    ?assertEqual(OtherBin, OtherBin1),
+    % one header, legacy chunk, not counting new chunk
+    ?assertEqual(2, legacy_stats()),
+    couch_file:close(Fd2).
+
+enable_xxhash() ->

Review Comment:
   looks really good.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] rnewson commented on a diff in pull request #4578: Use xxHash for couch_file checksums

Posted by "rnewson (via GitHub)" <gi...@apache.org>.
rnewson commented on code in PR #4578:
URL: https://github.com/apache/couchdb/pull/4578#discussion_r1187252789


##########
src/couch/src/couch_file.erl:
##########
@@ -906,6 +922,35 @@ reset_eof(#file{} = File) ->
     {ok, Eof} = file:position(File#file.fd, eof),
     File#file{eof = Eof}.
 
+write_checksum(Bin) when is_binary(Bin) ->

Review Comment:
   and `generate_` instead of `write_` because this function generates the checksum but doesn't write it to disk.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4578: Use xxHash for couch_file checksums

Posted by "nickva (via GitHub)" <gi...@apache.org>.
nickva commented on code in PR #4578:
URL: https://github.com/apache/couchdb/pull/4578#discussion_r1187512569


##########
src/couch/src/couch_file.erl:
##########
@@ -906,6 +922,35 @@ reset_eof(#file{} = File) ->
     {ok, Eof} = file:position(File#file.fd, eof),
     File#file{eof = Eof}.
 
+write_checksum(Bin) when is_binary(Bin) ->

Review Comment:
   Good idea. That's a better name for this function.
   
   I updated the function name, but kept the config name. It seem more descriptive at a higher level what happens, as it emphasizes we're not just generating hashes, like for etag, but specifically writting them to couch files.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4578: Use xxHash for couch_file checksums

Posted by "nickva (via GitHub)" <gi...@apache.org>.
nickva commented on code in PR #4578:
URL: https://github.com/apache/couchdb/pull/4578#discussion_r1187515923


##########
src/couch/test/eunit/couch_file_tests.erl:
##########
@@ -551,3 +551,136 @@ fake_fsync_fd() ->
         {'$gen_call', From, sync} ->
             gen:reply(From, {error, eio})
     end.
+
+checksum_test_() ->
+    {
+        foreach,
+        fun setup_checksum/0,
+        fun teardown_checksum/1,
+        [
+            ?TDEF_FE(t_write_read_xxhash_checksums),
+            ?TDEF_FE(t_downgrade_xxhash_checksums),
+            ?TDEF_FE(t_read_legacy_checksums_after_upgrade)
+        ]
+    }.
+
+setup_checksum() ->
+    Path = ?tempfile(),
+    Ctx = test_util:start_couch(),
+    config:set("couchdb", "write_xxhash_checksums", "false", _Persist = false),
+    {Ctx, Path}.
+
+teardown_checksum({Ctx, Path}) ->
+    file:delete(Path),
+    meck:unload(),
+    test_util:stop_couch(Ctx),
+    persistent_term:erase({couch_file, write_xxhash_checksums}).
+
+t_write_read_xxhash_checksums({_Ctx, Path}) ->
+    enable_xxhash(),
+
+    {ok, Fd} = couch_file:open(Path, [create]),
+    Header = header,
+    ok = couch_file:write_header(Fd, Header),
+    Bin = <<"bin">>,
+    Chunk = couch_file:assemble_file_chunk_and_checksum(Bin),
+    {ok, Pos, _} = couch_file:append_raw_chunk(Fd, Chunk),
+    couch_file:close(Fd),
+
+    {ok, Fd1} = couch_file:open(Path, []),
+    {ok, Header1} = couch_file:read_header(Fd1),
+    ?assertEqual(Header, Header1),
+    {ok, Bin1} = couch_file:pread_binary(Fd1, Pos),
+    ?assertEqual(Bin, Bin1),
+    ?assertEqual(0, legacy_stats()),
+    couch_file:close(Fd1).
+
+t_downgrade_xxhash_checksums({_Ctx, Path}) ->
+    % We're in the future and writting xxhash checkums by default
+    enable_xxhash(),
+    {ok, Fd} = couch_file:open(Path, [create]),
+    Header = header,
+    ok = couch_file:write_header(Fd, Header),
+    Bin = <<"bin">>,
+    Chunk = couch_file:assemble_file_chunk_and_checksum(Bin),
+    {ok, Pos, _} = couch_file:append_raw_chunk(Fd, Chunk),
+    couch_file:close(Fd),
+
+    % The future was broken, we travel back, but still know how to
+    % interpret future checksums without crashing
+    disable_xxhash(),
+    {ok, Fd1} = couch_file:open(Path, []),
+    {ok, Header1} = couch_file:read_header(Fd1),
+    ?assertEqual(Header, Header1),
+    {ok, Bin1} = couch_file:pread_binary(Fd1, Pos),
+    ?assertEqual(Bin, Bin1),
+
+    % We'll write some legacy checksums to the file and then ensure
+    % we can read both legacy and the new ones
+    OtherBin = <<"otherbin">>,
+    OtherChunk = couch_file:assemble_file_chunk_and_checksum(OtherBin),
+    {ok, OtherPos, _} = couch_file:append_raw_chunk(Fd1, OtherChunk),
+    couch_file:close(Fd1),
+
+    {ok, Fd2} = couch_file:open(Path, []),
+    {ok, Header2} = couch_file:read_header(Fd2),
+    ?assertEqual(Header, Header2),
+    {ok, Bin2} = couch_file:pread_binary(Fd2, Pos),
+    {ok, OtherBin1} = couch_file:pread_binary(Fd2, OtherPos),
+    ?assertEqual(Bin, Bin2),
+    ?assertEqual(OtherBin, OtherBin1),
+    couch_file:close(Fd2).
+
+t_read_legacy_checksums_after_upgrade({_Ctx, Path}) ->
+    % We're in the past and writting legacy checkums by default
+    disable_xxhash(),
+    {ok, Fd} = couch_file:open(Path, [create]),
+    Header = header,
+    ok = couch_file:write_header(Fd, Header),
+    Bin = <<"bin">>,
+    Chunk = couch_file:assemble_file_chunk_and_checksum(Bin),
+    {ok, Pos, _} = couch_file:append_raw_chunk(Fd, Chunk),
+    couch_file:close(Fd),
+
+    % We upgrade and xxhash checksums are not the default, but we can
+    % still read legacy checksums.
+    enable_xxhash(),
+    {ok, Fd1} = couch_file:open(Path, []),
+    {ok, Header1} = couch_file:read_header(Fd1),
+    ?assertEqual(Header, Header1),
+    {ok, Bin1} = couch_file:pread_binary(Fd1, Pos),
+    ?assertEqual(Bin, Bin1),
+    % one header, one chunk
+    ?assertEqual(2, legacy_stats()),
+
+    % We'll write some new checksums to the file and then ensure
+    % we can read both legacy and the new ones
+    OtherBin = <<"otherbin">>,
+    OtherChunk = couch_file:assemble_file_chunk_and_checksum(OtherBin),
+    {ok, OtherPos, _} = couch_file:append_raw_chunk(Fd1, OtherChunk),
+    couch_file:close(Fd1),
+
+    couch_stats:decrement_counter([couchdb, legacy_checksums], legacy_stats()),
+    {ok, Fd2} = couch_file:open(Path, []),
+    {ok, Header2} = couch_file:read_header(Fd2),
+    ?assertEqual(Header, Header2),
+    {ok, Bin2} = couch_file:pread_binary(Fd2, Pos),
+    {ok, OtherBin1} = couch_file:pread_binary(Fd2, OtherPos),
+    ?assertEqual(Bin, Bin2),
+    ?assertEqual(OtherBin, OtherBin1),
+    % one header, legacy chunk, not counting new chunk
+    ?assertEqual(2, legacy_stats()),
+    couch_file:close(Fd2).
+
+enable_xxhash() ->

Review Comment:
   `?MODULE` won't work as it has to use `couch_file` not the current module name (`couch_file_tests`).  We could export a test-only API function in `couch_file:reset_checksum_config()`? But for a test it will be pretty obvious if they get out of sync.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4578: Use xxHash for couch_file checksums

Posted by "nickva (via GitHub)" <gi...@apache.org>.
nickva commented on code in PR #4578:
URL: https://github.com/apache/couchdb/pull/4578#discussion_r1187512569


##########
src/couch/src/couch_file.erl:
##########
@@ -906,6 +922,35 @@ reset_eof(#file{} = File) ->
     {ok, Eof} = file:position(File#file.fd, eof),
     File#file{eof = Eof}.
 
+write_checksum(Bin) when is_binary(Bin) ->

Review Comment:
   Good idea. That's a better name.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on pull request #4578: Use xxHash for couch_file checksums

Posted by "nickva (via GitHub)" <gi...@apache.org>.
nickva commented on PR #4578:
URL: https://github.com/apache/couchdb/pull/4578#issuecomment-1537468382

   @big-r81 good idea, will do so!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva merged pull request #4578: Use xxHash for couch_file checksums

Posted by "nickva (via GitHub)" <gi...@apache.org>.
nickva merged PR #4578:
URL: https://github.com/apache/couchdb/pull/4578


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on pull request #4578: Use xxHash for couch_file checksums

Posted by "nickva (via GitHub)" <gi...@apache.org>.
nickva commented on PR #4578:
URL: https://github.com/apache/couchdb/pull/4578#issuecomment-1537501462

   We should probably make it configurable and enable a mode where we verify xxh checksums but don't write them, to allow an intermediate release users can downgrade to after writting xxh becomes the default.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] rnewson commented on a diff in pull request #4578: Use xxHash for couch_file checksums

Posted by "rnewson (via GitHub)" <gi...@apache.org>.
rnewson commented on code in PR #4578:
URL: https://github.com/apache/couchdb/pull/4578#discussion_r1187251702


##########
src/couch/src/couch_file.erl:
##########
@@ -906,6 +922,35 @@ reset_eof(#file{} = File) ->
     {ok, Eof} = file:position(File#file.fd, eof),
     File#file{eof = Eof}.
 
+write_checksum(Bin) when is_binary(Bin) ->

Review Comment:
   I'd like to see an assertion that the result of this function is 16 bytes / 128 bits as this is assumed everywhere else, and as a guard against doing something silly in the future without realising.
   
   ```
   -spec write_checksum(binary()) -> <<_:128>>.
   write_checksum(Bin) when is_binary(Bin) ->
   Result = case write_xxhash_checksums() of
       true -> exxhash:xxhash128(Bin);
       false -> couch_hash:md5_hash(Bin)
   end,
   <<_:128>> = Result.
   ```
   
   that sort of thing
   ```
   
   that sort of thing.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4578: Use xxHash for couch_file checksums

Posted by "nickva (via GitHub)" <gi...@apache.org>.
nickva commented on code in PR #4578:
URL: https://github.com/apache/couchdb/pull/4578#discussion_r1187515923


##########
src/couch/test/eunit/couch_file_tests.erl:
##########
@@ -551,3 +551,136 @@ fake_fsync_fd() ->
         {'$gen_call', From, sync} ->
             gen:reply(From, {error, eio})
     end.
+
+checksum_test_() ->
+    {
+        foreach,
+        fun setup_checksum/0,
+        fun teardown_checksum/1,
+        [
+            ?TDEF_FE(t_write_read_xxhash_checksums),
+            ?TDEF_FE(t_downgrade_xxhash_checksums),
+            ?TDEF_FE(t_read_legacy_checksums_after_upgrade)
+        ]
+    }.
+
+setup_checksum() ->
+    Path = ?tempfile(),
+    Ctx = test_util:start_couch(),
+    config:set("couchdb", "write_xxhash_checksums", "false", _Persist = false),
+    {Ctx, Path}.
+
+teardown_checksum({Ctx, Path}) ->
+    file:delete(Path),
+    meck:unload(),
+    test_util:stop_couch(Ctx),
+    persistent_term:erase({couch_file, write_xxhash_checksums}).
+
+t_write_read_xxhash_checksums({_Ctx, Path}) ->
+    enable_xxhash(),
+
+    {ok, Fd} = couch_file:open(Path, [create]),
+    Header = header,
+    ok = couch_file:write_header(Fd, Header),
+    Bin = <<"bin">>,
+    Chunk = couch_file:assemble_file_chunk_and_checksum(Bin),
+    {ok, Pos, _} = couch_file:append_raw_chunk(Fd, Chunk),
+    couch_file:close(Fd),
+
+    {ok, Fd1} = couch_file:open(Path, []),
+    {ok, Header1} = couch_file:read_header(Fd1),
+    ?assertEqual(Header, Header1),
+    {ok, Bin1} = couch_file:pread_binary(Fd1, Pos),
+    ?assertEqual(Bin, Bin1),
+    ?assertEqual(0, legacy_stats()),
+    couch_file:close(Fd1).
+
+t_downgrade_xxhash_checksums({_Ctx, Path}) ->
+    % We're in the future and writting xxhash checkums by default
+    enable_xxhash(),
+    {ok, Fd} = couch_file:open(Path, [create]),
+    Header = header,
+    ok = couch_file:write_header(Fd, Header),
+    Bin = <<"bin">>,
+    Chunk = couch_file:assemble_file_chunk_and_checksum(Bin),
+    {ok, Pos, _} = couch_file:append_raw_chunk(Fd, Chunk),
+    couch_file:close(Fd),
+
+    % The future was broken, we travel back, but still know how to
+    % interpret future checksums without crashing
+    disable_xxhash(),
+    {ok, Fd1} = couch_file:open(Path, []),
+    {ok, Header1} = couch_file:read_header(Fd1),
+    ?assertEqual(Header, Header1),
+    {ok, Bin1} = couch_file:pread_binary(Fd1, Pos),
+    ?assertEqual(Bin, Bin1),
+
+    % We'll write some legacy checksums to the file and then ensure
+    % we can read both legacy and the new ones
+    OtherBin = <<"otherbin">>,
+    OtherChunk = couch_file:assemble_file_chunk_and_checksum(OtherBin),
+    {ok, OtherPos, _} = couch_file:append_raw_chunk(Fd1, OtherChunk),
+    couch_file:close(Fd1),
+
+    {ok, Fd2} = couch_file:open(Path, []),
+    {ok, Header2} = couch_file:read_header(Fd2),
+    ?assertEqual(Header, Header2),
+    {ok, Bin2} = couch_file:pread_binary(Fd2, Pos),
+    {ok, OtherBin1} = couch_file:pread_binary(Fd2, OtherPos),
+    ?assertEqual(Bin, Bin2),
+    ?assertEqual(OtherBin, OtherBin1),
+    couch_file:close(Fd2).
+
+t_read_legacy_checksums_after_upgrade({_Ctx, Path}) ->
+    % We're in the past and writting legacy checkums by default
+    disable_xxhash(),
+    {ok, Fd} = couch_file:open(Path, [create]),
+    Header = header,
+    ok = couch_file:write_header(Fd, Header),
+    Bin = <<"bin">>,
+    Chunk = couch_file:assemble_file_chunk_and_checksum(Bin),
+    {ok, Pos, _} = couch_file:append_raw_chunk(Fd, Chunk),
+    couch_file:close(Fd),
+
+    % We upgrade and xxhash checksums are not the default, but we can
+    % still read legacy checksums.
+    enable_xxhash(),
+    {ok, Fd1} = couch_file:open(Path, []),
+    {ok, Header1} = couch_file:read_header(Fd1),
+    ?assertEqual(Header, Header1),
+    {ok, Bin1} = couch_file:pread_binary(Fd1, Pos),
+    ?assertEqual(Bin, Bin1),
+    % one header, one chunk
+    ?assertEqual(2, legacy_stats()),
+
+    % We'll write some new checksums to the file and then ensure
+    % we can read both legacy and the new ones
+    OtherBin = <<"otherbin">>,
+    OtherChunk = couch_file:assemble_file_chunk_and_checksum(OtherBin),
+    {ok, OtherPos, _} = couch_file:append_raw_chunk(Fd1, OtherChunk),
+    couch_file:close(Fd1),
+
+    couch_stats:decrement_counter([couchdb, legacy_checksums], legacy_stats()),
+    {ok, Fd2} = couch_file:open(Path, []),
+    {ok, Header2} = couch_file:read_header(Fd2),
+    ?assertEqual(Header, Header2),
+    {ok, Bin2} = couch_file:pread_binary(Fd2, Pos),
+    {ok, OtherBin1} = couch_file:pread_binary(Fd2, OtherPos),
+    ?assertEqual(Bin, Bin2),
+    ?assertEqual(OtherBin, OtherBin1),
+    % one header, legacy chunk, not counting new chunk
+    ?assertEqual(2, legacy_stats()),
+    couch_file:close(Fd2).
+
+enable_xxhash() ->

Review Comment:
   `?MODULE` won't work as it has to `couch_file` not the current module name (`couch_file_tests`).  We could export a test-only API function in `couch_file:reset_checksum_config()`? But for a test it will be pretty obvious if they get out of sync.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] rnewson commented on a diff in pull request #4578: Use xxHash for couch_file checksums

Posted by "rnewson (via GitHub)" <gi...@apache.org>.
rnewson commented on code in PR #4578:
URL: https://github.com/apache/couchdb/pull/4578#discussion_r1187259495


##########
src/couch/test/eunit/couch_file_tests.erl:
##########
@@ -551,3 +551,136 @@ fake_fsync_fd() ->
         {'$gen_call', From, sync} ->
             gen:reply(From, {error, eio})
     end.
+
+checksum_test_() ->
+    {
+        foreach,
+        fun setup_checksum/0,
+        fun teardown_checksum/1,
+        [
+            ?TDEF_FE(t_write_read_xxhash_checksums),
+            ?TDEF_FE(t_downgrade_xxhash_checksums),
+            ?TDEF_FE(t_read_legacy_checksums_after_upgrade)
+        ]
+    }.
+
+setup_checksum() ->
+    Path = ?tempfile(),
+    Ctx = test_util:start_couch(),
+    config:set("couchdb", "write_xxhash_checksums", "false", _Persist = false),
+    {Ctx, Path}.
+
+teardown_checksum({Ctx, Path}) ->
+    file:delete(Path),
+    meck:unload(),
+    test_util:stop_couch(Ctx),
+    persistent_term:erase({couch_file, write_xxhash_checksums}).
+
+t_write_read_xxhash_checksums({_Ctx, Path}) ->
+    enable_xxhash(),
+
+    {ok, Fd} = couch_file:open(Path, [create]),
+    Header = header,
+    ok = couch_file:write_header(Fd, Header),
+    Bin = <<"bin">>,
+    Chunk = couch_file:assemble_file_chunk_and_checksum(Bin),
+    {ok, Pos, _} = couch_file:append_raw_chunk(Fd, Chunk),
+    couch_file:close(Fd),
+
+    {ok, Fd1} = couch_file:open(Path, []),
+    {ok, Header1} = couch_file:read_header(Fd1),
+    ?assertEqual(Header, Header1),
+    {ok, Bin1} = couch_file:pread_binary(Fd1, Pos),
+    ?assertEqual(Bin, Bin1),
+    ?assertEqual(0, legacy_stats()),
+    couch_file:close(Fd1).
+
+t_downgrade_xxhash_checksums({_Ctx, Path}) ->
+    % We're in the future and writting xxhash checkums by default
+    enable_xxhash(),
+    {ok, Fd} = couch_file:open(Path, [create]),
+    Header = header,
+    ok = couch_file:write_header(Fd, Header),
+    Bin = <<"bin">>,
+    Chunk = couch_file:assemble_file_chunk_and_checksum(Bin),
+    {ok, Pos, _} = couch_file:append_raw_chunk(Fd, Chunk),
+    couch_file:close(Fd),
+
+    % The future was broken, we travel back, but still know how to
+    % interpret future checksums without crashing
+    disable_xxhash(),
+    {ok, Fd1} = couch_file:open(Path, []),
+    {ok, Header1} = couch_file:read_header(Fd1),
+    ?assertEqual(Header, Header1),
+    {ok, Bin1} = couch_file:pread_binary(Fd1, Pos),
+    ?assertEqual(Bin, Bin1),
+
+    % We'll write some legacy checksums to the file and then ensure
+    % we can read both legacy and the new ones
+    OtherBin = <<"otherbin">>,
+    OtherChunk = couch_file:assemble_file_chunk_and_checksum(OtherBin),
+    {ok, OtherPos, _} = couch_file:append_raw_chunk(Fd1, OtherChunk),
+    couch_file:close(Fd1),
+
+    {ok, Fd2} = couch_file:open(Path, []),
+    {ok, Header2} = couch_file:read_header(Fd2),
+    ?assertEqual(Header, Header2),
+    {ok, Bin2} = couch_file:pread_binary(Fd2, Pos),
+    {ok, OtherBin1} = couch_file:pread_binary(Fd2, OtherPos),
+    ?assertEqual(Bin, Bin2),
+    ?assertEqual(OtherBin, OtherBin1),
+    couch_file:close(Fd2).
+
+t_read_legacy_checksums_after_upgrade({_Ctx, Path}) ->
+    % We're in the past and writting legacy checkums by default
+    disable_xxhash(),
+    {ok, Fd} = couch_file:open(Path, [create]),
+    Header = header,
+    ok = couch_file:write_header(Fd, Header),
+    Bin = <<"bin">>,
+    Chunk = couch_file:assemble_file_chunk_and_checksum(Bin),
+    {ok, Pos, _} = couch_file:append_raw_chunk(Fd, Chunk),
+    couch_file:close(Fd),
+
+    % We upgrade and xxhash checksums are not the default, but we can
+    % still read legacy checksums.
+    enable_xxhash(),
+    {ok, Fd1} = couch_file:open(Path, []),
+    {ok, Header1} = couch_file:read_header(Fd1),
+    ?assertEqual(Header, Header1),
+    {ok, Bin1} = couch_file:pread_binary(Fd1, Pos),
+    ?assertEqual(Bin, Bin1),
+    % one header, one chunk
+    ?assertEqual(2, legacy_stats()),
+
+    % We'll write some new checksums to the file and then ensure
+    % we can read both legacy and the new ones
+    OtherBin = <<"otherbin">>,
+    OtherChunk = couch_file:assemble_file_chunk_and_checksum(OtherBin),
+    {ok, OtherPos, _} = couch_file:append_raw_chunk(Fd1, OtherChunk),
+    couch_file:close(Fd1),
+
+    couch_stats:decrement_counter([couchdb, legacy_checksums], legacy_stats()),
+    {ok, Fd2} = couch_file:open(Path, []),
+    {ok, Header2} = couch_file:read_header(Fd2),
+    ?assertEqual(Header, Header2),
+    {ok, Bin2} = couch_file:pread_binary(Fd2, Pos),
+    {ok, OtherBin1} = couch_file:pread_binary(Fd2, OtherPos),
+    ?assertEqual(Bin, Bin2),
+    ?assertEqual(OtherBin, OtherBin1),
+    % one header, legacy chunk, not counting new chunk
+    ?assertEqual(2, legacy_stats()),
+    couch_file:close(Fd2).
+
+enable_xxhash() ->

Review Comment:
   we should at least use the ?MODULE, ?WRITE_XXHASH_CHECKSUMS definitions that the non-test code uses



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on pull request #4578: Use xxHash for couch_file checksums

Posted by "nickva (via GitHub)" <gi...@apache.org>.
nickva commented on PR #4578:
URL: https://github.com/apache/couchdb/pull/4578#issuecomment-1546701143

   Benchmarking this on small documents I don't see much of a difference, other operations (file reads, etc) probably dominate. On larger documents it is more noticeable (~30% faster reads for 128KB docs).
   
   `write_xxhash_checksums = false`
   
   ```
   (node1@127.0.0.1)3> fabric_bench:go(#{doc_size=>large, docs=>25000}).
    *** Parameters
    * batch_size       : 1000
    * doc_size         : large
    * docs             : 25000
    * individual_docs  : 1000
    * n                : default
    * q                : default
   
    *** Environment
    * Nodes        : 1
    * Bench ver.   : 1
    * N            : 1
    * Q            : 2
    * OS           : unix/darwin
    * Couch ver.   : 3.3.1-9b8435d
    * Couch git sha: 9b8435d
    * VM details   : Erlang/OTP 24 [erts-12.3.2.11] [source] [64-bit] [smp:12:12] [ds:12:12:16] [async-threads:1] [jit]
   
    *** Inserting 25000 docs
    * Add 25000 docs, ok:25/accepted:0       (Hz):     230
    * Get random doc 25000X                  (Hz):     770
    * All docs                               (Hz):  110000
    * All docs w/ include_docs               (Hz):    1300
    * Changes                                (Hz):   53000
    * Single doc updates 1000X               (Hz):      20
    * Time to run all benchmarks            (sec):     194
   ```
   
   `write_xxhash_checksums = true`
   ```
   (node1@127.0.0.1)3> fabric_bench:go(#{doc_size=>large, docs=>25000}).
    *** Parameters
    * batch_size       : 1000
    * doc_size         : large
    * docs             : 25000
    * individual_docs  : 1000
    * n                : default
    * q                : default
   
    *** Environment
    * Nodes        : 1
    * Bench ver.   : 1
    * N            : 1
    * Q            : 2
    * OS           : unix/darwin
    * Couch ver.   : 3.3.1-9b8435d
    * Couch git sha: 9b8435d
    * VM details   : Erlang/OTP 24 [erts-12.3.2.11] [source] [64-bit] [smp:12:12] [ds:12:12:16] [async-threads:1] [jit]
   
    *** Inserting 25000 docs
    * Add 25000 docs, ok:25/accepted:0       (Hz):     240
    * Get random doc 25000X                  (Hz):    1100
    * All docs                               (Hz):  120000
    * All docs w/ include_docs               (Hz):    1300
    * Changes                                (Hz):   51000
    * Single doc updates 1000X               (Hz):      20
    * Time to run all benchmarks            (sec):     180
   ok
   ```
   
   Script used for benchmark: https://gist.github.com/nickva/cf8fb11edecc389daa608bf4e7ae3a36


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] big-r81 commented on pull request #4578: Use xxHash for couch_file checksums

Posted by "big-r81 (via GitHub)" <gi...@apache.org>.
big-r81 commented on PR #4578:
URL: https://github.com/apache/couchdb/pull/4578#issuecomment-1537342334

   Instead of having `verify/extract_digest` do we want to use `verify/extract_checksum` to make it consistent?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4578: Use xxHash for couch_file checksums

Posted by "nickva (via GitHub)" <gi...@apache.org>.
nickva commented on code in PR #4578:
URL: https://github.com/apache/couchdb/pull/4578#discussion_r1187560410


##########
src/couch/test/eunit/couch_file_tests.erl:
##########
@@ -551,3 +551,136 @@ fake_fsync_fd() ->
         {'$gen_call', From, sync} ->
             gen:reply(From, {error, eio})
     end.
+
+checksum_test_() ->
+    {
+        foreach,
+        fun setup_checksum/0,
+        fun teardown_checksum/1,
+        [
+            ?TDEF_FE(t_write_read_xxhash_checksums),
+            ?TDEF_FE(t_downgrade_xxhash_checksums),
+            ?TDEF_FE(t_read_legacy_checksums_after_upgrade)
+        ]
+    }.
+
+setup_checksum() ->
+    Path = ?tempfile(),
+    Ctx = test_util:start_couch(),
+    config:set("couchdb", "write_xxhash_checksums", "false", _Persist = false),
+    {Ctx, Path}.
+
+teardown_checksum({Ctx, Path}) ->
+    file:delete(Path),
+    meck:unload(),
+    test_util:stop_couch(Ctx),
+    persistent_term:erase({couch_file, write_xxhash_checksums}).
+
+t_write_read_xxhash_checksums({_Ctx, Path}) ->
+    enable_xxhash(),
+
+    {ok, Fd} = couch_file:open(Path, [create]),
+    Header = header,
+    ok = couch_file:write_header(Fd, Header),
+    Bin = <<"bin">>,
+    Chunk = couch_file:assemble_file_chunk_and_checksum(Bin),
+    {ok, Pos, _} = couch_file:append_raw_chunk(Fd, Chunk),
+    couch_file:close(Fd),
+
+    {ok, Fd1} = couch_file:open(Path, []),
+    {ok, Header1} = couch_file:read_header(Fd1),
+    ?assertEqual(Header, Header1),
+    {ok, Bin1} = couch_file:pread_binary(Fd1, Pos),
+    ?assertEqual(Bin, Bin1),
+    ?assertEqual(0, legacy_stats()),
+    couch_file:close(Fd1).
+
+t_downgrade_xxhash_checksums({_Ctx, Path}) ->
+    % We're in the future and writting xxhash checkums by default
+    enable_xxhash(),
+    {ok, Fd} = couch_file:open(Path, [create]),
+    Header = header,
+    ok = couch_file:write_header(Fd, Header),
+    Bin = <<"bin">>,
+    Chunk = couch_file:assemble_file_chunk_and_checksum(Bin),
+    {ok, Pos, _} = couch_file:append_raw_chunk(Fd, Chunk),
+    couch_file:close(Fd),
+
+    % The future was broken, we travel back, but still know how to
+    % interpret future checksums without crashing
+    disable_xxhash(),
+    {ok, Fd1} = couch_file:open(Path, []),
+    {ok, Header1} = couch_file:read_header(Fd1),
+    ?assertEqual(Header, Header1),
+    {ok, Bin1} = couch_file:pread_binary(Fd1, Pos),
+    ?assertEqual(Bin, Bin1),
+
+    % We'll write some legacy checksums to the file and then ensure
+    % we can read both legacy and the new ones
+    OtherBin = <<"otherbin">>,
+    OtherChunk = couch_file:assemble_file_chunk_and_checksum(OtherBin),
+    {ok, OtherPos, _} = couch_file:append_raw_chunk(Fd1, OtherChunk),
+    couch_file:close(Fd1),
+
+    {ok, Fd2} = couch_file:open(Path, []),
+    {ok, Header2} = couch_file:read_header(Fd2),
+    ?assertEqual(Header, Header2),
+    {ok, Bin2} = couch_file:pread_binary(Fd2, Pos),
+    {ok, OtherBin1} = couch_file:pread_binary(Fd2, OtherPos),
+    ?assertEqual(Bin, Bin2),
+    ?assertEqual(OtherBin, OtherBin1),
+    couch_file:close(Fd2).
+
+t_read_legacy_checksums_after_upgrade({_Ctx, Path}) ->
+    % We're in the past and writting legacy checkums by default
+    disable_xxhash(),
+    {ok, Fd} = couch_file:open(Path, [create]),
+    Header = header,
+    ok = couch_file:write_header(Fd, Header),
+    Bin = <<"bin">>,
+    Chunk = couch_file:assemble_file_chunk_and_checksum(Bin),
+    {ok, Pos, _} = couch_file:append_raw_chunk(Fd, Chunk),
+    couch_file:close(Fd),
+
+    % We upgrade and xxhash checksums are not the default, but we can
+    % still read legacy checksums.
+    enable_xxhash(),
+    {ok, Fd1} = couch_file:open(Path, []),
+    {ok, Header1} = couch_file:read_header(Fd1),
+    ?assertEqual(Header, Header1),
+    {ok, Bin1} = couch_file:pread_binary(Fd1, Pos),
+    ?assertEqual(Bin, Bin1),
+    % one header, one chunk
+    ?assertEqual(2, legacy_stats()),
+
+    % We'll write some new checksums to the file and then ensure
+    % we can read both legacy and the new ones
+    OtherBin = <<"otherbin">>,
+    OtherChunk = couch_file:assemble_file_chunk_and_checksum(OtherBin),
+    {ok, OtherPos, _} = couch_file:append_raw_chunk(Fd1, OtherChunk),
+    couch_file:close(Fd1),
+
+    couch_stats:decrement_counter([couchdb, legacy_checksums], legacy_stats()),
+    {ok, Fd2} = couch_file:open(Path, []),
+    {ok, Header2} = couch_file:read_header(Fd2),
+    ?assertEqual(Header, Header2),
+    {ok, Bin2} = couch_file:pread_binary(Fd2, Pos),
+    {ok, OtherBin1} = couch_file:pread_binary(Fd2, OtherPos),
+    ?assertEqual(Bin, Bin2),
+    ?assertEqual(OtherBin, OtherBin1),
+    % one header, legacy chunk, not counting new chunk
+    ?assertEqual(2, legacy_stats()),
+    couch_file:close(Fd2).
+
+enable_xxhash() ->

Review Comment:
   I opted for a few test helper functions to encapsulate stats and config atoms.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org