You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2020/04/04 20:51:56 UTC

[couchdb] 01/01: Compress doc bodies and attachments

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch compress-doc-bodies-and-attachments
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit f30a15b5f23d566d4613ceb3a9cffa1d175fc476
Author: Nick Vatamaniuc <va...@apache.org>
AuthorDate: Sat Apr 4 16:45:52 2020 -0400

    Compress doc bodies and attachments
    
    In CouchDB < 4.x we compressed document bodies by default, so enable
    compression for >= 4.x as well.
    
    Use the basic term_to_binary compression mechanism for:
    
     - Document bodies
    
     - Local document bodies
    
     - Attachments, but only if they have not already been compressed.
---
 src/fabric/include/fabric2.hrl            |  4 +++
 src/fabric/src/fabric2_db.erl             |  3 +-
 src/fabric/src/fabric2_fdb.erl            | 39 ++++++++++++++++++-----
 src/fabric/test/fabric2_doc_att_tests.erl | 51 +++++++++++++++++++++++++++++--
 4 files changed, 86 insertions(+), 11 deletions(-)

diff --git a/src/fabric/include/fabric2.hrl b/src/fabric/include/fabric2.hrl
index 0c07575..99ac874 100644
--- a/src/fabric/include/fabric2.hrl
+++ b/src/fabric/include/fabric2.hrl
@@ -54,6 +54,10 @@
 
 -define(CURR_LDOC_FORMAT, 0).
 
+% 0 - Attachment storage version
+
+-define(CURR_ATT_STORAGE_VER, 0).
+
 % Misc constants
 
 -define(PDICT_DB_KEY, '$fabric_db_handle').
diff --git a/src/fabric/src/fabric2_db.erl b/src/fabric/src/fabric2_db.erl
index ca9f037..edabe4e 100644
--- a/src/fabric/src/fabric2_db.erl
+++ b/src/fabric/src/fabric2_db.erl
@@ -809,7 +809,8 @@ read_attachment(Db, DocId, AttId) ->
 
 write_attachment(Db, DocId, Att) ->
     Data = couch_att:fetch(data, Att),
-    {ok, AttId} = fabric2_fdb:write_attachment(Db, DocId, Data),
+    Encoding = couch_att:fetch(encoding, Att),
+    {ok, AttId} = fabric2_fdb:write_attachment(Db, DocId, Data, Encoding),
     couch_att:store(data, {loc, Db, DocId, AttId}, Att).
 
 
diff --git a/src/fabric/src/fabric2_fdb.erl b/src/fabric/src/fabric2_fdb.erl
index 22ccc99..74f9651 100644
--- a/src/fabric/src/fabric2_fdb.erl
+++ b/src/fabric/src/fabric2_fdb.erl
@@ -54,7 +54,7 @@
     write_local_doc/2,
 
     read_attachment/3,
-    write_attachment/3,
+    write_attachment/4,
 
     get_last_change/1,
 
@@ -896,27 +896,51 @@ read_attachment(#{} = Db, DocId, AttId) ->
         db_prefix := DbPrefix
     } = ensure_current(Db),
 
+    IdKey = erlfdb_tuple:pack({?DB_ATT_NAMES, DocId, AttId}, DbPrefix),
+    InfoFuture = erlfdb:get(Tx, IdKey),
+
     AttKey = erlfdb_tuple:pack({?DB_ATTS, DocId, AttId}, DbPrefix),
-    case erlfdb:wait(erlfdb:get_range_startswith(Tx, AttKey)) of
+    Data = case erlfdb:wait(erlfdb:get_range_startswith(Tx, AttKey)) of
         not_found ->
             throw({not_found, missing});
         KVs ->
             Vs = [V || {_K, V} <- KVs],
             iolist_to_binary(Vs)
+    end,
+
+    case erlfdb:wait(InfoFuture) of
+        <<>> ->
+            Data; % Old format, before CURR_ATT_STORAGE_VER = 0
+        InfoBin ->
+            {?CURR_ATT_STORAGE_VER, Compressed} = erlfdb_tuple:unpack(InfoBin),
+            case Compressed of
+                true -> binary_to_term(Data, [safe]);
+                false -> Data
+            end
     end.
 
 
-write_attachment(#{} = Db, DocId, Data) when is_binary(Data) ->
+write_attachment(#{} = Db, DocId, Data, Encoding)
+        when is_binary(Data), is_atom(Encoding) ->
     #{
         tx := Tx,
         db_prefix := DbPrefix
     } = ensure_current(Db),
 
     AttId = fabric2_util:uuid(),
-    Chunks = chunkify_binary(Data),
+    {Data1, Compressed} = case Encoding of
+        gzip ->
+            {Data, false};
+        _ ->
+            Opts = [{minor_version, 1}, {compressed, 6}],
+            {term_to_binary(Data, Opts), true}
+    end,
 
     IdKey = erlfdb_tuple:pack({?DB_ATT_NAMES, DocId, AttId}, DbPrefix),
-    ok = erlfdb:set(Tx, IdKey, <<>>),
+    InfoVal = erlfdb_tuple:pack({?CURR_ATT_STORAGE_VER, Compressed}),
+    ok = erlfdb:set(Tx, IdKey, InfoVal),
+
+    Chunks = chunkify_binary(Data1),
 
     lists:foldl(fun(Chunk, ChunkId) ->
         AttKey = erlfdb_tuple:pack({?DB_ATTS, DocId, AttId, ChunkId}, DbPrefix),
@@ -1356,7 +1380,8 @@ doc_to_fdb(Db, #doc{} = Doc) ->
 
     DiskAtts = lists:map(fun couch_att:to_disk_term/1, Atts),
 
-    Value = term_to_binary({Body, DiskAtts, Deleted}, [{minor_version, 1}]),
+    Opts = [{minor_version, 1}, {compressed, 6}],
+    Value = term_to_binary({Body, DiskAtts, Deleted}, Opts),
     Chunks = chunkify_binary(Value),
 
     {Rows, _} = lists:mapfoldl(fun(Chunk, ChunkId) ->
@@ -1408,7 +1433,7 @@ local_doc_to_fdb(Db, #doc{} = Doc) ->
         _ when is_binary(Rev) -> Rev
     end,
 
-    BVal = term_to_binary(Body, [{minor_version, 1}]),
+    BVal = term_to_binary(Body, [{minor_version, 1}, {compressed, 6}]),
     {Rows, _} = lists:mapfoldl(fun(Chunk, ChunkId) ->
         K = erlfdb_tuple:pack({?DB_LOCAL_DOC_BODIES, Id, ChunkId}, DbPrefix),
         {{K, Chunk}, ChunkId + 1}
diff --git a/src/fabric/test/fabric2_doc_att_tests.erl b/src/fabric/test/fabric2_doc_att_tests.erl
index ac531e9..fc7bbcc 100644
--- a/src/fabric/test/fabric2_doc_att_tests.erl
+++ b/src/fabric/test/fabric2_doc_att_tests.erl
@@ -29,6 +29,7 @@ doc_crud_test_() ->
             fun cleanup/1,
             with([
                 ?TDEF(create_att),
+                ?TDEF(create_att_already_compressed),
                 ?TDEF(delete_att),
                 ?TDEF(multiple_atts),
                 ?TDEF(delete_one_att),
@@ -84,7 +85,47 @@ create_att({Db, _}) ->
         IdVal = erlfdb:wait(erlfdb:get(Tx, IdKey)),
         AttVals = erlfdb:wait(erlfdb:get_range_startswith(Tx, AttKey)),
 
-        ?assertEqual(<<>>, IdVal),
+        ?assertEqual(erlfdb_tuple:pack({0, true}), IdVal),
+        Opts = [{minor_version, 1}, {compressed, 6}],
+        Expect = term_to_binary(<<"foobar">>, Opts),
+        ?assertMatch([{_, Expect}], AttVals)
+    end).
+
+create_att_already_compressed({Db, _}) ->
+    DocId = fabric2_util:uuid(),
+    Att1 = couch_att:new([
+        {name, <<"foo.txt">>},
+        {type, <<"application/octet-stream">>},
+        {att_len, 6},
+        {data, <<"foobar">>},
+        {encoding, gzip},
+        {md5, <<>>}
+    ]),
+    Doc1 = #doc{
+        id = DocId,
+        atts = [Att1]
+    },
+    {ok, _} = fabric2_db:update_doc(Db, Doc1),
+    {ok, Doc2} = fabric2_db:open_doc(Db, DocId),
+    #doc{
+        atts = [Att2]
+    } = Doc2,
+    {loc, _Db, DocId, AttId} = couch_att:fetch(data, Att2),
+    AttData = fabric2_db:read_attachment(Db, DocId, AttId),
+    ?assertEqual(<<"foobar">>, AttData),
+
+    % Check that the raw keys exist
+    #{
+        db_prefix := DbPrefix
+    } = Db,
+    IdKey = erlfdb_tuple:pack({?DB_ATT_NAMES, DocId, AttId}, DbPrefix),
+    AttKey = erlfdb_tuple:pack({?DB_ATTS, DocId, AttId}, DbPrefix),
+
+    fabric2_fdb:transactional(fun(Tx) ->
+        IdVal = erlfdb:wait(erlfdb:get(Tx, IdKey)),
+        AttVals = erlfdb:wait(erlfdb:get_range_startswith(Tx, AttKey)),
+
+        ?assertEqual(erlfdb_tuple:pack({0, false}), IdVal),
         ?assertMatch([{_, <<"foobar">>}], AttVals)
     end).
 
@@ -175,7 +216,7 @@ large_att({Db, _}) ->
     AttData = iolist_to_binary([
         <<"foobar">> || _ <- lists:seq(1, 60000)
     ]),
-    Att1 = mk_att(<<"long.txt">>, AttData),
+    Att1 = mk_att(<<"long.txt">>, AttData, gzip),
     {ok, _} = create_doc(Db, DocId, [Att1]),
     ?assertEqual(#{<<"long.txt">> => AttData}, read_atts(Db, DocId)),
 
@@ -204,12 +245,16 @@ att_on_conflict_isolation({Db, _}) ->
 
 
 mk_att(Name, Data) ->
+    mk_att(Name, Data, identity).
+
+
+mk_att(Name, Data, Encoding) ->
     couch_att:new([
         {name, Name},
         {type, <<"application/octet-stream">>},
         {att_len, size(Data)},
         {data, Data},
-        {encoding, identity},
+        {encoding, Encoding},
         {md5, <<>>}
     ]).