You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2020/04/04 20:51:55 UTC

[couchdb] branch compress-doc-bodies-and-attachments created (now f30a15b)

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a change to branch compress-doc-bodies-and-attachments
in repository https://gitbox.apache.org/repos/asf/couchdb.git.


      at f30a15b  Compress doc bodies and attachments

This branch includes the following new commits:

     new f30a15b  Compress doc bodies and attachments

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[couchdb] 01/01: Compress doc bodies and attachments

Posted by va...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch compress-doc-bodies-and-attachments
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit f30a15b5f23d566d4613ceb3a9cffa1d175fc476
Author: Nick Vatamaniuc <va...@apache.org>
AuthorDate: Sat Apr 4 16:45:52 2020 -0400

    Compress doc bodies and attachments
    
    In CouchDB < 4.x we compressed document bodies by default, so enable
    it for >= 4.x as well.
    
    Use the basic term_to_binary compression mechanism for:
    
     - Document bodies
    
     - Local document bodies
    
     - Attachments, but only if they have not already been compressed.
---
 src/fabric/include/fabric2.hrl            |  4 +++
 src/fabric/src/fabric2_db.erl             |  3 +-
 src/fabric/src/fabric2_fdb.erl            | 39 ++++++++++++++++++-----
 src/fabric/test/fabric2_doc_att_tests.erl | 51 +++++++++++++++++++++++++++++--
 4 files changed, 86 insertions(+), 11 deletions(-)

diff --git a/src/fabric/include/fabric2.hrl b/src/fabric/include/fabric2.hrl
index 0c07575..99ac874 100644
--- a/src/fabric/include/fabric2.hrl
+++ b/src/fabric/include/fabric2.hrl
@@ -54,6 +54,10 @@
 
 -define(CURR_LDOC_FORMAT, 0).
 
+% 0 - Attachment storage version
+
+-define(CURR_ATT_STORAGE_VER, 0).
+
 % Misc constants
 
 -define(PDICT_DB_KEY, '$fabric_db_handle').
diff --git a/src/fabric/src/fabric2_db.erl b/src/fabric/src/fabric2_db.erl
index ca9f037..edabe4e 100644
--- a/src/fabric/src/fabric2_db.erl
+++ b/src/fabric/src/fabric2_db.erl
@@ -809,7 +809,8 @@ read_attachment(Db, DocId, AttId) ->
 
 write_attachment(Db, DocId, Att) ->
     Data = couch_att:fetch(data, Att),
-    {ok, AttId} = fabric2_fdb:write_attachment(Db, DocId, Data),
+    Encoding = couch_att:fetch(encoding, Att),
+    {ok, AttId} = fabric2_fdb:write_attachment(Db, DocId, Data, Encoding),
     couch_att:store(data, {loc, Db, DocId, AttId}, Att).
 
 
diff --git a/src/fabric/src/fabric2_fdb.erl b/src/fabric/src/fabric2_fdb.erl
index 22ccc99..74f9651 100644
--- a/src/fabric/src/fabric2_fdb.erl
+++ b/src/fabric/src/fabric2_fdb.erl
@@ -54,7 +54,7 @@
     write_local_doc/2,
 
     read_attachment/3,
-    write_attachment/3,
+    write_attachment/4,
 
     get_last_change/1,
 
@@ -896,27 +896,51 @@ read_attachment(#{} = Db, DocId, AttId) ->
         db_prefix := DbPrefix
     } = ensure_current(Db),
 
+    IdKey = erlfdb_tuple:pack({?DB_ATT_NAMES, DocId, AttId}, DbPrefix),
+    InfoFuture = erlfdb:get(Tx, IdKey),
+
     AttKey = erlfdb_tuple:pack({?DB_ATTS, DocId, AttId}, DbPrefix),
-    case erlfdb:wait(erlfdb:get_range_startswith(Tx, AttKey)) of
+    Data = case erlfdb:wait(erlfdb:get_range_startswith(Tx, AttKey)) of
         not_found ->
             throw({not_found, missing});
         KVs ->
             Vs = [V || {_K, V} <- KVs],
             iolist_to_binary(Vs)
+    end,
+
+    case erlfdb:wait(InfoFuture) of
+        <<>> ->
+            Data; % Old format, before CURR_ATT_STORAGE_VER = 0
+        InfoBin ->
+            {?CURR_ATT_STORAGE_VER, Compressed} = erlfdb_tuple:unpack(InfoBin),
+            case Compressed of
+                true -> binary_to_term(Data, [safe]);
+                false -> Data
+            end
     end.
 
 
-write_attachment(#{} = Db, DocId, Data) when is_binary(Data) ->
+write_attachment(#{} = Db, DocId, Data, Encoding)
+        when is_binary(Data), is_atom(Encoding) ->
     #{
         tx := Tx,
         db_prefix := DbPrefix
     } = ensure_current(Db),
 
     AttId = fabric2_util:uuid(),
-    Chunks = chunkify_binary(Data),
+    {Data1, Compressed} = case Encoding of
+        gzip ->
+            {Data, false};
+        _ ->
+            Opts = [{minor_version, 1}, {compressed, 6}],
+            {term_to_binary(Data, Opts), true}
+    end,
 
     IdKey = erlfdb_tuple:pack({?DB_ATT_NAMES, DocId, AttId}, DbPrefix),
-    ok = erlfdb:set(Tx, IdKey, <<>>),
+    InfoVal = erlfdb_tuple:pack({?CURR_ATT_STORAGE_VER, Compressed}),
+    ok = erlfdb:set(Tx, IdKey, InfoVal),
+
+    Chunks = chunkify_binary(Data1),
 
     lists:foldl(fun(Chunk, ChunkId) ->
         AttKey = erlfdb_tuple:pack({?DB_ATTS, DocId, AttId, ChunkId}, DbPrefix),
@@ -1356,7 +1380,8 @@ doc_to_fdb(Db, #doc{} = Doc) ->
 
     DiskAtts = lists:map(fun couch_att:to_disk_term/1, Atts),
 
-    Value = term_to_binary({Body, DiskAtts, Deleted}, [{minor_version, 1}]),
+    Opts = [{minor_version, 1}, {compressed, 6}],
+    Value = term_to_binary({Body, DiskAtts, Deleted}, Opts),
     Chunks = chunkify_binary(Value),
 
     {Rows, _} = lists:mapfoldl(fun(Chunk, ChunkId) ->
@@ -1408,7 +1433,7 @@ local_doc_to_fdb(Db, #doc{} = Doc) ->
         _ when is_binary(Rev) -> Rev
     end,
 
-    BVal = term_to_binary(Body, [{minor_version, 1}]),
+    BVal = term_to_binary(Body, [{minor_version, 1}, {compressed, 6}]),
     {Rows, _} = lists:mapfoldl(fun(Chunk, ChunkId) ->
         K = erlfdb_tuple:pack({?DB_LOCAL_DOC_BODIES, Id, ChunkId}, DbPrefix),
         {{K, Chunk}, ChunkId + 1}
diff --git a/src/fabric/test/fabric2_doc_att_tests.erl b/src/fabric/test/fabric2_doc_att_tests.erl
index ac531e9..fc7bbcc 100644
--- a/src/fabric/test/fabric2_doc_att_tests.erl
+++ b/src/fabric/test/fabric2_doc_att_tests.erl
@@ -29,6 +29,7 @@ doc_crud_test_() ->
             fun cleanup/1,
             with([
                 ?TDEF(create_att),
+                ?TDEF(create_att_already_compressed),
                 ?TDEF(delete_att),
                 ?TDEF(multiple_atts),
                 ?TDEF(delete_one_att),
@@ -84,7 +85,47 @@ create_att({Db, _}) ->
         IdVal = erlfdb:wait(erlfdb:get(Tx, IdKey)),
         AttVals = erlfdb:wait(erlfdb:get_range_startswith(Tx, AttKey)),
 
-        ?assertEqual(<<>>, IdVal),
+        ?assertEqual(erlfdb_tuple:pack({0, true}), IdVal),
+        Opts = [{minor_version, 1}, {compressed, 6}],
+        Expect = term_to_binary(<<"foobar">>, Opts),
+        ?assertMatch([{_, Expect}], AttVals)
+    end).
+
+create_att_already_compressed({Db, _}) ->
+    DocId = fabric2_util:uuid(),
+    Att1 = couch_att:new([
+        {name, <<"foo.txt">>},
+        {type, <<"application/octet-stream">>},
+        {att_len, 6},
+        {data, <<"foobar">>},
+        {encoding, gzip},
+        {md5, <<>>}
+    ]),
+    Doc1 = #doc{
+        id = DocId,
+        atts = [Att1]
+    },
+    {ok, _} = fabric2_db:update_doc(Db, Doc1),
+    {ok, Doc2} = fabric2_db:open_doc(Db, DocId),
+    #doc{
+        atts = [Att2]
+    } = Doc2,
+    {loc, _Db, DocId, AttId} = couch_att:fetch(data, Att2),
+    AttData = fabric2_db:read_attachment(Db, DocId, AttId),
+    ?assertEqual(<<"foobar">>, AttData),
+
+    % Check that the raw keys exist
+    #{
+        db_prefix := DbPrefix
+    } = Db,
+    IdKey = erlfdb_tuple:pack({?DB_ATT_NAMES, DocId, AttId}, DbPrefix),
+    AttKey = erlfdb_tuple:pack({?DB_ATTS, DocId, AttId}, DbPrefix),
+
+    fabric2_fdb:transactional(fun(Tx) ->
+        IdVal = erlfdb:wait(erlfdb:get(Tx, IdKey)),
+        AttVals = erlfdb:wait(erlfdb:get_range_startswith(Tx, AttKey)),
+
+        ?assertEqual(erlfdb_tuple:pack({0, false}), IdVal),
         ?assertMatch([{_, <<"foobar">>}], AttVals)
     end).
 
@@ -175,7 +216,7 @@ large_att({Db, _}) ->
     AttData = iolist_to_binary([
         <<"foobar">> || _ <- lists:seq(1, 60000)
     ]),
-    Att1 = mk_att(<<"long.txt">>, AttData),
+    Att1 = mk_att(<<"long.txt">>, AttData, gzip),
     {ok, _} = create_doc(Db, DocId, [Att1]),
     ?assertEqual(#{<<"long.txt">> => AttData}, read_atts(Db, DocId)),
 
@@ -204,12 +245,16 @@ att_on_conflict_isolation({Db, _}) ->
 
 
 mk_att(Name, Data) ->
+    mk_att(Name, Data, identity).
+
+
+mk_att(Name, Data, Encoding) ->
     couch_att:new([
         {name, Name},
         {type, <<"application/octet-stream">>},
         {att_len, size(Data)},
         {data, Data},
-        {encoding, identity},
+        {encoding, Encoding},
         {md5, <<>>}
     ]).