You are viewing a plain text version of this content. The canonical link for it is here.

Posted to user@couchdb.apache.org by Gijs Nelissen <gi...@prezly.com> on 2014/10/10 16:10:02 UTC

Email statistics : using reduce for uniques

Hi,

I have a couchDB view with about 20 million very simple events:

key: [1,1,1,1,'deliver'] { email: "john@example.net", ip: "..."}
key: [1,1,1,1,'open'] { email: "john@example.net", ip: "..."}
key: [1,1,1,1,'click'] { email: "john@example.net", ip: "..."}
key: [1,1,1,2,'deliver'] { email: "john@example.net", ip: "..."}
key: [1,1,1,2,'open'] { email: "john@example.net", ip: "..."}
key: [1,1,1,2,'open'] { email: "john@example.net", ip: "..."} <- second
open by user
key: [1,1,1,2,'open'] { email: "john@example.net", ip: "..."} <- third open
by user

Now I want to produce a MailChimp/Campaign Monitor-like summary per campaign
(key[3]) that shows the number of unique delivers, the number of unique opens
and the number of unique clicks.

I have been trying different approaches to achieve this by using a custom
map and reduce function.

//map
function(doc) {
   emit([doc.license.id,10, doc.release.id, doc.email.id, doc.contact.id,
doc.type], null);
}

//reduce
function(keys, values, rereduce){
    if (rereduce){
        var result_rereduce = {
            contacts: values[0].contacts,
            open: values[0].open,
            click: values[0].click,
            bounce: values[0].bounce,
            unsubscribe: values[0].unsubscribe,
            unopened: values[0].unopened
        };

        for(var i=1,e=values.length; i<e; ++i) {
            result_rereduce.contacts = result_rereduce.contacts +
values[i].contacts;
            result_rereduce.open = result_rereduce.open + values[i].open;
            result_rereduce.click = result_rereduce.click + values[i].click;
            result_rereduce.bounce = result_rereduce.bounce +
values[i].bounce;
            result_rereduce.unsubscribe = result_rereduce.unsubscribe +
values[i].unsubscribe;
            result_rereduce.unopened = result_rereduce.unopened +
values[i].unopened;
        }

        return result_rereduce;
    }

    var unique_contacts = {};
    var unique_opens = {};
    var unique_clicks = {};
    var unique_bounce = {};
    var unique_unsubscribe = {};
    var unique_openorclick = {};

    for(var x=0,y=keys.length; x<y; ++x) {
        log(keys[x][0][4]);

        if (keys[x][0][4] == "email_delivered") {
            if(!unique_contacts[keys[x][0][3]]) {
                unique_contacts[keys[x][0][3]] = true;
            }
        }

        if (keys[x][0][4] == "email_open") {
            if(!unique_opens[keys[x][0][3]]) {
                unique_opens[keys[x][0][3]] = true;

                log('inserting open id ' + keys[x][0][3]);
            } else {
                log('not inserting open id ' + keys[x][0][3]);
            }

            log(unique_opens);
            log("open counter" + Object.keys(unique_opens).length);

            //used for unopened
            if(!unique_openorclick[keys[x][0][3]]) {
                unique_openorclick[keys[x][0][3]] = true;
            }
        }

        if (keys[x][0][4] == "email_click") {
            if(!unique_clicks[keys[x][0][3]]) {
                unique_clicks[keys[x][0][3]] = true;
            }

            //used for unopened
            if(!unique_openorclick[keys[x][0][3]]) {
                unique_openorclick[keys[x][0][3]] = true;
            }
        }

        if (keys[x][0][4] == "email_bounce") {
            if(!unique_bounce[keys[x][0][3]]) {
                unique_bounce[keys[x][0][3]] = true;
            }
        }

        if (keys[x][0][4] == "email_unsubscribe") {
            if(!unique_unsubscribe[keys[x][0][3]]) {
                unique_unsubscribe[keys[x][0][3]] = true;
            }
        }
    }


    var result = {
        contacts: Object.keys(unique_contacts).length,
        open: Object.keys(unique_opens).length,
        click: Object.keys(unique_clicks).length,
        bounce: Object.keys(unique_bounce).length,
        unsubscribe: Object.keys(unique_unsubscribe).length,
        unopened: Object.keys(unique_contacts).length -
Object.keys(unique_openorclick).length
    };


    return result;

}


Now this works fine right up until the rereduce part. The uniqueness detection
only works when rereduce is false.

Any tips on how to achieve this ?

Re: Email statistics : using reduce for uniques

Posted by Gijs Nelissen <gi...@prezly.com>.

I just realized my example reduce is a bit long. Here is a more isolated
function:

//reduce
function(keys, values, rereduce){
    if (rereduce){
        var result_rereduce = {
            contacts: values[0].contacts,
            open: values[0].open

        };

        for(var i=1,e=values.length; i<e; ++i) {
            result_rereduce.contacts = result_rereduce.contacts +
values[i].contacts;
            result_rereduce.open = result_rereduce.open + values[i].open;

        }

        return result_rereduce;
    }

    var unique_contacts = {};
    var unique_opens = {};


    for(var x=0,y=keys.length; x<y; ++x) {
        log(keys[x][0][4]);

        if (keys[x][0][4] == "email_delivered") {
            if(!unique_contacts[keys[x][0][3]]) {
                unique_contacts[keys[x][0][3]] = true;
            }
        }

        if (keys[x][0][4] == "email_open") {
            if(!unique_opens[keys[x][0][3]]) {
                unique_opens[keys[x][0][3]] = true;
            }
        }
    }


    var result = {
        contacts: Object.keys(unique_contacts).length,
        open: Object.keys(unique_opens).length

    };

    return result;
}

On Fri, Oct 10, 2014 at 4:10 PM, Gijs Nelissen <gi...@prezly.com> wrote:

> Hi,
>
> I have a couchDB view with about 20 million very simple events:
>
> key: [1,1,1,1,'deliver'] { email: "john@example.net", ip: "..."}
> key: [1,1,1,1,'open'] { email: "john@example.net", ip: "..."}
> key: [1,1,1,1,'click'] { email: "john@example.net", ip: "..."}
> key: [1,1,1,2,'deliver'] { email: "john@example.net", ip: "..."}
> key: [1,1,1,2,'open'] { email: "john@example.net", ip: "..."}
> key: [1,1,1,2,'open'] { email: "john@example.net", ip: "..."} <- second
> open by user
> key: [1,1,1,2,'open'] { email: "john@example.net", ip: "..."} <- third
> open by user
>
> Now i want to do very mailchimp/campaignmonitor like summary per campaign
> (key[3}) that show nr of unique delivers, nr of unique opens, nr of unique
> clicks.
>
> I have been trying different approaches to achieve this by using a custom
> map and reduce function.
>
> //map
> function(doc) {
>    emit([doc.license.id,10, doc.release.id, doc.email.id, doc.contact.id,
> doc.type], null);
> }
>
> //reduce
> function(keys, values, rereduce){
>     if (rereduce){
>         var result_rereduce = {
>             contacts: values[0].contacts,
>             open: values[0].open,
>             click: values[0].click,
>             bounce: values[0].bounce,
>             unsubscribe: values[0].unsubscribe,
>             unopened: values[0].unopened
>         };
>
>         for(var i=1,e=values.length; i<e; ++i) {
>             result_rereduce.contacts = result_rereduce.contacts +
> values[i].contacts;
>             result_rereduce.open = result_rereduce.open + values[i].open;
>             result_rereduce.click = result_rereduce.click +
> values[i].click;
>             result_rereduce.bounce = result_rereduce.bounce +
> values[i].bounce;
>             result_rereduce.unsubscribe = result_rereduce.unsubscribe +
> values[i].unsubscribe;
>             result_rereduce.unopened = result_rereduce.unopened +
> values[i].unopened;
>         }
>
>         return result_rereduce;
>     }
>
>     var unique_contacts = {};
>     var unique_opens = {};
>     var unique_clicks = {};
>     var unique_bounce = {};
>     var unique_unsubscribe = {};
>     var unique_openorclick = {};
>
>     for(var x=0,y=keys.length; x<y; ++x) {
>         log(keys[x][0][4]);
>
>         if (keys[x][0][4] == "email_delivered") {
>             if(!unique_contacts[keys[x][0][3]]) {
>                 unique_contacts[keys[x][0][3]] = true;
>             }
>         }
>
>         if (keys[x][0][4] == "email_open") {
>             if(!unique_opens[keys[x][0][3]]) {
>                 unique_opens[keys[x][0][3]] = true;
>
>                 log('inserting open id ' + keys[x][0][3]);
>             } else {
>                 log('not inserting open id ' + keys[x][0][3]);
>             }
>
>             log(unique_opens);
>             log("open counter" + Object.keys(unique_opens).length);
>
>             //used for unopened
>             if(!unique_openorclick[keys[x][0][3]]) {
>                 unique_openorclick[keys[x][0][3]] = true;
>             }
>         }
>
>         if (keys[x][0][4] == "email_click") {
>             if(!unique_clicks[keys[x][0][3]]) {
>                 unique_clicks[keys[x][0][3]] = true;
>             }
>
>             //used for unopened
>             if(!unique_openorclick[keys[x][0][3]]) {
>                 unique_openorclick[keys[x][0][3]] = true;
>             }
>         }
>
>         if (keys[x][0][4] == "email_bounce") {
>             if(!unique_bounce[keys[x][0][3]]) {
>                 unique_bounce[keys[x][0][3]] = true;
>             }
>         }
>
>         if (keys[x][0][4] == "email_unsubscribe") {
>             if(!unique_unsubscribe[keys[x][0][3]]) {
>                 unique_unsubscribe[keys[x][0][3]] = true;
>             }
>         }
>     }
>
>
>     var result = {
>         contacts: Object.keys(unique_contacts).length,
>         open: Object.keys(unique_opens).length,
>         click: Object.keys(unique_clicks).length,
>         bounce: Object.keys(unique_bounce).length,
>         unsubscribe: Object.keys(unique_unsubscribe).length,
>         unopened: Object.keys(unique_contacts).length -
> Object.keys(unique_openorclick).length
>     };
>
>
>     return result;
>
> }
>
>
> Now this works fine right until the rereduce part. The uniqueness
> detection only works where reduce=false.
>
> Any tips on how to achieve this ?
>
>
>


-- 

*Gijs Nelissen*, founder
+32(0)472 24 36 21
www.prezly.com
Follow us on Twitter now <http://www.twitter.com/prezly>[image: Make your
PR team rock] <http://www.prezly.com/>

Re: Filtering Documents

Posted by Will Holley <wi...@gmail.com>.

Have you looked at CouchDB-Lucene (https://github.com/rnewson/couchdb-lucene)?

On 13 October 2014 13:30, Troy Martin <tr...@scriptedmotion.com> wrote:
> I'm using node as my server...I could filter on the server but there are potentially thousands of docs to go through so I'm hoping for a solution that will scale better.
>
> Multi range queries sound very promising...I'll look into those! Thanks for your help.
>
> Sent from my iPhone
>
> On Oct 13, 2014, at 5:13 AM, Mike Marino <mm...@gmail.com> wrote:
>
>>>
>>>
>>> Another option is to enable multirange-queries in the CouchDB. I know many
>>> people asking for them and there is existing a plugin/patch for - but there
>>> ends my knowledge. Maybe someone can help with a link.
>>
>> This should already be available in the master branch and should be
>> available in 2.0:
>>
>> https://issues.apache.org/jira/browse/COUCHDB-523
>>
>> Cheers,
>> Mike

Re: Filtering Documents

Posted by Troy Martin <tr...@scriptedmotion.com>.

I'm using node as my server...I could filter on the server but there are potentially thousands of docs to go through so I'm hoping for a solution that will scale better.

Multi range queries sound very promising...I'll look into those! Thanks for your help.

Sent from my iPhone

On Oct 13, 2014, at 5:13 AM, Mike Marino <mm...@gmail.com> wrote:

>> 
>> 
>> Another option is to enable multirange-queries in the CouchDB. I know many
>> people asking for them and there is existing a plugin/patch for - but there
>> ends my knowledge. Maybe someone can help with a link.
> 
> This should already be available in the master branch and should be
> available in 2.0:
> 
> https://issues.apache.org/jira/browse/COUCHDB-523
> 
> Cheers,
> Mike

Re: Filtering Documents

Posted by Mike Marino <mm...@gmail.com>.

>
>
> Another option is to enable multirange-queries in the CouchDB. I know many
> people asking for them and there is existing a plugin/patch for - but there
> ends my knowledge. Maybe someone can help with a link.
>

This should already be available in the master branch and should be
available in 2.0:

https://issues.apache.org/jira/browse/COUCHDB-523

Cheers,
Mike

Re: Filtering Documents

Posted by Ingo Radatz <th...@googlemail.com>.

Hi Troy,

I was in the same situation and finally implemented the filtering client-side (in my case, in the browser). You haven't mentioned how your server-side filtering is ultimately used - so simply take this as another option you have.

Another option is to enable multi-range queries in CouchDB. I know many people are asking for them and that a plugin/patch for them exists - but that is where my knowledge ends. Maybe someone can help with a link.

ingo

On 13 Oct 2014, at 13:15, Troy Martin <tr...@scriptedmotion.com> wrote:

> I've been using couchdb in a project of mine for awhile now with great success. I've run into a problem though that I can't seem to solve with any level of efficiency. 
> 
> I use couch to store workorder documents. I have a list page that outputs a list of workorders along with 6 workorder fields that the user can filter the list on. For example, the user can choose to filter workorders by workorderId, dateCreated, etc.
> 
> To enable filtering, I've been using elasticsearch instead of couch because I don't know a good way to filter the workorder documents using couch without creating a view for every combination of the filters. The problem I'm running into is elasticsearch is near real-time for indexing, so if I pull up the list page, access a workorder, update it and then return to the list page, there is up to a one-second delay due to the index refresh rate before the list page will contain the updated information. How to deal with that? 
> 
> I've been forcibly refreshing the elasticsearch index before returning to the list page, but I can't see that scaling well. Consequently, I've been thinking more and more about filtering documents in couch instead of relying on elasticsearch. Can anyone offer me any suggestions on how to go about this or have I run into a wall here?
> 
> Thanks,
> 
> Troy

Filtering Documents

Posted by Troy Martin <tr...@scriptedmotion.com>.

I've been using CouchDB in a project of mine for a while now with great success. I've run into a problem, though, that I can't seem to solve with any level of efficiency.

I use couch to store workorder documents. I have a list page that outputs a list of workorders along with 6 workorder fields that the user can filter the list on. For example, the user can choose to filter workorders by workorderId, dateCreated, etc.

To enable filtering, I've been using elasticsearch instead of couch because I don't know a good way to filter the workorder documents using couch without creating a view for every combination of the filters. The problem I'm running into is elasticsearch is near real-time for indexing, so if I pull up the list page, access a workorder, update it and then return to the list page, there is up to a one-second delay due to the index refresh rate before the list page will contain the updated information. How to deal with that? 

I've been forcibly refreshing the elasticsearch index before returning to the list page, but I can't see that scaling well. Consequently, I've been thinking more and more about filtering documents in couch instead of relying on elasticsearch. Can anyone offer me any suggestions on how to go about this or have I run into a wall here?

Thanks,

Troy

Re: Email statistics : using reduce for uniques

Posted by Sebastian Rothbucher <se...@googlemail.com>.

Hi Gijs,

Well, I think you need a list function that goes like this:
current type = null
current count = 0
while getRow() do
  if type of row == current type then current count++
  otherwise send current type and current count (unless current type is
    still null), then set current type = type of row and current count = 1
until all rows are done; finally send the last current type and current count

The thing is: the count does not matter, it is just needed to filter out
duplicates. And as this requires sorting before, and the only way to sort
is a view key, you have that. So indeed: I'd count the results ;-)

I think you can go with group_level=exact - and you could even filter by
email.id and type.

Let us know - good luck!
     Sebastian

On Thu, Oct 16, 2014 at 12:29 AM, Rob Crowell <ro...@gmail.com> wrote:

> I've not worked with CouchDB for a year or so now, but back when I was
> using it I ran into the same problem as you.  It was suggested that a _list
> function would be an appropriate way to handle this but I don't know if
> that's still the current thinking.
>
> With this example I believe the database is still grabbing all the groups
> and counting the number of rows, but at least it is doing the row count
> before streaming all the results back to you...
>
> http://stackoverflow.com/a/8142524/195125
>
> function() {
>  var count = 0;
>  while(getRow()) count++;
>  return JSON.stringify({count: count});
> }
>
>
> On Wed, Oct 15, 2014 at 5:17 PM, Gijs Nelissen <gi...@prezly.com> wrote:
>
> > On Sat, Oct 11, 2014 at 9:23 PM, Sebastian Rothbucher <
> > sebastianrothbucher@googlemail.com> wrote:
> >
> > > Thanks Aurélien, this is great! And indeed I think one does need the
> > > contact_id as part of the key, otherwise there is no way of having
> > > uniqueness. And as soon as it is part of the key, there is no reduce of
> > > stuff belonging to different contact_ids. So the replacement for |sort
> > > |uniq |wc -l in UNIX is a key with the sorting criterion plus a list
> > > function ;-)  Again thanks, it helps me a lot also!!!!!
> > >
> >
> > I tried to do it this way:
> >
> > function(doc) {
> >     if (doc.contact.id && doc.email.id && doc.license.id &&
> doc.release.id
> > && doc.type.substring(0,6) == 'email_') {
> >         var type =  doc.type;
> >         emit([doc.email.id, type, doc.contact.id], null);
> >     }
> > }
> >
> > and a native count function.
> >
> > Now how do i get the number of unique opens ?
> >
> > query key -> [email.id, 'opens',[]] + group_level=3 and then count the
> > number of results ?
> > What am i missing here?
> >
> >
> >
> >
> >
> > >
> > > On Fri, Oct 10, 2014 at 6:00 PM, Aurélien Bénel <aurelien.benel@utt.fr
> >
> > > wrote:
> > >
> > > > Hi Gijs,
> > > >
> > > > > I have been trying different approaches to achieve this by using a
> > > > custom map and reduce function.
> > > >
> > > > My rule of thumb is to avoid custom reduce functions at all cost.
> > > > Maybe it's a bit harsh but it saved me a lot of time and frustration.
> > > >
> > > > >>> Now i want to do very mailchimp/campaignmonitor like summary per
> > > > campaign (key[3}) that show nr of unique delivers, nr of unique
> opens,
> > nr
> > > > of unique clicks.
> > > > > SELECT count(*) FROM events WHERE type='click' GROUP BY contact_id;
> > > > > But i want the single view to output both the unique clicks, views
> > and
> > > > opens
> > > >
> > > >
> > > > First, you should emit the following keys (in this order, with no
> > value):
> > > >
> > > >     [campaign, type, contact_id]
> > > >
> > > > Then you can reduce those data (with any builtin reduce function,
> > > `_count`
> > > > for example) and `group=true` (which is shorter than
> > > > `reduce=true&group_level=exact`).
> > > >
> > > > Then you'll need an other computation round to count unique contacts
> > (per
> > > > campaign and type). While waiting for chained map-reduce (coming
> soon I
> > > > hope), you can cheat and do it with a list.
> > > > With a list, it is usually a good idea to send the results as soon as
> > you
> > > > know them (i.e., in this case, when there is a new `type` in the
> > current
> > > > row or when there are no rows anymore).
> > > >
> > > >
> > > > Regards,
> > > >
> > > > Aurélien
> > > >
> > > >
> > > >
> > >
> >
>

Re: Email statistics : using reduce for uniques

Posted by Rob Crowell <ro...@gmail.com>.

I've not worked with CouchDB for a year or so now, but back when I was
using it I ran into the same problem as you.  It was suggested that a _list
function would be an appropriate way to handle this but I don't know if
that's still the current thinking.

With this example I believe the database is still grabbing all the groups
and counting the number of rows, but at least it is doing the row count
before streaming all the results back to you...

http://stackoverflow.com/a/8142524/195125

function() {
 var count = 0;
 while(getRow()) count++;
 return JSON.stringify({count: count});
}


On Wed, Oct 15, 2014 at 5:17 PM, Gijs Nelissen <gi...@prezly.com> wrote:

> On Sat, Oct 11, 2014 at 9:23 PM, Sebastian Rothbucher <
> sebastianrothbucher@googlemail.com> wrote:
>
> > Thanks Aurélien, this is great! And indeed I think one does need the
> > contact_id as part of the key, otherwise there is no way of having
> > uniqueness. And as soon as it is part of the key, there is no reduce of
> > stuff belonging to different contact_ids. So the replacement for |sort
> > |uniq |wc -l in UNIX is a key with the sorting criterion plus a list
> > function ;-)  Again thanks, it helps me a lot also!!!!!
> >
>
> I tried to do it this way:
>
> function(doc) {
>     if (doc.contact.id && doc.email.id && doc.license.id && doc.release.id
> && doc.type.substring(0,6) == 'email_') {
>         var type =  doc.type;
>         emit([doc.email.id, type, doc.contact.id], null);
>     }
> }
>
> and a native count function.
>
> Now how do i get the number of unique opens ?
>
> query key -> [email.id, 'opens',[]] + group_level=3 and then count the
> number of results ?
> What am i missing here?
>
>
>
>
>
> >
> > On Fri, Oct 10, 2014 at 6:00 PM, Aurélien Bénel <au...@utt.fr>
> > wrote:
> >
> > > Hi Gijs,
> > >
> > > > I have been trying different approaches to achieve this by using a
> > > custom map and reduce function.
> > >
> > > My rule of thumb is to avoid custom reduce functions at all cost.
> > > Maybe it's a bit harsh but it saved me a lot of time and frustration.
> > >
> > > >>> Now i want to do very mailchimp/campaignmonitor like summary per
> > > campaign (key[3}) that show nr of unique delivers, nr of unique opens,
> nr
> > > of unique clicks.
> > > > SELECT count(*) FROM events WHERE type='click' GROUP BY contact_id;
> > > > But i want the single view to output both the unique clicks, views
> and
> > > opens
> > >
> > >
> > > First, you should emit the following keys (in this order, with no
> value):
> > >
> > >     [campaign, type, contact_id]
> > >
> > > Then you can reduce those data (with any builtin reduce function,
> > `_count`
> > > for example) and `group=true` (which is shorter than
> > > `reduce=true&group_level=exact`).
> > >
> > > Then you'll need an other computation round to count unique contacts
> (per
> > > campaign and type). While waiting for chained map-reduce (coming soon I
> > > hope), you can cheat and do it with a list.
> > > With a list, it is usually a good idea to send the results as soon as
> you
> > > know them (i.e., in this case, when there is a new `type` in the
> current
> > > row or when there are no rows anymore).
> > >
> > >
> > > Regards,
> > >
> > > Aurélien
> > >
> > >
> > >
> >
>

Re: Email statistics : using reduce for uniques

Posted by Gijs Nelissen <gi...@prezly.com>.

On Sat, Oct 11, 2014 at 9:23 PM, Sebastian Rothbucher <
sebastianrothbucher@googlemail.com> wrote:

> Thanks Aurélien, this is great! And indeed I think one does need the
> contact_id as part of the key, otherwise there is no way of having
> uniqueness. And as soon as it is part of the key, there is no reduce of
> stuff belonging to different contact_ids. So the replacement for |sort
> |uniq |wc -l in UNIX is a key with the sorting criterion plus a list
> function ;-)  Again thanks, it helps me a lot also!!!!!
>

I tried to do it this way:

function(doc) {
    if (doc.contact.id && doc.email.id && doc.license.id && doc.release.id
&& doc.type.substring(0,6) == 'email_') {
        var type =  doc.type;
        emit([doc.email.id, type, doc.contact.id], null);
    }
}

and a native count function.

Now how do i get the number of unique opens ?

query key -> [email.id, 'opens',[]] + group_level=3 and then count the
number of results ?
What am i missing here?





>
> On Fri, Oct 10, 2014 at 6:00 PM, Aurélien Bénel <au...@utt.fr>
> wrote:
>
> > Hi Gijs,
> >
> > > I have been trying different approaches to achieve this by using a
> > custom map and reduce function.
> >
> > My rule of thumb is to avoid custom reduce functions at all cost.
> > Maybe it's a bit harsh but it saved me a lot of time and frustration.
> >
> > >>> Now i want to do very mailchimp/campaignmonitor like summary per
> > campaign (key[3}) that show nr of unique delivers, nr of unique opens, nr
> > of unique clicks.
> > > SELECT count(*) FROM events WHERE type='click' GROUP BY contact_id;
> > > But i want the single view to output both the unique clicks, views and
> > opens
> >
> >
> > First, you should emit the following keys (in this order, with no value):
> >
> >     [campaign, type, contact_id]
> >
> > Then you can reduce those data (with any builtin reduce function,
> `_count`
> > for example) and `group=true` (which is shorter than
> > `reduce=true&group_level=exact`).
> >
> > Then you'll need an other computation round to count unique contacts (per
> > campaign and type). While waiting for chained map-reduce (coming soon I
> > hope), you can cheat and do it with a list.
> > With a list, it is usually a good idea to send the results as soon as you
> > know them (i.e., in this case, when there is a new `type` in the current
> > row or when there are no rows anymore).
> >
> >
> > Regards,
> >
> > Aurélien
> >
> >
> >
>

Re: Email statistics : using reduce for uniques

Posted by Sebastian Rothbucher <se...@googlemail.com>.

Thanks Aurélien, this is great! And indeed I think one does need the
contact_id as part of the key, otherwise there is no way of having
uniqueness. And as soon as it is part of the key, there is no reduce of
stuff belonging to different contact_ids. So the replacement for |sort
|uniq |wc -l in UNIX is a key with the sorting criterion plus a list
function ;-)  Again thanks, it helps me a lot also!!!!!

On Fri, Oct 10, 2014 at 6:00 PM, Aurélien Bénel <au...@utt.fr>
wrote:

> Hi Gijs,
>
> > I have been trying different approaches to achieve this by using a
> custom map and reduce function.
>
> My rule of thumb is to avoid custom reduce functions at all cost.
> Maybe it's a bit harsh but it saved me a lot of time and frustration.
>
> >>> Now i want to do very mailchimp/campaignmonitor like summary per
> campaign (key[3}) that show nr of unique delivers, nr of unique opens, nr
> of unique clicks.
> > SELECT count(*) FROM events WHERE type='click' GROUP BY contact_id;
> > But i want the single view to output both the unique clicks, views and
> opens
>
>
> First, you should emit the following keys (in this order, with no value):
>
>     [campaign, type, contact_id]
>
> Then you can reduce those data (with any builtin reduce function, `_count`
> for example) and `group=true` (which is shorter than
> `reduce=true&group_level=exact`).
>
> Then you'll need an other computation round to count unique contacts (per
> campaign and type). While waiting for chained map-reduce (coming soon I
> hope), you can cheat and do it with a list.
> With a list, it is usually a good idea to send the results as soon as you
> know them (i.e., in this case, when there is a new `type` in the current
> row or when there are no rows anymore).
>
>
> Regards,
>
> Aurélien
>
>
>

Re: Email statistics : using reduce for uniques

Posted by Aurélien Bénel <au...@utt.fr>.

Hi Gijs,

> I have been trying different approaches to achieve this by using a custom map and reduce function.

My rule of thumb is to avoid custom reduce functions at all cost.
Maybe it's a bit harsh but it saved me a lot of time and frustration.

>>> Now i want to do very mailchimp/campaignmonitor like summary per campaign (key[3}) that show nr of unique delivers, nr of unique opens, nr of unique clicks.
> SELECT count(*) FROM events WHERE type='click' GROUP BY contact_id;
> But i want the single view to output both the unique clicks, views and opens


First, you should emit the following keys (in this order, with no value):

    [campaign, type, contact_id]

Then you can reduce those data (with any builtin reduce function, `_count` for example) and `group=true` (which is shorter than `reduce=true&group_level=exact`).

Then you'll need another computation round to count unique contacts (per campaign and type). While waiting for chained map-reduce (coming soon, I hope), you can cheat and do it with a list.
With a list, it is usually a good idea to send the results as soon as you know them (i.e., in this case, when there is a new `type` in the current row or when there are no rows anymore).  


Regards,

Aurélien