Posted to user@cassandra.apache.org by Ravikumar Govindarajan <ra...@gmail.com> on 2013/09/10 15:21:35 UTC

Composite Column Grouping

I am facing a problem of grouping composite columns on their second component.

Let's say my CF contains this:


TimeSeriesCF
    key:                 UserID
    composite-col-name:  TimeUUID:PKID

Some sample data

UserID = XYZ
                Time:PKID
    Col-Name1 = 200:1000
    Col-Name2 = 201:1001
    Col-Name3 = 202:1000
    Col-Name4 = 203:1000
    Col-Name5 = 204:1002

Whenever a time-series query is issued, it should return the following in
time-desc order.

UserID = XYZ
    Col-Name5 = 204:1002
    Col-Name4 = 203:1000
    Col-Name2 = 201:1001

Is something like this possible in Cassandra? Is there a different way to
design and achieve the same objective?

--
Ravi

Re: Composite Column Grouping

Posted by "Laing, Michael" <mi...@nytimes.com>.
Here's a slightly better version and a python script. -ml

-- put this in <file> and run using 'cqlsh -f <file>'

DROP KEYSPACE latest;

CREATE KEYSPACE latest WITH replication = {
    'class': 'SimpleStrategy',
    'replication_factor' : 1
};

USE latest;

CREATE TABLE time_series (
    -- bucket is the beginning of a datetime span concatenated with a shard designator
    bucket_userid text,
    user_id text,
    pkid text,
    timeuuid text,
    colname text,
    PRIMARY KEY (bucket_userid, timeuuid)
);

UPDATE time_series
SET
    user_id = 'XYZ',
    pkid = '1000',
    colname = 'Col-Name-1'
WHERE
    bucket_userid = '2013-09-11T05:15-0_XYZ' AND
    timeuuid='200'
;
UPDATE time_series
SET
    user_id = 'XYZ',
    pkid = '1001',
    colname = 'Col-Name-2'
WHERE
    bucket_userid = '2013-09-11T05:15-1_XYZ' AND
    timeuuid='201'
;
UPDATE time_series
SET
    user_id = 'XYZ',
    pkid = '1000',
    colname = 'Col-Name-3'
WHERE
    bucket_userid = '2013-09-11T05:15-0_XYZ' AND
    timeuuid='202'
;
UPDATE time_series
SET
    user_id = 'XYZ',
    pkid = '1000',
    colname = 'Col-Name-4'
WHERE
    bucket_userid = '2013-09-11T05:30-1_XYZ' AND
    timeuuid='203'
;
UPDATE time_series
SET
    user_id = 'XYZ',
    pkid = '1002',
    colname = 'Col-Name-5'
WHERE
    bucket_userid = '2013-09-11T05:30-0_XYZ' AND
    timeuuid='204'
;

-- This query assumes that the 'current' span is 2013-09-11T05:30 and I am
-- interested in this span and the previous one.

SELECT * FROM time_series
WHERE bucket_userid IN (
    -- go back as many spans as you need to, all shards in each span (cartesian product)
    '2013-09-11T05:15-0_XYZ',
    '2013-09-11T05:15-1_XYZ',
    '2013-09-11T05:30-0_XYZ',
    '2013-09-11T05:30-1_XYZ'
)
-- you could add a range condition on timeuuid to further restrict the results
ORDER BY timeuuid DESC;

-- returns:
-- bucket_userid          | timeuuid | colname    | pkid | user_id
--------------------------+----------+------------+------+---------
-- 2013-09-11T05:30-0_XYZ |      204 | Col-Name-5 | 1002 |     XYZ
-- 2013-09-11T05:30-1_XYZ |      203 | Col-Name-4 | 1000 |     XYZ
-- 2013-09-11T05:15-0_XYZ |      202 | Col-Name-3 | 1000 |     XYZ
-- 2013-09-11T05:15-1_XYZ |      201 | Col-Name-2 | 1001 |     XYZ
-- 2013-09-11T05:15-0_XYZ |      200 | Col-Name-1 | 1000 |     XYZ

-- do a stable de-duplication on pkid (keep only the first, i.e. newest, row
-- per pkid) to get the result; the python script below does exactly this


python script:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
logging.basicConfig()

import cassandra
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement

cql_cluster = Cluster()
cql_session = cql_cluster.connect()
cql_session.set_keyspace('latest')

select_stmt = """
    SELECT * FROM time_series
    WHERE bucket_userid IN ( -- go back as many spans as you need to, all
shards in each span (cartesian product)
        '2013-09-11T05:15-0_XYZ',
        '2013-09-11T05:15-1_XYZ',
        '2013-09-11T05:30-0_XYZ',
        '2013-09-11T05:30-1_XYZ'
    )
    ORDER BY timeuuid DESC;
"""

query = SimpleStatement(select_stmt)
rows = cql_session.execute(query)

pkids = set()
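# rows arrive in timeuuid-descending order (ORDER BY above), so keeping only
# the first row seen for each pkid keeps the latest entry per pkid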
for row in rows:
    if row.pkid in pkids:
        continue
    else:
        print row.user_id, row.timeuuid, row.colname, row.pkid
        pkids.add(row.pkid)

# prints:

# XYZ 204 Col-Name-5 1002
# XYZ 203 Col-Name-4 1000
# XYZ 201 Col-Name-2 1001
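
A small helper for deriving the bucket_userid partition keys, in case it is
useful. This is only a sketch: the 15-minute span, the two shards, and the
function names are my own assumptions, chosen to match the example keys above.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# sketch only: build partition keys like '2013-09-11T05:15-0_XYZ'
# (span start + shard + user id); adjust SPAN_MINUTES / NUM_SHARDS as needed
from datetime import datetime, timedelta

SPAN_MINUTES = 15   # assumed span size, matching the example above
NUM_SHARDS = 2      # assumed shard count; you would usually use more

def bucket_userid(ts, user_id, shard):
    span_start = ts.replace(minute=(ts.minute // SPAN_MINUTES) * SPAN_MINUTES,
                            second=0, microsecond=0)
    return '%s-%d_%s' % (span_start.strftime('%Y-%m-%dT%H:%M'), shard, user_id)

def read_buckets(now, user_id, spans_back=1):
    # every shard in the current span and in spans_back earlier spans
    keys = []
    for i in range(spans_back + 1):
        ts = now - timedelta(minutes=i * SPAN_MINUTES)
        keys.extend(bucket_userid(ts, user_id, s) for s in range(NUM_SHARDS))
    return keys

# read_buckets(datetime(2013, 9, 11, 5, 30), 'XYZ') yields the same four keys
# used in the IN (...) clause of the query above.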


On Wed, Sep 11, 2013 at 6:13 AM, Laing, Michael
<mi...@nytimes.com>wrote:

> Then you can do this. I handle millions of entries this way and it works
> well if you are mostly interested in recent activity.
>
> If you need to span all activity then you can use a separate table to
> maintain the 'latest'. This table should also be sharded as entries will be
> 'hot'. Sharding will spread the heat and the tombstones (compaction load)
> around the cluster.
>
> -ml
>
> -- put this in <file> and run using 'cqlsh -f <file>
>
> DROP KEYSPACE latest;
>
> CREATE KEYSPACE latest WITH replication = {
>     'class': 'SimpleStrategy',
>     'replication_factor' : 1
> };
>
> USE latest;
>
> CREATE TABLE time_series (
>     bucket_userid text, -- bucket is the beginning of a datetime span
> concatenated with a shard designator
>     pkid text,
>     timeuuid text,
>     colname text,
>     PRIMARY KEY (bucket_userid, timeuuid)
> );
>
> -- the example table is using 15 minute bucket spans and 2 shards for
> illustration (you would usually use more shards)
> -- adjust these appropriately for your application
>
> UPDATE time_series SET pkid = '1000', colname = 'Col-Name-1' where
> bucket_userid = '2013-09-11T05:15-0_XYZ' AND timeuuid='200';
> UPDATE time_series SET pkid = '1001', colname = 'Col-Name-2' where
> bucket_userid = '2013-09-11T05:15-1_XYZ' AND timeuuid='201';
> UPDATE time_series SET pkid = '1000', colname = 'Col-Name-3' where
> bucket_userid = '2013-09-11T05:15-0_XYZ' AND timeuuid='202';
> UPDATE time_series SET pkid = '1000', colname = 'Col-Name-4' where
> bucket_userid = '2013-09-11T05:30-1_XYZ' AND timeuuid='203';
> UPDATE time_series SET pkid = '1002', colname = 'Col-Name-5' where
> bucket_userid = '2013-09-11T05:30-0_XYZ' AND timeuuid='204';
>
> -- This query assumes that the 'current' span is 2013-09-11T05:30 and I am
> interested in this span and the previous one.
>
> SELECT * FROM time_series
> WHERE bucket_userid in ( -- go back as many spans as you need to, all
> shards in each span (cartesian product)
>     '2013-09-11T05:15-0_XYZ',
>     '2013-09-11T05:15-1_XYZ',
>     '2013-09-11T05:30-0_XYZ',
>     '2013-09-11T05:30-1_XYZ'
> )
> ORDER BY timeuuid DESC;
>
> -- returns:
> -- bucket_userid          | timeuuid | colname    | pkid
> --------------------------+----------+------------+------
> -- 2013-09-11T05:30-0_XYZ |      204 | Col-Name-5 | 1002
> -- 2013-09-11T05:30-1_XYZ |      203 | Col-Name-4 | 1000
> -- 2013-09-11T05:15-0_XYZ |      202 | Col-Name-3 | 1000
> -- 2013-09-11T05:15-1_XYZ |      201 | Col-Name-2 | 1001
> -- 2013-09-11T05:15-0_XYZ |      200 | Col-Name-1 | 1000
>
> -- do a stable purge on pkid to get the result.
>
>
> On Wed, Sep 11, 2013 at 1:01 AM, Ravikumar Govindarajan <
> ravikumar.govindarajan@gmail.com> wrote:
>
>> Thanks Michael,
>>
>> But I cannot sort the rows in memory, as the number of columns will be
>> quite huge.
>>
>> From the python script above:
>>    select_stmt = "select * from time_series where userid = 'XYZ'"
>>
>> This would return me many hundreds of thousands of columns. I need to go
>> in time-series order using ranges [Pagination queries].
>>
>>
>> On Wed, Sep 11, 2013 at 7:06 AM, Laing, Michael <
>> michael.laing@nytimes.com> wrote:
>>
>>> If you have set up the table as described in my previous message, you
>>> could run this python snippet to return the desired result:
>>>
>>> #!/usr/bin/env python
>>> # -*- coding: utf-8 -*-
>>> import logging
>>> logging.basicConfig()
>>>
>>> from operator import itemgetter
>>>
>>> import cassandra
>>> from cassandra.cluster import Cluster
>>> from cassandra.query import SimpleStatement
>>>
>>> cql_cluster = Cluster()
>>> cql_session = cql_cluster.connect()
>>> cql_session.set_keyspace('latest')
>>>
>>> select_stmt = "select * from time_series where userid = 'XYZ'"
>>> query = SimpleStatement(select_stmt)
>>> rows = cql_session.execute(query)
>>>
>>> results = []
>>> for row in rows:
>>>     max_time = max(row.colname.keys())
>>>     results.append((row.userid, row.pkid, max_time,
>>> row.colname[max_time]))
>>>
>>> sorted_results = sorted(results, key=itemgetter(2), reverse=True)
>>> for result in sorted_results: print result
>>>
>>> # prints:
>>>
>>> # (u'XYZ', u'1002', u'204', u'Col-Name-5')
>>> # (u'XYZ', u'1000', u'203', u'Col-Name-4')
>>> # (u'XYZ', u'1001', u'201', u'Col-Name-2')
>>>
>>>
>>>
>>> On Tue, Sep 10, 2013 at 6:32 PM, Laing, Michael <
>>> michael.laing@nytimes.com> wrote:
>>>
>>>> You could try this. C* doesn't do it all for you, but it will
>>>> efficiently get you the right data.
>>>>
>>>> -ml
>>>>
>>>> -- put this in <file> and run using 'cqlsh -f <file>
>>>>
>>>> DROP KEYSPACE latest;
>>>>
>>>> CREATE KEYSPACE latest WITH replication = {
>>>>     'class': 'SimpleStrategy',
>>>>     'replication_factor' : 1
>>>> };
>>>>
>>>> USE latest;
>>>>
>>>> CREATE TABLE time_series (
>>>>     userid text,
>>>>     pkid text,
>>>>     colname map<text, text>,
>>>>     PRIMARY KEY (userid, pkid)
>>>> );
>>>>
>>>> UPDATE time_series SET colname = colname + {'200':'Col-Name-1'} WHERE
>>>> userid = 'XYZ' AND pkid = '1000';
>>>> UPDATE time_series SET colname = colname +
>>>> {'201':'Col-Name-2'} WHERE userid = 'XYZ' AND pkid = '1001';
>>>> UPDATE time_series SET colname = colname +
>>>> {'202':'Col-Name-3'} WHERE userid = 'XYZ' AND pkid = '1000';
>>>> UPDATE time_series SET colname = colname +
>>>> {'203':'Col-Name-4'} WHERE userid = 'XYZ' AND pkid = '1000';
>>>> UPDATE time_series SET colname = colname +
>>>> {'204':'Col-Name-5'} WHERE userid = 'XYZ' AND pkid = '1002';
>>>>
>>>> SELECT * FROM time_series WHERE userid = 'XYZ';
>>>>
>>>> -- returns:
>>>> -- userid | pkid | colname
>>>>
>>>> ----------+------+-----------------------------------------------------------------
>>>> --    XYZ | 1000 | {'200': 'Col-Name-1', '202': 'Col-Name-3', '203':
>>>> 'Col-Name-4'}
>>>> --    XYZ | 1001 |                                           {'201':
>>>> 'Col-Name-2'}
>>>> --    XYZ | 1002 |                                           {'204':
>>>> 'Col-Name-5'}
>>>>
>>>> -- use an app to pop off the latest key/value from the map for each
>>>> row, then sort by key desc.
>>>>
>>>>
>>>> On Tue, Sep 10, 2013 at 9:21 AM, Ravikumar Govindarajan <
>>>> ravikumar.govindarajan@gmail.com> wrote:
>>>>
>>>>> I have been faced with a problem of grouping composites on the
>>>>> second-part.
>>>>>
>>>>> Lets say my CF contains this
>>>>>
>>>>>
>>>>> TimeSeriesCF
>>>>>                        key:                            UserID
>>>>>                        composite-col-name:    TimeUUID:PKID
>>>>>
>>>>> Some sample data
>>>>>
>>>>> UserID = XYZ
>>>>>                                  Time:PKID
>>>>>                Col-Name1 = 200:1000
>>>>>                Col-Name2 = 201:1001
>>>>>                Col-Name3 = 202:1000
>>>>>                Col-Name4 = 203:1000
>>>>>                Col-Name5 = 204:1002
>>>>>
>>>>> Whenever a time-series query is issued, it should return the following
>>>>> in time-desc order.
>>>>>
>>>>> UserID = XYZ
>>>>>               Col-Name5 = 204:1002
>>>>>               Col-Name4 = 203:1000
>>>>>               Col-Name2 = 201:1001
>>>>>
>>>>> Is something like this possible in Cassandra? Is there a different way
>>>>> to design and achieve the same objective?
>>>>>
>>>>> --
>>>>> Ravi
>>>>>
>>>>>
>>>>
>>>>
>>>
>>
>

Re: Composite Column Grouping

Posted by "Laing, Michael" <mi...@nytimes.com>.
Then you can do this. I handle millions of entries this way and it works
well if you are mostly interested in recent activity.

If you need to span all activity, you can use a separate table to maintain
the 'latest'; a sketch of one possible shape for such a table follows the
script below. That table should also be sharded, as its entries will be
'hot': sharding spreads the heat and the tombstones (compaction load) around
the cluster.

-ml

-- put this in <file> and run using 'cqlsh -f <file>'

DROP KEYSPACE latest;

CREATE KEYSPACE latest WITH replication = {
    'class': 'SimpleStrategy',
    'replication_factor' : 1
};

USE latest;

CREATE TABLE time_series (
    -- bucket is the beginning of a datetime span concatenated with a shard designator
    bucket_userid text,
    pkid text,
    timeuuid text,
    colname text,
    PRIMARY KEY (bucket_userid, timeuuid)
);

-- the example table is using 15 minute bucket spans and 2 shards for
-- illustration (you would usually use more shards)
-- adjust these appropriately for your application

UPDATE time_series SET pkid = '1000', colname = 'Col-Name-1' where
bucket_userid = '2013-09-11T05:15-0_XYZ' AND timeuuid='200';
UPDATE time_series SET pkid = '1001', colname = 'Col-Name-2' where
bucket_userid = '2013-09-11T05:15-1_XYZ' AND timeuuid='201';
UPDATE time_series SET pkid = '1000', colname = 'Col-Name-3' where
bucket_userid = '2013-09-11T05:15-0_XYZ' AND timeuuid='202';
UPDATE time_series SET pkid = '1000', colname = 'Col-Name-4' where
bucket_userid = '2013-09-11T05:30-1_XYZ' AND timeuuid='203';
UPDATE time_series SET pkid = '1002', colname = 'Col-Name-5' where
bucket_userid = '2013-09-11T05:30-0_XYZ' AND timeuuid='204';

-- This query assumes that the 'current' span is 2013-09-11T05:30 and I am
-- interested in this span and the previous one.

SELECT * FROM time_series
WHERE bucket_userid IN (
    -- go back as many spans as you need to, all shards in each span (cartesian product)
    '2013-09-11T05:15-0_XYZ',
    '2013-09-11T05:15-1_XYZ',
    '2013-09-11T05:30-0_XYZ',
    '2013-09-11T05:30-1_XYZ'
)
ORDER BY timeuuid DESC;

-- returns:
-- bucket_userid          | timeuuid | colname    | pkid
--------------------------+----------+------------+------
-- 2013-09-11T05:30-0_XYZ |      204 | Col-Name-5 | 1002
-- 2013-09-11T05:30-1_XYZ |      203 | Col-Name-4 | 1000
-- 2013-09-11T05:15-0_XYZ |      202 | Col-Name-3 | 1000
-- 2013-09-11T05:15-1_XYZ |      201 | Col-Name-2 | 1001
-- 2013-09-11T05:15-0_XYZ |      200 | Col-Name-1 | 1000

-- do a stable de-duplication on pkid (keep only the first, i.e. newest, row
-- per pkid) to get the result.
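
As for the separate 'latest' table mentioned at the top, here is one possible
shape for it, sketched from Python. The table name, columns, and the
shard-by-pkid choice are illustrations of the idea, not a tested design.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# sketch only: a sharded 'latest' table keyed by (shard_userid, pkid); each
# write overwrites the stored timeuuid/colname for that pkid. This assumes
# events for a given pkid arrive roughly in time order; otherwise read and
# compare before overwriting.
from cassandra.cluster import Cluster

NUM_SHARDS = 16  # spreads the hot rows (and compaction load) around the cluster

cql_cluster = Cluster()
cql_session = cql_cluster.connect()
cql_session.set_keyspace('latest')

cql_session.execute("""
    CREATE TABLE latest_by_pkid (
        shard_userid text,   -- '<shard>_<userid>'
        pkid text,
        timeuuid text,
        colname text,
        PRIMARY KEY (shard_userid, pkid)
    );
""")

def record_latest(user_id, pkid, timeuuid, colname):
    shard_userid = '%d_%s' % (hash(pkid) % NUM_SHARDS, user_id)
    cql_session.execute(
        "UPDATE latest_by_pkid SET timeuuid = %s, colname = %s "
        "WHERE shard_userid = %s AND pkid = %s",
        (timeuuid, colname, shard_userid, pkid))

# to read, SELECT from all shards for the user ('0_XYZ' ... '15_XYZ') and sort
# the returned rows by timeuuid descending on the client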


On Wed, Sep 11, 2013 at 1:01 AM, Ravikumar Govindarajan <
ravikumar.govindarajan@gmail.com> wrote:

> Thanks Michael,
>
> But I cannot sort the rows in memory, as the number of columns will be
> quite huge.
>
> From the python script above:
>    select_stmt = "select * from time_series where userid = 'XYZ'"
>
> This would return me many hundreds of thousands of columns. I need to go
> in time-series order using ranges [Pagination queries].
>
>
> On Wed, Sep 11, 2013 at 7:06 AM, Laing, Michael <michael.laing@nytimes.com
> > wrote:
>
>> If you have set up the table as described in my previous message, you
>> could run this python snippet to return the desired result:
>>
>> #!/usr/bin/env python
>> # -*- coding: utf-8 -*-
>> import logging
>> logging.basicConfig()
>>
>> from operator import itemgetter
>>
>> import cassandra
>> from cassandra.cluster import Cluster
>> from cassandra.query import SimpleStatement
>>
>> cql_cluster = Cluster()
>> cql_session = cql_cluster.connect()
>> cql_session.set_keyspace('latest')
>>
>> select_stmt = "select * from time_series where userid = 'XYZ'"
>> query = SimpleStatement(select_stmt)
>> rows = cql_session.execute(query)
>>
>> results = []
>> for row in rows:
>>     max_time = max(row.colname.keys())
>>     results.append((row.userid, row.pkid, max_time,
>> row.colname[max_time]))
>>
>> sorted_results = sorted(results, key=itemgetter(2), reverse=True)
>> for result in sorted_results: print result
>>
>> # prints:
>>
>> # (u'XYZ', u'1002', u'204', u'Col-Name-5')
>> # (u'XYZ', u'1000', u'203', u'Col-Name-4')
>> # (u'XYZ', u'1001', u'201', u'Col-Name-2')
>>
>>
>>
>> On Tue, Sep 10, 2013 at 6:32 PM, Laing, Michael <
>> michael.laing@nytimes.com> wrote:
>>
>>> You could try this. C* doesn't do it all for you, but it will
>>> efficiently get you the right data.
>>>
>>> -ml
>>>
>>> -- put this in <file> and run using 'cqlsh -f <file>
>>>
>>> DROP KEYSPACE latest;
>>>
>>> CREATE KEYSPACE latest WITH replication = {
>>>     'class': 'SimpleStrategy',
>>>     'replication_factor' : 1
>>> };
>>>
>>> USE latest;
>>>
>>> CREATE TABLE time_series (
>>>     userid text,
>>>     pkid text,
>>>     colname map<text, text>,
>>>     PRIMARY KEY (userid, pkid)
>>> );
>>>
>>> UPDATE time_series SET colname = colname + {'200':'Col-Name-1'} WHERE
>>> userid = 'XYZ' AND pkid = '1000';
>>> UPDATE time_series SET colname = colname +
>>> {'201':'Col-Name-2'} WHERE userid = 'XYZ' AND pkid = '1001';
>>> UPDATE time_series SET colname = colname +
>>> {'202':'Col-Name-3'} WHERE userid = 'XYZ' AND pkid = '1000';
>>> UPDATE time_series SET colname = colname +
>>> {'203':'Col-Name-4'} WHERE userid = 'XYZ' AND pkid = '1000';
>>> UPDATE time_series SET colname = colname +
>>> {'204':'Col-Name-5'} WHERE userid = 'XYZ' AND pkid = '1002';
>>>
>>> SELECT * FROM time_series WHERE userid = 'XYZ';
>>>
>>> -- returns:
>>> -- userid | pkid | colname
>>>
>>> ----------+------+-----------------------------------------------------------------
>>> --    XYZ | 1000 | {'200': 'Col-Name-1', '202': 'Col-Name-3', '203':
>>> 'Col-Name-4'}
>>> --    XYZ | 1001 |                                           {'201':
>>> 'Col-Name-2'}
>>> --    XYZ | 1002 |                                           {'204':
>>> 'Col-Name-5'}
>>>
>>> -- use an app to pop off the latest key/value from the map for each row,
>>> then sort by key desc.
>>>
>>>
>>> On Tue, Sep 10, 2013 at 9:21 AM, Ravikumar Govindarajan <
>>> ravikumar.govindarajan@gmail.com> wrote:
>>>
>>>> I have been faced with a problem of grouping composites on the
>>>> second-part.
>>>>
>>>> Lets say my CF contains this
>>>>
>>>>
>>>> TimeSeriesCF
>>>>                        key:                            UserID
>>>>                        composite-col-name:    TimeUUID:PKID
>>>>
>>>> Some sample data
>>>>
>>>> UserID = XYZ
>>>>                                  Time:PKID
>>>>                Col-Name1 = 200:1000
>>>>                Col-Name2 = 201:1001
>>>>                Col-Name3 = 202:1000
>>>>                Col-Name4 = 203:1000
>>>>                Col-Name5 = 204:1002
>>>>
>>>> Whenever a time-series query is issued, it should return the following
>>>> in time-desc order.
>>>>
>>>> UserID = XYZ
>>>>               Col-Name5 = 204:1002
>>>>               Col-Name4 = 203:1000
>>>>               Col-Name2 = 201:1001
>>>>
>>>> Is something like this possible in Cassandra? Is there a different way
>>>> to design and achieve the same objective?
>>>>
>>>> --
>>>> Ravi
>>>>
>>>>
>>>
>>>
>>
>

Re: Composite Column Grouping

Posted by Ravikumar Govindarajan <ra...@gmail.com>.
Thanks Michael,

But I cannot sort the rows in memory, as the number of columns will be
quite large.

From the python script above:
   select_stmt = "select * from time_series where userid = 'XYZ'"

This would return many hundreds of thousands of columns. I need to go
through them in time-series order using ranges (pagination queries).
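
With the sharded time_series table from the bucketed design earlier in the
thread, one way to page in time order is to carry the last timeuuid seen as a
cursor and add a range condition, as hinted at in the SELECT comments there.
A rough sketch (the page size and cursor handling are illustrative, not from
the thread):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# sketch only: page newest-first over a fixed set of buckets, carrying the
# last timeuuid seen as a cursor and restricting with a range condition
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement

PAGE_SIZE = 100

cql_cluster = Cluster()
cql_session = cql_cluster.connect()
cql_session.set_keyspace('latest')

BUCKETS = ("'2013-09-11T05:15-0_XYZ', '2013-09-11T05:15-1_XYZ', "
           "'2013-09-11T05:30-0_XYZ', '2013-09-11T05:30-1_XYZ'")

def fetch_page(before_timeuuid=None):
    # only rows older than the cursor, newest first, at most PAGE_SIZE of them
    cond = " AND timeuuid < '%s'" % before_timeuuid if before_timeuuid else ""
    stmt = ("SELECT * FROM time_series WHERE bucket_userid IN (%s)%s "
            "ORDER BY timeuuid DESC LIMIT %d;" % (BUCKETS, cond, PAGE_SIZE))
    return cql_session.execute(SimpleStatement(stmt))

page = fetch_page()                                   # first page
# next page: fetch_page(before_timeuuid=page[-1].timeuuid)
# once a set of spans is exhausted, shift the bucket list back one span and repeat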


On Wed, Sep 11, 2013 at 7:06 AM, Laing, Michael
<mi...@nytimes.com>wrote:

> If you have set up the table as described in my previous message, you
> could run this python snippet to return the desired result:
>
> #!/usr/bin/env python
> # -*- coding: utf-8 -*-
> import logging
> logging.basicConfig()
>
> from operator import itemgetter
>
> import cassandra
> from cassandra.cluster import Cluster
> from cassandra.query import SimpleStatement
>
> cql_cluster = Cluster()
> cql_session = cql_cluster.connect()
> cql_session.set_keyspace('latest')
>
> select_stmt = "select * from time_series where userid = 'XYZ'"
> query = SimpleStatement(select_stmt)
> rows = cql_session.execute(query)
>
> results = []
> for row in rows:
>     max_time = max(row.colname.keys())
>     results.append((row.userid, row.pkid, max_time, row.colname[max_time]))
>
> sorted_results = sorted(results, key=itemgetter(2), reverse=True)
> for result in sorted_results: print result
>
> # prints:
>
> # (u'XYZ', u'1002', u'204', u'Col-Name-5')
> # (u'XYZ', u'1000', u'203', u'Col-Name-4')
> # (u'XYZ', u'1001', u'201', u'Col-Name-2')
>
>
>
> On Tue, Sep 10, 2013 at 6:32 PM, Laing, Michael <michael.laing@nytimes.com
> > wrote:
>
>> You could try this. C* doesn't do it all for you, but it will efficiently
>> get you the right data.
>>
>> -ml
>>
>> -- put this in <file> and run using 'cqlsh -f <file>
>>
>> DROP KEYSPACE latest;
>>
>> CREATE KEYSPACE latest WITH replication = {
>>     'class': 'SimpleStrategy',
>>     'replication_factor' : 1
>> };
>>
>> USE latest;
>>
>> CREATE TABLE time_series (
>>     userid text,
>>     pkid text,
>>     colname map<text, text>,
>>     PRIMARY KEY (userid, pkid)
>> );
>>
>> UPDATE time_series SET colname = colname + {'200':'Col-Name-1'} WHERE
>> userid = 'XYZ' AND pkid = '1000';
>> UPDATE time_series SET colname = colname +
>> {'201':'Col-Name-2'} WHERE userid = 'XYZ' AND pkid = '1001';
>> UPDATE time_series SET colname = colname +
>> {'202':'Col-Name-3'} WHERE userid = 'XYZ' AND pkid = '1000';
>> UPDATE time_series SET colname = colname +
>> {'203':'Col-Name-4'} WHERE userid = 'XYZ' AND pkid = '1000';
>> UPDATE time_series SET colname = colname +
>> {'204':'Col-Name-5'} WHERE userid = 'XYZ' AND pkid = '1002';
>>
>> SELECT * FROM time_series WHERE userid = 'XYZ';
>>
>> -- returns:
>> -- userid | pkid | colname
>>
>> ----------+------+-----------------------------------------------------------------
>> --    XYZ | 1000 | {'200': 'Col-Name-1', '202': 'Col-Name-3', '203':
>> 'Col-Name-4'}
>> --    XYZ | 1001 |                                           {'201':
>> 'Col-Name-2'}
>> --    XYZ | 1002 |                                           {'204':
>> 'Col-Name-5'}
>>
>> -- use an app to pop off the latest key/value from the map for each row,
>> then sort by key desc.
>>
>>
>> On Tue, Sep 10, 2013 at 9:21 AM, Ravikumar Govindarajan <
>> ravikumar.govindarajan@gmail.com> wrote:
>>
>>> I have been faced with a problem of grouping composites on the
>>> second-part.
>>>
>>> Lets say my CF contains this
>>>
>>>
>>> TimeSeriesCF
>>>                        key:                            UserID
>>>                        composite-col-name:    TimeUUID:PKID
>>>
>>> Some sample data
>>>
>>> UserID = XYZ
>>>                                  Time:PKID
>>>                Col-Name1 = 200:1000
>>>                Col-Name2 = 201:1001
>>>                Col-Name3 = 202:1000
>>>                Col-Name4 = 203:1000
>>>                Col-Name5 = 204:1002
>>>
>>> Whenever a time-series query is issued, it should return the following
>>> in time-desc order.
>>>
>>> UserID = XYZ
>>>               Col-Name5 = 204:1002
>>>               Col-Name4 = 203:1000
>>>               Col-Name2 = 201:1001
>>>
>>> Is something like this possible in Cassandra? Is there a different way
>>> to design and achieve the same objective?
>>>
>>> --
>>> Ravi
>>>
>>>
>>
>>
>

Re: Composite Column Grouping

Posted by "Laing, Michael" <mi...@nytimes.com>.
If you have set up the table as described in my previous message, you could
run this python snippet to return the desired result:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
logging.basicConfig()

from operator import itemgetter

import cassandra
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement

cql_cluster = Cluster()
cql_session = cql_cluster.connect()
cql_session.set_keyspace('latest')

select_stmt = "select * from time_series where userid = 'XYZ'"
query = SimpleStatement(select_stmt)
rows = cql_session.execute(query)

results = []
for row in rows:
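    # the newest key in the colname map is the latest timeuuid for this pkid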
    max_time = max(row.colname.keys())
    results.append((row.userid, row.pkid, max_time, row.colname[max_time]))

sorted_results = sorted(results, key=itemgetter(2), reverse=True)
for result in sorted_results: print result

# prints:

# (u'XYZ', u'1002', u'204', u'Col-Name-5')
# (u'XYZ', u'1000', u'203', u'Col-Name-4')
# (u'XYZ', u'1001', u'201', u'Col-Name-2')



On Tue, Sep 10, 2013 at 6:32 PM, Laing, Michael
<mi...@nytimes.com>wrote:

> You could try this. C* doesn't do it all for you, but it will efficiently
> get you the right data.
>
> -ml
>
> -- put this in <file> and run using 'cqlsh -f <file>
>
> DROP KEYSPACE latest;
>
> CREATE KEYSPACE latest WITH replication = {
>     'class': 'SimpleStrategy',
>     'replication_factor' : 1
> };
>
> USE latest;
>
> CREATE TABLE time_series (
>     userid text,
>     pkid text,
>     colname map<text, text>,
>     PRIMARY KEY (userid, pkid)
> );
>
> UPDATE time_series SET colname = colname + {'200':'Col-Name-1'} WHERE
> userid = 'XYZ' AND pkid = '1000';
> UPDATE time_series SET colname = colname +
> {'201':'Col-Name-2'} WHERE userid = 'XYZ' AND pkid = '1001';
> UPDATE time_series SET colname = colname +
> {'202':'Col-Name-3'} WHERE userid = 'XYZ' AND pkid = '1000';
> UPDATE time_series SET colname = colname +
> {'203':'Col-Name-4'} WHERE userid = 'XYZ' AND pkid = '1000';
> UPDATE time_series SET colname = colname +
> {'204':'Col-Name-5'} WHERE userid = 'XYZ' AND pkid = '1002';
>
> SELECT * FROM time_series WHERE userid = 'XYZ';
>
> -- returns:
> -- userid | pkid | colname
>
> ----------+------+-----------------------------------------------------------------
> --    XYZ | 1000 | {'200': 'Col-Name-1', '202': 'Col-Name-3', '203':
> 'Col-Name-4'}
> --    XYZ | 1001 |                                           {'201':
> 'Col-Name-2'}
> --    XYZ | 1002 |                                           {'204':
> 'Col-Name-5'}
>
> -- use an app to pop off the latest key/value from the map for each row,
> then sort by key desc.
>
>
> On Tue, Sep 10, 2013 at 9:21 AM, Ravikumar Govindarajan <
> ravikumar.govindarajan@gmail.com> wrote:
>
>> I have been faced with a problem of grouping composites on the
>> second-part.
>>
>> Lets say my CF contains this
>>
>>
>> TimeSeriesCF
>>                        key:                            UserID
>>                        composite-col-name:    TimeUUID:PKID
>>
>> Some sample data
>>
>> UserID = XYZ
>>                                  Time:PKID
>>                Col-Name1 = 200:1000
>>                Col-Name2 = 201:1001
>>                Col-Name3 = 202:1000
>>                Col-Name4 = 203:1000
>>                Col-Name5 = 204:1002
>>
>> Whenever a time-series query is issued, it should return the following in
>> time-desc order.
>>
>> UserID = XYZ
>>               Col-Name5 = 204:1002
>>               Col-Name4 = 203:1000
>>               Col-Name2 = 201:1001
>>
>> Is something like this possible in Cassandra? Is there a different way to
>> design and achieve the same objective?
>>
>> --
>> Ravi
>>
>>
>
>

Re: Composite Column Grouping

Posted by "Laing, Michael" <mi...@nytimes.com>.
You could try this. C* doesn't do it all for you, but it will efficiently
get you the right data.

-ml

-- put this in <file> and run using 'cqlsh -f <file>'

DROP KEYSPACE latest;

CREATE KEYSPACE latest WITH replication = {
    'class': 'SimpleStrategy',
    'replication_factor' : 1
};

USE latest;

CREATE TABLE time_series (
    userid text,
    pkid text,
    colname map<text, text>,
    PRIMARY KEY (userid, pkid)
);

UPDATE time_series SET colname = colname + {'200':'Col-Name-1'} WHERE
userid = 'XYZ' AND pkid = '1000';
UPDATE time_series SET colname = colname + {'201':'Col-Name-2'} WHERE
userid = 'XYZ' AND pkid = '1001';
UPDATE time_series SET colname = colname + {'202':'Col-Name-3'} WHERE
userid = 'XYZ' AND pkid = '1000';
UPDATE time_series SET colname = colname + {'203':'Col-Name-4'} WHERE
userid = 'XYZ' AND pkid = '1000';
UPDATE time_series SET colname = colname + {'204':'Col-Name-5'} WHERE
userid = 'XYZ' AND pkid = '1002';

SELECT * FROM time_series WHERE userid = 'XYZ';

-- returns:
-- userid | pkid | colname
----------+------+------------------------------------------------------------------
--    XYZ | 1000 | {'200': 'Col-Name-1', '202': 'Col-Name-3', '203': 'Col-Name-4'}
--    XYZ | 1001 | {'201': 'Col-Name-2'}
--    XYZ | 1002 | {'204': 'Col-Name-5'}

-- use an app to pop off the latest key/value from the map for each row,
then sort by key desc.


On Tue, Sep 10, 2013 at 9:21 AM, Ravikumar Govindarajan <
ravikumar.govindarajan@gmail.com> wrote:

> I have been faced with a problem of grouping composites on the second-part.
>
> Lets say my CF contains this
>
>
> TimeSeriesCF
>                        key:                            UserID
>                        composite-col-name:    TimeUUID:PKID
>
> Some sample data
>
> UserID = XYZ
>                                  Time:PKID
>                Col-Name1 = 200:1000
>                Col-Name2 = 201:1001
>                Col-Name3 = 202:1000
>                Col-Name4 = 203:1000
>                Col-Name5 = 204:1002
>
> Whenever a time-series query is issued, it should return the following in
> time-desc order.
>
> UserID = XYZ
>               Col-Name5 = 204:1002
>               Col-Name4 = 203:1000
>               Col-Name2 = 201:1001
>
> Is something like this possible in Cassandra? Is there a different way to
> design and achieve the same objective?
>
> --
> Ravi
>
>