You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@mahout.apache.org by Andrew Butkus <an...@butkus.co.uk> on 2013/12/06 21:33:44 UTC

K-Means clustering explanation

Hi,

I’ve essentially taken the dataset I’ve pre-classified with Naive Bayes; it consists of news article titles.


I would like to cluster these titles into categories such as …

‘murder’
‘sex’
‘fraud’
‘terrorism’


but I am a little confused about the results I have received from my cluster dump (results attached).


I’m seeing a "Top Terms" word list (which I specified to output). The list is somewhat irrelevant, but it has some of the keywords I’m looking to categorise,
and underneath I have a key/value list (which I think is listing vectors / word weights).


How do I turn what I have processed into something useful? For example, how would I input a title and get back a category?

I’ve started to read through the Mahout in Action book, but the edition I have seems to be out of date for Mahout 0.7.

I would also welcome any resources which show how to do what I ask programmatically in Java.


Thanks

Andy


Top Terms:
                man                                     =>0.008000337010687452
                cdata                                   =>0.007997985964165618
                comments                                =>0.007731191320108965
                after                                   =>0.0069037602489741235
                school                                  => 0.00563160856893979
                bournemouth                             =>0.005011825189472135
                police                                  =>0.004733383314492858
                2013                                    =>0.0041557648663178405
                arrested                                =>0.004111181114247428
                home                                    =>0.003938528377541288
                manager                                 =>0.0039023473450242835
                poole                                   =>0.003582306689329533
                day                                     =>0.0034606181558174236
                dorset                                  =>0.003447262937763061
                time                                    =>0.0034374047718378543
                nurse                                   =>0.0033384290600343243
                back                                    =>0.003329673234220446
                videos                                  =>0.003327555941083277
                pictures                                =>0.0032607585027343176
                2                                       =>0.0032555880589821605
                business                                =>0.003155572579020411
                two                                     =>0.003042114366434562
                woman                                   =>0.0029723510044456745
                stats                                   => 0.00292934992316234
                avon                                    =>0.0029157652514312043
                charged                                 =>0.002882685347115821
                guilty                                  => 0.00283827426433915
                car                                     =>0.0028376236536839807
                1                                       =>0.0028258930566665338
                top                                     =>0.002809853497752466
                life                                    =>0.0027992448323098233
                off                                     =>0.0027682009102213863
                jobs                                    =>0.002732444673670308
                court                                   =>0.0027162454947557654
                crash                                   =>0.002699148979402604
                about                                   =>0.0026593731688505325
                death                                   =>0.0025994461863060635
                care                                    =>0.0025684477030354608
                case                                    =>0.0025439029865843036
….

1.0: /negative_article/0031a158fd399273b612cc333ddf27f3 = [detrimental:0.577, eating:0.419, fish:0.401, health:0.302, much:0.360, too:0.331]
        1.0: /negative_article/00399e05458ecbb40264b5625dc87183 = [airport:0.400, arrest:0.376, explosives:0.537, fbi:0.429, man:0.257, texas:0.400]
        1.0: /negative_article/003deb3e66242cd771b2d261cea58c4d = [accused:0.357, death:0.339, pleads:0.370, regina:0.602, toddler:0.508]
        1.0: /negative_article/003e021e55e57ebde5264ea80bcaa387 = [bank:0.291, buena:0.489, gunman:0.382, holds:0.368, hostage:0.420, least:0.381, park:0.271]
        1.0: /negative_article/0043897010b8a001bb54d5b9a00268b5 = [fall:0.351, internet:0.390, parkstone:0.425, people:0.326, scam:0.397, student:0.353, warns:0.395]
        1.0: /negative_article/0044d17673b0ef0f754aabf976b250df = [13:0.404, delhi:0.486, held:0.406, thefts:0.491, vehicle:0.441]
        1.0: /negative_article/004730c9b45401651e3ffce49a2335d1 = [bangor:0.326, bdn:0.329, business:0.250, daily:0.277, maine:0.316, nasty:0.424, sided:0.507, turns:0.332]
        1.0: /negative_article/00481cbb848cc1ff7638c3d905b0e753 = [alabama:0.304, case:0.218, could:0.224, death:0.216, forced:0.304, garrard:0.433, get:0.211, girl:0.251, grandma:0.366, joyce:0.312, penalty:0.2$
        1.0: /negative_article/0049aab643244f3eb5d7be4cf7212a5d = [25:0.369, cartel:0.488, drug:0.330, feared:0.468, gets:0.302, leader:0.343, years:0.298]
        1.0: /negative_article/004a8a65419ed59a3e2bf662b0e72a6f = [17k:0.572, admit:0.505, benefits:0.400, fraud:0.353, women:0.365]
        1.0: /negative_article/004f42f9964e0ae8544b37c1e451675a = [break:0.350, four:0.318, hip:0.390, inmates:0.442, prison:0.317, shower:0.468, through:0.331]
        1.0: /negative_article/0051048557ca1d3ef622e46881e1311d = [after:0.303, charged:0.381, crash:0.387, injured:0.460, man:0.303, officer:0.444, police:0.337]
        1.0: /negative_article/005200b633747a36dff05ff6555b081f = [20:0.419, dealer:0.514, drugs:0.434, gets:0.357, guns:0.495]
        1.0: /negative_article/0056080c48797ee9dd6695a458067b70 = [being:0.241, dreadful:0.378, evens:0.401, fulfilled:0.388, george:0.257, licensed:0.382, newspapers:0.347, prophecy:0.394]
        1.0: /negative_article/0057db01a4242cdeb274ae833c377d5e = [australian:0.472, evacuated:0.538, floods:0.532, hundreds:0.452]
        1.0: /negative_article/005916c054486fedf7727a265eb78f16 = [bananas:0.451, crate:0.478, frog:0.467, jumps:0.376, supermarket:0.339, tree:0.305]
        1.0: /negative_article/005c485c71df1b163c430cfbbf85deb0 = [arrested:0.276, bust:0.419, child:0.327, porn:0.386, ring:0.393, victorians:0.581]
        1.0: /negative_article/005ddcb0fb22f5dc21575bc3ade944b2 = [airport:0.455, axe:0.549, bournemouth:0.316, face:0.404, seven:0.478]
        1.0: /negative_article/005f305f1619838261497f731f985187 = [eastern:0.482, gets:0.332, kentucky:0.486, tsu:0.650]
        1.0: /negative_article/006197c31d29c4c5009a30f429f31ffd = [deported:0.418, mistakenly:0.486, mom:0.349, reunites:0.523, teen:0.300, texas:0.321]
        1.0: /negative_article/00630196962c6d8f5a475a6dfd8f0fe8 = [beat:0.299, dallas:0.346, disabled:0.331, men:0.275, police:0.218, rifle:0.431, searching:0.386, three:0.265, vet:0.391]
        1.0: /negative_article/0065526ca0b929aeaa6f9bd5e7563ffd = [addington:0.635, stabbing:0.429, stable:0.512, victim:0.389]
        1.0: /negative_article/0065a4b222bbc5aeed47b588aa8918dc = [home:0.179, kpax:0.385, missoula:0.578, montana:0.553, sports:0.292, weather:0.308]
        1.0: /negative_article/006b81cdc2684fc6bea4f6241e021669 = [90:0.539, arrested:0.341, montreal:0.613, protest:0.467]