You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by mh...@apache.org on 2014/05/20 17:13:26 UTC

[1/2] git commit: DATAFU-49 Examples work again

Repository: incubator-datafu
Updated Branches:
  refs/heads/master 3cfcea78d -> 6d3acbb36


DATAFU-49 Examples work again

* Removed the dependencies on Guava (which the examples didn't actually need) and on Piggybank (date functions are now first-class in Pig)
* Path to datafu jar correct for current repo layout
* I made the quantile examples demonstrate a comparison of the exact vs approx algorithms
* Added the script to generate data for quantile.
* Quantile examples demonstrate both ways of constructing a Quantile UDF (number of partitions vs list of breakpoints)

https://issues.apache.org/jira/browse/DATAFU-49

Signed-off-by: Matthew Hayes <ma...@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/f75f9c4f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/f75f9c4f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/f75f9c4f

Branch: refs/heads/master
Commit: f75f9c4fdf9b94944b3d51cf6dab256e077899e7
Parents: 3cfcea7
Author: Philip (flip) Kromer <fl...@infochimps.org>
Authored: Mon May 19 09:12:34 2014 -0500
Committer: Matthew Hayes <ma...@gmail.com>
Committed: Tue May 20 07:27:18 2014 -0700

----------------------------------------------------------------------
 examples/quantile/generate_temperature_data.rb | 18 ++++++
 examples/quantile/quantile.pig                 | 20 ++++---
 examples/quantile/quartiles-approx.tsv         |  3 +
 examples/quantile/quartiles-diff.tsv           |  3 +
 examples/quantile/quartiles-exact.tsv          |  3 +
 examples/quantile/streaming_quantile.pig       | 61 +++++++++++++++++----
 examples/sessionize/sessionize.pig             | 11 ++--
 7 files changed, 91 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/f75f9c4f/examples/quantile/generate_temperature_data.rb
----------------------------------------------------------------------
diff --git a/examples/quantile/generate_temperature_data.rb b/examples/quantile/generate_temperature_data.rb
new file mode 100644
index 0000000..e52295f
--- /dev/null
+++ b/examples/quantile/generate_temperature_data.rb
@@ -0,0 +1,18 @@
+require 'rubystats'
+
+# Generates 10,000 measurements for each of three imaginary temperature sensors.
+
+sensors = []
+sensors << {:id => 1, :mean => 60.0, :stdev => 5.0}
+sensors << {:id => 2, :mean => 50.0, :stdev => 10.0}
+sensors << {:id => 3, :mean => 40.0, :stdev => 3.0}
+
+File.open('temperature.txt','w') do |file|
+  sensors.each do |sensor|
+    id = sensor[:id]
+    dist = Rubystats::NormalDistribution.new(sensor[:mean],sensor[:stdev])
+    dist.rng(10000).each do |value|
+      file.write "#{id}\t#{value}\n"
+    end
+  end
+end
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/f75f9c4f/examples/quantile/quantile.pig
----------------------------------------------------------------------
diff --git a/examples/quantile/quantile.pig b/examples/quantile/quantile.pig
index e940665..039a65c 100644
--- a/examples/quantile/quantile.pig
+++ b/examples/quantile/quantile.pig
@@ -1,14 +1,16 @@
-REGISTER datafu-0.0.6.jar;
+REGISTER '../../datafu-pig/build/libs/datafu-pig-1.2.1.jar';
 
-define Quartile datafu.pig.stats.Quantile('0.0','0.25','0.5','0.75','1.0');
- 
-temperature = LOAD 'temperature.txt' AS (id:chararray, temp:double);
+define ExactQuartile datafu.pig.stats.Quantile('0.0','0.25','0.5','0.75','1.0');
  
+temperature = LOAD 'temperature.tsv' AS (id:chararray, temp:double);
 temperature = GROUP temperature BY id;
  
-temperature_quartiles = FOREACH temperature {
-  sorted = ORDER temperature by temp; -- must be sorted
-  GENERATE group as id, Quartile(sorted.temp) as quartiles;
-}
+quartiles_slow = FOREACH temperature {
+  -- sort is necessary, because exact
+  sorted = ORDER temperature by temp;
+  GENERATE group as id, COUNT_STAR(temperature) AS n_recs, ExactQuartile(sorted.temp) AS qvals:tuple(q0,q1,q2,q3,q4);
+};
+DESCRIBE quartiles_slow;
  
-DUMP temperature_quartiles
\ No newline at end of file
+rmf                        /tmp/quartiles-exact;
+STORE quartiles_slow INTO '/tmp/quartiles-exact';

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/f75f9c4f/examples/quantile/quartiles-approx.tsv
----------------------------------------------------------------------
diff --git a/examples/quantile/quartiles-approx.tsv b/examples/quantile/quartiles-approx.tsv
new file mode 100644
index 0000000..bd74bf4
--- /dev/null
+++ b/examples/quantile/quartiles-approx.tsv
@@ -0,0 +1,3 @@
+1	10001	(42.00608329852237,56.31536123458097,59.74079297806002,63.0500178316878,77.80994964056148)
+2	10001	(11.75817508617046,42.57950711182105,49.606042890169284,56.19442582359321,87.291045156487)
+3	10001	(27.469079687477848,37.834527048220664,39.793411881569575,41.8308582867632,52.378946172742346)

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/f75f9c4f/examples/quantile/quartiles-diff.tsv
----------------------------------------------------------------------
diff --git a/examples/quantile/quartiles-diff.tsv b/examples/quantile/quartiles-diff.tsv
new file mode 100644
index 0000000..9130c69
--- /dev/null
+++ b/examples/quantile/quartiles-diff.tsv
@@ -0,0 +1,3 @@
+1	10001	(42.00608329852237,56.64770607307308,59.97095600661579,63.34401776031881,77.80994964056148)	(0.0,-0.0058668719623598126,-0.00383790827897393,-0.004641321138539824,0.0)
+2	10001	(11.75817508617046,43.262881134998416,50.06265923637557,56.78979251305065,87.291045156487)	(0.0,-0.015795850975457455,-0.009120896755610437,-0.010483691929683025,0.0)
+3	10001	(27.469079687477848,37.9945324789972,40.04256529584773,42.097939274344284,52.378946172742346)	(0.0,-0.004211275158208217,-0.006222214097356818,-0.00634427699276598,0.0)

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/f75f9c4f/examples/quantile/quartiles-exact.tsv
----------------------------------------------------------------------
diff --git a/examples/quantile/quartiles-exact.tsv b/examples/quantile/quartiles-exact.tsv
new file mode 100644
index 0000000..61e3156
--- /dev/null
+++ b/examples/quantile/quartiles-exact.tsv
@@ -0,0 +1,3 @@
+1	10001	(42.00608329852237,56.64770607307308,59.97095600661579,63.34401776031881,77.80994964056148)
+2	10001	(11.75817508617046,43.262881134998416,50.06265923637557,56.78979251305065,87.291045156487)
+3	10001	(27.469079687477848,37.9945324789972,40.04256529584773,42.097939274344284,52.378946172742346)

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/f75f9c4f/examples/quantile/streaming_quantile.pig
----------------------------------------------------------------------
diff --git a/examples/quantile/streaming_quantile.pig b/examples/quantile/streaming_quantile.pig
index a5718fc..b1f0750 100644
--- a/examples/quantile/streaming_quantile.pig
+++ b/examples/quantile/streaming_quantile.pig
@@ -1,14 +1,51 @@
-REGISTER datafu-0.0.6.jar;
+REGISTER '../../datafu-pig/build/libs/datafu-pig-1.2.1.jar';
 
-define Quartile datafu.pig.stats.StreamingQuantile('0.0','0.25','0.5','0.75','1.0');
- 
-temperature = LOAD 'temperature.txt' AS (id:chararray, temp:double);
- 
+DEFINE ExactQuartile     datafu.pig.stats.Quantile(         '5');
+DEFINE ApproxQuartile    datafu.pig.stats.StreamingQuantile('5');
+-- Similar to (with different field names):
+-- DEFINE ExactQuartile  datafu.pig.stats.Quantile(          '0.0', '0.25', '0.50', '0.75', '1.0' );
+-- DEFINE ApproxQuartile datafu.pig.stats.StreamingQuantile( '0.0', '0.25', '0.50', '0.75', '1.0' );
+
+DEFINE FirstTupleFromBag datafu.pig.bags.FirstTupleFromBag;
+
+temperature = LOAD 'temperature.tsv' AS (id:chararray, temp:double);
 temperature = GROUP temperature BY id;
- 
-temperature_quartiles = FOREACH temperature {
-  -- sort not necessary
-  GENERATE group as id, Quartile(temperature.temp) as quartiles;
-}
- 
-DUMP temperature_quartiles
\ No newline at end of file
+
+quartiles_fast = FOREACH temperature {
+  -- sort not necessary, because streaming
+  GENERATE group as id, COUNT_STAR(temperature) AS n_recs, ApproxQuartile(temperature.temp) AS qvals:tuple(q0,q1,q2,q3,q4);
+};
+DESCRIBE quartiles_fast;
+
+quartiles_slow = FOREACH temperature {
+  -- sort is necessary, because exact
+  sorted = ORDER temperature by temp;
+  GENERATE group as id, COUNT_STAR(temperature) AS n_recs, ExactQuartile(sorted.temp) AS qvals:tuple(q0,q1,q2,q3,q4);
+};
+DESCRIBE quartiles_slow;
+
+--
+-- Group the two results together and compare.
+-- The differences are in the range of 0.5% to 1.5%
+--
+quartiles_diff = FOREACH (COGROUP quartiles_slow BY id, quartiles_fast BY id) {
+  count_fast = FirstTupleFromBag(quartiles_fast.n_recs, null);
+  count_slow = FirstTupleFromBag(quartiles_fast.n_recs, null);
+  qvals_fast = FirstTupleFromBag(quartiles_fast.qvals,  null).qvals;
+  qvals_slow = FirstTupleFromBag(quartiles_slow.qvals,  null).qvals;
+
+  GENERATE group AS id, count_fast.n_recs, qvals_slow AS qvals,
+    ( (qvals_fast.q0 - qvals_slow.q0) / qvals_slow.q0,
+      (qvals_fast.q1 - qvals_slow.q1) / qvals_slow.q1,
+      (qvals_fast.q2 - qvals_slow.q2) / qvals_slow.q2,
+      (qvals_fast.q3 - qvals_slow.q3) / qvals_slow.q3,
+      (qvals_fast.q4 - qvals_slow.q4) / qvals_slow.q4 ) AS diffs:tuple(dq0,dq1,dq2,dq3,dq4)
+    ;
+};
+DESCRIBE quartiles_diff;
+
+rmf                        /tmp/quartiles-approx;
+STORE quartiles_fast INTO '/tmp/quartiles-approx';
+
+rmf                        /tmp/quartiles-diff;
+STORE quartiles_diff INTO '/tmp/quartiles-diff';

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/f75f9c4f/examples/sessionize/sessionize.pig
----------------------------------------------------------------------
diff --git a/examples/sessionize/sessionize.pig b/examples/sessionize/sessionize.pig
index 08a0699..6d03d9b 100644
--- a/examples/sessionize/sessionize.pig
+++ b/examples/sessionize/sessionize.pig
@@ -1,8 +1,5 @@
-REGISTER piggybank.jar;
-REGISTER datafu-0.0.6.jar;
-REGISTER guava-13.0.1.jar; -- needed by StreamingQuantile
+REGISTER           '../../datafu-pig/build/libs/datafu-pig-1.2.1.jar';
  
-DEFINE UnixToISO   org.apache.pig.piggybank.evaluation.datetime.convert.UnixToISO();
 DEFINE Sessionize  datafu.pig.sessions.Sessionize('10m');
 DEFINE Median      datafu.pig.stats.Median();
 DEFINE Quantile    datafu.pig.stats.StreamingQuantile('0.75','0.90','0.95');
@@ -12,10 +9,10 @@ pv = LOAD 'clicks.csv' USING PigStorage(',') AS (memberId:int, time:long, url:ch
  
 pv = FOREACH pv
      -- Sessionize expects an ISO string
-     GENERATE UnixToISO(time) as isoTime,
+     GENERATE ToString(ToDate(time)) as isoTime,
               time,
               memberId;
- 
+
 pv_sessionized = FOREACH (GROUP pv BY memberId) {
   ordered = ORDER pv BY isoTime;
   GENERATE FLATTEN(Sessionize(ordered)) AS (isoTime, time, memberId, sessionId);
@@ -40,4 +37,4 @@ session_stats = FOREACH (GROUP session_times ALL) {
 };
  
 DUMP session_stats
---(15.737532575757575,31.29552045993877,(2.848041666666667),(14.648516666666666,31.88788333333333,86.69525))
\ No newline at end of file
+--(15.737532575757575,31.29552045993877,(2.848041666666667),(14.648516666666666,31.88788333333333,86.69525))


[2/2] git commit: Update README files for examples

Posted by mh...@apache.org.
Update README files for examples

I wasn't able to run the sessionize script against Pig 0.12.1.  It appears there is a bug in ToString that isn't fixed until 0.13.0.  However, Sessionize accepts millisecond (ms) time as input in addition to ISO time strings, so I removed the code that converted the time to an ISO string.

I also updated the quantile example's data generation script to output a file with a .tsv extension.


Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/6d3acbb3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/6d3acbb3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/6d3acbb3

Branch: refs/heads/master
Commit: 6d3acbb36cd8ad650dd3cb402da5efa71d3c863a
Parents: f75f9c4
Author: Matthew Hayes <ma...@gmail.com>
Authored: Tue May 20 07:21:12 2014 -0700
Committer: Matthew Hayes <ma...@gmail.com>
Committed: Tue May 20 08:04:22 2014 -0700

----------------------------------------------------------------------
 examples/quantile/README.md                    | 9 ++++++++-
 examples/quantile/generate_temperature_data.rb | 4 ++--
 examples/sessionize/README.md                  | 6 +++++-
 examples/sessionize/sessionize.pig             | 8 +++-----
 4 files changed, 18 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/6d3acbb3/examples/quantile/README.md
----------------------------------------------------------------------
diff --git a/examples/quantile/README.md b/examples/quantile/README.md
index be93595..75f01fa 100644
--- a/examples/quantile/README.md
+++ b/examples/quantile/README.md
@@ -1,3 +1,10 @@
 # Quantile Example
 
-This is an example of using the *Quantile* and *StreamingQuantile* UDFs to to compute quantiles for some sample temperature data.  See our [blog post](http://engineering.linkedin.com/open-source/introducing-datafu-open-source-collection-useful-apache-pig-udfs) for more details.
\ No newline at end of file
+This is an example of using the *Quantile* and *StreamingQuantile* UDFs to compute quantiles for some sample temperature data.  It is based on our [blog post](http://datafu.incubator.apache.org/blog/2012/01/10/introducing-datafu.html), which can be read for more details.
+
+Assuming pig 0.12.1 has been downloaded to ~/pig-0.12.1 then the following commands can be used to execute the scripts:
+
+    ruby generate_temperature_data.rb
+
+    ~/pig-0.12.1/bin/pig -x local -f quantile.pig
+    ~/pig-0.12.1/bin/pig -x local -f streaming_quantile.pig

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/6d3acbb3/examples/quantile/generate_temperature_data.rb
----------------------------------------------------------------------
diff --git a/examples/quantile/generate_temperature_data.rb b/examples/quantile/generate_temperature_data.rb
index e52295f..f9ebc11 100644
--- a/examples/quantile/generate_temperature_data.rb
+++ b/examples/quantile/generate_temperature_data.rb
@@ -7,7 +7,7 @@ sensors << {:id => 1, :mean => 60.0, :stdev => 5.0}
 sensors << {:id => 2, :mean => 50.0, :stdev => 10.0}
 sensors << {:id => 3, :mean => 40.0, :stdev => 3.0}
 
-File.open('temperature.txt','w') do |file|
+File.open('temperature.tsv','w') do |file|
   sensors.each do |sensor|
     id = sensor[:id]
     dist = Rubystats::NormalDistribution.new(sensor[:mean],sensor[:stdev])
@@ -15,4 +15,4 @@ File.open('temperature.txt','w') do |file|
       file.write "#{id}\t#{value}\n"
     end
   end
-end
\ No newline at end of file
+end

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/6d3acbb3/examples/sessionize/README.md
----------------------------------------------------------------------
diff --git a/examples/sessionize/README.md b/examples/sessionize/README.md
index f10e123..05346aa 100644
--- a/examples/sessionize/README.md
+++ b/examples/sessionize/README.md
@@ -1,3 +1,7 @@
 # Sessionize Example
 
-This is an example of sessionizing a clickstream.  See our [blog post](http://data.linkedin.com/blog/2013/01/datafu-the-wd-40-of-big-data) for more details.
\ No newline at end of file
+This is an example of sessionizing a clickstream.  See our [blog post](http://datafu.incubator.apache.org/blog/2013/01/24/datafu-the-wd-40-of-big-data.html) for more details.
+
+Assuming pig 0.12.1 has been downloaded to ~/pig-0.12.1 then the following commands can be used to execute the scripts:
+
+    ~/pig-0.12.1/bin/pig -x local -f sessionize.pig

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/6d3acbb3/examples/sessionize/sessionize.pig
----------------------------------------------------------------------
diff --git a/examples/sessionize/sessionize.pig b/examples/sessionize/sessionize.pig
index 6d03d9b..2de03cf 100644
--- a/examples/sessionize/sessionize.pig
+++ b/examples/sessionize/sessionize.pig
@@ -8,14 +8,12 @@ DEFINE VAR         datafu.pig.stats.VAR();
 pv = LOAD 'clicks.csv' USING PigStorage(',') AS (memberId:int, time:long, url:chararray);
  
 pv = FOREACH pv
-     -- Sessionize expects an ISO string
-     GENERATE ToString(ToDate(time)) as isoTime,
-              time,
+     GENERATE time,
               memberId;
 
 pv_sessionized = FOREACH (GROUP pv BY memberId) {
-  ordered = ORDER pv BY isoTime;
-  GENERATE FLATTEN(Sessionize(ordered)) AS (isoTime, time, memberId, sessionId);
+  ordered = ORDER pv BY time;
+  GENERATE FLATTEN(Sessionize(ordered)) AS (time, memberId, sessionId);
 };
  
 pv_sessionized = FOREACH pv_sessionized GENERATE sessionId, time;