You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by mh...@apache.org on 2014/05/20 17:13:26 UTC
[1/2] git commit: DATAFU-49 Examples work again
Repository: incubator-datafu
Updated Branches:
refs/heads/master 3cfcea78d -> 6d3acbb36
DATAFU-49 Examples work again
* Removed dependency on Guava (which it didn't actually depend on) and on Piggybank (date functions are now first class)
* Path to datafu jar correct for current repo layout
* I made the quantile examples demonstrate a comparison of the exact vs approx algorithms
* Added the script to generate data for quantile.
* Quantile examples demonstrate both ways of constructing a Quantile UDF (number of partitions vs list of breakpoints)
https://issues.apache.org/jira/browse/DATAFU-49
Signed-off-by: Matthew Hayes <ma...@gmail.com>
Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/f75f9c4f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/f75f9c4f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/f75f9c4f
Branch: refs/heads/master
Commit: f75f9c4fdf9b94944b3d51cf6dab256e077899e7
Parents: 3cfcea7
Author: Philip (flip) Kromer <fl...@infochimps.org>
Authored: Mon May 19 09:12:34 2014 -0500
Committer: Matthew Hayes <ma...@gmail.com>
Committed: Tue May 20 07:27:18 2014 -0700
----------------------------------------------------------------------
examples/quantile/generate_temperature_data.rb | 18 ++++++
examples/quantile/quantile.pig | 20 ++++---
examples/quantile/quartiles-approx.tsv | 3 +
examples/quantile/quartiles-diff.tsv | 3 +
examples/quantile/quartiles-exact.tsv | 3 +
examples/quantile/streaming_quantile.pig | 61 +++++++++++++++++----
examples/sessionize/sessionize.pig | 11 ++--
7 files changed, 91 insertions(+), 28 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/f75f9c4f/examples/quantile/generate_temperature_data.rb
----------------------------------------------------------------------
diff --git a/examples/quantile/generate_temperature_data.rb b/examples/quantile/generate_temperature_data.rb
new file mode 100644
index 0000000..e52295f
--- /dev/null
+++ b/examples/quantile/generate_temperature_data.rb
@@ -0,0 +1,18 @@
+require 'rubystats'
+
+# Generates normally distributed measurements (rng(10000) per sensor) for three imaginary temperature sensors.
+
+sensors = []
+sensors << {:id => 1, :mean => 60.0, :stdev => 5.0}
+sensors << {:id => 2, :mean => 50.0, :stdev => 10.0}
+sensors << {:id => 3, :mean => 40.0, :stdev => 3.0}
+
+File.open('temperature.txt','w') do |file|
+ sensors.each do |sensor|
+ id = sensor[:id]
+ dist = Rubystats::NormalDistribution.new(sensor[:mean],sensor[:stdev])
+ dist.rng(10000).each do |value|
+ file.write "#{id}\t#{value}\n"
+ end
+ end
+end
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/f75f9c4f/examples/quantile/quantile.pig
----------------------------------------------------------------------
diff --git a/examples/quantile/quantile.pig b/examples/quantile/quantile.pig
index e940665..039a65c 100644
--- a/examples/quantile/quantile.pig
+++ b/examples/quantile/quantile.pig
@@ -1,14 +1,16 @@
-REGISTER datafu-0.0.6.jar;
+REGISTER '../../datafu-pig/build/libs/datafu-pig-1.2.1.jar';
-define Quartile datafu.pig.stats.Quantile('0.0','0.25','0.5','0.75','1.0');
-
-temperature = LOAD 'temperature.txt' AS (id:chararray, temp:double);
+define ExactQuartile datafu.pig.stats.Quantile('0.0','0.25','0.5','0.75','1.0');
+temperature = LOAD 'temperature.tsv' AS (id:chararray, temp:double);
temperature = GROUP temperature BY id;
-temperature_quartiles = FOREACH temperature {
- sorted = ORDER temperature by temp; -- must be sorted
- GENERATE group as id, Quartile(sorted.temp) as quartiles;
-}
+quartiles_slow = FOREACH temperature {
+ -- the exact Quantile UDF needs its input sorted, so order by temp first
+ sorted = ORDER temperature by temp;
+ GENERATE group as id, COUNT_STAR(temperature) AS n_recs, ExactQuartile(sorted.temp) AS qvals:tuple(q0,q1,q2,q3,q4);
+};
+DESCRIBE quartiles_slow;
-DUMP temperature_quartiles
\ No newline at end of file
+rmf /tmp/quartiles-exact;
+STORE quartiles_slow INTO '/tmp/quartiles-exact';
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/f75f9c4f/examples/quantile/quartiles-approx.tsv
----------------------------------------------------------------------
diff --git a/examples/quantile/quartiles-approx.tsv b/examples/quantile/quartiles-approx.tsv
new file mode 100644
index 0000000..bd74bf4
--- /dev/null
+++ b/examples/quantile/quartiles-approx.tsv
@@ -0,0 +1,3 @@
+1 10001 (42.00608329852237,56.31536123458097,59.74079297806002,63.0500178316878,77.80994964056148)
+2 10001 (11.75817508617046,42.57950711182105,49.606042890169284,56.19442582359321,87.291045156487)
+3 10001 (27.469079687477848,37.834527048220664,39.793411881569575,41.8308582867632,52.378946172742346)
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/f75f9c4f/examples/quantile/quartiles-diff.tsv
----------------------------------------------------------------------
diff --git a/examples/quantile/quartiles-diff.tsv b/examples/quantile/quartiles-diff.tsv
new file mode 100644
index 0000000..9130c69
--- /dev/null
+++ b/examples/quantile/quartiles-diff.tsv
@@ -0,0 +1,3 @@
+1 10001 (42.00608329852237,56.64770607307308,59.97095600661579,63.34401776031881,77.80994964056148) (0.0,-0.0058668719623598126,-0.00383790827897393,-0.004641321138539824,0.0)
+2 10001 (11.75817508617046,43.262881134998416,50.06265923637557,56.78979251305065,87.291045156487) (0.0,-0.015795850975457455,-0.009120896755610437,-0.010483691929683025,0.0)
+3 10001 (27.469079687477848,37.9945324789972,40.04256529584773,42.097939274344284,52.378946172742346) (0.0,-0.004211275158208217,-0.006222214097356818,-0.00634427699276598,0.0)
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/f75f9c4f/examples/quantile/quartiles-exact.tsv
----------------------------------------------------------------------
diff --git a/examples/quantile/quartiles-exact.tsv b/examples/quantile/quartiles-exact.tsv
new file mode 100644
index 0000000..61e3156
--- /dev/null
+++ b/examples/quantile/quartiles-exact.tsv
@@ -0,0 +1,3 @@
+1 10001 (42.00608329852237,56.64770607307308,59.97095600661579,63.34401776031881,77.80994964056148)
+2 10001 (11.75817508617046,43.262881134998416,50.06265923637557,56.78979251305065,87.291045156487)
+3 10001 (27.469079687477848,37.9945324789972,40.04256529584773,42.097939274344284,52.378946172742346)
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/f75f9c4f/examples/quantile/streaming_quantile.pig
----------------------------------------------------------------------
diff --git a/examples/quantile/streaming_quantile.pig b/examples/quantile/streaming_quantile.pig
index a5718fc..b1f0750 100644
--- a/examples/quantile/streaming_quantile.pig
+++ b/examples/quantile/streaming_quantile.pig
@@ -1,14 +1,51 @@
-REGISTER datafu-0.0.6.jar;
+REGISTER '../../datafu-pig/build/libs/datafu-pig-1.2.1.jar';
-define Quartile datafu.pig.stats.StreamingQuantile('0.0','0.25','0.5','0.75','1.0');
-
-temperature = LOAD 'temperature.txt' AS (id:chararray, temp:double);
-
+DEFINE ExactQuartile datafu.pig.stats.Quantile( '5');
+DEFINE ApproxQuartile datafu.pig.stats.StreamingQuantile('5');
+-- Similar to (with different field names):
+-- DEFINE ExactQuartile datafu.pig.stats.Quantile( '0.0', '0.25', '0.50', '0.75', '1.0' );
+-- DEFINE ApproxQuartile datafu.pig.stats.StreamingQuantile( '0.0', '0.25', '0.50', '0.75', '1.0' );
+
+DEFINE FirstTupleFromBag datafu.pig.bags.FirstTupleFromBag;
+
+temperature = LOAD 'temperature.tsv' AS (id:chararray, temp:double);
temperature = GROUP temperature BY id;
-
-temperature_quartiles = FOREACH temperature {
- -- sort not necessary
- GENERATE group as id, Quartile(temperature.temp) as quartiles;
-}
-
-DUMP temperature_quartiles
\ No newline at end of file
+
+quartiles_fast = FOREACH temperature {
+ -- sort not necessary, because streaming
+ GENERATE group as id, COUNT_STAR(temperature) AS n_recs, ApproxQuartile(temperature.temp) AS qvals:tuple(q0,q1,q2,q3,q4);
+};
+DESCRIBE quartiles_fast;
+
+quartiles_slow = FOREACH temperature {
+ -- the exact Quantile UDF needs its input sorted, so order by temp first
+ sorted = ORDER temperature by temp;
+ GENERATE group as id, COUNT_STAR(temperature) AS n_recs, ExactQuartile(sorted.temp) AS qvals:tuple(q0,q1,q2,q3,q4);
+};
+DESCRIBE quartiles_slow;
+
+--
+-- Group the two results together and compare.
+-- The differences are in the range of 0.5% to 1.5%
+--
+quartiles_diff = FOREACH (COGROUP quartiles_slow BY id, quartiles_fast BY id) {
+ count_fast = FirstTupleFromBag(quartiles_fast.n_recs, null); -- record count from the streaming result
+ count_slow = FirstTupleFromBag(quartiles_slow.n_recs, null); -- record count from the exact result
+ qvals_fast = FirstTupleFromBag(quartiles_fast.qvals, null).qvals;
+ qvals_slow = FirstTupleFromBag(quartiles_slow.qvals, null).qvals;
+ -- Each diff below is (approx - exact) / exact: the relative error of the streaming estimate.
+ GENERATE group AS id, count_fast.n_recs, qvals_slow AS qvals,
+ ( (qvals_fast.q0 - qvals_slow.q0) / qvals_slow.q0,
+ (qvals_fast.q1 - qvals_slow.q1) / qvals_slow.q1,
+ (qvals_fast.q2 - qvals_slow.q2) / qvals_slow.q2,
+ (qvals_fast.q3 - qvals_slow.q3) / qvals_slow.q3,
+ (qvals_fast.q4 - qvals_slow.q4) / qvals_slow.q4 ) AS diffs:tuple(dq0,dq1,dq2,dq3,dq4)
+ ;
+};
+DESCRIBE quartiles_diff;
+
+rmf /tmp/quartiles-approx;
+STORE quartiles_fast INTO '/tmp/quartiles-approx';
+
+rmf /tmp/quartiles-diff;
+STORE quartiles_diff INTO '/tmp/quartiles-diff';
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/f75f9c4f/examples/sessionize/sessionize.pig
----------------------------------------------------------------------
diff --git a/examples/sessionize/sessionize.pig b/examples/sessionize/sessionize.pig
index 08a0699..6d03d9b 100644
--- a/examples/sessionize/sessionize.pig
+++ b/examples/sessionize/sessionize.pig
@@ -1,8 +1,5 @@
-REGISTER piggybank.jar;
-REGISTER datafu-0.0.6.jar;
-REGISTER guava-13.0.1.jar; -- needed by StreamingQuantile
+REGISTER '../../datafu-pig/build/libs/datafu-pig-1.2.1.jar';
-DEFINE UnixToISO org.apache.pig.piggybank.evaluation.datetime.convert.UnixToISO();
DEFINE Sessionize datafu.pig.sessions.Sessionize('10m');
DEFINE Median datafu.pig.stats.Median();
DEFINE Quantile datafu.pig.stats.StreamingQuantile('0.75','0.90','0.95');
@@ -12,10 +9,10 @@ pv = LOAD 'clicks.csv' USING PigStorage(',') AS (memberId:int, time:long, url:ch
pv = FOREACH pv
-- Sessionize expects an ISO string
- GENERATE UnixToISO(time) as isoTime,
+ GENERATE ToString(ToDate(time)) as isoTime,
time,
memberId;
-
+
pv_sessionized = FOREACH (GROUP pv BY memberId) {
ordered = ORDER pv BY isoTime;
GENERATE FLATTEN(Sessionize(ordered)) AS (isoTime, time, memberId, sessionId);
@@ -40,4 +37,4 @@ session_stats = FOREACH (GROUP session_times ALL) {
};
DUMP session_stats
---(15.737532575757575,31.29552045993877,(2.848041666666667),(14.648516666666666,31.88788333333333,86.69525))
\ No newline at end of file
+--(15.737532575757575,31.29552045993877,(2.848041666666667),(14.648516666666666,31.88788333333333,86.69525))
[2/2] git commit: Update README files for examples
Posted by mh...@apache.org.
Update README files for examples
I wasn't able to run the sessionize script against pig 0.12.1. It appears there is a bug in ToString that isn't fixed until 0.13.0. Sessionize accepts ms time as the input in addition to iso time anyways though, so I removed this code that converted to an iso string.
I also updated the quantile's data generation script to output a file with a tsv extension.
Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/6d3acbb3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/6d3acbb3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/6d3acbb3
Branch: refs/heads/master
Commit: 6d3acbb36cd8ad650dd3cb402da5efa71d3c863a
Parents: f75f9c4
Author: Matthew Hayes <ma...@gmail.com>
Authored: Tue May 20 07:21:12 2014 -0700
Committer: Matthew Hayes <ma...@gmail.com>
Committed: Tue May 20 08:04:22 2014 -0700
----------------------------------------------------------------------
examples/quantile/README.md | 9 ++++++++-
examples/quantile/generate_temperature_data.rb | 4 ++--
examples/sessionize/README.md | 6 +++++-
examples/sessionize/sessionize.pig | 8 +++-----
4 files changed, 18 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/6d3acbb3/examples/quantile/README.md
----------------------------------------------------------------------
diff --git a/examples/quantile/README.md b/examples/quantile/README.md
index be93595..75f01fa 100644
--- a/examples/quantile/README.md
+++ b/examples/quantile/README.md
@@ -1,3 +1,10 @@
# Quantile Example
-This is an example of using the *Quantile* and *StreamingQuantile* UDFs to to compute quantiles for some sample temperature data. See our [blog post](http://engineering.linkedin.com/open-source/introducing-datafu-open-source-collection-useful-apache-pig-udfs) for more details.
\ No newline at end of file
+This is an example of using the *Quantile* and *StreamingQuantile* UDFs to compute quantiles for some sample temperature data. It is based on our [blog post](http://datafu.incubator.apache.org/blog/2012/01/10/introducing-datafu.html), which can be read for more details.
+
+Assuming pig 0.12.1 has been downloaded to ~/pig-0.12.1 then the following commands can be used to execute the scripts:
+
+ ruby generate_temperature_data.rb
+
+ ~/pig-0.12.1/bin/pig -x local -f quantile.pig
+ ~/pig-0.12.1/bin/pig -x local -f streaming_quantile.pig
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/6d3acbb3/examples/quantile/generate_temperature_data.rb
----------------------------------------------------------------------
diff --git a/examples/quantile/generate_temperature_data.rb b/examples/quantile/generate_temperature_data.rb
index e52295f..f9ebc11 100644
--- a/examples/quantile/generate_temperature_data.rb
+++ b/examples/quantile/generate_temperature_data.rb
@@ -7,7 +7,7 @@ sensors << {:id => 1, :mean => 60.0, :stdev => 5.0}
sensors << {:id => 2, :mean => 50.0, :stdev => 10.0}
sensors << {:id => 3, :mean => 40.0, :stdev => 3.0}
-File.open('temperature.txt','w') do |file|
+File.open('temperature.tsv','w') do |file|
sensors.each do |sensor|
id = sensor[:id]
dist = Rubystats::NormalDistribution.new(sensor[:mean],sensor[:stdev])
@@ -15,4 +15,4 @@ File.open('temperature.txt','w') do |file|
file.write "#{id}\t#{value}\n"
end
end
-end
\ No newline at end of file
+end
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/6d3acbb3/examples/sessionize/README.md
----------------------------------------------------------------------
diff --git a/examples/sessionize/README.md b/examples/sessionize/README.md
index f10e123..05346aa 100644
--- a/examples/sessionize/README.md
+++ b/examples/sessionize/README.md
@@ -1,3 +1,7 @@
# Sessionize Example
-This is an example of sessionizing a clickstream. See our [blog post](http://data.linkedin.com/blog/2013/01/datafu-the-wd-40-of-big-data) for more details.
\ No newline at end of file
+This is an example of sessionizing a clickstream. See our [blog post](http://datafu.incubator.apache.org/blog/2013/01/24/datafu-the-wd-40-of-big-data.html) for more details.
+
+Assuming pig 0.12.1 has been downloaded to ~/pig-0.12.1 then the following commands can be used to execute the scripts:
+
+ ~/pig-0.12.1/bin/pig -x local -f sessionize.pig
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/6d3acbb3/examples/sessionize/sessionize.pig
----------------------------------------------------------------------
diff --git a/examples/sessionize/sessionize.pig b/examples/sessionize/sessionize.pig
index 6d03d9b..2de03cf 100644
--- a/examples/sessionize/sessionize.pig
+++ b/examples/sessionize/sessionize.pig
@@ -8,14 +8,12 @@ DEFINE VAR datafu.pig.stats.VAR();
pv = LOAD 'clicks.csv' USING PigStorage(',') AS (memberId:int, time:long, url:chararray);
pv = FOREACH pv
- -- Sessionize expects an ISO string
- GENERATE ToString(ToDate(time)) as isoTime,
- time,
+ GENERATE time,
memberId;
pv_sessionized = FOREACH (GROUP pv BY memberId) {
- ordered = ORDER pv BY isoTime;
- GENERATE FLATTEN(Sessionize(ordered)) AS (isoTime, time, memberId, sessionId);
+ ordered = ORDER pv BY time;
+ GENERATE FLATTEN(Sessionize(ordered)) AS (time, memberId, sessionId);
};
pv_sessionized = FOREACH pv_sessionized GENERATE sessionId, time;