You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@vxquery.apache.org by pr...@apache.org on 2014/03/14 19:36:21 UTC
[1/5] git commit: Added some support queries to help understand the
join condition.
Repository: incubator-vxquery
Updated Branches:
refs/heads/prestonc/hash_join f10d678fd -> 4f061948d
Added some support queries to help understand the join condition.
Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/7cff5b8f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/7cff5b8f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/7cff5b8f
Branch: refs/heads/prestonc/hash_join
Commit: 7cff5b8fc6b59314b64a591ab347d2a4d3825ce0
Parents: f10d678
Author: Preston Carman <pr...@apache.org>
Authored: Thu Mar 13 14:30:13 2014 -0700
Committer: Preston Carman <pr...@apache.org>
Committed: Thu Mar 13 14:30:13 2014 -0700
----------------------------------------------------------------------
.../resources/noaa-ghcn-daily/queries/q04_sensor.xq | 10 ++++++++++
.../resources/noaa-ghcn-daily/queries/q04_station.xq | 8 ++++++++
.../resources/noaa-ghcn-daily/queries/q05_sensor.xq | 11 +++++++++++
.../resources/noaa-ghcn-daily/queries/q05_station.xq | 8 ++++++++
.../src/main/resources/noaa-ghcn-daily/queries/q06.xq | 14 ++++++++++++++
.../resources/noaa-ghcn-daily/queries/q06_sensor.xq | 9 +++++++++
.../resources/noaa-ghcn-daily/queries/sensor_count.xq | 7 +++++++
.../noaa-ghcn-daily/queries/station_count.xq | 7 +++++++
8 files changed, 74 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_sensor.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_sensor.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_sensor.xq
new file mode 100644
index 0000000..7a94332
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_sensor.xq
@@ -0,0 +1,10 @@
+(: XQuery Join Query :)
+(: Count all the weather sensor readings on 1976-07-04. :)
+count(
+ let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+ for $r in collection($sensor_collection)/dataCollection/data
+
+ let $date := xs:date(fn:substring(xs:string(fn:data($r/date)), 0, 11))
+ where $date eq xs:date("1976-07-04")
+ return $r
+)
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_station.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_station.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_station.xq
new file mode 100644
index 0000000..c9effde
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_station.xq
@@ -0,0 +1,8 @@
+(: XQuery Join Query :)
+(: Count all the weather stations for King county. :)
+count(
+ let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations"
+ for $s in collection($station_collection)/stationCollection/station
+ where (some $x in $s/locationLabels satisfies ($x/type eq "CNTY" and fn:contains(fn:upper-case(fn:data($x/displayName)), "KING")))
+ return $s
+)
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_sensor.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_sensor.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_sensor.xq
new file mode 100644
index 0000000..5f96d1b
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_sensor.xq
@@ -0,0 +1,11 @@
+(: XQuery Join Aggregate Query :)
+(: Count all sensor readings for TMIN in 2001. :)
+count(
+ let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+ for $r in collection($sensor_collection)/dataCollection/data
+
+ let $date := xs:date(fn:substring(xs:string(fn:data($r/date)), 0, 11))
+ where $r/dataType eq "TMIN"
+ and fn:year-from-date($date) eq 2001
+ return $r/value
+)
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_station.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_station.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_station.xq
new file mode 100644
index 0000000..c01f386
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_station.xq
@@ -0,0 +1,8 @@
+(: XQuery Join Aggregate Query :)
+(: Count all stations in the state of Oregon. :)
+count(
+ let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations"
+ for $s in collection($station_collection)/stationCollection/station
+ where (some $x in $s/locationLabels satisfies ($x/type eq "ST" and fn:upper-case(fn:data($x/displayName)) eq "OREGON"))
+ return $s
+)
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06.xq
new file mode 100644
index 0000000..02dc9ab
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06.xq
@@ -0,0 +1,14 @@
+(: XQuery Join Aggregate Query :)
+(: Self join with all stations. :)
+let $sensor_collection1 := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+for $r1 in collection($sensor_collection1)/dataCollection/data
+
+let $sensor_collection2 := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+for $r2 in collection($sensor_collection2)/dataCollection/data
+
+let $date1 := xs:date(fn:substring(xs:string(fn:data($r1/date)), 0, 11))
+let $date2 := xs:date(fn:substring(xs:string(fn:data($r2/date)), 0, 11))
+where $r1/station eq $r2/station
+ and fn:year-from-date($date1) gt 2000
+ and fn:year-from-date($date2) gt 2000
+return ($r1/value, $r2/value)
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_sensor.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_sensor.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_sensor.xq
new file mode 100644
index 0000000..7767b7c
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_sensor.xq
@@ -0,0 +1,9 @@
+(: XQuery Join Aggregate Query :)
+(: Count all sensor readings after 2000. :)
+count(
+ let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+ for $r in collection($sensor_collection)/dataCollection/data
+ let $date := xs:date(fn:substring(xs:string(fn:data($r/date)), 0, 11))
+ where fn:year-from-date($date) gt 2000
+ return $r
+)
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/sensor_count.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/sensor_count.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/sensor_count.xq
new file mode 100644
index 0000000..b671d31
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/sensor_count.xq
@@ -0,0 +1,7 @@
+(: XQuery Join Query :)
+(: Count all the weather sensor readings available. :)
+count(
+ let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+ for $r in collection($sensor_collection)/dataCollection/data
+ return $r
+)
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/station_count.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/station_count.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/station_count.xq
new file mode 100644
index 0000000..4f8a666
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/station_count.xq
@@ -0,0 +1,7 @@
+(: XQuery Join Query :)
+(: Count all the weather stations available. :)
+count(
+ let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations"
+ for $s in collection($station_collection)/stationCollection/station
+ return $s
+)
\ No newline at end of file
[3/5] git commit: Switched ssh commands to use hostname instead of ip
address.
Posted by pr...@apache.org.
Switched ssh commands to use hostname instead of ip address.
Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/2a9629f3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/2a9629f3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/2a9629f3
Branch: refs/heads/prestonc/hash_join
Commit: 2a9629f38075b256db313aa92df4331971873cb9
Parents: 0b653ea
Author: Preston Carman <pr...@apache.org>
Authored: Thu Mar 13 23:07:34 2014 -0700
Committer: Preston Carman <pr...@apache.org>
Committed: Thu Mar 13 23:07:34 2014 -0700
----------------------------------------------------------------------
.../main/resources/scripts/cluster_actions.py | 26 ++++++++++----------
1 file changed, 13 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/2a9629f3/vxquery-server/src/main/resources/scripts/cluster_actions.py
----------------------------------------------------------------------
diff --git a/vxquery-server/src/main/resources/scripts/cluster_actions.py b/vxquery-server/src/main/resources/scripts/cluster_actions.py
index 066aa4f..cef5db1 100644
--- a/vxquery-server/src/main/resources/scripts/cluster_actions.py
+++ b/vxquery-server/src/main/resources/scripts/cluster_actions.py
@@ -87,52 +87,52 @@ class ClusterActions:
# Push the information out to each server.
print " - Add new file."
remove_tar_command = "rm " + tar_file + ""
- self.run_remote_command(machine.get_username(), machine.get_ip(), remove_tar_command)
- copy_command = "scp " + tar_file + " " + machine.get_username() + "@" + machine.get_ip() + ":"
+ self.run_remote_command(machine.get_username(), machine.get_id(), remove_tar_command)
+ copy_command = "scp " + tar_file + " " + machine.get_username() + "@" + machine.get_id() + ":"
subprocess.call(copy_command, shell=True)
print " - Expand new file."
base_folder = tar_file.split('.')[0]
remove_folder_command = "rm -rf " + base_folder + ""
- self.run_remote_command(machine.get_username(), machine.get_ip(), remove_folder_command)
+ self.run_remote_command(machine.get_username(), machine.get_id(), remove_folder_command)
unpack_command = "tar -zxf " + tar_file + ""
- self.run_remote_command(machine.get_username(), machine.get_ip(), unpack_command)
+ self.run_remote_command(machine.get_username(), machine.get_id(), unpack_command)
# Make the bin files executable.
chmod_command = "chmod u+x " + base_folder + "/target/appassembler/bin/vxq*"
- self.run_remote_command(machine.get_username(), machine.get_ip(), chmod_command)
+ self.run_remote_command(machine.get_username(), machine.get_id(), chmod_command)
chmod_command = "chmod u+x " + base_folder + "/target/appassembler/bin/*.sh"
- self.run_remote_command(machine.get_username(), machine.get_ip(), chmod_command)
+ self.run_remote_command(machine.get_username(), machine.get_id(), chmod_command)
print " - Server clean up."
- self.run_remote_command(machine.get_username(), machine.get_ip(), remove_tar_command)
+ self.run_remote_command(machine.get_username(), machine.get_id(), remove_tar_command)
def start_cc(self, machine):
print "Start Cluster Controller."
print " " + machine.get_id() + " " + machine.get_ip()
command = "./vxquery-server/target/appassembler/bin/startcc.sh " + machine.get_ip()
- self.run_remote_command(machine.get_username(), machine.get_ip(), command)
+ self.run_remote_command(machine.get_username(), machine.get_id(), command)
def start_nc(self, machine, cc):
print "Start Node Controller."
print " " + machine.get_id() + " " + machine.get_ip()
command = "./vxquery-server/target/appassembler/bin/startnc.sh " + cc.get_ip() + " " + machine.get_ip() + " " + machine.get_id()
- self.run_remote_command(machine.get_username(), machine.get_ip(), command)
+ self.run_remote_command(machine.get_username(), machine.get_id(), command)
def stop_cc(self, machine):
print "Stop Cluster Controller."
print " " + machine.get_id() + " " + machine.get_ip()
command = "./vxquery-server/target/appassembler/bin/stopcc.sh " + machine.get_username()
- self.run_remote_command(machine.get_username(), machine.get_ip(), command)
+ self.run_remote_command(machine.get_username(), machine.get_id(), command)
def stop_nc(self, machine):
print "Stop Node Controller."
print " " + machine.get_id() + " " + machine.get_ip()
command = "./vxquery-server/target/appassembler/bin/stopnc.sh " + machine.get_username()
- self.run_remote_command(machine.get_username(), machine.get_ip(), command)
+ self.run_remote_command(machine.get_username(), machine.get_id(), command)
- def run_remote_command(self, username, ip_address, command):
- remote_command = "ssh -x " + username + "@" + ip_address + " '" + command + "' "
+ def run_remote_command(self, username, host, command):
+ remote_command = "ssh -x " + username + "@" + host + " '" + command + "' "
# print remote_command
os.system(remote_command)
[4/5] git commit: Benchmark scripts simplification and update.
Posted by pr...@apache.org.
Benchmark scripts simplification and update.
* Removed compression.
* Added a print out of the partition scheme.
* Added new queries to the list.
Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/e9805a66
Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/e9805a66
Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/e9805a66
Branch: refs/heads/prestonc/hash_join
Commit: e9805a66a44b980ae20c560bc8972c282bd80919
Parents: 2a9629f
Author: Preston Carman <pr...@apache.org>
Authored: Thu Mar 13 23:15:10 2014 -0700
Committer: Preston Carman <pr...@apache.org>
Committed: Thu Mar 13 23:15:10 2014 -0700
----------------------------------------------------------------------
.../scripts/weather_benchmark.py | 105 ++++++++++++++-----
.../noaa-ghcn-daily/scripts/weather_cli.py | 21 ++--
.../scripts/weather_convert_to_xml.py | 11 +-
.../scripts/weather_data_files.py | 17 ++-
4 files changed, 108 insertions(+), 46 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/e9805a66/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py
index 66f85d6..0500de3 100644
--- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py
@@ -34,7 +34,8 @@ class WeatherBenchmark:
QUERY_REPLACEMENT_KEY = "/tmp/1.0_partition_ghcnd_all_xml/"
QUERY_MASTER_FOLDER = "../queries/"
- QUERY_FILE_LIST = ["q00.xq", "q01.xq", "q02.xq", "q03.xq", "q04.xq", "q05.xq"]
+ QUERY_FILE_LIST = ["q00.xq", "q01.xq", "q02.xq", "q03.xq", "q04.xq", "q05.xq", "q06.xq"]
+ QUERY_UTILITY_LIST = ["sensor_count.xq", "station_count.xq", "q04_sensor.xq", "q04_station.xq", "q05_sensor.xq", "q05_station.xq", "q06_sensor.xq"]
BENCHMARK_LOCAL_TESTS = ["local_speed_up", "local_batch_scale_out"]
BENCHMARK_CLUSTER_TESTS = ["speed_up", "batch_scale_out"]
QUERY_COLLECTIONS = ["sensors", "stations"]
@@ -47,40 +48,73 @@ class WeatherBenchmark:
self.dataset = dataset
self.nodes = nodes
- def build_data_links(self, xml_save_path):
+ def print_partition_scheme(self, xml_save_path):
if (len(self.base_paths) == 0):
return
for test in self.dataset.get_tests():
if test in self.BENCHMARK_LOCAL_TESTS:
- self.build_local_data_links(test, xml_save_path)
+ self.print_local_partition_schemes(test, xml_save_path)
elif test in self.BENCHMARK_CLUSTER_TESTS:
- self.build_cluster_data_links(test, xml_save_path)
+ self.print_cluster_partition_schemes(test, xml_save_path)
else:
print "Unknown test."
exit()
- def build_local_data_links(self, test, xml_save_path):
- virtual_partitions = get_local_virtual_partitions(self.partitions)
- data_paths = get_partition_paths(virtual_partitions, self.base_paths)
+ def print_local_partition_schemes(self, test, xml_save_path):
for i in self.partitions:
- link_base_paths = get_partition_paths(i, self.base_paths, "data_links/" + test)
- # Match link paths to real data paths.
- offset = 0
- group_size = len(data_paths) / len(link_base_paths)
- for link_index, link_path in enumerate(link_base_paths):
- if os.path.isdir(link_path):
- shutil.rmtree(link_path)
- for data_index, data_path in enumerate(data_paths):
- if offset <= data_index and data_index < offset + group_size:
- self.add_collection_links_for(data_path, link_path, data_index)
- offset += group_size
+ scheme = self.get_local_partition_scheme(test, xml_save_path, i)
+ virtual_partitions = get_local_virtual_partitions(self.partitions)
+ self.print_partition_schemes(virtual_partitions, scheme, test, i)
+
+ def print_cluster_partition_schemes(self, test, xml_save_path):
+ scheme = self.get_cluster_partition_scheme(test, xml_save_path)
+ virtual_partitions = get_cluster_virtual_partitions(self.nodes, self.partitions)
+ self.print_partition_schemes(virtual_partitions, scheme, test, 0)
+
+    def print_partition_schemes(self, virtual_partitions, scheme, test, partitions):
+        """Print a summary of one partition scheme followed by its rows.
+
+        virtual_partitions -- value printed on the "Virtual Partitions" line.
+        scheme -- sequence of [data index, link index, data path, link path]
+                  rows; None or an empty sequence prints "Scheme is EMPTY."
+        test -- name of the benchmark test printed on the "Test" line.
+        partitions -- value printed on the "Partitions" line.
+        """
+        print
+        print "---------------- Partition Scheme --------------------"
+        print " Test: " + test
+        print " Virtual Partitions: " + str(virtual_partitions)
+        print " Disks: " + str(len(self.base_paths))
+        print " Partitions: " + str(partitions)
+
+        # A scheme of None must be treated like an empty one: the cluster
+        # scheme builder returns nothing when the current host is unknown.
+        if scheme:
+            # Size both path columns from the longest path in any row so the
+            # table stays aligned even when the first row is not the widest.
+            folder_length = max(max(len(row[2]), len(row[3])) for row in scheme) + 5
+            row_format = "{:>5} {:>5} {:<" + str(folder_length) + "} {:<" + str(folder_length) + "}"
+            HEADER = ("Index", "Link", "Data Path", "Link Path")
+            print row_format.format(*HEADER)
+            for row in scheme:
+                print row_format.format(*row)
+            print
+        else:
+            print " Scheme is EMPTY."
+
+
+    def get_local_partition_scheme(self, test, xml_save_path, partition):
+        """Build the data-path to link-path mapping for one local test.
+
+        test -- "local_speed_up" or "local_batch_scale_out"; any other value
+                produces an empty scheme.
+        xml_save_path -- not used by this method; kept so the signature
+                         matches its callers.
+        partition -- partition count used to generate the link folders.
+
+        Returns a list of [data_index, link_index, data_path, link_path] rows.
+        """
+        scheme = []
+        virtual_partitions = get_local_virtual_partitions(self.partitions)
+        data_schemes = get_partition_scheme(virtual_partitions, self.base_paths)
+
+        link_base_schemes = get_partition_scheme(partition, self.base_paths, "data_links/" + test)
+        if len(link_base_schemes) == 0:
+            # Nothing to link to; also avoids dividing by zero below.
+            return scheme
+
+        # Match link paths to real data paths.
+        offset = 0
+        group_size = len(data_schemes) // len(link_base_schemes)
+        # enumerate() yields (position, entry) pairs and every entry is a
+        # (disk, virtual, index, path) tuple, so the entry is unpacked in a
+        # nested pattern. The position in the flat list is used as the index
+        # because offset and group_size are computed from flat list lengths.
+        for link_index, (_link_disk, _link_virtual, _link_part, link_path) in enumerate(link_base_schemes):
+            for data_index, (_data_disk, _data_virtual, _data_part, data_path) in enumerate(data_schemes):
+                if test == "local_speed_up" and offset <= data_index and data_index < offset + group_size:
+                    scheme.append([data_index, link_index, data_path, link_path])
+                elif test == "local_batch_scale_out" and data_index == link_index:
+                    scheme.append([data_index, link_index, data_path, link_path])
+            offset += group_size
+        return scheme
- def build_cluster_data_links(self, test, xml_save_path):
+ def get_cluster_partition_scheme(self, test, xml_save_path):
node_index = self.get_current_node_index()
if node_index == -1:
print "Unknown host."
return
+ scheme = []
virtual_partitions = get_cluster_virtual_partitions(self.nodes, self.partitions)
data_paths = get_partition_paths(virtual_partitions, self.base_paths)
link_base_paths = get_cluster_link_paths(len(self.nodes), self.base_paths, "data_links/" + test)
@@ -88,8 +122,6 @@ class WeatherBenchmark:
# Match link paths to real data paths.
link_base_paths.sort()
for link_index, link_path in enumerate(link_base_paths):
- if os.path.isdir(link_path):
- shutil.rmtree(link_path)
# Prep
link_offset = link_index % len(self.nodes)
disk_offset = link_index // len(self.nodes)
@@ -111,8 +143,33 @@ class WeatherBenchmark:
data_paths.sort()
for data_index, data_path in enumerate(data_paths):
if has_data and node_offset <= data_index and data_index < node_offset + group_size:
- self.add_collection_links_for(data_path, link_path, data_index)
- self.add_collection_links_for("", link_path, -1)
+ scheme.append([data_index, link_index, data_path, link_path])
+ scheme.append([-1, link_index, "", link_path])
+ return scheme
+
+ def build_data_links(self, xml_save_path):
+ if (len(self.base_paths) == 0):
+ return
+ for test in self.dataset.get_tests():
+ if test in self.BENCHMARK_LOCAL_TESTS:
+ for i in self.partitions:
+ scheme = self.get_local_partition_scheme(test, xml_save_path, i)
+ self.build_data_links_scheme(scheme)
+ elif test in self.BENCHMARK_CLUSTER_TESTS:
+ scheme = self.get_cluster_partition_scheme(test, xml_save_path)
+ self.build_data_links_scheme(scheme)
+ else:
+ print "Unknown test."
+ exit()
+
+    def build_data_links_scheme(self, scheme):
+        """Build all the data links based on the scheme information.
+
+        scheme -- sequence of (data_index, partition, data_path, link_path)
+                  rows; None or an empty sequence results in no work.
+
+        An existing link_path directory is removed once, before the first
+        link for that path is added.
+        """
+        if not scheme:
+            # The cluster scheme builder returns nothing for an unknown host.
+            return
+        link_path_cleared = set()
+        for (data_index, partition, data_path, link_path) in scheme:
+            if link_path not in link_path_cleared:
+                # Remember the path on first sight even if it does not exist
+                # yet. NOTE(review): add_collection_links_for presumably
+                # creates link_path; if so, a later row for the same path
+                # must not remove what an earlier row just built.
+                link_path_cleared.add(link_path)
+                if os.path.isdir(link_path):
+                    shutil.rmtree(link_path)
+            self.add_collection_links_for(data_path, link_path, data_index)
def get_current_node_index(self):
found = False
@@ -172,7 +229,7 @@ class WeatherBenchmark:
def copy_and_replace_query(self, query_path, replacement_list):
'''Copy the query files over to the query_path and replace the path
for the where the collection data is located.'''
- for query_file in self.QUERY_FILE_LIST:
+ for query_file in self.QUERY_FILE_LIST + self.QUERY_UTILITY_LIST:
shutil.copyfile(self.QUERY_MASTER_FOLDER + query_file, query_path + query_file)
# Make a search replace for each collection.
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/e9805a66/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py
index a1a1aa2..52945e5 100644
--- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py
@@ -24,7 +24,6 @@ from weather_config import *
from weather_benchmark import *
DEBUG_OUTPUT = False
-COMPRESSED = False
#
# Weather conversion for GHCN-DAILY files to xml.
@@ -42,7 +41,7 @@ def main(argv):
xml_config_path = ""
try:
- opts, args = getopt.getopt(argv, "acf:hl:m:ruvw:x:", ["file=", "locality=", "max_station_files=", "web_service=", "xml_config="])
+ opts, args = getopt.getopt(argv, "af:hl:m:ruvw:x:", ["file=", "locality=", "max_station_files=", "web_service=", "xml_config="])
except getopt.GetoptError:
print 'The file options for weather_cli.py were not correctly specified.'
print 'To see a full list of options try:'
@@ -52,10 +51,9 @@ def main(argv):
if opt == '-h':
print 'Converting weather daily files to xml options:'
print ' -a Append the results to the progress file.'
- print ' -c Compress the produced XML file with .gz.'
print ' -f (str) The file name of a specific station to process.'
print ' * Helpful when testing a single stations XML file output.'
- print ' -l (str) Select the locality of the scripts execution (download, progress_file, sensor_build, station_build, partition, statistics).'
+ print ' -l (str) Select the locality of the scripts execution (download, progress_file, sensor_build, station_build, partition, partition_scheme, statistics).'
print ' -m (int) Limits the number of files created for each station.'
print ' * Helpful when testing to make sure all elements are supported for each station.'
print ' Alternate form: --max_station_files=(int)'
@@ -67,9 +65,6 @@ def main(argv):
sys.exit()
elif opt in ('-a', "--append"):
append = True
- elif opt == '-c':
- global COMPRESSED
- COMPRESSED = True
elif opt in ('-f', "--file"):
# check if file exists.
if os.path.exists(arg):
@@ -78,7 +73,7 @@ def main(argv):
print 'Error: Argument must be a file name for --file (-f).'
sys.exit()
elif opt in ('-l', "--locality"):
- if arg in ("download", "progress_file", "sensor_build", "station_build", "partition", "test_links", "queries", "statistics"):
+ if arg in ("download", "progress_file", "sensor_build", "station_build", "partition", "partition_scheme", "test_links", "queries", "statistics"):
section = arg
else:
print 'Error: Argument must be a string for --locality (-l) and a valid locality.'
@@ -144,7 +139,7 @@ def main(argv):
os.makedirs(xml_data_save_path)
# Set up the XML build objects.
- convert = WeatherWebServiceMonthlyXMLFile(download_path, xml_data_save_path, COMPRESSED, DEBUG_OUTPUT)
+ convert = WeatherWebServiceMonthlyXMLFile(download_path, xml_data_save_path, DEBUG_OUTPUT)
progress_file = xml_data_save_path + "_data_progress.csv"
data = WeatherDataFiles(ghcnd_data_dly_path, progress_file)
if section in ("all", "progress_file"):
@@ -207,15 +202,19 @@ def main(argv):
base_paths.append(paths + dataset_folder + "/")
benchmark = WeatherBenchmark(base_paths, dataset.get_partitions(), dataset, config.get_node_machine_list())
- if section in ("all", "partition"):
+ if section in ("all", "partition", "partition_scheme"):
slices = benchmark.get_number_of_slices()
print 'Processing the partition section (' + dataset.get_name() + ':d' + str(len(base_paths)) + ':s' + str(slices) + ').'
data.reset()
- data.copy_to_n_partitions(xml_data_save_path, slices, base_paths, reset)
+ if section == "partition_scheme":
+ benchmark.print_partition_scheme(xml_data_save_path)
+ else:
+ data.copy_to_n_partitions(xml_data_save_path, slices, base_paths, reset)
if section in ("all", "test_links"):
# TODO determine current node
print 'Processing the test links section (' + dataset.get_name() + ').'
+ benchmark.print_partition_scheme(xml_data_save_path)
benchmark.build_data_links(xml_data_save_path)
if section in ("all", "queries"):
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/e9805a66/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
index a7adce5..c115efa 100644
--- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
@@ -17,7 +17,6 @@
import textwrap
from datetime import date
import os
-import gzip
from collections import OrderedDict
# Custom modules.
@@ -104,9 +103,8 @@ class WeatherConvertToXML:
token = ""
- def __init__(self, base_path, save_path, compressed, debug_output):
+ def __init__(self, base_path, save_path, debug_output):
self.save_path = save_path
- self.compressed = compressed
self.debug_output = debug_output
# Extra support files.
@@ -134,14 +132,9 @@ class WeatherConvertToXML:
print str(field[FIELD_INDEX_NAME]) + " = '" + row[(field[FIELD_INDEX_START] - 1):field[FIELD_INDEX_END]] + "'"
def save_file(self, filename, contents):
- if self.compressed:
- filename = filename + '.gz'
- file = gzip.open(filename, 'wb')
- else:
- file = open(filename, 'w')
+ file = open(filename, 'w')
file.write(contents)
file.close()
-
return filename
def get_folder_size(self, folder_name):
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/e9805a66/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
index a9b4ecc..a68cba1 100644
--- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
@@ -318,13 +318,26 @@ class WeatherDataFiles:
return columns[self.INDEX_DATA_FILE_NAME]
+# Index values of each field details.
+PARTITION_INDEX_DISK = 0
+PARTITION_INDEX_VIRTUAL = 1
+PARTITION_INDEX = 2
+PARTITION_INDEX_PATH = 3
+PARTITION_HEADER = ("Disk", "Virtual", "Index", "Path")
+
def get_partition_paths(partitions, base_paths, key="partitions"):
partition_paths = []
+ for scheme in get_partition_scheme(partitions, base_paths, key):
+ partition_paths.append(scheme[PARTITION_INDEX_PATH])
+ return partition_paths
+
+def get_partition_scheme(partitions, base_paths, key="partitions"):
+ partition_scheme = []
for i in range(0, partitions):
for j in range(0, len(base_paths)):
new_partition_path = base_paths[j] + key + "/" + get_partition_folder(j, partitions, i) + "/"
- partition_paths.append(new_partition_path)
- return partition_paths
+ partition_scheme.append((j, partitions, i, new_partition_path))
+ return partition_scheme
def get_partition_folder(disks, partitions, index):
return "d" + str(disks) + "_p" + str(partitions) + "_i" + str(index)
[2/5] git commit: Added logging to the benchmark script and a new
join query.
Posted by pr...@apache.org.
Added logging to the benchmark script and a new join query.
Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/0b653ea1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/0b653ea1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/0b653ea1
Branch: refs/heads/prestonc/hash_join
Commit: 0b653ea1b8e1b7361acf235302532f33ab26b420
Parents: 7cff5b8
Author: Preston Carman <pr...@apache.org>
Authored: Thu Mar 13 23:06:48 2014 -0700
Committer: Preston Carman <pr...@apache.org>
Committed: Thu Mar 13 23:06:48 2014 -0700
----------------------------------------------------------------------
.../main/resources/noaa-ghcn-daily/queries/q07.xq | 15 +++++++++++++++
.../noaa-ghcn-daily/scripts/run_benchmark.sh | 7 +++++--
.../rewriter/rules/InlineNestedVariablesRule.java | 4 ++--
3 files changed, 22 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/0b653ea1/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq
new file mode 100644
index 0000000..412285c
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq
@@ -0,0 +1,15 @@
+(: XQuery Join Aggregate Query :)
+(: Find the lowest recorded temperature (TMIN) for each station 2001. :)
+let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations"
+for $s in collection($station_collection)/stationCollection/stations
+
+return fn:min(
+ let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+ for $r in collection($sensor_collection)/dataCollection/data
+
+ let $date := xs:date(fn:substring(xs:string(fn:data($r/date)), 0, 11))
+ where $s/id eq $r/station
+ and $r/dataType eq "TMIN"
+ and fn:year-from-date($date) eq 2001
+ return $r/value
+) div 10
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/0b653ea1/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh
index 8417660..bbfe35d 100644
--- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh
@@ -32,12 +32,15 @@ then
exit
fi
-for j in $(find ${1} -name '*.xq')
+for j in $(find ${1} -name '*q??.xq')
do
if [ -z "${3}" ] || [[ "${j}" =~ "${3}" ]]
then
echo "Running query: ${j}"
- time sh ./vxquery-cli/target/appassembler/bin/vxq ${j} ${2} -timing -showquery -frame-size 1000000 -repeatexec 10 > ${j}.log 2>&1
+ log_file="$(basename ${j}).$(date +%Y%m%d).log"
+ log_base_path=$(dirname ${j/queries/query_logs})
+ mkdir -p ${log_base_path}
+ time sh ./vxquery-cli/target/appassembler/bin/vxq ${j} ${2} -timing -showquery -frame-size 1000000 -repeatexec 10 > ${log_base_path}/${log_file} 2>&1
fi;
done
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/0b653ea1/vxquery-core/src/main/java/org/apache/vxquery/compiler/rewriter/rules/InlineNestedVariablesRule.java
----------------------------------------------------------------------
diff --git a/vxquery-core/src/main/java/org/apache/vxquery/compiler/rewriter/rules/InlineNestedVariablesRule.java b/vxquery-core/src/main/java/org/apache/vxquery/compiler/rewriter/rules/InlineNestedVariablesRule.java
index 8cd86f8..c0c082f 100644
--- a/vxquery-core/src/main/java/org/apache/vxquery/compiler/rewriter/rules/InlineNestedVariablesRule.java
+++ b/vxquery-core/src/main/java/org/apache/vxquery/compiler/rewriter/rules/InlineNestedVariablesRule.java
@@ -62,7 +62,7 @@ public class InlineNestedVariablesRule extends InlineVariablesRule {
}
boolean modified = false;
- // Descend into nested plans removing projects on the way.
+ // Descend into nested plans inlining along the way.
if (op.hasNestedPlans()) {
AbstractOperatorWithNestedPlans nestedOp = (AbstractOperatorWithNestedPlans) op;
for (ILogicalPlan nestedPlan : nestedOp.getNestedPlans()) {
@@ -74,7 +74,7 @@ public class InlineNestedVariablesRule extends InlineVariablesRule {
}
}
- // Descend into children removing projects on the way.
+ // Descend into children inlining along the way.
for (Mutable<ILogicalOperator> inputOpRef : op.getInputs()) {
if (inlineVariables(inputOpRef, context)) {
modified = true;
[5/5] git commit: Added join for all stations.
Posted by pr...@apache.org.
Added join for all stations.
Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/4f061948
Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/4f061948
Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/4f061948
Branch: refs/heads/prestonc/hash_join
Commit: 4f061948d4994d36772c5f4202b99d61f3676bb8
Parents: e9805a6
Author: Preston Carman <pr...@apache.org>
Authored: Fri Mar 14 11:36:09 2014 -0700
Committer: Preston Carman <pr...@apache.org>
Committed: Fri Mar 14 11:36:09 2014 -0700
----------------------------------------------------------------------
.../resources/noaa-ghcn-daily/queries/q07.xq | 22 +++++++++-----------
1 file changed, 10 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/4f061948/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq
index 412285c..c4e186d 100644
--- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq
@@ -1,15 +1,13 @@
(: XQuery Join Aggregate Query :)
-(: Find the lowest recorded temperature (TMIN) for each station 2001. :)
+(: Find the recorded maximum temperature (TMAX) for each station on 2000-01-01. :)
let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations"
-for $s in collection($station_collection)/stationCollection/stations
+for $s in collection($station_collection)/stationCollection/station
-return fn:min(
- let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
- for $r in collection($sensor_collection)/dataCollection/data
-
- let $date := xs:date(fn:substring(xs:string(fn:data($r/date)), 0, 11))
- where $s/id eq $r/station
- and $r/dataType eq "TMIN"
- and fn:year-from-date($date) eq 2001
- return $r/value
-) div 10
\ No newline at end of file
+let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+for $r in collection($sensor_collection)/dataCollection/data
+
+let $date := xs:date(fn:substring(xs:string(fn:data($r/date)), 0, 11))
+where $s/id eq $r/station
+ and $r/dataType eq "TMAX"
+ and $date eq xs:date("2000-01-01")
+return ($s/displayName, $r/value)
\ No newline at end of file