You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@vxquery.apache.org by pr...@apache.org on 2014/03/14 19:36:21 UTC

[1/5] git commit: Added some support queries to help understand the join condition.

Repository: incubator-vxquery
Updated Branches:
  refs/heads/prestonc/hash_join f10d678fd -> 4f061948d


Added some support queries to help understand the join condition.


Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/7cff5b8f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/7cff5b8f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/7cff5b8f

Branch: refs/heads/prestonc/hash_join
Commit: 7cff5b8fc6b59314b64a591ab347d2a4d3825ce0
Parents: f10d678
Author: Preston Carman <pr...@apache.org>
Authored: Thu Mar 13 14:30:13 2014 -0700
Committer: Preston Carman <pr...@apache.org>
Committed: Thu Mar 13 14:30:13 2014 -0700

----------------------------------------------------------------------
 .../resources/noaa-ghcn-daily/queries/q04_sensor.xq   | 10 ++++++++++
 .../resources/noaa-ghcn-daily/queries/q04_station.xq  |  8 ++++++++
 .../resources/noaa-ghcn-daily/queries/q05_sensor.xq   | 11 +++++++++++
 .../resources/noaa-ghcn-daily/queries/q05_station.xq  |  8 ++++++++
 .../src/main/resources/noaa-ghcn-daily/queries/q06.xq | 14 ++++++++++++++
 .../resources/noaa-ghcn-daily/queries/q06_sensor.xq   |  9 +++++++++
 .../resources/noaa-ghcn-daily/queries/sensor_count.xq |  7 +++++++
 .../noaa-ghcn-daily/queries/station_count.xq          |  7 +++++++
 8 files changed, 74 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_sensor.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_sensor.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_sensor.xq
new file mode 100644
index 0000000..7a94332
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_sensor.xq
@@ -0,0 +1,10 @@
+(: XQuery Join Query :)
+(: Count all the weather sensor readings on 1976-07-04.                       :)
+count(
+    let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+    for $r in collection($sensor_collection)/dataCollection/data
+        
+    let $date := xs:date(fn:substring(xs:string(fn:data($r/date)), 0, 11))
+    where $date eq xs:date("1976-07-04")
+    return $r
+)
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_station.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_station.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_station.xq
new file mode 100644
index 0000000..c9effde
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_station.xq
@@ -0,0 +1,8 @@
+(: XQuery Join Query :)
+(: Count all the weather stations for King county.                            :)
+count(
+    let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations"
+    for $s in collection($station_collection)/stationCollection/station
+    where (some $x in $s/locationLabels satisfies ($x/type eq "CNTY" and fn:contains(fn:upper-case(fn:data($x/displayName)), "KING")))
+    return $s
+)
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_sensor.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_sensor.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_sensor.xq
new file mode 100644
index 0000000..5f96d1b
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_sensor.xq
@@ -0,0 +1,11 @@
+(: XQuery Join Aggregate Query :)
+(: Count all sensor readings for TMIN in 2001.                                :)
+count(
+    let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+    for $r in collection($sensor_collection)/dataCollection/data
+    
+    let $date := xs:date(fn:substring(xs:string(fn:data($r/date)), 0, 11))
+    where $r/dataType eq "TMIN" 
+        and fn:year-from-date($date) eq 2001
+    return $r/value
+)
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_station.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_station.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_station.xq
new file mode 100644
index 0000000..c01f386
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_station.xq
@@ -0,0 +1,8 @@
+(: XQuery Join Aggregate Query :)
+(: Count all stations in the state of Oregon.                                 :)
+count(
+    let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations"
+    for $s in collection($station_collection)/stationCollection/station
+    where (some $x in $s/locationLabels satisfies ($x/type eq "ST" and fn:upper-case(fn:data($x/displayName)) eq "OREGON"))
+    return $s
+)
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06.xq
new file mode 100644
index 0000000..02dc9ab
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06.xq
@@ -0,0 +1,14 @@
+(: XQuery Join Aggregate Query :)
+(: Self join of sensor readings on station, after 2000.                      :)
+let $sensor_collection1 := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+for $r1 in collection($sensor_collection1)/dataCollection/data
+
+let $sensor_collection2 := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+for $r2 in collection($sensor_collection2)/dataCollection/data
+
+let $date1 := xs:date(fn:substring(xs:string(fn:data($r1/date)), 0, 11))
+let $date2 := xs:date(fn:substring(xs:string(fn:data($r2/date)), 0, 11))
+where $r1/station eq $r2/station
+    and fn:year-from-date($date1) gt 2000
+    and fn:year-from-date($date2) gt 2000
+return ($r1/value, $r2/value) 
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_sensor.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_sensor.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_sensor.xq
new file mode 100644
index 0000000..7767b7c
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_sensor.xq
@@ -0,0 +1,9 @@
+(: XQuery Join Aggregate Query :)
+(: Count all sensor readings after 2000.                                      :)
+count(
+    let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+    for $r in collection($sensor_collection)/dataCollection/data
+    let $date := xs:date(fn:substring(xs:string(fn:data($r/date)), 0, 11))
+    where fn:year-from-date($date) gt 2000
+    return $r
+)
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/sensor_count.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/sensor_count.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/sensor_count.xq
new file mode 100644
index 0000000..b671d31
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/sensor_count.xq
@@ -0,0 +1,7 @@
+(: XQuery Join Query :)
+(: Count all the weather sensor readings available.                           :)
+count(
+    let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+    for $r in collection($sensor_collection)/dataCollection/data
+    return $r
+)
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/7cff5b8f/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/station_count.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/station_count.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/station_count.xq
new file mode 100644
index 0000000..4f8a666
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/station_count.xq
@@ -0,0 +1,7 @@
+(: XQuery Join Query :)
+(: Count all the weather stations available.                                  :)
+count(
+    let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations"
+    for $s in collection($station_collection)/stationCollection/station
+    return $s
+)
\ No newline at end of file


[3/5] git commit: Switched ssh commands to use hostname instead of ip address.

Posted by pr...@apache.org.
Switched ssh commands to use hostname instead of ip address.


Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/2a9629f3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/2a9629f3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/2a9629f3

Branch: refs/heads/prestonc/hash_join
Commit: 2a9629f38075b256db313aa92df4331971873cb9
Parents: 0b653ea
Author: Preston Carman <pr...@apache.org>
Authored: Thu Mar 13 23:07:34 2014 -0700
Committer: Preston Carman <pr...@apache.org>
Committed: Thu Mar 13 23:07:34 2014 -0700

----------------------------------------------------------------------
 .../main/resources/scripts/cluster_actions.py   | 26 ++++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/2a9629f3/vxquery-server/src/main/resources/scripts/cluster_actions.py
----------------------------------------------------------------------
diff --git a/vxquery-server/src/main/resources/scripts/cluster_actions.py b/vxquery-server/src/main/resources/scripts/cluster_actions.py
index 066aa4f..cef5db1 100644
--- a/vxquery-server/src/main/resources/scripts/cluster_actions.py
+++ b/vxquery-server/src/main/resources/scripts/cluster_actions.py
@@ -87,52 +87,52 @@ class ClusterActions:
         # Push the information out to each server.    
         print "  - Add new file."
         remove_tar_command = "rm " + tar_file + ""
-        self.run_remote_command(machine.get_username(), machine.get_ip(), remove_tar_command)
-        copy_command = "scp " + tar_file + " " + machine.get_username() + "@" + machine.get_ip() + ":"
+        self.run_remote_command(machine.get_username(), machine.get_id(), remove_tar_command)
+        copy_command = "scp " + tar_file + " " + machine.get_username() + "@" + machine.get_id() + ":"
         subprocess.call(copy_command, shell=True)
         
         print "  - Expand new file."
         base_folder = tar_file.split('.')[0]
         remove_folder_command = "rm -rf " + base_folder + ""
-        self.run_remote_command(machine.get_username(), machine.get_ip(), remove_folder_command)
+        self.run_remote_command(machine.get_username(), machine.get_id(), remove_folder_command)
         unpack_command = "tar -zxf " + tar_file + ""
-        self.run_remote_command(machine.get_username(), machine.get_ip(), unpack_command)
+        self.run_remote_command(machine.get_username(), machine.get_id(), unpack_command)
         # Make the bin files executable.
         chmod_command = "chmod u+x " + base_folder + "/target/appassembler/bin/vxq*"
-        self.run_remote_command(machine.get_username(), machine.get_ip(), chmod_command)
+        self.run_remote_command(machine.get_username(), machine.get_id(), chmod_command)
         chmod_command = "chmod u+x " + base_folder + "/target/appassembler/bin/*.sh"
-        self.run_remote_command(machine.get_username(), machine.get_ip(), chmod_command)
+        self.run_remote_command(machine.get_username(), machine.get_id(), chmod_command)
         
         print "  - Server clean up."
-        self.run_remote_command(machine.get_username(), machine.get_ip(), remove_tar_command)
+        self.run_remote_command(machine.get_username(), machine.get_id(), remove_tar_command)
         
     
     def start_cc(self, machine):
         print "Start Cluster Controller."
         print "  " + machine.get_id() + " " + machine.get_ip()
         command = "./vxquery-server/target/appassembler/bin/startcc.sh " + machine.get_ip()
-        self.run_remote_command(machine.get_username(), machine.get_ip(), command)
+        self.run_remote_command(machine.get_username(), machine.get_id(), command)
     
     def start_nc(self, machine, cc):
         print "Start Node Controller."
         print "  " + machine.get_id() + " " + machine.get_ip()
         command = "./vxquery-server/target/appassembler/bin/startnc.sh " + cc.get_ip() + " " + machine.get_ip() + " " + machine.get_id()
-        self.run_remote_command(machine.get_username(), machine.get_ip(), command)
+        self.run_remote_command(machine.get_username(), machine.get_id(), command)
 
     def stop_cc(self, machine):
         print "Stop Cluster Controller."
         print "  " + machine.get_id() + " " + machine.get_ip()
         command = "./vxquery-server/target/appassembler/bin/stopcc.sh " + machine.get_username()
-        self.run_remote_command(machine.get_username(), machine.get_ip(), command)
+        self.run_remote_command(machine.get_username(), machine.get_id(), command)
     
     def stop_nc(self, machine):
         print "Stop Node Controller."
         print "  " + machine.get_id() + " " + machine.get_ip()
         command = "./vxquery-server/target/appassembler/bin/stopnc.sh " + machine.get_username()
-        self.run_remote_command(machine.get_username(), machine.get_ip(), command)
+        self.run_remote_command(machine.get_username(), machine.get_id(), command)
         
-    def run_remote_command(self, username, ip_address, command):
-        remote_command = "ssh -x " + username + "@" + ip_address + " '" + command + "' "
+    def run_remote_command(self, username, host, command):
+        remote_command = "ssh -x " + username + "@" + host + " '" + command + "' "
 #         print remote_command
         os.system(remote_command)
 


[4/5] git commit: Benchmark scripts simplification and update.

Posted by pr...@apache.org.
Benchmark scripts simplification and update.

* Removed compression.
* Added a print out of the partition scheme.
* Added new queries to the list.


Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/e9805a66
Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/e9805a66
Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/e9805a66

Branch: refs/heads/prestonc/hash_join
Commit: e9805a66a44b980ae20c560bc8972c282bd80919
Parents: 2a9629f
Author: Preston Carman <pr...@apache.org>
Authored: Thu Mar 13 23:15:10 2014 -0700
Committer: Preston Carman <pr...@apache.org>
Committed: Thu Mar 13 23:15:10 2014 -0700

----------------------------------------------------------------------
 .../scripts/weather_benchmark.py                | 105 ++++++++++++++-----
 .../noaa-ghcn-daily/scripts/weather_cli.py      |  21 ++--
 .../scripts/weather_convert_to_xml.py           |  11 +-
 .../scripts/weather_data_files.py               |  17 ++-
 4 files changed, 108 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/e9805a66/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py
index 66f85d6..0500de3 100644
--- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py
@@ -34,7 +34,8 @@ class WeatherBenchmark:
 
     QUERY_REPLACEMENT_KEY = "/tmp/1.0_partition_ghcnd_all_xml/"
     QUERY_MASTER_FOLDER = "../queries/"
-    QUERY_FILE_LIST = ["q00.xq", "q01.xq", "q02.xq", "q03.xq", "q04.xq", "q05.xq"] 
+    QUERY_FILE_LIST = ["q00.xq", "q01.xq", "q02.xq", "q03.xq", "q04.xq", "q05.xq", "q06.xq"] 
+    QUERY_UTILITY_LIST = ["sensor_count.xq", "station_count.xq", "q04_sensor.xq", "q04_station.xq", "q05_sensor.xq", "q05_station.xq", "q06_sensor.xq"] 
     BENCHMARK_LOCAL_TESTS = ["local_speed_up", "local_batch_scale_out"] 
     BENCHMARK_CLUSTER_TESTS = ["speed_up", "batch_scale_out"] 
     QUERY_COLLECTIONS = ["sensors", "stations"]
@@ -47,40 +48,73 @@ class WeatherBenchmark:
         self.dataset = dataset
         self.nodes = nodes
         
-    def build_data_links(self, xml_save_path):
+    def print_partition_scheme(self, xml_save_path):
         if (len(self.base_paths) == 0):
             return
         for test in self.dataset.get_tests():
             if test in self.BENCHMARK_LOCAL_TESTS:
-                self.build_local_data_links(test, xml_save_path)
+                self.print_local_partition_schemes(test, xml_save_path)
             elif test in self.BENCHMARK_CLUSTER_TESTS:
-                self.build_cluster_data_links(test, xml_save_path)
+                self.print_cluster_partition_schemes(test, xml_save_path)
             else:
                 print "Unknown test."
                 exit()
             
-    def build_local_data_links(self, test, xml_save_path):
-        virtual_partitions = get_local_virtual_partitions(self.partitions)
-        data_paths = get_partition_paths(virtual_partitions, self.base_paths)
+    def print_local_partition_schemes(self, test, xml_save_path):
         for i in self.partitions:
-            link_base_paths = get_partition_paths(i, self.base_paths, "data_links/" + test)
-            # Match link paths to real data paths.
-            offset = 0
-            group_size = len(data_paths) / len(link_base_paths)
-            for link_index, link_path in enumerate(link_base_paths):
-                if os.path.isdir(link_path):
-                    shutil.rmtree(link_path)
-                for data_index, data_path in enumerate(data_paths):
-                    if offset <= data_index and data_index < offset + group_size:
-                        self.add_collection_links_for(data_path, link_path, data_index)
-                offset += group_size
+            scheme = self.get_local_partition_scheme(test, xml_save_path, i)
+            virtual_partitions = get_local_virtual_partitions(self.partitions)
+            self.print_partition_schemes(virtual_partitions, scheme, test, i)
+        
+    def print_cluster_partition_schemes(self, test, xml_save_path):
+        scheme = self.get_cluster_partition_scheme(test, xml_save_path)
+        virtual_partitions = get_cluster_virtual_partitions(self.nodes, self.partitions)
+        self.print_partition_schemes(virtual_partitions, scheme, test, 0)
+        
+    def print_partition_schemes(self, virtual_partitions, scheme, test, partitions):
+        print
+        print "---------------- Partition Scheme --------------------"
+        print "    Test: " + test
+        print "    Virtual Partitions: " + str(virtual_partitions)
+        print "    Disks: " + str(len(self.base_paths))
+        print "    Partitions: " + str(partitions)
+        
+        if len(scheme) > 0:
+            folder_length = len(scheme[0][3]) + 5
+            row_format = "{:>5} {:>5} {:<" + str(folder_length) + "} {:<" + str(folder_length) + "}"
+            HEADER = ("Index", "Link", "Data Path", "Link Path")
+            print row_format.format(*HEADER)
+            for row in scheme:
+                print row_format.format(*row)
+            print
+        else:
+            print "    Scheme is EMPTY."
+
+    def get_local_partition_scheme(self, test, xml_save_path, partition):
+        """Return [data_index, link_index, data_path, link_path] rows for a local test."""
+        scheme = []
+        virtual_partitions = get_local_virtual_partitions(self.partitions)
+        data_schemes = get_partition_scheme(virtual_partitions, self.base_paths)
+        link_base_schemes = get_partition_scheme(partition, self.base_paths, "data_links/" + test)
+        # Match link paths to real data paths; entries are presumably (disk, virtual, index, path).
+        offset = 0
+        group_size = len(data_schemes) / len(link_base_schemes)
+        for link_disk, link_virtual, link_index, link_path in link_base_schemes:
+            for data_disk, data_virtual, data_index, data_path in data_schemes:
+                if test == "local_speed_up" and offset <= data_index and data_index < offset + group_size:
+                    scheme.append([data_index, link_index, data_path, link_path])
+                elif test == "local_batch_scale_out" and data_index == link_index:
+                    scheme.append([data_index, link_index, data_path, link_path])
+            # Advance the speed-up window to the next group of data partitions.
+            offset += group_size
+        return scheme
     
-    def build_cluster_data_links(self, test, xml_save_path):
+    def get_cluster_partition_scheme(self, test, xml_save_path):
         node_index = self.get_current_node_index()
         if node_index == -1:
             print "Unknown host."
             return 
         
+        scheme = []
         virtual_partitions = get_cluster_virtual_partitions(self.nodes, self.partitions)
         data_paths = get_partition_paths(virtual_partitions, self.base_paths)
         link_base_paths = get_cluster_link_paths(len(self.nodes), self.base_paths, "data_links/" + test)
@@ -88,8 +122,6 @@ class WeatherBenchmark:
         # Match link paths to real data paths.
         link_base_paths.sort()
         for link_index, link_path in enumerate(link_base_paths):
-            if os.path.isdir(link_path):
-                shutil.rmtree(link_path)
             # Prep
             link_offset = link_index % len(self.nodes)
             disk_offset = link_index // len(self.nodes)
@@ -111,8 +143,33 @@ class WeatherBenchmark:
             data_paths.sort()
             for data_index, data_path in enumerate(data_paths):
                 if has_data and node_offset <= data_index and data_index < node_offset + group_size:
-                    self.add_collection_links_for(data_path, link_path, data_index)
-            self.add_collection_links_for("", link_path, -1)
+                    scheme.append([data_index, link_index, data_path, link_path])
+            scheme.append([-1, link_index, "", link_path])
+        return scheme
+    
+    def build_data_links(self, xml_save_path):
+        if (len(self.base_paths) == 0):
+            return
+        for test in self.dataset.get_tests():
+            if test in self.BENCHMARK_LOCAL_TESTS:
+                for i in self.partitions:
+                    scheme = self.get_local_partition_scheme(test, xml_save_path, i)
+                    self.build_data_links_scheme(scheme)
+            elif test in self.BENCHMARK_CLUSTER_TESTS:
+                scheme = self.get_cluster_partition_scheme(test, xml_save_path)
+                self.build_data_links_scheme(scheme)
+            else:
+                print "Unknown test."
+                exit()
+    
+    def build_data_links_scheme(self, scheme):
+        """Build all the data links in the scheme, clearing each existing link path once first."""
+        link_path_cleared = set()
+        for (data_index, partition, data_path, link_path) in scheme:
+            if link_path not in link_path_cleared and os.path.isdir(link_path):
+                shutil.rmtree(link_path)
+            link_path_cleared.add(link_path)  # Record absent paths too, so later rows keep new links.
+            self.add_collection_links_for(data_path, link_path, data_index)
     
     def get_current_node_index(self):
         found = False
@@ -172,7 +229,7 @@ class WeatherBenchmark:
     def copy_and_replace_query(self, query_path, replacement_list):
         '''Copy the query files over to the query_path and replace the path
         for the where the collection data is located.'''
-        for query_file in self.QUERY_FILE_LIST:
+        for query_file in self.QUERY_FILE_LIST + self.QUERY_UTILITY_LIST:
             shutil.copyfile(self.QUERY_MASTER_FOLDER + query_file, query_path + query_file)
         
             # Make a search replace for each collection.

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/e9805a66/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py
index a1a1aa2..52945e5 100644
--- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py
@@ -24,7 +24,6 @@ from weather_config import *
 from weather_benchmark import *
 
 DEBUG_OUTPUT = False
-COMPRESSED = False
 
 #
 # Weather conversion for GHCN-DAILY files to xml.
@@ -42,7 +41,7 @@ def main(argv):
     xml_config_path = ""
     
     try:
-        opts, args = getopt.getopt(argv, "acf:hl:m:ruvw:x:", ["file=", "locality=", "max_station_files=", "web_service=", "xml_config="])
+        opts, args = getopt.getopt(argv, "af:hl:m:ruvw:x:", ["file=", "locality=", "max_station_files=", "web_service=", "xml_config="])
     except getopt.GetoptError:
         print 'The file options for weather_cli.py were not correctly specified.'
         print 'To see a full list of options try:'
@@ -52,10 +51,9 @@ def main(argv):
         if opt == '-h':
             print 'Converting weather daily files to xml options:'
             print '    -a        Append the results to the progress file.'
-            print '    -c        Compress the produced XML file with .gz.'
             print '    -f (str)  The file name of a specific station to process.'
             print '              * Helpful when testing a single stations XML file output.'
-            print '    -l (str)  Select the locality of the scripts execution (download, progress_file, sensor_build, station_build, partition, statistics).'
+            print '    -l (str)  Select the locality of the scripts execution (download, progress_file, sensor_build, station_build, partition, partition_scheme, statistics).'
             print '    -m (int)  Limits the number of files created for each station.'
             print '              * Helpful when testing to make sure all elements are supported for each station.'
             print '              Alternate form: --max_station_files=(int)'
@@ -67,9 +65,6 @@ def main(argv):
             sys.exit()
         elif opt in ('-a', "--append"):
             append = True
-        elif opt == '-c':
-            global COMPRESSED
-            COMPRESSED = True
         elif opt in ('-f', "--file"):
             # check if file exists.
             if os.path.exists(arg):
@@ -78,7 +73,7 @@ def main(argv):
                 print 'Error: Argument must be a file name for --file (-f).'
                 sys.exit()
         elif opt in ('-l', "--locality"):
-            if arg in ("download", "progress_file", "sensor_build", "station_build", "partition", "test_links", "queries", "statistics"):
+            if arg in ("download", "progress_file", "sensor_build", "station_build", "partition", "partition_scheme", "test_links", "queries", "statistics"):
                 section = arg
             else:
                 print 'Error: Argument must be a string for --locality (-l) and a valid locality.'
@@ -144,7 +139,7 @@ def main(argv):
         os.makedirs(xml_data_save_path)
 
     # Set up the XML build objects.
-    convert = WeatherWebServiceMonthlyXMLFile(download_path, xml_data_save_path, COMPRESSED, DEBUG_OUTPUT)
+    convert = WeatherWebServiceMonthlyXMLFile(download_path, xml_data_save_path, DEBUG_OUTPUT)
     progress_file = xml_data_save_path + "_data_progress.csv"
     data = WeatherDataFiles(ghcnd_data_dly_path, progress_file)
     if section in ("all", "progress_file"):
@@ -207,15 +202,19 @@ def main(argv):
             base_paths.append(paths + dataset_folder + "/")
         benchmark = WeatherBenchmark(base_paths, dataset.get_partitions(), dataset, config.get_node_machine_list())
         
-        if section in ("all", "partition"):
+        if section in ("all", "partition", "partition_scheme"):
             slices = benchmark.get_number_of_slices()
             print 'Processing the partition section (' + dataset.get_name() + ':d' + str(len(base_paths)) + ':s' + str(slices) + ').'
             data.reset()
-            data.copy_to_n_partitions(xml_data_save_path, slices, base_paths, reset)
+            if section == "partition_scheme":
+                benchmark.print_partition_scheme(xml_data_save_path)
+            else:
+                data.copy_to_n_partitions(xml_data_save_path, slices, base_paths, reset)
     
         if section in ("all", "test_links"):
             # TODO determine current node 
             print 'Processing the test links section (' + dataset.get_name() + ').'
+            benchmark.print_partition_scheme(xml_data_save_path)
             benchmark.build_data_links(xml_data_save_path)
 
         if section in ("all", "queries"):

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/e9805a66/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
index a7adce5..c115efa 100644
--- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
@@ -17,7 +17,6 @@
 import textwrap
 from datetime import date
 import os
-import gzip
 from collections import OrderedDict
 
 # Custom modules.
@@ -104,9 +103,8 @@ class WeatherConvertToXML:
     
     token = ""
     
-    def __init__(self, base_path, save_path, compressed, debug_output):
+    def __init__(self, base_path, save_path, debug_output):
         self.save_path = save_path
-        self.compressed = compressed
         self.debug_output = debug_output
 
         # Extra support files.
@@ -134,14 +132,9 @@ class WeatherConvertToXML:
             print str(field[FIELD_INDEX_NAME]) + " = '" + row[(field[FIELD_INDEX_START] - 1):field[FIELD_INDEX_END]] + "'"
     
     def save_file(self, filename, contents):
-        if self.compressed:
-            filename = filename + '.gz'
-            file = gzip.open(filename, 'wb')
-        else:
-            file = open(filename, 'w')
+        file = open(filename, 'w')
         file.write(contents)
         file.close()
-        
         return filename
     
     def get_folder_size(self, folder_name):

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/e9805a66/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
index a9b4ecc..a68cba1 100644
--- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
@@ -318,13 +318,26 @@ class WeatherDataFiles:
         return columns[self.INDEX_DATA_FILE_NAME]
     
     
+# Index values of each field details.
+PARTITION_INDEX_DISK = 0
+PARTITION_INDEX_VIRTUAL = 1
+PARTITION_INDEX = 2
+PARTITION_INDEX_PATH = 3
+PARTITION_HEADER = ("Disk", "Virtual", "Index", "Path")
+            
 def get_partition_paths(partitions, base_paths, key="partitions"):        
     partition_paths = []
+    for scheme in get_partition_scheme(partitions, base_paths, key):
+        partition_paths.append(scheme[PARTITION_INDEX_PATH])
+    return partition_paths
+
+def get_partition_scheme(partitions, base_paths, key="partitions"):        
+    partition_scheme = []
     for i in range(0, partitions):
         for j in range(0, len(base_paths)):
             new_partition_path = base_paths[j] + key + "/" + get_partition_folder(j, partitions, i) + "/"
-            partition_paths.append(new_partition_path)
-    return partition_paths
+            partition_scheme.append((j, partitions, i, new_partition_path))
+    return partition_scheme
 
 def get_partition_folder(disks, partitions, index):        
     return "d" + str(disks) + "_p" + str(partitions) + "_i" + str(index)


[2/5] git commit: Added logging to the benchmark script and a new join query.

Posted by pr...@apache.org.
Added logging to the benchmark script and a new join query.


Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/0b653ea1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/0b653ea1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/0b653ea1

Branch: refs/heads/prestonc/hash_join
Commit: 0b653ea1b8e1b7361acf235302532f33ab26b420
Parents: 7cff5b8
Author: Preston Carman <pr...@apache.org>
Authored: Thu Mar 13 23:06:48 2014 -0700
Committer: Preston Carman <pr...@apache.org>
Committed: Thu Mar 13 23:06:48 2014 -0700

----------------------------------------------------------------------
 .../main/resources/noaa-ghcn-daily/queries/q07.xq    | 15 +++++++++++++++
 .../noaa-ghcn-daily/scripts/run_benchmark.sh         |  7 +++++--
 .../rewriter/rules/InlineNestedVariablesRule.java    |  4 ++--
 3 files changed, 22 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/0b653ea1/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq
new file mode 100644
index 0000000..412285c
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq
@@ -0,0 +1,15 @@
+(: XQuery Join Aggregate Query :)
+(: Find the lowest recorded temperature (TMIN) for each station 2001.         :)
+let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations"
+for $s in collection($station_collection)/stationCollection/stations
+
+return fn:min(
+    let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+    for $r in collection($sensor_collection)/dataCollection/data
+    
+    let $date := xs:date(fn:substring(xs:string(fn:data($r/date)), 0, 11))
+    where $s/id eq $r/station
+        and $r/dataType eq "TMIN" 
+        and fn:year-from-date($date) eq 2001
+    return $r/value
+) div 10
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/0b653ea1/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh
index 8417660..bbfe35d 100644
--- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh
@@ -32,12 +32,15 @@ then
     exit
 fi
 
-for j in $(find ${1} -name '*.xq')
+for j in $(find ${1} -name '*q??.xq')
 do
 	if [ -z "${3}" ] || [[ "${j}" =~ "${3}" ]] 
 	then
 		echo "Running query: ${j}"
-		time sh ./vxquery-cli/target/appassembler/bin/vxq ${j} ${2} -timing -showquery -frame-size 1000000 -repeatexec 10 > ${j}.log 2>&1
+		log_file="$(basename ${j}).$(date +%Y%m%d).log"
+		log_base_path=$(dirname ${j/queries/query_logs})
+		mkdir -p ${log_base_path}
+		time sh ./vxquery-cli/target/appassembler/bin/vxq ${j} ${2} -timing -showquery -frame-size 1000000 -repeatexec 10 > ${log_base_path}/${log_file} 2>&1
 	fi;
 done
 

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/0b653ea1/vxquery-core/src/main/java/org/apache/vxquery/compiler/rewriter/rules/InlineNestedVariablesRule.java
----------------------------------------------------------------------
diff --git a/vxquery-core/src/main/java/org/apache/vxquery/compiler/rewriter/rules/InlineNestedVariablesRule.java b/vxquery-core/src/main/java/org/apache/vxquery/compiler/rewriter/rules/InlineNestedVariablesRule.java
index 8cd86f8..c0c082f 100644
--- a/vxquery-core/src/main/java/org/apache/vxquery/compiler/rewriter/rules/InlineNestedVariablesRule.java
+++ b/vxquery-core/src/main/java/org/apache/vxquery/compiler/rewriter/rules/InlineNestedVariablesRule.java
@@ -62,7 +62,7 @@ public class InlineNestedVariablesRule extends InlineVariablesRule {
         }
 
         boolean modified = false;
-        // Descend into nested plans removing projects on the way.
+        // Descend into nested plans inlining along the way.
         if (op.hasNestedPlans()) {
             AbstractOperatorWithNestedPlans nestedOp = (AbstractOperatorWithNestedPlans) op;
             for (ILogicalPlan nestedPlan : nestedOp.getNestedPlans()) {
@@ -74,7 +74,7 @@ public class InlineNestedVariablesRule extends InlineVariablesRule {
             }
         }
 
-        // Descend into children removing projects on the way.
+        // Descend into children inlining along the way.
         for (Mutable<ILogicalOperator> inputOpRef : op.getInputs()) {
             if (inlineVariables(inputOpRef, context)) {
                 modified = true;


[5/5] git commit: Added join for all stations.

Posted by pr...@apache.org.
Added join for all stations.


Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/4f061948
Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/4f061948
Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/4f061948

Branch: refs/heads/prestonc/hash_join
Commit: 4f061948d4994d36772c5f4202b99d61f3676bb8
Parents: e9805a6
Author: Preston Carman <pr...@apache.org>
Authored: Fri Mar 14 11:36:09 2014 -0700
Committer: Preston Carman <pr...@apache.org>
Committed: Fri Mar 14 11:36:09 2014 -0700

----------------------------------------------------------------------
 .../resources/noaa-ghcn-daily/queries/q07.xq    | 22 +++++++++-----------
 1 file changed, 10 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/4f061948/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq
index 412285c..c4e186d 100644
--- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq
@@ -1,15 +1,13 @@
 (: XQuery Join Aggregate Query :)
-(: Find the lowest recorded temperature (TMIN) for each station 2001.         :)
+(: Find the lowest recorded temperature (TMIN) for each station on 2000-01-01.:)
 let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations"
-for $s in collection($station_collection)/stationCollection/stations
+for $s in collection($station_collection)/stationCollection/station
 
-return fn:min(
-    let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
-    for $r in collection($sensor_collection)/dataCollection/data
-    
-    let $date := xs:date(fn:substring(xs:string(fn:data($r/date)), 0, 11))
-    where $s/id eq $r/station
-        and $r/dataType eq "TMIN" 
-        and fn:year-from-date($date) eq 2001
-    return $r/value
-) div 10
\ No newline at end of file
+let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors"
+for $r in collection($sensor_collection)/dataCollection/data
+
+let $date := xs:date(fn:substring(xs:string(fn:data($r/date)), 0, 11))
+where $s/id eq $r/station
+    and $r/dataType eq "TMAX" 
+    and $date eq xs:date("2000-01-01")
+return ($s/displayName, $r/value)
\ No newline at end of file