You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@vxquery.apache.org by pr...@apache.org on 2014/10/21 21:36:05 UTC

[04/14] copy of all changes in exrt benchmark queries that is copyright free.

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
new file mode 100644
index 0000000..5db090a
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
@@ -0,0 +1,554 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import textwrap
+from datetime import date
+import os
+from collections import OrderedDict
+
+# Custom modules.
+from weather_config_ghcnd import *
+from weather_config_mshr import *
+from weather_download_files import *
+
+class WeatherConvertToXML:
+    
+    STATES = OrderedDict({
+        'AK': 'Alaska',
+        'AL': 'Alabama',
+        'AR': 'Arkansas',
+        'AS': 'American Samoa',
+        'AZ': 'Arizona',
+        'CA': 'California',
+        'CO': 'Colorado',
+        'CT': 'Connecticut',
+        'DC': 'District of Columbia',
+        'DE': 'Delaware',
+        'FL': 'Florida',
+        'GA': 'Georgia',
+        'GU': 'Guam',
+        'HI': 'Hawaii',
+        'IA': 'Iowa',
+        'ID': 'Idaho',
+        'IL': 'Illinois',
+        'IN': 'Indiana',
+        'KS': 'Kansas',
+        'KY': 'Kentucky',
+        'LA': 'Louisiana',
+        'MA': 'Massachusetts',
+        'MD': 'Maryland',
+        'ME': 'Maine',
+        'MI': 'Michigan',
+        'MN': 'Minnesota',
+        'MO': 'Missouri',
+        'MP': 'Northern Mariana Islands',
+        'MS': 'Mississippi',
+        'MT': 'Montana',
+        'NA': 'National',
+        'NC': 'North Carolina',
+        'ND': 'North Dakota',
+        'NE': 'Nebraska',
+        'NH': 'New Hampshire',
+        'NJ': 'New Jersey',
+        'NM': 'New Mexico',
+        'NV': 'Nevada',
+        'NY': 'New York',
+        'OH': 'Ohio',
+        'OK': 'Oklahoma',
+        'OR': 'Oregon',
+        'PA': 'Pennsylvania',
+        'PR': 'Puerto Rico',
+        'RI': 'Rhode Island',
+        'SC': 'South Carolina',
+        'SD': 'South Dakota',
+        'TN': 'Tennessee',
+        'TX': 'Texas',
+        'UT': 'Utah',
+        'VA': 'Virginia',
+        'VI': 'Virgin Islands',
+        'VT': 'Vermont',
+        'WA': 'Washington',
+        'WI': 'Wisconsin',
+        'WV': 'West Virginia',
+        'WY': 'Wyoming'
+    })
+    
+    MONTHS = [
+        "January",
+        "February",
+        "March",
+        "April",
+        "May",
+        "June",
+        "July",
+        "August",
+        "September",
+        "October",
+        "November",
+        "December"
+    ]
+    
+    token = ""
+    
+    def __init__(self, base_path, save_path, debug_output):
+        self.save_path = save_path
+        self.debug_output = debug_output
+
+        # Extra support files.
+        self.ghcnd_countries = base_path + '/ghcnd-countries.txt'
+        self.ghcnd_inventory = base_path + '/ghcnd-inventory.txt'
+        self.ghcnd_states = base_path + '/ghcnd-states.txt'
+        self.ghcnd_stations = base_path + '/ghcnd-stations.txt'
+        
+        # MSHR support files.
+        self.mshr_stations = base_path + '/mshr_enhanced_201402.txt'
+        
+    def set_token(self, token):
+        self.token = token
+        
+    def get_field_from_definition(self, row, field_definition):
+        return row[(field_definition[FIELD_INDEX_START] - 1):field_definition[FIELD_INDEX_END]]
+    
+    def get_field(self, fields_array, row, index):
+        return row[(fields_array[index][FIELD_INDEX_START] - 1):fields_array[index][FIELD_INDEX_END]]
+    
+    def get_dly_field(self, row, index):
+        return self.get_field(DLY_FIELDS, row, index)
+    
+    def print_row_files(self, row):
+        for field in DLY_FIELDS:
+            print str(field[FIELD_INDEX_NAME]) + " = '" + row[(field[FIELD_INDEX_START] - 1):field[FIELD_INDEX_END]] + "'"
+    
+    def save_file(self, filename, contents):
+        file = open(filename, 'w')
+        file.write(contents)
+        file.close()
+        return filename
+    
+    def get_folder_size(self, folder_name):
+        total_size = 0
+        for dirpath, dirnames, filenames in os.walk(folder_name):
+            for f in filenames:
+                fp = os.path.join(dirpath, f)
+                total_size += os.path.getsize(fp)
+        return total_size
+
+    def process_one_month_sensor_set(self, records, page):
+        # Default
+        return 0
+    
+    def process_station_data(self, row):
+        # Default
+        return 0
+    
+    def get_base_folder(self, station_id, data_type="sensors"):
+        return build_base_save_folder(self.save_path, station_id, data_type) 
+    
+    def process_inventory_file(self):
+        print "Processing inventory file"
+        file_stream = open(self.ghcnd_inventory, 'r')
+        
+        csv_header = ['ID', 'SENSORS', 'SENSORS_COUNT', 'MAX_YEARS', 'TOTAL_YEARS_FOR_ALL_SENSORS']
+        row = file_stream.readline()
+        csv_inventory = {}
+        for row in file_stream:
+            id = self.get_field_from_definition(row, INVENTORY_FIELDS['ID'])
+            sensor_id = self.get_field_from_definition(row, INVENTORY_FIELDS['ELEMENT'])
+            start = int(self.get_field_from_definition(row, INVENTORY_FIELDS['FIRSTYEAR']))
+            end = int(self.get_field_from_definition(row, INVENTORY_FIELDS['LASTYEAR']))
+            if id in csv_inventory:
+                new_count = str(int(csv_inventory[id][2]) + 1)
+                new_max = str(max(int(csv_inventory[id][3]), (end - start)))
+                new_total = str(int(csv_inventory[id][3]) + end - start)
+                csv_inventory[id] = [id, (csv_inventory[id][1] + "," + sensor_id), new_count, new_max, new_total]
+            else:
+                csv_inventory[id] = [id, sensor_id, str(1), str(end - start), str(end - start)]
+                
+        path = self.save_path + "/inventory.csv"
+        self.save_csv_file(path, csv_inventory, csv_header)
+    
+    def save_csv_file(self, path, csv_inventory, header):
+        csv_content = "|".join(header) + "\n"
+        for row_id in csv_inventory:
+            csv_content += "|".join(csv_inventory[row_id]) + "\n"
+        self.save_file(path, csv_content)
+        
+
+    def process_station_file(self, file_name):
+        print "Processing station file: " + file_name
+        file_stream = open(file_name, 'r')
+        
+        row = file_stream.readline()
+        return self.process_station_data(row)
+
+    def process_sensor_file(self, file_name, max_files, sensor_max=99):
+        print "Processing sensor file: " + file_name
+        file_stream = open(file_name, 'r')
+    
+        month_last = 0
+        year_last = 0
+        records = []
+        page = 0
+        sensor_count = 0
+    
+        file_count = 0
+        for row in file_stream:
+            month = self.get_dly_field(row, DLY_FIELD_MONTH)
+            year = self.get_dly_field(row, DLY_FIELD_YEAR)
+            
+            if (month_last != 0 and year_last != 0) and (sensor_count >= sensor_max or month != month_last or year != year_last):
+                # process set
+                file_count += self.process_one_month_sensor_set(records, page)
+                records = []
+                if sensor_count >= sensor_max and month == month_last and year == year_last:
+                    # start a new page.
+                    page += 1
+                else:
+                    # start over.
+                    page = 0
+                sensor_count = 0
+            
+            records.append(row)
+            sensor_count += 1
+            if max_files != 0 and file_count >= max_files:
+                # Stop creating more files after the max is reached.
+                break
+
+            month_last = month
+            year_last = year
+        
+        station_id = self.get_dly_field(records[0], DLY_FIELD_ID)
+        data_size = self.get_folder_size(self.get_base_folder(station_id) + "/" + station_id)
+        print "Created " + str(file_count) + " XML files for a data size of " + str(data_size) + "."
+        
+        return (file_count, data_size)
+    
+    def convert_c2f(self, c):
+        return (9 / 5 * c) + 32
+    
+    def default_xml_web_service_start(self):
+        field_xml = ""
+        field_xml += "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n"
+        return field_xml
+    
+    def default_xml_data_start(self, total_records):
+        field_xml = ""
+        field_xml += "<dataCollection pageCount=\"1\" totalCount=\"" + str(total_records) + "\">\n"
+        return field_xml
+    
+    def default_xml_station_start(self):
+        field_xml = ""
+        field_xml = "<stationCollection pageSize=\"100\" pageCount=\"1\" totalCount=\"1\">\n"
+        return field_xml
+    
+    def default_xml_field_date(self, report_date, indent=2):
+        field_xml = ""
+        field_xml += self.get_indent_space(indent) + "<date>" + str(report_date.year) + "-" + str(report_date.month).zfill(2) + "-" + str(report_date.day).zfill(2) + "T00:00:00.000</date>\n"
+        return field_xml
+    
+    def default_xml_mshr_station_additional(self, station_id):
+        """The web service station data is generate from the MSHR data supplemented with GHCN-Daily."""
+        station_mshr_row = ""
+        stations_mshr_file = open(self.mshr_stations, 'r')
+        for line in stations_mshr_file:
+            if station_id == self.get_field_from_definition(line, MSHR_FIELDS['GHCND_ID']).strip():
+                station_mshr_row = line
+                break
+        
+        if station_mshr_row == "":
+            return ""
+
+        additional_xml = ""
+
+        county = self.get_field_from_definition(station_mshr_row, MSHR_FIELDS['COUNTY']).strip()
+        if county != "":
+            additional_xml += self.default_xml_location_labels("CNTY", "FIPS:-9999", county)
+            
+        country_code = self.get_field_from_definition(station_mshr_row, MSHR_FIELDS['FIPS_COUNTRY_CODE']).strip()
+        country_name = self.get_field_from_definition(station_mshr_row, MSHR_FIELDS['FIPS_COUNTRY_NAME']).strip()
+        if country_code != "" and country_name != "":
+            additional_xml += self.default_xml_location_labels("CNTRY", "FIPS:" + country_code, country_name)
+        
+        return additional_xml
+
+    def default_xml_location_labels(self, type, id, display_name):
+        label_xml = ""
+        label_xml += self.default_xml_start_tag("locationLabels", 2)
+        label_xml += self.default_xml_element("type", type, 3)
+        label_xml += self.default_xml_element("id", id, 3)
+        label_xml += self.default_xml_element("displayName", display_name, 3)
+        label_xml += self.default_xml_end_tag("locationLabels", 2)
+        return label_xml
+        
+
+    def default_xml_web_service_station(self, station_id):
+        """The web service station data is generate from available historical sources."""
+        station_ghcnd_row = ""
+        stations_ghcnd_file = open(self.ghcnd_stations, 'r')
+        for line in stations_ghcnd_file:
+            if station_id == self.get_field_from_definition(line, STATIONS_FIELDS['ID']):
+                station_ghcnd_row = line
+                break
+    
+        xml_station = ""
+        xml_station += self.default_xml_start_tag("station", 1)
+        
+        xml_station += self.default_xml_element("id", "GHCND:" + station_id, 2)
+        xml_station += self.default_xml_element("displayName", self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['NAME']).strip(), 2)
+        xml_station += self.default_xml_element("latitude", self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['LATITUDE']).strip(), 2)
+        xml_station += self.default_xml_element("longitude", self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['LONGITUDE']).strip(), 2)
+        
+        elevation = self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['ELEVATION']).strip()
+        if elevation != "-999.9":
+            xml_station += self.default_xml_element("elevation", elevation, 2)
+        
+        state_code = self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['STATE']).strip()
+        if state_code != "" and state_code in self.STATES:
+            xml_station += self.default_xml_location_labels("ST", "FIPS:" + str(self.STATES.keys().index(state_code)), self.STATES[state_code])
+        
+        # Add the MSHR data to the station generated information.
+        xml_station += self.default_xml_mshr_station_additional(station_id)
+            
+        xml_station += self.default_xml_end_tag("station", 1)
+        return xml_station
+        
+    def default_xml_day_reading_as_field(self, row, day):
+        day_index = DLY_FIELD_DAY_OFFSET + ((day - 1) * DLY_FIELD_DAY_FIELDS)
+        value = self.get_dly_field(row, day_index);
+        if value == "-9999":
+            return ""
+    
+        field_xml = ""
+        field_id = self.get_dly_field(row, DLY_FIELD_ELEMENT)
+        if field_id in ("MDTN", "MDTX", "MNPN", "MXPN", "TMAX", "TMIN", "TOBS",):
+            # Add both the celcius and fahrenheit temperatures.
+            celcius = float(value) / 10
+            field_xml += "            <" + field_id + "_c>" + str(celcius) + "</" + field_id + "_c>\n"
+            fahrenheit = self.convert_c2f(celcius)
+            field_xml += "            <" + field_id + "_f>" + str(fahrenheit) + "</" + field_id + "_f>\n"
+        elif field_id in ("AWND", "EVAP", "PRCP", "THIC", "WESD", "WESF", "WSF1", "WSF2", "WSF5", "WSFG", "WSFI", "WSFM",):
+            # Field values that are in tenths.
+            converted_value = float(value) / 10
+            field_xml += "            <" + field_id + ">" + str(converted_value) + "</" + field_id + ">\n"
+        elif field_id in ("ACMC", "ACMH", "ACSC", "ACSH", "PSUN",):
+            # Fields is a percentage.
+            field_xml += "            <" + field_id + ">" + value.strip() + "</" + field_id + ">\n"
+        elif field_id in ("FMTM", "PGTM",):
+            # Fields is a time value HHMM.
+            field_xml += "            <" + field_id + ">" + value.strip() + "</" + field_id + ">\n"
+        elif field_id in ("DAEV", "DAPR", "DASF", "DATN", "DATX", "DAWM", "DWPR", "FRGB", "FRGT", "FRTH", "GAHT", "MDSF", "MDWM", "MDEV", "MDPR", "SNOW", "SNWD", "TSUN", "WDF1", "WDF2", "WDF5", "WDFG", "WDFI", "WDFM", "WDMV",):
+            # Fields with no alternation needed.
+            field_xml += "            <" + field_id + ">" + value.strip() + "</" + field_id + ">\n"
+        else:
+            field_xml += "            <unknown>" + field_id + "</unknown>\n"
+            
+        # print field_xml
+        return field_xml
+    
+    def default_xml_day_reading(self, row, day, indent=2):
+        day_index = DLY_FIELD_DAY_OFFSET + ((day - 1) * DLY_FIELD_DAY_FIELDS)
+        value = self.get_dly_field(row, day_index);
+        mflag = self.get_dly_field(row, day_index + 1);
+        qflag = self.get_dly_field(row, day_index + 2);
+        sflag = self.get_dly_field(row, day_index + 3);
+
+        if value == "-9999":
+            return ""
+
+        indent_space = self.get_indent_space(indent)
+        field_id = self.get_dly_field(row, DLY_FIELD_ELEMENT)
+        station_id = "GHCND:" + self.get_dly_field(row, DLY_FIELD_ID)
+    
+        field_xml = ""
+        field_xml += indent_space + "<dataType>" + field_id + "</dataType>\n"
+        field_xml += indent_space + "<station>" + station_id + "</station>\n"
+        field_xml += indent_space + "<value>" + value.strip() + "</value>\n"
+        field_xml += indent_space + "<attributes>\n"
+        field_xml += indent_space + indent_space + "<attribute>" + mflag.strip() + "</attribute>\n"
+        field_xml += indent_space + indent_space + "<attribute>" + qflag.strip() + "</attribute>\n"
+        field_xml += indent_space + indent_space + "<attribute>" + sflag.strip() + "</attribute>\n"
+        field_xml += indent_space + indent_space + "<attribute></attribute>\n"
+        field_xml += indent_space + "</attributes>\n"
+
+        # print field_xml
+        return field_xml
+    
+    def default_xml_end(self):
+        return textwrap.dedent("""\
+            </ghcnd_observation>""")
+
+    def default_xml_data_end(self):
+        return self.default_xml_end_tag("dataCollection", 0)
+
+    def default_xml_station_end(self):
+        return self.default_xml_end_tag("stationCollection", 0)
+
+    def default_xml_element(self, tag, data, indent=1):
+        return self.get_indent_space(indent) + "<" + tag + ">" + data + "</" + tag + ">\n"
+
+    def default_xml_start_tag(self, tag, indent=1):
+        return self.get_indent_space(indent) + "<" + tag + ">\n"
+
+    def default_xml_end_tag(self, tag, indent=1):
+        return self.get_indent_space(indent) + "</" + tag + ">\n"
+
+    def get_indent_space(self, indent):
+        return (" " * (4 * indent))
+    
+
+class WeatherWebServiceMonthlyXMLFile(WeatherConvertToXML):
+    """The web service class details how to create files similar to the NOAA web service."""
+    skip_downloading = False
+    # Station data
+    def process_station_data(self, row):
+        """Adds a single station record file either from downloading the data or generating a similar record."""
+        station_id = self.get_dly_field(row, DLY_FIELD_ID)
+        download = 0
+        if self.token is not "" and not self.skip_downloading:
+            download = self.download_station_data(station_id, self.token, True)
+            if download == 0:
+                self.skip_downloading = True
+        
+        # If not downloaded, generate.
+        if download != 0:
+            return download
+        else:
+            # Information for each daily file.
+            station_xml_file = self.default_xml_web_service_start()
+            station_xml_file += self.default_xml_station_start()
+            station_xml_file += self.default_xml_web_service_station(station_id)
+            station_xml_file += self.default_xml_station_end()
+            
+            # Remove white space.
+            station_xml_file = station_xml_file.replace("\n", "");
+            station_xml_file = station_xml_file.replace(self.get_indent_space(1), "");
+
+            # Make sure the station folder is available.
+            ghcnd_xml_station_path = self.get_base_folder(station_id, "stations")
+            if not os.path.isdir(ghcnd_xml_station_path):
+                os.makedirs(ghcnd_xml_station_path)
+                    
+            # Save XML string to disk.
+            save_file_name = ghcnd_xml_station_path + station_id + ".xml"
+            save_file_name = self.save_file(save_file_name, station_xml_file)
+    
+            if save_file_name is not "":
+                if self.debug_output:
+                    print "Wrote file: " + save_file_name
+                return 1
+            else:
+                return 0
+
+    # Station data
+    def download_station_data(self, station_id, token, reset=False):
+        """Downloads the station data from the web service."""
+        import time
+        time.sleep(2)
+        # Make sure the station folder is available.
+        ghcnd_xml_station_path = self.get_base_folder(station_id, "stations")
+        if not os.path.isdir(ghcnd_xml_station_path):
+            os.makedirs(ghcnd_xml_station_path)
+                
+        # Build download URL.
+        url = "http://www.ncdc.noaa.gov/cdo-services/services/datasets/GHCND/stations/GHCND:" + station_id + ".xml?token=" + token
+        url_file = urllib.urlopen(url)
+        station_xml_file = ""
+        while (True):
+            line = url_file.readline()
+            if not line:
+                break
+            station_xml_file += line
+        
+        if station_xml_file.find("<cdoError>") != -1:
+            if self.debug_output:
+                print "Error in station download"
+            return 0
+        
+        # Save XML string to disk.
+        save_file_name = ghcnd_xml_station_path + station_id + ".xml"
+        save_file_name = self.save_file(save_file_name, station_xml_file)
+    
+        if save_file_name is not "":
+            if self.debug_output:
+                print "Wrote file: " + save_file_name
+            return 2
+        else:
+            return 0
+
+    # Sensor data
+    def process_one_month_sensor_set(self, records, page):
+        """Generates records for a station using the web service xml layout."""
+        found_data = False        
+        year = int(self.get_dly_field(records[0], DLY_FIELD_YEAR))
+        month = int(self.get_dly_field(records[0], DLY_FIELD_MONTH))
+    
+        station_id = self.get_dly_field(records[0], DLY_FIELD_ID)
+
+        # Information for each daily file.
+        count = 0
+        daily_xml_file = ""
+        
+        for day in range(1, 32):
+            try:
+                # TODO find out what is a valid python date range? 1889?
+                # Attempt to see if this is valid date.
+                report_date = date(year, month, day)
+
+                for record in records:
+                    record_xml_snip = self.default_xml_day_reading(record, report_date.day)
+                    if record_xml_snip is not "":
+                        daily_xml_file += self.default_xml_start_tag("data")
+                        daily_xml_file += self.default_xml_field_date(report_date)
+                        daily_xml_file += record_xml_snip
+                        daily_xml_file += self.default_xml_end_tag("data")
+                        found_data = True
+                        count += 1
+
+            except ValueError:
+                pass
+
+        daily_xml_file = self.default_xml_web_service_start() + self.default_xml_data_start(count) + daily_xml_file + self.default_xml_data_end()
+        daily_xml_file = daily_xml_file.replace("\n", "");
+        daily_xml_file = daily_xml_file.replace(self.get_indent_space(1), "");
+
+        if not found_data:
+            return 0
+
+        # Make sure the station folder is available.
+        ghcnd_xml_station_path = self.get_base_folder(station_id) + "/" + station_id + "/" + str(report_date.year) + "/"
+        if not os.path.isdir(ghcnd_xml_station_path):
+            os.makedirs(ghcnd_xml_station_path)
+                
+        # Save XML string to disk.
+        save_file_name = ghcnd_xml_station_path + build_sensor_save_filename(station_id, report_date, page)
+        save_file_name = self.save_file(save_file_name, daily_xml_file)
+
+        if save_file_name is not "":
+            if self.debug_output:
+                print "Wrote file: " + save_file_name
+            return 1
+        else:
+            return 0
+
+def build_base_save_folder(save_path, station_id, data_type="sensors"):
+    # Default
+    station_prefix = station_id[:3]
+    return save_path + data_type + "/" + station_prefix + "/"
+
+def build_sensor_save_filename(station_id, report_date, page):
+    # Default
+    return station_id + "_" + str(report_date.year).zfill(4) + str(report_date.month).zfill(2) + "_" + str(page) + ".xml"
+

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
new file mode 100644
index 0000000..c8b0fa5
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
@@ -0,0 +1,406 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import glob
+import os.path
+import linecache
+import distutils.core
+
+from weather_convert_to_xml import *
+from collections import OrderedDict
+
+# Weather data files created to manage the conversion process.
+# Allows partition and picking up where you left off.
+class WeatherDataFiles:
+
+    LARGE_FILE_ROOT_TAG = "root"
+
+    INDEX_DATA_FILE_NAME = 0
+    INDEX_DATA_SENSORS_STATUS = 1
+    INDEX_DATA_STATION_STATUS = 2
+    INDEX_DATA_FILE_COUNT = 3
+    INDEX_DATA_FOLDER_DATA = 4
+
+    DATA_FILE_START_INDEX = 0
+    DATA_FILE_EXTENSION = ".dly"
+    DATA_FILE_MISSING = "missing"
+    DATA_FILE_INITIAL = "initialized"
+    DATA_FILE_DOWNLOADED = "downloaded"
+    DATA_FILE_GENERATED = "generated"
+    SEPERATOR = ","
+    
+    type = "sensor"
+    data_reset = False
+    
+    def __init__(self, base_path, progress_file_name="/tmp/_weather_data.csv"):
+        self.base_path = base_path
+
+        self.progress_file_name = progress_file_name
+        
+        self.current = self.DATA_FILE_START_INDEX
+        self.progress_data = []
+
+    def get_file_list_iterator(self):
+        """Return the list of files one at a time."""
+        return glob.iglob(self.base_path + "/*" + self.DATA_FILE_EXTENSION)
+
+    # Save Functions
+    def build_progress_file(self, options, convert):
+        if not os.path.isfile(self.progress_file_name) or 'reset' in options:
+            # Build a new file.
+            file = open(self.progress_file_name, 'w')
+            contents = self.get_default_progress_file_csv()
+            file.write(contents)
+            file.close()
+        elif 'append' in options or 'recalculate' in options:
+            self.open_progress_data()
+            row_count = len(self.progress_data)
+            for row in range(0, row_count):
+                row_contents = self.progress_data[row].rsplit(self.SEPERATOR)
+                file_name = row_contents[self.INDEX_DATA_FILE_NAME]
+                if self.get_file_row(file_name) < 0 and 'append' in options: 
+                    self.progress_data.append(self.get_progress_csv_row(file_name, self.DATA_FILE_INITIAL, self.DATA_FILE_INITIAL))
+                elif 'recalculate' in options:
+                    # The folder is hard coded
+                    station_id = os.path.basename(file_name).split('.')[0]
+                    folder_name = convert.get_base_folder(station_id)
+                    if os.path.exists(folder_name):
+                        row_contents = self.progress_data[row].rsplit(self.SEPERATOR)
+                        sensor_status = row_contents[self.INDEX_DATA_SENSORS_STATUS]
+                        station_status = row_contents[self.INDEX_DATA_STATION_STATUS]
+                        file_count = self.get_file_count(folder_name)
+                        data_size = self.get_folder_size(folder_name)
+                        self.progress_data[row] = self.get_progress_csv_row(file_name, sensor_status, station_status, file_count, data_size)
+                    else:
+                        self.progress_data[row] = self.get_progress_csv_row(file_name, self.DATA_FILE_INITIAL, self.DATA_FILE_INITIAL)
+            # Save file
+            self.close_progress_data(True)
+        self.reset()
+        
+    def copy_to_n_partitions(self, save_path, partitions, base_paths, reset):
+        """Once the initial data has been generated, the data can be copied into a set number of partitions. """
+        if (len(base_paths) == 0):
+            return
+        
+        # Initialize the partition paths.
+        partition_paths = get_disk_partition_paths(0, partitions, base_paths)
+        for path in partition_paths:
+            # Make sure the xml folder is available.
+            prepare_path(path, reset)
+
+        import fnmatch
+        import os
+        
+        # copy stations and sensors into each partition
+        current_sensor_partition = 0
+        current_station_partition = 0
+        self.open_progress_data()
+        row_count = len(self.progress_data)
+        for row in range(0, row_count):
+            row_contents = self.progress_data[row].rsplit(self.SEPERATOR)
+            file_name = row_contents[self.INDEX_DATA_FILE_NAME]
+            station_id = os.path.basename(file_name).split('.')[0]
+               
+            # Copy sensor files
+            type = "sensors"
+            file_path = build_base_save_folder(save_path, station_id, type) + station_id
+            for root, dirnames, filenames in os.walk(file_path):
+                for filename in fnmatch.filter(filenames, '*.xml'):
+                    xml_path = os.path.join(root, filename)
+                    new_file_base = build_base_save_folder(partition_paths[current_sensor_partition], station_id, type) + station_id
+                    if not os.path.isdir(new_file_base):
+                        os.makedirs(new_file_base)
+                    shutil.copyfile(xml_path, new_file_base + "/" + filename)
+                    current_sensor_partition += 1
+                    if current_sensor_partition >= len(partition_paths):
+                        current_sensor_partition = 0
+            
+            # Copy station files
+            type = "stations"
+            file_path = build_base_save_folder(save_path, station_id, type) + station_id + ".xml"
+            new_file_base = build_base_save_folder(partition_paths[current_station_partition], station_id, type)
+            new_file_path = new_file_base + station_id + ".xml"
+            if os.path.isfile(file_path):
+                if not os.path.isdir(new_file_base):
+                    os.makedirs(new_file_base)
+                shutil.copyfile(file_path, new_file_path)
+            current_station_partition += 1
+            if current_station_partition >= len(partition_paths):
+                current_station_partition = 0
+
+    def build_to_n_partition_files(self, save_path, partitions, base_paths, reset):
+        """Once the initial data has been generated, the data can be divided into partitions 
+        and stored in single files.
+        """
+        if (len(base_paths) == 0):
+            return
+        
+        XML_START = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>"
+        
+        partition_paths = get_disk_partition_paths(0, partitions, base_paths)
+
+        import fnmatch
+        import os
+        
+        for path in partition_paths:
+            prepare_path(path, reset)
+
+        # Initialize the partition paths.
+        types = ["sensors", "stations"]
+        for type in types:
+            partition_files = []
+            for path in partition_paths:
+                # Make sure the xml folder is available.
+                prepare_path(path + type + "/", False)
+                partition_files.append(open(path + type + "/partition.xml", 'w'))
+                partition_files[-1].write(XML_START + "<" + self.LARGE_FILE_ROOT_TAG + ">\n")
+
+            # copy into each partition
+            current_partition = 0
+            self.open_progress_data()
+            row_count = len(self.progress_data)
+            for row in range(0, row_count):
+                row_contents = self.progress_data[row].rsplit(self.SEPERATOR)
+                file_name = row_contents[self.INDEX_DATA_FILE_NAME]
+                station_id = os.path.basename(file_name).split('.')[0]
+                
+                # Copy files
+                if type == "sensors":
+                    file_path = build_base_save_folder(save_path, station_id, type) + station_id
+                    for root, dirnames, filenames in os.walk(file_path):
+                        for filename in fnmatch.filter(filenames, '*.xml'):
+                            xml_path = os.path.join(root, filename)
+                            xml_data = file_get_contents(xml_path).replace(XML_START, "") + "\n"
+                            partition_files[current_partition].write(xml_data)
+                            current_partition += 1
+                            if current_partition >= len(partition_files):
+                                current_partition = 0
+                elif type == "stations":
+                    file_path = build_base_save_folder(save_path, station_id, type) + station_id + ".xml"
+                    xml_path = os.path.join(root, file_path)
+                    xml_data = file_get_contents(xml_path).replace(XML_START, "") + "\n"
+                    partition_files[current_partition].write(xml_data)
+                    current_partition += 1
+                    if current_partition >= len(partition_paths):
+                        current_partition = 0
+
+            for row in range(0, len(partition_paths)):
+                partition_files[row].write("</" + self.LARGE_FILE_ROOT_TAG + ">\n")
+                partition_files[row].close()
+
+    def get_file_row(self, file_name):
+        for i in range(0, len(self.progress_data)):
+            if self.progress_data[i].startswith(file_name):
+                return i
+        return -1
+        
+    def get_default_progress_file_csv(self):
+        contents = ""
+        for path in self.get_file_list_iterator():
+            file_name = os.path.basename(path)
+            contents += self.get_progress_csv_row(file_name, self.DATA_FILE_INITIAL, self.DATA_FILE_INITIAL)
+        return contents
+    
+    def print_progress_file_stats(self, convert):
+        sensor_count_missing = 0
+        sensor_count = 0
+        file_count = 0
+        data_size = 0
+        
+        sensor_count_actual = 0
+        file_count_actual = 0
+        data_size_actual = 0
+        
+        station_count_missing = 0
+        station_count_generated = 0
+        station_count_downloaded = 0
+        
+        self.open_progress_data()
+        row_count = len(self.progress_data)
+        for row in range(0, row_count):
+            row_contents = self.progress_data[row].rsplit(self.SEPERATOR)
+            if int(row_contents[self.INDEX_DATA_FILE_COUNT]) != -1 and  int(row_contents[self.INDEX_DATA_FOLDER_DATA]) != -1:
+                sensor_count += 1
+                file_count += int(row_contents[self.INDEX_DATA_FILE_COUNT])
+                data_size += int(row_contents[self.INDEX_DATA_FOLDER_DATA])
+            else:
+                sensor_count_missing += 1
+                
+            if row_contents[self.INDEX_DATA_STATION_STATUS] == "generated":
+                station_count_generated += 1
+            if row_contents[self.INDEX_DATA_STATION_STATUS] == "downloaded":
+                station_count_downloaded += 1
+            else:
+                station_count_missing += 1
+
+            file_name = row_contents[self.INDEX_DATA_FILE_NAME]
+            station_id = os.path.basename(file_name).split('.')[0]
+            folder_name = convert.get_base_folder(station_id)
+            if os.path.exists(folder_name):
+                sensor_count_actual += 1
+                file_count_actual += self.get_file_count(folder_name)
+                data_size_actual += self.get_folder_size(folder_name)
+
+
+        print "Progress File:\t" + self.progress_file_name + "\n"
+        
+        print "CSV DETAILS OF PROCESSED SENSORS"
+        print "Number of stations:\t" + "{:,}".format(sensor_count)
+        print "Number of files:\t" + "{:,}".format(file_count)
+        print "Data size:\t\t" + "{:,}".format(data_size) + " Bytes\n"
+
+        print "CSV DETAILS OF unPROCESSED SENSORS"
+        print "Number of stations:\t" + "{:,}".format(sensor_count_missing) + "\n"
+
+        print "CSV DETAILS OF PROCESSED STATIONS"
+        print "Generated:\t\t" + "{:,}".format(station_count_generated)
+        print "Downloaded:\t\t" + "{:,}".format(station_count_downloaded)
+        print "Missing:\t\t" + "{:,}".format(station_count_missing) + "\n"
+
+        print "FOLDER DETAILS"
+        print "Number of stations:\t" + "{:,}".format(sensor_count_actual)
+        print "Number of files:\t" + "{:,}".format(file_count_actual)
+        print "Data size:\t\t" + "{:,}".format(data_size_actual) + " Bytes\n"
+
+    
+    def get_progress_csv_row(self, file_name, sensors_status, station_status, file_count=-1, data_size=-1):
+        return file_name + self.SEPERATOR + sensors_status + self.SEPERATOR + station_status + self.SEPERATOR + str(file_count) + self.SEPERATOR + str(data_size) + "\n"
+    
+    def update_file_sensor_status(self, file_name, sensors_status, file_count=-1, data_size=-1):
+        for row in range(0, len(self.progress_data)):
+            if self.progress_data[row].startswith(file_name):
+                station_status = self.progress_data[row].rsplit(self.SEPERATOR)[self.INDEX_DATA_STATION_STATUS]
+                self.progress_data[row] = self.get_progress_csv_row(file_name, sensors_status, station_status, file_count, data_size)
+                break
+
+        # Save the file            
+        self.close_progress_data(True)
+
+    def update_file_station_status(self, file_name, station_status):
+        for row in range(0, len(self.progress_data)):
+            if self.progress_data[row].startswith(file_name):
+                row_contents = self.progress_data[row].rsplit(self.SEPERATOR)
+                sensors_status = row_contents[self.INDEX_DATA_SENSORS_STATUS]
+                file_count = int(row_contents[self.INDEX_DATA_FILE_COUNT])
+                data_size = int(row_contents[self.INDEX_DATA_FOLDER_DATA])
+                self.progress_data[row] = self.get_progress_csv_row(file_name, sensors_status, station_status, file_count, data_size)
+                break
+
+        # Save the file            
+        self.close_progress_data(True)
+
+    def get_file_count(self, folder_name):
+        count = 0
+        for dirpath, dirnames, filenames in os.walk(folder_name):
+            for f in filenames:
+                count += 1
+        return count
+
+    def get_folder_size(self, folder_name):
+        total_size = 0
+        for dirpath, dirnames, filenames in os.walk(folder_name):
+            for f in filenames:
+                fp = os.path.join(dirpath, f)
+                total_size += os.path.getsize(fp)
+        return total_size
+
+    def get_station_status(self, return_value):
+        if return_value == 2:
+            return self.DATA_FILE_DOWNLOADED
+        elif return_value == 1:
+            return self.DATA_FILE_GENERATED
+        return self.DATA_FILE_MISSING
+        
+    
+    def open_progress_data(self):
+        with open(self.progress_file_name, 'r') as file:
+            self.progress_data = file.readlines()
+
+    def close_progress_data(self, force=False):
+        if len(self.progress_data) > 0 or force:
+            with open(self.progress_file_name, 'w') as file:
+                file.writelines(self.progress_data)
+
+    
+    def reset(self):
+        self.close_progress_data()
+
+        self.current = self.DATA_FILE_START_INDEX
+        self.open_progress_data()
+
+    def set_type(self, type):
+        self.type = type
+
+    def set_data_reset(self, data_reset):
+        self.data_reset = data_reset
+
+
+    # Iterator Functions
+    def __iter__(self):
+        return self
+
+    def next(self):
+        columns = []
+        while True:
+            # find a row that has not been created.
+            if self.current >= len(self.progress_data):
+                raise StopIteration
+            row = self.progress_data[self.current]
+            self.current += 1
+            columns = row.rsplit(self.SEPERATOR)
+            if self.type == "sensor" and (columns[self.INDEX_DATA_SENSORS_STATUS].strip() != self.DATA_FILE_GENERATED or self.data_reset):
+                break
+            elif self.type == "station" and (columns[self.INDEX_DATA_STATION_STATUS].strip() != self.DATA_FILE_DOWNLOADED or self.data_reset):
+                break
+        return columns[self.INDEX_DATA_FILE_NAME]
+    
+    
+# Index values of each field details.
+PARTITION_INDEX_NODE = 0
+PARTITION_INDEX_DISK = 1
+PARTITION_INDEX_VIRTUAL = 2
+PARTITION_INDEX = 3
+PARTITION_INDEX_PATH = 4
+PARTITION_HEADER = ("Node", "Disk", "Virtual", "Index", "Path")
+
+def get_disk_partition_paths(node_id, partitions, base_paths, key="partitions"):
+    partition_paths = []
+    for scheme in get_disk_partition_scheme(node_id, partitions, base_paths, key):
+        partition_paths.append(scheme[PARTITION_INDEX_PATH])
+    return partition_paths
+
+def get_disk_partition_scheme(node_id, virtual_disk_partitions, base_paths, key="partitions"):
+    partition_scheme = []
+    for i in range(0, virtual_disk_partitions):
+        for j in range(0, len(base_paths)):
+            new_partition_path = base_paths[j] + key + "/" + get_partition_folder(j, virtual_disk_partitions, i) + "/"
+            partition_scheme.append((node_id, j, virtual_disk_partitions, i, new_partition_path))
+    return partition_scheme
+
+def get_partition_folder(disks, partitions, index):        
+    return "d" + str(disks) + "_p" + str(partitions) + "_i" + str(index)
+
+def prepare_path(path, reset):
+    """Ensures the directory is available. If reset, then its a brand new directory."""
+    if os.path.isdir(path) and reset:
+        shutil.rmtree(path)
+                
+    if not os.path.isdir(path):
+        os.makedirs(path)
+
+def file_get_contents(filename):
+    with open(filename) as f:
+        return f.read()

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py
new file mode 100644
index 0000000..fb59b50
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import os.path
+import shutil
+import tarfile
+import urllib
+import zipfile
+
+# Custom modules.
+from weather_config_ghcnd import *
+from weather_config_mshr import *
+
+class WeatherDownloadFiles:
+
+    def __init__(self, save_path):
+        self.save_path = save_path
+        
+        if not os.path.isdir(save_path):
+            os.makedirs(save_path)
+
+
+    def download_ghcnd_files(self, reset=False):
+        """Download the complete list."""
+        for file_name in FILE_NAMES:
+            url = BASE_DOWNLOAD_URL + file_name
+            self.download_file(url, reset)
+
+    def download_mshr_files(self, reset=False):
+        for url in MSHR_URLS:
+            self.download_file(url, reset)
+
+    def download_file(self, url, reset=False):
+        """Download the file, unless it exists."""
+        file_name = self.save_path + "/" + url.split('/')[-1]
+
+        if not os.path.isfile(file_name) or reset:
+            print "Downloading: " + url
+            urllib.urlretrieve(url, file_name, report_download_status)
+            print
+
+    def unzip_ghcnd_package(self, package, reset=False):
+        """Unzip the package file, unless it exists."""
+        file_name = self.save_path + "/" + package + ".tar.gz"
+        unzipped_path = self.save_path + "/" + package
+        
+        if os.path.isdir(unzipped_path) and reset:
+            shutil.rmtree(unzipped_path)
+            
+        if not os.path.isdir(unzipped_path):
+            print "Unzipping: " + file_name
+            tar_file = tarfile.open(file_name, 'r:gz')
+            tar_file.extractall(unzipped_path)
+ 
+    def unzip_mshr_files(self, reset=False):
+        """Unzip the package file, unless it exists."""
+        for url in MSHR_URLS:
+            if url.endswith('.zip'):
+                file_name = self.save_path + "/" + url.split('/')[-1]
+                print "Unzipping: " + file_name
+                with zipfile.ZipFile(file_name, 'r') as myzip:
+                    myzip.extractall(self.save_path)
+ 
+def report_download_status(count, block, size):
+    """Report download status."""
+    line_size = 50
+    erase = "\b" * line_size
+    sys.stdout.write(erase)
+    report = get_report_line((float(count) * block / size), line_size)
+    sys.stdout.write(report)
+
+def get_report_line(percentage, line_size):
+    """Creates a string to be used in reporting the percentage done."""
+    report = ""
+    for i in range(0, line_size):
+        if (float(i) / line_size < percentage):
+            report += "="
+        else:
+            report += "-"
+    return report
+            
+def download_file_save_as(url, new_file_name, reset=False):
+    """Download the file, unless it exists."""
+    if not os.path.isfile(new_file_name) or reset:
+        print "Downloading: " + url
+        urllib.urlretrieve(url, new_file_name, report_download_status)
+        print
+

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/util/README.md
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/util/README.md b/vxquery-benchmark/src/main/resources/util/README.md
new file mode 100644
index 0000000..8e2a204
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/util/README.md
@@ -0,0 +1,28 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+Utilities for Benchmark Operations
+=====================
+
+# Introduction
+
+Helpful scripts or configuration document to work with the benchmarks.
+
+## Saxon Collection
+
+To test the data with other XQuery processors, the saxon script helps with 
+creating a collection.xml file.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/util/build_saxon_collection_xml.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/util/build_saxon_collection_xml.py b/vxquery-benchmark/src/main/resources/util/build_saxon_collection_xml.py
new file mode 100644
index 0000000..02f39ee
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/util/build_saxon_collection_xml.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import getopt, glob, os, sys
+ 
+def main(argv):
+    xml_folder = ""
+     
+    # Get the base folder
+    try:
+        opts, args = getopt.getopt(argv, "f:h", ["folder="])
+    except getopt.GetoptError:
+        print 'The file options for build_saxon_collection_xml.py were not correctly specified.'
+        print 'To see a full list of options try:'
+        print '  $ python build_saxon_collection_xml.py -h'
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            print 'Options:'
+            print '    -f        The base folder to create collection XML file.'
+            sys.exit()
+        elif opt in ('-f', "--folder"):
+            # check if file exists.
+            if os.path.exists(arg):
+                xml_folder = arg
+            else:
+                print 'Error: Argument must be a folder name for --folder (-f).'
+                sys.exit()
+  
+    # Required fields to run the script.
+    if xml_folder == "" or not os.path.exists(xml_folder):
+        print 'Error: The folder path option must be supplied:  --folder (-f).'
+        sys.exit()
+      
+    # find all XML files in folder
+    collection_xml = "<collection>"
+    for i in range(1, 5):
+        # Search the ith directory level.
+        search_pattern = xml_folder + ('/*' * i) + '.xml'
+        for file_path in glob.iglob(search_pattern):
+            collection_xml += '<doc href="' + str.replace(file_path, xml_folder, '') + '"/>'
+    collection_xml += "</collection>"
+          
+    # create collection XML
+    file = open('collection.xml', 'w')
+    file.write(collection_xml)
+    file.close()
+
+if __name__ == "__main__":
+    main(sys.argv[1:])

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/util/diff_xml_files.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/util/diff_xml_files.py b/vxquery-benchmark/src/main/resources/util/diff_xml_files.py
new file mode 100644
index 0000000..8ad2e30
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/util/diff_xml_files.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import getopt, glob, os, sys
+ 
+def main(argv):
+    f1 = ""
+    f2 = ""
+     
+    # Get the base folder
+    try:
+        opts, args = getopt.getopt(argv, "h", ["f1=", "f2="])
+    except getopt.GetoptError:
+        print 'The file options for build_saxon_collection_xml.py were not correctly specified.'
+        print 'To see a full list of options try:'
+        print '  $ python build_saxon_collection_xml.py -h'
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            print 'Options:'
+            print '    -f        The base folder to create collection XML file.'
+            sys.exit()
+        elif opt in ('--f1'):
+            # check if file exists.
+            if os.path.exists(arg):
+                f1 = arg
+            else:
+                print 'Error: Argument must be a file name for --f1.'
+                sys.exit()
+        elif opt in ('--f2'):
+            # check if file exists.
+            if os.path.exists(arg):
+                f2 = arg
+            else:
+                print 'Error: Argument must be a file name for --f2.'
+                sys.exit()
+
+    # Required fields to run the script.
+    if f1 == "" or not os.path.exists(f1):
+        print 'Error: The file path option must be supplied:  --f1.'
+        sys.exit()
+    if f2 == "" or not os.path.exists(f2):
+        print 'Error: The file path option must be supplied:  --f2.'
+        sys.exit()
+      
+    missing_in_f1 = []
+    missing_in_f2 = []
+    found_in_both = []
+    
+    with open(f1) as f:
+        content_f1 = f.readlines()
+    set_f1 = set(content_f1)
+    
+    
+    with open(f2) as f:
+        content_f2 = f.readlines()
+    set_f2 = set(content_f2)
+    
+    missing_in_f1 = set_f2.difference(set_f1)
+    missing_in_f2 = set_f1.difference(set_f2)
+    found_in_both = set_f1.intersection(set_f2)
+    
+    print ""
+    print "Missing files in " + f1
+    for f1_name in missing_in_f1:
+        print " + " + f1_name.strip()
+
+    print ""
+    print "Missing files in " + f2
+    for f2_name in missing_in_f2:
+        print " + " + f2_name.strip()
+    
+    offset = 40
+    print ""
+    print "XML Summary"
+    print (" - Found in both:").ljust(offset) + str(len(found_in_both))
+    print (" - " + f1 + " diff set vs list:").ljust(offset) + str(len(content_f1) - len(set_f1))
+    print (" - " + f2 + " diff set vs list:").ljust(offset) + str(len(content_f2) - len(set_f2))
+    print (" - " + f1 + " missing:").ljust(offset) + str(len(missing_in_f1))
+    print (" - " + f2 + " missing:").ljust(offset) + str(len(missing_in_f2))
+    
+
+if __name__ == "__main__":
+    main(sys.argv[1:])

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/util/find_averages_in_logs.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/util/find_averages_in_logs.py b/vxquery-benchmark/src/main/resources/util/find_averages_in_logs.py
new file mode 100644
index 0000000..1cd7939
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/util/find_averages_in_logs.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import fnmatch
+import getopt
+import os
+import sys
+import csv
+
+SEARCH_STRING = 'Average execution time:'
+
+def find_files(directory, pattern):
+    for root, dirs, files in os.walk(directory):
+        for basename in files:
+            if fnmatch.fnmatch(basename, pattern):
+                yield (root, basename)
+    
+    
+def main(argv):
+    ''' Collect the average execution times reported in *.log files under a
+    folder into a single CSV file.
+    '''
+    log_folder = ""
+    save_file = ""
+    data_type = ""
+    
+    # Get the base folder
+    try:
+        opts, args = getopt.getopt(argv, "f:hs:t:", ["folder=", "save_file=", "data_type="])
+    except getopt.GetoptError:
+        print 'The file options for find_averages_in_logs.py were not correctly specified.'
+        print 'To see a full list of options try:'
+        print '  $ python find_averages_in_logs.py -h'
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            print 'Options:'
+            print '    -f        The base folder to build XML file list.'
+            print '    -s        The save file.'
+            sys.exit()
+        elif opt in ('-f', "--folder"):
+            # check if file exists.
+            if os.path.exists(arg):
+                log_folder = arg
+            else:
+                print 'Error: Argument must be a folder name for --folder (-f).'
+                sys.exit()
+        elif opt in ('-s', "--save_file"):
+            save_file = arg
+        elif opt in ('-t', "--data_type"):
+            data_type = arg
+  
+    # Required fields to run the script.
+    if log_folder == "" or not os.path.exists(log_folder):
+        print 'Error: The folder path option must be supplied:  --folder (-f).'
+        sys.exit()
+    if save_file == "":
+        print 'Error: The save file option must be supplied:  --save_file (-s).'
+        sys.exit()
+      
+    with open(save_file, 'w') as outfile:
+        csvfile = csv.writer(outfile)
+        for path, filename in find_files(log_folder, '*.log'):
+            # Extract the timing line from each log file found under the folder.
+            with open(path + "/" + filename) as infile:
+                folders = path.replace(log_folder, "")
+                for line in infile:
+                    # Only process lines that report the average execution time.
+                    if line.startswith(SEARCH_STRING):
+                        time_split = line.split(" ")
+                        name_split = filename.split(".")
+                        folder_split = folders.split("/")
+
+                        # Build data row
+                        row = folder_split
+                        row.append(name_split[0])
+                        row.append(time_split[3])
+                        row.append(name_split[2])
+                        csvfile.writerow(row)
+        
+          
+if __name__ == "__main__":
+    main(sys.argv[1:])
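
For reference, here is the row construction above applied to a single made-up
log entry (the path, file name, and timing value are hypothetical; only the
SEARCH_STRING prefix comes from the script):

    line = 'Average execution time: 1234.56 ms'
    filename = 'q03.weather.log'
    folders = '/speedup/four_nodes'

    time_split = line.split(" ")      # ['Average', 'execution', 'time:', '1234.56', 'ms']
    name_split = filename.split(".")  # ['q03', 'weather', 'log']
    row = folders.split("/")          # ['', 'speedup', 'four_nodes']
    row.append(name_split[0])         # query name
    row.append(time_split[3])         # reported average time
    row.append(name_split[2])         # third dot-separated name component
    print row                         # ['', 'speedup', 'four_nodes', 'q03', '1234.56', 'log']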

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/util/list_xml_files.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/util/list_xml_files.py b/vxquery-benchmark/src/main/resources/util/list_xml_files.py
new file mode 100644
index 0000000..750a95e
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/util/list_xml_files.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import fnmatch
+import getopt
+import os
+import sys
+
+def find_files(directory, pattern):
+    for root, dirs, files in os.walk(directory):
+        for basename in files:
+            if fnmatch.fnmatch(basename, pattern):
+                yield (root, basename)
+    
+    
+def main(argv):
+    ''' Same as bash: find $FOLDER -type f -name "*.xml" -exec basename {} \; > list_xml.csv
+    '''
+    xml_folder = ""
+     
+    # Get the base folder
+    try:
+        opts, args = getopt.getopt(argv, "f:h", ["folder="])
+    except getopt.GetoptError:
+        print 'The file options for list_xml_files.py were not correctly specified.'
+        print 'To see a full list of options try:'
+        print '  $ python list_xml_files.py -h'
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            print 'Options:'
+            print '    -f        The base folder to build XML file list.'
+            sys.exit()
+        elif opt in ('-f', "--folder"):
+            # check if file exists.
+            if os.path.exists(arg):
+                xml_folder = arg
+            else:
+                print 'Error: Argument must be a folder name for --folder (-f).'
+                sys.exit()
+  
+    # Required fields to run the script.
+    if xml_folder == "" or not os.path.exists(xml_folder):
+        print 'Error: The folder path option must be supplied:  --folder (-f).'
+        sys.exit()
+      
+    list_xml_csv = ''
+    for path, filename in find_files(xml_folder, '*.xml'):
+        list_xml_csv += filename + "\n"
+        #list_xml_csv += filename + "," + path + "\n"
+          
+    # Write out the list of XML file names as CSV.
+    with open('list_xml.csv', 'w') as outfile:
+        outfile.write(list_xml_csv)
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
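
The find_files helper is the pattern all three utilities in this directory
share: os.walk for the recursive traversal, fnmatch for shell-style name
matching, and a generator so results stream instead of accumulating in a list.
A self-contained sketch (the folder path is illustrative):

    import fnmatch, os

    def find_files(directory, pattern):
        for root, dirs, files in os.walk(directory):
            for basename in files:
                if fnmatch.fnmatch(basename, pattern):
                    yield (root, basename)

    for path, filename in find_files('/tmp/xml_data', '*.xml'):
        print os.path.join(path, filename)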

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/util/merge_xml_files.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/util/merge_xml_files.py b/vxquery-benchmark/src/main/resources/util/merge_xml_files.py
new file mode 100644
index 0000000..2df026b
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/util/merge_xml_files.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import fnmatch
+import getopt
+import os
+import sys
+
+XML_PREFIX = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?><root>' + "\n"
+XML_SUFFIX = '</root>' + "\n"
+
+def find_files(directory, pattern):
+    for root, dirs, files in os.walk(directory, followlinks=True):
+        for basename in files:
+            if fnmatch.fnmatch(basename, pattern):
+                yield (root, basename)
+    
+    
+def main(argv):
+    ''' Merge all XML documents of a given data type found under a folder
+    into a single file wrapped in one root element.
+    '''
+    xml_folder = ""
+    save_file = ""
+    data_type = ""
+     
+    # Get the base folder
+    try:
+        opts, args = getopt.getopt(argv, "f:hs:t:", ["folder=", "save_file=", "data_type="])
+    except getopt.GetoptError:
+        print 'The file options for merge_xml_files.py were not correctly specified.'
+        print 'To see a full list of options try:'
+        print '  $ python merge_xml_files.py -h'
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            print 'Options:'
+            print '    -f        The base folder to build XML file list.'
+            print '    -s        The save file.'
+            print '    -t        The data type used to filter document paths.'
+            sys.exit()
+        elif opt in ('-f', "--folder"):
+            # check if file exists.
+            if os.path.exists(arg):
+                xml_folder = arg
+            else:
+                print 'Error: Argument must be a folder name for --folder (-f).'
+                sys.exit()
+        elif opt in ('-s', "--save_file"):
+            save_file = arg
+        elif opt in ('-t', "--data_type"):
+            data_type = arg
+  
+    # Required fields to run the script.
+    if xml_folder == "" or not os.path.exists(xml_folder):
+        print 'Error: The folder path option must be supplied:  --folder (-f).'
+        sys.exit()
+    if save_file == "":
+        print 'Error: The save file option must be supplied:  --save_file (-s).'
+        sys.exit()
+      
+    with open(save_file, 'w') as outfile:
+        outfile.write(XML_PREFIX)
+        for path, filename in find_files(xml_folder, '*.xml'):
+            # Only merge XML documents whose path matches the requested data type.
+            if data_type in path:
+                with open(path + "/" + filename) as infile:
+                    for line in infile:
+                        # Skip the root tags.
+                        if line != XML_PREFIX and line != XML_SUFFIX:
+                            outfile.write(line)
+        outfile.write(XML_SUFFIX)
+          
+if __name__ == "__main__":
+    main(sys.argv[1:])
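
Note the merge relies on every input document carrying exactly the one-line
prefix and suffix defined above; any other line passes through untouched. A
minimal sketch of the root-stripping filter (the payload line is made up):

    XML_PREFIX = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?><root>' + "\n"
    XML_SUFFIX = '</root>' + "\n"

    sample = [XML_PREFIX, '<data>1</data>\n', XML_SUFFIX]
    body = [line for line in sample if line != XML_PREFIX and line != XML_SUFFIX]
    print ''.join(body)  # only the payload line survives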

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/util/vxquery_functions.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/util/vxquery_functions.xq b/vxquery-benchmark/src/main/resources/util/vxquery_functions.xq
new file mode 100644
index 0000000..d0621eb
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/util/vxquery_functions.xq
@@ -0,0 +1,27 @@
+(: Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+   
+     http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License. :)
+
+(: XQuery Function List :)
+(: VXQuery function list in CSV with argument and return types :)
+let $list := "../../../../../vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-functions.xml"
+let $r :=
+    for $f in fn:doc($list)/functions/function
+        let $pl := 
+            for $p in $f/param
+            return $p/@type
+        return fn:string-join(($f/@name, fn:string-join($pl, ' '), $f/return/@type), ',')
+return fn:string-join($r , '|')
\ No newline at end of file
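
The same extraction sketched in Python for readers without an XQuery engine at
hand; the inline document is a hypothetical stand-in, and only the
functions/function/param/return shape read by the query is assumed:

    import xml.etree.ElementTree as ET

    doc = ET.fromstring(
        '<functions>'
        '<function name="fn:abs"><param type="xs:double"/>'
        '<return type="xs:double"/></function>'
        '</functions>')

    rows = []
    for f in doc.findall('function'):
        params = ' '.join(p.get('type') for p in f.findall('param'))
        rows.append(','.join([f.get('name'), params, f.find('return').get('type')]))
    print '|'.join(rows)  # fn:abs,xs:double,xs:double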

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/util/vxquery_operators.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/util/vxquery_operators.xq b/vxquery-benchmark/src/main/resources/util/vxquery_operators.xq
new file mode 100644
index 0000000..f485807
--- /dev/null
+++ b/vxquery-benchmark/src/main/resources/util/vxquery_operators.xq
@@ -0,0 +1,27 @@
+(: Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+   
+     http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License. :)
+
+(: XQuery Operator List :)
+(: VXQuery operator list in CSV with argument and return types :)
+let $list := "../../../../../vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-operators.xml"
+let $r :=
+    for $f in fn:doc($list)/operators/operator
+        let $pl := 
+            for $p in $f/param
+            return $p/@type
+        return fn:string-join(($f/@name, fn:string-join($pl, ' '), $f/return/@type), ',')
+return fn:string-join($r , '|')
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-cli/src/main/java/org/apache/vxquery/cli/VXQuery.java
----------------------------------------------------------------------
diff --git a/vxquery-cli/src/main/java/org/apache/vxquery/cli/VXQuery.java b/vxquery-cli/src/main/java/org/apache/vxquery/cli/VXQuery.java
index 8451bd5..c0ca612 100644
--- a/vxquery-cli/src/main/java/org/apache/vxquery/cli/VXQuery.java
+++ b/vxquery-cli/src/main/java/org/apache/vxquery/cli/VXQuery.java
@@ -375,6 +375,7 @@ public class VXQuery {
         ccConfig.clientNetPort = 39000;
         ccConfig.clusterNetIpAddress = "127.0.0.1";
         ccConfig.clusterNetPort = 39001;
+        ccConfig.httpPort = 39002;
         ccConfig.profileDumpPeriod = 10000;
         cc = new ClusterControllerService(ccConfig);
         cc.start();

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-server/pom.xml
----------------------------------------------------------------------
diff --git a/vxquery-server/pom.xml b/vxquery-server/pom.xml
index ef8f348..6c99712 100644
--- a/vxquery-server/pom.xml
+++ b/vxquery-server/pom.xml
@@ -47,6 +47,10 @@
             <configuration>
               <programs>
                 <program>
+                  <mainClass>org.apache.vxquery.cli.VXQueryClusterShutdown</mainClass>
+                  <name>vxqueryshutdown</name>
+                </program>
+                <program>
                   <mainClass>edu.uci.ics.hyracks.control.cc.CCDriver</mainClass>
                   <name>vxquerycc</name>
                 </program>

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-server/src/main/resources/scripts/cluster_actions.py
----------------------------------------------------------------------
diff --git a/vxquery-server/src/main/resources/scripts/cluster_actions.py b/vxquery-server/src/main/resources/scripts/cluster_actions.py
index a7cda17..deeee33 100644
--- a/vxquery-server/src/main/resources/scripts/cluster_actions.py
+++ b/vxquery-server/src/main/resources/scripts/cluster_actions.py
@@ -62,6 +62,10 @@ class ClusterActions:
         time.sleep(5)
         self.start_all_ncs()
     
+    def stop_cluster(self):
+        machine = self.ci.get_master_node_machine()
+        self.stop_cc_and_all_ncs(machine)
+    
     def stop(self):
         self.stop_all_ncs()
         time.sleep(2)
@@ -109,16 +113,22 @@ class ClusterActions:
     
     def start_cc(self, machine):
         print "Start Cluster Controller."
-        print "  " + machine.get_id() + " " + machine.get_ip() + ":" + machine.get_port()
-        command = "./vxquery-server/target/appassembler/bin/startcc.sh " + machine.get_ip() + " \"" + machine.get_port() + "\" \"" + machine.get_java_opts() + "\""
+        print "  " + machine.get_id() + " " + machine.get_client_ip() + ":" + machine.get_client_port()
+        command = "./vxquery-server/target/appassembler/bin/startcc.sh " + machine.get_client_ip() + " \"" + machine.get_client_port() + "\" \"" + machine.get_java_opts() + "\""
         self.run_remote_command(machine.get_username(), machine.get_id(), command)
     
     def start_nc(self, machine, cc):
         print "Start Node Controller."
         print "  " + machine.get_id() + " " + machine.get_ip()
-        command = "./vxquery-server/target/appassembler/bin/startnc.sh " + machine.get_id() + " " + machine.get_ip() + " " + cc.get_ip() + " \"" + cc.get_port() + "\" \"" + machine.get_java_opts() + "\""
+        command = "./vxquery-server/target/appassembler/bin/startnc.sh " + machine.get_id() + " " + machine.get_ip() + " " + cc.get_client_ip() + " \"" + cc.get_client_port() + "\" \"" + machine.get_java_opts() + "\""
         self.run_remote_command(machine.get_username(), machine.get_id(), command)
 
+    def stop_cc_and_all_ncs(self, machine):
+        print "Stop Cluster and Node Controllers."
+        print "  " + machine.get_id() + " " + machine.get_client_ip() + ":" + machine.get_client_port()
+        command = "./vxquery-server/target/appassembler/bin/stopcluster.sh " + machine.get_client_ip() + " \"" + machine.get_client_port() + "\" \"" + machine.get_java_opts() + "\""
+        self.run_remote_command(machine.get_username(), machine.get_id(), command)
+    
     def stop_cc(self, machine):
         print "Stop Cluster Controller."
         print "  " + machine.get_id() + " " + machine.get_ip()

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-server/src/main/resources/scripts/cluster_cli.py
----------------------------------------------------------------------
diff --git a/vxquery-server/src/main/resources/scripts/cluster_cli.py b/vxquery-server/src/main/resources/scripts/cluster_cli.py
index 089ad08..bd5efa6 100644
--- a/vxquery-server/src/main/resources/scripts/cluster_cli.py
+++ b/vxquery-server/src/main/resources/scripts/cluster_cli.py
@@ -39,10 +39,10 @@ def main(argv):
             sys.exit()
         elif opt in ('-a', "--action"):
             # check if file exists.
-            if arg in ('deploy', 'start', 'stop'):
+            if arg in ('deploy', 'start', 'stop', 'kill'):
                 action = arg
             else:
-                print 'Error: Argument must be a string ("deploy", "start", or "stop") for --action (-a).'
+                print 'Error: Argument must be a string ("deploy", "start", "stop", or "kill") for --action (-a).'
                 sys.exit()
         elif opt in ('-c', "--cluster"):
             # check if file exists.
@@ -72,6 +72,8 @@ def main(argv):
     if action == 'start':
         cluster.start()
     elif action == 'stop':
+        cluster.stop_cluster()
+    elif action == 'kill':
         cluster.stop()
     elif action == 'deploy':
         if deploy_path != "":

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-server/src/main/resources/scripts/cluster_information.py
----------------------------------------------------------------------
diff --git a/vxquery-server/src/main/resources/scripts/cluster_information.py b/vxquery-server/src/main/resources/scripts/cluster_information.py
index 677204b..94b231d 100644
--- a/vxquery-server/src/main/resources/scripts/cluster_information.py
+++ b/vxquery-server/src/main/resources/scripts/cluster_information.py
@@ -32,12 +32,13 @@ class ClusterInformation:
         master_node = self.config.getElementsByTagName("master_node")[0]
         id = NodeXmlReader.get_cluster_id(master_node)
         ip = NodeXmlReader.get_cluster_ip(master_node)
-        port = NodeXmlReader.get_cluster_port(master_node)
+        client_ip = NodeXmlReader.get_client_ip(master_node)
+        client_port = NodeXmlReader.get_client_port(master_node)
         java_opts = NodeXmlReader.get_java_opts(master_node)
         if java_opts is "":
             java_opts = self.get_java_opts()
         username = self.get_username()
-        return Machine(id, ip, username, port, java_opts)
+        return Machine(id, ip, username, client_ip, client_port, java_opts)
 
     def get_node_machine_list(self):
         nodes = []
@@ -48,7 +49,7 @@ class ClusterInformation:
             java_opts = NodeXmlReader.get_java_opts(node)
             if java_opts is "":
                 java_opts = self.get_java_opts()
-            nodes.append(Machine(id, ip, username, "", java_opts))
+            nodes.append(Machine(id, ip, username, "", "", java_opts))
         return nodes
 
 class NodeXmlReader(object):
@@ -64,8 +65,12 @@ class NodeXmlReader(object):
         return get_tag_text(node, "cluster_ip")
 
     @staticmethod
-    def get_cluster_port(node):
-        return get_tag_text(node, "cluster_port")
+    def get_client_ip(node):
+        return get_tag_text(node, "client_ip")
+
+    @staticmethod
+    def get_client_port(node):
+        return get_tag_text(node, "client_port")
 
     @staticmethod
     def get_java_opts(node):
@@ -90,11 +95,12 @@ class Machine:
     log_path = ""
     port = ""
     
-    def __init__(self, id, ip, username, port="", java_opts=""):
+    def __init__(self, id, ip, username, client_ip="", client_port="", java_opts=""):
         self.id = id
         self.ip = ip
         self.username = username
-        self.port = port
+        self.client_ip = client_ip
+        self.client_port = client_port
         self.java_opts = java_opts
     
     def get_id(self):
@@ -106,8 +112,11 @@ class Machine:
     def get_java_opts(self):
         return self.java_opts
     
-    def get_port(self):
-        return self.port
+    def get_client_ip(self):
+        return self.client_ip
+    
+    def get_client_port(self):
+        return self.client_port
     
     def get_username(self):
         return self.username

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-server/src/main/resources/scripts/startcc.sh
----------------------------------------------------------------------
diff --git a/vxquery-server/src/main/resources/scripts/startcc.sh b/vxquery-server/src/main/resources/scripts/startcc.sh
index 002055c..dd1e5be 100755
--- a/vxquery-server/src/main/resources/scripts/startcc.sh
+++ b/vxquery-server/src/main/resources/scripts/startcc.sh
@@ -23,7 +23,7 @@ CCHOST=$1
 CCPORT=$2
 J_OPTS=$3
 
-#Export JAVA_HOME
+# Export JAVA_HOME
 export JAVA_HOME=${JAVA_HOME}
 
 # java opts added parameters
@@ -43,8 +43,8 @@ mkdir -p ${CCLOGS_DIR}
 CC_OPTIONS=" -client-net-ip-address ${CCHOST} -cluster-net-ip-address ${CCHOST} "
 if [ ! -z "${CCPORT}" ]
 then
-    CC_OPTIONS=" ${CC_OPTIONS} -cluster-net-port ${CCPORT} "
+    CC_OPTIONS=" ${CC_OPTIONS} -client-net-port ${CCPORT} "
 fi
 
-#Launch hyracks cc script without toplogy
+# Launch hyracks cc script without topology
 ${VXQUERY_HOME}/vxquery-server/target/appassembler/bin/vxquerycc ${CC_OPTIONS} &> ${CCLOGS_DIR}/cc_$(date +%Y%m%d%H%M).log &

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-server/src/main/resources/scripts/startnc.sh
----------------------------------------------------------------------
diff --git a/vxquery-server/src/main/resources/scripts/startnc.sh b/vxquery-server/src/main/resources/scripts/startnc.sh
index c2bda3c..260512e 100755
--- a/vxquery-server/src/main/resources/scripts/startnc.sh
+++ b/vxquery-server/src/main/resources/scripts/startnc.sh
@@ -25,7 +25,7 @@ CCHOST=$3
 CCPORT=$4
 J_OPTS=$5
 
-#Set JAVA_HOME
+# Set JAVA_HOME
 export JAVA_HOME=$JAVA_HOME
 
 # java opts added parameters
@@ -49,5 +49,5 @@ then
 fi
 
 
-#Launch hyracks nc
+# Launch hyracks nc
 ${VXQUERY_HOME}/vxquery-server/target/appassembler/bin/vxquerync ${NC_OPTIONS} &> ${NCLOGS_DIR}/nc_$(date +%Y%m%d%H%M).log &

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-server/src/main/resources/scripts/stopcc.sh
----------------------------------------------------------------------
diff --git a/vxquery-server/src/main/resources/scripts/stopcc.sh b/vxquery-server/src/main/resources/scripts/stopcc.sh
index 3290ec6..f2b6883 100755
--- a/vxquery-server/src/main/resources/scripts/stopcc.sh
+++ b/vxquery-server/src/main/resources/scripts/stopcc.sh
@@ -21,8 +21,7 @@ hostname
 
 USER=$1
 
-#Kill process
-#Kill process
+# Kill process
 PID=`ps -ef|grep ${USER}|grep java|grep 'Dapp.name=vxquerycc'|awk '{print $2}'`
 
 if [ "$PID" == "" ]; then

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-server/src/main/resources/scripts/stopcluster.sh
----------------------------------------------------------------------
diff --git a/vxquery-server/src/main/resources/scripts/stopcluster.sh b/vxquery-server/src/main/resources/scripts/stopcluster.sh
new file mode 100755
index 0000000..238da7f
--- /dev/null
+++ b/vxquery-server/src/main/resources/scripts/stopcluster.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#      http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+CCHOST=$1
+CCPORT=$2
+J_OPTS=$3
+
+# Export JAVA_HOME
+export JAVA_HOME=${JAVA_HOME}
+
+# java opts added parameters
+if [ ! -z "${J_OPTS}" ]
+then
+    JAVA_OPTS="${JAVA_OPTS} ${J_OPTS}"
+    export JAVA_OPTS
+fi
+
+VXQUERY_HOME=`pwd`
+CCLOGS_DIR=${VXQUERY_HOME}/logs
+
+# logs dir
+mkdir -p ${CCLOGS_DIR}
+
+# Set up the options for the cc.
+CC_OPTIONS=" -client-net-ip-address ${CCHOST} "
+if [ ! -z "${CCPORT}" ]
+then
+    CC_OPTIONS=" ${CC_OPTIONS} -client-net-port ${CCPORT} "
+fi
+
+# Launch the cluster shutdown client.
+echo "${VXQUERY_HOME}/vxquery-server/target/appassembler/bin/vxqueryshutdown ${CC_OPTIONS} &> ${CCLOGS_DIR}/shutdown_$(date +%Y%m%d%H%M).log &"
+${VXQUERY_HOME}/vxquery-server/target/appassembler/bin/vxqueryshutdown ${CC_OPTIONS} &> ${CCLOGS_DIR}/shutdown_$(date +%Y%m%d%H%M).log &

http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-server/src/main/resources/scripts/stopnc.sh
----------------------------------------------------------------------
diff --git a/vxquery-server/src/main/resources/scripts/stopnc.sh b/vxquery-server/src/main/resources/scripts/stopnc.sh
index 56ffc66..8f29de5 100755
--- a/vxquery-server/src/main/resources/scripts/stopnc.sh
+++ b/vxquery-server/src/main/resources/scripts/stopnc.sh
@@ -21,7 +21,7 @@ hostname
 
 USER=$1
 
-#Kill process
+# Kill process
 PID=`ps -ef|grep ${USER}|grep java|grep 'Dapp.name=vxquerync'|awk '{print $2}'`
 
 if [ "$PID" == "" ]; then