You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@vxquery.apache.org by pr...@apache.org on 2013/09/30 21:25:16 UTC

svn commit: r1527729 - in /incubator/vxquery/trunk/vxquery/vxquery-benchmark: ./ src/ src/main/ src/main/resources/ src/main/resources/noaa-ghcn-daily/ src/main/resources/noaa-ghcn-daily/queries/ src/main/resources/noaa-ghcn-daily/scripts/

Author: prestonc
Date: Mon Sep 30 19:25:15 2013
New Revision: 1527729

URL: http://svn.apache.org/r1527729
Log:
Adding a benchmark directory to hold future XQuery benchmarks. The code for developing the first benchmark, on weather data from NOAA, is listed under resources. This is the first update toward finalizing this benchmark and providing the code to the public.

Added:
    incubator/vxquery/trunk/vxquery/vxquery-benchmark/
    incubator/vxquery/trunk/vxquery/vxquery-benchmark/pom.xml   (with props)
    incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/
    incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/
    incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/
    incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/
    incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/
    incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q00.xq   (with props)
    incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/
    incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/README.md   (with props)
    incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py   (with props)
    incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py   (with props)
    incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py   (with props)
    incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_dly_config.py   (with props)
    incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py   (with props)

Added: incubator/vxquery/trunk/vxquery/vxquery-benchmark/pom.xml
URL: http://svn.apache.org/viewvc/incubator/vxquery/trunk/vxquery/vxquery-benchmark/pom.xml?rev=1527729&view=auto
==============================================================================
--- incubator/vxquery/trunk/vxquery/vxquery-benchmark/pom.xml (added)
+++ incubator/vxquery/trunk/vxquery/vxquery-benchmark/pom.xml Mon Sep 30 19:25:15 2013
@@ -0,0 +1,88 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>org.apache.vxquery</groupId>
+  <artifactId>apache-vxquery-benchmark</artifactId>
+  <packaging>jar</packaging>
+  <version>0.1-incubating-SNAPSHOT</version>
+  <name>VXQuery Benchmark</name>
+  <description>Apache VXQuery Benchmark</description>
+
+  <parent>
+    <groupId>org.apache.vxquery</groupId>
+    <artifactId>apache-vxquery-parent</artifactId>
+    <version>0.1-incubating-SNAPSHOT</version>
+    <relativePath>../vxquery-parent</relativePath>
+  </parent>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>appassembler-maven-plugin</artifactId>
+        <version>1.1.1</version>
+        <executions>
+          <execution>
+            <configuration>
+              <programs>
+                <program>
+                  <mainClass>org.apache.vxquery.xtest.Main</mainClass>
+                  <name>xtest</name>
+                </program>
+              </programs>
+              <repositoryLayout>flat</repositoryLayout>
+              <repositoryName>lib</repositoryName>
+            </configuration>
+            <phase>package</phase>
+            <goals>
+              <goal>assemble</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-site-plugin</artifactId>
+        <version>3.0</version>
+      </plugin>
+    </plugins>
+  </build>
+  
+  <reporting>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-project-info-reports-plugin</artifactId>
+        <version>2.4</version>
+        <reportSets>
+          <reportSet>
+            <reports>
+              <report>index</report>
+              <report>dependencies</report>
+              <report>plugins</report>
+            </reports>
+          </reportSet>
+        </reportSets>
+        <configuration>
+            <linkOnly>true</linkOnly>
+        </configuration>            
+      </plugin>
+    </plugins>
+  </reporting>
+</project>

Propchange: incubator/vxquery/trunk/vxquery/vxquery-benchmark/pom.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q00.xq
URL: http://svn.apache.org/viewvc/incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q00.xq?rev=1527729&view=auto
==============================================================================
--- incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q00.xq (added)
+++ incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q00.xq Mon Sep 30 19:25:15 2013
@@ -0,0 +1,5 @@
+(: q00: simple station lookup.
+   Scans the GHCN-Daily observation collection and returns the name
+   element of the station with id AG000060390.
+   Replace "/path/to/data" with the local collection path. :)
+let $collection := "/path/to/data"
+return
+	for $r in collection($collection)/ghcnd_observation
+	where $r/station_id = "AG000060390"
+	return $r/name
\ No newline at end of file

Propchange: incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q00.xq
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/README.md
URL: http://svn.apache.org/viewvc/incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/README.md?rev=1527729&view=auto
==============================================================================
--- incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/README.md (added)
+++ incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/README.md Mon Sep 30 19:25:15 2013
@@ -0,0 +1,23 @@
+Weather Data Conversion To XML
+=====================
+
+# Introduction
+
+The NOAA has hosted DAILY GLOBAL HISTORICAL CLIMATOLOGY NETWORK (GHCN-DAILY) 
+.dat files. Weather.gov has an RSS/XML feed that gives current weather sensor 
+readings. Using the RSS feed as a template, the GHCN-DAILY historical 
+information is used to generate past RSS feed XML documents. The process allows 
+testing on a large set of information without having to continually monitor 
+the weather.gov site for all the weather details for years.
+
+# Detailed Description
+
+Detailed GHCN-DAILY information: 
+<http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt>
+
+The process takes a save folder for the data. The folder contains several 
+folders:
+
+ - downloads (All files taken from the NOAA HTTP site)
+ - 1\_node\_{package}\_xml{\_compression} (The generated xml files for a given 
+     package)
\ No newline at end of file

Propchange: incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/README.md
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py
URL: http://svn.apache.org/viewvc/incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py?rev=1527729&view=auto
==============================================================================
--- incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py (added)
+++ incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py Mon Sep 30 19:25:15 2013
@@ -0,0 +1,196 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys, getopt
+
+# Custom modules.
+from weather_data_files import *
+from weather_download_files import *
+from weather_convert_to_xml import *
+
+DEBUG_OUTPUT = False
+COMPRESSED = False
+
+#
+# Weather conversion for GHCN-DAILY files to xml.
+#
+# http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt
+#
+def main(argv):
+    append = True
+    download_reset = False
+    max_records = 0
+    no_data_processing = False
+    nodes = 0
+    package = "ghcnd_gsn"
+    partition_stations = False
+    partitions = 0
+    print_stats = False
+    process_file_name = ""
+    reset = False
+    save_path = "/tmp"
+    update = False
+    
+    try:
+        opts, args = getopt.getopt(argv, "a:cdf:lhm:no:p:s:tru", ["max_station_files=", "file=", "save_directory=", "package=", "partitions=", "nodes="])
+    except getopt.GetoptError:
+        print 'The file options for weather_cli.py were not correctly specified.'
+        print 'To see a full list of options try:'
+        print '  $ python weather_cli.py -h'
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            print 'Converting weather daily files to xml options:'
+            print '    -a (int)  The number of partitions for creating split up data progress csv files. Used with -o'
+            print '    -c        Compress the produced XML file with .gz.'
+            print '    -d        Extra debug information.'
+            print '    -f (str)  The file name of a specific station to process.'
+            print '              * Helpful when testing a single stations XML file output.'
+            print '    -l        Reset download.'
+            print '    -m (int)  Limits the number of files created for each station.'
+            print '              * Helpful when testing to make sure all elements are supported for each station.'
+            print '              Alternate form: --max_station_files=(int)'
+            print '    -n        Do not process data files. (Used to only update the data processing file.)'
+            print '    -o (int)  The number of nodes for creating split up data progress csv files. Used with -a'
+            print '    -p (str)  The package used to generate files. (all, gsn, hcn)'
+            print '    -r        Build a new data progress file. (reset)'
+            print '    -s (str)  The directory for saving the downloaded and created XML files.'
+            print '    -t        Print the statistics of the data progress file.'
+            print '    -u        Recalculate the file count and data size for each data source file.'
+            sys.exit()
+        elif opt in ('-a', "--partitions"):
+            no_data_processing = True
+            partition_stations = True
+            append = False
+            partitions = int(arg)
+        elif opt == '-c':
+            global COMPRESSED
+            COMPRESSED = True
+        elif opt == '-d':
+            global DEBUG_OUTPUT
+            DEBUG_OUTPUT = True
+        elif opt in ('-f', "--file"):
+            # check if file exists.
+            if os.path.exists(arg):
+                process_file_name = arg
+            else:
+                print 'Error: Argument must be a file name for --file (-f).'
+                sys.exit()
+        elif opt == '-l':
+            download_reset = True
+        elif opt in ('-m', "--max_station_files"):
+            if arg.isdigit():
+                max_records = int(arg)
+            else:
+                print 'Error: Argument must be an integer for --max_station_files (-m).'
+                sys.exit()
+        elif opt == '-n':
+            no_data_processing = True
+        elif opt in ('-o', "--nodes"):
+            no_data_processing = True
+            partition_stations = True
+            append = False
+            nodes = int(arg)
+        elif opt in ('-p', "--package"):
+            if arg in ("all", "gsn", "hcn"):
+                package = "ghcnd_" + arg
+            else:
+                print 'Error: Argument must be an string for one of the known weather packages: "all", "gsn", "hcn"'
+                sys.exit()
+        elif opt == '-r':
+            reset = True
+        elif opt in ('-s', "--save_directory"):
+            # check if file exists.
+            if os.path.exists(arg):
+                save_path = arg
+            else:
+                print 'Error: Argument must be a directory for --save_directory (-s).'
+                sys.exit()
+        elif opt == '-t':
+            no_data_processing = True
+            print_stats = True
+        elif opt == '-u':
+            update = True
+
+    # Required fields to run the script.
+    if save_path == "" or not os.path.exists(save_path):
+        print 'Error: The save directory option must be supplied: --save_directory (-s).'
+        sys.exit()
+
+    # Set up downloads folder.
+    download_path = save_path + "/downloads"
+    download = WeatherDownloadFiles(download_path)
+    download.download_all_files(download_reset)
+
+    # Unzip the required file.
+    download.unzip_package(package, download_reset)
+
+
+    # Create some basic paths for save files and references.
+    ghcnd_data_dly_path = download_path + '/' + package + '/' + package
+    ghcnd_xml_path = save_path + "/1_node_" + package + '_xml/'
+    ghcnd_xml_gz_path = save_path + "/1_node_" + package + '_xml_gz/'
+    if COMPRESSED:
+        xml_data_save_path = ghcnd_xml_gz_path
+    else:
+        xml_data_save_path = ghcnd_xml_path
+
+    # Make sure the xml folder is available.
+    if not os.path.isdir(xml_data_save_path):
+        os.makedirs(xml_data_save_path)
+
+
+    # Set up the XML build objects.
+    convert = WeatherDailyXMLFile(download_path, xml_data_save_path, COMPRESSED, DEBUG_OUTPUT)
+    progress_file = xml_data_save_path + "_data_progress.csv"
+    data = WeatherDataFiles(ghcnd_data_dly_path, progress_file)
+    options = list()
+    if append:
+        options.append('append')
+    if update:
+        options.append('recalculate')
+    if reset:
+        options.append('reset')
+    data.build_progress_file(options, convert)
+    
+    if not no_data_processing:
+        if process_file_name is not "":
+            # process a single file
+            if os.path.exists(process_file_name):
+                (file_count, data_size) = convert.process_file(process_file_name, max_records)
+                data.update_file_status(process_file_name, WeatherDataFiles.DATA_FILE_CREATED, file_count, data_size)
+            else:
+                data.update_file_status(process_file_name, WeatherDataFiles.DATA_FILE_MISSING)
+        else:
+            # process directory
+            data.reset()
+            for file_name in data:
+                file_path = ghcnd_data_dly_path + '/' + file_name
+                if os.path.exists(file_path):
+                    (file_count, data_size) = convert.process_file(file_path, max_records)
+                    data.update_file_status(file_name, WeatherDataFiles.DATA_FILE_CREATED, file_count, data_size)
+                else:
+                    data.update_file_status(file_name, WeatherDataFiles.DATA_FILE_MISSING)
+                
+    elif print_stats:
+        data.print_progress_file_stats(convert)
+    elif partition_stations and nodes > 0 and partitions > 0:
+        data.reset()
+        data.build_partition_structure(nodes, partitions)
+
+                
+if __name__ == "__main__":
+    main(sys.argv[1:])

Propchange: incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
URL: http://svn.apache.org/viewvc/incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py?rev=1527729&view=auto
==============================================================================
--- incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py (added)
+++ incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py Mon Sep 30 19:25:15 2013
@@ -0,0 +1,372 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import textwrap
+from datetime import date
+import os
+import gzip
+
+# Custom modules.
+from weather_dly_config import *
+
+class WeatherConvertToXML:
+    
+    MONTHS = [
+        "January",
+        "February",
+        "March",
+        "April",
+        "May",
+        "June",
+        "July",
+        "August",
+        "September",
+        "October",
+        "November",
+        "December"
+    ]
+    
+    def __init__(self, base_path, save_path, compressed, debug_output):
+        self.save_path = save_path
+        self.compressed = compressed
+        self.debug_output = debug_output
+
+        # Extra support files.
+        self.ghcnd_countries = base_path + '/ghcnd-countries.txt'
+        self.ghcnd_states = base_path + '/ghcnd-states.txt'
+        self.ghcnd_stations = base_path + '/ghcnd-stations.txt'
+        
+        
+    def get_field_from_definition(self, row, field_definition):
+        return row[(field_definition[FIELD_INDEX_START] - 1):field_definition[FIELD_INDEX_END]]
+    
+    def get_field(self, fields_array, row, index):
+        return row[(fields_array[index][FIELD_INDEX_START] - 1):fields_array[index][FIELD_INDEX_END]]
+    
+    def get_dly_field(self, row, index):
+        return self.get_field(DLY_FIELDS, row, index)
+    
+    def print_row_files(self, row):
+        for field in DLY_FIELDS:
+            print str(field[FIELD_INDEX_NAME]) + " = '" + row[(field[FIELD_INDEX_START] - 1):field[FIELD_INDEX_END]] + "'"
+    
+    def save_file(self, filename, contents):
+        if self.compressed:
+            filename = filename + '.gz'
+            file = gzip.open(filename, 'wb')
+        else:
+            file = open(filename, 'w')
+        file.write(contents)
+        file.close()
+        
+        return filename
+    
+    def get_folder_size(self, folder_name):
+        total_size = 0
+        for dirpath, dirnames, filenames in os.walk(folder_name):
+            for f in filenames:
+                fp = os.path.join(dirpath, f)
+                total_size += os.path.getsize(fp)
+        return total_size
+
+    def process_one_month_sensor_set(self, records):
+        # Default
+        return 0
+    
+    def get_base_folder(self, station_id):
+        # Default
+        station_prefix = station_id[:3]
+        return self.save_path + station_prefix + "/" + station_id + "/" 
+    
+    def process_file(self, file_name, max_files):
+        print "Processing file: " + file_name
+        file_stream = open(file_name, 'r')
+    
+        month_last = 0
+        year_last = 0
+        records = []
+    
+        file_count = 0
+        for row in file_stream:
+            month = self.get_dly_field(row, DLY_FIELD_MONTH)
+            year = self.get_dly_field(row, DLY_FIELD_YEAR)
+            
+            if (month_last == 0 and year_last == 0) or (month == month_last and year == year_last):
+                records.append(row)
+            else:
+                # process set and start over.
+                file_count += self.process_one_month_sensor_set(records)
+                records = []
+                records.append(row)
+                   
+                if max_files != 0 and file_count >= max_files:
+                    # Stop creating more files after the max is reached.
+                    break
+        
+            month_last = month
+            year_last = year
+        
+        station_id = self.get_dly_field(records[0], DLY_FIELD_ID)
+        data_size = self.get_folder_size(self.get_base_folder(station_id))
+        print "Created " + str(file_count) + " XML files for a data size of " + str(data_size) + "."
+        
+        return (file_count, data_size)
+    
+    def convert_c2f(self, c):
+        return (9 / 5 * c) + 32
+    
+    def default_xml_start(self):
+        return textwrap.dedent("""\
+            <?xml version="1.0" encoding="ISO-8859-1"?>
+            <ghcnd_observation version="1.0"
+                 xmlns:xsd="http://www.w3.org/2001/XMLSchema"
+                 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+                <credit>NOAA's National Climatic Data Center (NCDC)</credit>
+                <credit_URL>http://www.ncdc.noaa.gov/</credit_URL>
+            """)
+    
+    def default_xml_field_date(self, report_date):
+        field_xml = ""
+        field_xml += "    <observation_date>" + self.MONTHS[report_date.month - 1] + " " + str(report_date.day) + ", " + str(report_date.year) + "</observation_date>\n"
+        field_xml += "    <observation_date_iso8601>" + report_date.isoformat() + "</observation_date_iso8601>\n"
+        return field_xml
+    
+    def default_xml_field_date_year(self, year):
+        field_xml = ""
+        field_xml += "    <observation_year>" + str(year) + "</observation_year>\n"
+        return field_xml
+
+    def default_xml_field_date_month(self, month):
+        field_xml = ""
+        field_xml += "    <observation_month>" + str(month) + "</observation_month>\n"
+        return field_xml
+
+    def default_xml_field_date_day(self, day):
+        field_xml = ""
+        field_xml += "            <observation_day>" + str(day) + "</observation_day>\n"
+        return field_xml
+    
+    
+    def default_xml_field_station(self, station_id):
+        station_row = ""
+        stations_file = open(self.ghcnd_stations, 'r')
+        
+        for line in stations_file:
+            if station_id == self.get_field_from_definition(line, STATIONS_FIELDS['ID']):
+                station_row = line
+                break
+        
+        field_xml = ""
+        field_xml += "    <station_id>" + station_id + "</station_id>\n"
+        field_xml += "    <location>\n"
+        field_xml += "        <latitude>" + self.get_field_from_definition(station_row, STATIONS_FIELDS['LATITUDE']).strip() + "</latitude>\n"
+        field_xml += "        <longitude>" + self.get_field_from_definition(station_row, STATIONS_FIELDS['LONGITUDE']).strip() + "</longitude>\n"
+        
+        elevation = self.get_field_from_definition(station_row, STATIONS_FIELDS['ELEVATION']).strip()
+        if elevation != "-999.9":
+            field_xml += "        <elevation>" + elevation + "</elevation>\n"
+        
+        field_xml += "    </location>\n"
+        field_xml += "    <name>" + self.get_field_from_definition(station_row, STATIONS_FIELDS['NAME']).strip() + "</name>\n"
+    
+        state = self.get_field_from_definition(station_row, STATIONS_FIELDS['STATE'])
+        if state.strip():
+            field_xml += "    <state>" + state + "</state>\n"
+    
+        gsn = self.get_field_from_definition(station_row, STATIONS_FIELDS['GSNFLAG'])
+        if gsn.strip():
+            field_xml += "    <gsn />\n"
+    
+        hcn = self.get_field_from_definition(station_row, STATIONS_FIELDS['HCNFLAG'])
+        if hcn.strip():
+            field_xml += "    <hcn />\n"
+    
+        wmoid = self.get_field_from_definition(station_row, STATIONS_FIELDS['WMOID'])
+        if wmoid.strip():
+            field_xml += "    <wmoid id=\"" + wmoid + "\" />\n"
+    
+        return field_xml
+    
+    def default_xml_start_sensors(self):
+        return "    <sensors>\n"
+
+    def default_xml_start_readings(self):
+        return "    <readings>\n"
+
+    def default_xml_start_reading(self):
+        return "        <reading>\n"
+    
+    def default_xml_field_day(self, row, day):
+        day_index = DLY_FIELD_DAY_OFFSET + ((day - 1) * DLY_FIELD_DAY_FIELDS)
+        value = self.get_dly_field(row, day_index);
+        if value == "-9999":
+            return ""
+    
+        field_xml = ""
+        field_id = self.get_dly_field(row, DLY_FIELD_ELEMENT)
+        if field_id in ("MDTN", "MDTX", "MNPN", "MXPN", "TMAX", "TMIN", "TOBS",):
+            # Add both the celcius and fahrenheit temperatures.
+            celcius = float(value) / 10
+            field_xml += "            <" + field_id + "_c>" + str(celcius) + "</" + field_id + "_c>\n"
+            fahrenheit = self.convert_c2f(celcius)
+            field_xml += "            <" + field_id + "_f>" + str(fahrenheit) + "</" + field_id + "_f>\n"
+        elif field_id in ("AWND", "EVAP", "PRCP", "THIC", "WESD", "WESF", "WSF1", "WSF2", "WSF5", "WSFG", "WSFI", "WSFM",):
+            # Field values that are in tenths.
+            converted_value = float(value) / 10
+            field_xml += "            <" + field_id + ">" + str(converted_value) + "</" + field_id + ">\n"
+        elif field_id in ("ACMC", "ACMH", "ACSC", "ACSH", "PSUN",):
+            # Fields is a percentage.
+            field_xml += "            <" + field_id + ">" + value.strip() + "</" + field_id + ">\n"
+        elif field_id in ("FMTM", "PGTM",):
+            # Fields is a time value HHMM.
+            field_xml += "            <" + field_id + ">" + value.strip() + "</" + field_id + ">\n"
+        elif field_id in ("DAEV", "DAPR", "DASF", "DATN", "DATX", "DAWM", "DWPR", "FRGB", "FRGT", "FRTH", "GAHT", "MDSF", "MDWM", "MDEV", "MDPR", "SNOW", "SNWD", "TSUN", "WDF1", "WDF2", "WDF5", "WDFG", "WDFI", "WDFM", "WDMV",):
+            # Fields with no alternation needed.
+            field_xml += "            <" + field_id + ">" + value.strip() + "</" + field_id + ">\n"
+        else:
+            field_xml += "            <unknown>" + field_id + "</unknown>\n"
+            
+        # print field_xml
+        return field_xml
+    
+    def default_xml_end_reading(self):
+        return "        </reading>\n"
+
+    def default_xml_end_readings(self):
+        return "    </readings>\n"
+    
+    def default_xml_end_sensors(self):
+        return "    </sensors>\n"
+    
+    def default_xml_end(self):
+        return textwrap.dedent("""\
+            </ghcnd_observation>""")
+
+class WeatherDailyXMLFile(WeatherConvertToXML):
+    def process_one_month_sensor_set(self, records):
+        year = int(self.get_dly_field(records[0], DLY_FIELD_YEAR))
+        month = int(self.get_dly_field(records[0], DLY_FIELD_MONTH))
+    
+        station_id = self.get_dly_field(records[0], DLY_FIELD_ID)
+        
+        count = 0
+        for day in range(1, 32):
+            try:
+                # TODO find out what is a valid python date range? 1889?
+                # Attempt to see if this is valid date.
+                report_date = date(year, month, day)
+                save_file_name = self.process_one_day(records, report_date)
+                if save_file_name is not "":
+                    count = count + 1
+                    if self.debug_output:
+                        print "Wrote file: " + save_file_name
+            except ValueError:
+                if self.debug_output:
+                    print "Error: Not a valid date (" + str(month) + "/" + str(day) + "/" + str(year) + ") for " + station_id + "."
+                pass
+        return count
+
+    def process_one_day(self, records, report_date):
+        station_id = self.get_dly_field(records[0], DLY_FIELD_ID)
+        found_data = False
+                
+        # Information for each daily file.
+        daily_xml_file = self.default_xml_start()
+        daily_xml_file += self.default_xml_field_station(station_id)
+        daily_xml_file += self.default_xml_field_date(report_date)
+        daily_xml_file += self.default_xml_start_sensors()
+        for record in records:
+            record_xml_snip = self.default_xml_field_day(record, report_date.day)
+            if record_xml_snip is not "":
+                found_data = True
+            daily_xml_file += record_xml_snip
+        daily_xml_file += self.default_xml_end_sensors()
+        daily_xml_file += self.default_xml_end()
+        
+        if not found_data:
+            return ""
+        
+        # Make sure the station folder is available.
+        ghcnd_xml_station_path = self.get_base_folder(station_id) + str(report_date.year) + "/"
+        if not os.path.isdir(ghcnd_xml_station_path):
+            os.makedirs(ghcnd_xml_station_path)
+        
+        # Save XML string to disk.
+        save_file_name = ghcnd_xml_station_path + station_id + "_" + str(report_date.year).zfill(4) + str(report_date.month).zfill(2) + str(report_date.day).zfill(2) + ".xml"
+        save_file_name = self.save_file(save_file_name, daily_xml_file)
+                
+        return save_file_name
+
+class WeatherMonthlyXMLFile(WeatherConvertToXML):
+    def process_one_month_sensor_set(self, records):
+        found_data = False        
+        year = int(self.get_dly_field(records[0], DLY_FIELD_YEAR))
+        month = int(self.get_dly_field(records[0], DLY_FIELD_MONTH))
+    
+        station_id = self.get_dly_field(records[0], DLY_FIELD_ID)
+
+        # Information for each daily file.
+        daily_xml_file = self.default_xml_start()
+        daily_xml_file += self.default_xml_field_station(station_id)
+        daily_xml_file += self.default_xml_field_date_year(year)
+        daily_xml_file += self.default_xml_field_date_month(month)
+        daily_xml_file += self.default_xml_start_readings()
+        
+        for day in range(1, 32):
+            try:
+                # TODO find out what is a valid python date range? 1889?
+                # Attempt to see if this is valid date.
+                report_date = date(year, month, day)
+                found_daily_data = False
+                record_xml_snip = ""
+
+                for record in records:
+                    record_xml_snip += self.default_xml_field_day(record, report_date.day)
+                    if record_xml_snip is not "":
+                        found_data = True
+                        found_daily_data = True
+
+                if found_daily_data:
+                    daily_xml_file += self.default_xml_start_reading()
+                    daily_xml_file += self.default_xml_field_date_day(day)
+                    daily_xml_file += record_xml_snip
+                    daily_xml_file += self.default_xml_end_reading()
+                
+            except ValueError:
+                pass
+
+        daily_xml_file += self.default_xml_end_readings()
+        daily_xml_file += self.default_xml_end()
+
+        if not found_data:
+            return 0
+
+        # Make sure the station folder is available.
+        ghcnd_xml_station_path = self.get_base_folder(station_id) + str(report_date.year) + "/"
+        if not os.path.isdir(ghcnd_xml_station_path):
+            os.makedirs(ghcnd_xml_station_path)
+                
+        # Save XML string to disk.
+        save_file_name = ghcnd_xml_station_path + station_id + "_" + str(report_date.year).zfill(4) + str(report_date.month).zfill(2) + ".xml"
+        save_file_name = self.save_file(save_file_name, daily_xml_file)
+
+        if save_file_name is not "":
+            if self.debug_output:
+                print "Wrote file: " + save_file_name
+            return 1
+        else:
+            return 0
+

Propchange: incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
URL: http://svn.apache.org/viewvc/incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py?rev=1527729&view=auto
==============================================================================
--- incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py (added)
+++ incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py Mon Sep 30 19:25:15 2013
@@ -0,0 +1,263 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import glob
+import os.path
+import linecache
+
# Weather data files created to manage the conversion process.
# Allows partitioning the work and picking up where you left off.
class WeatherDataFiles:
    """Tracks the conversion status of the NOAA .dly station data files.

    A CSV progress file records one row per station data file: the file
    name, its conversion status, and the resulting XML file count and
    data size.  This allows the work to be partitioned across nodes and
    resumed where a previous run left off.  Iterating over an instance
    yields the names of the data files that have not yet been converted.
    """

    # Column indexes within one progress CSV row.
    INDEX_DATA_FILE_NAME = 0
    INDEX_DATA_STATUS = 1
    INDEX_DATA_FILE_COUNT = 2
    INDEX_DATA_FOLDER_DATA = 3

    DATA_FILE_START_INDEX = 0
    DATA_FILE_EXTENSION = ".dly"
    # Status values stored in the progress file.
    DATA_FILE_MISSING = "missing"
    DATA_FILE_INITIAL = "initialized"
    DATA_FILE_CREATED = "created"
    # NOTE(review): name kept misspelled for compatibility with existing callers.
    SEPERATOR = ","

    def __init__(self, base_path, progress_file_name="/tmp/_weather_data.csv"):
        """base_path is the folder containing the .dly station files."""
        self.base_path = base_path

        self.progress_file_name = progress_file_name

        self.current = self.DATA_FILE_START_INDEX
        self.progress_data = []

    def get_file_list(self):
        """Return the list of .dly file paths under the base path."""
        return glob.glob(self.base_path + "/*" + self.DATA_FILE_EXTENSION)

    def get_file_list_iterator(self):
        """Return a lazy iterator over the .dly file paths under the base path."""
        return glob.iglob(self.base_path + "/*" + self.DATA_FILE_EXTENSION)

    # Save Functions
    def build_progress_file(self, options, convert):
        """Create or update the progress CSV file.

        options may contain 'reset' (rebuild from the on-disk file list),
        'append' (add newly discovered .dly files) and/or 'recalculate'
        (refresh file counts and folder sizes for every row).  convert
        supplies get_base_folder(station_id) to locate a station's
        converted XML output folder.
        """
        if not os.path.isfile(self.progress_file_name) or 'reset' in options:
            # Build a new file from the .dly files currently on disk.
            with open(self.progress_file_name, 'w') as progress_file:
                progress_file.write(self.get_default_progress_file_csv())
        elif 'append' in options or 'recalculate' in options:
            self.open_progress_data()
            if 'append' in options:
                # BUGFIX: scan the on-disk file list for stations missing
                # from the progress file.  The previous code searched the
                # progress data for names taken from the progress data
                # itself, so nothing was ever appended.
                for path in self.get_file_list_iterator():
                    file_name = os.path.basename(path)
                    if self.get_file_row(file_name) < 0:
                        self.progress_data.append(self.get_progress_csv_row(file_name, self.DATA_FILE_INITIAL))
            if 'recalculate' in options:
                for row in range(0, len(self.progress_data)):
                    file_name = self.progress_data[row].split(self.SEPERATOR)[self.INDEX_DATA_FILE_NAME]
                    # The folder is hard coded.
                    station_id = os.path.basename(file_name).split('.')[0]
                    folder_name = convert.get_base_folder(station_id)
                    if os.path.exists(folder_name):
                        file_count = self.get_file_count(folder_name)
                        data_size = self.get_folder_size(folder_name)
                        self.progress_data[row] = self.get_progress_csv_row(file_name, self.DATA_FILE_CREATED, file_count, data_size)
                    else:
                        self.progress_data[row] = self.get_progress_csv_row(file_name, self.DATA_FILE_INITIAL)
            # Save file.
            self.close_progress_data(True)
        self.reset()

    # Save Functions
    def build_partition_structure(self, nodes, partitions):
        """Split the station files into (nodes * partitions) groups,
        balanced by converted data size, and write one CSV per group."""
        self.open_progress_data()

        # Map each file name to its converted folder data size.
        size_by_file = dict()
        for row in self.progress_data:
            columns = row.split(self.SEPERATOR)
            size_by_file[columns[self.INDEX_DATA_FILE_NAME]] = int(columns[self.INDEX_DATA_FOLDER_DATA])

        # File names ordered largest first so the round robin balances sizes.
        csv_sorted = sorted(size_by_file, key=size_by_file.get, reverse=True)

        # Initialize the partition sets.
        total_partitions = nodes * partitions
        list_of_partitions = [set() for _ in range(total_partitions)]

        # Add the files in a round robin order.
        current_partition = 0
        for item in csv_sorted:
            list_of_partitions[current_partition].add(item)
            current_partition = (current_partition + 1) % total_partitions

        # Save list of files for each node's partitions.
        for i in range(0, nodes):
            for j in range(0, partitions):
                current_partition = (i * partitions) + j
                self.write_partition_file(i + 1, j + 1, list_of_partitions[current_partition])

    # Write out the partition file list to a CSV file.
    def write_partition_file(self, node, partition, items):
        """Write one partition's file list to node_<n>_level_<p>.csv."""
        save_partition_file = "node_" + str(node) + "_level_" + str(partition) + ".csv"
        with open(save_partition_file, 'w') as partition_file:
            contents = "".join(self.get_progress_csv_row(file_name, self.DATA_FILE_INITIAL)
                               for file_name in items)
            partition_file.write(contents)

    def get_file_row(self, file_name):
        """Return the progress-data row index whose row starts with
        file_name, or -1 when no such row exists."""
        for i in range(0, len(self.progress_data)):
            if self.progress_data[i].startswith(file_name):
                return i
        return -1

    def get_default_progress_file_csv(self):
        """Return CSV content listing every on-disk .dly file as initialized."""
        return "".join(self.get_progress_csv_row(os.path.basename(path), self.DATA_FILE_INITIAL)
                       for path in self.get_file_list_iterator())

    def print_progress_file_stats(self, convert):
        """Print summary statistics from the progress file and compare
        them against what is actually on disk."""
        station_count_missing = 0
        station_count = 0
        file_count = 0
        data_size = 0

        station_count_actual = 0
        file_count_actual = 0
        data_size_actual = 0

        self.open_progress_data()
        for row in self.progress_data:
            row_contents = row.split(self.SEPERATOR)
            if int(row_contents[self.INDEX_DATA_FILE_COUNT]) != -1 and int(row_contents[self.INDEX_DATA_FOLDER_DATA]) != -1:
                station_count += 1
                file_count += int(row_contents[self.INDEX_DATA_FILE_COUNT])
                data_size += int(row_contents[self.INDEX_DATA_FOLDER_DATA])
            else:
                station_count_missing += 1

            file_name = row_contents[self.INDEX_DATA_FILE_NAME]
            station_id = os.path.basename(file_name).split('.')[0]
            folder_name = convert.get_base_folder(station_id)
            if os.path.exists(folder_name):
                station_count_actual += 1
                file_count_actual += self.get_file_count(folder_name)
                data_size_actual += self.get_folder_size(folder_name)

        # Parenthesized prints keep this valid under both Python 2 and 3.
        print("Progress File:\t" + self.progress_file_name + "\n")

        print("CSV DETAILS OF PROCESSED STATIONS")
        print("Number of stations:\t" + "{:,}".format(station_count))
        print("Number of files:\t" + "{:,}".format(file_count))
        print("Data size:\t\t" + sizeof_fmt(data_size) + "\n")

        print("CSV DETAILS OF UNPROCESSED STATIONS")
        print("Number of stations:\t" + "{:,}".format(station_count_missing) + "\n")

        print("FOLDER DETAILS")
        print("Number of stations:\t" + "{:,}".format(station_count_actual))
        print("Number of files:\t" + "{:,}".format(file_count_actual))
        print("Data size:\t\t" + sizeof_fmt(data_size_actual) + "\n")

    def get_progress_csv_row(self, file_name, status, file_count=-1, data_size=-1):
        """Build one newline-terminated progress CSV row."""
        return file_name + self.SEPERATOR + status + self.SEPERATOR + str(file_count) + self.SEPERATOR + str(data_size) + "\n"

    def update_file_status(self, file_name, status, file_count=-1, data_size=-1):
        """Replace file_name's row with the new status and persist the file."""
        row = self.get_file_row(file_name)
        if row >= 0:
            self.progress_data[row] = self.get_progress_csv_row(file_name, status, file_count, data_size)

        # Save the file.
        self.close_progress_data(True)

    def get_file_count(self, folder_name):
        """Count all files under folder_name, recursively."""
        return sum(len(filenames) for dirpath, dirnames, filenames in os.walk(folder_name))

    def get_folder_size(self, folder_name):
        """Total size in bytes of all files under folder_name, recursively."""
        total_size = 0
        for dirpath, dirnames, filenames in os.walk(folder_name):
            for f in filenames:
                total_size += os.path.getsize(os.path.join(dirpath, f))
        return total_size

    def open_progress_data(self):
        """Load the progress file rows into memory."""
        with open(self.progress_file_name, 'r') as progress_file:
            self.progress_data = progress_file.readlines()

    def close_progress_data(self, force=False):
        """Write the in-memory progress rows back to disk.  An empty row
        list is skipped unless force is set, to avoid clobbering the file."""
        if len(self.progress_data) > 0 or force:
            with open(self.progress_file_name, 'w') as progress_file:
                progress_file.writelines(self.progress_data)

    def reset(self):
        """Persist current progress and restart iteration from the top."""
        self.close_progress_data()

        self.current = self.DATA_FILE_START_INDEX
        self.open_progress_data()

    # Iterator Functions: yields the data files that still need converting.
    def __iter__(self):
        return self

    def next(self):
        """Return the next file name whose status is not 'created'."""
        while True:
            if self.current >= len(self.progress_data):
                raise StopIteration
            row = self.progress_data[self.current]
            self.current += 1
            columns = row.rsplit(self.SEPERATOR)
            if columns[self.INDEX_DATA_STATUS].strip() != self.DATA_FILE_CREATED:
                return columns[self.INDEX_DATA_FILE_NAME]

    # Python 3 iterator protocol compatibility.
    __next__ = next
+
+# sizeof_fmt function is taken from an answer posted to stackoverflow.com.
+#
+# Question: 
+#   http://stackoverflow.com/questions/1094841
+# Answer Author: 
+#   http://stackoverflow.com/users/55246/sridhar-ratnakumar
def sizeof_fmt(num):
    """Format a byte count as a human-readable string (bytes up to TB)."""
    units = ['bytes', 'KB', 'MB', 'GB']
    for unit in units:
        if -1024.0 < num < 1024.0:
            return "%3.1f%s" % (num, unit)
        num /= 1024.0
    # Anything still >= 1024 GB is reported in terabytes.
    return "%3.1f%s" % (num, 'TB')
+    

Propchange: incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_dly_config.py
URL: http://svn.apache.org/viewvc/incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_dly_config.py?rev=1527729&view=auto
==============================================================================
--- incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_dly_config.py (added)
+++ incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_dly_config.py Mon Sep 30 19:25:15 2013
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# Base URL used to get all the required files.
BASE_DOWNLOAD_URL = 'http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/'

# List of required files for a build.
FILE_NAMES = [
    'ghcnd-countries.txt',
    'ghcnd-inventory.txt',
    'ghcnd-states.txt',
    'ghcnd-stations.txt',
    'ghcnd-version.txt',
    'ghcnd_all.tar.gz',
    'ghcnd_gsn.tar.gz',
    'ghcnd_hcn.tar.gz',
    'readme.txt',
    'status.txt',
]

# Store the row details here.

# Index values of each field's details: [name, start column, end column, type].
FIELD_INDEX_NAME = 0
FIELD_INDEX_START = 1
FIELD_INDEX_END = 2
FIELD_INDEX_TYPE = 3

# Positions of the fixed header fields within a .dly row.
DLY_FIELD_ID = 0
DLY_FIELD_YEAR = 1
DLY_FIELD_MONTH = 2
DLY_FIELD_ELEMENT = 3

DLY_FIELD_DAY_OFFSET = 4
DLY_FIELD_DAY_FIELDS = 4

# Details about the row: the fixed header fields first...
DLY_FIELDS = [
    ['ID', 1, 11, 'Character'],
    ['YEAR', 12, 15, 'Integer'],
    ['MONTH', 16, 17, 'Integer'],
    ['ELEMENT', 18, 21, 'Character'],
]

# ...followed by 31 day groups of (VALUE, MFLAG, QFLAG, SFLAG).
for _day in range(1, 32):
    _start = 22 + ((_day - 1) * 8)
    _suffix = str(_day)
    DLY_FIELDS.append(['VALUE' + _suffix, _start, _start + 4, 'Integer'])
    DLY_FIELDS.append(['MFLAG' + _suffix, _start + 5, _start + 5, 'Character'])
    DLY_FIELDS.append(['QFLAG' + _suffix, _start + 6, _start + 6, 'Character'])
    DLY_FIELDS.append(['SFLAG' + _suffix, _start + 7, _start + 7, 'Character'])

# Details about the stations row.
STATIONS_FIELDS = {
    'ID': ['ID', 1, 11, 'Character'],
    'LATITUDE': ['LATITUDE', 13, 20, 'Real'],
    'LONGITUDE': ['LONGITUDE', 22, 30, 'Real'],
    'ELEVATION': ['ELEVATION', 32, 37, 'Real'],
    'STATE': ['STATE', 39, 40, 'Character'],
    'NAME': ['NAME', 42, 71, 'Character'],
    'GSNFLAG': ['GSNFLAG', 73, 75, 'Character'],
    'HCNFLAG': ['HCNFLAG', 77, 79, 'Character'],
    'WMOID': ['WMOID', 81, 85, 'Character'],
}

# Details about the countries row.
COUNTRIES_FIELDS = {
    'CODE': ['CODE', 1, 2, 'Character'],
    'NAME': ['NAME', 4, 50, 'Character'],
}

# Details about the states row.
STATES_FIELDS = {
    'CODE': ['CODE', 1, 2, 'Character'],
    'NAME': ['NAME', 4, 50, 'Character'],
}

# Details about the inventory row.
INVENTORY_FIELDS = [
    ['ID', 1, 11, 'Character'],
    ['LATITUDE', 13, 20, 'Real'],
    ['LONGITUDE', 22, 30, 'Real'],
    ['ELEMENT', 32, 35, 'Character'],
    ['FIRSTYEAR', 37, 40, 'Integer'],
    ['LASTYEAR', 42, 45, 'Integer'],
]
+

Propchange: incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_dly_config.py
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py
URL: http://svn.apache.org/viewvc/incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py?rev=1527729&view=auto
==============================================================================
--- incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py (added)
+++ incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py Mon Sep 30 19:25:15 2013
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import os.path
import shutil
import sys
import tarfile
import urllib2
+
+# Custom modules.
+from weather_dly_config import *
+
class WeatherDownloadFiles:
    """Downloads the NOAA GHCN-Daily source files and unpacks the
    tar.gz packages into the save path."""

    def __init__(self, save_path):
        """Remember the download folder, creating it if necessary."""
        self.save_path = save_path

        if not os.path.isdir(save_path):
            os.makedirs(save_path)

    # Download the complete list
    def download_all_files(self, reset=False):
        """Download every file listed in FILE_NAMES from BASE_DOWNLOAD_URL."""
        for file_name in FILE_NAMES:
            url = BASE_DOWNLOAD_URL + file_name
            self.download_file(url, reset)

    # Download the file, unless it exists.
    def download_file(self, url, reset=False):
        """Download one URL into the save path, unless the file already
        exists (or reset is set)."""
        file_name = self.save_path + "/" + url.split('/')[-1]

        if not os.path.isfile(file_name) or reset:
            # BUGFIX: was a bare download_file_with_status(...) call, which
            # raised NameError at runtime because the helper is a method.
            self.download_file_with_status(url, file_name)

    # download_file_with_status function is based on a question posted to
    # stackoverflow.com.
    #
    # Question:
    #   http://stackoverflow.com/questions/22676
    # Answer Authors:
    #   http://stackoverflow.com/users/394/pablog
    #   http://stackoverflow.com/users/160206/bjorn-pollex
    def download_file_with_status(self, url, file_name):
        """Stream url to file_name, printing an in-place progress indicator."""
        u = urllib2.urlopen(url)
        f = open(file_name, 'wb')
        try:
            meta = u.info()
            file_size = int(meta.getheaders("Content-Length")[0])
            print("Downloading: %s Bytes: %s" % (file_name, file_size))

            file_size_dl = 0
            block_sz = 8192
            while True:
                chunk = u.read(block_sz)
                if not chunk:
                    break

                file_size_dl += len(chunk)
                f.write(chunk)
                status = r"%10d  [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
                # The backspaces rewind the cursor so the status updates in place.
                sys.stdout.write(status + chr(8) * (len(status) + 1))
        finally:
            # BUGFIX: ensure the partial file handle is closed even when the
            # download raises part-way through.
            f.close()

    # Unzip the package file, unless it exists.
    def unzip_package(self, package, reset=False):
        """Unpack <save_path>/<package>.tar.gz into its own folder, unless
        the folder already exists (reset removes and re-extracts it)."""
        file_name = self.save_path + "/" + package + ".tar.gz"
        unzipped_path = self.save_path + "/" + package

        if os.path.isdir(unzipped_path) and reset:
            shutil.rmtree(unzipped_path)

        if not os.path.isdir(unzipped_path):
            print("Unzipping: " + file_name)
            tar_file = tarfile.open(file_name, 'r:gz')
            try:
                # SECURITY NOTE: extractall trusts the archive's member paths;
                # only use with archives from the official NOAA site.
                tar_file.extractall(unzipped_path)
            finally:
                # BUGFIX: the tar file handle was never closed.
                tar_file.close()
+ 
\ No newline at end of file

Propchange: incubator/vxquery/trunk/vxquery/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py
------------------------------------------------------------------------------
    svn:eol-style = native