You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@community.apache.org by se...@apache.org on 2015/11/01 19:35:55 UTC
svn commit: r1711837 - /comdev/reporter.apache.org/trunk/urlutils.py
Author: sebb
Date: Sun Nov 1 18:35:55 2015
New Revision: 1711837
URL: http://svn.apache.org/viewvc?rev=1711837&view=rev
Log:
Initial version of URL util class
Added:
comdev/reporter.apache.org/trunk/urlutils.py (with props)
Added: comdev/reporter.apache.org/trunk/urlutils.py
URL: http://svn.apache.org/viewvc/comdev/reporter.apache.org/trunk/urlutils.py?rev=1711837&view=auto
==============================================================================
--- comdev/reporter.apache.org/trunk/urlutils.py (added)
+++ comdev/reporter.apache.org/trunk/urlutils.py Sun Nov 1 18:35:55 2015
@@ -0,0 +1,166 @@
+"""
+ Some utilities for working with URLs
+"""
+
+import sys
+if sys.hexversion < 0x03000000:
+ raise ImportError("This script requires Python 3")
+import os
+from os.path import dirname, abspath, join, getmtime
+import shutil
+import io
+import urllib.request
+import time
+import calendar
+
+# time format used in Last-Modified/If-Modified-Since HTTP headers
+_HTTP_TIME_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'
+
+"""
+ get file mod date in suitable format for If-Modified-Since
+
+"""
+def mod_date(t):
+ return time.strftime(_HTTP_TIME_FORMAT, time.gmtime(t))
+
def getIfNewer(url, sinceTime, encoding=None, errors=None):
    """
    Get a URL if it is newer than the copy we already have

    @param url: the url to fetch
    @param sinceTime: the most recent Last-Modified string; sent as If-Modified-Since.
                      If empty or None, the request is unconditional.
    @param encoding: the encoding to use (default None)
    @param errors: If encoding is provided, this specifies the on-error action (e.g. 'ignore')
    @return: (lastMod, response)
     - lastMod: the Last-Modified string from the response headers
                (None if the header is absent; sinceTime if the URL is not newer)
     - response: the response object from urlopen (encoding == None) or a TextIOWrapper
                 around it. None if the URL is not newer (HTTP status 304).
    @raise urllib.error.HTTPError: for any HTTP error status other than 304
    """
    # Imported explicitly rather than relying on urllib.request pulling it in
    from urllib.error import HTTPError

    headers = {"If-Modified-Since": sinceTime} if sinceTime else {}
    req = urllib.request.Request(url, headers=headers)
    try:
        resp = urllib.request.urlopen(req)
    except HTTPError as err:
        if err.code != 304:
            raise
        return sinceTime, None  # not modified: preserve timestamp

    lastMod = resp.headers['Last-Modified']
    if encoding:
        return lastMod, io.TextIOWrapper(resp, encoding=encoding, errors=errors)
    return lastMod, resp
+
class UrlCache(object):
    """
    Creates a cache for URLs

    @param cachedir: the cache directory to use
      (default data/cache; this is assumed to be at the current directory, its parent or grandparent)
    @param interval: minimum interval between checks for updates to the URL (default 300 secs)
      if set to -1, never checks (intended for testing only)
    @return: the instance to use with the get() method
    @raise OSError: if the cache directory does not exist
    """

    def __file_mtime(self, filename):
        """Return the mod time of the file, or -1 if it does not exist."""
        try:
            return getmtime(filename)
        except FileNotFoundError:
            return -1  # so cannot be confused with a valid mtime

    def __init__(self, cachedir=None, interval=300):
        default_cache = 'data/cache'
        self.__interval = interval
        if cachedir:
            self.__cachedir = cachedir
        else:
            # Fallback value; only used for the error message if nothing is found
            self.__cachedir = default_cache
            # we may be located at the same level or 1 or 2 levels below
            for prefix in ('./', '../', '../../'):
                candidate = prefix + default_cache
                if os.path.isdir(candidate):
                    self.__cachedir = candidate
                    break

        if not os.path.isdir(self.__cachedir):
            raise OSError("Could not find cache directory '%s'" % self.__cachedir)
        print("Cachedir: %s" % self.__cachedir)

    def __getname(self, name):
        """Return the path of the named entry within the cache directory."""
        return join(self.__cachedir, name)

    def __store(self, response, lastMod, target):
        """
        Copy the response to the target file, stamping it with the Last-Modified time.

        The data is written to a temporary file which then replaces the target,
        to reduce the time when the file is being updated.
        """
        try:
            lastModT = calendar.timegm(time.strptime(lastMod, _HTTP_TIME_FORMAT))
        except (TypeError, ValueError):
            # header missing (None) or not in the expected format
            lastModT = 0

        tmpFile = target + ".tmp"
        try:
            # 'with response' ensures the connection is closed once copied
            with response, open(tmpFile, 'wb') as f:
                shutil.copyfileobj(response, f)
            # store the last mod time as the time of the file
            os.utime(tmpFile, times=(lastModT, lastModT))
            # os.replace overwrites an existing target on all platforms
            # and keeps the mod time set above
            os.replace(tmpFile, target)
        except Exception:
            # best effort: do not leave a partial download behind
            try:
                os.remove(tmpFile)
            except OSError:
                pass
            raise

    def get(self, url, name, encoding=None, errors=None):
        """
            Check if the filename exists in the cache.
            If it does not, or if it does and the URL has not been checked recently,
            then try to download the URL using If-Modified-Since.
            The URL is downloaded to a temporary file and renamed to the filename
            to reduce the time when the file is being updated.
            The interval parameter is used to determine how often to check if the URL has changed.
            (this is mainly intended to avoid excess URL requests in unit testing).
            If this is set to -1, then the URL will only be downloaded once.
            @param url: the url to fetch (required)
            @param name: the name to use in the cache (required)
            @param encoding: the encoding to use (default None)
            @param errors: If encoding is provided, this specifies the on-error action (e.g. 'ignore')
                        (default None)
            @return: the opened stream, using the encoding if specified. Otherwise opened in binary mode.
        """
        target = self.__getname(name)
        # marker file: its mod time records when the URL was last checked
        check = self.__getname("." + name)
        fileTime = self.__file_mtime(target)

        upToDate = False
        if fileTime < 0:
            print("Not found %s " % target)
        elif self.__interval == -1:
            print("File %s exists and URL check has been disabled" % name)
            upToDate = True
        else:
            diff = time.time() - self.__file_mtime(check)
            if diff < self.__interval:
                print("Recently checked: %d < %d, skip check" % (diff, self.__interval))
                upToDate = True
            else:
                print("Not recently checked: %d > %d" % (diff, self.__interval))

        if not upToDate:
            # Only make the request conditional if there is a cached copy;
            # -1 (no file) is not a valid time stamp to send to the server.
            sinceTime = mod_date(fileTime) if fileTime >= 0 else None
            lastMod, response = getIfNewer(url, sinceTime)
            if response is not None:  # we have a new version
                self.__store(response, lastMod, target)
                print("Downloaded new version of %s " % target)
            else:
                print("Cached copy of %s is up to date" % target)

        with open(check, 'a'):
            os.utime(check, None)  # touch the marker file

        if encoding:
            return open(target, 'r', encoding=encoding, errors=errors)
        return open(target, 'rb')
+
if __name__ == '__main__':
    # Smoke test when run as a script: locate the default cache directory
    # (cachedir is left at its default of None) using a 30 second check interval.
    fc = UrlCache(interval=30)
    print("Done")
+
\ No newline at end of file
Propchange: comdev/reporter.apache.org/trunk/urlutils.py
------------------------------------------------------------------------------
svn:eol-style = native