You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@ws.apache.org by di...@apache.org on 2005/12/09 13:22:06 UTC
svn commit: r355462 [7/7] - in /webservices/admin/planet: ./ cache/
compat_logging/ examples/ fancy-examples/ output/ output/images/
Added: webservices/admin/planet/planetlib.py
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/planetlib.py?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/planetlib.py (added)
+++ webservices/admin/planet/planetlib.py Fri Dec 9 04:21:26 2005
@@ -0,0 +1,500 @@
+#!/usr/bin/env python
+"""Planet RSS Aggregator Library.
+
+You may modify the following variables;
+
+ user_agent: User-Agent sent to remote sites
+ cache_directory: Where to put cached feeds
+
+Requires Python 2.2 and Mark Pilgrim's feedparser.py.
+"""
+
+__version__ = "0.2"
+__authors__ = [ "Scott James Remnant <sc...@netsplit.com>",
+ "Jeff Waugh <jd...@perkypants.org>" ]
+__credits__ = "Originally based on spycyroll."
+__license__ = "Python"
+
+
+import os
+import re
+import time
+
+import feedparser
+import StringIO
+import sgmllib
+
+try:
+ import logging
+except:
+ import compat_logging
+ logging = compat_logging
+
+try:
+ import gzip
+except:
+ gzip = None
+
+
# We might as well advertise ourself when we're off galavanting.
# Sent as the HTTP User-Agent header on every fetch; includes the
# underlying feedparser's own agent string.
user_agent = " ".join(("Planet/" + __version__,
                       "http://www.planetplanet.org/",
                       feedparser.USER_AGENT))

# Where to put cached rss feeds (relative to the working directory).
cache_directory = 'cache'

# Things we don't want to see in our cache filenames: runs of non-word
# characters, and the repeated periods that replacing them can produce.
cache_invalid_stuff = re.compile(r'\W+')
cache_multiple_dots = re.compile(r'\.+')
+
+
def cache_filename(url):
    """Map a feed URL to its on-disk cache path.

    The URL is sanitised: the "http://" scheme and a leading "www." are
    dropped, every run of unwelcome characters becomes a period, and
    repeated periods are collapsed to one.  The result is joined onto
    the configured cache_directory.
    """
    name = url.replace("http://", "").replace("www.", "")
    name = cache_invalid_stuff.sub('.', name)
    name = cache_multiple_dots.sub('.', name)

    return os.path.join(cache_directory, name)
+
+
class Stripper(sgmllib.SGMLParser):
    """Strip markup from HTML, keeping only the character data."""

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)

    def strip(self, some_html):
        """Feed some_html through the parser and return its text content."""
        self.theString = ""
        self.feed(some_html)
        self.close()
        return self.theString

    def handle_data(self, data):
        # SGMLParser callback: invoked for each run of character data
        # between tags; accumulate it.
        self.theString += data
+
+
class Channel:
    """A collection of news items.

    Channel represents a feed of news from a website, or some other
    source.

    A channel is created with a URI where we can obtain the feed
    and an optional dictionary of additional properties you wish
    to set (probably from a config file).

    Special properties:
        offset: The number of hours out the channel's times tend to be.

    Useful members:
        uri:            Where the feed can be downloaded from
        etag, modified: Used to determine whether the feed has changed

        title:       Title for the feed, often the author's name
        description: A description of the content of the feed
        link:        Link associated with the feed, generally the HTML version

        items: List of current NewsItems
        props: Dictionary of properties
    """

    def __init__(self, uri, props=None):
        self.uri = uri
        self.etag = None
        self.modified = None

        self.title = None
        self.description = None
        self.link = None

        self.items = []

        if props:
            self.props = props
        else:
            self.props = {}

        # Forced "hours out from UTC" correction, from the 'offset'
        # property if present.
        if self.props.has_key('offset'):
            self.offset = float(self.props['offset'])
        else:
            self.offset = None

        # Prime the channel from any previously cached copy of the feed.
        self.cache_read()

    def update(self, uri=None):
        """Update the channel.

        Read feed data from channel's URI (or an alternate one) and
        parse it using feedparser.  This, where possible, caches the
        data and tries not to request it again if it hasn't changed.

        Most of this is actually what feedparser.parse() does, but as
        we need the unparsed data (to cache), we have to do a bit of the
        work ourselves.

        The real work is done in _update().
        """
        if uri is None:
            save_uri = 1
            uri = self.uri
            logging.info("Updating feed <" + self.uri + ">")
        else:
            # Reading from somewhere else (e.g. the cache); don't let a
            # redirect on that fetch overwrite the real feed URI.
            save_uri = 0
            logging.info("Updating feed <" + self.uri + "> from <" + uri + ">")

        # Open the resource and read the data, sending our conditional
        # headers so an unchanged feed comes back as a 304.
        f = feedparser.open_resource(uri, agent=user_agent,
                                     etag=self.etag, modified=self.modified)
        data = self._read_data(f)

        # Check for some obvious things
        if hasattr(f, 'status'):
            if f.status == 304:
                logging.info("Feed has not changed")
                return
            if f.status >= 400:
                logging.error("Update failed for <%s> (Error: %d)"
                              % (uri, f.status))
                return

        # Update etag and modified so the next fetch can be conditional
        new_etag = feedparser.get_etag(f)
        if new_etag:
            self.etag = new_etag
            logging.debug("E-Tag: " + self.etag)
        new_modified = feedparser.get_modified(f)
        if new_modified:
            self.modified = new_modified
            logging.debug("Modified: " + self.format_date(self.modified))

        # Update URI in case of redirect
        if hasattr(f, 'url') and save_uri:
            self.uri = f.url
            logging.debug("URI: <" + self.uri + ">")
        if hasattr(f, 'headers'):
            baseuri = f.headers.get('content-location', self.uri)
        else:
            baseuri = self.uri

        # Parse the feed
        f.close()
        self._update(baseuri, data)

    def _read_data(self, f):
        """Read the data from the resource.

        Attempts to gunzip the data if the Content-Encoding header claimed
        to be gzip, but if that fails it doesn't overly worry about it.

        We then take the data and try to squeeze it into a UTF-8 string
        using Python's unicode module.  If it doesn't decode as UTF-8
        we try ISO-8859-1 before ruthlessly stripping the bad characters.
        """
        data = f.read()

        if hasattr(f, 'headers'):
            if gzip and f.headers.get('content-encoding', '') == 'gzip':
                try:
                    gzdata = gzip.GzipFile(fileobj=StringIO.StringIO(data))
                    data = gzdata.read()
                except:
                    # Best-effort: fall back to the raw bytes.
                    logging.warn("Feed contained invalid gzipped data",
                                 exc_info=1)

        # Normalise to UTF-8 bytes: try UTF-8, then ISO-8859-1, then
        # replace anything non-ASCII as a last resort.
        try:
            data = unicode(data, "utf8").encode("utf8")
            logging.debug("Encoding: UTF-8")
        except UnicodeError:
            try:
                data = unicode(data, "iso8859_1").encode("utf8")
                logging.debug("Encoding: ISO-8859-1")
            except UnicodeError:
                data = unicode(data, "ascii", "replace").encode("utf8")
                logging.warn("Feed wasn't in UTF-8 or ISO-8859-1, replaced " +
                             "all non-ASCII characters.")

        return data

    def cache_read(self):
        """Initialise the channel from the cache.

        The data is read from a file in the cache_directory and parsed.
        Any failure is logged and otherwise ignored: a missing or broken
        cache just means we start cold.
        """
        cache_uri = cache_filename(self.uri)

        try:
            if os.path.exists(cache_uri):
                if os.path.exists(cache_uri + ",etag"):
                    c = open(cache_uri + ",etag")
                    self.etag = c.read().strip()
                    c.close()

                if os.path.exists(cache_uri + ",modified"):
                    c = open(cache_uri + ",modified")
                    self.modified = feedparser.parse_date(c.read().strip())
                    c.close()

                # Parse the cached copy through the normal update path.
                self.update(cache_uri)
        except:
            logging.warn("Cache read failed <" + cache_uri + ">", exc_info=1)

    def cache_write(self, data):
        """Write the unparsed feed to the cache.

        The data is written as-is to a file in the cache_directory.
        If the channel has etag or modified information, those are written
        to files alongside; stale side-files are removed otherwise.
        """
        cache_uri = cache_filename(self.uri)

        try:
            c = open(cache_uri, "w")
            c.write(data)
            c.close()

            if self.etag:
                c = open(cache_uri + ",etag", "w")
                c.write(self.etag + "\n")
                c.close()
            elif os.path.exists(cache_uri + ",etag"):
                try:
                    os.remove(cache_uri + ",etag")
                except:
                    pass

            if self.modified:
                c = open(cache_uri + ",modified", "w")
                c.write(feedparser.format_http_date(self.modified) + "\n")
                c.close()
            elif os.path.exists(cache_uri + ",modified"):
                try:
                    os.remove(cache_uri + ",modified")
                except:
                    pass
        except:
            logging.warn("Cache write failed <" + cache_uri + ">", exc_info=1)

    def _update(self, baseuri, data):
        """Update the channel from a parsed feed.

        This is the real guts of update() and after all the fuss is
        actually pretty simple.  We parse the feed using feedparser
        and if we get the information, cache it.
        """
        feed = feedparser.FeedParser(baseuri)
        feed.feed(data)

        if len(feed.items) < 1:
            logging.info("Empty feed, cowardly not updating %s" % (baseuri))
            return

        new_items = []
        for item in feed.items:
            new_items.append(NewsItem(item, self))
            # Reject feeds whose item years are more than a year away
            # from the current UTC year -- almost certainly a broken clock.
            if abs(new_items[-1].date[0] - time.gmtime()[0]) > 1:
                logging.warning(("Obviously bogus year in feed (%d), " +
                                 "cowardly not updating")
                                % (new_items[-1].date[0],))
                return

        self.items = new_items
        self.title = feed.channel.get('title', '')
        self.description = feed.channel.get('description', '')
        self.link = feed.channel.get('link', '')

        self.cache_write(data)
        return self.items

    def utctime(self, date):
        """Return UTC time() for given date.

        Returns the equivalent of time() for the given date, but taking
        into account local timezone or any forced offset for the channel.

        This is suitable for using in a call to gmtime() only.
        """
        # mktime() interprets the tuple as local time, so subtracting
        # time.timezone recovers a UTC-based epoch value.
        offset = time.timezone
        if self.offset is not None:
            # The channel's times are self.offset hours out from UTC.
            # BUGFIX: the original added time.timezone a second time here
            # ("offset += self.offset * 3600 + time.timezone"), double
            # counting the local timezone whenever a forced offset was set.
            offset += self.offset * 3600

        return time.mktime(date) - offset

    def format_date(self, date, fmt=None):
        """Formats a date for output.

        Outputs the UTC date, taking into account any forced offset for the
        channel.  fmt may be 'iso', 'rfc822' or a strftime() format; the
        default is a human-readable form.
        """
        if fmt == 'iso':
            fmt = "%Y-%m-%dT%H:%M:%S+00:00"
        elif fmt == 'rfc822':
            fmt = "%a, %d %b %Y %H:%M:%S +0000"
        elif fmt is None:
            fmt = "%B %d, %Y %I:%M %p"

        return time.strftime(fmt, time.gmtime(self.utctime(date)))
+
+
class NewsItem:
    """A single item of news.

    NewsItem represents a single item of news from a channel.  They
    are created and owned by the Channel and accessible through
    Channel().items.

    Useful members:
        id:   Unique identifier for the item (often a URI)
        date: Date item was last modified

        title:   Title of the item
        summary: Summary of the content for the first page
        content: Content of the item
        link:    Link associated with the item, generally the HTML version
        creator: Person who created the item

        channel: Channel this NewsItem belongs to
    """

    def __init__(self, dict, channel):
        # NOTE(review): the parameter name shadows the builtin 'dict';
        # kept as-is to preserve the interface.  It is a feedparser
        # item dictionary.
        self.channel = channel

        self.link = dict.get('link', '')
        self.id = dict.get('id', self.link)

        self.title = dict.get('title', '')
        self.summary = dict.get('summary', '')

        # Prefer full 'content', fall back to 'description', else empty.
        if 'content' in dict and len(dict['content']):
            self.content = dict['content'][0]['value']
        elif 'description' in dict:
            self.content = dict['description']
        else:
            self.content = ''

        # Coerce non-str (i.e. unicode) content to a UTF-8 byte string.
        # NOTE(review): unicode(x.encode("utf8")) decodes the bytes with
        # the default (ASCII) codec, so any non-ASCII content raises and
        # falls through to the later branches -- looks unintentional,
        # preserved as-is.
        if type(self.content) != type(""):
            try:
                self.content = unicode(self.content.encode("utf8")).encode("utf8")
            except UnicodeError:
                try:
                    self.content = unicode(self.content.encode("iso8859_1")).encode("utf8")
                except UnicodeError:
                    self.content = unicode(self.content.encode("ascii", "replace")).encode("utf8")
                    logging.warn("Item wasn't in UTF-8 or ISO-8859-1, replaced " +
                                 "all non-ASCII characters.")

        # Use the feed's modification date; fall back to a cached time
        # when it's missing or suspiciously exactly midnight.
        self.date = dict.get('modified_parsed')
        if self.date is None or self.date[3:6] == (0, 0, 0):
            self.date = self._cached_time()

        self.creator = dict.get('creator', '');

    def _cached_time(self):
        """Retrieve or save a cached time for this entry.

        Sometimes entries lack any date or time information, and
        sometimes they just lack time information.  The trouble is
        we need both to be able to put them in the right place in the
        output.

        This is the solution (for both).  When you find no date, or
        one that ends up at exactly midnight (as-if!) we grovel around
        inside a cache file to see whether we've recorded anything for
        it so far.  If we have, we use that, otherwise we'll use the
        current UTC time and save that (along with the rest) into the
        cache file for use next time.

        Truly midnight dates will sneak a bit forward, but that's not
        a great loss.
        """
        time_cache_uri = cache_filename(self.channel.uri) + ",times"
        time_cache = {}

        # Try the existing cache first; any IO/parse problem just means
        # we fall through and record a fresh time below.
        if os.path.exists(time_cache_uri):
            try:
                c = open(time_cache_uri)
                for line in c.readlines():
                    # Each line is "<item id> = <iso date>".
                    id, timestr = line.strip().split(" = ")
                    time_cache[id] = feedparser.parse_date(timestr)
                c.close()

                if time_cache.has_key(self.id):
                    return time_cache[self.id]
            except:
                logging.warn("Time cache read failed <" + time_cache_uri + ">",
                             exc_info = 1)

        # Not cached: use the current UTC time.
        time_cache[self.id] = time.gmtime()
        fmt_time = self.channel.format_date(time_cache[self.id], 'iso')

        # Make sure we don't move the entry *too far* -- if the feed's
        # own date is more than a day away from now, trust the feed.
        if self.date is not None:
            orig_time = self.channel.utctime(self.date)
            this_time = self.channel.utctime(time_cache[self.id])
            if abs(this_time - orig_time) > 86400:
                return self.date

        try:
            c = open(time_cache_uri, "a")
            c.write("%s = %s\n" % (self.id, fmt_time))
            c.close()
        except:
            logging.warn("Time cache write failed <" + time_cache_uri + ">",
                         exc_info=1)

        return time_cache[self.id]
+
+
class Planet:
    """A collection of channels.

    Planet represents an aggregated set of channels, easing their
    management and allowing you to directly retrieve the items in
    descending date order.

    Once a planet is created you subscribe new Channels to it.  You
    can then obtain a list of subscribed channels through the
    channels() member function and a list of items through the items()
    function (bypassing the Channel level).

    What you do with a Planet is up to you.
    """

    def __init__(self):
        self._channels = []
        # Memoised result of items(); invalidated whenever the
        # subscription list changes.
        self._items = None

    def subscribe(self, channel):
        """Subscribe the Planet to a Channel."""
        self._channels.append(channel)
        self._items = None

    def unsubscribe(self, channel):
        """Unsubscribe the Planet from a Channel."""
        self._channels.remove(channel)
        self._items = None

    def channels(self):
        """Return a copy of the list of subscribed Channels."""
        return list(self._channels)

    def items(self):
        """Return every dated item across all channels, newest first."""
        if self._items is None:
            collected = []
            for chan in self._channels:
                collected.extend([entry for entry in chan.items
                                  if entry.date is not None])
            collected.sort(lambda a, b: cmp(b.channel.utctime(b.date),
                                            a.channel.utctime(a.date)))
            self._items = collected

        return list(self._items)
Added: webservices/admin/planet/planetlib.pyc
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/planetlib.pyc?rev=355462&view=auto
==============================================================================
Binary file - no diff available.
Propchange: webservices/admin/planet/planetlib.pyc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: webservices/admin/planet/pyblosxom.css
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/pyblosxom.css?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/pyblosxom.css (added)
+++ webservices/admin/planet/pyblosxom.css Fri Dec 9 04:21:26 2005
@@ -0,0 +1,110 @@
/* Standard Blosxom Bits */

/* Date banner shown above each day's entries. */
.blosxomDate {
    color: white;
    background-color: black;
    font-weight: bold;
    padding: 8px 5px 8px 5px;
    clear: both;
}

.blosxomDate a {
    color: white;
    font-weight: bold;
    text-decoration: none;
    margin-right: 0.5ex;
}

.blosxomDateLink {
    float: right;
    border: 0px;
}

/* Entry title, and its (undecorated) link form. */
.blosxomTitle {
    clear: both;
    color: black;
    font-weight: bold;
    text-decoration: none;
    margin: 15px;
    margin-bottom: 0px;
}

.blosxomTitle a {
    color: black;
    text-decoration: none;
}

/* Entry body and the elements inside it. */
.blosxomStory {
    margin: 15px;
    margin-bottom: 20px;
    margin-top: 7px;
    clear: both;
}

.blosxomStory a.permalink {
    color: #aaaaaa;
    font-size: small;
    text-decoration: none;
    font-style: italic;
}

.blosxomStory p {
    margin-top: 7px;
    margin-bottom: 7px;
}

.blosxomStory img {
    border: 0px;
    padding: 8px 8px 8px 8px;
    margin: 0px;
    float: none;
}

.blosxomStory img.left {
    float: left;
}

.blosxomStory img.right {
    float: right;
}

/* Calendar */

#blosxomCalendar {
    padding: 0px 12px 12px 12px;
}

.blosxomCalendar {
    font-size: small;
    margin-left: 0px;
    margin-right: 0px;
    width: 100%;
}

.blosxomCalendarHead {
    font-weight: bold;
}

.blosxomCalendarWeekHeader {
    color: green;
}

/* Day cells: grey by default, black links on days with posts,
   blue for the currently-selected day, green for today. */
.blosxomCalendarCell {
    color: #aaaaaa;
}

.blosxomCalendarBlogged a {
    color: #000000;
}

.blosxomCalendarSpecificDay a {
    color: blue;
}

.blosxomCalendarToday {
    color: green;
}

.blosxomCalendarToday a {
    color: green;
}
Added: webservices/admin/planet/rss09.xml.tmpl
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/rss09.xml.tmpl?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/rss09.xml.tmpl (added)
+++ webservices/admin/planet/rss09.xml.tmpl Fri Dec 9 04:21:26 2005
@@ -0,0 +1,21 @@
+<?xml version="1.0"?>
+<rdf:RDF
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns="http://my.netscape.com/rdf/simple/0.9/">
+
+<channel>
+ <title><TMPL_VAR name></title>
+ <TMPL_IF channel_description>
+ <description><TMPL_VAR channel_description></description>
+ </TMPL_IF>
+ <link>http://www.planetapache.org/</link>
+</channel>
+
+<TMPL_LOOP Items>
+<item>
+ <title><TMPL_VAR channel_name><TMPL_IF title>: <TMPL_VAR title ESCAPE="HTML"></TMPL_IF></title>
+ <link><TMPL_VAR link ESCAPE="HTML"></link>
+</item>
+</TMPL_LOOP>
+
+</rdf:RDF>
Added: webservices/admin/planet/rss09.xml.tmplc
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/rss09.xml.tmplc?rev=355462&view=auto
==============================================================================
Binary file - no diff available.
Propchange: webservices/admin/planet/rss09.xml.tmplc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: webservices/admin/planet/rss10.xml.tmpl
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/rss10.xml.tmpl?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/rss10.xml.tmpl (added)
+++ webservices/admin/planet/rss10.xml.tmpl Fri Dec 9 04:21:26 2005
@@ -0,0 +1,35 @@
+<?xml version="1.0"?>
+<rdf:RDF
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:foaf="http://xmlns.com/foaf/0.1/"
+ xmlns:content="http://purl.org/rss/1.0/modules/content/"
+ xmlns="http://purl.org/rss/1.0/"
+>
+<channel rdf:about="http://www.planetapache.org/">
+ <title><TMPL_VAR name></title>
+ <TMPL_IF channel_description>
+ <description><TMPL_VAR channel_description></description>
+ </TMPL_IF>
+ <link>http://www.planetapache.org/</link>
+
+ <items>
+ <rdf:Seq>
+<TMPL_LOOP Items>
+ <rdf:li rdf:resource="<TMPL_VAR link ESCAPE="HTML">" />
+</TMPL_LOOP>
+ </rdf:Seq>
+ </items>
+
+<TMPL_LOOP Items>
+<item rdf:about="<TMPL_VAR link ESCAPE="HTML">">
+ <title><TMPL_VAR channel_name><TMPL_IF title>: <TMPL_VAR title ESCAPE="HTML"></TMPL_IF></title>
+ <link><TMPL_VAR link ESCAPE="HTML"></link>
+ <TMPL_IF content>
+ <content:encoded><TMPL_VAR content ESCAPE="HTML"></content:encoded>
+ </TMPL_IF>
+ <dc:date><TMPL_VAR date_iso></dc:date>
+</item>
+</TMPL_LOOP>
+</channel>
+</rdf:RDF>
Added: webservices/admin/planet/rss10.xml.tmplc
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/rss10.xml.tmplc?rev=355462&view=auto
==============================================================================
Binary file - no diff available.
Propchange: webservices/admin/planet/rss10.xml.tmplc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: webservices/admin/planet/rss20.xml.tmpl
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/rss20.xml.tmpl?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/rss20.xml.tmpl (added)
+++ webservices/admin/planet/rss20.xml.tmpl Fri Dec 9 04:21:26 2005
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<rss version="2.0">
+<channel>
+ <title><TMPL_VAR name></title>
+ <link>http://www.planetapache.org/</link>
+ <TMPL_IF channel_description>
+ <description><TMPL_VAR channel_description></description>
+ </TMPL_IF>
+ <language>en</language>
+
+<TMPL_LOOP Items>
+<item>
+ <title><TMPL_VAR channel_name><TMPL_IF title>: <TMPL_VAR title ESCAPE="HTML"></TMPL_IF></title>
+ <guid isPermaLink="true"><TMPL_VAR link ESCAPE="HTML"></guid>
+ <TMPL_IF content>
+ <description><TMPL_VAR content ESCAPE="HTML"></description>
+ </TMPL_IF>
+ <pubDate><TMPL_VAR date_822></pubDate>
+</item>
+</TMPL_LOOP>
+
+</channel>
+</rss>
Added: webservices/admin/planet/rss20.xml.tmplc
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/rss20.xml.tmplc?rev=355462&view=auto
==============================================================================
Binary file - no diff available.
Propchange: webservices/admin/planet/rss20.xml.tmplc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: webservices/admin/planet/run.sh
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/run.sh?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/run.sh (added)
+++ webservices/admin/planet/run.sh Fri Dec 9 04:21:26 2005
@@ -0,0 +1,2 @@
+cd /usr/home/serendib/blogs
+/usr/local/bin/python planet.py config.ini > /dev/null 2>&1
Added: webservices/admin/planet/timeoutsocket.py
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/timeoutsocket.py?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/timeoutsocket.py (added)
+++ webservices/admin/planet/timeoutsocket.py Fri Dec 9 04:21:26 2005
@@ -0,0 +1,424 @@
+
+####
+# Copyright 2000,2001 by Timothy O'Malley <ti...@alum.mit.edu>
+#
+# All Rights Reserved
+#
+# Permission to use, copy, modify, and distribute this software
+# and its documentation for any purpose and without fee is hereby
+# granted, provided that the above copyright notice appear in all
+# copies and that both that copyright notice and this permission
+# notice appear in supporting documentation, and that the name of
+# Timothy O'Malley not be used in advertising or publicity
+# pertaining to distribution of the software without specific, written
+# prior permission.
+#
+# Timothy O'Malley DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
+# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+# AND FITNESS, IN NO EVENT SHALL Timothy O'Malley BE LIABLE FOR
+# ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+# PERFORMANCE OF THIS SOFTWARE.
+#
+####
+
+"""Timeout Socket
+
+This module enables a timeout mechanism on all TCP connections. It
+does this by inserting a shim into the socket module. After this module
+has been imported, all socket creation goes through this shim. As a
+result, every TCP connection will support a timeout.
+
+The beauty of this method is that it immediately and transparently
+enables the entire python library to support timeouts on TCP sockets.
+As an example, if you wanted to SMTP connections to have a 20 second
+timeout:
+
+ import timeoutsocket
+ import smtplib
+ timeoutsocket.setDefaultSocketTimeout(20)
+
+
+The timeout applies to the socket functions that normally block on
+execution: read, write, connect, and accept. If any of these
+operations exceeds the specified timeout, the exception Timeout
+will be raised.
+
+The default timeout value is set to None. As a result, importing
+this module does not change the default behavior of a socket. The
+timeout mechanism only activates when the timeout has been set to
+a numeric value. (This behavior mimics the behavior of the
+select.select() function.)
+
+This module implements two classes: TimeoutSocket and TimeoutFile.
+
+The TimeoutSocket class defines a socket-like object that attempts to
+avoid the condition where a socket may block indefinitely. The
+TimeoutSocket class raises a Timeout exception whenever the
+current operation delays too long.
+
+The TimeoutFile class defines a file-like object that uses the TimeoutSocket
+class. When the makefile() method of TimeoutSocket is called, it returns
+an instance of a TimeoutFile.
+
+Each of these objects adds two methods to manage the timeout value:
+
+ get_timeout() --> returns the timeout of the socket or file
+ set_timeout() --> sets the timeout of the socket or file
+
+
+As an example, one might use the timeout feature to create httplib
+connections that will timeout after 30 seconds:
+
+ import timeoutsocket
+ import httplib
+ H = httplib.HTTP("www.python.org")
+ H.sock.set_timeout(30)
+
+Note: When used in this manner, the connect() routine may still
+block because it happens before the timeout is set. To avoid
+this, use the 'timeoutsocket.setDefaultSocketTimeout()' function.
+
+Good Luck!
+
+"""
+
+__version__ = "$Revision: 1.1.1.1 $"
+__author__ = "Timothy O'Malley <ti...@alum.mit.edu>"
+
+#
+# Imports
+#
import select, string
import socket
# Grab a reference to the real socket factory before (possibly) shimming
# it in.  If this module was already imported once, socket.socket has
# been replaced and the original lives in socket._no_timeoutsocket.
if not hasattr(socket, "_no_timeoutsocket"):
    _socket = socket.socket
else:
    _socket = socket._no_timeoutsocket
+
+
+#
+# Set up constants to test for Connected and Blocking operations.
+# We delete 'os' and 'errno' to keep our namespace clean(er).
+# Thanks to Alex Martelli and G. Li for the Windows error codes.
+#
import os
if os.name == "nt":
    # Numeric Winsock error codes (presumably WSAEINVAL/WSAEISCONN for
    # "already connected" and WSAEWOULDBLOCK for "busy" -- TODO confirm
    # against the Winsock documentation).
    _IsConnected = ( 10022, 10056 )
    _ConnectBusy = ( 10035, )
    _AcceptBusy = ( 10035, )
else:
    import errno
    _IsConnected = ( errno.EISCONN, )
    _ConnectBusy = ( errno.EINPROGRESS, errno.EALREADY, errno.EWOULDBLOCK )
    _AcceptBusy = ( errno.EAGAIN, errno.EWOULDBLOCK )
    del errno
del os
+
+
+#
+# Default timeout value for ALL TimeoutSockets
+#
# Applied to every TimeoutSocket created after it is set; None means
# ordinary blocking behaviour.
_DefaultTimeout = None

def setDefaultSocketTimeout(timeout):
    """Set the timeout (seconds, or None) used for new TimeoutSockets."""
    global _DefaultTimeout
    _DefaultTimeout = timeout

def getDefaultSocketTimeout():
    """Return the timeout currently applied to new TimeoutSockets."""
    return _DefaultTimeout
+
+#
+# Exceptions for socket errors and timeouts
+#
# Ordinary socket failures keep their usual exception type.
Error = socket.error

class Timeout(Exception):
    """Raised when a socket operation exceeds its configured timeout."""
    pass
+
+
+#
+# Factory function
+#
from socket import AF_INET, SOCK_STREAM

def timeoutsocket(family=AF_INET, type=SOCK_STREAM, proto=None):
    """Drop-in replacement for socket.socket().

    Only AF_INET/SOCK_STREAM sockets are wrapped with timeout support
    (using the module-wide default timeout); anything else is handed
    straight to the real socket factory.
    """
    if family != AF_INET or type != SOCK_STREAM:
        if proto:
            return _socket(family, type, proto)
        else:
            return _socket(family, type)
    return TimeoutSocket( _socket(family, type), _DefaultTimeout )
# end timeoutsocket
+
+#
+# The TimeoutSocket class definition
+#
class TimeoutSocket:
    """TimeoutSocket object
    Implements a socket-like object that raises Timeout whenever
    an operation takes too long.
    The definition of 'too long' can be changed using the
    set_timeout() method.
    """

    # _copies counts live makefile() file objects sharing this socket;
    # _blocking mirrors the last setblocking() call (class-level defaults).
    _copies = 0
    _blocking = 1

    def __init__(self, sock, timeout):
        # Wrap the real socket 'sock'; 'timeout' is in seconds, or None
        # for ordinary blocking behaviour.
        self._sock = sock
        self._timeout = timeout
    # end __init__

    def __getattr__(self, key):
        # Delegate anything we don't override to the real socket.
        return getattr(self._sock, key)
    # end __getattr__

    def get_timeout(self):
        """Return the timeout (seconds, or None) for this socket."""
        return self._timeout
    # end set_timeout

    def set_timeout(self, timeout=None):
        """Set the timeout (seconds, or None) for this socket."""
        self._timeout = timeout
    # end set_timeout

    def setblocking(self, blocking):
        # Remember the mode: send/recv only enforce the timeout when the
        # socket is in blocking mode.
        self._blocking = blocking
        return self._sock.setblocking(blocking)
    # end set_timeout

    def connect_ex(self, addr):
        """Like connect(), but return an error code instead of raising."""
        errcode = 0
        try:
            self.connect(addr)
        except Error, why:
            errcode = why[0]
        return errcode
    # end connect_ex

    def connect(self, addr, port=None, dumbhack=None):
        """Connect to addr, raising Timeout if it takes too long.

        'dumbhack' is internal: it marks the second pass made after
        select() reports the socket writable.
        """
        # In case we were called as connect(host, port)
        if port != None: addr = (addr, port)

        # Shortcuts
        sock = self._sock
        timeout = self._timeout
        blocking = self._blocking

        # First, make a non-blocking call to connect
        try:
            sock.setblocking(0)
            sock.connect(addr)
            sock.setblocking(blocking)
            return
        except Error, why:
            # Set the socket's blocking mode back
            sock.setblocking(blocking)

            # If we are not blocking, re-raise
            if not blocking:
                raise

            # If we are already connected, then return success.
            # If we got a genuine error, re-raise it.
            errcode = why[0]
            if dumbhack and errcode in _IsConnected:
                return
            elif errcode not in _ConnectBusy:
                raise

        # Now, wait for the connect to happen
        # ONLY if dumbhack indicates this is pass number one.
        # If select raises an error, we pass it on.
        # Is this the right behavior?
        if not dumbhack:
            r,w,e = select.select([], [sock], [], timeout)
            if w:
                return self.connect(addr, dumbhack=1)

        # If we get here, then we should raise Timeout
        raise Timeout("Attempted connect to %s timed out." % str(addr) )
    # end connect

    def accept(self, dumbhack=None):
        """Accept a connection, raising Timeout if none arrives in time.

        Returns (TimeoutSocket, address) so the accepted socket inherits
        the same timeout behaviour.
        """
        # Shortcuts
        sock = self._sock
        timeout = self._timeout
        blocking = self._blocking

        # First, make a non-blocking call to accept
        # If we get a valid result, then convert the
        # accept'ed socket into a TimeoutSocket.
        # Be careful about the blocking mode of ourselves.
        try:
            sock.setblocking(0)
            newsock, addr = sock.accept()
            sock.setblocking(blocking)
            timeoutnewsock = self.__class__(newsock, timeout)
            timeoutnewsock.setblocking(blocking)
            return (timeoutnewsock, addr)
        except Error, why:
            # Set the socket's blocking mode back
            sock.setblocking(blocking)

            # If we are not supposed to block, then re-raise
            if not blocking:
                raise

            # If we got a genuine error, re-raise it.
            errcode = why[0]
            if errcode not in _AcceptBusy:
                raise

        # Now, wait for the accept to happen
        # ONLY if dumbhack indicates this is pass number one.
        # If select raises an error, we pass it on.
        # Is this the right behavior?
        if not dumbhack:
            r,w,e = select.select([sock], [], [], timeout)
            if r:
                return self.accept(dumbhack=1)

        # If we get here, then we should raise Timeout
        raise Timeout("Attempted accept timed out.")
    # end accept

    def send(self, data, flags=0):
        # Wait (up to the timeout) for the socket to become writable
        # before handing off to the real send.
        sock = self._sock
        if self._blocking:
            r,w,e = select.select([],[sock],[], self._timeout)
            if not w:
                raise Timeout("Send timed out")
        return sock.send(data, flags)
    # end send

    def recv(self, bufsize, flags=0):
        # Wait (up to the timeout) for data to become readable before
        # handing off to the real recv.
        sock = self._sock
        if self._blocking:
            r,w,e = select.select([sock], [], [], self._timeout)
            if not r:
                raise Timeout("Recv timed out")
        return sock.recv(bufsize, flags)
    # end recv

    def makefile(self, flags="r", bufsize=-1):
        # Each file-like copy bumps the reference count so close() only
        # really closes the socket when the last copy is gone.
        self._copies = self._copies +1
        return TimeoutFile(self, flags, bufsize)
    # end makefile

    def close(self):
        if self._copies <= 0:
            self._sock.close()
        else:
            self._copies = self._copies -1
    # end close

# end TimeoutSocket
+
+
class TimeoutFile:
    """TimeoutFile object
    Implements a file-like object on top of TimeoutSocket.
    """

    def __init__(self, sock, mode="r", bufsize=4096):
        # 'sock' is a TimeoutSocket; 'mode' is accepted for file-object
        # compatibility but not otherwise used.
        self._sock = sock
        self._bufsize = 4096
        if bufsize > 0: self._bufsize = bufsize
        # _inqueue buffers read-ahead data; it lives on the socket so
        # that every file copy of the same socket shares it.
        if not hasattr(sock, "_inqueue"): self._sock._inqueue = ""

    # end __init__

    def __getattr__(self, key):
        # Delegate anything we don't override to the TimeoutSocket.
        return getattr(self._sock, key)
    # end __getattr__

    def close(self):
        self._sock.close()
        self._sock = None
    # end close

    def write(self, data):
        self.send(data)
    # end write

    def read(self, size=-1):
        """Read up to 'size' bytes (all remaining data when size < 0)."""
        _sock = self._sock
        _bufsize = self._bufsize
        # Fill the shared buffer until we have 'size' bytes or EOF.
        while 1:
            datalen = len(_sock._inqueue)
            if datalen >= size >= 0:
                break
            bufsize = _bufsize
            if size > 0:
                bufsize = min(bufsize, size - datalen )
            buf = self.recv(bufsize)
            if not buf:
                break
            _sock._inqueue = _sock._inqueue + buf
        data = _sock._inqueue
        _sock._inqueue = ""
        if size > 0 and datalen > size:
            # Keep anything beyond 'size' buffered for the next read.
            _sock._inqueue = data[size:]
            data = data[:size]
        return data
    # end read

    def readline(self, size=-1):
        """Read one line (at most 'size' bytes when size > 0)."""
        _sock = self._sock
        _bufsize = self._bufsize
        # Fill the shared buffer until it holds a newline, 'size' bytes,
        # or EOF.
        while 1:
            idx = string.find(_sock._inqueue, "\n")
            if idx >= 0:
                break
            datalen = len(_sock._inqueue)
            if datalen >= size >= 0:
                break
            bufsize = _bufsize
            if size > 0:
                bufsize = min(bufsize, size - datalen )
            buf = self.recv(bufsize)
            if not buf:
                break
            _sock._inqueue = _sock._inqueue + buf

        data = _sock._inqueue
        _sock._inqueue = ""
        if idx >= 0:
            # Split just after the newline; keep the remainder buffered.
            idx = idx + 1
            _sock._inqueue = data[idx:]
            data = data[:idx]
        elif size > 0 and datalen > size:
            _sock._inqueue = data[size:]
            data = data[:size]
        return data
    # end readline

    def readlines(self, sizehint=-1):
        """Read everything and split it into newline-terminated pieces."""
        result = []
        data = self.read()
        while data:
            idx = string.find(data, "\n")
            if idx >= 0:
                idx = idx + 1
                result.append( data[:idx] )
                data = data[idx:]
            else:
                result.append( data )
                data = ""
        return result
    # end readlines

    def flush(self): pass

# end TimeoutFile
+
+
+#
+# Silently replace the socket() builtin function with
+# our timeoutsocket() definition.
+#
if not hasattr(socket, "_no_timeoutsocket"):
    # Stash the original factory so a repeated import doesn't shim twice.
    socket._no_timeoutsocket = socket.socket
    socket.socket = timeoutsocket
# Rebind this module's own 'socket' name to the factory as well.
del socket
socket = timeoutsocket
# Finis