Posted to dev@ws.apache.org by di...@apache.org on 2005/12/09 13:22:06 UTC

svn commit: r355462 [7/7] - in /webservices/admin/planet: ./ cache/ compat_logging/ examples/ fancy-examples/ output/ output/images/

Added: webservices/admin/planet/planetlib.py
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/planetlib.py?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/planetlib.py (added)
+++ webservices/admin/planet/planetlib.py Fri Dec  9 04:21:26 2005
@@ -0,0 +1,500 @@
+#!/usr/bin/env python
+"""Planet RSS Aggregator Library.
+
+You may modify the following variables:
+
+    user_agent: User-Agent sent to remote sites
+    cache_directory: Where to put cached feeds
+
+Requires Python 2.2 and Mark Pilgrim's feedparser.py.
+"""
+
+__version__ = "0.2"
+__authors__ = [ "Scott James Remnant <sc...@netsplit.com>",
+                "Jeff Waugh <jd...@perkypants.org>" ]
+__credits__ = "Originally based on spycyroll."
+__license__ = "Python"
+
+
+import os
+import re
+import time
+
+import feedparser
+import StringIO
+import sgmllib
+
+try:
+    import logging
+except ImportError:
+    import compat_logging as logging
+
+try:
+    import gzip
+except ImportError:
+    gzip = None
+
+
+# We might as well advertise ourselves when we're off gallivanting
+user_agent = " ".join(("Planet/" + __version__,
+                       "http://www.planetplanet.org/",
+                       feedparser.USER_AGENT))
+
+# Where to put cached rss feeds
+cache_directory = 'cache'
+
+# Things we don't want to see in our cache filenames
+cache_invalid_stuff = re.compile(r'\W+')
+cache_multiple_dots = re.compile(r'\.+')
+
+
+def cache_filename(url):
+    """Returns a cache filename for a URL.
+
+    Sanitises the URL given, replacing unwelcome characters with periods,
+    then prepends the configured cache directory.
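+
+    For example, a (hypothetical) feed URL "http://www.example.com/blog/rss"
+    would be cached as "cache/example.com.blog.rss".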
+    """
+    filename = url
+    filename = filename.replace("http://", "")
+    filename = filename.replace("www.", "")
+    filename = cache_invalid_stuff.sub('.', filename)
+    filename = cache_multiple_dots.sub('.', filename)
+
+    return os.path.join(cache_directory, filename)
+
+
+class Stripper(sgmllib.SGMLParser):
+    """Strip HTML tags from a string, keeping only the character data."""
+
+    def __init__(self):
+        sgmllib.SGMLParser.__init__(self)
+        self.theString = ""
+
+    def strip(self, some_html):
+        self.theString = ""
+        self.feed(some_html)
+        self.close()
+        return self.theString
+
+    def handle_data(self, data):
+        self.theString += data
+
+
+class Channel:
+    """A collection of news items.
+
+    Channel represents a feed of news from a website, or some other
+    source.
+
+    A channel is created with a URI where we can obtain the feed
+    and an optional dictionary of additional properties you wish
+    to set (probably from a config file).
+
+    Special properties:
+        offset: Hours by which the channel's times are offset from UTC.
+
+    Useful members:
+        uri: Where the feed can be downloaded from
+        etag, modified: Used to determine whether the feed has changed
+
+        title: Title for the feed, often the author's name
+        description: A description of the content of the feed
+        link: Link associated with the feed, generally the HTML version
+
+        items: List of current NewsItems
+        props: Dictionary of properties
+    """
+    def __init__(self, uri, props=None):
+        self.uri = uri
+        self.etag = None
+        self.modified = None
+
+        self.title = None
+        self.description = None
+        self.link = None
+
+        self.items = []
+
+        if props:
+            self.props = props
+        else:
+            self.props = {}
+        if self.props.has_key('offset'):
+            self.offset = float(self.props['offset'])
+        else:
+            self.offset = None
+
+        self.cache_read()
+
+    def update(self, uri=None):
+        """Update the channel.
+
+        Read feed data from channel's URI (or an alternate one) and
+        parse it using feedparser.  This, where possible, caches the
+        data and tries not to request it again if it hasn't changed.
+
+        Most of this is actually what feedparser.parse() does, but as
+        we need the unparsed data (to cache), we have to do a bit of the
+        work ourselves.
+
+        The real work is done in _update().
+        """
+        if uri is None:
+            save_uri = 1
+            uri = self.uri
+            logging.info("Updating feed <" + self.uri + ">")
+        else:
+            save_uri = 0
+            logging.info("Updating feed <" + self.uri + "> from <" + uri + ">")
+
+        # Open the resource and read the data
+        f = feedparser.open_resource(uri, agent=user_agent,
+                                     etag=self.etag, modified=self.modified)
+        data = self._read_data(f)
+
+        # Check for some obvious things
+        if hasattr(f, 'status'):
+            if f.status == 304:
+                logging.info("Feed has not changed")
+                return
+            if f.status >= 400:
+                logging.error("Update failed for <%s> (Error: %d)"
+                              % (uri, f.status))
+                return
+
+        # Update etag and modified
+        new_etag = feedparser.get_etag(f)
+        if new_etag:
+            self.etag = new_etag
+            logging.debug("E-Tag: " + self.etag)
+        new_modified = feedparser.get_modified(f)
+        if new_modified:
+            self.modified = new_modified
+            logging.debug("Modified: " + self.format_date(self.modified))
+
+        # Update URI in case of redirect
+        if hasattr(f, 'url') and save_uri:
+            self.uri = f.url
+            logging.debug("URI: <" + self.uri + ">")
+        if hasattr(f, 'headers'):
+            baseuri = f.headers.get('content-location', self.uri)
+        else:
+            baseuri = self.uri
+
+        # Parse the feed
+        f.close()
+        self._update(baseuri, data)
+
+    def _read_data(self, f):
+        """Read the data from the resource.
+
+        Attempts to gunzip the data if the Content-Encoding header claimed
+        to be gzip, but if that fails it doesn't overly worry about it.
+
+        We then take the data and try to squeeze it into a UTF-8 string
+        using Python's built-in unicode support.  If it doesn't decode as
+        UTF-8 we try ISO-8859-1 before ruthlessly stripping the bad
+        characters.
+        """
+        data = f.read()
+
+        if hasattr(f, 'headers'):
+            if gzip and f.headers.get('content-encoding', '') == 'gzip':
+                try:
+                    gzdata = gzip.GzipFile(fileobj=StringIO.StringIO(data))
+                    data = gzdata.read()
+                except:
+                    logging.warn("Feed contained invalid gzipped data",
+                                 exc_info=1)
+
+        try:
+            data = unicode(data, "utf8").encode("utf8")
+            logging.debug("Encoding: UTF-8")
+        except UnicodeError:
+            try:
+                data = unicode(data, "iso8859_1").encode("utf8")
+                logging.debug("Encoding: ISO-8859-1")
+            except UnicodeError:
+                data = unicode(data, "ascii", "replace").encode("utf8")
+                logging.warn("Feed wasn't in UTF-8 or ISO-8859-1, replaced " +
+                             "all non-ASCII characters.")
+
+        return data
+
+    def cache_read(self):
+        """Initialise the channel from the cache.
+
+        The data is read from a file in the cache_directory and parsed.
+        """
+        cache_uri = cache_filename(self.uri)
+
+        try:
+            if os.path.exists(cache_uri):
+                if os.path.exists(cache_uri + ",etag"):
+                    c = open(cache_uri + ",etag")
+                    self.etag = c.read().strip()
+                    c.close()
+
+                if os.path.exists(cache_uri + ",modified"):
+                    c = open(cache_uri + ",modified")
+                    self.modified = feedparser.parse_date(c.read().strip())
+                    c.close()
+
+                self.update(cache_uri)
+        except:
+            logging.warn("Cache read failed <" + cache_uri + ">", exc_info=1)
+
+    def cache_write(self, data):
+        """Write the unparsed feed to the cache.
+
+        The data is written as-is to a file in the cache_directory.
+        If the channel has etag or modified information, those are written
+        to files alongside.
+        """
+        cache_uri = cache_filename(self.uri)
+
+        try:
+            c = open(cache_uri, "w")
+            c.write(data)
+            c.close()
+
+            if self.etag:
+                c = open(cache_uri + ",etag", "w")
+                c.write(self.etag + "\n")
+                c.close()
+            elif os.path.exists(cache_uri + ",etag"):
+                try:
+                    os.remove(cache_uri + ",etag")
+                except OSError:
+                    pass
+
+            if self.modified:
+                c = open(cache_uri + ",modified", "w")
+                c.write(feedparser.format_http_date(self.modified) + "\n")
+                c.close()
+            elif os.path.exists(cache_uri + ",modified"):
+                try:
+                    os.remove(cache_uri + ",modified")
+                except OSError:
+                    pass
+        except:
+            logging.warn("Cache write failed <" + cache_uri + ">", exc_info=1)
+
+    def _update(self, baseuri, data):
+        """Update the channel from a parsed feed.
+
+        This is the real guts of update() and after all the fuss is
+        actually pretty simple.  We parse the feed using feedparser
+        and if we get the information, cache it.
+        """
+        feed = feedparser.FeedParser(baseuri)
+        feed.feed(data)
+
+        if len(feed.items) < 1:
+            logging.info("Empty feed, cowardly not updating %s" % (baseuri))
+            return
+
+        new_items = []
+        for item in feed.items:
+            new_items.append(NewsItem(item, self))
+            if abs(new_items[-1].date[0] - time.gmtime()[0]) > 1:
+                logging.warning(("Obviously bogus year in feed (%d), " +
+                                 "cowardly not updating")
+                                % (new_items[-1].date[0],))
+                return
+
+        self.items = new_items
+        self.title = feed.channel.get('title', '')
+        self.description = feed.channel.get('description', '')
+        self.link = feed.channel.get('link', '')
+
+        self.cache_write(data)
+        return self.items
+
+    def utctime(self, date):
+        """Return UTC time() for given date.
+
+        Returns the equivalent of time() for the given date, but taking
+        into account local timezone or any forced offset for the channel.
+
+        This is suitable for using in a call to gmtime() only.
+        """
+        offset = time.timezone
+        if self.offset is not None:
+            # self.offset is the difference from UTC in hours; add the
+            # local timezone so the mktime() below still comes out in UTC
+            offset = self.offset * 3600 + time.timezone
+
+        return time.mktime(date) - offset
+
+    def format_date(self, date, fmt=None):
+        """Formats a date for output.
+
+        Outputs the UTC date, taking into account any forced offset for the
+        channel.
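+
+        With fmt='iso' this produces stamps like "2005-12-09T04:21:26+00:00",
+        and with fmt='rfc822', "Fri, 09 Dec 2005 04:21:26 +0000".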
+        """
+        if fmt == 'iso':
+            fmt = "%Y-%m-%dT%H:%M:%S+00:00"
+        elif fmt == 'rfc822':
+            fmt = "%a, %d %b %Y %H:%M:%S +0000"
+        elif fmt is None:
+            fmt = "%B %d, %Y %I:%M %p"
+
+        return time.strftime(fmt, time.gmtime(self.utctime(date)))
+
+
+class NewsItem:
+    """A single item of news.
+
+    NewsItem represents a single item of news from a channel.  They
+    are created and owned by the Channel and accessible through
+    Channel().items.
+
+    Useful members:
+        id: Unique identifier for the item (often a URI)
+        date: Date item was last modified
+
+        title: Title of the item
+        summary: Summary of the content for the first page
+        content: Content of the item
+        link: Link associated with the item, generally the HTML version
+        creator: Person who created the item
+
+        channel: Channel this NewsItem belongs to
+    """
+    def __init__(self, dict, channel):
+        self.channel = channel
+
+        self.link = dict.get('link', '')
+        self.id = dict.get('id', self.link)
+
+        self.title = dict.get('title', '')
+        self.summary = dict.get('summary', '')
+        if 'content' in dict and len(dict['content']):
+            self.content = dict['content'][0]['value']
+        elif 'description' in dict:
+            self.content = dict['description']
+        else:
+            self.content = ''
+
+        if type(self.content) != type(""):
+            # feedparser handed us a unicode object; UTF-8 can represent
+            # any unicode string, so this conversion cannot fail
+            self.content = self.content.encode("utf8")
+
+        self.date = dict.get('modified_parsed')
+        if self.date is None or self.date[3:6] == (0, 0, 0):
+            self.date = self._cached_time()
+
+        self.creator = dict.get('creator', '')
+
+    def _cached_time(self):
+        """Retrieve or save a cached time for this entry.
+
+        Sometimes entries lack any date or time information, and
+        sometimes they just lack time information.  The trouble is
+        we need both to be able to put them in the right place in the
+        output.
+
+        This is the solution (for both).  When we find no date, or one
+        that lands at exactly midnight (as if!), we grovel around inside
+        a cache file to see whether we've recorded anything for it so
+        far.  If we have, we use that; otherwise we use the current UTC
+        time and save it (along with the rest) into the cache file for
+        use next time.
+
+        Truly midnight dates will sneak a bit forward, but that's not
+        a great loss.
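+
+        Each line in the cache file pairs an item id with an ISO 8601
+        time, e.g. (hypothetical id):
+
+            http://example.com/blog/entry-1 = 2005-12-09T04:21:26+00:00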
+        """
+        time_cache_uri = cache_filename(self.channel.uri) + ",times"
+        time_cache = {}
+
+        if os.path.exists(time_cache_uri):
+            try:
+                c = open(time_cache_uri)
+                for line in c.readlines():
+                    id, timestr = line.strip().split(" = ", 1)
+                    time_cache[id] = feedparser.parse_date(timestr)
+                c.close()
+
+                if time_cache.has_key(self.id):
+                    return time_cache[self.id]
+            except:
+                logging.warn("Time cache read failed <" + time_cache_uri + ">",
+                             exc_info = 1)
+
+        time_cache[self.id] = time.gmtime()
+        fmt_time = self.channel.format_date(time_cache[self.id], 'iso')
+
+        # Make sure we don't move the entry *too far*
+        if self.date is not None:
+            orig_time = self.channel.utctime(self.date)
+            this_time = self.channel.utctime(time_cache[self.id])
+            if abs(this_time - orig_time) > 86400:
+                return self.date
+
+        try:
+            c = open(time_cache_uri, "a")
+            c.write("%s = %s\n" % (self.id, fmt_time))
+            c.close()
+        except:
+            logging.warn("Time cache write failed <" + time_cache_uri + ">",
+                         exc_info=1)
+
+        return time_cache[self.id]
+
+
+class Planet:
+    """A collection of channels.
+
+    Planet represents an aggregated set of channels, easing their
+    management and allowing you to retrieve the items directly in
+    descending date order.
+
+    Once a planet is created you subscribe new Channels to it.  You
+    can then obtain a list of subscribed channels through the
+    channels() member function and a list of items through the items()
+    function (bypassing the Channel level).
+
+    What you do with a Planet is up to you.
+    """
+    def __init__(self):
+        self._channels = []
+        self._items = None
+
+    def subscribe(self, channel):
+        """Subscribe the Planet to a Channel."""
+        self._channels.append(channel)
+        self._items = None
+
+    def unsubscribe(self, channel):
+        """Unsubscribe the Planet from a Channel."""
+        self._channels.remove(channel)
+        self._items = None
+
+    def channels(self):
+        """Retrieve the currently subscribed channels.
+
+        Returns a list of all the Channels this planet is subscribed to.
+        """
+        return list(self._channels)
+
+    def items(self):
+        """Retrieve the items in date order.
+
+        Returns all items in descending date order (most recent first).
+        """
+        if self._items is not None:
+            return self._items
+
+        self._items = []
+        for channel in self._channels:
+            for item in channel.items:
+                if item.date is not None:
+                    self._items.append(item)
+
+        self._items.sort(lambda x,y: cmp(y.channel.utctime(y.date),
+                                         x.channel.utctime(x.date)))
+        return list(self._items)
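
The library above is self-contained enough to drive from a few lines of
Python.  A minimal sketch of the intended flow, assuming feedparser is
importable and using made-up feed URLs:

    import planetlib

    planetlib.cache_directory = "cache"     # where fetched feeds are kept

    planet = planetlib.Planet()
    for uri in ("http://www.example.org/blog/rss",
                "http://feeds.example.net/news.xml"):
        channel = planetlib.Channel(uri)    # reads any cached copy first
        channel.update()                    # fetch and parse the live feed
        planet.subscribe(channel)

    # Newest items first, across all subscribed channels
    for item in planet.items()[:10]:
        print item.channel.title, "-", item.title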

Added: webservices/admin/planet/planetlib.pyc
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/planetlib.pyc?rev=355462&view=auto
==============================================================================
Binary file - no diff available.

Propchange: webservices/admin/planet/planetlib.pyc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: webservices/admin/planet/pyblosxom.css
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/pyblosxom.css?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/pyblosxom.css (added)
+++ webservices/admin/planet/pyblosxom.css Fri Dec  9 04:21:26 2005
@@ -0,0 +1,110 @@
+/* Standard Blosxom Bits */
+
+.blosxomDate {
+	color: white;
+	background-color: black;
+	font-weight: bold;
+	padding: 8px 5px 8px 5px;
+	clear: both;
+}
+
+.blosxomDate a {
+	color: white;
+	font-weight: bold;
+	text-decoration: none;
+	margin-right: 0.5ex;
+}
+
+.blosxomDateLink {
+	float: right;
+	border: 0px;
+}
+
+.blosxomTitle {
+	clear: both;
+	color: black;
+	font-weight: bold;
+	text-decoration: none;
+	margin: 15px;
+	margin-bottom: 0px;
+}
+
+.blosxomTitle a {
+	color: black;
+	text-decoration: none;
+}
+
+.blosxomStory {
+	margin: 15px;
+	margin-bottom: 20px;
+	margin-top: 7px;
+	clear: both;
+}
+
+.blosxomStory a.permalink {
+	color: #aaaaaa;
+	font-size: small;
+	text-decoration: none;
+	font-style: italic;
+}
+
+.blosxomStory p {
+	margin-top: 7px;
+	margin-bottom: 7px;
+}
+
+.blosxomStory img {
+	border: 0px;
+	padding: 8px 8px 8px 8px;
+	margin: 0px;
+	float: none;
+}
+
+.blosxomStory img.left {
+	float: left;
+}
+
+.blosxomStory img.right {
+	float: right;
+}
+
+/* Calendar */
+
+#blosxomCalendar {
+	padding: 0px 12px 12px 12px;
+}
+
+.blosxomCalendar {
+	font-size: small;
+	margin-left: 0px;
+	margin-right: 0px;
+	width: 100%;
+}
+
+.blosxomCalendarHead {
+	font-weight: bold;
+}
+
+.blosxomCalendarWeekHeader {
+	color: green;
+}
+
+.blosxomCalendarCell {
+	color: #aaaaaa;
+}
+
+.blosxomCalendarBlogged a {
+	color: #000000;
+}
+
+.blosxomCalendarSpecificDay a {
+	color: blue;
+}
+
+.blosxomCalendarToday {
+	color: green;
+}
+
+.blosxomCalendarToday a {
+	color: green;
+}

Added: webservices/admin/planet/rss09.xml.tmpl
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/rss09.xml.tmpl?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/rss09.xml.tmpl (added)
+++ webservices/admin/planet/rss09.xml.tmpl Fri Dec  9 04:21:26 2005
@@ -0,0 +1,21 @@
+<?xml version="1.0"?>
+<rdf:RDF
+	xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+	xmlns="http://my.netscape.com/rdf/simple/0.9/">
+
+<channel>
+	<title><TMPL_VAR name></title>
+	<TMPL_IF channel_description>
+	<description><TMPL_VAR channel_description></description>
+	</TMPL_IF>
+	<link>http://www.planetapache.org/</link>
+</channel>
+
+<TMPL_LOOP Items>
+<item>
+	<title><TMPL_VAR channel_name><TMPL_IF title>: <TMPL_VAR title ESCAPE="HTML"></TMPL_IF></title>
+	<link><TMPL_VAR link ESCAPE="HTML"></link>
+</item>
+</TMPL_LOOP>
+
+</rdf:RDF>

Added: webservices/admin/planet/rss09.xml.tmplc
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/rss09.xml.tmplc?rev=355462&view=auto
==============================================================================
Binary file - no diff available.

Propchange: webservices/admin/planet/rss09.xml.tmplc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: webservices/admin/planet/rss10.xml.tmpl
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/rss10.xml.tmpl?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/rss10.xml.tmpl (added)
+++ webservices/admin/planet/rss10.xml.tmpl Fri Dec  9 04:21:26 2005
@@ -0,0 +1,35 @@
+<?xml version="1.0"?>
+<rdf:RDF
+	xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+	xmlns:dc="http://purl.org/dc/elements/1.1/"
+	xmlns:foaf="http://xmlns.com/foaf/0.1/"
+	xmlns:content="http://purl.org/rss/1.0/modules/content/"
+	xmlns="http://purl.org/rss/1.0/"
+>
+<channel rdf:about="http://www.planetapache.org/">
+	<title><TMPL_VAR name></title>
+	<TMPL_IF channel_description>
+	<description><TMPL_VAR channel_description></description>
+	</TMPL_IF>
+	<link>http://www.planetapache.org/</link>
+
+	<items>
+		<rdf:Seq>
+<TMPL_LOOP Items>
+			<rdf:li rdf:resource="<TMPL_VAR link ESCAPE="HTML">" />
+</TMPL_LOOP>
+		</rdf:Seq>
+	</items>
+
+<TMPL_LOOP Items>
+<item rdf:about="<TMPL_VAR link ESCAPE="HTML">">
+	<title><TMPL_VAR channel_name><TMPL_IF title>: <TMPL_VAR title ESCAPE="HTML"></TMPL_IF></title>
+	<link><TMPL_VAR link ESCAPE="HTML"></link>
+	<TMPL_IF content>
+	<content:encoded><TMPL_VAR content ESCAPE="HTML"></content:encoded>
+	</TMPL_IF>
+	<dc:date><TMPL_VAR date_iso></dc:date>
+</item>
+</TMPL_LOOP>
+</channel>
+</rdf:RDF>

Added: webservices/admin/planet/rss10.xml.tmplc
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/rss10.xml.tmplc?rev=355462&view=auto
==============================================================================
Binary file - no diff available.

Propchange: webservices/admin/planet/rss10.xml.tmplc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: webservices/admin/planet/rss20.xml.tmpl
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/rss20.xml.tmpl?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/rss20.xml.tmpl (added)
+++ webservices/admin/planet/rss20.xml.tmpl Fri Dec  9 04:21:26 2005
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<rss version="2.0">
+<channel>
+	<title><TMPL_VAR name></title>
+	<link>http://www.planetapache.org/</link>
+	<TMPL_IF channel_description>
+	<description><TMPL_VAR channel_description></description>
+	</TMPL_IF>
+	<language>en</language>
+
+<TMPL_LOOP Items>
+<item>
+	<title><TMPL_VAR channel_name><TMPL_IF title>: <TMPL_VAR title ESCAPE="HTML"></TMPL_IF></title>
+	<guid isPermaLink="true"><TMPL_VAR link ESCAPE="HTML"></guid>
+	<TMPL_IF content>
+	<description><TMPL_VAR content ESCAPE="HTML"></description>
+	</TMPL_IF>
+	<pubDate><TMPL_VAR date_822></pubDate>
+</item>
+</TMPL_LOOP>
+
+</channel>
+</rss>

Added: webservices/admin/planet/rss20.xml.tmplc
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/rss20.xml.tmplc?rev=355462&view=auto
==============================================================================
Binary file - no diff available.

Propchange: webservices/admin/planet/rss20.xml.tmplc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: webservices/admin/planet/run.sh
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/run.sh?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/run.sh (added)
+++ webservices/admin/planet/run.sh Fri Dec  9 04:21:26 2005
@@ -0,0 +1,2 @@
+cd /usr/home/serendib/blogs
+/usr/local/bin/python planet.py config.ini > /dev/null 2>&1

Added: webservices/admin/planet/timeoutsocket.py
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/timeoutsocket.py?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/timeoutsocket.py (added)
+++ webservices/admin/planet/timeoutsocket.py Fri Dec  9 04:21:26 2005
@@ -0,0 +1,424 @@
+
+####
+# Copyright 2000,2001 by Timothy O'Malley <ti...@alum.mit.edu>
+# 
+#                All Rights Reserved
+# 
+# Permission to use, copy, modify, and distribute this software
+# and its documentation for any purpose and without fee is hereby
+# granted, provided that the above copyright notice appear in all
+# copies and that both that copyright notice and this permission
+# notice appear in supporting documentation, and that the name of
+# Timothy O'Malley  not be used in advertising or publicity
+# pertaining to distribution of the software without specific, written
+# prior permission. 
+# 
+# Timothy O'Malley DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
+# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+# AND FITNESS, IN NO EVENT SHALL Timothy O'Malley BE LIABLE FOR
+# ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+# PERFORMANCE OF THIS SOFTWARE. 
+#
+####
+
+"""Timeout Socket
+
+This module enables a timeout mechanism on all TCP connections.  It
+does this by inserting a shim into the socket module.  After this module
+has been imported, all socket creation goes through this shim.  As a
+result, every TCP connection will support a timeout.
+
+The beauty of this method is that it immediately and transparently
+enables the entire python library to support timeouts on TCP sockets.
+As an example, if you wanted SMTP connections to have a 20 second
+timeout:
+
+    import timeoutsocket
+    import smtplib
+    timeoutsocket.setDefaultSocketTimeout(20)
+
+
+The timeout applies to the socket functions that normally block on
+execution:  read, write, connect, and accept.  If any of these 
+operations exceeds the specified timeout, the exception Timeout
+will be raised.
+
+The default timeout value is set to None.  As a result, importing
+this module does not change the default behavior of a socket.  The
+timeout mechanism only activates when the timeout has been set to
+a numeric value.  (This behavior mimics the behavior of the
+select.select() function.)
+
+This module implements two classes: TimeoutSocket and TimeoutFile.
+
+The TimeoutSocket class defines a socket-like object that attempts to
+avoid the condition where a socket may block indefinitely.  The
+TimeoutSocket class raises a Timeout exception whenever the
+current operation delays too long. 
+
+The TimeoutFile class defines a file-like object that uses the TimeoutSocket
+class.  When the makefile() method of TimeoutSocket is called, it returns
+an instance of a TimeoutFile.
+
+Each of these objects adds two methods to manage the timeout value:
+
+    get_timeout()   -->  returns the timeout of the socket or file
+    set_timeout()   -->  sets the timeout of the socket or file
+
+
+As an example, one might use the timeout feature to create httplib
+connections that will timeout after 30 seconds:
+
+    import timeoutsocket
+    import httplib
+    H = httplib.HTTP("www.python.org")
+    H.sock.set_timeout(30)
+
+Note:  When used in this manner, the connect() routine may still
+block because it happens before the timeout is set.  To avoid
+this, use the 'timeoutsocket.setDefaultSocketTimeout()' function.
+
+Good Luck!
+
+"""
+
+__version__ = "$Revision: 1.1.1.1 $"
+__author__  = "Timothy O'Malley <ti...@alum.mit.edu>"
+
+#
+# Imports
+#
+import select, string
+import socket
+if not hasattr(socket, "_no_timeoutsocket"):
+    _socket = socket.socket
+else:
+    _socket = socket._no_timeoutsocket
+
+
+#
+# Set up constants to test for Connected and Blocking operations.
+# We delete 'os' and 'errno' to keep our namespace clean(er).
+# Thanks to Alex Martelli and G. Li for the Windows error codes.
+#
+import os
+if os.name == "nt":
+    _IsConnected = ( 10022, 10056 )
+    _ConnectBusy = ( 10035, )
+    _AcceptBusy  = ( 10035, )
+else:
+    import errno
+    _IsConnected = ( errno.EISCONN, )
+    _ConnectBusy = ( errno.EINPROGRESS, errno.EALREADY, errno.EWOULDBLOCK )
+    _AcceptBusy  = ( errno.EAGAIN, errno.EWOULDBLOCK )
+    del errno
+del os
+
+
+#
+# Default timeout value for ALL TimeoutSockets
+#
+_DefaultTimeout = None
+def setDefaultSocketTimeout(timeout):
+    global _DefaultTimeout
+    _DefaultTimeout = timeout
+def getDefaultSocketTimeout():
+    return _DefaultTimeout
+
+#
+# Exceptions for socket errors and timeouts
+#
+Error = socket.error
+class Timeout(Exception):
+    pass
+
+
+#
+# Factory function
+#
+from socket import AF_INET, SOCK_STREAM
+def timeoutsocket(family=AF_INET, type=SOCK_STREAM, proto=None):
+    if family != AF_INET or type != SOCK_STREAM:
+        if proto:
+            return _socket(family, type, proto)
+        else:
+            return _socket(family, type)
+    return TimeoutSocket( _socket(family, type), _DefaultTimeout )
+# end timeoutsocket
+
+#
+# The TimeoutSocket class definition
+#
+class TimeoutSocket:
+    """TimeoutSocket object
+    Implements a socket-like object that raises Timeout whenever
+    an operation takes too long.
+    The definition of 'too long' can be changed using the
+    set_timeout() method.
+    """
+
+    _copies = 0
+    _blocking = 1
+    
+    def __init__(self, sock, timeout):
+        self._sock     = sock
+        self._timeout  = timeout
+    # end __init__
+
+    def __getattr__(self, key):
+        return getattr(self._sock, key)
+    # end __getattr__
+
+    def get_timeout(self):
+        return self._timeout
+    # end get_timeout
+
+    def set_timeout(self, timeout=None):
+        self._timeout = timeout
+    # end set_timeout
+
+    def setblocking(self, blocking):
+        self._blocking = blocking
+        return self._sock.setblocking(blocking)
+    # end setblocking
+
+    def connect_ex(self, addr):
+        errcode = 0
+        try:
+            self.connect(addr)
+        except Error, why:
+            errcode = why[0]
+        return errcode
+    # end connect_ex
+        
+    def connect(self, addr, port=None, dumbhack=None):
+        # In case we were called as connect(host, port)
+        if port is not None: addr = (addr, port)
+
+        # Shortcuts
+        sock    = self._sock
+        timeout = self._timeout
+        blocking = self._blocking
+
+        # First, make a non-blocking call to connect
+        try:
+            sock.setblocking(0)
+            sock.connect(addr)
+            sock.setblocking(blocking)
+            return
+        except Error, why:
+            # Set the socket's blocking mode back
+            sock.setblocking(blocking)
+            
+            # If we are not blocking, re-raise
+            if not blocking:
+                raise
+            
+            # If we are already connected, then return success.
+            # If we got a genuine error, re-raise it.
+            errcode = why[0]
+            if dumbhack and errcode in _IsConnected:
+                return
+            elif errcode not in _ConnectBusy:
+                raise
+            
+        # Now, wait for the connect to happen
+        # ONLY if dumbhack indicates this is pass number one.
+        #   If select raises an error, we pass it on.
+        #   Is this the right behavior?
+        if not dumbhack:
+            r,w,e = select.select([], [sock], [], timeout)
+            if w:
+                return self.connect(addr, dumbhack=1)
+
+        # If we get here, then we should raise Timeout
+        raise Timeout("Attempted connect to %s timed out." % str(addr) )
+    # end connect
+
+    def accept(self, dumbhack=None):
+        # Shortcuts
+        sock     = self._sock
+        timeout  = self._timeout
+        blocking = self._blocking
+
+        # First, make a non-blocking call to accept
+        #  If we get a valid result, then convert the
+        #  accept'ed socket into a TimeoutSocket.
+        # Be careful about our own blocking mode.
+        try:
+            sock.setblocking(0)
+            newsock, addr = sock.accept()
+            sock.setblocking(blocking)
+            timeoutnewsock = self.__class__(newsock, timeout)
+            timeoutnewsock.setblocking(blocking)
+            return (timeoutnewsock, addr)
+        except Error, why:
+            # Set the socket's blocking mode back
+            sock.setblocking(blocking)
+
+            # If we are not supposed to block, then re-raise
+            if not blocking:
+                raise
+            
+            # If we got a genuine error, re-raise it.
+            errcode = why[0]
+            if errcode not in _AcceptBusy:
+                raise
+            
+        # Now, wait for the accept to happen
+        # ONLY if dumbhack indicates this is pass number one.
+        #   If select raises an error, we pass it on.
+        #   Is this the right behavior?
+        if not dumbhack:
+            r,w,e = select.select([sock], [], [], timeout)
+            if r:
+                return self.accept(dumbhack=1)
+
+        # If we get here, then we should raise Timeout
+        raise Timeout("Attempted accept timed out.")
+    # end accept
+
+    def send(self, data, flags=0):
+        sock = self._sock
+        if self._blocking:
+            r,w,e = select.select([],[sock],[], self._timeout)
+            if not w:
+                raise Timeout("Send timed out")
+        return sock.send(data, flags)
+    # end send
+
+    def recv(self, bufsize, flags=0):
+        sock = self._sock
+        if self._blocking:
+            r,w,e = select.select([sock], [], [], self._timeout)
+            if not r:
+                raise Timeout("Recv timed out")
+        return sock.recv(bufsize, flags)
+    # end recv
+
+    def makefile(self, flags="r", bufsize=-1):
+        self._copies = self._copies +1
+        return TimeoutFile(self, flags, bufsize)
+    # end makefile
+
+    def close(self):
+        if self._copies <= 0:
+            self._sock.close()
+        else:
+            self._copies = self._copies -1
+    # end close
+
+# end TimeoutSocket
+
+
+class TimeoutFile:
+    """TimeoutFile object
+    Implements a file-like object on top of TimeoutSocket.
+    """
+    
+    def __init__(self, sock, mode="r", bufsize=4096):
+        self._sock          = sock
+        self._bufsize       = 4096
+        if bufsize > 0: self._bufsize = bufsize
+        if not hasattr(sock, "_inqueue"): self._sock._inqueue = ""
+
+    # end __init__
+
+    def __getattr__(self, key):
+        return getattr(self._sock, key)
+    # end __getattr__
+
+    def close(self):
+        self._sock.close()
+        self._sock = None
+    # end close
+    
+    def write(self, data):
+        self.send(data)
+    # end write
+
+    def read(self, size=-1):
+        _sock = self._sock
+        _bufsize = self._bufsize
+        while 1:
+            datalen = len(_sock._inqueue)
+            if datalen >= size >= 0:
+                break
+            bufsize = _bufsize
+            if size > 0:
+                bufsize = min(bufsize, size - datalen )
+            buf = self.recv(bufsize)
+            if not buf:
+                break
+            _sock._inqueue = _sock._inqueue + buf
+        data = _sock._inqueue
+        _sock._inqueue = ""
+        if size > 0 and datalen > size:
+            _sock._inqueue = data[size:]
+            data = data[:size]
+        return data
+    # end read
+
+    def readline(self, size=-1):
+        _sock = self._sock
+        _bufsize = self._bufsize
+        while 1:
+            idx = string.find(_sock._inqueue, "\n")
+            if idx >= 0:
+                break
+            datalen = len(_sock._inqueue)
+            if datalen >= size >= 0:
+                break
+            bufsize = _bufsize
+            if size > 0:
+                bufsize = min(bufsize, size - datalen )
+            buf = self.recv(bufsize)
+            if not buf:
+                break
+            _sock._inqueue = _sock._inqueue + buf
+
+        data = _sock._inqueue
+        _sock._inqueue = ""
+        if idx >= 0:
+            idx = idx + 1
+            _sock._inqueue = data[idx:]
+            data = data[:idx]
+        elif size > 0 and datalen > size:
+            _sock._inqueue = data[size:]
+            data = data[:size]
+        return data
+    # end readline
+
+    def readlines(self, sizehint=-1):
+        result = []
+        data = self.read()
+        while data:
+            idx = string.find(data, "\n")
+            if idx >= 0:
+                idx = idx + 1
+                result.append( data[:idx] )
+                data = data[idx:]
+            else:
+                result.append( data )
+                data = ""
+        return result
+    # end readlines
+
+    def flush(self):  pass
+
+# end TimeoutFile
+
+
+#
+# Silently replace the socket() builtin function with
+# our timeoutsocket() definition.
+#
+if not hasattr(socket, "_no_timeoutsocket"):
+    socket._no_timeoutsocket = socket.socket
+    socket.socket = timeoutsocket
+del socket
+socket = timeoutsocket
+# Finis
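
Because importing timeoutsocket silently swaps its shim in for
socket.socket, a global timeout plus the Timeout exception is all a
caller needs.  A minimal sketch (the URL is made up):

    import timeoutsocket
    import urllib

    # Every TCP socket created from here on times out after 20 seconds.
    timeoutsocket.setDefaultSocketTimeout(20)

    try:
        data = urllib.urlopen("http://www.example.org/feed.xml").read()
    except timeoutsocket.Timeout:
        print "Fetch timed out, skipping"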