You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@ws.apache.org by di...@apache.org on 2005/12/09 13:22:06 UTC
svn commit: r355462 [5/7] - in /webservices/admin/planet: ./ cache/
compat_logging/ examples/ fancy-examples/ output/ output/images/
Added: webservices/admin/planet/default.css
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/default.css?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/default.css (added)
+++ webservices/admin/planet/default.css Fri Dec 9 04:21:26 2005
@@ -0,0 +1,233 @@
+/*
+ * Written by Stefano Mazzocchi <stefano at apache dot org>
+ */
+
+/* ----------------------------- Global Definitions -------------------- */
+
+body {
+ margin: 0px;
+ padding: 0px;
+ font-family: "georgia", "times", "times new roman", serif;
+ color: #222;
+ background-color: #fff;
+ font-size: 12px;
+ quotes: "\201C" "\201E" "\2018" "\2019";
+}
+
+a:link {
+ color: #222;
+}
+
+a:visited {
+ color: #555;
+}
+
+a:hover {
+ color: #000;
+}
+
+a:active {
+}
+
+a:focus {
+}
+
+h1 {
+ font-family: "georgia", "times", "times new roman", serif;
+ font-size: 28px;
+ text-transform: uppercase;
+ letter-spacing: 0.25em;
+ padding: 10px;
+ margin: 0px 0px 0px 0px;
+ color: #000;
+ font-weight: normal;
+ background-color: #eee;
+ border-bottom: 2px solid #bbb
+}
+
+/* ----------------------------- Sidebar -------------------------- */
+
+#sidebar {
+ float: right;
+ top: 150px;
+ right: 0px;
+ width: 210px;
+ background-color: white;
+
+ padding: 0px 0px 20px 0px;
+ margin: 0px 0px 20px 20px;
+ border-left: 1px solid #ccc;
+ border-bottom: 1px solid #ccc;
+}
+
+#sidebar h2 {
+ font-family: "verdana", "helvetica", "arial", sans-serif;
+ letter-spacing: 0.15em;
+ text-transform: uppercase;
+ font-size: 11px;
+ color: #666;
+ font-weight: normal;
+ padding: 2px 0px 2px 4px;
+ margin: 15px 0px 5px 10px;
+ border-top: 1px solid #ccc;
+ border-left: 1px solid #ccc;
+ border-bottom: 1px solid #ccc;
+}
+
+#sidebar p {
+ font-family: "verdana", "helvetica", "arial", sans-serif;
+ font-size: 10px;
+ padding-left: 20px;
+ padding-right: 5px;
+}
+
+#sidebar ul {
+ margin: 5px;
+ padding-left: 25px;
+}
+
+#sidebar li {
+ margin-left: 0px;
+ list-style-type: none;
+ font-family: "verdana", "helvetica", "arial", sans-serif;
+ font-size: 10px;
+}
+
+/* ----------------------------- Body ---------------------------- */
+
+#body {
+ padding: 10px;
+}
+
+#body h2 {
+ font-family: "georgia", "times", "times new roman", serif;
+ font-size: 14px;
+ text-transform: none;
+ margin: 0px 0px 0px 0px;
+ color: #333;
+ font-weight: bold;
+ text-align: right;
+ border-top: 1px solid #ccc;
+ background-color: #eee;
+ border-bottom: 1px solid #ccc;
+ padding: 1px 5px 1px 5px;
+ margin: 10px -10px 0px -10px;
+}
+
+/* ----------------------------- News ---------------------------- */
+
+.news {
+ margin: 30px 10px 30px 10px;
+}
+
+.news h3 {
+ margin: 0px;
+ padding: 0px;
+}
+
+.news .content {
+ margin: 5px 5px 5px 15px;
+ padding: 0px 5px 0px 5px;
+ border-left: 1px solid #ccc;
+}
+
+.news .links {
+
+}
+
+.news .permalink {
+ text-align: right;
+}
+
+/* ----------------------------- News Content ---------------------------- */
+
+.news .content p {
+ font-size: 12px;
+ line-height: 16px;
+}
+
+.news .content img {
+ margin: 5px;
+}
+
+.news .content blockquote {
+ margin: 10px 35px 10px 35px;
+ padding: 5px;
+}
+
+.news .content pre {
+ font-family: monospace;
+ border: 1px solid #ddd;
+ padding: 10px;
+ margin: 10px 20px 10px 20px;
+ background-color: #f8f8f8;
+}
+
+.news .content ul, .news .content ol {
+ margin: 5px 35px 5px 35px;
+ padding: 5px;
+ counter-reset: item;
+}
+
+.news .content ul > ul, .news .content ul > ol, .news .content ol > ul, .news .content ol > ol {
+ margin: 0px 0px 0px 35px;
+ padding: 0px;
+}
+
+.news .content li {
+ padding: 1px;
+ font-size: 12px;
+ line-height: 16px;
+}
+
+.news .content a {
+ text-decoration: none;
+ color: #000;
+ border-bottom: 1px dotted #777;
+ margin: 0px 2px 0px 2px;
+ padding: 1px 1px 1px 1px;
+}
+
+.news .content a:hover {
+ border: 1px dotted #000;
+ background-color: #eee;
+ padding: 1px 2px 1px 2px;
+ margin: 0px;
+}
+
+.news .content a:active {
+ background-color: #ccc !important;
+ position: relative;
+ top: 1px;
+ left: 1px;
+ padding: 1px 2px 1px 2px;
+ margin: 0px;
+}
+
+.news .content a:focus {
+ border: 1px solid #fff !important;
+ background-color: #ccc !important;
+ padding: 1px 2px 1px 2px;
+ margin: 0px;
+}
+
+/* ----------------------------- Footer ---------------------------- */
+
+#footer {
+ padding: 0px;
+ margin: 30px 0px 50px 50px;
+}
+
+#footer p {
+ padding: 2px 2px 2px 5px;
+ background-color: #ccc;
+ border-top: 1px solid #aaa;
+ border-bottom: 1px solid #aaa;
+ border-left: 1px solid #aaa;
+ font-family: "verdana", "helvetica", "arial", sans-serif;
+ letter-spacing: 0.15em;
+ text-transform: uppercase;
+ font-size: 12px;
+ text-align: left;
+}
+
Added: webservices/admin/planet/examples/config.ini
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/examples/config.ini?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/examples/config.ini (added)
+++ webservices/admin/planet/examples/config.ini Fri Dec 9 04:21:26 2005
@@ -0,0 +1,81 @@
+# Planet configuration file
+
+# Every planet needs a [Planet] section
+[Planet]
+# name: Your planet's name
+# link: Link to the main page
+# owner_name: Your name
+# owner_email: Your e-mail address
+name = Planet Zog
+link = http://www.planet.zog/
+owner_name = Zig The Alien
+owner_email = zig@planet.zog
+
+# cache_directory: Where cached feeds are stored
+# log_level: One of DEBUG, INFO, WARNING, ERROR or CRITICAL
+cache_directory = cache
+log_level = DEBUG
+
+# template_files: Space-separated list of output template files
+template_files = examples/index.html.tmpl examples/rss20.xml.tmpl examples/rss10.xml.tmpl examples/opml.xml.tmpl examples/foafroll.xml.tmpl
+
+# The following provide defaults for each template:
+# output_dir: Directory to place output files
+# items_per_page: How many items to put on each page
+# days_per_page: How many complete days of posts to put on each page
+# This is the absolute, hard limit (over the item limit)
+# date_format: strftime format for the default 'date' template variable
+output_dir = output
+items_per_page = 60
+days_per_page = 0
+date_format = %B %d, %Y %I:%M %p
+
+# To define a different value for a particular template you may create
+# a section with the same name as the template file's filename (as given
+# in template_files).
+#
+# [examples/rss10.xml.tmpl]
+# items_per_page = 30
+
+
+# Any other section defines a feed to subscribe to. The section title
+# (in the []s) is the URI of the feed itself. A section can also be
+# have any of the following options:
+#
+# name: Name of the feed (defaults to the title found in the feed)
+# offset: Number of hours (+ or -) the feed's times tend to be out
+#
+# Additionally any other option placed here will be available in
+# the template (prefixed with channel_ for the Items loop). You can
+# define defaults for these in a [DEFAULT] section, for example
+# Planet Debian uses the following to define faces:
+#
+# [DEFAULT]
+# facewidth = 64
+# faceheight = 64
+#
+# [http://www.blog.com/rss]
+# face = foo.png
+# faceheight = 32
+#
+# The facewidth of the defined blog defaults to 64.
+
+[http://www.livejournal.com/users/keybuk/data/rss]
+name = Scott James Remnant
+
+[http://www.gnome.org/~jdub/blog/?flav=rss]
+name = Jeff Waugh
+keyword = free
+
+[http://usefulinc.com/edd/blog/rss91]
+name = Edd Dumbill
+
+[http://blog.clearairturbulence.org/?flav=rss]
+name = Thom May
+
+[http://codeblogs.ximian.com/blogs/evolution/index.rdf]
+name = Evolution
+
+[http://www.actsofvolition.com/rss]
+name = Steven Garrity
+keyword = gnome
Added: webservices/admin/planet/examples/foafroll.xml.tmpl
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/examples/foafroll.xml.tmpl?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/examples/foafroll.xml.tmpl (added)
+++ webservices/admin/planet/examples/foafroll.xml.tmpl Fri Dec 9 04:21:26 2005
@@ -0,0 +1,31 @@
+<?xml version="1.0"?>
+<rdf:RDF
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
+ xmlns:foaf="http://xmlns.com/foaf/0.1/"
+ xmlns:rss="http://purl.org/rss/1.0/"
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+>
+<foaf:Group>
+ <foaf:name><TMPL_VAR name></foaf:name>
+ <foaf:homepage><TMPL_VAR link ESCAPE="HTML"></foaf:homepage>
+ <rdfs:seeAlso rdf:resource="<TMPL_VAR uri ESCAPE="HTML">" />
+
+<TMPL_LOOP Channels>
+ <foaf:member>
+ <foaf:Agent>
+ <foaf:name><TMPL_VAR name></foaf:name>
+ <foaf:weblog>
+ <foaf:Document rdf:about="<TMPL_VAR link ESCAPE="HTML">">
+ <dc:title><TMPL_VAR title></dc:title>
+ <rdfs:seeAlso>
+ <rss:channel rdf:about="<TMPL_VAR uri ESCAPE="HTML">" />
+ </rdfs:seeAlso>
+ </foaf:Document>
+ </foaf:weblog>
+ </foaf:Agent>
+ </foaf:member>
+</TMPL_LOOP>
+
+</foaf:Group>
+</rdf:RDF>
Added: webservices/admin/planet/examples/index.html.tmpl
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/examples/index.html.tmpl?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/examples/index.html.tmpl (added)
+++ webservices/admin/planet/examples/index.html.tmpl Fri Dec 9 04:21:26 2005
@@ -0,0 +1,88 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+
+### Planet HTML template.
+###
+### This is intended to demonstrate and document Planet's templating
+### facilities, and at the same time provide a good base for you to
+### modify into your own design.
+###
+### The output's a bit boring though, if you're after less documentation
+### and more instant gratification, there's an example with a much
+### prettier output in the fancy-examples/ directory of the Planet source.
+
+### Lines like this are comments, and are automatically removed by the
+### templating engine before processing.
+
+
+### Planet makes a large number of variables available for your templates.
+### See INSTALL for the complete list. The raw value can be placed in your
+### output file using <TMPL_VAR varname>. We'll put the name of our
+### Planet in the page title and again in an h1.
+
+<head>
+<title><TMPL_VAR name></title>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<meta name="generator" content="<TMPL_VAR generator>">
+</head>
+
+<body>
+<h1><TMPL_VAR name></h1>
+
+### One of the two loops available is the Channels loop. This allows you
+### to easily create a list of subscriptions, which is exactly what we'll do
+### here.
+
+### Note that we can also expand variables inside HTML tags, but we need
+### to be cautious and HTML-escape any illegal characters using the form
+### <TMPL_VAR varname ESCAPE="HTML">
+
+<div style="float: right">
+<h2>Subscriptions</h2>
+<ul>
+<TMPL_LOOP Channels>
+<li><a href="<TMPL_VAR link ESCAPE="HTML">" title="<TMPL_VAR title ESCAPE="HTML">"><TMPL_VAR name></a> <a href="<TMPL_VAR uri ESCAPE="HTML">">(feed)</a></li>
+</TMPL_LOOP>
+</ul>
+</div>
+
+### The other loop is the Items loop, which will get iterated for each
+### news item.
+
+<TMPL_LOOP Items>
+
+### Visually distinguish articles from different days by checking for
+### the new_date flag. This demonstrates the <TMPL_IF varname> ... </TMPL_IF>
+### check.
+
+<TMPL_IF new_date>
+<h2><TMPL_VAR new_date></h2>
+</TMPL_IF>
+
+### Group consecutive articles by the same author together by checking
+### for the new_channel flag.
+
+<TMPL_IF new_channel>
+<h3><a href="<TMPL_VAR channel_link ESCAPE="HTML">" title="<TMPL_VAR channel_title ESCAPE="HTML">"><TMPL_VAR channel_name></a></h3>
+</TMPL_IF>
+
+
+<TMPL_IF title>
+<h4><a href="<TMPL_VAR link ESCAPE="HTML">"><TMPL_VAR title></a></h4>
+</TMPL_IF>
+<p>
+<TMPL_VAR content>
+</p>
+<p>
+<em><a href="<TMPL_VAR link ESCAPE="HTML">"><TMPL_IF creator>by <TMPL_VAR creator> at </TMPL_IF><TMPL_VAR date></a></em>
+</p>
+</TMPL_LOOP>
+
+<hr>
+<p>
+<a href="http://www.planetplanet.org/">Powered by Planet!</a><br>
+<em>Last updated: <TMPL_VAR date></em>
+</p>
+</body>
+
+</html>
Added: webservices/admin/planet/examples/opml.xml.tmpl
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/examples/opml.xml.tmpl?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/examples/opml.xml.tmpl (added)
+++ webservices/admin/planet/examples/opml.xml.tmpl Fri Dec 9 04:21:26 2005
@@ -0,0 +1,16 @@
+<?xml version="1.0"?>
+<opml version="1.1">
+ <head>
+ <title><TMPL_VAR name></title>
+ <dateCreated><TMPL_VAR date_822></dateCreated>
+ <dateModified><TMPL_VAR date_822></dateModified>
+ <ownerName><TMPL_VAR owner_name></ownerName>
+ <ownerEmail><TMPL_VAR owner_email></ownerEmail>
+ </head>
+
+ <body>
+ <TMPL_LOOP Channels>
+ <outline text="<TMPL_VAR name ESCAPE="HTML">" xmlUrl="<TMPL_VAR uri ESCAPE="HTML">"/>
+ </TMPL_LOOP>
+ </body>
+</opml>
Added: webservices/admin/planet/examples/rss10.xml.tmpl
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/examples/rss10.xml.tmpl?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/examples/rss10.xml.tmpl (added)
+++ webservices/admin/planet/examples/rss10.xml.tmpl Fri Dec 9 04:21:26 2005
@@ -0,0 +1,37 @@
+<?xml version="1.0"?>
+<rdf:RDF
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:foaf="http://xmlns.com/foaf/0.1/"
+ xmlns:content="http://purl.org/rss/1.0/modules/content/"
+ xmlns="http://purl.org/rss/1.0/"
+>
+<channel rdf:about="<TMPL_VAR link ESCAPE="HTML">">
+ <title><TMPL_VAR name></title>
+ <link><TMPL_VAR link ESCAPE="HTML"></link>
+ <description><TMPL_VAR name> - <TMPL_VAR link ESCAPE="HTML"></description>
+
+ <items>
+ <rdf:Seq>
+<TMPL_LOOP Items>
+ <rdf:li rdf:resource="<TMPL_VAR id ESCAPE="HTML">" />
+</TMPL_LOOP>
+ </rdf:Seq>
+ </items>
+</channel>
+
+<TMPL_LOOP Items>
+<item rdf:about="<TMPL_VAR id ESCAPE="HTML">">
+ <title><TMPL_VAR channel_name><TMPL_IF title>: <TMPL_VAR title></TMPL_IF></title>
+ <link><TMPL_VAR link ESCAPE="HTML"></link>
+ <TMPL_IF content>
+ <content:encoded><TMPL_VAR content ESCAPE="HTML"></content:encoded>
+ </TMPL_IF>
+ <dc:date><TMPL_VAR date_iso></dc:date>
+ <TMPL_IF creator>
+ <dc:creator><TMPL_VAR creator></dc:creator>
+ </TMPL_IF>
+</item>
+</TMPL_LOOP>
+
+</rdf:RDF>
Added: webservices/admin/planet/examples/rss20.xml.tmpl
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/examples/rss20.xml.tmpl?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/examples/rss20.xml.tmpl (added)
+++ webservices/admin/planet/examples/rss20.xml.tmpl Fri Dec 9 04:21:26 2005
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
+
+<channel>
+ <title><TMPL_VAR name></title>
+ <link><TMPL_VAR link ESCAPE="HTML"></link>
+ <language>en</language>
+ <description><TMPL_VAR name> - <TMPL_VAR link ESCAPE="HTML"></description>
+
+<TMPL_LOOP Items>
+<item>
+ <title><TMPL_VAR channel_name><TMPL_IF title>: <TMPL_VAR title></TMPL_IF></title>
+ <guid><TMPL_VAR id ESCAPE="HTML"></guid>
+ <link><TMPL_VAR link ESCAPE="HTML"></link>
+ <TMPL_IF content>
+ <description><TMPL_VAR content ESCAPE="HTML"></description>
+ </TMPL_IF>
+ <pubDate><TMPL_VAR date_822></pubDate>
+ <TMPL_IF creator>
+ <dc:creator><TMPL_VAR creator></dc:creator>
+ </TMPL_IF>
+</item>
+</TMPL_LOOP>
+
+</channel>
+</rss>
Added: webservices/admin/planet/fancy-examples/config.ini
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/fancy-examples/config.ini?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/fancy-examples/config.ini (added)
+++ webservices/admin/planet/fancy-examples/config.ini Fri Dec 9 04:21:26 2005
@@ -0,0 +1,96 @@
+# Planet configuration file
+#
+# This illustrates some of Planet's fancier features with example.
+
+# Every planet needs a [Planet] section
+[Planet]
+# name: Your planet's name
+# link: Link to the main page
+# owner_name: Your name
+# owner_email: Your e-mail address
+name = Planet Schmanet
+link = http://planet.schmanet.janet/
+owner_name = Janet Weiss
+owner_email = janet@slut.sex
+
+# cache_directory: Where cached feeds are stored
+# log_level: One of DEBUG, INFO, WARNING, ERROR or CRITICAL
+cache_directory = cache
+log_level = DEBUG
+
+# template_files: Space-separated list of output template files
+template_files = fancy-examples/index.html.tmpl examples/rss20.xml.tmpl examples/rss10.xml.tmpl examples/opml.xml.tmpl examples/foafroll.xml.tmpl
+
+# The following provide defaults for each template:
+# output_dir: Directory to place output files
+# items_per_page: How many items to put on each page
+# days_per_page: How many complete days of posts to put on each page
+# This is the absolute, hard limit (over the item limit)
+# date_format: strftime format for the default 'date' template variable
+output_dir = output
+items_per_page = 60
+days_per_page = 0
+date_format = %B %d, %Y %I:%M %p
+
+
+# To define a different value for a particular template you may create
+# a section with the same name as the template file's filename (as given
+# in template_files).
+
+# Provide no more than 7 days articles on the front page
+[fancy-examples/index.html.tmpl]
+items_per_page = 30
+
+
+# Options placed in the [DEFAULT] section provide defaults for the feed
+# sections. Placing a default here means you only need to override the
+# special cases later.
+[DEFAULT]
+# Hackergotchi default size.
+# If we want to put a face alongside a feed, and it's this size, we
+# can omit these variables.
+facewidth = 65
+faceheight = 85
+
+
+# Any other section defines a feed to subscribe to. The section title
+# (in the []s) is the URI of the feed itself. A section can also be
+# have any of the following options:
+#
+# name: Name of the feed (defaults to the title found in the feed)
+# offset: Number of hours (+ or -) the feed's times tend to be out
+#
+# Additionally any other option placed here will be available in
+# the template (prefixed with channel_ for the Items loop). We use
+# this trick to make the faces work -- this isn't something Planet
+# "natively" knows about. Look at fancy-examples/index.html.tmpl
+# for the flip-side of this.
+
+[http://www.livejournal.com/users/keybuk/data/rss]
+name = Scott James Remnant
+face = keybuk.png
+# pick up the default facewidth and faceheight
+
+[http://www.gnome.org/~jdub/blog/?flav=rss]
+name = Jeff Waugh
+face = jdub.png
+facewidth = 70
+faceheight = 74
+
+[http://usefulinc.com/edd/blog/rss91]
+name = Edd Dumbill
+face = edd.png
+facewidth = 62
+faceheight = 80
+
+[http://blog.clearairturbulence.org/?flav=rss]
+name = Thom May
+face = thom.png
+# pick up the default faceheight only
+facewidth = 59
+
+[http://codeblogs.ximian.com/blogs/evolution/index.rdf]
+name = Evolution
+face = evolution.png
+facewidth = 48
+faceheight = 48
Added: webservices/admin/planet/fancy-examples/index.html.tmpl
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/fancy-examples/index.html.tmpl?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/fancy-examples/index.html.tmpl (added)
+++ webservices/admin/planet/fancy-examples/index.html.tmpl Fri Dec 9 04:21:26 2005
@@ -0,0 +1,103 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+
+### Fancy Planet HTML template.
+###
+### When combined with the stylesheet and images in the output/ directory
+### of the Planet source, this gives you a much prettier result than the
+### default examples template and demonstrates how to use the config file
+### to support things like faces
+###
+### For documentation on the more boring template elements, see
+### examples/config.ini and examples/index.html.tmpl in the Planet source.
+
+<head>
+<title><TMPL_VAR name></title>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<link rel="stylesheet" href="planet.css" type="text/css">
+</head>
+
+<body>
+<h1><TMPL_VAR name></h1>
+
+<TMPL_LOOP Items>
+<TMPL_IF new_date>
+<h2><TMPL_VAR new_date></h2>
+</TMPL_IF>
+
+<TMPL_IF new_channel>
+
+### Planet provides template variables for *all* configuration options for
+### the channel (and defaults), even if it doesn't know about them. We
+### exploit this here to add hackergotchi faces to our channels. Planet
+### doesn't know about the "face", "facewidth" and "faceheight" configuration
+### variables, but makes them available to us anyway.
+
+<h3><a href="<TMPL_VAR channel_link ESCAPE="HTML">" title="<TMPL_VAR channel_title ESCAPE="HTML">"><TMPL_VAR channel_name></a></h3>
+<TMPL_IF channel_face>
+<img class="face" src="images/<TMPL_VAR channel_face ESCAPE="HTML">" width="<TMPL_VAR channel_facewidth ESCAPE="HTML">" height="<TMPL_VAR channel_faceheight ESCAPE="HTML">" alt="">
+</TMPL_IF>
+</TMPL_IF>
+
+
+<TMPL_IF title>
+<h4><a href="<TMPL_VAR link ESCAPE="HTML">"><TMPL_VAR title></a></h4>
+</TMPL_IF>
+<div class="entry">
+<p>
+<TMPL_VAR content>
+</p>
+<p class="date">
+<a href="<TMPL_VAR link ESCAPE="HTML">"><TMPL_IF creator>by <TMPL_VAR creator> at </TMPL_IF><TMPL_VAR date></a>
+</p>
+</div>
+</TMPL_LOOP>
+
+
+<div class="sidebar">
+<img src="images/logo.png" width="136" height="136" alt="">
+
+<h2>Subscriptions</h2>
+<ul>
+<TMPL_LOOP Channels>
+<li><a href="<TMPL_VAR link ESCAPE="HTML">" title="<TMPL_VAR title ESCAPE="HTML">"><TMPL_VAR name></a> <a href="<TMPL_VAR uri ESCAPE="HTML">">(feed)</a></li>
+</TMPL_LOOP>
+</ul>
+
+<p>
+<strong>Last updated:</strong><br>
+<TMPL_VAR date><br>
+<em>All times are UTC.</em><br>
+<br>
+Powered by:<br>
+<a href="http://www.planetplanet.org/"><img src="images/planet.png" width="80" height="15" alt="Planet" border="0"></a>
+</p>
+
+<p>
+<h2>Planetarium:</h2>
+<ul>
+<li><a href="http://www.planetapache.org/">Planet Apache</a></li>
+<li><a href="http://classpath.wildebeest.org/planet/">Planet Classpath</a></li>
+<li><a href="http://planet.debian.net/">Planet Debian</a></li>
+<li><a href="http://planet.debian.org.hk/">Planet Debian HK</a></li>
+<li><a href="http://planet.freedesktop.org/">Planet freedesktop.org</a></li>
+<li><a href="http://planet.humbug.org.au/">Planet Humbug</a></li>
+<li><a href="http://planet.gnome.org/">Planet GNOME</a></li>
+<li><a href="http://gnome.or.kr/pgk/">Planet GNOME Korea</a></li>
+<li><a href="http://planetjava.org/">Planet Java.org</a></li>
+<li><a href="http://myoss.bytebot.net/">Planet MYOSS</a></li>
+<li><a href="http://planet.perl.org/">Planet Perl</a></li>
+<li><a href="http://planet.slug.org.au/">Planet SLUG</a></li>
+<li><a href="http://planetsun.org/">Planet Sun</a></li>
+<li><a href="http://www.planetsuse.org/">Planet SuSE</a></li>
+<li><a href="http://planet.twistedmatrix.com/">Planet Twisted</a></li>
+<li><a href="http://planet.arslinux.com/">Ars Linux</a></li>
+<li><a href="http://fedora.linux.duke.edu/fedorapeople/">Fedora People</a></li>
+<li><a href="http://fossplanet.osdir.com/">FOSS Planet</a></li>
+<li><a href="http://live.linuxchix.org/">LinuxChix Live</a></li>
+</ul>
+</p>
+</div>
+</body>
+
+</html>
Added: webservices/admin/planet/feedparser.py
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/feedparser.py?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/feedparser.py (added)
+++ webservices/admin/planet/feedparser.py Fri Dec 9 04:21:26 2005
@@ -0,0 +1,1398 @@
+#!/usr/bin/python
+"""Ultra-liberal feed parser
+
+Visit http://diveintomark.org/projects/feed_parser/ for the latest version
+
+Handles RSS 0.9x, RSS 1.0, RSS 2.0, Atom feeds
+
+Things it handles that choke other parsers:
+- bastard combinations of RSS 0.9x and RSS 1.0
+- illegal 8-bit XML characters
+- naked and/or invalid HTML in description
+- content:encoded, xhtml:body, fullitem
+- guid
+- elements in non-standard namespaces or non-default namespaces
+- multiple content items per entry (Atom)
+- multiple links per entry (Atom)
+
+Other features:
+- resolves relative URIs in some elements
+ - uses xml:base to define base URI
+ - uses URI of feed if no xml:base is given
+ - to control which elements are resolved, set FeedParser.can_be_relative_uri
+- resolves relative URIs within embedded markup
+ - to control which elements are resolved, set FeedParser.can_contain_relative_uris
+- sanitizes embedded markup in some elements
+ - to allow/disallow HTML elements, set HTMLSanitizer.acceptable_elements
+ - to allow/disallow HTML attributes, set HTMLSanitizer.acceptable_attributes
+ - to control which feed elements are sanitized, set FeedParser.can_contain_dangerous_markup
+ - to disable entirely (NOT RECOMMENDED), set FeedParser.can_contain_dangerous_markup = []
+- tidies embedded markup
+ - fixes malformed HTML
+ - converts to XHTML
+ - converts character entities to numeric entities
+ - requires tidylib <http://utidylib.sourceforge.net/> or mxTidy <http://www.lemburg.com/files/python/mxTidy.html>
+
+Requires Python 2.1; 2.3 or later recommended
+"""
+
+__version__ = "2.7.6"
+__author__ = "Mark Pilgrim <http://diveintomark.org/>"
+__copyright__ = "Copyright 2002-4, Mark Pilgrim"
+__contributors__ = ["Jason Diamond <http://injektilo.org/>",
+ "John Beimler <http://john.beimler.org/>",
+ "Fazal Majid <http://www.majid.info/mylos/weblog/>"]
+__license__ = "Python"
+__history__ = """
+1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
+ added Simon Fell's test suite
+1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
+2.0 - 10/19/2002
+ JD - use inchannel to watch out for image and textinput elements which can
+ also contain title, link, and description elements
+ JD - check for isPermaLink="false" attribute on guid elements
+ JD - replaced openAnything with open_resource supporting ETag and
+ If-Modified-Since request headers
+ JD - parse now accepts etag, modified, agent, and referrer optional
+ arguments
+ JD - modified parse to return a dictionary instead of a tuple so that any
+ etag or modified information can be returned and cached by the caller
+2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
+ because of etag/modified, return the old etag/modified to the caller to
+ indicate why nothing is being returned
+2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise its
+ useless. Fixes the problem JD was addressing by adding it.
+2.1 - 11/14/2002 - MAP - added gzip support
+2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
+ start_admingeneratoragent is an example of how to handle elements with
+ only attributes, no content.
+2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
+ also, make sure we send the User-Agent even if urllib2 isn't available.
+ Match any variation of backend.userland.com/rss namespace.
+2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
+2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
+ snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
+ project name
+2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
+ removed unnecessary urllib code -- urllib2 should always be available anyway;
+ return actual url, status, and full HTTP headers (as result['url'],
+ result['status'], and result['headers']) if parsing a remote feed over HTTP --
+ this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
+ added the latest namespace-of-the-week for RSS 2.0
+2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
+ User-Agent (otherwise urllib2 sends two, which confuses some servers)
+2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
+ inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
+2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
+ textInput, and also to return the character encoding (if specified)
+2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
+ nested divs within content (JohnD); fixed missing sys import (JohanS);
+ fixed regular expression to capture XML character encoding (Andrei);
+ added support for Atom 0.3-style links; fixed bug with textInput tracking;
+ added support for cloud (MartijnP); added support for multiple
+ category/dc:subject (MartijnP); normalize content model: "description" gets
+ description (which can come from description, summary, or full content if no
+ description), "content" gets dict of base/language/type/value (which can come
+ from content:encoded, xhtml:body, content, or fullitem);
+ fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
+ tracking; fixed bug tracking unknown tags; fixed bug tracking content when
+ <content> element is not in default namespace (like Pocketsoap feed);
+ resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
+ wfw:commentRSS; resolve relative URLs within embedded HTML markup in
+ description, xhtml:body, content, content:encoded, title, subtitle,
+ summary, info, tagline, and copyright; added support for pingback and
+ trackback namespaces
+2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
+ namespaces, as opposed to 2.6 when I said I did but didn't really;
+ sanitize HTML markup within some elements; added mxTidy support (if
+ installed) to tidy HTML markup within some elements; fixed indentation
+ bug in parse_date (FazalM); use socket.setdefaulttimeout if available
+ (FazalM); universal date parsing and normalization (FazalM): 'created', modified',
+ 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
+ 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
+ and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
+2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory
+ leak not closing url opener (JohnD); added dc:publisher support (MarekK);
+ added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
+2.7.2 - 1/13/2004 - MAP - feeds that are not well-formed XML are not parsed, and
+ the 'bozo' bit is set to 1 in the result. See
+ http://tbray.org/ongoing/When/200x/2004/01/11/PostelPilgrim
+ for an explanation of why anyone who can't create well-formed XML is a bozo
+ and an incompetent fool. You can disable this check by passing
+ disableWellFormedCheck=1, but it will write arrogant messages to stderr.
+2.7.3 - 1/14/2004 - MAP - just kidding
+2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
+ encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
+ fixed relative URI processing for guid (skadz); added ICBM support; added
+ base64 support
+2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
+ blogspot.com sites); added _debug variable
+2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
+"""
+
+_debug = 0
+
+# if you are embedding feedparser in a larger application, you should change this to your application name and URL
+USER_AGENT = "UltraLiberalFeedParser/%s%s +http://diveintomark.org/projects/feed_parser/" % (__version__, _debug and "-debug" or "")
+
+# ---------- required modules (should come with any Python distribution) ----------
+import cgi, re, sgmllib, string, urllib2, sys, copy, urlparse, htmlentitydefs, time, rfc822
+
+# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
+
+try:
+ from cStringIO import StringIO
+except:
+ from StringIO import StringIO
+
+# gzip is included with most Python distributions, but may not be available if you compiled your own
+try:
+ import gzip
+except:
+ gzip = None
+
+# timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers.
+# Python 2.3 now has this functionality available in the standard socket library, so under
+# 2.3 you don't need to install anything.
+import socket
+if hasattr(socket, 'setdefaulttimeout'):
+ socket.setdefaulttimeout(10)
+else:
+ try:
+ import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
+ timeoutsocket.setDefaultSocketTimeout(10)
+ except ImportError:
+ pass
+
+# mxtidy allows feedparser to tidy malformed embedded HTML markup in description, content, etc.
+# this does not affect HTML sanitizing, which is self-contained in the HTMLSanitizer class
+try:
+ from mx.Tidy import Tidy as mxtidy # http://www.lemburg.com/files/python/mxTidy.html
+except:
+ mxtidy = None
+
+try:
+ from xml.sax import make_parser, handler, SAXParseException
+ from xml.sax.handler import ContentHandler
+ from xml.sax.xmlreader import InputSource
+
+ class WellFormedChecker(ContentHandler):
+ def __init__(self):
+ ContentHandler.__init__(self)
+ self.bozo = 0
+
+ def startElementNS(self, name, qname, attrs):
+ if _debug: sys.stderr.write("start %s %s\n" % (name, attrs))
+ pass
+
+ def resolveEntity(self, publicId, systemId):
+ return StringIO()
+
+ def characters(self, text):
+ if _debug: sys.stderr.write(text)
+ pass
+
+ def endElementNS(self, name, qname):
+ if _debug: sys.stderr.write("end %s\n" % (name,))
+ pass
+
+ def fatalError(self, exc):
+ if _debug: sys.stderr.write("%s\n" % exc)
+ self.bozo = 1
+
+ def isWellFormed(data):
+ if _debug: sys.stderr.write('entering isWellFormed\n')
+ source = InputSource()
+ source.setByteStream(StringIO(data))
+ validator = WellFormedChecker()
+ parser = make_parser()
+ parser.setFeature(handler.feature_namespaces, 1)
+ parser.setContentHandler(validator)
+ parser.setErrorHandler(validator)
+ parser.setEntityResolver(validator)
+ if hasattr(parser, '_ns_stack'):
+ # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
+ # PyXML doesn't have this problem, and it doesn't have _ns_stack either
+ parser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
+ parser.parse(source)
+ if _debug: sys.stderr.write('leaving isWellFormed\n')
+ return not validator.bozo
+
+ _XML_AVAILABLE = 1
+except:
+ _XML_AVAILABLE = 0
+
+# ---------- don't touch this ----------
+sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+
+if not dict:
+ def dict(aList):
+ rc = {}
+ for k, v in aList:
+ rc[k] = v
+ return rc
+
+class FeedParser(sgmllib.SGMLParser):
+ namespaces = {"http://backend.userland.com/rss": "",
+ "http://blogs.law.harvard.edu/tech/rss": "",
+ "http://purl.org/rss/1.0/": "",
+ "http://example.com/newformat#": "",
+ "http://example.com/necho": "",
+ "http://purl.org/echo/": "",
+ "uri/of/echo/namespace#": "",
+ "http://purl.org/pie/": "",
+ "http://purl.org/atom/ns#": "",
+ "http://purl.org/rss/1.0/modules/textinput/": "ti",
+ "http://purl.org/rss/1.0/modules/company/": "co",
+ "http://purl.org/rss/1.0/modules/syndication/": "sy",
+ "http://purl.org/dc/elements/1.1/": "dc",
+ "http://purl.org/dc/terms/": "dcterms",
+ "http://webns.net/mvcb/": "admin",
+ "http://wellformedweb.org/CommentAPI/": "wfw",
+ "http://madskills.com/public/xml/rss/module/trackback/": "trackback",
+ "http://madskills.com/public/xml/rss/module/pingback/": "pingback",
+ "http://postneo.com/icbm/": "icbm",
+ "http://www.w3.org/1999/xhtml": "xhtml"}
+
+ can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentRSS', 'docs', 'url', 'comments']
+ can_contain_relative_uris = ['content', 'body', 'xhtml_body', 'content_encoded', 'fullitem', 'description', 'title', 'summary', 'subtitle', 'info', 'tagline', 'copyright']
+ can_contain_dangerous_markup = ['content', 'body', 'xhtml_body', 'content_encoded', 'fullitem', 'description', 'title', 'summary', 'subtitle', 'info', 'tagline', 'copyright']
+ explicitly_set_type = ['title', 'tagline', 'summary', 'info', 'copyright', 'content']
+ html_types = ['text/html', 'application/xhtml+xml']
+
+ def __init__(self, baseuri=None):
+ if _debug: sys.stderr.write("initializing FeedParser\n")
+ sgmllib.SGMLParser.__init__(self)
+ self.baseuri = baseuri or ''
+
+ def reset(self):
+ self.channel = {}
+ self.items = []
+ self.elementstack = []
+ self.inchannel = 0
+ self.initem = 0
+ self.incontent = 0
+ self.intextinput = 0
+ self.inimage = 0
+ self.contentparams = {}
+ self.namespacemap = {}
+ self.basestack = []
+ self.langstack = []
+ self.baseuri = ''
+ self.lang = None
+ sgmllib.SGMLParser.reset(self)
+
+ def unknown_starttag(self, tag, attrs):
+ if _debug: sys.stderr.write("start %s with %s\n" % (tag, attrs))
+
+ # normalize attrs
+ attrs = [(k.lower(), sgmllib.charref.sub(lambda m: chr(int(m.groups()[0])), v).strip()) for k, v in attrs]
+ attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
+
+ # track inline content
+ if self.incontent and self.contentparams.get('mode') == 'xml':
+ return self.handle_data("<%s%s>" % (tag, "".join([' %s="%s"' % t for t in attrs])))
+
+ # track xml:base and xml:lang
+ attrsD = dict(attrs)
+ baseuri = attrsD.get('xml:base')
+ if baseuri:
+ self.baseuri = baseuri
+ lang = attrsD.get('xml:lang')
+ if lang:
+ self.lang = lang
+ self.basestack.append(baseuri)
+ self.langstack.append(lang)
+
+ # track namespaces
+ for prefix, value in attrs:
+ if not prefix.startswith("xmlns:"): continue
+ prefix = prefix[6:]
+ if value.find('backend.userland.com/rss') <> -1:
+ # match any backend.userland.com namespace
+ value = 'http://backend.userland.com/rss'
+ if self.namespaces.has_key(value):
+ self.namespacemap[prefix] = self.namespaces[value]
+
+ # match namespaces
+ colonpos = tag.find(':')
+ if colonpos <> -1:
+ prefix = tag[:colonpos]
+ suffix = tag[colonpos+1:]
+ prefix = self.namespacemap.get(prefix, prefix)
+ if prefix:
+ prefix = prefix + '_'
+ else:
+ prefix = ''
+ suffix = tag
+
+ # call special handler (if defined) or default handler
+ methodname = '_start_' + prefix + suffix
+ try:
+ method = getattr(self, methodname)
+ return method(attrs)
+ except AttributeError:
+ return self.push(prefix + suffix, 1)
+
+ def unknown_endtag(self, tag):
+ # track inline content
+ if self.incontent and self.contentparams.get('mode') == 'xml':
+ self.handle_data("</%s>" % tag)
+
+ # match namespaces
+ colonpos = tag.find(':')
+ if colonpos <> -1:
+ prefix = tag[:colonpos]
+ suffix = tag[colonpos+1:]
+ prefix = self.namespacemap.get(prefix, prefix)
+ if prefix:
+ prefix = prefix + '_'
+ else:
+ prefix = ''
+ suffix = tag
+
+ # call special handler (if defined) or default handler
+ methodname = '_end_' + prefix + suffix
+ try:
+ method = getattr(self, methodname)
+ method()
+ except AttributeError:
+ self.pop(prefix + suffix)
+
+ # track xml:base and xml:lang going out of scope
+ if self.basestack:
+ baseuri = self.basestack.pop()
+ if baseuri:
+ self.baseuri = baseuri
+ if self.langstack:
+ lang = self.langstack.pop()
+ if lang:
+ self.lang = lang
+
+ def handle_charref(self, ref):
+ # called for each character reference, e.g. for " ", ref will be "160"
+ # Reconstruct the original character reference.
+ if not self.elementstack: return
+ text = "&#%s;" % ref
+ if self.incontent and self.contentparams.get('mode') == 'xml':
+ text = cgi.escape(text)
+ self.elementstack[-1][2].append(text)
+
+ def handle_entityref(self, ref):
+ # called for each entity reference, e.g. for "©", ref will be "copy"
+ # Reconstruct the original entity reference.
+ if not self.elementstack: return
+ text = "&%s;" % ref
+ if self.incontent and self.contentparams.get('mode') == 'xml':
+ text = cgi.escape(text)
+ self.elementstack[-1][2].append(text)
+
+ def handle_data(self, text):
+ # called for each block of plain text, i.e. outside of any tag and
+ # not containing any character or entity references
+ if not self.elementstack: return
+ if self.incontent and self.contentparams.get('mode') == 'xml':
+ text = cgi.escape(text)
+ self.elementstack[-1][2].append(text)
+
+ def handle_comment(self, text):
+ # called for each comment, e.g. <!-- insert message here -->
+ pass
+
+ def handle_pi(self, text):
+ # called for each processing instruction, e.g. <?instruction>
+ pass
+
+ def handle_decl(self, text):
+ # called for the DOCTYPE, if present, e.g.
+ # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+ # "http://www.w3.org/TR/html4/loose.dtd">
+ pass
+
+ _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
+ def _scan_name(self, i, declstartpos):
+ rawdata = self.rawdata
+ n = len(rawdata)
+ if i == n:
+ return None, -1
+ m = self._new_declname_match(rawdata, i)
+ if m:
+ s = m.group()
+ name = s.strip()
+ if (i + len(s)) == n:
+ return None, -1 # end of buffer
+ return string.lower(name), m.end()
+ else:
+ self.updatepos(declstartpos, i)
+ self.error("expected name token")
+
+ def parse_declaration(self, i):
+ # override internal declaration handler to handle CDATA blocks
+ if _debug: sys.stderr.write("entering parse_declaration\n")
+ if self.rawdata[i:i+9] == '<![CDATA[':
+ k = self.rawdata.find(']]>', i)
+ if k == -1: k = len(self.rawdata)
+ self.handle_data(cgi.escape(self.rawdata[i+9:k]))
+ return k+3
+ else:
+ k = self.rawdata.find('>', i)
+ return k+1
+# if _debug: sys.stderr.write("entering super::parse_declaration\n")
+# return sgmllib.SGMLParser.parse_declaration(self, i)
+
+ def error(self, message):
+ if _debug: sys.stderr.write(message)
+ pass
+
+ def resolveURI(self, uri):
+ return urlparse.urljoin(self.baseuri or '', uri)
+
+ def push(self, element, expectingText):
+ self.elementstack.append([element, expectingText, []])
+
+ def pop(self, element):
+ if not self.elementstack: return
+ if self.elementstack[-1][0] != element: return
+
+ element, expectingText, pieces = self.elementstack.pop()
+ if not expectingText: return
+
+ output = "".join(pieces)
+ output = output.strip()
+
+ # resolve relative URIs
+ if (element in self.can_be_relative_uri) and output:
+ output = self.resolveURI(output)
+
+ # decode entities within embedded markup
+ output = output or ''
+ if (element in self.explicitly_set_type and self.contentparams.get('type') in self.html_types) or \
+ (element not in self.explicitly_set_type):
+ output = output.replace('<', '<')
+ output = output.replace('>', '>')
+ output = output.replace('&', '&')
+ output = output.replace('"', '"')
+ output = output.replace(''', "'")
+ output = re.sub(r'(\S)/>', r'\1 />', output)
+
+ # resolve relative URIs within embedded markup
+ if element in self.can_contain_relative_uris:
+ output = resolveRelativeURIs(output, self.baseuri)
+
+ # sanitize embedded markup
+ if element in self.can_contain_dangerous_markup:
+ output = sanitizeHTML(output)
+
+ # decode base64 content
+ if self.contentparams.get('mode') == 'base64':
+ import base64
+ output = base64.decodestring(output)
+
+ # store output in appropriate place(s)
+ if self.incontent and self.initem:
+ if not self.items[-1].has_key(element):
+ self.items[-1][element] = []
+ contentparams = copy.deepcopy(self.contentparams)
+ contentparams['value'] = output
+ self.items[-1][element].append(contentparams)
+ elif self.initem:
+ if element == 'category':
+ domain = self.items[-1]['categories'][-1][0]
+ self.items[-1]['categories'][-1] = (domain, output)
+ elif element == 'link':
+ if output:
+ self.items[-1]['links'][-1]['href'] = output
+ self.items[-1][element] = output
+ elif self.inchannel and (not self.intextinput) and (not self.inimage):
+ if element == 'category':
+ domain = self.channel['categories'][-1][0]
+ self.channel['categories'][-1] = (domain, output)
+ elif element == 'link':
+ self.channel['links']['href'] = output
+ self.channel[element] = output
+
+ return output
+
+ def _mapToStandardPrefix(self, name):
+ colonpos = name.find(':')
+ if colonpos <> -1:
+ prefix = name[:colonpos]
+ suffix = name[colonpos+1:]
+ prefix = self.namespacemap.get(prefix, prefix)
+ name = prefix + ':' + suffix
+ return name
+
+ def _getAttribute(self, attrs, name):
+ return dict(attrs).get(self._mapToStandardPrefix(name))
+
+ def _save(self, key, value):
+ if value:
+ if self.initem:
+ self.items[-1].setdefault(key, value)
+ elif self.channel:
+ self.channel.setdefault(key, value)
+
+ def _start_channel(self, attrs):
+ self.inchannel = 1
+ _start_feed = _start_channel
+
+ def _end_channel(self):
+ self.inchannel = 0
+ _end_feed = _end_channel
+
+ def _start_image(self, attrs):
+ self.inimage = 1
+
+ def _end_image(self):
+ self.inimage = 0
+
+ def _start_textinput(self, attrs):
+ self.intextinput = 1
+ _start_textInput = _start_textinput
+
+ def _end_textinput(self):
+ self.intextinput = 0
+ _end_textInput = _end_textinput
+
+ def _start_tagline(self, attrs):
+ self.push('tagline', 1)
+
+ def _end_tagline(self):
+ value = self.pop('tagline')
+ if self.inchannel:
+ self.channel['description'] = value
+
+ def _start_item(self, attrs):
+ self.items.append({})
+ self.push('item', 0)
+ self.initem = 1
+ _start_entry = _start_item
+
+ def _end_item(self):
+ self.pop('item')
+ self.initem = 0
+ _end_entry = _end_item
+
+ def _start_dc_language(self, attrs):
+ self.push('language', 1)
+ _start_language = _start_dc_language
+
+ def _end_dc_language(self):
+ self.pop('language')
+ _end_language = _end_dc_language
+
+ def _start_dc_creator(self, attrs):
+ self.push('creator', 1)
+ _start_managingeditor = _start_dc_creator
+ _start_webmaster = _start_dc_creator
+ _start_name = _start_dc_creator
+
+ def _end_dc_creator(self):
+ self.pop('creator')
+ _end_managingeditor = _end_dc_creator
+ _end_webmaster = _end_dc_creator
+ _end_name = _end_dc_creator
+
+ def _start_dc_author(self, attrs):
+ self.push('author', 1)
+ _start_author = _start_dc_author
+
+ def _end_dc_author(self):
+ self.pop('author')
+ _end_author = _end_dc_author
+
+ def _start_dc_publisher(self, attrs):
+ self.push('publisher', 1)
+
+ def _end_dc_publisher(self):
+ self.pop('publisher')
+
+ def _start_dc_rights(self, attrs):
+ self.push('rights', 1)
+ _start_copyright = _start_dc_rights
+
+ def _end_dc_rights(self):
+ self.pop('rights')
+ _end_copyright = _end_dc_rights
+
+ def _start_dcterms_issued(self, attrs):
+ self.push('issued', 1)
+ _start_issued = _start_dcterms_issued
+
+ def _end_dcterms_issued(self):
+ value = self.pop('issued')
+ self._save('issued_parsed', parse_date(value))
+ _end_issued = _end_dcterms_issued
+
+ def _start_dcterms_created(self, attrs):
+ self.push('created', 1)
+ _start_created = _start_dcterms_created
+
+ def _end_dcterms_created(self):
+ value = self.pop('created')
+ self._save('created_parsed', parse_date(value))
+ _end_created = _end_dcterms_created
+
+ def _start_dcterms_modified(self, attrs):
+ self.push('modified', 1)
+ _start_modified = _start_dcterms_modified
+ _start_dc_date = _start_dcterms_modified
+ _start_pubdate = _start_dcterms_modified
+
+ def _end_dcterms_modified(self):
+ value = self.pop('modified')
+ parsed_value = parse_date(value)
+ self._save('date', value)
+ self._save('date_parsed', parsed_value)
+ self._save('modified_parsed', parsed_value)
+ _end_modified = _end_dcterms_modified
+ _end_dc_date = _end_dcterms_modified
+ _end_pubdate = _end_dcterms_modified
+
+ def _start_category(self, attrs):
+ self.push('category', 1)
+ domain = self._getAttribute(attrs, 'domain')
+ cats = []
+ if self.initem:
+ cats = self.items[-1].setdefault('categories', [])
+ elif self.inchannel:
+ cats = self.channel.setdefault('categories', [])
+ cats.append((domain, None))
+ _start_dc_subject = _start_category
+
+ def _end_category(self):
+ self.pop('category')
+ _end_dc_subject = _end_category
+
+ def _start_link(self, attrs):
+ attrsD = dict(attrs)
+ attrsD.setdefault('rel', 'alternate')
+ attrsD.setdefault('type', 'text/html')
+ if attrsD.has_key('href'):
+ attrsD['href'] = self.resolveURI(attrsD['href'])
+ expectingText = self.inchannel or self.initem
+ if self.initem:
+ self.items[-1].setdefault('links', [])
+ self.items[-1]['links'].append(attrsD)
+ elif self.inchannel:
+ self.channel['links'] = attrsD
+ if attrsD.has_key('href'):
+ expectingText = 0
+ if attrsD.get('type', '') in self.html_types:
+ if self.initem:
+ self.items[-1]['link'] = attrsD['href']
+ elif self.inchannel:
+ self.channel['link'] = attrsD['href']
+ else:
+ self.push('link', expectingText)
+
+ def _start_guid(self, attrs):
+ self.guidislink = ('ispermalink', 'false') not in attrs
+ self.push('guid', 1)
+
+ def _end_guid(self):
+ value = self.pop('guid')
+ self._save('id', value)
+ if self.guidislink:
+ # guid acts as link, but only if "ispermalink" is not present or is "true",
+ # and only if the item doesn't already have a link element
+ self._save('link', value)
+
+ def _start_id(self, attrs):
+ self.push('id', 1)
+
+ def _end_id(self):
+ value = self.pop('id')
+ self._save('guid', value)
+
+ def _start_title(self, attrs):
+ self.push('title', self.inchannel or self.initem)
+ _start_dc_title = _start_title
+
+ def _end_title(self):
+ self.pop('title')
+ _end_dc_title = _end_title
+
+ def _start_description(self, attrs):
+ self.push('description', self.inchannel or self.initem)
+
+ def _end_description(self):
+ value = self.pop('description')
+ if self.initem:
+ self.items[-1]['summary'] = value
+ elif self.inchannel:
+ self.channel['tagline'] = value
+
+ def _start_admin_generatoragent(self, attrs):
+ self.push('generator', 1)
+ value = self._getAttribute(attrs, 'rdf:resource')
+ if value:
+ self.elementstack[-1][2].append(value)
+ self.pop('generator')
+
+ def _start_admin_errorreportsto(self, attrs):
+ self.push('errorreportsto', 1)
+ value = self._getAttribute(attrs, 'rdf:resource')
+ if value:
+ self.elementstack[-1][2].append(value)
+ self.pop('errorreportsto')
+
+ def _start_summary(self, attrs):
+ self.push('summary', 1)
+
+ def _end_summary(self):
+ value = self.pop('summary')
+ if self.items:
+ self.items[-1]['description'] = value
+
+ def _start_content(self, attrs):
+ attrsD = dict(attrs)
+ self.incontent += 1
+ self.contentparams = {'mode': attrsD.get('mode', 'xml'),
+ 'type': attrsD.get('type', 'text/plain'),
+ 'language': attrsD.get('xml:lang', None),
+ 'base': attrsD.get('xml:base', self.baseuri)}
+ self.push('content', 1)
+
+ def _start_body(self, attrs):
+ attrsD = dict(attrs)
+ self.incontent += 1
+ self.contentparams = {'mode': 'xml',
+ 'type': 'application/xhtml+xml',
+ 'language': attrsD.get('xml:lang', None),
+ 'base': attrsD.get('xml:base', self.baseuri)}
+ self.push('content', 1)
+ _start_xhtml_body = _start_body
+
+ def _start_content_encoded(self, attrs):
+ attrsD = dict(attrs)
+ self.incontent += 1
+ self.contentparams = {'mode': 'escaped',
+ 'type': 'text/html',
+ 'language': attrsD.get('xml:lang', None),
+ 'base': attrsD.get('xml:base', self.baseuri)}
+ self.push('content', 1)
+ _start_fullitem = _start_content_encoded
+
+ def _end_content(self):
+ value = self.pop('content')
+ if self.contentparams.get('type') in (['text/plain'] + self.html_types):
+ self._save('description', value)
+ self.incontent -= 1
+ self.contentparams.clear()
+ _end_body = _end_content
+ _end_xhtml_body = _end_content
+ _end_content_encoded = _end_content
+ _end_fullitem = _end_content
+
+class BaseHTMLProcessor(sgmllib.SGMLParser):
+ def __init__(self):
+ sgmllib.SGMLParser.__init__(self)
+
+ def reset(self):
+ # extend (called by sgmllib.SGMLParser.__init__)
+ self.pieces = []
+ sgmllib.SGMLParser.reset(self)
+
+ def normalize_attrs(self, attrs):
+ # utility method to be called by descendants
+ attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs]
+ attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
+ return attrs
+
+ def unknown_starttag(self, tag, attrs):
+ # called for each start tag
+ # attrs is a list of (attr, value) tuples
+ # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
+ strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
+ self.pieces.append("<%(tag)s%(strattrs)s>" % locals())
+
+ def unknown_endtag(self, tag):
+ # called for each end tag, e.g. for </pre>, tag will be "pre"
+ # Reconstruct the original end tag.
+ self.pieces.append("</%(tag)s>" % locals())
+
+ def handle_charref(self, ref):
+ # called for each character reference, e.g. for " ", ref will be "160"
+ # Reconstruct the original character reference.
+ self.pieces.append("&#%(ref)s;" % locals())
+
+ def handle_entityref(self, ref):
+ # called for each entity reference, e.g. for "©", ref will be "copy"
+ # Reconstruct the original entity reference.
+ self.pieces.append("&%(ref)s" % locals())
+ # standard HTML entities are closed with a semicolon; other entities are not
+ if htmlentitydefs.entitydefs.has_key(ref):
+ self.pieces.append(";")
+
+ def handle_data(self, text):
+ # called for each block of plain text, i.e. outside of any tag and
+ # not containing any character or entity references
+ # Store the original text verbatim.
+ self.pieces.append(text)
+
+ def handle_comment(self, text):
+ # called for each HTML comment, e.g. <!-- insert Javascript code here -->
+ # Reconstruct the original comment.
+ self.pieces.append("<!--%(text)s-->" % locals())
+
+ def handle_pi(self, text):
+ # called for each processing instruction, e.g. <?instruction>
+ # Reconstruct original processing instruction.
+ self.pieces.append("<?%(text)s>" % locals())
+
+ def handle_decl(self, text):
+ # called for the DOCTYPE, if present, e.g.
+ # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+ # "http://www.w3.org/TR/html4/loose.dtd">
+ # Reconstruct original DOCTYPE
+ self.pieces.append("<!%(text)s>" % locals())
+
+ def output(self):
+ """Return processed HTML as a single string"""
+ return "".join(self.pieces)
+
+class RelativeURIResolver(BaseHTMLProcessor):
+ relative_uris = [('a', 'href'),
+ ('applet', 'codebase'),
+ ('area', 'href'),
+ ('blockquote', 'cite'),
+ ('body', 'background'),
+ ('del', 'cite'),
+ ('form', 'action'),
+ ('frame', 'longdesc'),
+ ('frame', 'src'),
+ ('iframe', 'longdesc'),
+ ('iframe', 'src'),
+ ('head', 'profile'),
+ ('img', 'longdesc'),
+ ('img', 'src'),
+ ('img', 'usemap'),
+ ('input', 'src'),
+ ('input', 'usemap'),
+ ('ins', 'cite'),
+ ('link', 'href'),
+ ('object', 'classid'),
+ ('object', 'codebase'),
+ ('object', 'data'),
+ ('object', 'usemap'),
+ ('q', 'cite'),
+ ('script', 'src')]
+
+ def __init__(self, baseuri):
+ BaseHTMLProcessor.__init__(self)
+ self.baseuri = baseuri
+
+ def resolveURI(self, uri):
+ return urlparse.urljoin(self.baseuri, uri)
+
+ def unknown_starttag(self, tag, attrs):
+ attrs = self.normalize_attrs(attrs)
+ attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
+ BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
+
+def resolveRelativeURIs(htmlSource, baseURI):
+ p = RelativeURIResolver(baseURI)
+ p.feed(htmlSource)
+ data = p.output()
+ return data
+
+class HTMLSanitizer(BaseHTMLProcessor):
+ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
+ 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
+ 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
+ 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
+ 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
+ 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
+ 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
+ 'thead', 'tr', 'tt', 'u', 'ul', 'var']
+
+ acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
+ 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
+ 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
+ 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
+ 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
+ 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
+ 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
+ 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
+ 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
+ 'usemap', 'valign', 'value', 'vspace', 'width']
+
+ def unknown_starttag(self, tag, attrs):
+ if ":" in tag: prefix, tag = tag.split(":", 1)
+ if not tag in self.acceptable_elements: return
+ attrs = self.normalize_attrs(attrs)
+ attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
+ BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
+
+ def unknown_endtag(self, tag):
+ if ":" in tag: prefix, tag = tag.split(":", 1)
+ if not tag in self.acceptable_elements: return
+ BaseHTMLProcessor.unknown_endtag(self, tag)
+
+ def handle_pi(self, text):
+ pass
+
+ def handle_decl(self, text):
+ pass
+
+def sanitizeHTML(htmlSource):
+ p = HTMLSanitizer()
+ p.feed(htmlSource)
+ data = p.output()
+ if mxtidy:
+ nerrors, nwarnings, data, errordata = mxtidy.tidy(data, output_xhtml=1, numeric_entities=1, wrap=0)
+ if data.count('<body'):
+ data = data.split('<body', 1)[1]
+ if data.count('>'):
+ data = data.split('>', 1)[1]
+ if data.count('</body'):
+ data = data.split('</body', 1)[0]
+ data = data.strip()
+ return data
+
+class FeedURLHandler(urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
+ def http_error_default(self, req, fp, code, msg, headers):
+ if ((code / 100) == 3) and (code != 304):
+ return self.http_error_302(req, fp, code, msg, headers)
+ from urllib import addinfourl
+ infourl = addinfourl(fp, headers, req.get_full_url())
+ infourl.status = code
+ return infourl
+# raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
+
+ def http_error_302(self, req, fp, code, msg, headers):
+ infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
+ infourl.status = code
+ return infourl
+
+ def http_error_301(self, req, fp, code, msg, headers):
+ infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
+ infourl.status = code
+ return infourl
+
+ http_error_300 = http_error_302
+ http_error_307 = http_error_302
+
+def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
+ """
+ URI, filename, or string --> stream
+
+ This function lets you define parsers that take any input source
+ (URL, pathname to local or network file, or actual data as a string)
+ and deal with it in a uniform manner. Returned object is guaranteed
+ to have all the basic stdio read methods (read, readline, readlines).
+ Just .close() the object when you're done with it.
+
+ If the etag argument is supplied, it will be used as the value of an
+ If-None-Match request header.
+
+ If the modified argument is supplied, it must be a tuple of 9 integers
+ as returned by gmtime() in the standard Python time module. This MUST
+ be in GMT (Greenwich Mean Time). The formatted date/time will be used
+ as the value of an If-Modified-Since request header.
+
+ If the agent argument is supplied, it will be used as the value of a
+ User-Agent request header.
+
+ If the referrer argument is supplied, it will be used as the value of a
+ Referer[sic] request header.
+ """
+
+ if hasattr(source, "read"):
+ return source
+
+ if source == "-":
+ return sys.stdin
+
+ if not agent:
+ agent = USER_AGENT
+
+ # try to open with urllib2 (to use optional headers)
+ request = urllib2.Request(source)
+ if etag:
+ request.add_header("If-None-Match", etag)
+ if modified:
+ request.add_header("If-Modified-Since", format_http_date(modified))
+ request.add_header("User-Agent", agent)
+ if referrer:
+ request.add_header("Referer", referrer)
+ if gzip:
+ request.add_header("Accept-encoding", "gzip")
+ opener = urllib2.build_opener(FeedURLHandler())
+ opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
+ try:
+ try:
+ return opener.open(request)
+ except:
+ # source is not a valid URL, but it might be a valid filename
+ pass
+ finally:
+ opener.close() # JohnD
+
+ # try to open with native open function (if source is a filename)
+ try:
+ return open(source)
+ except:
+ pass
+
+ # treat source as string
+ return StringIO(str(source))
+
+def get_etag(resource):
+ """
+ Get the ETag associated with a response returned from a call to
+ open_resource().
+
+ If the resource was not returned from an HTTP server or the server did
+ not specify an ETag for the resource, this will return None.
+ """
+
+ if hasattr(resource, "info"):
+ return resource.info().getheader("ETag")
+ return None
+
+def get_modified(resource):
+ """
+ Get the Last-Modified timestamp for a response returned from a call to
+ open_resource().
+
+ If the resource was not returned from an HTTP server or the server did
+ not specify a Last-Modified timestamp, this function will return None.
+ Otherwise, it returns a tuple of 9 integers as returned by gmtime() in
+ the standard Python time module().
+ """
+
+ if hasattr(resource, "info"):
+ last_modified = resource.info().getheader("Last-Modified")
+ if last_modified:
+ return parse_date(last_modified)
+ return None
+
+short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
+long_weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
+months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
+
+def format_http_date(date):
+ """
+ Formats a tuple of 9 integers into an RFC 1123-compliant timestamp as
+ required in RFC 2616. We don't use time.strftime() since the %a and %b
+ directives can be affected by the current locale (HTTP dates have to be
+ in English). The date MUST be in GMT (Greenwich Mean Time).
+ """
+
+ return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (short_weekdays[date[6]], date[2], months[date[1] - 1], date[0], date[3], date[4], date[5])
+
+# if possible, use the PyXML module xml.utils.iso8601 to parse dates
+try:
+ from xml.utils.iso8601 import parse as iso8601_parse
+except ImportError:
+ iso8601_parse = None
+
+# the ISO 8601 standard is very convoluted and irregular - a full ISO 8601
+# parser is beyond the scope of feedparser and would be a worthwhile addition
+# to the Python library
+# A single regular expression cannot parse ISO 8601 date formats into groups
+# as the standard is highly irregular (for instance is 030104 2003-01-04 or
+# 0301-04-01), so we use templates instead
+# Please note the order in templates is significant because we need a
+# greedy match
+iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
+ 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
+ '-YY-?MM', '-OOO', '-YY',
+ '--MM-?DD', '--MM',
+ '---DD',
+ 'CC', '']
+iso8601_re = [
+ tmpl.replace(
+ 'YYYY', r'(?P<year>\d{4})').replace(
+ 'YY', r'(?P<year>\d\d)').replace(
+ 'MM', r'(?P<month>[01]\d)').replace(
+ 'DD', r'(?P<day>[0123]\d)').replace(
+ 'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
+ 'CC', r'(?P<century>\d\d$)')
+ + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
+ + r'(:(?P<second>\d{2}))?'
+ + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
+ for tmpl in iso8601_tmpl]
+
+iso8601_matches = [re.compile(regex).match for regex in iso8601_re]
+
+def parse_date(date):
+ """
+ Parses a variety of date formats into a tuple of 9 integers as
+ returned by time.gmtime(). This should not use time.strptime() since
+ that function is not available on all platforms and could also be
+ affected by the current locale.
+ """
+
+ date = str(date)
+
+ try:
+ # if at all possible, use the standard library's rfc822 module's
+ # (RFC2822, actually, which also encompasses RFC1123)
+ # parsedate function instead of rolling our own
+ # rfc822.parsedate is quite robust, and handles asctime-style dates
+ # as well
+ tm = rfc822.parsedate_tz(date)
+ if tm:
+ return time.gmtime(rfc822.mktime_tz(tm))
+ # not a RFC2822 date, try ISO 8601 format instead
+ try:
+ if iso8601_parse:
+ tm = iso8601_parse(date)
+ except ValueError:
+ tm = None
+ if tm:
+ return time.gmtime(tm)
+ # unfortunately, xml.utils.iso8601 does not recognize many valid
+ # ISO8601 formats like 20040105, so we try our home-made
+ # regular expressions instead
+ for iso8601_match in iso8601_matches:
+ m = iso8601_match(date)
+ if m:
+ break
+ if not m:
+ return None
+ # catch truly malformed strings
+ if m.span() == (0, 0):
+ return None
+ params = m.groupdict()
+ ordinal = params.get("ordinal", 0)
+ if ordinal:
+ ordinal = int(ordinal)
+ else:
+ ordinal = 0
+ year = params.get("year", "--")
+ if not year or year == "--":
+ year = time.gmtime()[0]
+ elif len(year) == 2:
+ # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
+ year = 100 * (time.gmtime()[0] // 100) + int(year)
+ else:
+ year = int(year)
+ month = params.get("month", "-")
+ if not month or month == "-":
+ # ordinals are NOT normalized by mktime, we simulate them
+ # by setting month=1, day=ordinal
+ if ordinal:
+ month = 1
+ else:
+ month = time.gmtime()[1]
+ month = int(month)
+ day = params.get("day", 0)
+ if not day:
+ # see above
+ if ordinal:
+ day = ordinal
+ elif params.get("century", 0) or \
+ params.get("year", 0) or params.get("month", 0):
+ day = 1
+ else:
+ day = time.gmtime()[2]
+ else:
+ day = int(day)
+ # special case of the century - is the first year of the 21st century
+ # 2000 or 2001 ? The debate goes on...
+ if "century" in params:
+ year = (int(params["century"]) - 1) * 100 + 1
+ # in ISO 8601 most fields are optional
+ for field in ["hour", "minute", "second", "tzhour", "tzmin"]:
+ if not params.get(field, None):
+ params[field] = 0
+ hour = int(params.get("hour", 0))
+ minute = int(params.get("minute", 0))
+ second = int(params.get("second", 0))
+ # weekday is normalized by mktime(), we can ignore it
+ weekday = 0
+ # daylight savings is complex, but not needed for feedparser's purposes
+ # as time zones, if specified, include mention of whether it is active
+ # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
+ # and most implementations have DST bugs
+ daylight_savings_flag = 0
+ tm = [year, month, day, hour, minute, second, weekday,
+ ordinal, daylight_savings_flag]
+ # ISO 8601 time zone adjustments
+ tz = params.get("tz")
+ if tz and tz != "Z":
+ if tz[0] == "-":
+ tm[3] += int(params.get("tzhour", 0))
+ tm[4] += int(params.get("tzmin", 0))
+ elif tz[0] == "+":
+ tm[3] -= int(params.get("tzhour", 0))
+ tm[4] -= int(params.get("tzmin", 0))
+ else:
+ return None
+ # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
+ # which is guaranteed to normalize d/m/y/h/m/s
+ # many implementations have bugs, however
+ return time.localtime(time.mktime(tm))
+ except:
+ return None
+
+def parse(uri, etag=None, modified=None, agent=None, referrer=None):
+ result = {}
+ f = open_resource(uri, etag=etag, modified=modified, agent=agent, referrer=referrer)
+ data = f.read()
+ if hasattr(f, "headers"):
+ if gzip and f.headers.get('content-encoding', '') == 'gzip':
+ try:
+ data = gzip.GzipFile(fileobj=StringIO(data)).read()
+ except:
+ # some feeds claim to be gzipped but they're not, so we get garbage
+ data = ''
+ newEtag = get_etag(f)
+ if newEtag: result["etag"] = newEtag
+ elif etag: result["etag"] = etag
+ newModified = get_modified(f)
+ if newModified: result["modified"] = newModified
+ elif modified: result["modified"] = modified
+ if hasattr(f, "url"):
+ result["url"] = f.url
+ if hasattr(f, "headers"):
+ result["headers"] = f.headers.dict
+ if hasattr(f, "status"):
+ result["status"] = f.status
+ elif hasattr(f, "url"):
+ result["status"] = 200
+ # get the xml encoding
+# xmlheaderRe = re.compile('<\?.*encoding="(.*)".*\?>') # TvdV's version
+# xmlheaderRe = re.compile('xml\s.*\sencoding=(".*"|\'.*\').*') # Blake's version
+ xmlheaderRe = re.compile('<\?.*encoding=[\'"](.*?)[\'"].*\?>') # Andrei's version
+ match = xmlheaderRe.match(data)
+ if match:
+ result['encoding'] = match.groups()[0].lower()
+ f.close()
+ result['channel'] = {}
+ result['items'] = {}
+ if _XML_AVAILABLE:
+ result['bozo'] = not isWellFormed(data)
+ baseuri = result.get('headers', {}).get('content-location', result.get('url'))
+ r = FeedParser(baseuri)
+ r.feed(data)
+ result['channel'] = r.channel
+ result['items'] = r.items
+ return result
+
+TEST_SUITE = ('http://www.pocketsoap.com/rssTests/rss1.0withModules.xml',
+ 'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNS.xml',
+ 'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNSLocalNameClash.xml',
+ 'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModules.xml',
+ 'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModulesLocalNameClash.xml',
+ 'http://www.pocketsoap.com/rssTests/rss2.0NSwithModules.xml',
+ 'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNS.xml',
+ 'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNSLocalNameClash.xml')
+
+last_day_year = time.localtime(time.mktime(
+ (time.gmtime()[0], 12, 31, 0, 0, 0, 0, 0, 0)))
+last_day_january = time.localtime(time.mktime(
+ (time.gmtime()[0], 1, 31, 0, 0, 0, 0, 0, 0)))
+first_day_month = time.localtime(time.mktime(
+ (time.gmtime()[0], time.gmtime()[1], 1, 0, 0, 0, 0, 0, 0)))
+first_day_december = time.localtime(time.mktime(
+ (time.gmtime()[0], 12, 1, 0, 0, 0, 0, 0, 0)))
+DATETIME_SUITE = (
+ ('asctime', 'Sun Jan 4 16:29:06 PST 2004',
+ (2004, 1, 5, 0, 29, 6, 0, 5, 0)),
+ ('RFC-2822', 'Sat, 03 Jan 2004 07:21:52 GMT',
+ (2004, 1, 3, 7, 21, 52, 5, 3, 0)),
+ # http://www.w3.org/TR/NOTE-datetime
+ ('W3C-datetime (Tokyo)', '2003-12-31T18:14:55+08:00',
+ (2003, 12, 31, 10, 14, 55, 2, 365, 0)),
+ ('W3C-datetime (San Francisco)', '2003-12-31T10:14:55-08:00',
+ (2003, 12, 31, 18, 14, 55, 2, 365, 0)),
+ ('W3C-datetime (zulu)', '2003-12-31T10:14:55Z',
+ (2003, 12, 31, 10, 14, 55, 2, 365, 0)),
+ # Complete ISO 8601 test cases for the sake of completeness
+ # See:
+ # http://www.cl.cam.ac.uk/~mgk25/iso-time.html
+ # http://www.mcs.vuw.ac.nz/technical/software/SGML/doc/iso8601/ISO8601.html
+ ('ISO8601 date only', '2003-12-31',
+ (2003, 12, 31, 0, 0, 0, 2, 365, 0)),
+ ('ISO8601 date only (variant)', '20031231',
+ (2003, 12, 31, 0, 0, 0, 2, 365, 0)),
+ ('ISO8601 year/month only', '2003-12',
+ (2003, 12, 1, 0, 0, 0, 0, 335, 0)),
+ ('ISO8601 year only', '2003',
+ (2003, 1, 1, 0, 0, 0, 2, 1, 0)),
+ ('ISO8601 century only', '21',
+ (2001, 1, 1, 0, 0, 0, 0, 1, 0)),
+ ('ISO8601 century omitted', '03-12-31',
+ (2003, 12, 31, 0, 0, 0, 2, 365, 0)),
+ ('ISO8601 century omitted (variant)', '031231',
+ (2003, 12, 31, 0, 0, 0, 2, 365, 0)),
+ ('ISO8601 year/month only (century omitted)', '-03-12',
+ (2003, 12, 1, 0, 0, 0, 0, 335, 0)),
+ ('ISO8601 year/month only (century omitted variant)', '-0312',
+ (2003, 12, 1, 0, 0, 0, 0, 335, 0)),
+ ('ISO8601 year only (century omitted)', '-03',
+ (2003, 1, 1, 0, 0, 0, 2, 1, 0)),
+ ('ISO8601 day/month only (year omitted)', '--12-31', last_day_year),
+ ('ISO8601 day/month only (year omitted variant)', '--1231', last_day_year),
+ ('ISO8601 month only', '--12', first_day_december),
+ ('ISO8601 day only', '---01', first_day_month),
+ ('ISO8601 year/ordinal', '2003-335',
+ (2003, 12, 1, 0, 0, 0, 0, 335, 0)),
+ ('ISO8601 year/ordinal (variant)', '2003335',
+ (2003, 12, 1, 0, 0, 0, 0, 335, 0)),
+ ('ISO8601 year/ordinal (century omitted)', '03-335',
+ (2003, 12, 1, 0, 0, 0, 0, 335, 0)),
+ ('ISO8601 year/ordinal (century omitted variant)', '03335',
+ (2003, 12, 1, 0, 0, 0, 0, 335, 0)),
+ ('ISO8601 ordinal only', '-%03d' % last_day_year[-2], last_day_year),
+ ('ISO8601 ordinal only', '-031', last_day_january),
+ # XXX missing ISO 8601 week/day formats
+ # time formats
+ ('ISO8601 time only', '17:41:00',
+ time.gmtime()[0:3] + (17, 41 ,00) + time.gmtime()[-3:]),
+ ('ISO8601 time only (zulu)', '17:41:00Z',
+ time.gmtime()[0:3] + (17, 41 ,00) + time.gmtime()[-3:]),
+ ('ISO8601 time only (Tokyo)', '18:14:55+08:00',
+ time.gmtime()[0:3] + (10, 14, 55) + time.gmtime()[-3:]),
+ ('ISO8601 time only (Tokyo)', '18:14:55+08',
+ time.gmtime()[0:3] + (10, 14, 55) + time.gmtime()[-3:]),
+ # rollover, leap years, and so on
+ ('Rollover', '2004-02-28T18:14:55-08:00',
+ (2004, 2, 29, 2, 14, 55, 6, 60, 0)),
+ ('Rollover', '2003-02-28T18:14:55-08:00',
+ (2003, 3, 1, 2, 14, 55, 5, 60, 0)),
+ ('Rollover (Y2K)', '2000-02-28T18:14:55-08:00',
+ (2000, 2, 29, 2, 14, 55, 1, 60, 0)),
+ # this will overflow due to 32-bit time_t overflow
+ # years multiple of 100 but not of 400 are not leap years, e.g. 1900, 2100
+ ('Rollover (2100) (IGNORE)', '2100-02-28T18:14:55-08:00',
+ (2100, 3, 1, 2, 14, 55, 0, 60, 0)),
+ # miscellaneous non-conforming formats, seen in the wild
+ ('Bogus (from http://mindview.net/WebLog/RSS.xml)', '1-2-04', None),
+ ('US-style date only', '04-01-05',
+ (2004, 1, 5, 0, 0, 0, 0, 5, 0)))
+
+
+if __name__ == '__main__':
+ if sys.argv[1:] == ['date']:
+ for test, date, gmtime in DATETIME_SUITE:
+ result = parse_date(date)
+ if result != gmtime:
+ print '### failed test for', test, '("%s")' % date
+ print 'got', result, 'expected', gmtime
+ sys.exit(0)
+ if sys.argv[1:]:
+ urls = sys.argv[1:]
+ else:
+ urls = TEST_SUITE
+ from pprint import pprint
+ for url in urls:
+ print url
+ print
+ result = parse(url)
+ pprint(result)
+ print
+
+"""
+TODO
+- image
+- author
+- contributor
+- comments
+- base64 content
+"""
Added: webservices/admin/planet/feedparser.pyc
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/feedparser.pyc?rev=355462&view=auto
==============================================================================
Binary file - no diff available.
Propchange: webservices/admin/planet/feedparser.pyc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: webservices/admin/planet/foafroll.xml.tmpl
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/foafroll.xml.tmpl?rev=355462&view=auto
==============================================================================
--- webservices/admin/planet/foafroll.xml.tmpl (added)
+++ webservices/admin/planet/foafroll.xml.tmpl Fri Dec 9 04:21:26 2005
@@ -0,0 +1,31 @@
+<?xml version="1.0"?>
+<rdf:RDF
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
+ xmlns:foaf="http://xmlns.com/foaf/0.1/"
+ xmlns:rss="http://purl.org/rss/1.0/"
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+>
+<foaf:Group>
+ <foaf:name><TMPL_VAR name></foaf:name>
+ <foaf:homepage>http://www.planetapache.org/</foaf:homepage>
+ <rdfs:seeAlso rdf:resource="http://www.planetapache.org/foafroll.xml" />
+
+<TMPL_LOOP Channels>
+ <foaf:member>
+ <foaf:Agent>
+ <foaf:name><TMPL_VAR name></foaf:name>
+ <foaf:weblog>
+ <foaf:Document rdf:about="<TMPL_VAR link ESCAPE="HTML">">
+ <dc:title><TMPL_VAR title ESCAPE="HTML"></dc:title>
+ <rdfs:seeAlso>
+ <rss:channel rdf:about="<TMPL_VAR uri ESCAPE="HTML">" />
+ </rdfs:seeAlso>
+ </foaf:Document>
+ </foaf:weblog>
+ </foaf:Agent>
+ </foaf:member>
+</TMPL_LOOP>
+
+</foaf:Group>
+</rdf:RDF>
Added: webservices/admin/planet/foafroll.xml.tmplc
URL: http://svn.apache.org/viewcvs/webservices/admin/planet/foafroll.xml.tmplc?rev=355462&view=auto
==============================================================================
Binary file - no diff available.
Propchange: webservices/admin/planet/foafroll.xml.tmplc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream