You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by bu...@apache.org on 2005/01/29 00:40:46 UTC
svn commit: r148952 - /jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/FeedParserImpl.java

Author: burton
Date: Fri Jan 28 15:40:44 2005
New Revision: 148952

URL: http://svn.apache.org/viewcvs?view=rev&rev=148952
Log:
impl class...
Added:
   jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/FeedParserImpl.java

Added: jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/FeedParserImpl.java
Url: http://svn.apache.org/viewcvs/jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/FeedParserImpl.java?view=auto&rev=148952
==============================================================================
--- (empty file)
+++ jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/FeedParserImpl.java	Fri Jan 28 15:40:44 2005
@@ -0,0 +1,217 @@
+/*
+ * Copyright 1999,2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.feedparser;
+
+import org.apache.commons.feedparser.tools.XMLCleanser;
+import org.apache.commons.feedparser.tools.XMLEncodingParser;
+import org.apache.log4j.Logger;
+import org.jdom.input.DOMBuilder;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * This FeedParser implementation is based on JDOM and Jaxen and is built around
+ * XPath and JDOM iteration.  While the implementation is straightforward it
+ * has not been optimized for performance.  A SAX based parser would certainly
+ * be less memory intensive but with the downside of being harder to develop.
+ *
+ * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
+ * @version $Id: FeedParser.java,v 1.12 2005/01/23 09:49:50 burton Exp $
+ */
+public class FeedParserImpl implements FeedParser {
+
+    private static final Logger log = Logger.getLogger(FeedParserImpl.class);
+
+    /** Size, in bytes, of the chunks used to drain an input stream. */
+    private static final int BUFFER_SIZE = 4096;
+
+    /** Prefix of the encoding names that are routed through XMLCleanser. */
+    private static final String UTF_PREFIX = "UTF";
+
+    /**
+     * Parse the feed read from the given stream.
+     *
+     * The stream is read fully into memory and closed, run through the UTF-8
+     * correction and FeedFilter, built into a JDOM document and then
+     * dispatched to {@link #parse(FeedParserListener, org.jdom.Document)}.
+     *
+     * @param listener The listener handed to the format specific parser.
+     * @param is The raw content of the feed.  It is closed by this method.
+     * @param resource The URL of the feed being parsed.  This is optional and
+     *                 may be null.  It is meant as a debugging aid but is not
+     *                 consulted by this implementation yet.
+     * @throws FeedParserException Rethrown untouched when raised during the
+     *                 parse; any other Throwable is wrapped in a new one.
+     */
+    public void parse(FeedParserListener listener,
+                      InputStream is,
+                      String resource) throws FeedParserException {
+
+        try {
+
+            // Need to massage our XML support for UTF-8 to prevent the dreaded
+            // "Invalid byte 1 of 1-byte UTF-8 sequence" content bug in some
+            // default feeds.  This was tested a great deal under NewsMonster
+            // and I'm happy with the results.  Within FeedParser 2.0 we will be
+            // using SAX2 so this won't be as big of a problem.  In FeedParser
+            // 2.0 (or as soon as we use SAX) this code should be totally
+            // removed to use the original stream.
+
+            InputStream corrected = getCorrectInputStream(is);
+
+            //OK.  Now we have the right InputStream so we should build our DOM
+            //and exec.
+            DOMBuilder builder = new DOMBuilder();
+
+            org.jdom.Document doc = builder.build(corrected);
+
+            parse(listener, doc);
+
+        } catch (FeedParserException fpe) {
+            //if an explicit FeedParserException is thrown just rethrow it..
+            throw fpe;
+        } catch (Throwable t) {
+
+            //FIXME: when this is a JDOM or XML parser Exception we should
+            //detect when we're working with an XHTML or HTML file and then
+            //parse it with an XFN/XOXO event listener.
+
+            throw new FeedParserException(t);
+        }
+
+    }
+
+    /**
+     * Perform the Xerces UTF8 correction and FeedFilter.
+     *
+     * @param is The raw feed; it is drained and closed by toByteArray.
+     * @return An in-memory stream over the corrected and filtered bytes.
+     * @throws Exception Whatever reading, cleansing or filtering throws.
+     */
+    private InputStream getCorrectInputStream(InputStream is)
+            throws Exception {
+
+        byte[] bytes = toByteArray(is);
+
+        //FIXME: if we return the WRONG content type here we will break.
+        //getBytes()... UTF-16 and UTF-32 especially.  We should also perform
+        //HTTP Content-Type parsing here to preserve the content type.  This can
+        //be fixed by integrating our networking API from NewsMonster.
+
+        String encoding = XMLEncodingParser.parse(bytes);
+
+        if (encoding == null) {
+            encoding = "UTF-8";
+        }
+
+        //remove prefix whitespace, intern HTML entities, etc.
+
+        //XML encoding names are case-insensitive, so "utf-8" must take the
+        //same cleansing path as "UTF-8".
+        if (encoding.regionMatches(true, 0, UTF_PREFIX, 0, UTF_PREFIX.length())) {
+
+            String result = XMLCleanser.cleanse(bytes, encoding);
+            bytes = FeedFilter.parse(result, encoding);
+
+        } else {
+
+            bytes = FeedFilter.parse(bytes, encoding);
+
+        }
+
+        //build an input stream from our bytes for parsing...
+        return new ByteArrayInputStream(bytes);
+
+    }
+
+    /**
+     * Parse a feed without naming the resource it was fetched from.
+     *
+     * @deprecated Use {@link #parse(FeedParserListener, InputStream, String)}
+     */
+    public void parse(FeedParserListener listener,
+                      InputStream is) throws FeedParserException {
+
+        parse(listener, is, null);
+
+    }
+
+    /**
+     * Parse this feed, picking the format specific parser from the document.
+     *
+     * OPML, changes.xml and Atom are recognized by the name of the root
+     * element, FOAF by a Person child of the root looked up with NS.FOAF.
+     * Everything else falls back on the RSS parser.
+     *
+     * @param listener The listener handed to the format specific parser.
+     * @param doc The already built JDOM document of the feed.
+     * @throws FeedParserException Rethrown untouched when raised during the
+     *                 parse; any other Throwable is wrapped in a new one.
+     */
+    public void parse(FeedParserListener listener,
+                      org.jdom.Document doc) throws FeedParserException {
+
+        try {
+
+            //look the root element up once; every check below needs it.
+            org.jdom.Element rootElement = doc.getRootElement();
+            String root = rootElement.getName();
+
+            //Handle OPML
+            if ("opml".equals(root)) {
+                OPMLFeedParser.parse(listener, doc);
+                return;
+            }
+
+            //Handle changes.xml
+            if ("weblogUpdates".equals(root)) {
+                ChangesFeedParser.parse(listener, doc);
+                return;
+            }
+
+            //Handle ATOM
+            if ("feed".equals(root)) {
+                AtomFeedParser.parse(listener, doc);
+                return;
+            }
+
+            //Handle FOAF
+            if (rootElement.getChildren("Person", NS.FOAF).size() > 0) {
+                FOAFFeedParser.parse(listener, doc);
+                return;
+            }
+
+            //FIXME: if this is XHTML we need to handle this with either an XFN
+            //or an XOXO directory parser.  There might be more metadata we need
+            //to parse here.  (also I wonder if this could be a chance to do
+            //autodiscovery).
+
+            //fall back on RDF and RSS parsing.
+
+            //FIXME: if this is an UNKNOWN format We need to throw an
+            //UnsupportedFeedException (which extends FeedParserException)
+            //
+            // In this situation the ROOT elements should be: rss or RDF
+
+            RSSFeedParser.parse(listener, doc);
+
+        } catch (FeedParserException fpe) {
+            //if an explicit FeedParserException is thrown just rethrow it..
+            throw fpe;
+        } catch (Throwable t) {
+            throw new FeedParserException(t);
+        }
+
+    }
+
+    /**
+     * Convert an InputStream to a byte array.
+     *
+     * @param is The stream to drain.  It is closed before this method returns,
+     *           even when reading fails.
+     * @return Every byte read from the stream up to the end of the stream.
+     * @throws IOException If reading from or closing the stream fails.
+     */
+    public byte[] toByteArray(InputStream is) throws IOException {
+
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+
+        byte[] data = new byte[BUFFER_SIZE];
+
+        try {
+
+            //read() reports the end of the stream with -1; a read of zero
+            //bytes must not be mistaken for it or the feed gets truncated.
+            int readCount;
+
+            while ((readCount = is.read(data)) != -1) {
+                bos.write(data, 0, readCount);
+            }
+
+        } finally {
+            //release the stream even when read() throws.
+            is.close();
+        }
+
+        return bos.toByteArray();
+
+    }
+
+}
+

---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org