You are viewing a plain text version of this content. The canonical link for it is here.
Posted to cvs@cocoon.apache.org by hu...@apache.org on 2002/08/04 20:33:51 UTC
cvs commit: xml-cocoon2/src/java/org/apache/cocoon/generation LinkStatusGenerator.java
huber 2002/08/04 11:33:51
Modified: src/java/org/apache/cocoon/generation
LinkStatusGenerator.java
Log:
Explicitly close BufferedReader and HttpURLConnection
Revision Changes Path
1.5 +170 -101 xml-cocoon2/src/java/org/apache/cocoon/generation/LinkStatusGenerator.java
Index: LinkStatusGenerator.java
===================================================================
RCS file: /home/cvs/xml-cocoon2/src/java/org/apache/cocoon/generation/LinkStatusGenerator.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- LinkStatusGenerator.java 2 Aug 2002 07:06:21 -0000 1.4
+++ LinkStatusGenerator.java 4 Aug 2002 18:33:51 -0000 1.5
@@ -1,3 +1,53 @@
+/*
+
+ ============================================================================
+ The Apache Software License, Version 1.1
+ ============================================================================
+
+ Copyright (C) 1999-2002 The Apache Software Foundation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modifica-
+ tion, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. The end-user documentation included with the redistribution, if any, must
+ include the following acknowledgment: "This product includes software
+ developed by the Apache Software Foundation (http://www.apache.org/)."
+ Alternately, this acknowledgment may appear in the software itself, if
+ and wherever such third-party acknowledgments normally appear.
+
+ 4. The names "Apache Cocoon" and "Apache Software Foundation" must not be
+ used to endorse or promote products derived from this software without
+ prior written permission. For written permission, please contact
+ apache@apache.org.
+
+ 5. Products derived from this software may not be called "Apache", nor may
+ "Apache" appear in their name, without prior written permission of the
+ Apache Software Foundation.
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
+ INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLU-
+ DING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many individuals
+ on behalf of the Apache Software Foundation and was originally created by
+ Stefano Mazzocchi <st...@apache.org>. For more information on the Apache
+ Software Foundation, please see <http://www.apache.org/>.
+
+ */
package org.apache.cocoon.generation;
import org.apache.avalon.excalibur.pool.Recyclable;
@@ -40,23 +90,23 @@
public class LinkStatusGenerator extends ComposerGenerator implements Recyclable, Configurable {
/** The URI of the namespace of this generator. */
protected static final String URI =
- "http://apache.org/cocoon/linkstatus/2.0";
-
+ "http://apache.org/cocoon/linkstatus/2.0";
+
/** The namespace prefix for this namespace. */
protected static final String PREFIX = "linkstatus";
-
+
/* Node and attribute names */
protected static final String TOP_NODE_NAME = "linkstatus";
protected static final String LINK_NODE_NAME = "link";
-
+
protected static final String HREF_ATTR_NAME = "href";
protected static final String REFERRER_ATTR_NAME = "referrer";
protected static final String CONTENT_ATTR_NAME = "content";
protected static final String STATUS_ATTR_NAME = "status";
- protected static final String MESSAGE_ATTR_NAME = "message";
-
+ protected static final String MESSAGE_ATTR_NAME = "message";
+
protected AttributesImpl attributes = new AttributesImpl();
-
+
/**
* Config element name specifying expected link content-type.
* <p>
@@ -66,7 +116,7 @@
* @since
*/
public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type";
-
+
/**
* Default value of <code>link-content-type</code> configuration value.
* <p>
@@ -96,7 +146,7 @@
* @since
*/
public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";
-
+
/**
* Config element name specifying excluding regular expression pattern.
* <p>
@@ -106,7 +156,7 @@
* @since
*/
public final static String EXCLUDE_CONFIG = "exclude";
-
+
/**
* Config element name specifying including regular expression pattern.
* <p>
@@ -116,7 +166,7 @@
* @since
*/
public final static String INCLUDE_CONFIG = "include";
-
+
/**
* Config element name specifying http header value for user-Agent.
* <p>
@@ -133,7 +183,7 @@
* @since
*/
public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;
-
+
/**
* Config element name specifying http header value for accept.
* <p>
@@ -152,43 +202,42 @@
* @since
*/
public final static String ACCEPT_DEFAULT = "*/*";
-
+
private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
private String linkContentType = LINK_CONTENT_TYPE_DEFAULT;
private HashSet excludeCrawlingURL;
private HashSet includeCrawlingURL;
private String userAgent = USER_AGENT_DEFAULT;
private String accept = ACCEPT_DEFAULT;
-
+
private HashSet crawled;
private HashSet linksToProcess;
-
+
/**
* Stores links to process and the referrer links
*/
-
private class Link {
private URL url;
private String referrer;
-
+
public Link( URL url, String referrer ) {
this.url = url;
this.referrer = referrer;
}
-
+
public URL getURL() {
return url;
}
-
+
public String getReferrer() {
return referrer;
}
-
+
public boolean equals( Link l ) {
return url.equals( l.getURL());
}
}
-
+
/**
* Configure the crawler component.
* <p>
@@ -214,8 +263,8 @@
* @since
*/
public void configure(Configuration configuration)
- throws ConfigurationException {
-
+ throws ConfigurationException {
+
Configuration[] children;
children = configuration.getChildren(INCLUDE_CONFIG);
if (children != null && children.length > 0) {
@@ -229,12 +278,12 @@
this.includeCrawlingURL.add(new RE(tokenized_pattern));
}
} catch (RESyntaxException rese) {
- getLogger().error("Cannot create includeing regular-expression for " +
- pattern, rese);
+ getLogger().error("Cannot create including regular-expression for " +
+ pattern, rese);
}
}
}
-
+
children = configuration.getChildren(EXCLUDE_CONFIG);
if (children != null && children.length > 0) {
excludeCrawlingURL = new HashSet();
@@ -247,15 +296,15 @@
this.excludeCrawlingURL.add(new RE(tokenized_pattern));
}
} catch (RESyntaxException rese) {
- getLogger().error("Cannot create excluding regular-expression for " +
- pattern, rese);
+ getLogger().error("Cannot create excluding regular-expression for " +
+ pattern, rese);
}
}
} else {
excludeCrawlingURL = new HashSet();
setDefaultExcludeFromCrawling();
}
-
+
Configuration child;
String value;
child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false);
@@ -272,7 +321,7 @@
this.linkViewQuery = value.trim();
}
}
-
+
child = configuration.getChild(USER_AGENT_CONFIG, false);
if (child != null) {
value = child.getValue();
@@ -280,7 +329,7 @@
this.userAgent = value;
}
}
-
+
child = configuration.getChild(ACCEPT_CONFIG, false);
if (child != null) {
value = child.getValue();
@@ -289,20 +338,20 @@
}
}
}
-
+
public void setup(SourceResolver resolver, Map objectModel, String src, Parameters par)
- throws ProcessingException, SAXException, IOException {
-
+ throws ProcessingException, SAXException, IOException {
+
super.setup(resolver, objectModel, src, par);
-
+
/* Create a reusable attributes for creating nodes */
this.attributes = new AttributesImpl();
-
+
// already done in configure...
//excludeCrawlingURL = new HashSet();
//this.setDefaultExcludeFromCrawling();
}
-
+
/**
* Generate XML data.
*
@@ -312,42 +361,42 @@
* if the requested URI wasn't found
*/
public void generate()
- throws SAXException, ProcessingException {
+ throws SAXException, ProcessingException {
try {
-
+
crawled = new HashSet();
linksToProcess = new HashSet();
-
+
URL root = new URL(source);
linksToProcess.add(new Link( root, ""));
-
-
+
+
if (getLogger().isDebugEnabled()) {
getLogger().debug("crawl URL " + root);
}
-
+
this.contentHandler.startDocument();
this.contentHandler.startPrefixMapping(PREFIX,URI);
-
+
attributes.clear();
super.contentHandler.startElement(URI, TOP_NODE_NAME, URI+':'+TOP_NODE_NAME, attributes);
-
+
while (linksToProcess.size() > 0) {
Iterator i = linksToProcess.iterator();
-
+
if (i.hasNext()) {
// fetch a URL
Link link = (Link) i.next();
- URL url = link.getURL();
-
+ URL url = link.getURL();
+
// remove it from the to-do list
linksToProcess.remove(link);
-
+
String new_url_link = processURL(url, link.getReferrer());
-
+
// calc all links from this url
if (new_url_link != null) {
-
+
List url_links = getLinksFromConnection(new_url_link, url);
if (url_links != null) {
// add links of this url to the to-do list
@@ -356,7 +405,7 @@
}
}
}
-
+
super.contentHandler.endElement(URI, TOP_NODE_NAME, URI+':'+TOP_NODE_NAME);
this.contentHandler.endPrefixMapping(PREFIX);
this.contentHandler.endDocument();
@@ -365,7 +414,7 @@
throw new ResourceNotFoundException("Could not read source ", ioe);
}
}
-
+
/**
* Default exclude patterns.
* <p>
@@ -389,19 +438,19 @@
".*\\.js(\\?.*)?$",
".*\\.css(\\?.*)?$"
};
-
+
for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) {
String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
try {
excludeCrawlingURL.add(new RE(pattern));
} catch (RESyntaxException rese) {
getLogger().error("Cannot create excluding regular-expression for " +
- pattern, rese);
+ pattern, rese);
}
}
}
-
-
+
+
/**
* Retrieve a list of links of a url
*
@@ -410,31 +459,38 @@
* <code>http://host/foo/bar?cocoon-view=links</code>
* @param url_of_referrer base url of which links are requested, ie of the form
* <code>http://host/foo/bar</code>
- * @return List of links from url_of_referrer, as result of requesting url
+ * @return List of links from url_of_referrer, as result of requesting url
* url_link_string
*/
protected List getLinksFromConnection(String url_link_string, URL url_of_referrer) {
List url_links = null;
+ BufferedReader br = null;
try {
URL url_link = new URL( url_link_string );
URLConnection conn = url_link.openConnection();
String content_type = conn.getContentType();
-
+
+ if (content_type == null) {
+ getLogger().warn( "No content type available for " + String.valueOf( url_link_string ) );
+ // caller checks if null
+ return url_links;
+ }
+
if (getLogger().isDebugEnabled()) {
getLogger().debug("Content-type: " + content_type);
}
-
+
if (content_type.equals(linkContentType)) {
url_links = new ArrayList();
-
+
InputStream is = conn.getInputStream();
- BufferedReader br = new BufferedReader(new InputStreamReader(is));
-
+ br = new BufferedReader(new InputStreamReader(is));
+
// content is supposed to be a list of links,
// relative to current URL
String line;
String referrer = url_of_referrer.toString();
-
+
while ((line = br.readLine()) != null) {
URL new_url = new URL(url_link, line);
boolean add_url = true;
@@ -442,22 +498,22 @@
if (add_url) {
add_url &= !url_links.contains(new_url);
}
-
+
// don't add new_url if it has been crawled already
if (add_url) {
add_url &= !crawled.contains(new_url.toString());
}
-
+
Link new_link = new Link( new_url, referrer );
if (add_url) {
add_url &= !linksToProcess.contains(new_link);
}
-
+
// don't add if is not matched by existing include definition
if (add_url) {
add_url &= isIncludedURL(new_url.toString());
}
-
+
if (add_url) {
if (getLogger().isDebugEnabled()) {
getLogger().debug("Add URL: " + new_url.toString());
@@ -469,10 +525,18 @@
}
} catch (IOException ioe) {
getLogger().warn("Problems get links of " + url_link_string, ioe);
+ } finally {
+ // explicitly close the stream
+ if (br != null) {
+ try {
+ br.close();
+ br = null;
+ } catch (IOException ignored) {}
+ }
}
return url_links;
}
-
+
/**
* Generate xml attributes of a url, calculate url for retrieving links
*
@@ -482,57 +546,63 @@
* and not an included-url.
*/
protected String processURL(URL url, String referrer) throws SAXException {
-
+
if (getLogger().isDebugEnabled()) {
getLogger().debug("getLinks URL " + url);
}
-
+
String result = null;
-
+
// don't try to investigate a url which has been crawled already
if (crawled.contains(url.toString())) {
return null;
}
-
+
// mark it as crawled
crawled.add(url.toString());
-
+
attributes.clear();
attributes.addAttribute("", HREF_ATTR_NAME,
- HREF_ATTR_NAME, "CDATA", url.toString());
+ HREF_ATTR_NAME, "CDATA", url.toString());
attributes.addAttribute("", REFERRER_ATTR_NAME,
- REFERRER_ATTR_NAME, "CDATA", referrer);
-
+ REFERRER_ATTR_NAME, "CDATA", referrer);
+
// Output url, referrer, content-type, status, message for traversable url's
+ HttpURLConnection h = null;
try {
+
URLConnection links_url_connection = url.openConnection();
- HttpURLConnection h = (HttpURLConnection)links_url_connection;
+ h = (HttpURLConnection)links_url_connection;
String content_type = links_url_connection.getContentType();
-
+
attributes.addAttribute("", CONTENT_ATTR_NAME,
- CONTENT_ATTR_NAME, "CDATA",
- content_type);
-
+ CONTENT_ATTR_NAME, "CDATA",
+ content_type);
+
attributes.addAttribute("", MESSAGE_ATTR_NAME,
- MESSAGE_ATTR_NAME, "CDATA",
- h.getResponseMessage());
-
+ MESSAGE_ATTR_NAME, "CDATA",
+ h.getResponseMessage());
+
attributes.addAttribute("", STATUS_ATTR_NAME,
- STATUS_ATTR_NAME, "CDATA",
- String.valueOf(h.getResponseCode()));
+ STATUS_ATTR_NAME, "CDATA",
+ String.valueOf(h.getResponseCode()));
} catch (IOException ioe) {
attributes.addAttribute("", MESSAGE_ATTR_NAME,
- MESSAGE_ATTR_NAME, "CDATA",
- ioe.getMessage());
+ MESSAGE_ATTR_NAME, "CDATA",
+ ioe.getMessage());
+ } finally {
+ if (h != null) {
+ h.disconnect();
+ }
}
-
+
// don't try to get links of a url which is excluded from crawling
- // try to get links of a url which is included for crawling
+ // try to get links of a url which is included for crawling
if (!isExcludedURL(url.toString()) && isIncludedURL( url.toString() )) {
// add prefix and query to get data from the linkserializer.
result = url.toExternalForm()
- + ((url.toExternalForm().indexOf("?") == -1) ? "?" : "&")
- + linkViewQuery;
+ + ((url.toExternalForm().indexOf("?") == -1) ? "?" : "&")
+ + linkViewQuery;
}
super.contentHandler.startElement(URI, LINK_NODE_NAME, URI+':'+LINK_NODE_NAME, attributes);
@@ -540,7 +610,7 @@
return result;
}
-
+
/**
* check if URL is a candidate for indexing
*
@@ -556,7 +626,7 @@
}
return false;
}
-
+
final String s = url.toString();
Iterator i = excludeCrawlingURL.iterator();
while (i.hasNext()) {
@@ -573,8 +643,8 @@
}
return false;
}
-
-
+
+
/**
* check if URL is a candidate for indexing
*
@@ -590,7 +660,7 @@
}
return true;
}
-
+
final String s = url.toString();
Iterator i = includeCrawlingURL.iterator();
while (i.hasNext()) {
@@ -607,12 +677,11 @@
}
return false;
}
-
+
public void recycle() {
super.recycle();
-
+
this.attributes = null;
//this.excludeCrawlingURL = null;
}
}
-
----------------------------------------------------------------------
In case of troubles, e-mail: webmaster@xml.apache.org
To unsubscribe, e-mail: cocoon-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: cocoon-cvs-help@xml.apache.org