You are viewing a plain text version of this content. The canonical link for it is here.
Posted to cvs@cocoon.apache.org by ni...@apache.org on 2002/03/26 09:10:58 UTC
cvs commit: xml-cocoon2/src/scratchpad/src/org/apache/cocoon/generation LinkStatusGenerator.java
nicolaken 02/03/26 00:10:58
Modified: . changes.xml
Added: src/scratchpad/webapp/mount/linkstatus linkstatus.xsl
sitemap.xmap
src/scratchpad/src/org/apache/cocoon/generation
LinkStatusGenerator.java
Log:
Added LinkStatusGenerator donated by Michael Homeijer and accompanying sample
sitemap to scratchpad.
Revision Changes Path
1.130 +5 -1 xml-cocoon2/changes.xml
Index: changes.xml
===================================================================
RCS file: /home/cvs/xml-cocoon2/changes.xml,v
retrieving revision 1.129
retrieving revision 1.130
diff -u -r1.129 -r1.130
--- changes.xml 26 Mar 2002 08:09:01 -0000 1.129
+++ changes.xml 26 Mar 2002 08:10:58 -0000 1.130
@@ -4,7 +4,7 @@
<!--
History of Cocoon changes
- $Id: changes.xml,v 1.129 2002/03/26 08:09:01 nicolaken Exp $
+ $Id: changes.xml,v 1.130 2002/03/26 08:10:58 nicolaken Exp $
-->
<changes title="History of Changes">
@@ -35,6 +35,10 @@
</devs>
<release version="@version@" date="@date@">
+ <action dev="NKB" type="add">
+ Added LinkStatusGenerator donated by Michael Homeijer and accompanying sample
+ sitemap to scratchpad.
+ </action>
<action dev="NKB" type="update">
Moved castor scratchpad sample from /samples to /mount as other samples.
Now the refactored sample page points to the mount dir, thus
1.1 xml-cocoon2/src/scratchpad/webapp/mount/linkstatus/linkstatus.xsl
Index: linkstatus.xsl
===================================================================
<?xml version="1.0"?>
<!--
  Stylesheet for the output of the linkstatus generator: each
  linkstatus:link element is rendered as one line of text listing its
  href, referrer, content, status and message attributes, followed by a
  BR element.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:linkstatus="http://apache.org/cocoon/linkstatus/2.0">
<!-- Root element: produces no markup of its own, only processes its children. -->
<xsl:template match="linkstatus:linkstatus">
<xsl:apply-templates/>
</xsl:template>
<!-- One checked link: print its attributes on a single line. -->
<xsl:template match="linkstatus:link">
HREF: <xsl:value-of select="@href"/>, REFERRER: <xsl:value-of select="@referrer"/>, CONTENT-TYPE: <xsl:value-of select="@content"/>, STATUS: <xsl:value-of select="@status"/>, MESSAGE: <xsl:value-of select="@message"/><BR/>
</xsl:template>
</xsl:stylesheet>
1.1 xml-cocoon2/src/scratchpad/webapp/mount/linkstatus/sitemap.xmap
Index: sitemap.xmap
===================================================================
<?xml version="1.0"?>
<!--
  Sample sitemap for the LinkStatusGenerator: registers the generator under
  the name "linkstatus" and exposes one pipeline that crawls a fixed start
  URL and renders the result with linkstatus.xsl.
-->
<map:sitemap xmlns:map="http://apache.org/cocoon/sitemap/1.0">
<!-- =========================== Components ================================ -->
<map:components>
<map:generators default="file">
<!-- The generator under test; everything else relies on the declared defaults. -->
<map:generator name="linkstatus" logger="sitemap.generator.linkstatus" label="content,data"
src="org.apache.cocoon.generation.LinkStatusGenerator"/>
</map:generators>
<map:transformers default="xslt"/>
<map:readers default="resource"/>
<map:serializers default="html"/>
<map:selectors default="browser"/>
<map:matchers default="wildcard">
<map:matcher name="wildcard" src="org.apache.cocoon.matching.WildcardURIMatcherFactory"/>
</map:matchers>
</map:components>
<!--
  "links" view. NOTE(review): the "links" serializer type is not declared in
  this file; presumably it is inherited from the parent sitemap - verify.
-->
<map:views>
<map:view name="links" from-position="last">
<map:serialize type="links"/>
</map:view>
</map:views>
<!-- =========================== Pipelines ================================= -->
<map:pipelines>
<map:pipeline>
<!-- The empty path redirects to the "linkstatus" pipeline below. -->
<map:match pattern="">
<map:redirect-to uri="linkstatus"/>
</map:match>
<!-- Crawl starting at the hard-coded src URL, then style and serialize the report. -->
<map:match pattern="linkstatus">
<map:generate type="linkstatus" src="http://localhost:8080/cocoon/welcome"/>
<map:transform src="linkstatus.xsl"/>
<map:serialize/>
</map:match>
</map:pipeline>
</map:pipelines>
</map:sitemap>
<!-- end of file -->
1.1 xml-cocoon2/src/scratchpad/src/org/apache/cocoon/generation/LinkStatusGenerator.java
Index: LinkStatusGenerator.java
===================================================================
package org.apache.cocoon.generation;
import org.apache.avalon.excalibur.pool.Recyclable;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.ResourceNotFoundException;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.cocoon.Constants;
import org.apache.cocoon.util.Tokenizer;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;
import org.apache.log.Logger;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import java.io.IOException;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URLConnection;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Map;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ArrayList;
/**
* Generates a list of links that are reachable from the src and their status.
*
* @author Michael Homeijer
*/
public class LinkStatusGenerator extends ComposerGenerator implements Recyclable, Configurable {
/** The URI of the namespace of this generator. */
protected static final String URI =
"http://apache.org/cocoon/linkstatus/2.0";
/** The namespace prefix for this namespace. */
protected static final String PREFIX = "linkstatus";
/* Node and attribute names */
/** Local name of the document (top-level) element. */
protected static final String TOP_NODE_NAME = "linkstatus";
/** Local name of the element emitted once per checked link. */
protected static final String LINK_NODE_NAME = "link";
/** Attribute carrying the URL that was checked. */
protected static final String HREF_ATTR_NAME = "href";
/** Attribute carrying the URL of the page on which the link was found. */
protected static final String REFERRER_ATTR_NAME = "referrer";
/** Attribute carrying the content type reported for the link. */
protected static final String CONTENT_ATTR_NAME = "content";
/** Attribute carrying the HTTP response code of the link. */
protected static final String STATUS_ATTR_NAME = "status";
/** Attribute carrying the HTTP response message, or the I/O error message. */
protected static final String MESSAGE_ATTR_NAME = "message";
/** Attribute list that is cleared and refilled for every element emitted. */
protected AttributesImpl attributes = new AttributesImpl();
/**
 * Config element name specifying the expected link content-type.
 * <p>
 * Its value is <code>link-content-type</code>.
 * </p>
 */
public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type";
/**
 * Default value of the <code>link-content-type</code> configuration value.
 * <p>
 * Its value is <code>application/x-cocoon-links</code>.
 * </p>
 * <p>
 * Declared <code>static</code> like the other default constants of this
 * class, so that it is a class-level constant rather than a per-instance
 * field.
 * </p>
 */
public final static String LINK_CONTENT_TYPE_DEFAULT = "application/x-cocoon-links";
/**
 * Config element name specifying the query-string appended to a URL when
 * requesting the links of that URL.
 * <p>
 * Its value is <code>link-view-query</code>.
 * </p>
 */
public final static String LINK_VIEW_QUERY_CONFIG = "link-view-query";
/**
 * Default value of the <code>link-view-query</code> configuration value.
 * <p>
 * Its value is <code>cocoon-view=links</code>, without a leading
 * <code>?</code>: the generator adds the <code>?</code> or
 * <code>&amp;</code> separator itself when it builds the request URL.
 * </p>
 */
public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";
/**
 * Config element name specifying an excluding regular expression pattern.
 * <p>
 * Its value is <code>exclude</code>.
 * </p>
 */
public final static String EXCLUDE_CONFIG = "exclude";
/**
 * Config element name specifying an including regular expression pattern.
 * <p>
 * Its value is <code>include</code>.
 * </p>
 */
public final static String INCLUDE_CONFIG = "include";
/**
 * Config element name specifying the http header value for User-Agent.
 * <p>
 * Its value is <code>user-agent</code>.
 * </p>
 */
public final static String USER_AGENT_CONFIG = "user-agent";
/**
 * Default value of the <code>user-agent</code> configuration value.
 * <p>
 * Its value is {@link org.apache.cocoon.Constants#COMPLETE_NAME}.
 * </p>
 */
public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;
/**
 * Config element name specifying the http header value for Accept.
 * <p>
 * Its value is <code>accept</code>.
 * </p>
 */
public final static String ACCEPT_CONFIG = "accept";
/**
 * Default value of the <code>accept</code> configuration value.
 * <p>
 * Its value is <code>*&#47;*</code> (any media type).
 * </p>
 */
public final static String ACCEPT_DEFAULT = "*/*";
/** Query string appended to a URL to request its links view. */
private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
/** Content type a response must have to be read as a list of links. */
private String linkContentType = LINK_CONTENT_TYPE_DEFAULT;
/** Compiled RE patterns; a URL matching any of them is not traversed. */
private HashSet excludeCrawlingURL;
/** Compiled RE patterns a URL must match to be queued; null means "include everything". */
private HashSet includeCrawlingURL;
/** Configured value for the User-Agent http header. */
private String userAgent = USER_AGENT_DEFAULT;
/** Configured value for the Accept http header. */
private String accept = ACCEPT_DEFAULT;
/** String forms of the URLs that have already been processed in the current crawl. */
private HashSet crawled;
/** Link objects that are still waiting to be processed in the current crawl. */
private HashSet linksToProcess;
/**
 * A link that is waiting to be checked, together with the page that
 * referred to it.
 * <p>
 * Two links are considered equal when their URLs are equal; the referrer
 * is deliberately ignored so that a URL reachable from several pages is
 * only queued once.
 * </p>
 */
private class Link {
    private URL url;
    private String referrer;

    /**
     * @param url      the URL to check
     * @param referrer the URL of the page the link was found on
     */
    public Link( URL url, String referrer ) {
        this.url = url;
        this.referrer = referrer;
    }

    /** @return the URL to check */
    public URL getURL() {
        return url;
    }

    /** @return the URL of the page the link was found on */
    public String getReferrer() {
        return referrer;
    }

    /**
     * Compares links by URL.
     * <p>
     * The parameter type must be <code>Object</code>: collections such as
     * <code>HashSet</code> and <code>ArrayList</code> call
     * <code>equals(Object)</code>, so an overload taking a
     * <code>Link</code> would never be used by <code>contains()</code>.
     * </p>
     *
     * @param other the object to compare with
     * @return <code>true</code> if <code>other</code> is a link to the same URL
     */
    public boolean equals( Object other ) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof Link)) {
            return false;
        }
        return url.equals(((Link) other).getURL());
    }

    /**
     * Hash code derived from the URL only, consistent with
     * {@link #equals(Object)}, so that hash based collections find equal
     * links.
     *
     * @return the hash code of the URL
     */
    public int hashCode() {
        return url.hashCode();
    }
}
/**
 * Configure the crawler component.
 * <p>
 * The configuration can specify which URIs to include in, and which URIs
 * to exclude from crawling. The patterns are regular expressions; one
 * element may hold several patterns, which are split by a
 * <code>Tokenizer</code> using the delimiter string <code>", "</code>.
 * When no <code>exclude</code> element is given the default exclude
 * patterns are used.
 * </p>
 * <p>
 * Moreover you can configure the required content-type of a crawling
 * response, the query-string appended to each crawling request, and the
 * <code>user-agent</code> and <code>accept</code> values.
 * </p>
 * <pre><tt>
 *   &lt;include&gt;.*\.html?&lt;/include&gt; or &lt;include&gt;.*\.html?, .*\.xsp&lt;/include&gt;
 *   &lt;exclude&gt;.*\.gif&lt;/exclude&gt; or &lt;exclude&gt;.*\.gif, .*\.jpe?g&lt;/exclude&gt;
 *   &lt;link-content-type&gt; application/x-cocoon-links &lt;/link-content-type&gt;
 *   &lt;link-view-query&gt; cocoon-view=links &lt;/link-view-query&gt;
 * </tt></pre>
 *
 * @param  configuration               XML configuration of this avalon component.
 * @exception  ConfigurationException  is thrown if the configuration is invalid.
 */
public void configure(Configuration configuration)
throws ConfigurationException {
    Configuration[] includes = configuration.getChildren(INCLUDE_CONFIG);
    if (includes != null && includes.length > 0) {
        includeCrawlingURL = compilePatterns(includes, "including");
    }

    Configuration[] excludes = configuration.getChildren(EXCLUDE_CONFIG);
    if (excludes != null && excludes.length > 0) {
        excludeCrawlingURL = compilePatterns(excludes, "excluding");
    } else {
        excludeCrawlingURL = new HashSet();
        setDefaultExcludeFromCrawling();
    }

    // Content type and view query are trimmed; the header values are taken verbatim.
    this.linkContentType = readSetting(configuration, LINK_CONTENT_TYPE_CONFIG, this.linkContentType, true);
    this.linkViewQuery = readSetting(configuration, LINK_VIEW_QUERY_CONFIG, this.linkViewQuery, true);
    this.userAgent = readSetting(configuration, USER_AGENT_CONFIG, this.userAgent, false);
    this.accept = readSetting(configuration, ACCEPT_CONFIG, this.accept, false);
}

/**
 * Compiles every pattern found in the given configuration elements.
 * <p>
 * A pattern that is not a valid regular expression is logged and skipped;
 * it does not prevent the remaining patterns from being compiled.
 * </p>
 *
 * @param children  the <code>include</code> or <code>exclude</code> elements
 * @param kind      word used in the log message ("including"/"excluding")
 * @return the set of compiled <code>RE</code> objects, possibly empty
 * @exception ConfigurationException if an element has no value
 */
private HashSet compilePatterns(Configuration[] children, String kind)
throws ConfigurationException {
    HashSet compiled = new HashSet();
    for (int i = 0; i < children.length; i++) {
        Tokenizer tokens = new Tokenizer(children[i].getValue(), ", ");
        while (tokens.hasMoreTokens()) {
            String token = tokens.nextToken();
            try {
                compiled.add(new RE(token));
            } catch (RESyntaxException rese) {
                getLogger().error("Cannot create " + kind + " regular-expression for " +
                        token, rese);
            }
        }
    }
    return compiled;
}

/**
 * Reads the value of an optional child element.
 *
 * @param configuration  the component configuration
 * @param name           name of the child element
 * @param fallback       value returned when the element is absent or empty
 * @param trim           whether surrounding whitespace is removed
 * @return the configured value, or <code>fallback</code>
 * @exception ConfigurationException if the element exists but has no value
 */
private String readSetting(Configuration configuration, String name, String fallback, boolean trim)
throws ConfigurationException {
    Configuration child = configuration.getChild(name, false);
    if (child == null) {
        return fallback;
    }
    String value = child.getValue();
    if (value == null || value.length() == 0) {
        return fallback;
    }
    return trim ? value.trim() : value;
}
/**
 * Prepares the generator for one request.
 * <p>
 * Creates a fresh attribute list. The exclude patterns are only
 * (re)initialised with the defaults when none are currently set, so that
 * patterns built from the component configuration are not thrown away on
 * every request.
 * </p>
 *
 * @param resolver     passed on to the superclass
 * @param objectModel  passed on to the superclass
 * @param src          the URL at which crawling starts
 * @param par          passed on to the superclass
 * @throws ProcessingException  if the superclass setup fails
 * @throws SAXException         if the superclass setup fails
 * @throws IOException          if the superclass setup fails
 */
public void setup(SourceResolver resolver, Map objectModel, String src, Parameters par)
throws ProcessingException, SAXException, IOException {
    super.setup(resolver, objectModel, src, par);

    /* Create a reusable attributes for creating nodes */
    this.attributes = new AttributesImpl();

    if (this.excludeCrawlingURL == null) {
        this.excludeCrawlingURL = new HashSet();
        this.setDefaultExcludeFromCrawling();
    }
}
/**
 * Generate XML data.
 * <p>
 * Starting at the source URL, repeatedly takes one link from the to-do
 * set, reports its status as a <code>link</code> element and queues the
 * links found on it, until the to-do set is empty. All <code>link</code>
 * elements are wrapped in a single <code>linkstatus</code> element.
 * </p>
 *
 * @throws  SAXException
 *      if an error occurs while outputting the document
 * @throws  ProcessingException
 *      if the requested URI wasn't found
 */
public void generate()
throws SAXException, ProcessingException {
    // A qualified name is "prefix:localName"; the namespace URI must not be used here.
    final String topQName = PREFIX + ':' + TOP_NODE_NAME;
    try {
        crawled = new HashSet();
        linksToProcess = new HashSet();

        URL root = new URL(source);
        linksToProcess.add(new Link( root, ""));

        if (getLogger().isDebugEnabled()) {
            getLogger().debug("crawl URL " + root);
        }

        this.contentHandler.startDocument();
        this.contentHandler.startPrefixMapping(PREFIX, URI);

        attributes.clear();
        this.contentHandler.startElement(URI, TOP_NODE_NAME, topQName, attributes);

        while (!linksToProcess.isEmpty()) {
            // fetch a link and remove it from the to-do list; the iterator
            // is not used again, so adding to the set below is safe
            Iterator pending = linksToProcess.iterator();
            Link link = (Link) pending.next();
            pending.remove();

            URL url = link.getURL();
            URLConnection conn = processURL(url, link.getReferrer());

            // calc all links from this url
            if (conn != null) {
                List url_links = getLinksFromConnection(conn, url);
                if (url_links != null) {
                    // add links of this url to the to-do list
                    linksToProcess.addAll(url_links);
                }
            }
        }

        this.contentHandler.endElement(URI, TOP_NODE_NAME, topQName);
        this.contentHandler.endPrefixMapping(PREFIX);
        this.contentHandler.endDocument();
    } catch (IOException ioe) {
        getLogger().warn("Could not read source ", ioe);
        throw new ResourceNotFoundException("Could not read source ", ioe);
    }
}
/**
 * Adds the built-in exclude patterns to the exclude set.
 * <p>
 * URLs matching one of the following patterns are excluded by default:
 * </p>
 * <ul>
 *   <li>.*\\.gif(\\?.*)?$ - gif images</li>
 *   <li>.*\\.png(\\?.*)?$ - png images</li>
 *   <li>.*\\.jpe?g(\\?.*)?$ - jpeg images</li>
 *   <li>.*\\.js(\\?.*)?$ - javascript</li>
 *   <li>.*\\.css(\\?.*)?$ - cascaded stylesheets</li>
 * </ul>
 * <p>
 * The exclude set must already exist when this method is called. A
 * pattern that fails to compile is logged and skipped.
 * </p>
 */
private void setDefaultExcludeFromCrawling() {
    final String[] defaultPatterns = new String[] {
        ".*\\.gif(\\?.*)?$",
        ".*\\.png(\\?.*)?$",
        ".*\\.jpe?g(\\?.*)?$",
        ".*\\.js(\\?.*)?$",
        ".*\\.css(\\?.*)?$"
    };

    int index = 0;
    while (index < defaultPatterns.length) {
        final String expression = defaultPatterns[index];
        index++;
        try {
            RE compiled = new RE(expression);
            excludeCrawlingURL.add(compiled);
        } catch (RESyntaxException syntaxError) {
            getLogger().error("Cannot create excluding regular-expression for " +
                    expression, syntaxError);
        }
    }
}
/**
 * Reads the links listed in the response of a links-view request.
 * <p>
 * The response is only read when its content type equals the configured
 * link content type; it is then expected to contain one link per line,
 * relative to <code>url</code>. A link is returned unless it was already
 * seen in this response, has already been crawled, is already queued
 * (membership is decided by <code>Link</code>'s equals/hashCode), or is
 * not matched by the include patterns.
 * </p>
 *
 * @param conn  the connection returned for the links view of <code>url</code>
 * @param url   the page the links belong to; used as base URL and referrer
 * @return a list of <code>Link</code> objects, or <code>null</code> when the
 *         response does not have the link content type
 */
protected List getLinksFromConnection(URLConnection conn, URL url) {
    List url_links = null;
    BufferedReader br = null;
    try {
        String content_type = conn.getContentType();
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Content-type: " + content_type);
        }

        // getContentType() returns null when the type is not known
        if (content_type != null && content_type.equals(linkContentType)) {
            url_links = new ArrayList();
            // String forms of the URLs already taken from this response
            HashSet seen = new HashSet();

            InputStream is = conn.getInputStream();
            br = new BufferedReader(new InputStreamReader(is));

            // content is supposed to be a list of links,
            // relative to current URL
            String referrer = url.toString();
            String line;
            while ((line = br.readLine()) != null) {
                URL new_url;
                try {
                    new_url = new URL(url, line);
                } catch (IOException malformed) {
                    // MalformedURLException is an IOException; one bad
                    // line must not discard the other links of the page
                    getLogger().warn("Skipping malformed link '" + line + "' of " + url, malformed);
                    continue;
                }
                String new_url_string = new_url.toString();

                // don't add new_url twice
                if (!seen.add(new_url_string)) {
                    continue;
                }
                // don't add new_url if it has been crawled already
                if (crawled.contains(new_url_string)) {
                    continue;
                }
                // don't add new_url if it is waiting to be processed
                Link new_link = new Link( new_url, referrer );
                if (linksToProcess.contains(new_link)) {
                    continue;
                }
                // don't add if is not matched by existing include definition
                if (!isIncludedURL(new_url_string)) {
                    continue;
                }

                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Add URL: " + new_url_string);
                }
                url_links.add(new_link);
            }
            // now we have a list of URL which should be examined
        }
    } catch (IOException ioe) {
        getLogger().warn("Problems get links of " + url, ioe);
    } finally {
        if (br != null) {
            try {
                br.close();
            } catch (IOException ignored) {
                // the links have been read; a failing close changes nothing
            }
        }
    }
    return url_links;
}
/**
 * Checks one URL and reports its status as a <code>link</code> element.
 * <p>
 * A URL that was already processed is skipped without output. For a URL
 * matched by the exclude patterns only its status is fetched. For any
 * other URL the links view is requested (the link view query is appended
 * to the URL, keeping an existing query string) and the open connection is
 * returned so that the caller can read the links from it.
 * </p>
 * <p>
 * The emitted element carries the <code>href</code> and
 * <code>referrer</code> attributes, the <code>content</code> type when it
 * is known, and for http connections the <code>message</code> and
 * <code>status</code> of the response. When the request fails with an I/O
 * error, <code>message</code> holds the error text instead.
 * </p>
 *
 * @param url       the URL to check
 * @param referrer  the URL of the page on which <code>url</code> was found
 * @return the connection to the links view of a traversable URL, or
 *         <code>null</code> if the URL was already crawled, is excluded
 *         from crawling, or could not be requested
 * @throws SAXException if the element cannot be written
 */
protected URLConnection processURL(URL url, String referrer) throws SAXException {
    if (getLogger().isDebugEnabled()) {
        getLogger().debug("getLinks URL " + url);
    }

    final String href = url.toString();

    // don't try to investigate url which has been crawled already
    if (crawled.contains(href)) {
        return null;
    }
    // mark it as crawled
    crawled.add(href);

    attributes.clear();
    addLinkAttribute(HREF_ATTR_NAME, href);
    addLinkAttribute(REFERRER_ATTR_NAME, referrer);

    URLConnection result = null;
    // links are only fetched for urls which are not excluded from crawling
    final boolean traversable = !isExcludedURL(href);
    try {
        URL target = url;
        if (traversable) {
            // getFile() is the path plus the query, if any; getPath() alone
            // would silently drop the query string of the original link
            target = new URL(url, url.getFile()
                    + ((url.getQuery() == null) ? "?" : "&")
                    + linkViewQuery);
        }

        URLConnection connection = target.openConnection();
        // request headers have to be set before the connection is used
        connection.setRequestProperty("Accept", accept);
        connection.setRequestProperty("User-Agent", userAgent);

        addLinkAttribute(CONTENT_ATTR_NAME, connection.getContentType());
        // only http connections have a response message and code
        if (connection instanceof HttpURLConnection) {
            HttpURLConnection http = (HttpURLConnection) connection;
            String message = http.getResponseMessage();
            int status = http.getResponseCode();
            addLinkAttribute(MESSAGE_ATTR_NAME, message);
            addLinkAttribute(STATUS_ATTR_NAME, String.valueOf(status));
        }

        // hand the connection out only after the request has succeeded
        if (traversable) {
            result = connection;
        }
    } catch (IOException ioe) {
        // Output url referrer status message
        String message = ioe.getMessage();
        addLinkAttribute(MESSAGE_ATTR_NAME, (message != null) ? message : ioe.toString());
    }

    // A qualified name is "prefix:localName"; the namespace URI must not be used here.
    final String linkQName = PREFIX + ':' + LINK_NODE_NAME;
    super.contentHandler.startElement(URI, LINK_NODE_NAME, linkQName, attributes);
    super.contentHandler.endElement(URI, LINK_NODE_NAME, linkQName);

    return result;
}

/**
 * Adds an attribute without namespace to the attributes of the element
 * being built; a <code>null</code> value is left out because a SAX
 * attribute has to have a value.
 *
 * @param name   the attribute name
 * @param value  the attribute value, may be <code>null</code>
 */
private void addLinkAttribute(String name, String value) {
    if (value != null) {
        attributes.addAttribute("", name, name, "CDATA", value);
    }
}
/**
 * Tells whether a URL is matched by one of the exclude patterns.
 *
 * @param  url  the URL to test, in string form
 * @return <code>true</code> if at least one exclude pattern matches;
 *         <code>false</code> if none matches or no exclude patterns exist
 */
private boolean isExcludedURL(String url) {
    if (excludeCrawlingURL != null) {
        final String candidate = url.toString();
        for (Iterator patterns = excludeCrawlingURL.iterator(); patterns.hasNext();) {
            RE expression = (RE) patterns.next();
            if (!expression.match(candidate)) {
                continue;
            }
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("exclude URL " + url);
            }
            return true;
        }

        if (getLogger().isDebugEnabled()) {
            getLogger().debug("exclude not URL " + url);
        }
        return false;
    }

    // without exclude patterns nothing is excluded
    if (getLogger().isDebugEnabled()) {
        getLogger().debug("exclude no URL " + url);
    }
    return false;
}
/**
 * Tells whether a URL may be queued for crawling according to the include
 * patterns.
 *
 * @param  url  the URL to test, in string form
 * @return <code>true</code> if no include patterns exist or at least one
 *         of them matches; <code>false</code> otherwise
 */
private boolean isIncludedURL(String url) {
    // without include patterns every URL is included
    if (includeCrawlingURL == null) {
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("include all URL " + url);
        }
        return true;
    }

    final String candidate = url.toString();
    Iterator patterns = includeCrawlingURL.iterator();
    boolean matched = false;
    while (!matched && patterns.hasNext()) {
        matched = ((RE) patterns.next()).match(candidate);
    }

    if (getLogger().isDebugEnabled()) {
        if (matched) {
            getLogger().debug("include URL " + url);
        } else {
            getLogger().debug("include not URL " + url);
        }
    }
    return matched;
}
/**
 * Releases the per-request state before the component is reused.
 * <p>
 * The attribute list and the two sets that describe the progress of a
 * crawl are dropped, so that a pooled instance does not keep the data of
 * a finished crawl alive. The exclude patterns are kept: they are built
 * by <code>configure()</code>, which this method cannot re-run, so
 * dropping them here would lose the configured patterns.
 * </p>
 */
public void recycle() {
    super.recycle();

    this.attributes = null;
    this.crawled = null;
    this.linksToProcess = null;
}
}
----------------------------------------------------------------------
In case of troubles, e-mail: webmaster@xml.apache.org
To unsubscribe, e-mail: cocoon-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: cocoon-cvs-help@xml.apache.org