You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@maven.apache.org by bi...@apache.org on 2011/07/05 19:26:12 UTC
svn commit: r1143148 - in /maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared: pom.xml src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
Author: bimargulies
Date: Tue Jul 5 17:26:11 2011
New Revision: 1143148
URL: http://svn.apache.org/viewvc?rev=1143148&view=rev
Log:
[WAGON-338] Exception from cyberneko from jenkins repo listing
backport of change from trunk.
Modified:
maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/pom.xml
maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
Modified: maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/pom.xml
URL: http://svn.apache.org/viewvc/maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/pom.xml?rev=1143148&r1=1143147&r2=1143148&view=diff
==============================================================================
--- maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/pom.xml (original)
+++ maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/pom.xml Tue Jul 5 17:26:11 2011
@@ -36,22 +36,6 @@ under the License.
<dependencies>
<dependency>
- <groupId>nekohtml</groupId>
- <artifactId>xercesMinimal</artifactId>
- <version>1.9.6.2</version>
- </dependency>
- <dependency>
- <groupId>nekohtml</groupId>
- <artifactId>nekohtml</artifactId>
- <version>1.9.6.2</version>
- <exclusions>
- <exclusion>
- <groupId>xerces</groupId>
- <artifactId>xercesImpl</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
<groupId>commons-httpclient</groupId>
<artifactId>commons-httpclient</artifactId>
<version>3.1</version>
@@ -75,5 +59,15 @@ under the License.
<artifactId>commons-logging</artifactId>
<version>1.1.1</version>
</dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>2.0.1</version>
+ </dependency>
+ <dependency>
+ <groupId>org.jsoup</groupId>
+ <artifactId>jsoup</artifactId>
+ <version>1.6.1</version>
+ </dependency>
</dependencies>
</project>
Modified: maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
URL: http://svn.apache.org/viewvc/maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java?rev=1143148&r1=1143147&r2=1143148&view=diff
==============================================================================
--- maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java (original)
+++ maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java Tue Jul 5 17:26:11 2011
@@ -31,21 +31,34 @@ import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
+import org.apache.commons.io.IOUtils;
import org.apache.maven.wagon.TransferFailedException;
-import org.apache.xerces.xni.Augmentations;
-import org.apache.xerces.xni.QName;
-import org.apache.xerces.xni.XMLAttributes;
-import org.apache.xerces.xni.parser.XMLInputSource;
-import org.apache.xerces.xni.parser.XMLParserConfiguration;
import org.codehaus.plexus.util.StringUtils;
-import org.cyberneko.html.HTMLConfiguration;
-import org.cyberneko.html.filters.DefaultFilter;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
/**
* Html File List Parser.
*/
public class HtmlFileListParser
{
+ // Apache Fancy Index Sort Headers
+ private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
+
+ // URLs with excessive paths.
+ private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
+
+ // URLs that to a parent directory.
+ private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
+
+ // mailto urls
+ private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
+
+ private static final Pattern[] SKIPS = new Pattern[] { APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT,
+ MAILTO_URLS };
+
/**
* Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
*
@@ -58,126 +71,96 @@ public class HtmlFileListParser
{
try
{
- // Use URI object to get benefits of proper absolute and relative path resolution for free
URI baseURI = new URI( baseurl );
+ // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe
+ // assumption.
+ String content = IOUtils.toString( stream, "utf-8" );
+ Document doc = Jsoup.parse( content, baseurl );
+ Elements links = doc.getElementsByTag( "a" );
+ Set results = new HashSet();
+ for ( int lx = 0; lx < links.size(); lx++ )
+ {
+ Element link = links.get( lx );
+ /*
+ * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink
+ */
+ String target = link.attr( "href" );
+ if ( target != null)
+ {
+ String clean = cleanLink( baseURI, target );
+ if ( isAcceptableLink( clean ))
+ {
+ results.add( clean );
+ }
+ }
- Parser handler = new Parser( baseURI );
-
- XMLParserConfiguration parser = new HTMLConfiguration();
- parser.setDocumentHandler( handler );
- parser.setFeature( "http://cyberneko.org/html/features/augmentations", true );
- parser.setProperty( "http://cyberneko.org/html/properties/names/elems", "upper" );
- parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "upper" );
- parser.parse( new XMLInputSource( null, baseurl, baseURI.toString(), stream, "UTF-8" ) );
-
- return new ArrayList( handler.getLinks() );
+ }
+ ArrayList resultsAsList = new ArrayList();
+ resultsAsList.addAll( results );
+ return resultsAsList;
}
catch ( URISyntaxException e )
{
- throw new TransferFailedException( "Unable to parse as URI: " + baseurl );
+ throw new TransferFailedException( "Unable to parse as base URI: " + baseurl );
}
catch ( IOException e )
{
- throw new TransferFailedException( "I/O error: " + e.getMessage(), e );
+ throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e );
}
}
- private static class Parser
- extends DefaultFilter
+ private static String cleanLink( URI baseURI, String link )
{
- // Apache Fancy Index Sort Headers
- private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
-
- // URLs with excessive paths.
- private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
-
- // URLs that to a parent directory.
- private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
-
- // mailto urls
- private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
-
- private static final Pattern[] SKIPS =
- new Pattern[] { APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
-
- private Set links = new HashSet();
-
- private URI baseURI;
-
- public Parser( URI baseURI )
+ if ( StringUtils.isEmpty( link ) )
{
- this.baseURI = baseURI.normalize();
+ return "";
}
- public Set getLinks()
- {
- return links;
- }
+ String ret = link;
- public void startElement( QName element, XMLAttributes attrs, Augmentations augs )
+ try
{
- if ( "A".equals( element.rawname ) )
+ URI linkuri = new URI( ret );
+ if ( link.startsWith( "/" ))
{
- String href = attrs.getValue( "HREF" );
- if ( href != null )
- {
- String link = cleanLink( baseURI, href );
- if ( isAcceptableLink( link ) )
- {
- links.add( link );
- }
- }
+ linkuri = baseURI.resolve( linkuri );
}
- }
-
- private static String cleanLink( URI baseURI, String link )
- {
- if ( StringUtils.isEmpty( link ) )
+ URI relativeURI = baseURI.relativize( linkuri ).normalize();
+ ret = relativeURI.toASCIIString();
+ if ( ret.startsWith( baseURI.getPath() ) )
{
- return "";
+ ret = ret.substring( baseURI.getPath().length() );
}
- String ret = link;
-
- try
- {
- URI linkuri = new URI( ret );
- URI relativeURI = baseURI.relativize( linkuri ).normalize();
- ret = relativeURI.toASCIIString();
- if ( ret.startsWith( baseURI.getPath() ) )
- {
- ret = ret.substring( baseURI.getPath().length() );
- }
+ ret = URLDecoder.decode( ret, "UTF-8" );
+ }
+ catch ( URISyntaxException e )
+ {
+ }
+ catch ( UnsupportedEncodingException e )
+ {
+ }
- ret = URLDecoder.decode( ret, "UTF-8" );
- }
- catch ( URISyntaxException e )
- {
- }
- catch ( UnsupportedEncodingException e )
- {
- }
+ return ret;
+ }
- return ret;
+ private static boolean isAcceptableLink( String link )
+ {
+ if ( StringUtils.isEmpty( link ) )
+ {
+ return false;
}
- private static boolean isAcceptableLink( String link )
+ for ( int i = 0; i < SKIPS.length; i++ )
{
- if ( StringUtils.isEmpty( link ) )
+ if ( SKIPS[i].matcher( link ).find() )
{
return false;
}
-
- for ( int i = 0; i < SKIPS.length; i++ )
- {
- if ( SKIPS[i].matcher( link ).find() )
- {
- return false;
- }
- }
-
- return true;
}
+
+ return true;
}
+
}