You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@maven.apache.org by bi...@apache.org on 2011/07/05 19:26:12 UTC

svn commit: r1143148 - in /maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared: pom.xml src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java

Author: bimargulies
Date: Tue Jul  5 17:26:11 2011
New Revision: 1143148

URL: http://svn.apache.org/viewvc?rev=1143148&view=rev
Log:
[WAGON-338] Exception from cyberneko from jenkins repo listing

backport of change from trunk.

Modified:
    maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/pom.xml
    maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java

Modified: maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/pom.xml
URL: http://svn.apache.org/viewvc/maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/pom.xml?rev=1143148&r1=1143147&r2=1143148&view=diff
==============================================================================
--- maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/pom.xml (original)
+++ maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/pom.xml Tue Jul  5 17:26:11 2011
@@ -36,22 +36,6 @@ under the License.
 
   <dependencies>
     <dependency>
-      <groupId>nekohtml</groupId>
-      <artifactId>xercesMinimal</artifactId>
-      <version>1.9.6.2</version>
-    </dependency>
-    <dependency>
-      <groupId>nekohtml</groupId>
-      <artifactId>nekohtml</artifactId>
-      <version>1.9.6.2</version>
-      <exclusions>
-        <exclusion>
-          <groupId>xerces</groupId>
-          <artifactId>xercesImpl</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
-    <dependency>
       <groupId>commons-httpclient</groupId>
       <artifactId>commons-httpclient</artifactId>
       <version>3.1</version>
@@ -75,5 +59,15 @@ under the License.
       <artifactId>commons-logging</artifactId>
       <version>1.1.1</version>
     </dependency>
+    <dependency>
+    	<groupId>commons-io</groupId>
+    	<artifactId>commons-io</artifactId>
+    	<version>2.0.1</version>
+    </dependency>
+    <dependency>
+    	<groupId>org.jsoup</groupId>
+    	<artifactId>jsoup</artifactId>
+    	<version>1.6.1</version>
+    </dependency>
   </dependencies>
 </project>

Modified: maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
URL: http://svn.apache.org/viewvc/maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java?rev=1143148&r1=1143147&r2=1143148&view=diff
==============================================================================
--- maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java (original)
+++ maven/wagon/branches/wagon-1.x/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java Tue Jul  5 17:26:11 2011
@@ -31,21 +31,34 @@ import java.util.List;
 import java.util.Set;
 import java.util.regex.Pattern;
 
+import org.apache.commons.io.IOUtils;
 import org.apache.maven.wagon.TransferFailedException;
-import org.apache.xerces.xni.Augmentations;
-import org.apache.xerces.xni.QName;
-import org.apache.xerces.xni.XMLAttributes;
-import org.apache.xerces.xni.parser.XMLInputSource;
-import org.apache.xerces.xni.parser.XMLParserConfiguration;
 import org.codehaus.plexus.util.StringUtils;
-import org.cyberneko.html.HTMLConfiguration;
-import org.cyberneko.html.filters.DefaultFilter;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
 
 /**
  * Html File List Parser.
  */
 public class HtmlFileListParser
 {
+    // Apache Fancy Index Sort Headers
+    private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
+
+    // URLs with excessive paths.
+    private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
+
+    // URLs that point to a parent directory.
+    private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
+
+    // mailto urls
+    private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
+
+    private static final Pattern[] SKIPS = new Pattern[] { APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT,
+        MAILTO_URLS };
+
     /**
      * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
      * 
@@ -58,126 +71,96 @@ public class HtmlFileListParser
     {
         try
         {
-            // Use URI object to get benefits of proper absolute and relative path resolution for free
             URI baseURI = new URI( baseurl );
+            // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe
+            // assumption.
+            String content = IOUtils.toString( stream, "utf-8" );
+            Document doc = Jsoup.parse( content, baseurl );
+            Elements links = doc.getElementsByTag( "a" );
+            Set results = new HashSet();
+            for ( int lx = 0; lx < links.size(); lx++ )
+            {
+                Element link = links.get( lx );
+                /*
+                 * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink
+                 */
+                String target = link.attr( "href" );
+                if ( target != null)
+                {
+                    String clean = cleanLink( baseURI, target );
+                    if ( isAcceptableLink( clean )) 
+                    {
+                        results.add( clean );
+                    }
+                }
 
-            Parser handler = new Parser( baseURI );
-
-            XMLParserConfiguration parser = new HTMLConfiguration();
-            parser.setDocumentHandler( handler );
-            parser.setFeature( "http://cyberneko.org/html/features/augmentations", true );
-            parser.setProperty( "http://cyberneko.org/html/properties/names/elems", "upper" );
-            parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "upper" );
-            parser.parse( new XMLInputSource( null, baseurl, baseURI.toString(), stream, "UTF-8" ) );
-
-            return new ArrayList( handler.getLinks() );
+            }
 
+            ArrayList resultsAsList = new ArrayList();
+            resultsAsList.addAll( results );
+            return resultsAsList;
         }
         catch ( URISyntaxException e )
         {
-            throw new TransferFailedException( "Unable to parse as URI: " + baseurl );
+            throw new TransferFailedException( "Unable to parse as base URI: " + baseurl );
         }
         catch ( IOException e )
         {
-            throw new TransferFailedException( "I/O error: " + e.getMessage(), e );
+            throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e );
         }
     }
 
-    private static class Parser
-        extends DefaultFilter
+    private static String cleanLink( URI baseURI, String link )
     {
-        // Apache Fancy Index Sort Headers
-        private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
-
-        // URLs with excessive paths.
-        private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
-
-        // URLs that to a parent directory.
-        private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
-
-        // mailto urls
-        private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
-
-        private static final Pattern[] SKIPS =
-            new Pattern[] { APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
-        
-        private Set links = new HashSet();
-
-        private URI baseURI;
-
-        public Parser( URI baseURI )
+        if ( StringUtils.isEmpty( link ) )
         {
-            this.baseURI = baseURI.normalize();
+            return "";
         }
 
-        public Set getLinks()
-        {
-            return links;
-        }
+        String ret = link;
 
-        public void startElement( QName element, XMLAttributes attrs, Augmentations augs )
+        try
         {
-            if ( "A".equals( element.rawname ) )
+            URI linkuri = new URI( ret );
+            if ( link.startsWith( "/" )) 
             {
-                String href = attrs.getValue( "HREF" );
-                if ( href != null )
-                {
-                    String link = cleanLink( baseURI, href );
-                    if ( isAcceptableLink( link ) )
-                    {
-                        links.add( link );
-                    }
-                }
+                linkuri =  baseURI.resolve( linkuri );
             }
-        }
-
-        private static String cleanLink( URI baseURI, String link )
-        {
-            if ( StringUtils.isEmpty( link ) )
+            URI relativeURI = baseURI.relativize( linkuri ).normalize();
+            ret = relativeURI.toASCIIString();
+            if ( ret.startsWith( baseURI.getPath() ) )
             {
-                return "";
+                ret = ret.substring( baseURI.getPath().length() );
             }
 
-            String ret = link;
-
-            try
-            {
-                URI linkuri = new URI( ret );
-                URI relativeURI = baseURI.relativize( linkuri ).normalize();
-                ret = relativeURI.toASCIIString();
-                if ( ret.startsWith( baseURI.getPath() ) )
-                {
-                    ret = ret.substring( baseURI.getPath().length() );
-                }
+            ret = URLDecoder.decode( ret, "UTF-8" );
+        }
+        catch ( URISyntaxException e )
+        {
+        }
+        catch ( UnsupportedEncodingException e )
+        {
+        }
 
-                ret = URLDecoder.decode( ret, "UTF-8" );
-            }
-            catch ( URISyntaxException e )
-            {
-            }
-            catch ( UnsupportedEncodingException e )
-            {
-            }
+        return ret;
+    }
 
-            return ret;
+    private static boolean isAcceptableLink( String link )
+    {
+        if ( StringUtils.isEmpty( link ) )
+        {
+            return false;
         }
 
-        private static boolean isAcceptableLink( String link )
+        for ( int i = 0; i < SKIPS.length; i++ )
         {
-            if ( StringUtils.isEmpty( link ) )
+            if ( SKIPS[i].matcher( link ).find() )
             {
                 return false;
             }
-
-            for ( int i = 0; i < SKIPS.length; i++ )
-            {
-                if ( SKIPS[i].matcher( link ).find() )
-                {
-                    return false;
-                }
-            }
-
-            return true;
         }
+
+        return true;
     }
+
 }