You are viewing a plain text version of this content. The canonical link for it is here.
Posted to wagon-commits@maven.apache.org by br...@apache.org on 2008/06/01 18:26:40 UTC

svn commit: r662242 - in /maven/wagon/trunk/wagon-providers/wagon-http-shared: pom.xml src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java

Author: brett
Date: Sun Jun  1 09:26:39 2008
New Revision: 662242

URL: http://svn.apache.org/viewvc?rev=662242&view=rev
Log:
Use a minimal number of classes from Xerces (depend on xercesMinimal and exclude xercesImpl).

Modified:
    maven/wagon/trunk/wagon-providers/wagon-http-shared/pom.xml
    maven/wagon/trunk/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java

Modified: maven/wagon/trunk/wagon-providers/wagon-http-shared/pom.xml
URL: http://svn.apache.org/viewvc/maven/wagon/trunk/wagon-providers/wagon-http-shared/pom.xml?rev=662242&r1=662241&r2=662242&view=diff
==============================================================================
--- maven/wagon/trunk/wagon-providers/wagon-http-shared/pom.xml (original)
+++ maven/wagon/trunk/wagon-providers/wagon-http-shared/pom.xml Sun Jun  1 09:26:39 2008
@@ -38,8 +38,19 @@
   <dependencies>
     <dependency>
       <groupId>nekohtml</groupId>
+      <artifactId>xercesMinimal</artifactId>
+      <version>1.9.6.2</version>
+    </dependency>
+    <dependency>
+      <groupId>nekohtml</groupId>
       <artifactId>nekohtml</artifactId>
-      <version>1.9.6.1</version>
+      <version>1.9.6.2</version>
+      <exclusions>
+        <exclusion>
+          <groupId>xerces</groupId>
+          <artifactId>xercesImpl</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
     <dependency>
       <groupId>commons-httpclient</groupId>

Modified: maven/wagon/trunk/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
URL: http://svn.apache.org/viewvc/maven/wagon/trunk/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java?rev=662242&r1=662241&r2=662242&view=diff
==============================================================================
--- maven/wagon/trunk/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java (original)
+++ maven/wagon/trunk/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java Sun Jun  1 09:26:39 2008
@@ -33,38 +33,20 @@
 import java.util.regex.Pattern;
 
 import org.apache.maven.wagon.TransferFailedException;
+import org.apache.xerces.xni.Augmentations;
+import org.apache.xerces.xni.QName;
+import org.apache.xerces.xni.XMLAttributes;
+import org.apache.xerces.xni.parser.XMLInputSource;
+import org.apache.xerces.xni.parser.XMLParserConfiguration;
 import org.codehaus.plexus.util.StringUtils;
-import org.cyberneko.html.parsers.DOMParser;
-import org.w3c.dom.Element;
-import org.w3c.dom.NamedNodeMap;
-import org.w3c.dom.Node;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-import org.xml.sax.SAXNotRecognizedException;
-import org.xml.sax.SAXNotSupportedException;
+import org.cyberneko.html.HTMLConfiguration;
+import org.cyberneko.html.filters.DefaultFilter;
 
 /**
  * Html File List Parser.
  */
 public class HtmlFileListParser
 {
-    private static final Set/*<Pattern>*/skips = new HashSet/*<Pattern>*/();
-
-    static
-    {
-        // Apache Fancy Index Sort Headers
-        skips.add( Pattern.compile( "\\?[CDMNS]=.*" ) );
-
-        // URLs with excessive paths.
-        skips.add( Pattern.compile( "/[^/]*/" ) );
-
-        // URLs that to a parent directory.
-        skips.add( Pattern.compile( "\\.\\./" ) );
-
-        // mailto urls
-        skips.add( Pattern.compile( "mailto:.*" ) );
-    }
-
     /**
      * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
      * 
@@ -72,7 +54,7 @@
      * @return the file list.
      * @throws TransferFailedException if there was a problem fetching the raw html.
      */
-    public static List/*<String>*/parseFileList( String baseurl, InputStream stream )
+    public static List/* <String> */parseFileList( String baseurl, InputStream stream )
         throws TransferFailedException
     {
         try
@@ -80,55 +62,68 @@
             // Use URI object to get benefits of proper absolute and relative path resolution for free
             URI baseURI = new URI( baseurl );
 
-            DOMParser parser = new DOMParser();
+            Parser handler = new Parser( baseURI );
+
+            XMLParserConfiguration parser = new HTMLConfiguration();
+            parser.setDocumentHandler( handler );
             parser.setFeature( "http://cyberneko.org/html/features/augmentations", true );
             parser.setProperty( "http://cyberneko.org/html/properties/names/elems", "upper" );
             parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "upper" );
-            parser.parse( new InputSource( stream ) );
-
-            Set/*<String>*/links = new HashSet/*<String>*/();
-
-            recursiveLinkCollector( parser.getDocument(), baseURI, links );
+            parser.parse( new XMLInputSource( null, baseurl, baseURI.toString(), stream, "UTF-8" ) );
 
-            return new ArrayList( links );
+            return new ArrayList( handler.getLinks() );
 
         }
         catch ( URISyntaxException e )
         {
             throw new TransferFailedException( "Unable to parse as URI: " + baseurl );
         }
-        catch ( SAXNotRecognizedException e )
-        {
-            throw new TransferFailedException( "Unable to setup XML/SAX: " + e.getMessage(), e );
-        }
-        catch ( SAXNotSupportedException e )
-        {
-            throw new TransferFailedException( "XML/SAX not supported?: " + e.getMessage(), e );
-        }
-        catch ( SAXException e )
-        {
-            throw new TransferFailedException( "XML/SAX error: " + e.getMessage(), e );
-        }
         catch ( IOException e )
         {
             throw new TransferFailedException( "I/O error: " + e.getMessage(), e );
         }
     }
 
-    private static void recursiveLinkCollector( Node node, URI baseURI, Set/*<String>*/links )
+    private static class Parser
+        extends DefaultFilter
     {
-        if ( node.getNodeType() == Node.ELEMENT_NODE )
+        // Apache Fancy Index Sort Headers
+        private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
+
+        // URLs with excessive paths.
+        private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
+
+        // URLs that point to a parent directory.
+        private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
+
+        // mailto urls
+        private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
+
+        private static final Pattern[] SKIPS =
+            new Pattern[] { APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
+        
+        private Set links = new HashSet();
+
+        private URI baseURI;
+
+        public Parser( URI baseURI )
+        {
+            this.baseURI = baseURI;
+        }
+
+        public Set getLinks()
         {
-            //            System.out.println("Element <" + node.getNodeName() + dumpAttributes((Element) node) + ">");
-            if ( "A".equals( node.getNodeName() ) )
+            return links;
+        }
+
+        public void startElement( QName element, XMLAttributes attrs, Augmentations augs )
+        {
+            if ( "A".equals( element.rawname ) )
             {
-                Element anchor = (Element) node;
-                NamedNodeMap nodemap = anchor.getAttributes();
-                Node href = nodemap.getNamedItem( "HREF" );
+                String href = attrs.getValue( "HREF" );
                 if ( href != null )
                 {
-                    String link = cleanLink( baseURI, href.getNodeValue() );
-                    //                    System.out.println("HREF (" + href.getNodeValue() + " => " + link + ")");
+                    String link = cleanLink( baseURI, href );
                     if ( isAcceptableLink( link ) )
                     {
                         links.add( link );
@@ -137,76 +132,53 @@
             }
         }
 
-        Node child = node.getFirstChild();
-        while ( child != null )
+        private static String cleanLink( URI baseURI, String link )
         {
-            recursiveLinkCollector( child, baseURI, links );
-            child = child.getNextSibling();
-        }
-    }
-
-    //    private String dumpAttributes(Element elem) {
-    //        StringBuffer buf = new StringBuffer();
-    //        NamedNodeMap nodemap = elem.getAttributes();
-    //        int len = nodemap.getLength();
-    //        for (int i = 0; i < len; i++) {
-    //            Node att = nodemap.item(i);
-    //            buf.append(" ");
-    //            buf.append(att.getNodeName()).append("=\"");
-    //            buf.append(att.getNodeValue()).append("\"");
-    //        }
-    //        return buf.toString();
-    //    }
+            if ( StringUtils.isEmpty( link ) )
+            {
+                return "";
+            }
 
-    private static String cleanLink( URI baseURI, String link )
-    {
-        if ( StringUtils.isEmpty( link ) )
-        {
-            return "";
-        }
+            String ret = link;
 
-        String ret = link;
+            try
+            {
+                URI linkuri = new URI( ret );
+                URI relativeURI = baseURI.relativize( linkuri ).normalize();
+                ret = relativeURI.toASCIIString();
+                if ( ret.startsWith( baseURI.getPath() ) )
+                {
+                    ret = ret.substring( baseURI.getPath().length() );
+                }
 
-        try
-        {
-            URI linkuri = new URI( ret );
-            URI relativeURI = baseURI.relativize( linkuri ).normalize();
-            ret = relativeURI.toASCIIString();
-            if ( ret.startsWith( baseURI.getPath() ) )
+                ret = URLDecoder.decode( ret, "UTF-8" );
+            }
+            catch ( URISyntaxException e )
+            {
+            }
+            catch ( UnsupportedEncodingException e )
             {
-                ret = ret.substring( baseURI.getPath().length() );
             }
-            
-            ret = URLDecoder.decode( ret, "UTF-8" );
-        }
-        catch ( URISyntaxException e )
-        {
-        }
-        catch ( UnsupportedEncodingException e )
-        {
-        }
-
-        return ret;
-    }
-    
-    
 
-    private static boolean isAcceptableLink( String link )
-    {
-        if ( StringUtils.isEmpty( link ) )
-        {
-            return false;
+            return ret;
         }
 
-        for ( Iterator it = skips.iterator(); it.hasNext(); )
+        private static boolean isAcceptableLink( String link )
         {
-            Pattern skipPat = (Pattern) it.next();
-            if ( skipPat.matcher( link ).find() )
+            if ( StringUtils.isEmpty( link ) )
             {
                 return false;
             }
-        }
 
-        return true;
+            for ( int i = 0; i < SKIPS.length; i++ )
+            {
+                if ( SKIPS[i].matcher( link ).find() )
+                {
+                    return false;
+                }
+            }
+
+            return true;
+        }
     }
 }



---------------------------------------------------------------------
To unsubscribe, e-mail: wagon-commits-unsubscribe@maven.apache.org
For additional commands, e-mail: wagon-commits-help@maven.apache.org