You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@maven.apache.org by sl...@apache.org on 2020/01/02 21:42:06 UTC

[maven-wagon] 01/01: WIP Remove jsoup

This is an automated email from the ASF dual-hosted git repository.

slachiewicz pushed a commit to branch jsoup
in repository https://gitbox.apache.org/repos/asf/maven-wagon.git

commit 490362ba31dc5d8e119ffd447390327bf9d988a0
Author: Sylwester Lachiewicz <sl...@apache.org>
AuthorDate: Thu Jan 2 22:40:48 2020 +0100

    WIP Remove jsoup
---
 wagon-providers/wagon-http-shared/pom.xml          |  5 --
 .../wagon/shared/http/HtmlFileListParser.java      | 78 +++++++++++++---------
 wagon-providers/wagon-http/pom.xml                 |  5 --
 3 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/wagon-providers/wagon-http-shared/pom.xml b/wagon-providers/wagon-http-shared/pom.xml
index 4aef9a4..ac6923d 100644
--- a/wagon-providers/wagon-http-shared/pom.xml
+++ b/wagon-providers/wagon-http-shared/pom.xml
@@ -35,11 +35,6 @@ under the License.
 
   <dependencies>
     <dependency>
-      <groupId>org.jsoup</groupId>
-      <artifactId>jsoup</artifactId>
-      <version>1.12.1</version>
-    </dependency>
-    <dependency>
       <groupId>org.apache.httpcomponents</groupId>
       <artifactId>httpclient</artifactId>
       <exclusions>
diff --git a/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java b/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
index e27696a..28c797d 100644
--- a/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
+++ b/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
@@ -19,19 +19,20 @@ package org.apache.maven.wagon.shared.http;
  * under the License.
  */
 
-import org.apache.commons.io.IOUtils;
 import org.apache.maven.wagon.TransferFailedException;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
 
+import javax.swing.text.MutableAttributeSet;
+import javax.swing.text.html.HTML;
+import javax.swing.text.html.HTMLEditorKit;
+import javax.swing.text.html.parser.ParserDelegator;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.io.UnsupportedEncodingException;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.net.URLDecoder;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
@@ -65,36 +66,56 @@ public class HtmlFileListParser
      * @return the file list.
      * @throws TransferFailedException if there was a problem fetching the raw html.
      */
-    public static List<String> parseFileList( String baseurl, InputStream stream )
-        throws TransferFailedException
+    public static List<String> parseFileList( String baseurl, InputStream stream ) throws TransferFailedException
     {
         try
         {
-            URI baseURI = new URI( baseurl );
-            // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe
-            // assumption.
-            String content = IOUtils.toString( stream, "utf-8" );
-            Document doc = Jsoup.parse( content, baseurl );
-            Elements links = doc.select( "a[href]" );
-            Set<String> results = new HashSet<String>();
-            for ( Element link : links )
+            final Set<String> list = new HashSet<>();
+            final URI baseURI = new URI( baseurl );
+
+            ParserDelegator parserDelegator = new ParserDelegator();
+            HTMLEditorKit.ParserCallback parserCallback = new HTMLEditorKit.ParserCallback()
             {
-                /*
-                 * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink
-                 */
-                String target = link.attr( "href" );
-                if ( target != null )
+                public void handleText( final char[] data, final int pos )
+                {
+                }
+
+                public void handleStartTag( HTML.Tag tag, MutableAttributeSet attribute, int pos )
                 {
-                    String clean = cleanLink( baseURI, target );
-                    if ( isAcceptableLink( clean ) )
+                    if ( tag == HTML.Tag.A )
                     {
-                        results.add( clean );
+                        String address = (String) attribute.getAttribute( HTML.Attribute.HREF );
+                         // The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink
+                        if ( address != null )
+                        {
+                            String clean = cleanLink( baseURI, address );
+                            if ( isAcceptableLink( clean ) )
+                            {
+                                list.add( clean );
+                            }
+                        }
                     }
                 }
 
-            }
+                public void handleEndTag( HTML.Tag t, final int pos )
+                {
+                }
 
-            return new ArrayList<String>( results );
+                public void handleSimpleTag( HTML.Tag t, MutableAttributeSet a, final int pos )
+                {
+                }
+
+                public void handleComment( final char[] data, final int pos )
+                {
+                }
+
+                public void handleError( final java.lang.String errMsg, final int pos )
+                {
+                }
+            };
+            parserDelegator.parse( new InputStreamReader( stream, StandardCharsets.UTF_8 ), parserCallback, false );
+
+            return new ArrayList<>( list );
         }
         catch ( URISyntaxException e )
         {
@@ -131,11 +152,7 @@ public class HtmlFileListParser
 
             ret = URLDecoder.decode( ret, "UTF-8" );
         }
-        catch ( URISyntaxException e )
-        {
-            // ignore
-        }
-        catch ( UnsupportedEncodingException e )
+        catch ( URISyntaxException | UnsupportedEncodingException e )
         {
             // ignore
         }
@@ -160,5 +177,4 @@ public class HtmlFileListParser
 
         return true;
     }
-
 }
\ No newline at end of file
diff --git a/wagon-providers/wagon-http/pom.xml b/wagon-providers/wagon-http/pom.xml
index c97e284..958fdab 100644
--- a/wagon-providers/wagon-http/pom.xml
+++ b/wagon-providers/wagon-http/pom.xml
@@ -124,7 +124,6 @@ under the License.
                   <include>org.apache.httpcomponents:httpcore</include>
                   <include>commons-codec:commons-codec</include>
                   <include>commons-io:commons-io</include>
-                  <include>org.jsoup:jsoup</include>
                   <include>org.apache.maven.wagon:wagon-http-shared</include>
                 </includes>
               </artifactSet>
@@ -140,10 +139,6 @@ under the License.
                   <shadedPattern>org.apache.maven.wagon.providers.http.commons.io</shadedPattern>
                 </relocation>
                 <relocation>
-                  <pattern>org.jsoup</pattern>
-                  <shadedPattern>org.apache.maven.wagon.providers.http.org.jsoup</shadedPattern>
-                </relocation>
-                <relocation>
                   <pattern>org.apache.http</pattern>
                   <shadedPattern>org.apache.maven.wagon.providers.http.httpclient</shadedPattern>
                 </relocation>