You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@maven.apache.org by sl...@apache.org on 2020/01/02 21:42:06 UTC
[maven-wagon] 01/01: WIP Remove jsoup
This is an automated email from the ASF dual-hosted git repository.
slachiewicz pushed a commit to branch jsoup
in repository https://gitbox.apache.org/repos/asf/maven-wagon.git
commit 490362ba31dc5d8e119ffd447390327bf9d988a0
Author: Sylwester Lachiewicz <sl...@apache.org>
AuthorDate: Thu Jan 2 22:40:48 2020 +0100
WIP Remove jsoup
---
wagon-providers/wagon-http-shared/pom.xml | 5 --
.../wagon/shared/http/HtmlFileListParser.java | 78 +++++++++++++---------
wagon-providers/wagon-http/pom.xml | 5 --
3 files changed, 47 insertions(+), 41 deletions(-)
diff --git a/wagon-providers/wagon-http-shared/pom.xml b/wagon-providers/wagon-http-shared/pom.xml
index 4aef9a4..ac6923d 100644
--- a/wagon-providers/wagon-http-shared/pom.xml
+++ b/wagon-providers/wagon-http-shared/pom.xml
@@ -35,11 +35,6 @@ under the License.
<dependencies>
<dependency>
- <groupId>org.jsoup</groupId>
- <artifactId>jsoup</artifactId>
- <version>1.12.1</version>
- </dependency>
- <dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<exclusions>
diff --git a/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java b/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
index e27696a..28c797d 100644
--- a/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
+++ b/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
@@ -19,19 +19,20 @@ package org.apache.maven.wagon.shared.http;
* under the License.
*/
-import org.apache.commons.io.IOUtils;
import org.apache.maven.wagon.TransferFailedException;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
+import javax.swing.text.MutableAttributeSet;
+import javax.swing.text.html.HTML;
+import javax.swing.text.html.HTMLEditorKit;
+import javax.swing.text.html.parser.ParserDelegator;
import java.io.IOException;
import java.io.InputStream;
+import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
@@ -65,36 +66,56 @@ public class HtmlFileListParser
* @return the file list.
* @throws TransferFailedException if there was a problem fetching the raw html.
*/
- public static List<String> parseFileList( String baseurl, InputStream stream )
- throws TransferFailedException
+ public static List<String> parseFileList( String baseurl, InputStream stream ) throws TransferFailedException
{
try
{
- URI baseURI = new URI( baseurl );
- // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe
- // assumption.
- String content = IOUtils.toString( stream, "utf-8" );
- Document doc = Jsoup.parse( content, baseurl );
- Elements links = doc.select( "a[href]" );
- Set<String> results = new HashSet<String>();
- for ( Element link : links )
+ final Set<String> list = new HashSet<>();
+ final URI baseURI = new URI( baseurl );
+
+ ParserDelegator parserDelegator = new ParserDelegator();
+ HTMLEditorKit.ParserCallback parserCallback = new HTMLEditorKit.ParserCallback()
{
- /*
- * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink
- */
- String target = link.attr( "href" );
- if ( target != null )
+ public void handleText( final char[] data, final int pos )
+ {
+ }
+
+ public void handleStartTag( HTML.Tag tag, MutableAttributeSet attribute, int pos )
{
- String clean = cleanLink( baseURI, target );
- if ( isAcceptableLink( clean ) )
+ if ( tag == HTML.Tag.A )
{
- results.add( clean );
+ String address = (String) attribute.getAttribute( HTML.Attribute.HREF );
+ // The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink
+ if ( address != null )
+ {
+ String clean = cleanLink( baseURI, address );
+ if ( isAcceptableLink( clean ) )
+ {
+ list.add( clean );
+ }
+ }
}
}
- }
+ public void handleEndTag( HTML.Tag t, final int pos )
+ {
+ }
- return new ArrayList<String>( results );
+ public void handleSimpleTag( HTML.Tag t, MutableAttributeSet a, final int pos )
+ {
+ }
+
+ public void handleComment( final char[] data, final int pos )
+ {
+ }
+
+ public void handleError( final java.lang.String errMsg, final int pos )
+ {
+ }
+ };
+ parserDelegator.parse( new InputStreamReader( stream, StandardCharsets.UTF_8 ), parserCallback, false );
+
+ return new ArrayList<>( list );
}
catch ( URISyntaxException e )
{
@@ -131,11 +152,7 @@ public class HtmlFileListParser
ret = URLDecoder.decode( ret, "UTF-8" );
}
- catch ( URISyntaxException e )
- {
- // ignore
- }
- catch ( UnsupportedEncodingException e )
+ catch ( URISyntaxException | UnsupportedEncodingException e )
{
// ignore
}
@@ -160,5 +177,4 @@ public class HtmlFileListParser
return true;
}
-
}
\ No newline at end of file
diff --git a/wagon-providers/wagon-http/pom.xml b/wagon-providers/wagon-http/pom.xml
index c97e284..958fdab 100644
--- a/wagon-providers/wagon-http/pom.xml
+++ b/wagon-providers/wagon-http/pom.xml
@@ -124,7 +124,6 @@ under the License.
<include>org.apache.httpcomponents:httpcore</include>
<include>commons-codec:commons-codec</include>
<include>commons-io:commons-io</include>
- <include>org.jsoup:jsoup</include>
<include>org.apache.maven.wagon:wagon-http-shared</include>
</includes>
</artifactSet>
@@ -140,10 +139,6 @@ under the License.
<shadedPattern>org.apache.maven.wagon.providers.http.commons.io</shadedPattern>
</relocation>
<relocation>
- <pattern>org.jsoup</pattern>
- <shadedPattern>org.apache.maven.wagon.providers.http.org.jsoup</shadedPattern>
- </relocation>
- <relocation>
<pattern>org.apache.http</pattern>
<shadedPattern>org.apache.maven.wagon.providers.http.httpclient</shadedPattern>
</relocation>