You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by ha...@apache.org on 2018/04/04 21:42:24 UTC
any23 git commit: ANY23-341 Remove dependency on defunct commons-httpclient
Repository: any23
Updated Branches:
refs/heads/master f9abbec20 -> db25f0213
ANY23-341 Remove dependency on defunct commons-httpclient
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/db25f021
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/db25f021
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/db25f021
Branch: refs/heads/master
Commit: db25f0213714f0d6c0377818c00a3aeb58436d56
Parents: f9abbec
Author: Hans <fi...@gmail.com>
Authored: Tue Apr 3 13:33:20 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Wed Apr 4 16:37:16 2018 -0500
----------------------------------------------------------------------
cli/pom.xml | 4 -
core/pom.xml | 4 -
.../any23/extractor/html/HTMLDocument.java | 30 ++++--
.../apache/any23/http/DefaultHTTPClient.java | 106 +++++++++++--------
.../java/org/apache/any23/http/HTTPClient.java | 12 +--
.../apache/any23/source/HTTPDocumentSource.java | 11 +-
.../java/org/apache/any23/util/LogUtils.java | 2 -
pom.xml | 5 -
.../java/org/apache/any23/servlet/Servlet.java | 4 +-
9 files changed, 95 insertions(+), 83 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/cli/pom.xml
----------------------------------------------------------------------
diff --git a/cli/pom.xml b/cli/pom.xml
index 07b7e6b..321b150 100644
--- a/cli/pom.xml
+++ b/cli/pom.xml
@@ -94,10 +94,6 @@
<artifactId>commons-lang</artifactId>
</dependency>
<dependency>
- <groupId>commons-httpclient</groupId>
- <artifactId>commons-httpclient</artifactId>
- </dependency>
- <dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</dependency>
http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/core/pom.xml
----------------------------------------------------------------------
diff --git a/core/pom.xml b/core/pom.xml
index 6fd2550..58a37ee 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -63,10 +63,6 @@
<artifactId>commons-lang</artifactId>
</dependency>
<dependency>
- <groupId>commons-httpclient</groupId>
- <artifactId>commons-httpclient</artifactId>
- </dependency>
- <dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</dependency>
http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/core/src/main/java/org/apache/any23/extractor/html/HTMLDocument.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/HTMLDocument.java b/core/src/main/java/org/apache/any23/extractor/html/HTMLDocument.java
index bb958c7..188e0f1 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/HTMLDocument.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/HTMLDocument.java
@@ -24,6 +24,7 @@ import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
@@ -375,15 +376,32 @@ public class HTMLDocument {
private java.net.URI getBaseIRI() throws ExtractionException {
if (baseIRI == null) {
+ // document.getBaseURI() returns null for document URIs with
+ // special characters, e.g., http://semanticweb.org/wiki/Knud_Möller
+ // It also does *not* take html "base" elements into account.
+ // (But it does take into account urls specified by the attribute "xml:base".)
+
+ // So, for now, let's use getDocumentURI() instead.
+ // TODO: Make this approach better.
+
+ Document doc = document instanceof Document ? (Document)document : document.getOwnerDocument();
+
+ if (doc == null) {
+ throw new ExtractionException("Node " + document.getNodeName() + " was not associated with a document.");
+ }
+
+ String uri = doc.getDocumentURI();
+
+ if (uri == null) {
+ throw new ExtractionException("document URI is null, this should not happen");
+ }
+
try {
- if (document.getBaseURI() == null) {
- log.warn("document.getBaseURI() is null, this should not happen");
- }
- baseIRI = new java.net.URI(RDFUtils.fixAbsoluteIRI(document.getBaseURI()));
+ baseIRI = new java.net.URI(RDFUtils.fixAbsoluteIRI(uri));
} catch (IllegalArgumentException ex) {
- throw new ExtractionException("Error in base IRI: " + document.getBaseURI(), ex);
+ throw new ExtractionException("Error in base IRI: " + uri, ex);
} catch (URISyntaxException ex) {
- throw new ExtractionException("Error in base IRI: " + document.getBaseURI(), ex);
+ throw new ExtractionException("Error in base IRI: " + uri, ex);
}
}
return baseIRI;
http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java b/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
index d520441..2615585 100644
--- a/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
+++ b/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
@@ -17,16 +17,24 @@
package org.apache.any23.http;
-import org.apache.commons.httpclient.*;
-import org.apache.commons.httpclient.methods.GetMethod;
-import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
+import org.apache.commons.io.IOUtils;
+import org.apache.http.Header;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.protocol.HttpClientContext;
+import org.apache.http.config.SocketConfig;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
+import org.apache.http.message.BasicHeader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.net.URI;
import java.util.ArrayList;
import java.util.List;
-import java.util.regex.Pattern;
/**
* Opens an {@link InputStream} on an HTTP IRI. Is configured
@@ -37,9 +45,7 @@ import java.util.regex.Pattern;
*/
public class DefaultHTTPClient implements HTTPClient {
- private static final Pattern ESCAPED_PATTERN = Pattern.compile("%[0-9a-f]{2}",Pattern.CASE_INSENSITIVE);
-
- private final MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
+ private final PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager();
private HTTPClientConfiguration configuration;
@@ -51,9 +57,6 @@ public class DefaultHTTPClient implements HTTPClient {
private String contentType = null;
- public static final boolean isUrlEncoded(String url) {
- return ESCAPED_PATTERN.matcher(url).find();
- }
/**
* Creates a {@link DefaultHTTPClient} instance already initialized
@@ -82,35 +85,31 @@ public class DefaultHTTPClient implements HTTPClient {
* located at the URI.
*/
public InputStream openInputStream(String uri) throws IOException {
- GetMethod method = null;
+ HttpGet method = null;
try {
ensureClientInitialized();
- String uriStr;
- try {
- URI uriObj = new URI(uri, isUrlEncoded(uri));
- // [scheme:][//authority][path][?query][#fragment]
- uriStr = uriObj.toString();
- } catch (URIException e) {
- throw new IllegalArgumentException("Invalid IRI string.", e);
- }
- method = new GetMethod(uriStr);
- method.setFollowRedirects(true);
- client.executeMethod(method);
- _contentLength = method.getResponseContentLength();
- final Header contentTypeHeader = method.getResponseHeader("Content-Type");
+ HttpClientContext context = HttpClientContext.create();
+ method = new HttpGet(uri);
+ HttpResponse response = client.execute(method, context);
+ List<URI> locations = context.getRedirectLocations();
+
+ URI actualURI = locations == null || locations.isEmpty() ? method.getURI() : locations.get(locations.size() - 1);
+ actualDocumentIRI = actualURI.toString();
+
+ final Header contentTypeHeader = response.getFirstHeader("Content-Type");
contentType = contentTypeHeader == null ? null : contentTypeHeader.getValue();
- if (method.getStatusCode() != 200) {
+ if (response.getStatusLine().getStatusCode() != 200) {
throw new IOException(
- "Failed to fetch " + uri + ": " + method.getStatusCode() + " " + method.getStatusText()
+ "Failed to fetch " + uri + ": " + response.getStatusLine().getStatusCode() + " " + response.getStatusLine().getReasonPhrase()
);
}
- actualDocumentIRI = method.getURI().toString();
- byte[] response = method.getResponseBody();
- return new ByteArrayInputStream(response);
+ byte[] bytes = IOUtils.toByteArray(response.getEntity().getContent());
+ _contentLength = bytes.length;
+ return new ByteArrayInputStream(bytes);
} finally {
if (method != null) {
- method.releaseConnection();
+ method.reset();
}
}
}
@@ -143,25 +142,38 @@ public class DefaultHTTPClient implements HTTPClient {
}
private void ensureClientInitialized() {
- if(configuration == null) throw new IllegalStateException("client must be initialized first.");
- if (client != null) return;
- client = new HttpClient(manager);
- HttpConnectionManager connectionManager = client.getHttpConnectionManager();
- HttpConnectionManagerParams params = connectionManager.getParams();
- params.setConnectionTimeout(configuration.getDefaultTimeout());
- params.setSoTimeout(configuration.getDefaultTimeout());
- params.setMaxTotalConnections(configuration.getMaxConnections());
-
- HostConfiguration hostConf = client.getHostConfiguration();
- List<Header> headers = new ArrayList<Header>();
- headers.add(new Header("User-Agent", configuration.getUserAgent()));
+ if (configuration == null)
+ throw new IllegalStateException("client must be initialized first.");
+ if (client != null)
+ return;
+
+ RequestConfig requestConfig = RequestConfig.custom()
+ .setConnectTimeout(getConnectionTimeout())
+ .setSocketTimeout(getSoTimeout())
+ .setRedirectsEnabled(true)
+ .build();
+
+ SocketConfig socketConfig = SocketConfig.custom()
+ .setSoTimeout(getSoTimeout())
+ .build();
+
+ List<Header> headers = new ArrayList<>();
+ headers.add(new BasicHeader("User-Agent", configuration.getUserAgent()));
if (configuration.getAcceptHeader() != null) {
- headers.add(new Header("Accept", configuration.getAcceptHeader()));
+ headers.add(new BasicHeader("Accept", configuration.getAcceptHeader()));
}
- headers.add(new Header("Accept-Language", "en-us,en-gb,en,*;q=0.3")); //TODO: this must become parametric.
- headers.add(new Header("Accept-Charset", "utf-8,iso-8859-1;q=0.7,*;q=0.5"));
- // headers.add(new Header("Accept-Encoding", "x-gzip, gzip"));
- hostConf.getParams().setParameter("http.default-headers", headers);
+ headers.add(new BasicHeader("Accept-Language", "en-us,en-gb,en,*;q=0.3")); //TODO: this must become parametric.
+ // headers.add(new BasicHeader("Accept-Encoding", "x-gzip, gzip"));
+ headers.add(new BasicHeader("Accept-Charset", "utf-8,iso-8859-1;q=0.7,*;q=0.5"));
+
+
+ client = HttpClients.custom()
+ .setConnectionManager(manager)
+ .setDefaultRequestConfig(requestConfig)
+ .setDefaultSocketConfig(socketConfig)
+ .setMaxConnTotal(configuration.getMaxConnections())
+ .setDefaultHeaders(headers)
+ .build();
}
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/core/src/main/java/org/apache/any23/http/HTTPClient.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/http/HTTPClient.java b/core/src/main/java/org/apache/any23/http/HTTPClient.java
index 0bc4dbc..3f08975 100644
--- a/core/src/main/java/org/apache/any23/http/HTTPClient.java
+++ b/core/src/main/java/org/apache/any23/http/HTTPClient.java
@@ -33,7 +33,7 @@ public interface HTTPClient {
*
* @param configuration configuration for the HTTP Client.
*/
- public abstract void init(HTTPClientConfiguration configuration);
+ void init(HTTPClientConfiguration configuration);
/**
* Opens the input stream for the given target IRI.
@@ -42,7 +42,7 @@ public interface HTTPClient {
* @return input stream to access IRI content.
* @throws IOException if any error occurs while reading the IRI content.
*/
- public abstract InputStream openInputStream(String uri) throws IOException;
+ InputStream openInputStream(String uri) throws IOException;
/**
* Release all static resources held by the instance. Call this
@@ -50,7 +50,7 @@ public interface HTTPClient {
* application, like for example when shutting down a servlet
* context.
*/
- public abstract void close();
+ void close();
/**
* The value of the Content-Type header reported by the server.
@@ -58,12 +58,12 @@ public interface HTTPClient {
*
* @return the content type as string.
*/
- public abstract String getContentType();
+ String getContentType();
/**
* @return content length in bytes.
*/
- public abstract long getContentLength();
+ long getContentLength();
/**
* Returns the actual IRI from which the document was fetched.
@@ -73,6 +73,6 @@ public interface HTTPClient {
*
* @return actual document IRI.
*/
- public abstract String getActualDocumentIRI();
+ String getActualDocumentIRI();
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
index fef124d..e9cebee 100644
--- a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
+++ b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
@@ -17,15 +17,13 @@
package org.apache.any23.source;
-import org.apache.any23.http.DefaultHTTPClient;
import org.apache.any23.http.HTTPClient;
-import org.apache.commons.httpclient.URI;
-import org.apache.commons.httpclient.URIException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
+import java.net.URI;
import java.net.URISyntaxException;
/**
@@ -50,13 +48,12 @@ public class HTTPDocumentSource implements DocumentSource {
private String normalize(String uri) throws URISyntaxException {
try {
- URI normalized = new URI(uri, DefaultHTTPClient.isUrlEncoded(uri));
- normalized.normalize();
+ URI normalized = new URI(uri).normalize();
return normalized.toString();
- } catch (URIException e) {
+ } catch (URISyntaxException e) {
LOG.warn("Invalid uri: {}", uri);
LOG.error("Can not convert URL", e);
- throw new URISyntaxException(uri, e.getMessage());
+ throw e;
}
}
http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/core/src/main/java/org/apache/any23/util/LogUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/util/LogUtils.java b/core/src/main/java/org/apache/any23/util/LogUtils.java
index ef43c20..30b24ca 100644
--- a/core/src/main/java/org/apache/any23/util/LogUtils.java
+++ b/core/src/main/java/org/apache/any23/util/LogUtils.java
@@ -27,8 +27,6 @@ public class LogUtils {
public static void setDefaultLogging() {
Logger.getLogger("").setLevel(Level.WARNING);
- // Suppress silly cookie warnings.
- Logger.getLogger("org.apache.commons.httpclient").setLevel(Level.SEVERE);
Logger.getLogger("").getHandlers()[0].setLevel(Level.ALL);
}
http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 4a62dd1..f0f809d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -332,11 +332,6 @@
<version>2.6</version>
</dependency>
<dependency>
- <groupId>commons-httpclient</groupId>
- <artifactId>commons-httpclient</artifactId>
- <version>3.1</version>
- </dependency>
- <dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>${httpclient.version}</version>
http://git-wip-us.apache.org/repos/asf/any23/blob/db25f021/service/src/main/java/org/apache/any23/servlet/Servlet.java
----------------------------------------------------------------------
diff --git a/service/src/main/java/org/apache/any23/servlet/Servlet.java b/service/src/main/java/org/apache/any23/servlet/Servlet.java
index 154f41d..ad7c1ed 100644
--- a/service/src/main/java/org/apache/any23/servlet/Servlet.java
+++ b/service/src/main/java/org/apache/any23/servlet/Servlet.java
@@ -29,7 +29,6 @@ import org.apache.any23.source.ByteArrayDocumentSource;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.HTTPDocumentSource;
import org.apache.any23.source.StringDocumentSource;
-import org.apache.commons.httpclient.URI;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -41,6 +40,7 @@ import javax.servlet.http.HttpServletResponse;
import java.io.File;
import java.io.IOException;
+import java.net.URI;
import java.net.URISyntaxException;
import java.util.regex.Pattern;
@@ -286,7 +286,7 @@ public class Servlet extends HttpServlet {
private boolean isValidIRI(String s) {
try {
- URI uri = new URI(s, false);
+ URI uri = new URI(s);
if (!"http".equals(uri.getScheme()) && !"https".equals(uri.getScheme())) {
return false;
}