You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by le...@apache.org on 2014/05/29 22:07:09 UTC
[1/4] git commit: Fix URL encoding issues when making request - we don't need to re-encode anything there
Repository: any23
Updated Branches:
refs/heads/master 4aaccd28d -> 76b089a65
Fix URL encoding issues when making request - we don't need to re-encode anything there
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/64ae99b5
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/64ae99b5
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/64ae99b5
Branch: refs/heads/master
Commit: 64ae99b591cd369a9f8e7c7ff61880337cf16aa9
Parents: 4249ef3
Author: Eugene Dzhurinsky <jd...@gmail.com>
Authored: Wed Dec 25 18:16:10 2013 -0500
Committer: Eugene Dzhurinsky <jd...@gmail.com>
Committed: Thu May 8 23:03:21 2014 -0400
----------------------------------------------------------------------
.../main/java/org/apache/any23/http/DefaultHTTPClient.java | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/64ae99b5/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java b/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
index 027bfa8..967f59f 100644
--- a/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
+++ b/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
@@ -95,13 +95,9 @@ public class DefaultHTTPClient implements HTTPClient {
"%s://%s%s%s%s%s%s",
uriObj.getScheme(),
uriObj.getAuthority(),
- path != null ? URLEncoder.encode(path, "UTF-8").replaceAll("%2F", "/") : "",
+ path,
query == null ? "" : "?",
- query != null ? URLEncoder.encode(query, "UTF-8")
- .replaceAll("%3D", "=")
- .replaceAll("%26", "&")
- :
- "",
+ query,
fragment == null ? "" : "#",
fragment != null ? URLEncoder.encode(fragment, "UTF-8") : ""
);
[3/4] git commit: Yet another fix for URL processing - do not escape what is already escaped :)
Posted by le...@apache.org.
Yet another fix for URL processing - do not escape what is already escaped :)
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/f773f840
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/f773f840
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/f773f840
Branch: refs/heads/master
Commit: f773f840e93766265e87038688f9d36d4fe7e939
Parents: 64ae99b
Author: Eugene Dzhurinsky <jd...@gmail.com>
Authored: Wed Dec 25 18:47:40 2013 -0500
Committer: Eugene Dzhurinsky <jd...@gmail.com>
Committed: Thu May 8 23:03:22 2014 -0400
----------------------------------------------------------------------
.../apache/any23/http/DefaultHTTPClient.java | 34 +++++++-------------
.../apache/any23/source/HTTPDocumentSource.java | 3 +-
2 files changed, 13 insertions(+), 24 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/f773f840/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java b/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
index 967f59f..f533040 100644
--- a/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
+++ b/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
@@ -17,22 +17,16 @@
package org.apache.any23.http;
-import org.apache.commons.httpclient.Header;
-import org.apache.commons.httpclient.HostConfiguration;
-import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.HttpConnectionManager;
-import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
+import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
+import java.util.regex.Pattern;
/**
* Opens an {@link InputStream} on an HTTP URI. Is configured
@@ -43,6 +37,8 @@ import java.util.List;
*/
public class DefaultHTTPClient implements HTTPClient {
+ private static final Pattern ESCAPED_PATTERN = Pattern.compile("%[0-9a-f]{2}",Pattern.CASE_INSENSITIVE);
+
private final MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
private HTTPClientConfiguration configuration;
@@ -55,6 +51,10 @@ public class DefaultHTTPClient implements HTTPClient {
private String contentType = null;
+ public static final boolean isUrlEncoded(String url) {
+ return ESCAPED_PATTERN.matcher(url).find();
+ }
+
/**
* Creates a {@link DefaultHTTPClient} instance already initialized
*
@@ -86,22 +86,10 @@ public class DefaultHTTPClient implements HTTPClient {
ensureClientInitialized();
String uriStr;
try {
- URI uriObj = new URI(uri);
+ URI uriObj = new URI(uri, isUrlEncoded(uri));
// [scheme:][//authority][path][?query][#fragment]
- final String path = uriObj.getPath();
- final String query = uriObj.getQuery();
- final String fragment = uriObj.getFragment();
- uriStr = String.format(
- "%s://%s%s%s%s%s%s",
- uriObj.getScheme(),
- uriObj.getAuthority(),
- path,
- query == null ? "" : "?",
- query,
- fragment == null ? "" : "#",
- fragment != null ? URLEncoder.encode(fragment, "UTF-8") : ""
- );
- } catch (URISyntaxException e) {
+ uriStr = uriObj.toString();
+ } catch (URIException e) {
throw new IllegalArgumentException("Invalid URI string.", e);
}
method = new GetMethod(uriStr);
http://git-wip-us.apache.org/repos/asf/any23/blob/f773f840/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
index 709bf5a..61a1b2d 100644
--- a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
+++ b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
@@ -17,6 +17,7 @@
package org.apache.any23.source;
+import org.apache.any23.http.DefaultHTTPClient;
import org.apache.any23.http.HTTPClient;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
@@ -49,7 +50,7 @@ public class HTTPDocumentSource implements DocumentSource {
private String normalize(String uri) throws URISyntaxException {
try {
- URI normalized = new URI(uri, false);
+ URI normalized = new URI(uri, DefaultHTTPClient.isUrlEncoded(uri));
normalized.normalize();
return normalized.toString();
} catch (URIException e) {
[2/4] git commit: Fix URL encoding issues
Posted by le...@apache.org.
Fix URL encoding issues
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/4249ef32
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/4249ef32
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/4249ef32
Branch: refs/heads/master
Commit: 4249ef3229565cd810eff2f79c1c6b06013d96a0
Parents: c224e26
Author: Eugene Dzhurinsky <jd...@gmail.com>
Authored: Sun Dec 22 23:37:04 2013 -0500
Committer: Eugene Dzhurinsky <jd...@gmail.com>
Committed: Thu May 8 23:03:21 2014 -0400
----------------------------------------------------------------------
.../apache/any23/source/HTTPDocumentSource.java | 19 +++++++--
.../java/org/apache/any23/servlet/Servlet.java | 41 ++++++++++----------
2 files changed, 36 insertions(+), 24 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/4249ef32/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
index 6ea2cc8..709bf5a 100644
--- a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
+++ b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
@@ -18,10 +18,13 @@
package org.apache.any23.source;
import org.apache.any23.http.HTTPClient;
+import org.apache.commons.httpclient.URI;
+import org.apache.commons.httpclient.URIException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
-import java.net.URI;
import java.net.URISyntaxException;
/**
@@ -29,6 +32,8 @@ import java.net.URISyntaxException;
*/
public class HTTPDocumentSource implements DocumentSource {
+ private static final Logger LOG = LoggerFactory.getLogger(HTTPDocumentSource.class);
+
private final HTTPClient client;
private String uri;
@@ -43,7 +48,15 @@ public class HTTPDocumentSource implements DocumentSource {
}
private String normalize(String uri) throws URISyntaxException {
- return new URI(uri).normalize().toString();
+ try {
+ URI normalized = new URI(uri, false);
+ normalized.normalize();
+ return normalized.toString();
+ } catch (URIException e) {
+ LOG.warn("Invalid uri: {}", uri);
+ LOG.error("Can not convert URL", e);
+ throw new URISyntaxException(uri, e.getMessage());
+ }
}
private void ensureOpen() throws IOException {
@@ -80,5 +93,5 @@ public class HTTPDocumentSource implements DocumentSource {
public boolean isLocal() {
return false;
}
-
+
}
http://git-wip-us.apache.org/repos/asf/any23/blob/4249ef32/service/src/main/java/org/apache/any23/servlet/Servlet.java
----------------------------------------------------------------------
diff --git a/service/src/main/java/org/apache/any23/servlet/Servlet.java b/service/src/main/java/org/apache/any23/servlet/Servlet.java
index 0a968de..31f104e 100644
--- a/service/src/main/java/org/apache/any23/servlet/Servlet.java
+++ b/service/src/main/java/org/apache/any23/servlet/Servlet.java
@@ -26,14 +26,16 @@ import org.apache.any23.source.ByteArrayDocumentSource;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.HTTPDocumentSource;
import org.apache.any23.source.StringDocumentSource;
+import org.apache.commons.httpclient.URI;
import org.openrdf.rio.RDFFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.IOException;
-import java.net.URI;
import java.net.URISyntaxException;
import java.util.regex.Pattern;
@@ -48,6 +50,8 @@ import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;
*/
public class Servlet extends HttpServlet {
+ private static final Logger LOG = LoggerFactory.getLogger(Servlet.class);
+
public static final String DEFAULT_BASE_URI = "http://any23.org/tmp/";
private static final long serialVersionUID = 8207685628715421336L;
@@ -135,23 +139,17 @@ public class Servlet extends HttpServlet {
MediaRangeSpec result = Any23Negotiator.getNegotiator().getBestMatch(request.getHeader("Accept"));
if (result == null) {
return null;
- }
- else if (RDFFormat.TURTLE.hasMIMEType(result.getMediaType())) {
+ } else if (RDFFormat.TURTLE.hasMIMEType(result.getMediaType())) {
return "turtle";
- }
- else if (RDFFormat.N3.hasMIMEType(result.getMediaType())) {
+ } else if (RDFFormat.N3.hasMIMEType(result.getMediaType())) {
return "n3";
- }
- else if (RDFFormat.NQUADS.hasMIMEType(result.getMediaType())) {
+ } else if (RDFFormat.NQUADS.hasMIMEType(result.getMediaType())) {
return "nq";
- }
- else if (RDFFormat.RDFXML.hasMIMEType(result.getMediaType())) {
+ } else if (RDFFormat.RDFXML.hasMIMEType(result.getMediaType())) {
return "rdf";
- }
- else if (RDFFormat.NTRIPLES.hasMIMEType(result.getMediaType())) {
+ } else if (RDFFormat.NTRIPLES.hasMIMEType(result.getMediaType())) {
return "nt";
- }
- else {
+ } else {
return "turtle"; // shouldn't happen
}
}
@@ -220,13 +218,14 @@ public class Servlet extends HttpServlet {
}
private DocumentSource createHTTPDocumentSource(WebResponder responder, String uri, boolean report)
- throws IOException {
+ throws IOException {
try {
if (!isValidURI(uri)) {
throw new URISyntaxException(uri, "@@@");
}
return createHTTPDocumentSource(responder.getRunner().getHTTPClient(), uri);
} catch (URISyntaxException ex) {
+ LOG.error("Invalid URI detected", ex);
responder.sendError(400, "Invalid input URI " + uri, report);
return null;
}
@@ -239,11 +238,11 @@ public class Servlet extends HttpServlet {
private boolean isValidURI(String s) {
try {
- URI uri = new URI(s);
+ URI uri = new URI(s, false);
if (!"http".equals(uri.getScheme()) && !"https".equals(uri.getScheme())) {
return false;
}
- } catch (URISyntaxException e) {
+ } catch (Exception e) {
return false;
}
return true;
@@ -252,15 +251,15 @@ public class Servlet extends HttpServlet {
private ValidationMode getValidationMode(HttpServletRequest request) {
final String PARAMETER = "validation-mode";
final String validationMode = request.getParameter(PARAMETER);
- if(validationMode == null) return ValidationMode.None;
- if("none".equalsIgnoreCase(validationMode)) return ValidationMode.None;
- if("validate".equalsIgnoreCase(validationMode)) return ValidationMode.Validate;
- if("validate-fix".equalsIgnoreCase(validationMode)) return ValidationMode.ValidateAndFix;
+ if (validationMode == null) return ValidationMode.None;
+ if ("none".equalsIgnoreCase(validationMode)) return ValidationMode.None;
+ if ("validate".equalsIgnoreCase(validationMode)) return ValidationMode.Validate;
+ if ("validate-fix".equalsIgnoreCase(validationMode)) return ValidationMode.ValidateAndFix;
throw new IllegalArgumentException(
String.format("Invalid value '%s' for '%s' parameter.", validationMode, PARAMETER)
);
}
-
+
private ExtractionParameters getExtractionParameters(HttpServletRequest request) {
final ValidationMode mode = getValidationMode(request);
return new ExtractionParameters(DefaultConfiguration.singleton(), mode);
[4/4] git commit: Merge branch 'master' into jdevelop-fix-url-encoding-problem
Posted by le...@apache.org.
Merge branch 'master' into jdevelop-fix-url-encoding-problem
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/76b089a6
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/76b089a6
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/76b089a6
Branch: refs/heads/master
Commit: 76b089a65ff1ecbc9d76e898d9a984f8e0a1f01a
Parents: f773f84 4aaccd2
Author: Lewis John McGibbney <le...@jpl.nasa.gov>
Authored: Thu May 29 12:50:20 2014 -0700
Committer: Lewis John McGibbney <le...@jpl.nasa.gov>
Committed: Thu May 29 12:50:20 2014 -0700
----------------------------------------------------------------------
NOTICE.txt | 2 +-
RELEASE-NOTES.txt | 39 ++++++++++++++++++++
api/pom.xml | 2 +-
core/pom.xml | 2 +-
.../rdfa/AbstractRDFaExtractorTestCase.java | 2 +-
csvutils/pom.xml | 2 +-
encoding/pom.xml | 2 +-
mime/pom.xml | 2 +-
nquads/pom.xml | 2 +-
plugins/basic-crawler/pom.xml | 8 ++--
plugins/html-scraper/pom.xml | 6 +--
plugins/integration-test/pom.xml | 8 ++--
plugins/office-scraper/pom.xml | 6 +--
pom.xml | 22 +++++++----
service/pom.xml | 2 +-
src/site/apt/dev-csv-extractor.apt | 8 ++--
test-resources/pom.xml | 2 +-
.../resources/html/rdfa/rdfa-11-curies.html | 2 +-
.../apache/any23/extractor/csv/test-comma.csv | 2 +-
.../apache/any23/extractor/csv/test-missing.csv | 2 +-
.../any23/extractor/csv/test-semicolon.csv | 2 +-
.../org/apache/any23/extractor/csv/test-tab.csv | 2 +-
22 files changed, 87 insertions(+), 40 deletions(-)
----------------------------------------------------------------------