You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by le...@apache.org on 2014/05/29 22:07:09 UTC

[1/4] git commit: Fix URL encoding issues when making a request - we don't need to re-encode anything there

Repository: any23
Updated Branches:
  refs/heads/master 4aaccd28d -> 76b089a65


Fix URL encoding issues when making a request - we don't need to re-encode anything there


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/64ae99b5
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/64ae99b5
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/64ae99b5

Branch: refs/heads/master
Commit: 64ae99b591cd369a9f8e7c7ff61880337cf16aa9
Parents: 4249ef3
Author: Eugene Dzhurinsky <jd...@gmail.com>
Authored: Wed Dec 25 18:16:10 2013 -0500
Committer: Eugene Dzhurinsky <jd...@gmail.com>
Committed: Thu May 8 23:03:21 2014 -0400

----------------------------------------------------------------------
 .../main/java/org/apache/any23/http/DefaultHTTPClient.java   | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/64ae99b5/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java b/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
index 027bfa8..967f59f 100644
--- a/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
+++ b/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
@@ -95,13 +95,9 @@ public class DefaultHTTPClient implements HTTPClient {
                         "%s://%s%s%s%s%s%s",
                         uriObj.getScheme(),
                         uriObj.getAuthority(),
-                        path != null ? URLEncoder.encode(path, "UTF-8").replaceAll("%2F", "/") : "",
+                        path,
                         query == null ? "" : "?",
-                        query != null ? URLEncoder.encode(query, "UTF-8")
-                                .replaceAll("%3D", "=")
-                                .replaceAll("%26", "&") 
-                            :
-                            "",
+                        query,
                         fragment == null ? "" : "#",
                         fragment != null ? URLEncoder.encode(fragment, "UTF-8") : ""
                 );


[3/4] git commit: Yet another fix for URL processing - do not escape what is already escaped :)

Posted by le...@apache.org.
Yet another fix for URL processing - do not escape what is already escaped :)


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/f773f840
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/f773f840
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/f773f840

Branch: refs/heads/master
Commit: f773f840e93766265e87038688f9d36d4fe7e939
Parents: 64ae99b
Author: Eugene Dzhurinsky <jd...@gmail.com>
Authored: Wed Dec 25 18:47:40 2013 -0500
Committer: Eugene Dzhurinsky <jd...@gmail.com>
Committed: Thu May 8 23:03:22 2014 -0400

----------------------------------------------------------------------
 .../apache/any23/http/DefaultHTTPClient.java    | 34 +++++++-------------
 .../apache/any23/source/HTTPDocumentSource.java |  3 +-
 2 files changed, 13 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/f773f840/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java b/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
index 967f59f..f533040 100644
--- a/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
+++ b/core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
@@ -17,22 +17,16 @@
 
 package org.apache.any23.http;
 
-import org.apache.commons.httpclient.Header;
-import org.apache.commons.httpclient.HostConfiguration;
-import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.HttpConnectionManager;
-import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
+import org.apache.commons.httpclient.*;
 import org.apache.commons.httpclient.methods.GetMethod;
 import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URLEncoder;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.regex.Pattern;
 
 /**
  * Opens an {@link InputStream} on an HTTP URI. Is configured
@@ -43,6 +37,8 @@ import java.util.List;
  */
 public class DefaultHTTPClient implements HTTPClient {
 
+    private static final Pattern ESCAPED_PATTERN = Pattern.compile("%[0-9a-f]{2}",Pattern.CASE_INSENSITIVE);
+
     private final MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
 
     private HTTPClientConfiguration configuration;
@@ -55,6 +51,10 @@ public class DefaultHTTPClient implements HTTPClient {
 
     private String contentType = null;
 
+    public static final boolean isUrlEncoded(String url) {
+        return ESCAPED_PATTERN.matcher(url).find();
+    }
+
     /**
      * Creates a {@link DefaultHTTPClient} instance already initialized
      *
@@ -86,22 +86,10 @@ public class DefaultHTTPClient implements HTTPClient {
             ensureClientInitialized();
             String uriStr;
             try {
-                URI uriObj = new URI(uri);
+                URI uriObj = new URI(uri, isUrlEncoded(uri));
                 // [scheme:][//authority][path][?query][#fragment]
-                final String path = uriObj.getPath();
-                final String query = uriObj.getQuery();
-                final String fragment = uriObj.getFragment();
-                uriStr = String.format(
-                        "%s://%s%s%s%s%s%s",
-                        uriObj.getScheme(),
-                        uriObj.getAuthority(),
-                        path,
-                        query == null ? "" : "?",
-                        query,
-                        fragment == null ? "" : "#",
-                        fragment != null ? URLEncoder.encode(fragment, "UTF-8") : ""
-                );
-            } catch (URISyntaxException e) {
+                uriStr = uriObj.toString();
+            } catch (URIException e) {
                 throw new IllegalArgumentException("Invalid URI string.", e);
             }
             method = new GetMethod(uriStr);

http://git-wip-us.apache.org/repos/asf/any23/blob/f773f840/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
index 709bf5a..61a1b2d 100644
--- a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
+++ b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
@@ -17,6 +17,7 @@
 
 package org.apache.any23.source;
 
+import org.apache.any23.http.DefaultHTTPClient;
 import org.apache.any23.http.HTTPClient;
 import org.apache.commons.httpclient.URI;
 import org.apache.commons.httpclient.URIException;
@@ -49,7 +50,7 @@ public class HTTPDocumentSource implements DocumentSource {
 
     private String normalize(String uri) throws URISyntaxException {
         try {
-            URI normalized = new URI(uri, false);
+            URI normalized = new URI(uri, DefaultHTTPClient.isUrlEncoded(uri));
             normalized.normalize();
             return normalized.toString();
         } catch (URIException e) {


[2/4] git commit: Fix URL encoding issues

Posted by le...@apache.org.
Fix URL encoding issues


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/4249ef32
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/4249ef32
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/4249ef32

Branch: refs/heads/master
Commit: 4249ef3229565cd810eff2f79c1c6b06013d96a0
Parents: c224e26
Author: Eugene Dzhurinsky <jd...@gmail.com>
Authored: Sun Dec 22 23:37:04 2013 -0500
Committer: Eugene Dzhurinsky <jd...@gmail.com>
Committed: Thu May 8 23:03:21 2014 -0400

----------------------------------------------------------------------
 .../apache/any23/source/HTTPDocumentSource.java | 19 +++++++--
 .../java/org/apache/any23/servlet/Servlet.java  | 41 ++++++++++----------
 2 files changed, 36 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/4249ef32/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
index 6ea2cc8..709bf5a 100644
--- a/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
+++ b/core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
@@ -18,10 +18,13 @@
 package org.apache.any23.source;
 
 import org.apache.any23.http.HTTPClient;
+import org.apache.commons.httpclient.URI;
+import org.apache.commons.httpclient.URIException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.net.URI;
 import java.net.URISyntaxException;
 
 /**
@@ -29,6 +32,8 @@ import java.net.URISyntaxException;
  */
 public class HTTPDocumentSource implements DocumentSource {
 
+    private static final Logger LOG = LoggerFactory.getLogger(HTTPDocumentSource.class);
+
     private final HTTPClient client;
 
     private String uri;
@@ -43,7 +48,15 @@ public class HTTPDocumentSource implements DocumentSource {
     }
 
     private String normalize(String uri) throws URISyntaxException {
-        return new URI(uri).normalize().toString();
+        try {
+            URI normalized = new URI(uri, false);
+            normalized.normalize();
+            return normalized.toString();
+        } catch (URIException e) {
+            LOG.warn("Invalid uri: {}", uri);
+            LOG.error("Can not convert URL", e);
+            throw new URISyntaxException(uri, e.getMessage());
+        }
     }
 
     private void ensureOpen() throws IOException {
@@ -80,5 +93,5 @@ public class HTTPDocumentSource implements DocumentSource {
     public boolean isLocal() {
         return false;
     }
-    
+
 }

http://git-wip-us.apache.org/repos/asf/any23/blob/4249ef32/service/src/main/java/org/apache/any23/servlet/Servlet.java
----------------------------------------------------------------------
diff --git a/service/src/main/java/org/apache/any23/servlet/Servlet.java b/service/src/main/java/org/apache/any23/servlet/Servlet.java
index 0a968de..31f104e 100644
--- a/service/src/main/java/org/apache/any23/servlet/Servlet.java
+++ b/service/src/main/java/org/apache/any23/servlet/Servlet.java
@@ -26,14 +26,16 @@ import org.apache.any23.source.ByteArrayDocumentSource;
 import org.apache.any23.source.DocumentSource;
 import org.apache.any23.source.HTTPDocumentSource;
 import org.apache.any23.source.StringDocumentSource;
+import org.apache.commons.httpclient.URI;
 import org.openrdf.rio.RDFFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import javax.servlet.ServletException;
 import javax.servlet.http.HttpServlet;
 import javax.servlet.http.HttpServletRequest;
 import javax.servlet.http.HttpServletResponse;
 import java.io.IOException;
-import java.net.URI;
 import java.net.URISyntaxException;
 import java.util.regex.Pattern;
 
@@ -48,6 +50,8 @@ import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;
  */
 public class Servlet extends HttpServlet {
 
+    private static final Logger LOG = LoggerFactory.getLogger(Servlet.class);
+
     public static final String DEFAULT_BASE_URI = "http://any23.org/tmp/";
 
     private static final long serialVersionUID = 8207685628715421336L;
@@ -135,23 +139,17 @@ public class Servlet extends HttpServlet {
         MediaRangeSpec result = Any23Negotiator.getNegotiator().getBestMatch(request.getHeader("Accept"));
         if (result == null) {
             return null;
-        }
-        else if (RDFFormat.TURTLE.hasMIMEType(result.getMediaType())) {
+        } else if (RDFFormat.TURTLE.hasMIMEType(result.getMediaType())) {
             return "turtle";
-        }
-        else if (RDFFormat.N3.hasMIMEType(result.getMediaType())) {
+        } else if (RDFFormat.N3.hasMIMEType(result.getMediaType())) {
             return "n3";
-        }
-        else if (RDFFormat.NQUADS.hasMIMEType(result.getMediaType())) {
+        } else if (RDFFormat.NQUADS.hasMIMEType(result.getMediaType())) {
             return "nq";
-        }
-        else if (RDFFormat.RDFXML.hasMIMEType(result.getMediaType())) {
+        } else if (RDFFormat.RDFXML.hasMIMEType(result.getMediaType())) {
             return "rdf";
-        }
-        else if (RDFFormat.NTRIPLES.hasMIMEType(result.getMediaType())) {
+        } else if (RDFFormat.NTRIPLES.hasMIMEType(result.getMediaType())) {
             return "nt";
-        }
-        else {
+        } else {
             return "turtle";    // shouldn't happen
         }
     }
@@ -220,13 +218,14 @@ public class Servlet extends HttpServlet {
     }
 
     private DocumentSource createHTTPDocumentSource(WebResponder responder, String uri, boolean report)
-    throws IOException {
+            throws IOException {
         try {
             if (!isValidURI(uri)) {
                 throw new URISyntaxException(uri, "@@@");
             }
             return createHTTPDocumentSource(responder.getRunner().getHTTPClient(), uri);
         } catch (URISyntaxException ex) {
+            LOG.error("Invalid URI detected", ex);
             responder.sendError(400, "Invalid input URI " + uri, report);
             return null;
         }
@@ -239,11 +238,11 @@ public class Servlet extends HttpServlet {
 
     private boolean isValidURI(String s) {
         try {
-            URI uri = new URI(s);
+            URI uri = new URI(s, false);
             if (!"http".equals(uri.getScheme()) && !"https".equals(uri.getScheme())) {
                 return false;
             }
-        } catch (URISyntaxException e) {
+        } catch (Exception e) {
             return false;
         }
         return true;
@@ -252,15 +251,15 @@ public class Servlet extends HttpServlet {
     private ValidationMode getValidationMode(HttpServletRequest request) {
         final String PARAMETER = "validation-mode";
         final String validationMode = request.getParameter(PARAMETER);
-        if(validationMode == null) return ValidationMode.None;
-        if("none".equalsIgnoreCase(validationMode)) return ValidationMode.None;
-        if("validate".equalsIgnoreCase(validationMode)) return ValidationMode.Validate;
-        if("validate-fix".equalsIgnoreCase(validationMode)) return ValidationMode.ValidateAndFix;
+        if (validationMode == null) return ValidationMode.None;
+        if ("none".equalsIgnoreCase(validationMode)) return ValidationMode.None;
+        if ("validate".equalsIgnoreCase(validationMode)) return ValidationMode.Validate;
+        if ("validate-fix".equalsIgnoreCase(validationMode)) return ValidationMode.ValidateAndFix;
         throw new IllegalArgumentException(
                 String.format("Invalid value '%s' for '%s' parameter.", validationMode, PARAMETER)
         );
     }
-    
+
     private ExtractionParameters getExtractionParameters(HttpServletRequest request) {
         final ValidationMode mode = getValidationMode(request);
         return new ExtractionParameters(DefaultConfiguration.singleton(), mode);


[4/4] git commit: Merge branch 'master' into jdevelop-fix-url-encoding-problem

Posted by le...@apache.org.
Merge branch 'master' into jdevelop-fix-url-encoding-problem


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/76b089a6
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/76b089a6
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/76b089a6

Branch: refs/heads/master
Commit: 76b089a65ff1ecbc9d76e898d9a984f8e0a1f01a
Parents: f773f84 4aaccd2
Author: Lewis John McGibbney <le...@jpl.nasa.gov>
Authored: Thu May 29 12:50:20 2014 -0700
Committer: Lewis John McGibbney <le...@jpl.nasa.gov>
Committed: Thu May 29 12:50:20 2014 -0700

----------------------------------------------------------------------
 NOTICE.txt                                      |  2 +-
 RELEASE-NOTES.txt                               | 39 ++++++++++++++++++++
 api/pom.xml                                     |  2 +-
 core/pom.xml                                    |  2 +-
 .../rdfa/AbstractRDFaExtractorTestCase.java     |  2 +-
 csvutils/pom.xml                                |  2 +-
 encoding/pom.xml                                |  2 +-
 mime/pom.xml                                    |  2 +-
 nquads/pom.xml                                  |  2 +-
 plugins/basic-crawler/pom.xml                   |  8 ++--
 plugins/html-scraper/pom.xml                    |  6 +--
 plugins/integration-test/pom.xml                |  8 ++--
 plugins/office-scraper/pom.xml                  |  6 +--
 pom.xml                                         | 22 +++++++----
 service/pom.xml                                 |  2 +-
 src/site/apt/dev-csv-extractor.apt              |  8 ++--
 test-resources/pom.xml                          |  2 +-
 .../resources/html/rdfa/rdfa-11-curies.html     |  2 +-
 .../apache/any23/extractor/csv/test-comma.csv   |  2 +-
 .../apache/any23/extractor/csv/test-missing.csv |  2 +-
 .../any23/extractor/csv/test-semicolon.csv      |  2 +-
 .../org/apache/any23/extractor/csv/test-tab.csv |  2 +-
 22 files changed, 87 insertions(+), 40 deletions(-)
----------------------------------------------------------------------