You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by ha...@apache.org on 2018/07/31 17:40:35 UTC

any23 git commit: ANY23-374 fix schemeless microdata urls

Repository: any23
Updated Branches:
  refs/heads/master 33ce96c39 -> d283d70ce


ANY23-374 fix schemeless microdata urls


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/d283d70c
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/d283d70c
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/d283d70c

Branch: refs/heads/master
Commit: d283d70ceb692cacb1f31659ee5d5c987822028f
Parents: 33ce96c
Author: Hans <fi...@gmail.com>
Authored: Tue Jul 31 12:21:26 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Tue Jul 31 12:24:42 2018 -0500

----------------------------------------------------------------------
 .../any23/extractor/microdata/ItemScope.java    | 20 ++++++++++--
 .../microdata/MicrodataExtractorTest.java       |  9 ++++++
 .../microdata/microdata-missing-scheme.html     | 33 ++++++++++++++++++++
 3 files changed, 60 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/d283d70c/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java b/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
index 0ab0fee..2f079bb 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
@@ -28,6 +28,7 @@ import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.regex.Pattern;
 
 /**
  * This class describes a <b>Microdata <i>itemscope</i></b>.
@@ -75,12 +76,27 @@ public class ItemScope extends Item {
         this(xpath, itemProps, id, refs, stringToUrl(type), itemId);
     }
 
+    private static final Pattern looksLikeStartsWithHost = Pattern.compile("[^:/.]+(\\.[^:/.]+)+(:\\d+)?([/#?].*)?");
+
     static URL stringToUrl(String type) {
         if (StringUtils.isNotBlank(type)) {
             try {
-                return new URL(ParsedIRI.create(type.trim()).toString());
+                ParsedIRI iri = ParsedIRI.create(type.trim());
+                if (StringUtils.isBlank(iri.getScheme())) {
+                    String host = iri.getHost();
+                    if (StringUtils.isNotBlank(host)) {
+                        iri = new ParsedIRI("http", iri.getUserInfo(), host, iri.getPort(), iri.getPath(), iri.getQuery(), iri.getFragment());
+                    } else {
+                        String path = iri.getPath();
+                        if (path != null && looksLikeStartsWithHost.matcher(path).matches()) {
+                            iri = ParsedIRI.create("http://" + iri.toString());
+                        }
+                    }
+                }
+
+                return new URL(iri.toString());
             } catch (MalformedURLException murle) {
-                throw new IllegalArgumentException("Invalid type '" + type + "', must be a valid URL.");
+                throw new IllegalArgumentException("Invalid type '" + type + "', must be a valid URL. " + murle.getMessage());
             }
         } else {
             return null;

http://git-wip-us.apache.org/repos/asf/any23/blob/d283d70c/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
index f2e7852..280b3f7 100644
--- a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
@@ -23,6 +23,7 @@ import org.apache.any23.extractor.html.AbstractExtractorTestCase;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.SINDICE;
 import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.vocabulary.RDF;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.junit.Assert;
@@ -83,6 +84,14 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase {
         assertStatementsSize(RDFUtils.iri("urn:isbn:0-330-34032-8"), null, null, 4);
     }
 
+    @Test
+    public void testMicrodataMissingScheme() { // Regression test for ANY23-374: an itemtype written without a URL scheme must still yield a type statement.
+        assertExtract("/microdata/microdata-missing-scheme.html"); // fixture nests an item declared with itemtype="schema.org/Answer" (no scheme)
+        assertModelNotEmpty();
+        assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Answer")); // the schemeless type is expected to surface as an http:// IRI
+        System.out.println(dumpHumanReadableTriples()); // NOTE(review): debug dump to stdout; consider the slf4j Logger this test file imports, or drop it
+    }
+
     /**
      * Reference test as provided by <a href="http://googlewebmastercentral.blogspot.com/2010/03/microdata-support-for-rich-snippets.html">Google Rich Snippet for Microdata.</a>
      *

http://git-wip-us.apache.org/repos/asf/any23/blob/d283d70c/test-resources/src/test/resources/microdata/microdata-missing-scheme.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/microdata/microdata-missing-scheme.html b/test-resources/src/test/resources/microdata/microdata-missing-scheme.html
new file mode 100644
index 0000000..af8277f
--- /dev/null
+++ b/test-resources/src/test/resources/microdata/microdata-missing-scheme.html
@@ -0,0 +1,33 @@
+<!DOCTYPE html>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Missing Scheme</title>
+</head>
+<body>
+
+<div itemscope itemtype="http://schema.org/Question">
+    <h3 itemprop="name">Name</h3>
+    <div itemprop="acceptedAnswer" itemscope itemtype="schema.org/Answer">
+        <p itemprop="text">Text</p>
+    </div>
+</div>
+
+</body>
+</html>
\ No newline at end of file