You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by ha...@apache.org on 2018/07/31 17:40:35 UTC
any23 git commit: ANY23-374 fix schemeless microdata urls
Repository: any23
Updated Branches:
refs/heads/master 33ce96c39 -> d283d70ce
ANY23-374 fix schemeless microdata urls
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/d283d70c
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/d283d70c
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/d283d70c
Branch: refs/heads/master
Commit: d283d70ceb692cacb1f31659ee5d5c987822028f
Parents: 33ce96c
Author: Hans <fi...@gmail.com>
Authored: Tue Jul 31 12:21:26 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Tue Jul 31 12:24:42 2018 -0500
----------------------------------------------------------------------
.../any23/extractor/microdata/ItemScope.java | 20 ++++++++++--
.../microdata/MicrodataExtractorTest.java | 9 ++++++
.../microdata/microdata-missing-scheme.html | 33 ++++++++++++++++++++
3 files changed, 60 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/d283d70c/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java b/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
index 0ab0fee..2f079bb 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
@@ -28,6 +28,7 @@ import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.regex.Pattern;
/**
* This class describes a <b>Microdata <i>itemscope</i></b>.
@@ -75,12 +76,27 @@ public class ItemScope extends Item {
this(xpath, itemProps, id, refs, stringToUrl(type), itemId);
}
+ private static final Pattern looksLikeStartsWithHost = Pattern.compile("[^:/.]+(\\.[^:/.]+)+(:\\d+)?([/#?].*)?");
+
static URL stringToUrl(String type) {
if (StringUtils.isNotBlank(type)) {
try {
- return new URL(ParsedIRI.create(type.trim()).toString());
+ ParsedIRI iri = ParsedIRI.create(type.trim());
+ if (StringUtils.isBlank(iri.getScheme())) {
+ String host = iri.getHost();
+ if (StringUtils.isNotBlank(host)) {
+ iri = new ParsedIRI("http", iri.getUserInfo(), host, iri.getPort(), iri.getPath(), iri.getQuery(), iri.getFragment());
+ } else {
+ String path = iri.getPath();
+ if (path != null && looksLikeStartsWithHost.matcher(path).matches()) {
+ iri = ParsedIRI.create("http://" + iri.toString());
+ }
+ }
+ }
+
+ return new URL(iri.toString());
} catch (MalformedURLException murle) {
- throw new IllegalArgumentException("Invalid type '" + type + "', must be a valid URL.");
+ throw new IllegalArgumentException("Invalid type '" + type + "', must be a valid URL. " + murle.getMessage());
}
} else {
return null;
http://git-wip-us.apache.org/repos/asf/any23/blob/d283d70c/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
index f2e7852..280b3f7 100644
--- a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
@@ -23,6 +23,7 @@ import org.apache.any23.extractor.html.AbstractExtractorTestCase;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.vocab.SINDICE;
import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.junit.Assert;
@@ -83,6 +84,14 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase {
assertStatementsSize(RDFUtils.iri("urn:isbn:0-330-34032-8"), null, null, 4);
}
+ @Test
+ public void testMicrodataMissingScheme() {
+ assertExtract("/microdata/microdata-missing-scheme.html");
+ assertModelNotEmpty();
+ assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Answer"));
+ System.out.println(dumpHumanReadableTriples());
+ }
+
/**
* Reference test as provided by <a href="http://googlewebmastercentral.blogspot.com/2010/03/microdata-support-for-rich-snippets.html">Google Rich Snippet for Microdata.</a>
*
http://git-wip-us.apache.org/repos/asf/any23/blob/d283d70c/test-resources/src/test/resources/microdata/microdata-missing-scheme.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/microdata/microdata-missing-scheme.html b/test-resources/src/test/resources/microdata/microdata-missing-scheme.html
new file mode 100644
index 0000000..af8277f
--- /dev/null
+++ b/test-resources/src/test/resources/microdata/microdata-missing-scheme.html
@@ -0,0 +1,33 @@
+<!DOCTYPE html>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html lang="en">
+<head>
+ <meta charset="UTF-8">
+ <title>Missing Scheme</title>
+</head>
+<body>
+
+<div itemscope itemtype="http://schema.org/Question">
+ <h3 itemprop="name">Name</h3>
+ <div itemprop="acceptedAnswer" itemscope itemtype="schema.org/Answer">
+ <p itemprop="text">Text</p>
+ </div>
+</div>
+
+</body>
+</html>
\ No newline at end of file