You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by ha...@apache.org on 2018/08/17 18:58:15 UTC
any23 git commit: ANY23-389 fix html base elements for RDFa
Repository: any23
Updated Branches:
refs/heads/master 9f7ba688d -> ef7826df5
ANY23-389 fix html base elements for RDFa
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/ef7826df
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/ef7826df
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/ef7826df
Branch: refs/heads/master
Commit: ef7826df5e4ff9a2d32d1b9105760760a0293581
Parents: 9f7ba68
Author: Hans <fi...@gmail.com>
Authored: Fri Aug 17 13:56:40 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Fri Aug 17 13:56:40 2018 -0500
----------------------------------------------------------------------
.../any23/extractor/rdf/BaseRDFExtractor.java | 34 ++++++++++++++++++--
.../rdfa/opengraph-structured-properties.html | 3 ++
2 files changed, 34 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/ef7826df/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
index e908d55..767f6ee 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
@@ -26,6 +26,7 @@ import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.IssueReport;
import org.apache.any23.extractor.html.JsoupUtils;
+import org.eclipse.rdf4j.common.net.ParsedIRI;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.rio.RDFParser;
@@ -40,8 +41,6 @@ import org.jsoup.nodes.Entities;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeFilter;
import org.jsoup.select.NodeTraversor;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import java.io.ByteArrayInputStream;
import java.io.IOException;
@@ -57,10 +56,10 @@ import java.util.regex.Pattern;
* {@link org.apache.any23.extractor.Extractor.ContentExtractor}.
*
* @author Michele Mostarda (mostarda@fbk.eu)
+ * @author Hans Brende (hansbrende@apache.org)
*/
public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
- private static final Logger LOG = LoggerFactory.getLogger(BaseRDFExtractor.class);
private boolean verifyDataType;
private boolean stopAtFirstError;
@@ -176,6 +175,35 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
tagName = tagName.substring(tagName.lastIndexOf(':') + 1);
((Element)node).tagName(tagName.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") ? tagName : "div");
+ // fix for ANY23-389: rewrite a relative <base href> as an absolute IRI, resolved against the document IRI
+ resolve_base:
+ if ("base".equalsIgnoreCase(tagName) && node.hasAttr("href")) {
+ String href = node.attr("href");
+ String absHref;
+ try {
+ ParsedIRI parsedHref = ParsedIRI.create(href.trim());
+ if (parsedHref.isAbsolute()) {
+ absHref = parsedHref.toString();
+ } else {
+ parsedHref = ParsedIRI.create(iri.trim()).resolve(parsedHref);
+ if (parsedHref.isAbsolute()) {
+ absHref = parsedHref.toString();
+ } else {
+ // shouldn't happen unless document IRI wasn't absolute
+ // ignore and let underlying RDFa parser report the issue
+ break resolve_base;
+ }
+ }
+ } catch (RuntimeException e) {
+ // can't parse href as a relative or absolute IRI:
+ // ignore and let underlying RDFa parser report the issue
+ break resolve_base;
+ }
+ if (!absHref.equals(href)) {
+ node.attr("href", absHref);
+ }
+ }
+
return FilterResult.CONTINUE;
}
return node instanceof DataNode || node instanceof Comment || node instanceof DocumentType
http://git-wip-us.apache.org/repos/asf/any23/blob/ef7826df/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html b/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html
index 365ddac..7d7dbc2 100644
--- a/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html
+++ b/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html
@@ -19,6 +19,9 @@
<!-- All of the content below is based on the OGP examples provided at
http://ogp.me/, this ensures that thw Any23 coverage is suffciently up-to-date.
-->
+
+ <!-- use relative base href to make sure ANY23-389 is fixed -->
+ <base href="">
<!-- Begin Basic Metadata -->
<title>The Rock (1996)</title>