You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by ha...@apache.org on 2018/08/17 18:58:15 UTC

any23 git commit: ANY23-389 fix html base elements for RDFa

Repository: any23
Updated Branches:
  refs/heads/master 9f7ba688d -> ef7826df5


ANY23-389 fix html base elements for RDFa


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/ef7826df
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/ef7826df
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/ef7826df

Branch: refs/heads/master
Commit: ef7826df5e4ff9a2d32d1b9105760760a0293581
Parents: 9f7ba68
Author: Hans <fi...@gmail.com>
Authored: Fri Aug 17 13:56:40 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Fri Aug 17 13:56:40 2018 -0500

----------------------------------------------------------------------
 .../any23/extractor/rdf/BaseRDFExtractor.java   | 34 ++++++++++++++++++--
 .../rdfa/opengraph-structured-properties.html   |  3 ++
 2 files changed, 34 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/ef7826df/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
index e908d55..767f6ee 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
@@ -26,6 +26,7 @@ import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.Extractor;
 import org.apache.any23.extractor.IssueReport;
 import org.apache.any23.extractor.html.JsoupUtils;
+import org.eclipse.rdf4j.common.net.ParsedIRI;
 import org.eclipse.rdf4j.rio.RDFFormat;
 import org.eclipse.rdf4j.rio.RDFParseException;
 import org.eclipse.rdf4j.rio.RDFParser;
@@ -40,8 +41,6 @@ import org.jsoup.nodes.Entities;
 import org.jsoup.nodes.Node;
 import org.jsoup.select.NodeFilter;
 import org.jsoup.select.NodeTraversor;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
@@ -57,10 +56,10 @@ import java.util.regex.Pattern;
  * {@link org.apache.any23.extractor.Extractor.ContentExtractor}.
  *
  * @author Michele Mostarda (mostarda@fbk.eu)
+ * @author Hans Brende (hansbrende@apache.org)
  */
 public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
 
-    private static final Logger LOG = LoggerFactory.getLogger(BaseRDFExtractor.class);
     private boolean verifyDataType;
     private boolean stopAtFirstError;
 
@@ -176,6 +175,35 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
                             tagName = tagName.substring(tagName.lastIndexOf(':') + 1);
                             ((Element)node).tagName(tagName.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") ? tagName : "div");
 
+                            // fix for ANY23-389
+                            resolve_base:
+                            if ("base".equalsIgnoreCase(tagName) && node.hasAttr("href")) {
+                                String href = node.attr("href");
+                                String absHref;
+                                try {
+                                    ParsedIRI parsedHref = ParsedIRI.create(href.trim());
+                                    if (parsedHref.isAbsolute()) {
+                                        absHref = parsedHref.toString();
+                                    } else {
+                                        parsedHref = ParsedIRI.create(iri.trim()).resolve(parsedHref);
+                                        if (parsedHref.isAbsolute()) {
+                                            absHref = parsedHref.toString();
+                                        } else {
+                                            // shouldn't happen unless document IRI wasn't absolute
+                                            // ignore and let underlying RDFa parser report the issue
+                                            break resolve_base;
+                                        }
+                                    }
+                                } catch (RuntimeException e) {
+                                    // can't parse href as a relative or absolute IRI:
+                                    // ignore and let underlying RDFa parser report the issue
+                                    break resolve_base;
+                                }
+                                if (!absHref.equals(href)) {
+                                    node.attr("href", absHref);
+                                }
+                            }
+
                             return FilterResult.CONTINUE;
                         }
                         return node instanceof DataNode || node instanceof Comment || node instanceof DocumentType

http://git-wip-us.apache.org/repos/asf/any23/blob/ef7826df/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html b/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html
index 365ddac..7d7dbc2 100644
--- a/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html
+++ b/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html
@@ -19,6 +19,9 @@
   <!--  All of the content below is based on the OGP examples provided at
   http://ogp.me/, this ensures that the Any23 coverage is sufficiently up-to-date.
    -->
+
+  <!-- use relative base href to make sure ANY23-389 is fixed -->
+  <base href="">
   
   <!-- Begin Basic Metadata -->
   <title>The Rock (1996)</title>