You are viewing a plain-text version of this content. (The canonical link from the original page was not preserved in this plain-text copy.)
Posted to commits@any23.apache.org by ha...@apache.org on 2018/10/24 21:45:26 UTC

any23 git commit: ANY23-409 allow multiple microdata itemtype values

Repository: any23
Updated Branches:
  refs/heads/master a58d59e35 -> 8b951d8e0


ANY23-409 allow multiple microdata itemtype values


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/8b951d8e
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/8b951d8e
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/8b951d8e

Branch: refs/heads/master
Commit: 8b951d8e06ed5ad941ec4ba452532bb93d04a057
Parents: a58d59e
Author: Hans <fi...@gmail.com>
Authored: Wed Oct 24 16:36:12 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Wed Oct 24 16:36:12 2018 -0500

----------------------------------------------------------------------
 .../any23/extractor/microdata/ItemScope.java    | 63 +++++++++++---------
 .../extractor/microdata/MicrodataExtractor.java | 21 +++----
 .../extractor/microdata/MicrodataParser.java    | 36 ++++++++---
 3 files changed, 73 insertions(+), 47 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/8b951d8e/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java b/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
index 2f079bb..1612aad 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
@@ -17,14 +17,17 @@
 
 package org.apache.any23.extractor.microdata;
 
+import org.apache.any23.rdf.RDFUtils;
 import org.apache.commons.lang.StringUtils;
 import org.eclipse.rdf4j.common.net.ParsedIRI;
+import org.eclipse.rdf4j.model.IRI;
 
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -34,6 +37,7 @@ import java.util.regex.Pattern;
  * This class describes a <b>Microdata <i>itemscope</i></b>.
  *
  * @author Michele Mostarda (mostarda@fbk.eu)
+ * @author Hans Brende (hansbrende@apache.org)
  */
 public class ItemScope extends Item {
 
@@ -55,7 +59,7 @@ public class ItemScope extends Item {
     /**
      * <i>itemscope</i> type.
      */
-    private final URL type;
+    private final List<IRI> type;
 
     /**
      * <i>itemscope</i> external identifier.
@@ -73,44 +77,39 @@ public class ItemScope extends Item {
      * @param itemId    <i>itemscope</i> id. Can be <code>null</code>.
      */
     public ItemScope(String xpath, ItemProp[] itemProps, String id, String[] refs, String type, String itemId) {
-        this(xpath, itemProps, id, refs, stringToUrl(type), itemId);
+        this(xpath, itemProps, id, refs, stringToSingletonIRI(type), itemId);
     }
 
     private static final Pattern looksLikeStartsWithHost = Pattern.compile("[^:/.]+(\\.[^:/.]+)+(:\\d+)?([/#?].*)?");
 
-    static URL stringToUrl(String type) {
+    static List<IRI> stringToSingletonIRI(String type) {
         if (StringUtils.isNotBlank(type)) {
-            try {
-                ParsedIRI iri = ParsedIRI.create(type.trim());
-                if (StringUtils.isBlank(iri.getScheme())) {
-                    String host = iri.getHost();
-                    if (StringUtils.isNotBlank(host)) {
-                        iri = new ParsedIRI("http", iri.getUserInfo(), host, iri.getPort(), iri.getPath(), iri.getQuery(), iri.getFragment());
-                    } else {
-                        String path = iri.getPath();
-                        if (path != null && looksLikeStartsWithHost.matcher(path).matches()) {
-                            iri = ParsedIRI.create("http://" + iri.toString());
-                        }
+            ParsedIRI iri = ParsedIRI.create(type.trim());
+            if (StringUtils.isBlank(iri.getScheme())) {
+                String host = iri.getHost();
+                if (StringUtils.isNotBlank(host)) {
+                    iri = new ParsedIRI("http", iri.getUserInfo(), host, iri.getPort(), iri.getPath(), iri.getQuery(), iri.getFragment());
+                } else {
+                    String path = iri.getPath();
+                    if (path != null && looksLikeStartsWithHost.matcher(path).matches()) {
+                        iri = ParsedIRI.create("http://" + iri.toString());
                     }
                 }
-
-                return new URL(iri.toString());
-            } catch (MalformedURLException murle) {
-                throw new IllegalArgumentException("Invalid type '" + type + "', must be a valid URL. " + murle.getMessage());
             }
+            return Collections.singletonList(RDFUtils.iri(iri.toString()));
         } else {
-            return null;
+            return Collections.emptyList();
         }
     }
 
-    ItemScope(String xpath, ItemProp[] itemProps, String id, String[] refs, URL type, String itemId) {
+    ItemScope(String xpath, ItemProp[] itemProps, String id, String[] refs, List<IRI> types, String itemId) {
         super(xpath);
 
         if (itemProps == null) {
             throw new NullPointerException("itemProps list cannot be null.");
         }
 
-        this.type = type;
+        this.type = types;
         this.id = id;
         this.refs = refs;
         this.itemId = itemId;
@@ -162,6 +161,20 @@ public class ItemScope extends Item {
      * @return <i>itemscope</i> type.
      */
     public URL getType() {
+        //No longer using URL.
+        //But for backwards compatibility:
+        try {
+            return type.isEmpty() ? null : new URL(type.get(0).stringValue());
+        } catch (MalformedURLException e) {
+            try {
+                return new URL(ParsedIRI.create(type.get(0).stringValue()).toASCIIString());
+            } catch (Exception e1) {
+                return null;
+            }
+        }
+    }
+
+    List<IRI> getTypes() {
         return type;
     }
 
@@ -200,7 +213,7 @@ public class ItemScope extends Item {
                 getXpath(),
                 id == null ? null : "\"" + id + "\"",
                 refs == null ? null : toJSON(refs),
-                type == null ? null : "\"" + type + "\"",
+                type.isEmpty() ? null : "\"" + type.get(0) + "\"",
                 itemId == null ? null : "\"" + itemId + "\"",
                 sb.toString()
         );
@@ -248,11 +261,7 @@ public class ItemScope extends Item {
     }
 
     protected void acquireProperty(ItemProp itemProp) {
-        List<ItemProp> itemProps = properties.get(itemProp.getName());
-        if (itemProps == null) {
-            itemProps = new ArrayList<>();
-            properties.put(itemProp.getName(), itemProps);
-        }
+        List<ItemProp> itemProps = properties.computeIfAbsent(itemProp.getName(), k -> new ArrayList<>());
         if (!itemProps.contains(itemProp))
             itemProps.add(itemProp);
     }

http://git-wip-us.apache.org/repos/asf/any23/blob/8b951d8e/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
index 1e1f021..efd54e9 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
@@ -41,7 +41,6 @@ import org.w3c.dom.NodeList;
 
 import java.io.IOException;
 import java.net.URISyntaxException;
-import java.net.URL;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -419,12 +418,15 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
             IRI documentIRI, ExtractionResult out,
             Map<ItemScope, Resource> mappings, IRI defaultNamespace
     ) throws ExtractionException {
-        Resource subject = mappings.computeIfAbsent(itemScope, scope -> createSubjectForItemId(documentIRI, scope.getItemId()));
-
-        IRI itemScopeType = getType(itemScope);
-        if (itemScopeType != null) {
-            out.writeTriple(subject, RDF.TYPE, itemScopeType);
-            defaultNamespace = getNamespaceIRI(itemScopeType);
+        Resource subject = mappings.computeIfAbsent(itemScope, scope ->
+                createSubjectForItemId(documentIRI, scope.getItemId()));
+
+        List<IRI> itemScopeTypes = itemScope.getTypes();
+        if (!itemScopeTypes.isEmpty()) {
+            defaultNamespace = getNamespaceIRI(itemScopeTypes.get(0));
+            for (IRI type : itemScopeTypes) {
+                out.writeTriple(subject, RDF.TYPE, type);
+            }
         }
         for (Map.Entry<String, List<ItemProp>> itemProps : itemScope.getProperties().entrySet()) {
             String propName = itemProps.getKey();
@@ -454,11 +456,6 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
         return subject;
     }
 
-    private static IRI getType(ItemScope scope) {
-        URL type = scope.getType();
-        return type == null ? null : RDFUtils.iri(type.toString());
-    }
-
     private static Resource createSubjectForItemId(IRI documentIRI, String itemId) {
         if (itemId == null) {
             return RDFUtils.bnode();

http://git-wip-us.apache.org/repos/asf/any23/blob/8b951d8e/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
index dfb9de6..95fd94b 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
@@ -19,6 +19,7 @@ package org.apache.any23.extractor.microdata;
 import org.apache.any23.extractor.html.DomUtils;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.commons.lang.StringUtils;
+import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.Literal;
 import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil;
 import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
@@ -31,7 +32,6 @@ import org.w3c.dom.traversal.NodeFilter;
 import org.w3c.dom.traversal.TreeWalker;
 
 import java.io.PrintStream;
-import java.net.URL;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -47,6 +47,7 @@ import java.util.Set;
  * nodes contained within a <i>DOM</i> document.
  *
  * @author Michele Mostarda (mostarda@fbk.eu)
+ * @author Hans Brende (hansbrende@apache.org)
  */
 public class MicrodataParser {
 
@@ -578,12 +579,31 @@ public class MicrodataParser {
             itemProps.add(deferredProperty);
         }
 
-        URL type;
-        try {
-            type = ItemScope.stringToUrl(itemType);
-        } catch (IllegalArgumentException e) {
-            manageError(new MicrodataParserException(e.getMessage(), node));
-            type = null;
+        List<IRI> types;
+        if (itemType == null) {
+            types = Collections.emptyList();
+        } else {
+            types = new ArrayList<>();
+            boolean canConcatWithPrev = false;
+            for (String s : itemType.trim().split("\\s+")) {
+                try {
+                    canConcatWithPrev = types.addAll(ItemScope.stringToSingletonIRI(s));
+                } catch (RuntimeException e) {
+                    if (canConcatWithPrev) {
+                        int lastInd = types.size() - 1;
+                        try {
+                            List<IRI> secondTry = ItemScope.stringToSingletonIRI(types.get(lastInd).stringValue() + " " + s);
+                            types.remove(lastInd);
+                            canConcatWithPrev = types.addAll(secondTry);
+                        } catch (RuntimeException e2) {
+                            manageError(new MicrodataParserException(e.getMessage(), node));
+                            canConcatWithPrev = false;
+                        }
+                    } else {
+                        manageError(new MicrodataParserException(e.getMessage(), node));
+                    }
+                }
+            }
         }
 
         final ItemScope newItemScope = new ItemScope(
@@ -591,7 +611,7 @@ public class MicrodataParser {
                 itemProps.toArray(new ItemProp[itemProps.size()]),
                 id,
                 itemrefIDs,
-                type,
+                types,
                 itemId
         );
         itemScopes.put(node, newItemScope);