You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by ha...@apache.org on 2018/10/24 21:45:26 UTC
any23 git commit: ANY23-409 allow multiple microdata itemtype values
Repository: any23
Updated Branches:
refs/heads/master a58d59e35 -> 8b951d8e0
ANY23-409 allow multiple microdata itemtype values
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/8b951d8e
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/8b951d8e
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/8b951d8e
Branch: refs/heads/master
Commit: 8b951d8e06ed5ad941ec4ba452532bb93d04a057
Parents: a58d59e
Author: Hans <fi...@gmail.com>
Authored: Wed Oct 24 16:36:12 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Wed Oct 24 16:36:12 2018 -0500
----------------------------------------------------------------------
.../any23/extractor/microdata/ItemScope.java | 63 +++++++++++---------
.../extractor/microdata/MicrodataExtractor.java | 21 +++----
.../extractor/microdata/MicrodataParser.java | 36 ++++++++---
3 files changed, 73 insertions(+), 47 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/8b951d8e/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java b/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
index 2f079bb..1612aad 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
@@ -17,14 +17,17 @@
package org.apache.any23.extractor.microdata;
+import org.apache.any23.rdf.RDFUtils;
import org.apache.commons.lang.StringUtils;
import org.eclipse.rdf4j.common.net.ParsedIRI;
+import org.eclipse.rdf4j.model.IRI;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
+import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -34,6 +37,7 @@ import java.util.regex.Pattern;
* This class describes a <b>Microdata <i>itemscope</i></b>.
*
* @author Michele Mostarda (mostarda@fbk.eu)
+ * @author Hans Brende (hansbrende@apache.org)
*/
public class ItemScope extends Item {
@@ -55,7 +59,7 @@ public class ItemScope extends Item {
/**
* <i>itemscope</i> type.
*/
- private final URL type;
+ private final List<IRI> type;
/**
* <i>itemscope</i> external identifier.
@@ -73,44 +77,39 @@ public class ItemScope extends Item {
* @param itemId <i>itemscope</i> id. Can be <code>null</code>.
*/
public ItemScope(String xpath, ItemProp[] itemProps, String id, String[] refs, String type, String itemId) {
- this(xpath, itemProps, id, refs, stringToUrl(type), itemId);
+ this(xpath, itemProps, id, refs, stringToSingletonIRI(type), itemId);
}
private static final Pattern looksLikeStartsWithHost = Pattern.compile("[^:/.]+(\\.[^:/.]+)+(:\\d+)?([/#?].*)?");
- static URL stringToUrl(String type) {
+ static List<IRI> stringToSingletonIRI(String type) {
if (StringUtils.isNotBlank(type)) {
- try {
- ParsedIRI iri = ParsedIRI.create(type.trim());
- if (StringUtils.isBlank(iri.getScheme())) {
- String host = iri.getHost();
- if (StringUtils.isNotBlank(host)) {
- iri = new ParsedIRI("http", iri.getUserInfo(), host, iri.getPort(), iri.getPath(), iri.getQuery(), iri.getFragment());
- } else {
- String path = iri.getPath();
- if (path != null && looksLikeStartsWithHost.matcher(path).matches()) {
- iri = ParsedIRI.create("http://" + iri.toString());
- }
+ ParsedIRI iri = ParsedIRI.create(type.trim());
+ if (StringUtils.isBlank(iri.getScheme())) {
+ String host = iri.getHost();
+ if (StringUtils.isNotBlank(host)) {
+ iri = new ParsedIRI("http", iri.getUserInfo(), host, iri.getPort(), iri.getPath(), iri.getQuery(), iri.getFragment());
+ } else {
+ String path = iri.getPath();
+ if (path != null && looksLikeStartsWithHost.matcher(path).matches()) {
+ iri = ParsedIRI.create("http://" + iri.toString());
}
}
-
- return new URL(iri.toString());
- } catch (MalformedURLException murle) {
- throw new IllegalArgumentException("Invalid type '" + type + "', must be a valid URL. " + murle.getMessage());
}
+ return Collections.singletonList(RDFUtils.iri(iri.toString()));
} else {
- return null;
+ return Collections.emptyList();
}
}
- ItemScope(String xpath, ItemProp[] itemProps, String id, String[] refs, URL type, String itemId) {
+ ItemScope(String xpath, ItemProp[] itemProps, String id, String[] refs, List<IRI> types, String itemId) {
super(xpath);
if (itemProps == null) {
throw new NullPointerException("itemProps list cannot be null.");
}
- this.type = type;
+ this.type = types;
this.id = id;
this.refs = refs;
this.itemId = itemId;
@@ -162,6 +161,20 @@ public class ItemScope extends Item {
* @return <i>itemscope</i> type.
*/
public URL getType() {
+ //No longer using URL.
+ //But for backwards compatibility:
+ try {
+ return type.isEmpty() ? null : new URL(type.get(0).stringValue());
+ } catch (MalformedURLException e) {
+ try {
+ return new URL(ParsedIRI.create(type.get(0).stringValue()).toASCIIString());
+ } catch (Exception e1) {
+ return null;
+ }
+ }
+ }
+
+ List<IRI> getTypes() {
return type;
}
@@ -200,7 +213,7 @@ public class ItemScope extends Item {
getXpath(),
id == null ? null : "\"" + id + "\"",
refs == null ? null : toJSON(refs),
- type == null ? null : "\"" + type + "\"",
+ type.isEmpty() ? null : "\"" + type.get(0) + "\"",
itemId == null ? null : "\"" + itemId + "\"",
sb.toString()
);
@@ -248,11 +261,7 @@ public class ItemScope extends Item {
}
protected void acquireProperty(ItemProp itemProp) {
- List<ItemProp> itemProps = properties.get(itemProp.getName());
- if (itemProps == null) {
- itemProps = new ArrayList<>();
- properties.put(itemProp.getName(), itemProps);
- }
+ List<ItemProp> itemProps = properties.computeIfAbsent(itemProp.getName(), k -> new ArrayList<>());
if (!itemProps.contains(itemProp))
itemProps.add(itemProp);
}
http://git-wip-us.apache.org/repos/asf/any23/blob/8b951d8e/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
index 1e1f021..efd54e9 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
@@ -41,7 +41,6 @@ import org.w3c.dom.NodeList;
import java.io.IOException;
import java.net.URISyntaxException;
-import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
@@ -419,12 +418,15 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
IRI documentIRI, ExtractionResult out,
Map<ItemScope, Resource> mappings, IRI defaultNamespace
) throws ExtractionException {
- Resource subject = mappings.computeIfAbsent(itemScope, scope -> createSubjectForItemId(documentIRI, scope.getItemId()));
-
- IRI itemScopeType = getType(itemScope);
- if (itemScopeType != null) {
- out.writeTriple(subject, RDF.TYPE, itemScopeType);
- defaultNamespace = getNamespaceIRI(itemScopeType);
+ Resource subject = mappings.computeIfAbsent(itemScope, scope ->
+ createSubjectForItemId(documentIRI, scope.getItemId()));
+
+ List<IRI> itemScopeTypes = itemScope.getTypes();
+ if (!itemScopeTypes.isEmpty()) {
+ defaultNamespace = getNamespaceIRI(itemScopeTypes.get(0));
+ for (IRI type : itemScopeTypes) {
+ out.writeTriple(subject, RDF.TYPE, type);
+ }
}
for (Map.Entry<String, List<ItemProp>> itemProps : itemScope.getProperties().entrySet()) {
String propName = itemProps.getKey();
@@ -454,11 +456,6 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
return subject;
}
- private static IRI getType(ItemScope scope) {
- URL type = scope.getType();
- return type == null ? null : RDFUtils.iri(type.toString());
- }
-
private static Resource createSubjectForItemId(IRI documentIRI, String itemId) {
if (itemId == null) {
return RDFUtils.bnode();
http://git-wip-us.apache.org/repos/asf/any23/blob/8b951d8e/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
index dfb9de6..95fd94b 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
@@ -19,6 +19,7 @@ package org.apache.any23.extractor.microdata;
import org.apache.any23.extractor.html.DomUtils;
import org.apache.any23.rdf.RDFUtils;
import org.apache.commons.lang.StringUtils;
+import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil;
import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
@@ -31,7 +32,6 @@ import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.TreeWalker;
import java.io.PrintStream;
-import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -47,6 +47,7 @@ import java.util.Set;
* nodes contained within a <i>DOM</i> document.
*
* @author Michele Mostarda (mostarda@fbk.eu)
+ * @author Hans Brende (hansbrende@apache.org)
*/
public class MicrodataParser {
@@ -578,12 +579,31 @@ public class MicrodataParser {
itemProps.add(deferredProperty);
}
- URL type;
- try {
- type = ItemScope.stringToUrl(itemType);
- } catch (IllegalArgumentException e) {
- manageError(new MicrodataParserException(e.getMessage(), node));
- type = null;
+ List<IRI> types;
+ if (itemType == null) {
+ types = Collections.emptyList();
+ } else {
+ types = new ArrayList<>();
+ boolean canConcatWithPrev = false;
+ for (String s : itemType.trim().split("\\s+")) {
+ try {
+ canConcatWithPrev = types.addAll(ItemScope.stringToSingletonIRI(s));
+ } catch (RuntimeException e) {
+ if (canConcatWithPrev) {
+ int lastInd = types.size() - 1;
+ try {
+ List<IRI> secondTry = ItemScope.stringToSingletonIRI(types.get(lastInd).stringValue() + " " + s);
+ types.remove(lastInd);
+ canConcatWithPrev = types.addAll(secondTry);
+ } catch (RuntimeException e2) {
+ manageError(new MicrodataParserException(e.getMessage(), node));
+ canConcatWithPrev = false;
+ }
+ } else {
+ manageError(new MicrodataParserException(e.getMessage(), node));
+ }
+ }
+ }
}
final ItemScope newItemScope = new ItemScope(
@@ -591,7 +611,7 @@ public class MicrodataParser {
itemProps.toArray(new ItemProp[itemProps.size()]),
id,
itemrefIDs,
- type,
+ types,
itemId
);
itemScopes.put(node, newItemScope);