You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by ha...@apache.org on 2018/10/24 00:44:55 UTC
any23 git commit: ANY23-404 hardcode default microdata registry
Repository: any23
Updated Branches:
refs/heads/master 7cbd82e88 -> 6b1469152
ANY23-404 hardcode default microdata registry
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/6b146915
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/6b146915
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/6b146915
Branch: refs/heads/master
Commit: 6b1469152ccd30f93b0686a73bd1ba02955d6411
Parents: 7cbd82e
Author: Hans <fi...@gmail.com>
Authored: Tue Oct 23 19:37:37 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Tue Oct 23 19:37:37 2018 -0500
----------------------------------------------------------------------
.../extractor/microdata/MicrodataExtractor.java | 54 ++++++++++++--------
.../microdata/MicrodataExtractorTest.java | 21 ++++++++
.../src/test/resources/microdata/example2.html | 28 ++++++++++
.../src/test/resources/microdata/example5.html | 31 +++++++++++
4 files changed, 113 insertions(+), 21 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
index 3663800..3b45dd4 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
@@ -64,8 +64,6 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
private String documentLanguage;
- private IRI defaultNamespace;
-
@Override
public ExtractorDescription getDescription() {
return MicrodataExtractorFactory.getDescriptionInstance();
@@ -95,7 +93,10 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
return;
}
+ final IRI documentIRI = extractionContext.getDocumentIRI();
+
boolean isStrict = extractionParameters.getFlag("any23.microdata.strict");
+ final IRI defaultNamespace;
if (!isStrict) {
defaultNamespace = RDFUtils.iri(extractionParameters.getProperty("any23.microdata.ns.default"));
if (!defaultNamespace.getLocalName().isEmpty()) {
@@ -110,10 +111,9 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
/**
* 5.2.6
*/
- final IRI documentIRI = extractionContext.getDocumentIRI();
final Map<ItemScope, Resource> mappings = new HashMap<>();
for (ItemScope itemScope : itemScopes) {
- Resource subject = processType(itemScope, documentIRI, out, mappings);
+ Resource subject = processType(itemScope, documentIRI, out, mappings, defaultNamespace);
out.writeTriple(
documentIRI,
MICRODATA_ITEM,
@@ -417,26 +417,31 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
private Resource processType(
ItemScope itemScope,
IRI documentIRI, ExtractionResult out,
- Map<ItemScope, Resource> mappings
+ Map<ItemScope, Resource> mappings, IRI defaultNamespace
) throws ExtractionException {
Resource subject = mappings.computeIfAbsent(itemScope, scope -> createSubjectForItemId(scope.getItemId()));
IRI itemScopeType = getType(itemScope);
if (itemScopeType != null) {
out.writeTriple(subject, RDF.TYPE, itemScopeType);
+ defaultNamespace = getNamespaceIRI(itemScopeType);
}
for (Map.Entry<String, List<ItemProp>> itemProps : itemScope.getProperties().entrySet()) {
String propName = itemProps.getKey();
+ IRI predicate = getPredicate(defaultNamespace, propName);
+ if (predicate == null) {
+ continue;
+ }
for (ItemProp itemProp : itemProps.getValue()) {
try {
processProperty(
subject,
- propName,
+ predicate,
itemProp,
- itemScopeType,
documentIRI,
mappings,
- out
+ out,
+ defaultNamespace
);
} catch (URISyntaxException e) {
throw new ExtractionException(
@@ -461,40 +466,47 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
private void processProperty(
Resource subject,
- String propName,
+ IRI predicate,
ItemProp itemProp,
- IRI itemScopeType,
IRI documentIRI,
Map<ItemScope, Resource> mappings,
- ExtractionResult out
+ ExtractionResult out,
+ IRI defaultNamespace
) throws URISyntaxException, ExtractionException {
- IRI predicate = getPredicate(itemScopeType != null ? itemScopeType : defaultNamespace, propName);
- if (predicate == null) {
- return;
- }
-
Value value;
Object propValue = itemProp.getValue().getContent();
ItemPropValue.Type propType = itemProp.getValue().getType();
if (propType.equals(ItemPropValue.Type.Nested)) {
- value = processType((ItemScope) propValue, documentIRI, out, mappings);
+ value = processType((ItemScope) propValue, documentIRI, out, mappings, defaultNamespace);
} else if (propType.equals(ItemPropValue.Type.Plain)) {
value = RDFUtils.literal((String) propValue, documentLanguage);
} else if (propType.equals(ItemPropValue.Type.Link)) {
value = toAbsoluteIRI(documentIRI, (String)propValue);
+ //TODO: support vocabulary registries so that hardcoding schema.org's additionalType is no longer needed
+ if (predicate.stringValue().equals("http://schema.org/additionalType")) {
+ out.writeTriple(subject, RDF.TYPE, value);
+ }
} else if (propType.equals(ItemPropValue.Type.Date)) {
value = RDFUtils.literal(ItemPropValue.formatDateTime((Date) propValue), XMLSchema.DATE);
} else {
throw new RuntimeException("Invalid Type '" +
- propType + "' for ItemPropValue with name: '" + propName + "'");
+ propType + "' for ItemPropValue with name: '" + predicate + "'");
}
out.writeTriple(subject, predicate, value);
}
- private static IRI getPredicate(IRI itemType, String localName) {
- return toAbsoluteIRI(localName).orElseGet(() -> itemType == null ? null :
- RDFUtils.iri(itemType.getNamespace(), localName.trim()));
+ private static final String hcardPrefix = "http://microformats.org/profile/hcard";
+ private static final IRI hcardNamespaceIRI = RDFUtils.iri("http://microformats.org/profile/hcard#");
+
+ private static IRI getNamespaceIRI(IRI itemType) {
+ //TODO: support vocabulary registries so that hardcoding the hcard namespace is no longer needed
+ return itemType.stringValue().startsWith(hcardPrefix) ? hcardNamespaceIRI : itemType;
+ }
+
+ private static IRI getPredicate(IRI namespaceIRI, String localName) {
+ return toAbsoluteIRI(localName).orElseGet(() -> namespaceIRI == null ? null :
+ RDFUtils.iri(namespaceIRI.getNamespace(), localName.trim()));
}
private static Optional<IRI> toAbsoluteIRI(String urlString) {
http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
index e858ea3..fedd5fa 100644
--- a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
@@ -24,6 +24,7 @@ import org.apache.any23.extractor.html.AbstractExtractorTestCase;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.vocab.SINDICE;
import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -78,6 +79,26 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase {
}
@Test
+ public void testExample2() {
+ //Property URI generation for hcard
+ assertExtract("/microdata/example2.html");
+ assertContains(null, RDF.TYPE, RDFUtils.iri("http://microformats.org/profile/hcard"));
+ assertContains(null, RDFUtils.iri("http://microformats.org/profile/hcard#given-name"), (Value)null);
+ assertContains(null, RDFUtils.iri("http://microformats.org/profile/hcard#n"), (Value)null);
+ }
+
+ @Test
+ public void testExample5() {
+ //Vocabulary expansion for schema.org
+ assertExtract("/microdata/example5.html");
+ assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Person"));
+ assertContains(null, RDF.TYPE, RDFUtils.iri("http://xmlns.com/foaf/0.1/Person"));
+ assertContains(null, RDFUtils.iri("http://schema.org/additionalType"), RDFUtils.iri("http://xmlns.com/foaf/0.1/Person"));
+ assertContains(null, RDFUtils.iri("http://schema.org/email"), RDFUtils.iri("mailto:mail@gmail.com"));
+ assertContains(null, RDFUtils.iri("http://xmlns.com/foaf/0.1/mbox"), RDFUtils.iri("mailto:mail@gmail.com"));
+ }
+
+ @Test
public void testMicrodataBasic() {
assertExtract("/microdata/microdata-basic.html");
assertModelNotEmpty();
http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/test-resources/src/test/resources/microdata/example2.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/microdata/example2.html b/test-resources/src/test/resources/microdata/example2.html
new file mode 100644
index 0000000..6ad5a33
--- /dev/null
+++ b/test-resources/src/test/resources/microdata/example2.html
@@ -0,0 +1,28 @@
+<!DOCTYPE html>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- source: http://w3c.github.io/microdata-rdf -->
+
+<html lang="en">
+<body>
+<span itemscope itemtype="http://microformats.org/profile/hcard">
+ <span itemprop="n" itemscope>
+ <span itemprop="given-name">Princeton</span>
+ </span>
+</span>
+</body>
+</html>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/test-resources/src/test/resources/microdata/example5.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/microdata/example5.html b/test-resources/src/test/resources/microdata/example5.html
new file mode 100644
index 0000000..ba05051
--- /dev/null
+++ b/test-resources/src/test/resources/microdata/example5.html
@@ -0,0 +1,31 @@
+<!DOCTYPE html>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- source: http://w3c.github.io/microdata-rdf -->
+
+<html lang="en">
+<head>
+</head>
+<body>
+<div itemscope itemtype="http://schema.org/Person">
+ <link itemprop="additionalType" href="http://xmlns.com/foaf/0.1/Person"/>
+ <a itemprop="email http://xmlns.com/foaf/0.1/mbox" href="mailto:mail@gmail.com">
+ mail@gmail.com
+ </a>
+</div>
+</body>
+</html>
\ No newline at end of file