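You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain text copy; the commit itself is referenced by the Project/Commit/Tree/Diff URLs below.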
Posted to commits@any23.apache.org by ha...@apache.org on 2018/10/24 00:44:55 UTC

any23 git commit: ANY23-404 hardcode default microdata registry

Repository: any23
Updated Branches:
  refs/heads/master 7cbd82e88 -> 6b1469152


ANY23-404 hardcode default microdata registry


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/6b146915
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/6b146915
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/6b146915

Branch: refs/heads/master
Commit: 6b1469152ccd30f93b0686a73bd1ba02955d6411
Parents: 7cbd82e
Author: Hans <fi...@gmail.com>
Authored: Tue Oct 23 19:37:37 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Tue Oct 23 19:37:37 2018 -0500

----------------------------------------------------------------------
 .../extractor/microdata/MicrodataExtractor.java | 54 ++++++++++++--------
 .../microdata/MicrodataExtractorTest.java       | 21 ++++++++
 .../src/test/resources/microdata/example2.html  | 28 ++++++++++
 .../src/test/resources/microdata/example5.html  | 31 +++++++++++
 4 files changed, 113 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
index 3663800..3b45dd4 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
@@ -64,8 +64,6 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
 
     private String documentLanguage;
 
-    private IRI defaultNamespace;
-
     @Override
     public ExtractorDescription getDescription() {
         return MicrodataExtractorFactory.getDescriptionInstance();
@@ -95,7 +93,10 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
             return;
         }
 
+        final IRI documentIRI = extractionContext.getDocumentIRI();
+
         boolean isStrict = extractionParameters.getFlag("any23.microdata.strict");
+        final IRI defaultNamespace;
         if (!isStrict) {
             defaultNamespace = RDFUtils.iri(extractionParameters.getProperty("any23.microdata.ns.default"));
             if (!defaultNamespace.getLocalName().isEmpty()) {
@@ -110,10 +111,9 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
         /**
          * 5.2.6
          */
-        final IRI documentIRI = extractionContext.getDocumentIRI();
         final Map<ItemScope, Resource> mappings = new HashMap<>();
         for (ItemScope itemScope : itemScopes) {
-            Resource subject = processType(itemScope, documentIRI, out, mappings);
+            Resource subject = processType(itemScope, documentIRI, out, mappings, defaultNamespace);
             out.writeTriple(
                     documentIRI,
                     MICRODATA_ITEM,
@@ -417,26 +417,31 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
     private Resource processType(
             ItemScope itemScope,
             IRI documentIRI, ExtractionResult out,
-            Map<ItemScope, Resource> mappings
+            Map<ItemScope, Resource> mappings, IRI defaultNamespace
     ) throws ExtractionException {
         Resource subject = mappings.computeIfAbsent(itemScope, scope -> createSubjectForItemId(scope.getItemId()));
 
         IRI itemScopeType = getType(itemScope);
         if (itemScopeType != null) {
             out.writeTriple(subject, RDF.TYPE, itemScopeType);
+            defaultNamespace = getNamespaceIRI(itemScopeType);
         }
         for (Map.Entry<String, List<ItemProp>> itemProps : itemScope.getProperties().entrySet()) {
             String propName = itemProps.getKey();
+            IRI predicate = getPredicate(defaultNamespace, propName);
+            if (predicate == null) {
+                continue;
+            }
             for (ItemProp itemProp : itemProps.getValue()) {
                 try {
                     processProperty(
                             subject,
-                            propName,
+                            predicate,
                             itemProp,
-                            itemScopeType,
                             documentIRI,
                             mappings,
-                            out
+                            out,
+                            defaultNamespace
                     );
                 } catch (URISyntaxException e) {
                     throw new ExtractionException(
@@ -461,40 +466,47 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
 
     private void processProperty(
             Resource subject,
-            String propName,
+            IRI predicate,
             ItemProp itemProp,
-            IRI itemScopeType,
             IRI documentIRI,
             Map<ItemScope, Resource> mappings,
-            ExtractionResult out
+            ExtractionResult out,
+            IRI defaultNamespace
     ) throws URISyntaxException, ExtractionException {
 
-        IRI predicate = getPredicate(itemScopeType != null ? itemScopeType : defaultNamespace, propName);
-        if (predicate == null) {
-            return;
-        }
-
         Value value;
         Object propValue = itemProp.getValue().getContent();
         ItemPropValue.Type propType = itemProp.getValue().getType();
         if (propType.equals(ItemPropValue.Type.Nested)) {
-            value = processType((ItemScope) propValue, documentIRI, out, mappings);
+            value = processType((ItemScope) propValue, documentIRI, out, mappings, defaultNamespace);
         } else if (propType.equals(ItemPropValue.Type.Plain)) {
             value = RDFUtils.literal((String) propValue, documentLanguage);
         } else if (propType.equals(ItemPropValue.Type.Link)) {
             value = toAbsoluteIRI(documentIRI, (String)propValue);
+            //TODO: support registries so hardcoding not needed
+            if (predicate.stringValue().equals("http://schema.org/additionalType")) {
+                out.writeTriple(subject, RDF.TYPE, value);
+            }
         } else if (propType.equals(ItemPropValue.Type.Date)) {
             value = RDFUtils.literal(ItemPropValue.formatDateTime((Date) propValue), XMLSchema.DATE);
         } else {
             throw new RuntimeException("Invalid Type '" +
-                    propType + "' for ItemPropValue with name: '" + propName + "'");
+                    propType + "' for ItemPropValue with name: '" + predicate + "'");
         }
         out.writeTriple(subject, predicate, value);
     }
 
-    private static IRI getPredicate(IRI itemType, String localName) {
-        return toAbsoluteIRI(localName).orElseGet(() -> itemType == null ? null :
-                RDFUtils.iri(itemType.getNamespace(), localName.trim()));
+    private static final String hcardPrefix    = "http://microformats.org/profile/hcard";
+    private static final IRI hcardNamespaceIRI = RDFUtils.iri("http://microformats.org/profile/hcard#");
+
+    private static IRI getNamespaceIRI(IRI itemType) {
+        //TODO: support registries so hardcoding not needed
+        return itemType.stringValue().startsWith(hcardPrefix) ? hcardNamespaceIRI : itemType;
+    }
+
+    private static IRI getPredicate(IRI namespaceIRI, String localName) {
+        return toAbsoluteIRI(localName).orElseGet(() -> namespaceIRI == null ? null :
+                RDFUtils.iri(namespaceIRI.getNamespace(), localName.trim()));
     }
 
     private static Optional<IRI> toAbsoluteIRI(String urlString) {

http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
index e858ea3..fedd5fa 100644
--- a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
@@ -24,6 +24,7 @@ import org.apache.any23.extractor.html.AbstractExtractorTestCase;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.SINDICE;
 import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.Value;
 import org.eclipse.rdf4j.model.vocabulary.RDF;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -78,6 +79,26 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase {
     }
 
     @Test
+    public void testExample2() {
+        //Property URI generation for hcard
+        assertExtract("/microdata/example2.html");
+        assertContains(null, RDF.TYPE, RDFUtils.iri("http://microformats.org/profile/hcard"));
+        assertContains(null, RDFUtils.iri("http://microformats.org/profile/hcard#given-name"), (Value)null);
+        assertContains(null, RDFUtils.iri("http://microformats.org/profile/hcard#n"), (Value)null);
+    }
+
+    @Test
+    public void testExample5() {
+        //Vocabulary expansion for schema.org
+        assertExtract("/microdata/example5.html");
+        assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Person"));
+        assertContains(null, RDF.TYPE, RDFUtils.iri("http://xmlns.com/foaf/0.1/Person"));
+        assertContains(null, RDFUtils.iri("http://schema.org/additionalType"), RDFUtils.iri("http://xmlns.com/foaf/0.1/Person"));
+        assertContains(null, RDFUtils.iri("http://schema.org/email"), RDFUtils.iri("mailto:mail@gmail.com"));
+        assertContains(null, RDFUtils.iri("http://xmlns.com/foaf/0.1/mbox"), RDFUtils.iri("mailto:mail@gmail.com"));
+    }
+
+    @Test
     public void testMicrodataBasic() {
         assertExtract("/microdata/microdata-basic.html");
         assertModelNotEmpty();

http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/test-resources/src/test/resources/microdata/example2.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/microdata/example2.html b/test-resources/src/test/resources/microdata/example2.html
new file mode 100644
index 0000000..6ad5a33
--- /dev/null
+++ b/test-resources/src/test/resources/microdata/example2.html
@@ -0,0 +1,28 @@
+<!DOCTYPE html>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- source: http://w3c.github.io/microdata-rdf -->
+
+<html lang="en">
+<body>
+<span itemscope itemtype="http://microformats.org/profile/hcard">
+  <span itemprop="n" itemscope>
+    <span itemprop="given-name">Princeton</span>
+  </span>
+</span>
+</body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/test-resources/src/test/resources/microdata/example5.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/microdata/example5.html b/test-resources/src/test/resources/microdata/example5.html
new file mode 100644
index 0000000..ba05051
--- /dev/null
+++ b/test-resources/src/test/resources/microdata/example5.html
@@ -0,0 +1,31 @@
+<!DOCTYPE html>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- source: http://w3c.github.io/microdata-rdf -->
+
+<html lang="en">
+<head>
+</head>
+<body>
+<div itemscope itemtype="http://schema.org/Person">
+    <link itemprop="additionalType" href="http://xmlns.com/foaf/0.1/Person"/>
+    <a itemprop="email http://xmlns.com/foaf/0.1/mbox" href="mailto:mail@gmail.com">
+        mail@gmail.com
+    </a>
+</div>
+</body>
+</html>
\ No newline at end of file