You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by wk...@apache.org on 2012/12/19 11:06:11 UTC
svn commit: r1423812 [1/2] - in
/stanbol/trunk/enhancement-engines/htmlextractor/src:
main/java/org/apache/stanbol/enhancer/engines/htmlextractor/
main/resources/ main/resources/xslt/
test/java/org/apache/stanbol/enhancer/engines/htmlextractor/ test/re...
Author: wkasper
Date: Wed Dec 19 10:06:10 2012
New Revision: 1423812
URL: http://svn.apache.org/viewvc?rev=1423812&view=rev
Log:
STANBOL-771: Add support for Microdata extraction from HTML-5 pages
Added:
stanbol/trunk/enhancement-engines/htmlextractor/src/main/resources/xslt/microdata.xsl
stanbol/trunk/enhancement-engines/htmlextractor/src/test/resources/test-microdata.html
Modified:
stanbol/trunk/enhancement-engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/HtmlExtractorEngine.java
stanbol/trunk/enhancement-engines/htmlextractor/src/main/resources/htmlextractors.xml
stanbol/trunk/enhancement-engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java
Modified: stanbol/trunk/enhancement-engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/HtmlExtractorEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/HtmlExtractorEngine.java?rev=1423812&r1=1423811&r2=1423812&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/HtmlExtractorEngine.java (original)
+++ stanbol/trunk/enhancement-engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/HtmlExtractorEngine.java Wed Dec 19 10:06:10 2012
@@ -46,7 +46,6 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
import org.osgi.framework.BundleContext;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
@@ -101,7 +100,10 @@ public class HtmlExtractorEngine extends
private HtmlParser htmlParser;
private boolean singleRootRdf = true;
-
+
+ // define the Nepomuk NIE namespace locally here
+ private static final String NIE_NS = "http://www.semanticdesktop.org/ontologies/2007/01/19/nie#";
+
protected void activate(ComponentContext ce) throws ConfigurationException, IOException {
super.activate(ce);
this.bundleContext = ce.getBundleContext();
@@ -164,7 +166,7 @@ public class HtmlExtractorEngine extends
ClerezzaRDFUtils.urifyBlankNodes(model);
// make the model single rooted
if (singleRootRdf) {
- ClerezzaRDFUtils.makeConnected(model,ci.getUri(),new UriRef(NamespaceEnum.nie+"contains"));
+ ClerezzaRDFUtils.makeConnected(model,ci.getUri(),new UriRef(NIE_NS+"contains"));
}
//add the extracted triples to the metadata of the ContentItem
ci.getLock().writeLock().lock();
Modified: stanbol/trunk/enhancement-engines/htmlextractor/src/main/resources/htmlextractors.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/htmlextractor/src/main/resources/htmlextractors.xml?rev=1423812&r1=1423811&r2=1423812&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/htmlextractor/src/main/resources/htmlextractors.xml (original)
+++ stanbol/trunk/enhancement-engines/htmlextractor/src/main/resources/htmlextractors.xml Wed Dec 19 10:06:10 2012
@@ -22,4 +22,7 @@
<extractor id="RDFa">
<source type="xslt">xslt/rdfa.xslt</source>
</extractor>
+ <extractor id="MD">
+ <source type="xslt">xslt/microdata.xsl</source>
+ </extractor>
</htmlextractors>
Added: stanbol/trunk/enhancement-engines/htmlextractor/src/main/resources/xslt/microdata.xsl
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/htmlextractor/src/main/resources/xslt/microdata.xsl?rev=1423812&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/htmlextractor/src/main/resources/xslt/microdata.xsl (added)
+++ stanbol/trunk/enhancement-engines/htmlextractor/src/main/resources/xslt/microdata.xsl Wed Dec 19 10:06:10 2012
@@ -0,0 +1,694 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ version="1.0" xmlns:h="http://www.w3.org/1999/xhtml"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:nfo="http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#"
+ xmlns:nie="http://www.semanticdesktop.org/ontologies/2007/01/19/nie#"
+ xmlns:owl="http://www.w3.org/2002/07/owl#">
+
+ <xsl:output indent="yes" method="xml" media-type="application/rdf+xml"
+ encoding="UTF-8" omit-xml-declaration="yes" />
+
+ <!-- base of the current XML doc -->
+ <xsl:variable name='xml_base' select="//*/@xml:base[position()=1]" />
+
+ <!-- base of the current HTML doc -->
+ <xsl:variable name='html_base' select="//*/head/base[position()=1]/@href" />
+
+ <!-- url of the current XHTML page if provided by the XSLT engine -->
+ <xsl:param name='uri' select="'http://foobar.com/'" />
+
+ <!-- this contains the URL of the source document whether it was provided
+ by the base or as a parameter e.g. http://example.org/bla/file.html -->
+ <xsl:variable name='this'>
+ <xsl:choose>
+ <xsl:when test="string-length($html_base)>0">
+ <xsl:value-of select="$html_base" />
+ </xsl:when>
+ <xsl:when test="string-length($xml_base)>0">
+ <xsl:value-of select="$xml_base" />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$uri" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <!-- this_location contains the location of the source document e.g. http://example.org/bla/ -->
+ <xsl:variable name='this_location'>
+ <xsl:call-template name="get-location">
+ <xsl:with-param name="url" select="$this" />
+ </xsl:call-template>
+ </xsl:variable>
+
+ <!-- this_root contains the root location of the source document e.g. http://example.org/ -->
+ <xsl:variable name='this_root'>
+ <xsl:call-template name="get-root">
+ <xsl:with-param name="url" select="$this" />
+ </xsl:call-template>
+ </xsl:variable>
+
+
+ <!-- specify a registry for vocabs to parametrize mappings to RDF? -->
+ <!-- this bottom up approach does not allow for list-valued properties -->
+
+ <!-- templates for parsing - - - - - - - - - - - - - - - - - - - - - - - -->
+
+ <!--Start the RDF generation -->
+ <xsl:template match="/">
+ <rdf:RDF>
+ <!-- emit RDF only if the page carries any Microdata markup; TODO: attributes can be in list! -->
+ <xsl:if
+ test="descendant::*/@itemscope or descendant::*/@itemprop or descendant::*/@itemtype">
+ <xsl:choose>
+ <xsl:when test="$uri != $this">
+ <nfo:HtmlDocument rdf:about="{$uri}">
+ <owl:sameAs rdf:resource="{$this}" />
+ </nfo:HtmlDocument>
+ <xsl:apply-templates />
+ </xsl:when>
+ <xsl:otherwise>
+ <nfo:HtmlDocument rdf:about="{$uri}" />
+ <xsl:apply-templates />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:if>
+ </rdf:RDF>
+ </xsl:template>
+
+ <!-- TODO: itemref -->
+
+ <xsl:template match="*[attribute::itemscope='']">
+ <xsl:param name="refSource" />
+ <xsl:variable name="itemId">
+ <xsl:call-template name="getNodeId">
+ <xsl:with-param name="node" select="." />
+ </xsl:call-template>
+ </xsl:variable>
+ <xsl:choose>
+ <!-- object properties -->
+ <xsl:when test="attribute::itemprop != ''">
+ <xsl:variable name="subj">
+ <xsl:choose>
+ <xsl:when test="$refSource">
+ <xsl:call-template name="getNodeId">
+ <xsl:with-param name="node" select="$refSource" />
+ </xsl:call-template>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:call-template name="getNodeId">
+ <xsl:with-param name="node" select="(ancestor::*[attribute::itemscope=''])[last()]" />
+ </xsl:call-template>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <!-- TODO itemId of object is not passed to daughter nodes -->
+ <xsl:call-template name="relation">
+ <xsl:with-param name="subject" select="$subj" />
+ <xsl:with-param name="predicate" select="@itemprop" />
+ <xsl:with-param name="object" select="$itemId" />
+ </xsl:call-template>
+ <xsl:if test="attribute::itemtype != ''">
+ <xsl:call-template name="class">
+ <xsl:with-param name="resource" select="$itemId" />
+ <xsl:with-param name="class" select="@itemtype" />
+ </xsl:call-template>
+ </xsl:if>
+ </xsl:when>
+ <xsl:when test="attribute::itemtype != ''">
+ <xsl:call-template name="class">
+ <xsl:with-param name="resource" select="$itemId" />
+ <xsl:with-param name="class" select="@itemtype" />
+ </xsl:call-template>
+ </xsl:when>
+ <xsl:otherwise>
+ </xsl:otherwise>
+ </xsl:choose>
+ <xsl:if test="attribute::itemref">
+ <xsl:call-template name="getReferences">
+ <xsl:with-param name="references" select="@itemref" />
+ <xsl:with-param name="refSource" select="." />
+ </xsl:call-template>
+ </xsl:if>
+ <xsl:apply-templates select="*" />
+ </xsl:template>
+
+ <!-- datatype properties -->
+ <!-- TODO datatype mappings -->
+ <xsl:template match="*[attribute::itemprop != '' and not(attribute::itemscope)]">
+ <xsl:param name="refSource"/>
+ <xsl:variable name="subj">
+ <xsl:choose>
+ <xsl:when test="$refSource">
+ <xsl:call-template name="getNodeId">
+ <xsl:with-param name="node" select="$refSource"/>
+ </xsl:call-template>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:call-template name="getNodeId">
+ <xsl:with-param name="node" select="(ancestor::*[attribute::itemscope=''])[last()]"/>
+ </xsl:call-template>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+ <xsl:variable name="propVal">
+ <xsl:choose>
+ <xsl:when test="attribute::content">
+ <xsl:value-of select="attribute::content" />
+ </xsl:when>
+ <!-- TODO test element names too? -->
+ <xsl:when test="attribute::src">
+ <xsl:call-template name="resolveUri">
+ <xsl:with-param name="base" select="$this" />
+ <xsl:with-param name="ref" select="attribute::src" />
+ </xsl:call-template>
+ </xsl:when>
+ <xsl:when test="attribute::href != ''">
+ <xsl:call-template name="resolveUri">
+ <xsl:with-param name="base" select="$this" />
+ <xsl:with-param name="ref" select="attribute::href" />
+ </xsl:call-template>
+ </xsl:when>
+ <xsl:when test="attribute::datetime">
+ <!-- TODO datetime datatype declaration, value conversion necessary? -->
+ <xsl:value-of select="attribute::datetime"/>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="." />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+ <xsl:call-template name="property">
+ <xsl:with-param name="subject" select="$subj" />
+ <xsl:with-param name="predicate" select="@itemprop" />
+ <xsl:with-param name="object" select="$propVal" />
+ </xsl:call-template>
+ </xsl:template>
+
+ <!-- named templates to process URIs and token lists - - - - - - - - - - -->
+
+ <!-- tokenize a string using space as a delimiter -->
+ <xsl:template name="tokenize">
+ <xsl:param name="string" />
+ <xsl:if test="string-length($string)>0">
+ <xsl:choose>
+ <xsl:when test="contains($string,' ')">
+ <xsl:value-of select="normalize-space(substring-before($string,' '))" />
+ <xsl:call-template name="tokenize">
+ <xsl:with-param name="string"
+ select="normalize-space(substring-after($string,' '))" />
+ </xsl:call-template>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$string" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:if>
+ </xsl:template>
+
+ <!-- get file location from URL -->
+ <xsl:template name="get-location">
+ <xsl:param name="url" />
+ <xsl:if test="string-length($url)>0 and contains($url,'/')">
+ <xsl:value-of select="concat(substring-before($url,'/'),'/')" />
+ <xsl:call-template name="get-location">
+ <xsl:with-param name="url" select="substring-after($url,'/')" />
+ </xsl:call-template>
+ </xsl:if>
+ </xsl:template>
+
+ <!-- get root location (scheme and host with a trailing slash, e.g. http://example.org/) from URL -->
+ <xsl:template name="get-root">
+ <xsl:param name="url" />
+ <xsl:choose>
+ <xsl:when test="contains($url,'//')">
+ <xsl:value-of
+ select="concat(substring-before($url,'//'),'//',substring-before(concat(substring-after($url,'//'),'/'),'/'),'/')" />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="concat($url,'/')" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+ <!-- returns the first token in a list separated by spaces -->
+ <xsl:template name="get-first-token">
+ <xsl:param name="tokens" />
+ <xsl:if test="string-length($tokens)>0">
+ <xsl:choose>
+ <xsl:when test="contains($tokens,' ')">
+ <xsl:value-of select="normalize-space(substring-before($tokens,' '))" />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$tokens" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:if>
+ </xsl:template>
+
+ <!-- returns the namespace for a predicate -->
+ <xsl:template name="get-predicate-ns">
+ <xsl:param name="qname" />
+ <xsl:choose>
+ <xsl:when test="contains($qname,'#')">
+ <xsl:value-of select="concat(substring-before($qname,'#'),'#')"/>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:variable name="uriPrefix">
+ <xsl:call-template name="longestPrefix">
+ <xsl:with-param name="string" select="$qname"/>
+ <xsl:with-param name="sep" select="'/'"/>
+ </xsl:call-template>
+ </xsl:variable>
+ <xsl:choose>
+ <xsl:when test="$uriPrefix = ''">
+ <xsl:value-of select="$this_location"/>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="concat($uriPrefix,'/')"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+ <!-- extract atomic name from URI name -->
+ <xsl:template name="getAtomicPredName">
+ <xsl:param name="qname"/>
+ <xsl:choose>
+ <xsl:when test="contains($qname,'#')">
+ <xsl:value-of select="substring-after($qname,'#')"/>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:call-template name="lastIndexOf">
+ <xsl:with-param name="string" select="$qname"/>
+ <xsl:with-param name="sep" select="'/'"/>
+ </xsl:call-template>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+ <!-- expand the name for a predicate -->
+ <xsl:template name="get-expanded-predicate-name">
+ <xsl:param name="qname" />
+ <!-- TODO multiple type declarations (lists) or missing type -->
+ <xsl:variable name="typeUri" select="(ancestor::*[@itemscope='' and @itemtype!=''])[last()]/@itemtype"/>
+ <xsl:choose>
+ <xsl:when test="$typeUri != ''">
+ <xsl:call-template name="resolveUri">
+ <xsl:with-param name="base" select="$typeUri"/>
+ <xsl:with-param name="ref" select="$qname"/>
+ </xsl:call-template>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="concat($this_location,$qname)" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+ <xsl:template name="getNodeId">
+ <xsl:param name="node" />
+ <xsl:choose>
+ <xsl:when test="$node">
+ <xsl:choose>
+ <xsl:when test="$node/@itemid != ''">
+ <xsl:value-of select="$node/@itemid" />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="concat('blank:node:',generate-id($node))" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$this" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+ <xsl:template name="getReferences">
+ <xsl:param name="references"/>
+ <xsl:param name="refSource"/>
+ <xsl:variable name="oneReference">
+ <xsl:call-template name="get-first-token">
+ <xsl:with-param name="tokens" select="$references" />
+ </xsl:call-template>
+ </xsl:variable>
+ <xsl:for-each select="//*[@id = $oneReference]">
+ <xsl:apply-templates select=".">
+ <xsl:with-param name="refSource" select="$refSource"/>
+ </xsl:apply-templates>
+ </xsl:for-each>
+ <!-- recursive call for multiple references -->
+ <xsl:variable name="otherRefs"
+ select="normalize-space(substring-after($references,' '))" />
+ <xsl:if test="string-length($otherRefs)>0">
+ <xsl:call-template name="getReferences">
+ <xsl:with-param name="references" select="$otherRefs" />
+ <xsl:with-param name="refSource" select="$refSource"/>
+ </xsl:call-template>
+ </xsl:if>
+ </xsl:template>
+
+ <!-- named templates to generate RDF - - - - - - - - - - - - - - - - - - -->
+
+ <!-- generate one RDF statement per whitespace-separated predicate token for a relation (resource-valued) -->
+ <xsl:template name="relation">
+ <xsl:param name="subject" />
+ <xsl:param name="predicate" />
+ <xsl:param name="object" />
+
+ <!-- isolate the first predicate token; remaining tokens are handled by the recursive call below -->
+ <xsl:variable name="single-predicate">
+ <xsl:call-template name="get-first-token">
+ <xsl:with-param name="tokens" select="normalize-space($predicate)" />
+ </xsl:call-template>
+ </xsl:variable>
+ <xsl:variable name="expandedPredName">
+ <xsl:call-template name="get-expanded-predicate-name">
+ <xsl:with-param name="qname" select="$single-predicate"/>
+ </xsl:call-template>
+ </xsl:variable>
+ <xsl:variable name="atomicPredName">
+ <xsl:call-template name="getAtomicPredName">
+ <xsl:with-param name="qname" select="$expandedPredName"/>
+ </xsl:call-template>
+ </xsl:variable>
+ <xsl:variable name="predNs">
+ <xsl:call-template name="get-predicate-ns">
+ <xsl:with-param name="qname" select="$expandedPredName"/>
+ </xsl:call-template>
+ </xsl:variable>
+
+ <xsl:element name="rdf:Description">
+ <xsl:choose>
+ <xsl:when test="starts-with($subject,'blank:node:')">
+ <xsl:attribute name="rdf:nodeID"><xsl:value-of
+ select="substring-after($subject,'blank:node:')" /></xsl:attribute>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:attribute name="rdf:about"><xsl:value-of
+ select="$subject" /></xsl:attribute>
+ </xsl:otherwise>
+ </xsl:choose>
+ <xsl:element name="{$atomicPredName}" namespace="{$predNs}">
+ <xsl:choose>
+ <xsl:when test="starts-with($object,'blank:node:')">
+ <xsl:attribute name="rdf:nodeID"><xsl:value-of
+ select="substring-after($object,'blank:node:')" /></xsl:attribute>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:attribute name="rdf:resource"><xsl:value-of
+ select="$object" /></xsl:attribute>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:element>
+ </xsl:element>
+
+ <!-- recursive call for the remaining predicates -->
+ <xsl:variable name="other-predicates"
+ select="normalize-space(substring-after(normalize-space($predicate),' '))" />
+ <xsl:if test="string-length($other-predicates)>0">
+ <xsl:call-template name="relation">
+ <xsl:with-param name="subject" select="$subject" />
+ <xsl:with-param name="predicate" select="$other-predicates" />
+ <xsl:with-param name="object" select="$object" />
+ </xsl:call-template>
+ </xsl:if>
+
+ </xsl:template>
+
+
+ <!-- generate one RDF statement per whitespace-separated predicate token for a literal-valued property -->
+ <xsl:template name="property">
+ <xsl:param name="subject" />
+ <xsl:param name="predicate" />
+ <xsl:param name="object" />
+ <xsl:param name="datatype" />
+ <xsl:param name="attrib" /> <!-- is the content from an attribute ? true /false -->
+ <xsl:param name="language" />
+
+ <!-- isolate the first predicate token; remaining tokens are handled by the recursive call below -->
+ <xsl:variable name="single-predicate">
+ <xsl:call-template name="get-first-token">
+ <xsl:with-param name="tokens" select="normalize-space($predicate)" />
+ </xsl:call-template>
+ </xsl:variable>
+ <xsl:variable name="expandedPredName">
+ <xsl:call-template name="get-expanded-predicate-name">
+ <xsl:with-param name="qname" select="$single-predicate"/>
+ </xsl:call-template>
+ </xsl:variable>
+ <xsl:variable name="atomicPredName">
+ <xsl:call-template name="getAtomicPredName">
+ <xsl:with-param name="qname" select="$expandedPredName"/>
+ </xsl:call-template>
+ </xsl:variable>
+ <xsl:variable name="predNs">
+ <xsl:call-template name="get-predicate-ns">
+ <xsl:with-param name="qname" select="$expandedPredName"/>
+ </xsl:call-template>
+ </xsl:variable>
+
+ <!-- TODO expand property name with URI -->
+ <xsl:element name="rdf:Description">
+ <xsl:choose>
+ <xsl:when test="starts-with($subject,'blank:node:')">
+ <xsl:attribute name="rdf:nodeID"><xsl:value-of
+ select="substring-after($subject,'blank:node:')" /></xsl:attribute>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:attribute name="rdf:about"><xsl:value-of
+ select="$subject" /></xsl:attribute>
+ </xsl:otherwise>
+ </xsl:choose>
+ <xsl:element name="{$atomicPredName}" namespace="{$predNs}">
+ <xsl:if test="string-length($language)>0">
+ <xsl:attribute name="xml:lang"><xsl:value-of
+ select="$language" /></xsl:attribute>
+ </xsl:if>
+ <xsl:choose>
+ <xsl:when
+ test="$datatype='http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral'">
+ <xsl:choose>
+ <xsl:when test="$attrib='true'"> <!-- content is in an attribute -->
+ <xsl:value-of select="normalize-space(string($object))" />
+ </xsl:when>
+ <xsl:otherwise> <!-- content is in the element and may include some tags -->
+ <!-- <attribute name="rdf:datatype"><value-of select="$datatype"
+ /></attribute> -->
+ <xsl:attribute name="rdf:parseType">
+ <xsl:value-of select="'Literal'" />
+ </xsl:attribute>
+ <xsl:value-of select="normalize-space($object)" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:when>
+ <xsl:when test="string-length($datatype)>0">
+ <!-- there is a datatype other than XMLLiteral -->
+ <xsl:attribute name="rdf:datatype"><xsl:value-of
+ select="$datatype" /></xsl:attribute>
+ <xsl:choose>
+ <xsl:when test="$attrib='true'"> <!-- content is in an attribute -->
+ <xsl:value-of select="normalize-space(string($object))" />
+ </xsl:when>
+ <xsl:otherwise> <!-- content is in the text nodes of the element -->
+ <xsl:value-of select="normalize-space($object)" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:when>
+ <xsl:otherwise> <!-- there is no datatype -->
+ <xsl:choose>
+ <xsl:when test="$attrib='true'"> <!-- content is in an attribute -->
+ <xsl:value-of select="normalize-space(string($object))" />
+ </xsl:when>
+ <xsl:otherwise> <!-- content is in the text nodes of the element -->
+ <xsl:value-of select="normalize-space($object)" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:element>
+ </xsl:element>
+
+ <!-- recursive call for the remaining predicates -->
+ <xsl:variable name="other-predicates"
+ select="normalize-space(substring-after(normalize-space($predicate),' '))" />
+ <xsl:if test="string-length($other-predicates)>0">
+ <xsl:call-template name="property">
+ <xsl:with-param name="subject" select="$subject" />
+ <xsl:with-param name="predicate" select="$other-predicates" />
+ <xsl:with-param name="object" select="$object" />
+ <xsl:with-param name="datatype" select="$datatype" />
+ <xsl:with-param name="attrib" select="$attrib" />
+ <xsl:with-param name="language" select="$language" />
+ </xsl:call-template>
+ </xsl:if>
+
+ </xsl:template>
+
+
+
+ <!-- generate one rdf:type statement per whitespace-separated class URI for a resource -->
+ <xsl:template name="class">
+ <xsl:param name="resource" />
+ <xsl:param name="class" />
+
+ <!-- isolate the first class token; remaining tokens are handled by the recursive call below -->
+ <xsl:variable name="single-class">
+ <xsl:call-template name="get-first-token">
+ <xsl:with-param name="tokens" select="normalize-space($class)" />
+ </xsl:call-template>
+ </xsl:variable>
+
+ <xsl:element name="rdf:Description">
+ <xsl:choose>
+ <xsl:when test="starts-with($resource,'blank:node:')">
+ <xsl:attribute name="rdf:nodeID"><xsl:value-of
+ select="substring-after($resource,'blank:node:')" /></xsl:attribute>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:attribute name="rdf:about"><xsl:value-of
+ select="$resource" /></xsl:attribute>
+ </xsl:otherwise>
+ </xsl:choose>
+ <xsl:element name="rdf:type">
+ <xsl:attribute name="rdf:resource">
+ <xsl:call-template name="correctURI">
+ <xsl:with-param name="uri" select="$single-class"/>
+ </xsl:call-template>
+<!-- <xsl:value-of select="$class" /> -->
+ </xsl:attribute>
+ </xsl:element>
+ </xsl:element>
+
+ <!-- recursive call for the remaining classes -->
+ <xsl:variable name="other-classes"
+ select="normalize-space(substring-after(normalize-space($class),' '))" />
+ <xsl:if test="string-length($other-classes)>0">
+ <xsl:call-template name="class">
+ <xsl:with-param name="resource" select="$resource" />
+ <xsl:with-param name="class" select="$other-classes" />
+ </xsl:call-template>
+ </xsl:if>
+
+ </xsl:template>
+
+
+ <!-- ignore the rest of the DOM -->
+ <xsl:template match="*|text()|@*">
+ <xsl:param name="refSource"/>
+ <xsl:apply-templates>
+ <xsl:with-param name="refSource" select="$refSource"/>
+ </xsl:apply-templates>
+ </xsl:template>
+
+ <xsl:template name="resolveUri">
+ <xsl:param name="base" />
+ <xsl:param name="ref" />
+ <xsl:variable name="base2">
+ <xsl:call-template name="correctURI">
+ <xsl:with-param name="uri" select="$base"/>
+ </xsl:call-template>
+ </xsl:variable>
+ <xsl:choose>
+ <xsl:when test="starts-with($ref,'#')">
+ <xsl:value-of select="concat($base2,$ref)" />
+ </xsl:when>
+ <xsl:when test="not(contains($ref,':/'))">
+ <!-- TODO: remove double slashes? -->
+ <xsl:variable name="baseUri">
+ <xsl:call-template name="longestPrefix">
+ <xsl:with-param name="string" select="$base2" />
+ <xsl:with-param name="sep" select="'/'" />
+ </xsl:call-template>
+ </xsl:variable>
+ <xsl:choose>
+ <xsl:when test="starts-with($ref,'//')">
+ <!-- prefix is just the protocol part -->
+ <!-- <xsl:value-of select="concat(substring-before($baseUri,':'),':',$ref)"/> -->
+ <!-- hard code the prefix for Stanbol because there we will not see
+ the real URLs anyway -->
+ <xsl:value-of select="concat('http:',$ref)" />
+ </xsl:when>
+ <xsl:when test="starts-with($ref,'/')">
+ <xsl:value-of select="concat($baseUri,$ref)" />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="concat($baseUri,'/',$ref)" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$ref" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+ <xsl:template name="longestPrefix">
+ <xsl:param name="string" />
+ <xsl:param name="sep" />
+ <xsl:variable name="lastSeg">
+ <xsl:call-template name="lastIndexOf">
+ <xsl:with-param name="string" select="$string" />
+ <xsl:with-param name="sep" select="$sep" />
+ </xsl:call-template>
+ </xsl:variable>
+ <xsl:value-of
+ select="substring($string,1,string-length($string) - string-length($sep) - string-length($lastSeg))" />
+ </xsl:template>
+
+ <!-- returns the substring after the last occurrence of the separator -->
+ <xsl:template name="lastIndexOf">
+ <xsl:param name="string" />
+ <xsl:param name="sep" />
+ <xsl:choose>
+ <xsl:when test="contains($string, $sep)">
+ <xsl:call-template name="lastIndexOf">
+ <xsl:with-param name="string"
+ select="substring-after($string, $sep)" />
+ <xsl:with-param name="sep" select="$sep" />
+ </xsl:call-template>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$string" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+ <!-- hacks to create a formally correct URI from potential garbage -->
+ <xsl:template name="correctURI">
+ <xsl:param name="uri"/>
+ <xsl:choose>
+ <xsl:when test="starts-with($uri,'schema.org')">
+ <xsl:value-of select="concat('http://',$uri)"/>
+ </xsl:when>
+ <xsl:when test="substring-before($uri,':') = ''">
+ <xsl:call-template name="resolveUri">
+ <xsl:with-param name="base" select="$this"/>
+ <xsl:with-param name="ref" select="$uri"/>
+ </xsl:call-template>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$uri"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+</xsl:stylesheet>
Modified: stanbol/trunk/enhancement-engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java?rev=1423812&r1=1423811&r2=1423812&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java (original)
+++ stanbol/trunk/enhancement-engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java Wed Dec 19 10:06:10 2012
@@ -35,7 +35,6 @@ import org.apache.stanbol.enhancer.engin
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlParser;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.InitializationException;
-import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
@@ -54,6 +53,9 @@ public class TestHtmlExtractor {
private static HtmlExtractionRegistry registry;
+ // define the Nepomuk NIE namespace locally here
+ private static final String NIE_NS = "http://www.semanticdesktop.org/ontologies/2007/01/19/nie#";
+
@BeforeClass
public static void oneTimeSetup() throws IOException {
try {
@@ -89,7 +91,7 @@ public class TestHtmlExtractor {
LOG.debug("RDFa triples: {}",tripleCounter);
printTriples(model);
assertEquals(8, tripleCounter);
- ClerezzaRDFUtils.makeConnected(model, new UriRef("file://" + testFile), new UriRef(NamespaceEnum.nie+"contains"));
+ ClerezzaRDFUtils.makeConnected(model, new UriRef("file://" + testFile), new UriRef(NIE_NS+"contains"));
}
/** This tests some Microformat extraction
@@ -114,9 +116,33 @@ public class TestHtmlExtractor {
LOG.debug("Microformat triples: {}",tripleCounter);
printTriples(model);
assertEquals(127, tripleCounter);
- ClerezzaRDFUtils.makeConnected(model, new UriRef("file://" + testFile), new UriRef(NamespaceEnum.nie+"contains"));
+ ClerezzaRDFUtils.makeConnected(model, new UriRef("file://" + testFile), new UriRef(NIE_NS+"contains"));
}
+ /** This tests the extraction of Microdata from an HTML-5 document
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testMicrodataExtraction() throws Exception {
+ HtmlExtractor extractor = new HtmlExtractor(registry, parser);
+ MGraph model = new SimpleMGraph();
+ String testFile = "test-microdata.html";
+
+ // extract triples from Microdata annotated html
+ InputStream in = getResourceAsStream(testFile);
+ assertNotNull("failed to load resource " + testFile, in);
+
+ extractor.extract("file://" + testFile,in,null, "text/html", model);
+
+ // show triples and check the expected number of extracted triples
+ int tripleCounter = model.size();
+ LOG.debug("Microdata triples: {}",tripleCounter);
+ printTriples(model);
+ assertEquals(91, tripleCounter);
+ ClerezzaRDFUtils.makeConnected(model, new UriRef("file://" + testFile), new UriRef(NIE_NS+"contains"));
+ }
+
/** This tests the merging of disconnected graphs under a single root
*
* @throws Exception
@@ -139,7 +165,7 @@ public class TestHtmlExtractor {
printTriples(model);
Set<NonLiteral> roots = ClerezzaRDFUtils.findRoots(model);
assertTrue(roots.size() > 1);
- ClerezzaRDFUtils.makeConnected(model, new UriRef("file://" + testFile), new UriRef(NamespaceEnum.nie+"contains"));
+ ClerezzaRDFUtils.makeConnected(model, new UriRef("file://" + testFile), new UriRef(NIE_NS+"contains"));
roots = ClerezzaRDFUtils.findRoots(model);
assertEquals(1,roots.size());
}