You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by le...@apache.org on 2015/06/13 20:09:08 UTC
any23 git commit: ANY23-185 Add missing <meta> element attributes to
HTMLMetaExtractor
Repository: any23
Updated Branches:
refs/heads/master a03bafa9c -> 0d106d4f2
ANY23-185 Add missing <meta> element attributes to HTMLMetaExtractor
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/0d106d4f
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/0d106d4f
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/0d106d4f
Branch: refs/heads/master
Commit: 0d106d4f2aa26de8b2626d2d38991717d1a0a0fe
Parents: a03bafa
Author: Lewis John McGibbney <le...@jpl.nasa.gov>
Authored: Sat Jun 13 11:08:34 2015 -0700
Committer: Lewis John McGibbney <le...@jpl.nasa.gov>
Committed: Sat Jun 13 11:08:34 2015 -0700
----------------------------------------------------------------------
.../any23/extractor/html/HTMLMetaExtractor.java | 88 ++++++++++++++++----
.../test/java/org/apache/any23/Any23Test.java | 4 +-
.../extractor/html/HTMLMetaExtractorTest.java | 9 +-
.../html/html-head-link-extractor.html | 1 -
...-meta-extractor-with-mozilla-extensions.html | 34 ++++++++
5 files changed, 117 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/0d106d4f/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
index 16a0f6c..3e0c84e 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
@@ -81,11 +81,19 @@ public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {
if(meta.getLang() != null) {
lang = meta.getLang();
}
- out.writeTriple(
- documentURI,
- meta.getName(),
- new LiteralImpl(meta.getContent(), lang)
- );
+ if(meta.isPragmaDirective){
+ out.writeTriple(
+ documentURI,
+ meta.getHttpEquiv(),
+ new LiteralImpl(meta.getContent(), lang)
+ );
+ }else {
+ out.writeTriple(
+ documentURI,
+ meta.getName(),
+ new LiteralImpl(meta.getContent(), lang)
+ );
+ }
}
}
@@ -134,19 +142,37 @@ public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {
for (Node metaNode : metaNodes) {
NamedNodeMap attributes = metaNode.getAttributes();
Node nameAttribute = attributes.getNamedItem("name");
+ Node httpEquivAttribute = attributes.getNamedItem("http-equiv");
Node contentAttribute = attributes.getNamedItem("content");
- if (nameAttribute == null || contentAttribute == null) {
- continue;
+ if (nameAttribute == null && httpEquivAttribute == null)
+ continue; //support HTML5 meta element nodes that do not have both name and http-equiv
+ if (nameAttribute != null || httpEquivAttribute != null){
+ if ( contentAttribute == null ){
+ continue;
+ }
}
- String name = nameAttribute.getTextContent();
- String content = contentAttribute.getTextContent();
- String xpath = DomUtils.getXPathForNode(metaNode);
- URI nameAsURI = getPrefixIfExists(name);
- if (nameAsURI == null) {
- nameAsURI = new URIImpl(baseProfile + name);
+ boolean isPragmaDirective = (httpEquivAttribute != null) ? true : false;
+ if (isPragmaDirective){
+ String httpEquiv = httpEquivAttribute.getTextContent();
+ String content = contentAttribute.getTextContent();
+ String xpath = DomUtils.getXPathForNode(metaNode);
+ URI httpEquivAsURI = getPrefixIfExists(httpEquiv);
+ if (httpEquivAsURI == null) {
+ httpEquivAsURI = new URIImpl(baseProfile + httpEquiv);
+ }
+ Meta meta = new Meta(xpath, content, httpEquivAsURI);
+ result.add(meta);
+ } else {
+ String name = nameAttribute.getTextContent();
+ String content = contentAttribute.getTextContent();
+ String xpath = DomUtils.getXPathForNode(metaNode);
+ URI nameAsURI = getPrefixIfExists(name);
+ if (nameAsURI == null) {
+ nameAsURI = new URIImpl(baseProfile + name);
+ }
+ Meta meta = new Meta(xpath, nameAsURI, content);
+ result.add(meta);
}
- Meta meta = new Meta(xpath, nameAsURI, content);
- result.add(meta);
}
return result;
}
@@ -170,10 +196,26 @@ public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {
private URI name;
+ private URI httpEquiv;
+
private String lang;
private String content;
+ private boolean isPragmaDirective;
+
+ public Meta(String xpath, String content, URI httpEquiv) {
+ this.xpath = xpath;
+ this.content = content;
+ this.httpEquiv = httpEquiv;
+ this.setPragmaDirective(true);
+ }
+
+ public Meta(String xpath, String content, URI httpEquiv, String lang) {
+ this(xpath,content,httpEquiv);
+ this.lang = lang;
+ }
+
public Meta(String xpath, URI name, String content) {
this.xpath = xpath;
this.name = name;
@@ -185,6 +227,22 @@ public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {
this.lang = lang;
}
+ public boolean isPragmaDirective(){
+ return isPragmaDirective;
+ }
+
+ private void setPragmaDirective(boolean value){
+ this.isPragmaDirective=value;
+ }
+
+ public URI getHttpEquiv(){
+ return httpEquiv;
+ }
+
+ public void setHttpEquiv(URI httpEquiv){
+ this.httpEquiv=httpEquiv;
+ }
+
public URI getName() {
return name;
}
http://git-wip-us.apache.org/repos/asf/any23/blob/0d106d4f/core/src/test/java/org/apache/any23/Any23Test.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/Any23Test.java b/core/src/test/java/org/apache/any23/Any23Test.java
index 24bc913..c487ee8 100644
--- a/core/src/test/java/org/apache/any23/Any23Test.java
+++ b/core/src/test/java/org/apache/any23/Any23Test.java
@@ -286,7 +286,7 @@ public class Any23Test extends Any23OnlineTestBase {
final String bufferContent = byteArrayOutputStream.toString();
logger.debug(bufferContent);
- Assert.assertSame("Unexpected number of triples.", 16,
+ Assert.assertSame("Unexpected number of triples.", 18,
StringUtils.countNL(bufferContent));
}
@@ -368,7 +368,7 @@ public class Any23Test extends Any23OnlineTestBase {
@Test
public void testExtractionParametersWithNestingDisabled()
throws IOException, ExtractionException, TripleHandlerException {
- final int EXPECTED_TRIPLES = 19;
+ final int EXPECTED_TRIPLES = 20;
Any23 runner = new Any23();
DocumentSource source = getDocumentSourceFromResource(
"/microformats/nested-microformats-a1.html",
http://git-wip-us.apache.org/repos/asf/any23/blob/0d106d4f/core/src/test/java/org/apache/any23/extractor/html/HTMLMetaExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/html/HTMLMetaExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/HTMLMetaExtractorTest.java
index b35e33c..854360c 100644
--- a/core/src/test/java/org/apache/any23/extractor/html/HTMLMetaExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/html/HTMLMetaExtractorTest.java
@@ -40,7 +40,7 @@ public class HTMLMetaExtractorTest extends AbstractExtractorTestCase {
public void testExtractPageMeta() throws Exception {
assertExtract("/html/html-head-meta-extractor.html");
assertModelNotEmpty();
- assertStatementsSize(null, null, null, 7);
+ assertStatementsSize(null, null, null, 10);
assertContains(new URIImpl("http://bob.example.com/"), new URIImpl(
"http://purl.org/dc/elements/1.1/title"), "XHTML+RDFa example",
"en");
@@ -70,4 +70,11 @@ public class HTMLMetaExtractorTest extends AbstractExtractorTestCase {
assertModelEmpty();
}
+ @Test
+ public void testExtractPageMetaWithExtensionsPerMozillaSpecification() throws Exception {
+ assertExtract("/html/html-head-meta-extractor-with-mozilla-extensions.html");
+ assertModelNotEmpty();
+ assertStatementsSize(null, null, null, 2);
+ }
+
}
http://git-wip-us.apache.org/repos/asf/any23/blob/0d106d4f/test-resources/src/test/resources/html/html-head-link-extractor.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/html/html-head-link-extractor.html b/test-resources/src/test/resources/html/html-head-link-extractor.html
index 86a76d6..59a374a 100644
--- a/test-resources/src/test/resources/html/html-head-link-extractor.html
+++ b/test-resources/src/test/resources/html/html-head-link-extractor.html
@@ -18,7 +18,6 @@
-->
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
- <meta http-equiv="content-type" content="text/html;charset=UTF-8"/>
<title>myExperiment - Workflows - Pathways and Gene annotations for QTL region - Mouse (Paul Fisher)
[Taverna 2 Workflow]</title>
<link rel="alternate" href="http://www.myexperiment.org/workflows/16.rdf" type="application/rdf+xml"
http://git-wip-us.apache.org/repos/asf/any23/blob/0d106d4f/test-resources/src/test/resources/html/html-head-meta-extractor-with-mozilla-extensions.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/html/html-head-meta-extractor-with-mozilla-extensions.html b/test-resources/src/test/resources/html/html-head-meta-extractor-with-mozilla-extensions.html
new file mode 100644
index 0000000..87a1fac
--- /dev/null
+++ b/test-resources/src/test/resources/html/html-head-meta-extractor-with-mozilla-extensions.html
@@ -0,0 +1,34 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+ <title>test to check meta extraction with missing elements per mozilla specification</title>
+ <!-- Defining the charset in HTML4 -->
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+
+ <!-- In HTML5 -->
+ <meta charset="utf-8"/>
+
+ <!-- Redirect page after 3 seconds -->
+ <meta http-equiv="refresh" content="3;url=http://www.mozilla.org/"/>
+</head>
+<body>
+</body>
+</html>
+