You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2008/10/17 00:48:10 UTC
svn commit: r705388 - in /incubator/tika/trunk/src:
main/java/org/apache/tika/parser/html/HtmlParser.java
test/java/org/apache/tika/parser/html/HtmlParserTest.java
test/resources/test-documents/testHTML.html
test/resources/test-documents/testXHTML.html
Author: dmeikle
Date: Thu Oct 16 15:48:10 2008
New Revision: 705388
URL: http://svn.apache.org/viewvc?rev=705388&view=rev
Log:
TIKA-166: Updated HTMLParser to parse HTML meta tags into Metadata
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
incubator/tika/trunk/src/test/resources/test-documents/testHTML.html
incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=705388&r1=705387&r2=705388&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Thu Oct 16 15:48:10 2008
@@ -81,6 +81,7 @@
DISCARD_ELEMENTS.add("STYLE");
DISCARD_ELEMENTS.add("SCRIPT");
+
}
public void parse(
@@ -95,9 +96,11 @@
XPathParser xpath = new XPathParser(null, "");
Matcher body = xpath.parse("/HTML/BODY//node()");
Matcher title = xpath.parse("/HTML/HEAD/TITLE//node()");
+ Matcher meta = xpath.parse("/HTML/HEAD/META//node()");
handler = new TeeContentHandler(
new MatchingContentHandler(getBodyHandler(xhtml), body),
- new MatchingContentHandler(getTitleHandler(metadata), title));
+ new MatchingContentHandler(getTitleHandler(metadata), title),
+ new MatchingContentHandler(getMetaHandler(metadata), meta));
// Parse the HTML document
xhtml.startDocument();
@@ -116,6 +119,22 @@
};
}
+ private ContentHandler getMetaHandler(final Metadata metadata) {
+ return new WriteOutContentHandler() {
+ @Override
+ public void startElement(
+ String uri, String local, String name, Attributes atts)
+ throws SAXException {
+ if (atts.getValue("http-equiv") != null) {
+ metadata.set(atts.getValue("http-equiv"), atts.getValue("content"));
+ }
+ if (atts.getValue("name") != null) {
+ metadata.set(atts.getValue("name"), atts.getValue("content"));
+ }
+ }
+ };
+ }
+
private ContentHandler getBodyHandler(final XHTMLContentHandler xhtml) {
return new TextContentHandler(xhtml) {
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=705388&r1=705387&r2=705388&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Thu Oct 16 15:48:10 2008
@@ -67,6 +67,8 @@
assertEquals(
"Title : Test Indexation Html", metadata.get(Metadata.TITLE));
+ assertEquals("Tika Developers", metadata.get("Author"));
+ assertEquals("5", metadata.get("refresh"));
assertEquals("http://www.apache.org/", href.toString());
String content = body.toString();
@@ -115,6 +117,8 @@
assertEquals("application/xhtml+xml", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("XHTML test document", metadata.get(Metadata.TITLE));
String content = handler.toString();
+ assertEquals("Tika Developers", metadata.get("Author"));
+ assertEquals("5", metadata.get("refresh"));
assertTrue(content.contains("ability of Apache Tika"));
assertTrue(content.contains("extract content"));
assertTrue(content.contains("an XHTML document"));
Modified: incubator/tika/trunk/src/test/resources/test-documents/testHTML.html
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testHTML.html?rev=705388&r1=705387&r2=705388&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/resources/test-documents/testHTML.html (original)
+++ incubator/tika/trunk/src/test/resources/test-documents/testHTML.html Thu Oct 16 15:48:10 2008
@@ -1,9 +1,11 @@
<html>
<head>
- <title>Title : Test Indexation Html</title>
- </head>
+ <title>Title : Test Indexation Html</title>
+ <meta name="Author" content="Tika Developers">
+ <meta http-equiv="refresh" content="5">
+ </head>
<body>
<h1>Test Indexation Html</h1>
<p><a href="http://www.apache.org/">Indexation</a> du fichier</p>
- </body>
+ </body>
</html>
\ No newline at end of file
Modified: incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html?rev=705388&r1=705387&r2=705388&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html (original)
+++ incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html Thu Oct 16 15:48:10 2008
@@ -1,11 +1,13 @@
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>XHTML test document</title>
+ <meta name="Author" content="Tika Developers"/>
+ <meta http-equiv="refresh" content="5"/>
</head>
<body>
<p>
This document tests the ability of Apache Tika to extract content
from an <a href="http://www.w3.org/TR/xhtml1/">XHTML document</a>.
</p>
- </body>
+ </body>
</html>
\ No newline at end of file