You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2008/10/17 00:48:10 UTC

svn commit: r705388 - in /incubator/tika/trunk/src: main/java/org/apache/tika/parser/html/HtmlParser.java test/java/org/apache/tika/parser/html/HtmlParserTest.java test/resources/test-documents/testHTML.html test/resources/test-documents/testXHTML.html

Author: dmeikle
Date: Thu Oct 16 15:48:10 2008
New Revision: 705388

URL: http://svn.apache.org/viewvc?rev=705388&view=rev
Log:
TIKA-166: Updated HtmlParser to parse HTML meta tags into Metadata

Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
    incubator/tika/trunk/src/test/resources/test-documents/testHTML.html
    incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=705388&r1=705387&r2=705388&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Thu Oct 16 15:48:10 2008
@@ -81,6 +81,7 @@
 
         DISCARD_ELEMENTS.add("STYLE");
         DISCARD_ELEMENTS.add("SCRIPT");
+
     }
 
     public void parse(
@@ -95,9 +96,11 @@
         XPathParser xpath = new XPathParser(null, "");
         Matcher body = xpath.parse("/HTML/BODY//node()");
         Matcher title = xpath.parse("/HTML/HEAD/TITLE//node()");
+        Matcher meta = xpath.parse("/HTML/HEAD/META//node()");
         handler = new TeeContentHandler(
                 new MatchingContentHandler(getBodyHandler(xhtml), body),
-                new MatchingContentHandler(getTitleHandler(metadata), title));
+                new MatchingContentHandler(getTitleHandler(metadata), title),
+                new MatchingContentHandler(getMetaHandler(metadata), meta));
 
         // Parse the HTML document
         xhtml.startDocument();
@@ -116,6 +119,22 @@
         };
     }
 
+    private ContentHandler getMetaHandler(final Metadata metadata) {
+        return new WriteOutContentHandler() {
+            @Override
+            public void startElement(
+                    String uri, String local, String name, Attributes atts)
+                    throws SAXException {
+                    if (atts.getValue("http-equiv") != null) {
+                        metadata.set(atts.getValue("http-equiv"), atts.getValue("content"));
+                    }
+                    if (atts.getValue("name") != null) {
+                        metadata.set(atts.getValue("name"), atts.getValue("content"));
+                    }
+            }
+        };
+    }
+
     private ContentHandler getBodyHandler(final XHTMLContentHandler xhtml) {
         return new TextContentHandler(xhtml) {
 

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=705388&r1=705387&r2=705388&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Thu Oct 16 15:48:10 2008
@@ -67,6 +67,8 @@
 
         assertEquals(
                 "Title : Test Indexation Html", metadata.get(Metadata.TITLE));
+        assertEquals("Tika Developers", metadata.get("Author"));
+        assertEquals("5", metadata.get("refresh"));
         assertEquals("http://www.apache.org/", href.toString());
 
         String content = body.toString();
@@ -115,6 +117,8 @@
         assertEquals("application/xhtml+xml", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("XHTML test document", metadata.get(Metadata.TITLE));
         String content = handler.toString();
+        assertEquals("Tika Developers", metadata.get("Author"));
+        assertEquals("5", metadata.get("refresh"));
         assertTrue(content.contains("ability of Apache Tika"));
         assertTrue(content.contains("extract content"));
         assertTrue(content.contains("an XHTML document"));

Modified: incubator/tika/trunk/src/test/resources/test-documents/testHTML.html
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testHTML.html?rev=705388&r1=705387&r2=705388&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/resources/test-documents/testHTML.html (original)
+++ incubator/tika/trunk/src/test/resources/test-documents/testHTML.html Thu Oct 16 15:48:10 2008
@@ -1,9 +1,11 @@
 <html>
 	<head>
-		<title>Title : Test Indexation Html</title>	
-	</head>
+        <title>Title : Test Indexation Html</title>
+        <meta name="Author" content="Tika Developers">
+        <meta http-equiv="refresh" content="5">
+    </head>
 	<body>
 		<h1>Test Indexation Html</h1>
 		<p><a href="http://www.apache.org/">Indexation</a> du fichier</p>
-	</body>	
+	</body>
 </html>
\ No newline at end of file

Modified: incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html?rev=705388&r1=705387&r2=705388&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html (original)
+++ incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html Thu Oct 16 15:48:10 2008
@@ -1,11 +1,13 @@
 <html xmlns="http://www.w3.org/1999/xhtml">
   <head>
     <title>XHTML test document</title>
+    <meta name="Author" content="Tika Developers"/>
+    <meta http-equiv="refresh" content="5"/>
   </head>
   <body>
     <p>
       This document tests the ability of Apache Tika to extract content
       from an <a href="http://www.w3.org/TR/xhtml1/">XHTML document</a>.
     </p>
-  </body> 
+  </body>
 </html>
\ No newline at end of file