You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/09/23 01:00:28 UTC
svn commit: r698028 - in /incubator/tika/trunk: ./ src/main/java/org/apache/tika/parser/html/ src/main/resources/ src/test/java/org/apache/tika/parser/html/ src/test/resources/test-documents/

Author: jukka
Date: Mon Sep 22 16:00:27 2008
New Revision: 698028

URL: http://svn.apache.org/viewvc?rev=698028&view=rev
Log:
TIKA-140: HTML parser unable to extract text

Use a new XHTMLDowngradeHandler decorator to "dumb down" incoming XHTML so we can handle all forms of HTML with the same logic.

Added a test case for parsing (and auto-detecting) an XHTML document.

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
    incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html
Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    incubator/tika/trunk/src/main/resources/tika-config.xml
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=698028&r1=698027&r2=698028&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Mon Sep 22 16:00:27 2008
@@ -84,7 +84,11 @@
 
 35. TIKA-161 - Enable PMD reports (Jukka Zitting)
 
-36. TIKA-159 - Add support for parsing basic audio types: wav, aiff, au, midi (Sami Siren)
+36. TIKA-159 - Add support for parsing basic audio types: wav, aiff, au, midi
+               (Sami Siren)
+
+37. TIKA-140 - HTML parser unable to extract text
+               (Julien Nioche & Jukka Zitting)
 
 
 Release 0.1-incubating - 12/27/2007

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=698028&r1=698027&r2=698028&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Mon Sep 22 16:00:27 2008
@@ -102,7 +102,7 @@
         // Parse the HTML document
         xhtml.startDocument();
         SAXParser parser = new SAXParser();
-        parser.setContentHandler(handler);
+        parser.setContentHandler(new XHTMLDowngradeHandler(handler));
         parser.parse(new InputSource(Utils.getUTF8Reader(stream, metadata)));
         xhtml.endDocument();
     }

Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java?rev=698028&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java Mon Sep 22 16:00:27 2008
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import javax.xml.XMLConstants;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that downgrades XHTML to old-style HTML before
+ * passing events on to the decorated content handler: namespaces, xmlns
+ * declarations and namespaced attributes are dropped and element names are
+ * uppercased. Used by {@link HtmlParser} to make all incoming HTML look alike.
+ */
+class XHTMLDowngradeHandler extends ContentHandlerDecorator {
+    public XHTMLDowngradeHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    /** Forwards the element as uppercase, namespace-free HTML. */
+    @Override
+    public void startElement(String uri, String localName, String name,
+            Attributes atts) throws SAXException {
+        // Fixed locale: default-locale casing can mangle names (Turkish dotted I)
+        String upper = localName.toUpperCase(java.util.Locale.ENGLISH);
+        AttributesImpl attributes = new AttributesImpl();
+        for (int i = 0; i < atts.getLength(); i++) {
+            String auri = atts.getURI(i);
+            String local = atts.getLocalName(i);
+            String qname = atts.getQName(i);
+            // Keep only attributes in no namespace that are not xmlns declarations
+            if (XMLConstants.NULL_NS_URI.equals(auri)
+                    && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
+                    && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
+                attributes.addAttribute(
+                        auri, local, qname, atts.getType(i), atts.getValue(i));
+            }
+        }
+        super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes);
+    }
+
+    /** Forwards the end tag under the same uppercase, namespace-free name. */
+    @Override
+    public void endElement(String uri, String localName, String name)
+            throws SAXException {
+        String upper = localName.toUpperCase(java.util.Locale.ENGLISH);
+        super.endElement(XMLConstants.NULL_NS_URI, upper, upper);
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) {
+        // Namespaces are dropped, so prefix mappings are not passed on
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) {
+        // Namespaces are dropped, so prefix mappings are not passed on
+    }
+}

Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=698028&r1=698027&r2=698028&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Mon Sep 22 16:00:27 2008
@@ -25,7 +25,6 @@
 
         <parser name="parse-dcxml" class="org.apache.tika.parser.xml.DcXMLParser">
                 <mime>application/xml</mime>
-                <mime>application/xhtml+xml</mime>
         </parser>
 
         <parser name="parse-office" class="org.apache.tika.parser.microsoft.OfficeParser">
@@ -39,6 +38,7 @@
 
         <parser name="parse-html" class="org.apache.tika.parser.html.HtmlParser">
                 <mime>text/html</mime>
+                <mime>application/xhtml+xml</mime>
                 <mime>application/x-asp</mime>
         </parser>
 

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=698028&r1=698027&r2=698028&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Mon Sep 22 16:00:27 2008
@@ -25,6 +25,7 @@
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.TeeContentHandler;
@@ -44,7 +45,7 @@
 
     public void testParseAscii() throws Exception {
         final StringWriter href = new StringWriter();
-        
+
         ContentHandler body = new BodyContentHandler();
         ContentHandler link = new DefaultHandler() {
             @Override
@@ -75,7 +76,6 @@
         assertTrue(
                 "Did not contain expected text:" + "Indexation du fichier",
                 content.contains("Indexation du fichier"));
-
     }
 
     public void XtestParseUTF8() throws IOException, SAXException, TikaException {
@@ -97,7 +97,27 @@
 
         assertTrue("Did not contain expected text:" + "Ã¥Ã¤Ã¶", content
                 .contains("Ã¥Ã¤Ã¶"));
+    }
+
+    public void testXhtmlParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
 
+        InputStream stream = HtmlParserTest.class.getResourceAsStream(
+                "/test-documents/testXHTML.html");
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals("application/xhtml+xml", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("XHTML test document", metadata.get(Metadata.TITLE));
+        String content = handler.toString();
+        assertTrue(content.contains("ability of Apache Tika"));
+        assertTrue(content.contains("extract content"));
+        assertTrue(content.contains("an XHTML document"));
     }
 
     public void testParseEmpty() throws Exception {

Added: incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html?rev=698028&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html (added)
+++ incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html Mon Sep 22 16:00:27 2008
@@ -0,0 +1,11 @@
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <title>XHTML test document</title>
+  </head>
+  <body>
+    <p>
+      This document tests the ability of Apache Tika to extract content
+      from an <a href="http://www.w3.org/TR/xhtml1/">XHTML document</a>.
+    </p>
+  </body> 
+</html>
\ No newline at end of file