You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/09/23 01:00:28 UTC
svn commit: r698028 - in /incubator/tika/trunk: ./
src/main/java/org/apache/tika/parser/html/ src/main/resources/
src/test/java/org/apache/tika/parser/html/ src/test/resources/test-documents/
Author: jukka
Date: Mon Sep 22 16:00:27 2008
New Revision: 698028
URL: http://svn.apache.org/viewvc?rev=698028&view=rev
Log:
TIKA-140: HTML parser unable to extract text
Use a new XHTMLDowngradeHandler decorator to "dumb down" incoming XHTML so we can handle all forms of HTML with the same logic.
Added a test case for parsing (and auto-detecting) an XHTML document.
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
incubator/tika/trunk/src/main/resources/tika-config.xml
incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=698028&r1=698027&r2=698028&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Mon Sep 22 16:00:27 2008
@@ -84,7 +84,11 @@
35. TIKA-161 - Enable PMD reports (Jukka Zitting)
-36. TIKA-159 - Add support for parsing basic audio types: wav, aiff, au, midi (Sami Siren)
+36. TIKA-159 - Add support for parsing basic audio types: wav, aiff, au, midi
+ (Sami Siren)
+
+37. TIKA-140 - HTML parser unable to extract text
+ (Julien Nioche & Jukka Zitting)
Release 0.1-incubating - 12/27/2007
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=698028&r1=698027&r2=698028&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Mon Sep 22 16:00:27 2008
@@ -102,7 +102,7 @@
// Parse the HTML document
xhtml.startDocument();
SAXParser parser = new SAXParser();
- parser.setContentHandler(handler);
+ parser.setContentHandler(new XHTMLDowngradeHandler(handler));
parser.parse(new InputSource(Utils.getUTF8Reader(stream, metadata)));
xhtml.endDocument();
}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java?rev=698028&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java Mon Sep 22 16:00:27 2008
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.util.Locale;
+
+import javax.xml.XMLConstants;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that downgrades XHTML elements to
+ * old-style HTML elements before passing them on to the decorated
+ * content handler. This downgrading consists of dropping all namespaces
+ * (and namespaced attributes) and uppercasing all element names.
+ * Namespace prefix mappings are swallowed and never reach the decorated
+ * handler.
+ * Used by the {@link HtmlParser} to make all incoming HTML look the same.
+ */
+class XHTMLDowngradeHandler extends ContentHandlerDecorator {
+
+    /**
+     * Creates a downgrading decorator around the given handler.
+     *
+     * @param handler content handler that receives the downgraded events
+     */
+    public XHTMLDowngradeHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    /**
+     * Forwards the start of an element with the namespace removed and the
+     * local name uppercased. Only attributes without a namespace are passed
+     * on; namespaced attributes and <code>xmlns</code> declarations are
+     * dropped.
+     *
+     * @param uri namespace URI of the element, ignored
+     * @param localName local name of the element
+     * @param name qualified name of the element, ignored
+     * @param atts attributes of the element
+     * @throws SAXException if the decorated handler fails
+     */
+    @Override
+    public void startElement(
+            String uri, String localName, String name, Attributes atts)
+            throws SAXException {
+        // Use a fixed locale: with a Turkish default locale a lowercase "i"
+        // would be uppercased to a dotted capital I and break names like TITLE.
+        String upper = localName.toUpperCase(Locale.ENGLISH);
+
+        AttributesImpl attributes = new AttributesImpl();
+        for (int i = 0; i < atts.getLength(); i++) {
+            String auri = atts.getURI(i);
+            String local = atts.getLocalName(i);
+            String qname = atts.getQName(i);
+            // Keep an attribute only if it has no namespace and is not a
+            // namespace declaration. The earlier check compared the empty
+            // namespace constant against an Integer (the URI length), which
+            // is never equal, so namespaced attributes were let through.
+            if (XMLConstants.NULL_NS_URI.equals(auri)
+                    && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
+                    && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
+                attributes.addAttribute(
+                        auri, local, qname,
+                        atts.getType(i), atts.getValue(i));
+            }
+        }
+
+        super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes);
+    }
+
+    /**
+     * Forwards the end of an element with the namespace removed and the
+     * local name uppercased, mirroring
+     * {@link #startElement(String, String, String, Attributes)}.
+     *
+     * @param uri namespace URI of the element, ignored
+     * @param localName local name of the element
+     * @param name qualified name of the element, ignored
+     * @throws SAXException if the decorated handler fails
+     */
+    @Override
+    public void endElement(String uri, String localName, String name)
+            throws SAXException {
+        // Same fixed locale as in startElement so start and end names match
+        String upper = localName.toUpperCase(Locale.ENGLISH);
+        super.endElement(XMLConstants.NULL_NS_URI, upper, upper);
+    }
+
+    /**
+     * Ignores the prefix mapping; namespaces are not passed downstream.
+     */
+    @Override
+    public void startPrefixMapping(String prefix, String uri) {
+    }
+
+    /**
+     * Ignores the end of a prefix mapping; namespaces are not passed
+     * downstream.
+     */
+    @Override
+    public void endPrefixMapping(String prefix) {
+    }
+
+}
Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=698028&r1=698027&r2=698028&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Mon Sep 22 16:00:27 2008
@@ -25,7 +25,6 @@
<parser name="parse-dcxml" class="org.apache.tika.parser.xml.DcXMLParser">
<mime>application/xml</mime>
- <mime>application/xhtml+xml</mime>
</parser>
<parser name="parse-office" class="org.apache.tika.parser.microsoft.OfficeParser">
@@ -39,6 +38,7 @@
<parser name="parse-html" class="org.apache.tika.parser.html.HtmlParser">
<mime>text/html</mime>
+ <mime>application/xhtml+xml</mime>
<mime>application/x-asp</mime>
</parser>
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=698028&r1=698027&r2=698028&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Mon Sep 22 16:00:27 2008
@@ -25,6 +25,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
@@ -44,7 +45,7 @@
public void testParseAscii() throws Exception {
final StringWriter href = new StringWriter();
-
+
ContentHandler body = new BodyContentHandler();
ContentHandler link = new DefaultHandler() {
@Override
@@ -75,7 +76,6 @@
assertTrue(
"Did not contain expected text:" + "Indexation du fichier",
content.contains("Indexation du fichier"));
-
}
public void XtestParseUTF8() throws IOException, SAXException, TikaException {
@@ -97,7 +97,27 @@
assertTrue("Did not contain expected text:" + "åäö", content
.contains("åäö"));
+ }
+
+ /**
+  * Parses the XHTML test document with an auto-detecting parser and
+  * checks the reported content type, the title and the extracted text.
+  */
+ public void testXhtmlParsing() throws Exception {
+     Parser parser = new AutoDetectParser(); // Should auto-detect!
+     ContentHandler handler = new BodyContentHandler();
+     Metadata metadata = new Metadata();
+     InputStream stream = HtmlParserTest.class.getResourceAsStream(
+             "/test-documents/testXHTML.html");
+     // getResourceAsStream returns null for a missing resource; fail with
+     // a clear message instead of a NullPointerException further down.
+     assertTrue(
+             "Test document not found: /test-documents/testXHTML.html",
+             stream != null);
+     try {
+         parser.parse(stream, handler, metadata);
+     } finally {
+         stream.close();
+     }
+
+     assertEquals("application/xhtml+xml", metadata.get(Metadata.CONTENT_TYPE));
+     assertEquals("XHTML test document", metadata.get(Metadata.TITLE));
+     String content = handler.toString();
+     assertTrue(content.contains("ability of Apache Tika"));
+     assertTrue(content.contains("extract content"));
+     assertTrue(content.contains("an XHTML document"));
}
public void testParseEmpty() throws Exception {
Added: incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html?rev=698028&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html (added)
+++ incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html Mon Sep 22 16:00:27 2008
@@ -0,0 +1,11 @@
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <head>
+ <title>XHTML test document</title>
+ </head>
+ <body>
+ <p>
+ This document tests the ability of Apache Tika to extract content
+ from an <a href="http://www.w3.org/TR/xhtml1/">XHTML document</a>.
+ </p>
+ </body>
+</html>
\ No newline at end of file