You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/25 12:31:13 UTC

svn commit: r1175364 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/sax/ToHTMLContentHandler.java main/java/org/apache/tika/sax/ToXMLContentHandler.java test/java/org/apache/tika/sax/SerializerTest.java

Author: jukka
Date: Sun Sep 25 10:31:13 2011
New Revision: 1175364

URL: http://svn.apache.org/viewvc?rev=1175364&view=rev
Log:
TIKA-651: Unescaped attribute value generated

Add a simple HTML serializer

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java   (with props)
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java?rev=1175364&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java Sun Sep 25 10:31:13 2011
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.xml.sax.SAXException;
+
+/**
+ * SAX event handler that serializes the HTML document to a character stream.
+ * The incoming SAX events are expected to be well-formed (properly nested,
+ * etc.) and valid HTML.
+ *
+ * @since Apache Tika 1.0
+ */
+public class ToHTMLContentHandler extends ToXMLContentHandler {
+
+    private static final Set<String> EMPTY_ELEMENTS = // element names that endElement() serializes without an end tag
+        new HashSet<String>(Arrays.asList(
+            "area", "base", "basefont", "br", "col", "frame", "hr",
+            "img", "input", "isindex", "link", "meta", "param"));
+
+    public ToHTMLContentHandler(OutputStream stream, String encoding) // both arguments are handed unchanged to the superclass
+            throws UnsupportedEncodingException {
+        super(stream, encoding);
+    }
+
+    public ToHTMLContentHandler() {
+        super();
+    }
+
+    @Override
+    public void startDocument() throws SAXException { // deliberately empty: this override writes nothing at document start
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+        if (inStartElement) { // presumably set by the superclass while a start tag is still unclosed - confirm in ToXMLContentHandler
+            write('>'); // finish the pending start tag with a plain '>'
+            inStartElement = false; // flag is cleared before the superclass call below sees it
+
+            if (EMPTY_ELEMENTS.contains(localName)) { // lookup is by localName only and is case-sensitive; qName is not consulted
+                namespaces.clear();
+                return; // no end tag for these elements, e.g. "<br>" rather than "<br></br>"
+            }
+        }
+
+        super.endElement(uri, localName, qName); // all other cases fall through to the superclass
+    }
+
+}

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java
------------------------------------------------------------------------------
    svn:executable = *

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java?rev=1175364&r1=1175363&r2=1175364&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java Sun Sep 25 10:31:13 2011
@@ -77,9 +77,9 @@ public class ToXMLContentHandler extends
 
     private final String encoding;
 
-    private boolean inStartElement = false;
+    protected boolean inStartElement = false;
 
-    private final Map<String, String> namespaces =
+    protected final Map<String, String> namespaces =
         new HashMap<String, String>();
 
     private ElementInfo currentElement;

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java?rev=1175364&r1=1175363&r2=1175364&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java Sun Sep 25 10:31:13 2011
@@ -43,9 +43,9 @@ public class SerializerTest extends Test
         assertCharacters("content", new ToXMLContentHandler());
         assertCharacterEscaping("&lt;&amp;\"&gt;", new ToXMLContentHandler());
         assertIgnorableWhitespace(" \t\r\n", new ToXMLContentHandler());
-        assertEmptyElement("<p />", new ToXMLContentHandler());
+        assertEmptyElement("<br />", new ToXMLContentHandler());
         assertEmptyElementWithAttributes(
-                "<p class=\"test\" />",
+                "<meta name=\"foo\" value=\"bar\" />",
                 new ToXMLContentHandler());
         assertEmptyElementWithAttributeEscaping(
                 "<p class=\"&lt;&amp;&quot;&gt;\" />",
@@ -56,6 +56,24 @@ public class SerializerTest extends Test
                 new ToXMLContentHandler());
     }
 
+    public void testToHTMLContentHandler() throws Exception { // every assertion below gets its own fresh handler
+        assertStartDocument("", new ToHTMLContentHandler()); // expects no output after startDocument()
+        assertCharacters("content", new ToHTMLContentHandler());
+        assertCharacterEscaping("&lt;&amp;\"&gt;", new ToHTMLContentHandler()); // expected text keeps a literal '"' (contrast &quot; in the attribute case below)
+        assertIgnorableWhitespace(" \t\r\n", new ToHTMLContentHandler());
+        assertEmptyElement("<br>", new ToHTMLContentHandler()); // start tag only: no " />" and no end tag
+        assertEmptyElementWithAttributes(
+                "<meta name=\"foo\" value=\"bar\">", // attributes present, still no end tag
+                new ToHTMLContentHandler());
+        assertEmptyElementWithAttributeEscaping(
+                "<p class=\"&lt;&amp;&quot;&gt;\"></p>", // an empty <p> is expected to keep its explicit end tag
+                new ToHTMLContentHandler());
+        assertElement("<p>content</p>", new ToHTMLContentHandler());
+        assertElementWithAttributes(
+                "<p class=\"test\">content</p>",
+                new ToHTMLContentHandler());
+    }
+
     private void assertStartDocument(String expected, ContentHandler handler)
             throws Exception {
         handler.startDocument();
@@ -83,17 +101,18 @@ public class SerializerTest extends Test
     private void assertEmptyElement(String expected, ContentHandler handler)
             throws Exception {
         AttributesImpl attributes = new AttributesImpl();
-        handler.startElement("", "p", "p", attributes);
-        handler.endElement("", "p", "p");
+        handler.startElement("", "br", "br", attributes);
+        handler.endElement("", "br", "br");
         assertEquals(expected, handler.toString());
     }
 
     private void assertEmptyElementWithAttributes(
             String expected, ContentHandler handler) throws Exception {
         AttributesImpl attributes = new AttributesImpl();
-        attributes.addAttribute("", "class", "class", "CDATA", "test");
-        handler.startElement("", "p", "p", attributes);
-        handler.endElement("", "p", "p");
+        attributes.addAttribute("", "name", "name", "CDATA", "foo");
+        attributes.addAttribute("", "value", "value", "CDATA", "bar");
+        handler.startElement("", "meta", "meta", attributes);
+        handler.endElement("", "meta", "meta");
         assertEquals(expected, handler.toString());
     }