You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/25 12:31:13 UTC
svn commit: r1175364 - in /tika/trunk/tika-core/src:
main/java/org/apache/tika/sax/ToHTMLContentHandler.java
main/java/org/apache/tika/sax/ToXMLContentHandler.java
test/java/org/apache/tika/sax/SerializerTest.java
Author: jukka
Date: Sun Sep 25 10:31:13 2011
New Revision: 1175364
URL: http://svn.apache.org/viewvc?rev=1175364&view=rev
Log:
TIKA-651: Unescaped attribute value generated
Add a simple HTML serializer
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java (with props)
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java
tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java?rev=1175364&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java Sun Sep 25 10:31:13 2011
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.xml.sax.SAXException;
+
+/**
+ * SAX event handler that serializes the HTML document to a character stream.
+ * The incoming SAX events are expected to be well-formed (properly nested,
+ * etc.) and valid HTML. Content-less elements listed in EMPTY_ELEMENTS (br, img, meta, ...) are written without an end tag.
+ *
+ * @since Apache Tika 1.0
+ */
+public class ToHTMLContentHandler extends ToXMLContentHandler {
+
+ private static final Set<String> EMPTY_ELEMENTS = // local names serialized as a bare start tag when they have no content
+ new HashSet<String>(Arrays.asList(
+ "area", "base", "basefont", "br", "col", "frame", "hr",
+ "img", "input", "isindex", "link", "meta", "param"));
+
+ public ToHTMLContentHandler(OutputStream stream, String encoding) // both arguments are passed straight to the ToXMLContentHandler constructor
+ throws UnsupportedEncodingException {
+ super(stream, encoding);
+ }
+
+ public ToHTMLContentHandler() { // delegates to the no-argument ToXMLContentHandler constructor
+ super();
+ }
+
+ @Override
+ public void startDocument() throws SAXException { // deliberately empty: nothing is written at document start
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName)
+ throws SAXException {
+ if (inStartElement) { // NOTE(review): presumably still true only when the start tag has not been closed yet, i.e. the element had no content -- confirm in ToXMLContentHandler
+ write('>'); // close the start tag with a plain '>' rather than the XML-style " />"
+ inStartElement = false;
+
+ if (EMPTY_ELEMENTS.contains(localName)) { // case-sensitive match on the local name; these elements get no end tag
+ namespaces.clear(); // NOTE(review): presumably mirrors the cleanup done by super.endElement, which is skipped on this path -- confirm
+ return;
+ }
+ }
+
+ super.endElement(uri, localName, qName); // every other case: let the XML serializer handle the end of the element
+ }
+
+}
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java
------------------------------------------------------------------------------
svn:executable = *
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java?rev=1175364&r1=1175363&r2=1175364&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java Sun Sep 25 10:31:13 2011
@@ -77,9 +77,9 @@ public class ToXMLContentHandler extends
private final String encoding;
- private boolean inStartElement = false;
+ protected boolean inStartElement = false;
- private final Map<String, String> namespaces =
+ protected final Map<String, String> namespaces =
new HashMap<String, String>();
private ElementInfo currentElement;
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java?rev=1175364&r1=1175363&r2=1175364&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java Sun Sep 25 10:31:13 2011
@@ -43,9 +43,9 @@ public class SerializerTest extends Test
assertCharacters("content", new ToXMLContentHandler());
assertCharacterEscaping("&lt;&amp;\"&gt;", new ToXMLContentHandler());
assertIgnorableWhitespace(" \t\r\n", new ToXMLContentHandler());
- assertEmptyElement("<p />", new ToXMLContentHandler());
+ assertEmptyElement("<br />", new ToXMLContentHandler());
assertEmptyElementWithAttributes(
- "<p class=\"test\" />",
+ "<meta name=\"foo\" value=\"bar\" />",
new ToXMLContentHandler());
assertEmptyElementWithAttributeEscaping(
"<p class=\"&lt;&amp;&quot;&gt;\" />",
@@ -56,6 +56,24 @@ public class SerializerTest extends Test
new ToXMLContentHandler());
}
+ public void testToHTMLContentHandler() throws Exception {
+ assertStartDocument("", new ToHTMLContentHandler());
+ assertCharacters("content", new ToHTMLContentHandler());
+ assertCharacterEscaping("&lt;&amp;\"&gt;", new ToHTMLContentHandler());
+ assertIgnorableWhitespace(" \t\r\n", new ToHTMLContentHandler());
+ assertEmptyElement("<br>", new ToHTMLContentHandler());
+ assertEmptyElementWithAttributes(
+ "<meta name=\"foo\" value=\"bar\">",
+ new ToHTMLContentHandler());
+ assertEmptyElementWithAttributeEscaping(
+ "<p class=\"&lt;&amp;&quot;&gt;\"></p>",
+ new ToHTMLContentHandler());
+ assertElement("<p>content</p>", new ToHTMLContentHandler());
+ assertElementWithAttributes(
+ "<p class=\"test\">content</p>",
+ new ToHTMLContentHandler());
+ }
+
private void assertStartDocument(String expected, ContentHandler handler)
throws Exception {
handler.startDocument();
@@ -83,17 +101,18 @@ public class SerializerTest extends Test
private void assertEmptyElement(String expected, ContentHandler handler)
throws Exception {
AttributesImpl attributes = new AttributesImpl();
- handler.startElement("", "p", "p", attributes);
- handler.endElement("", "p", "p");
+ handler.startElement("", "br", "br", attributes);
+ handler.endElement("", "br", "br");
assertEquals(expected, handler.toString());
}
private void assertEmptyElementWithAttributes(
String expected, ContentHandler handler) throws Exception {
AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "test");
- handler.startElement("", "p", "p", attributes);
- handler.endElement("", "p", "p");
+ attributes.addAttribute("", "name", "name", "CDATA", "foo");
+ attributes.addAttribute("", "value", "value", "CDATA", "bar");
+ handler.startElement("", "meta", "meta", attributes);
+ handler.endElement("", "meta", "meta");
assertEquals(expected, handler.toString());
}