You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/10/27 18:00:04 UTC
svn commit: r1634594 - in /tika/trunk:
tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java
Author: tallison
Date: Mon Oct 27 17:00:03 2014
New Revision: 1634594
URL: http://svn.apache.org/r1634594
Log:
TIKA-1459 fix write limit bug in BasicContentHandlerFactory when creating a BodyContentHandler
Added:
tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=1634594&r1=1634593&r2=1634594&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Mon Oct 27 17:00:03 2014
@@ -121,7 +121,7 @@ public class TikaGUI extends JFrame
}
//maximum length to allow for mark for reparse to get JSON
- private final int MAX_MARK = 20971520;//20MB
+ private final int MAX_MARK = 20*1024*1024;//20MB
/**
* Parsing context.
*/
@@ -379,7 +379,8 @@ public class TikaGUI extends JFrame
}
if (isReset) {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser,
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
+ new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
wrapper.parse(input, null, new Metadata(), new ParseContext());
StringWriter jsonBuffer = new StringWriter();
JsonMetadataList.setPrettyPrinting(true);
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java?rev=1634594&r1=1634593&r2=1634594&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java Mon Oct 27 17:00:03 2014
@@ -15,12 +15,14 @@ package org.apache.tika.sax;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
import java.io.OutputStream;
+import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
/**
* Basic factory for creating common types of ContentHandlers
*/
@@ -53,12 +55,13 @@ public class BasicContentHandlerFactory
@Override
public ContentHandler getNewContentHandler() {
+ if (type == HANDLER_TYPE.BODY) {
+ return new BodyContentHandler(writeLimit);
+ } else if (type == HANDLER_TYPE.IGNORE) {
+ return new DefaultHandler();
+ }
if (writeLimit > -1) {
switch(type) {
- case BODY:
- return new BodyContentHandler(writeLimit);
- case IGNORE:
- return new DefaultHandler();
case TEXT:
return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit);
case HTML:
@@ -70,10 +73,6 @@ public class BasicContentHandlerFactory
}
} else {
switch (type) {
- case BODY:
- return new BodyContentHandler();
- case IGNORE:
- return new DefaultHandler();
case TEXT:
return new ToTextContentHandler();
case HTML:
@@ -89,12 +88,17 @@ public class BasicContentHandlerFactory
@Override
public ContentHandler getNewContentHandler(OutputStream os, String encoding) throws UnsupportedEncodingException {
+
+ if (type == HANDLER_TYPE.IGNORE) {
+ return new DefaultHandler();
+ }
+
if (writeLimit > -1) {
switch(type) {
case BODY:
- return new WriteOutContentHandler(new BodyContentHandler(new ToTextContentHandler(os, encoding)), writeLimit);
- case IGNORE:
- return new DefaultHandler();
+ return new WriteOutContentHandler(
+ new BodyContentHandler(
+ new OutputStreamWriter(os, encoding)), writeLimit);
case TEXT:
return new WriteOutContentHandler(new ToTextContentHandler(os, encoding), writeLimit);
case HTML:
@@ -107,9 +111,7 @@ public class BasicContentHandlerFactory
} else {
switch (type) {
case BODY:
- return new BodyContentHandler(new ToTextContentHandler(os, encoding));
- case IGNORE:
- return new DefaultHandler();
+ return new BodyContentHandler(new OutputStreamWriter(os, encoding));
case TEXT:
return new ToTextContentHandler(os, encoding);
case HTML:
Added: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java?rev=1634594&view=auto
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java (added)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java Mon Oct 27 17:00:03 2014
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+import static junit.framework.Assert.assertFalse;
+import static junit.framework.Assert.assertTrue;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Test cases for the {@link org.apache.tika.sax.BasicContentHandlerFactory} class.
+ */
+public class BasicContentHandlerFactoryTest {
+ private static final String ENCODING = "UTF-8";
+ //default max char len (at least in WriteOutContentHandler) is 100k
+ private static final int OVER_DEFAULT = 120000;
+
+ @Test
+ public void testIgnore() throws Exception {
+ Parser p = new MockParser(OVER_DEFAULT);
+ ContentHandler handler =
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1).getNewContentHandler();
+ assertTrue(handler instanceof DefaultHandler);
+ p.parse(null, handler, null, null);
+ assertTrue(handler.toString().contains(""));
+
+ //tests that no write limit exception is thrown
+ p = new MockParser(100);
+ handler =
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5).getNewContentHandler();
+ assertTrue(handler instanceof DefaultHandler);
+ p.parse(null, handler, null, null);
+ assertTrue(handler.toString().contains(""));
+ }
+
+ @Test
+ public void testText() throws Exception {
+ Parser p = new MockParser(OVER_DEFAULT);
+ BasicContentHandlerFactory.HANDLER_TYPE type =
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
+ ContentHandler handler =
+ new BasicContentHandlerFactory(type, -1).getNewContentHandler();
+
+ assertTrue(handler instanceof ToTextContentHandler);
+ p.parse(null, handler, null, null);
+ assertTrue(handler.toString().contains("This is the title"));
+ assertTrue(handler.toString().contains("aaaaaaaaaa"));
+ assertFalse(handler.toString().toLowerCase().contains("<body"));
+ assertFalse(handler.toString().toLowerCase().contains("<html"));
+ assertTrue(handler.toString().length() > 110000);
+ //now test write limit
+ p = new MockParser(10);
+ handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler();
+ assertTrue(handler instanceof WriteOutContentHandler);
+ assertWriteLimitReached(p, (WriteOutContentHandler) handler);
+ assertTrue(handler.toString().contains("This "));
+ assertFalse(handler.toString().toLowerCase().contains("aaaa"));
+
+ //now test outputstream call
+ p = new MockParser(OVER_DEFAULT);
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+ handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING);
+ assertTrue(handler instanceof ToTextContentHandler);
+ p.parse(null, handler, null, null);
+ assertContains("This is the title", os.toByteArray());
+ assertContains("aaaaaaaaaa", os.toByteArray());
+ assertTrue(os.toByteArray().length > 110000);
+ assertNotContains("<body", os.toByteArray());
+ assertNotContains("<html", os.toByteArray());
+
+ p = new MockParser(10);
+ os = new ByteArrayOutputStream();
+ handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING);
+ assertTrue(handler instanceof WriteOutContentHandler);
+ assertWriteLimitReached(p, (WriteOutContentHandler) handler);
+ //When writing to an OutputStream and a write limit is reached,
+ //currently, nothing is written.
+ assertEquals(0, os.toByteArray().length);
+ }
+
+
+ @Test
+ public void testHTML() throws Exception {
+ Parser p = new MockParser(OVER_DEFAULT);
+ BasicContentHandlerFactory.HANDLER_TYPE type =
+ BasicContentHandlerFactory.HANDLER_TYPE.HTML;
+ ContentHandler handler =
+ new BasicContentHandlerFactory(type, -1).getNewContentHandler();
+
+ assertTrue(handler instanceof ToHTMLContentHandler);
+ p.parse(null, handler, null, null);
+ assertTrue(handler.toString().contains("<head><title>This is the title"));
+ assertTrue(handler.toString().contains("aaaaaaaaaa"));
+ assertTrue(handler.toString().length() > 110000);
+
+ //now test write limit
+ p = new MockParser(10);
+ handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler();
+ assertTrue(handler instanceof WriteOutContentHandler);
+ assertWriteLimitReached(p, (WriteOutContentHandler) handler);
+ assertTrue(handler.toString().contains("This "));
+ assertFalse(handler.toString().toLowerCase().contains("aaaa"));
+
+ //now test outputstream call
+ p = new MockParser(OVER_DEFAULT);
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+ handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING);
+ assertTrue(handler instanceof ToHTMLContentHandler);
+ p.parse(null, handler, null, null);
+ assertContains("This is the title", os.toByteArray());
+ assertContains("aaaaaaaaaa", os.toByteArray());
+ assertContains("<body", os.toByteArray());
+ assertContains("<html", os.toByteArray());
+ assertTrue(os.toByteArray().length > 110000);
+
+
+ p = new MockParser(10);
+ os = new ByteArrayOutputStream();
+ handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING);
+ assertTrue(handler instanceof WriteOutContentHandler);
+ assertWriteLimitReached(p, (WriteOutContentHandler) handler);
+ assertEquals(0, os.toByteArray().length);
+ }
+
+ @Test
+ public void testXML() throws Exception {
+ Parser p = new MockParser(OVER_DEFAULT);
+ BasicContentHandlerFactory.HANDLER_TYPE type =
+ BasicContentHandlerFactory.HANDLER_TYPE.HTML;
+ ContentHandler handler =
+ new BasicContentHandlerFactory(type, -1).getNewContentHandler();
+
+ assertTrue(handler instanceof ToXMLContentHandler);
+ p.parse(null, handler, new Metadata(), null);
+ assertTrue(handler.toString().contains("<head><title>This is the title"));
+ assertTrue(handler.toString().contains("aaaaaaaaaa"));
+ assertTrue(handler.toString().length() > 110000);
+
+ //now test write limit
+ p = new MockParser(10);
+ handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler();
+ assertTrue(handler instanceof WriteOutContentHandler);
+ assertWriteLimitReached(p, (WriteOutContentHandler) handler);
+ assertTrue(handler.toString().contains("This "));
+ assertFalse(handler.toString().toLowerCase().contains("aaaa"));
+
+ //now test outputstream call
+ p = new MockParser(OVER_DEFAULT);
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+ handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING);
+ assertTrue(handler instanceof ToXMLContentHandler);
+ p.parse(null, handler, null, null);
+ assertContains("This is the title", os.toByteArray());
+ assertContains("aaaaaaaaaa", os.toByteArray());
+ assertContains("<body", os.toByteArray());
+ assertContains("<html", os.toByteArray());
+ assertTrue(os.toByteArray().length > 110000);
+
+
+ p = new MockParser(10);
+ os = new ByteArrayOutputStream();
+ handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING);
+ assertTrue(handler instanceof WriteOutContentHandler);
+ assertWriteLimitReached(p, (WriteOutContentHandler) handler);
+ assertEquals(0, os.toByteArray().length);
+ }
+
+
+ @Test
+ public void testBody() throws Exception {
+ Parser p = new MockParser(OVER_DEFAULT);
+ BasicContentHandlerFactory.HANDLER_TYPE type =
+ BasicContentHandlerFactory.HANDLER_TYPE.BODY;
+ ContentHandler handler =
+ new BasicContentHandlerFactory(type, -1).getNewContentHandler();
+
+ assertTrue(handler instanceof BodyContentHandler);
+
+ p.parse(null, handler, null, null);
+ assertFalse(handler.toString().contains("title"));
+ assertTrue(handler.toString().contains("aaaaaaaaaa"));
+ assertTrue(handler.toString().length() > 110000);
+
+ //now test write limit
+ p = new MockParser(10);
+ handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler();
+ assertTrue(handler instanceof BodyContentHandler);
+ assertWriteLimitReached(p, (BodyContentHandler)handler);
+ assertFalse(handler.toString().contains("This "));
+ assertTrue(handler.toString().toLowerCase().contains("aaaa"));
+
+ //now test outputstream call
+ p = new MockParser(OVER_DEFAULT);
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+ handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING);
+ assertTrue(handler instanceof BodyContentHandler);
+ p.parse(null, handler, null, null);
+ assertNotContains("title", os.toByteArray());
+ assertContains("aaaaaaaaaa", os.toByteArray());
+ assertNotContains("<body", os.toByteArray());
+ assertNotContains("<html", os.toByteArray());
+ assertTrue(os.toByteArray().length > 110000);
+
+ p = new MockParser(10);
+ os = new ByteArrayOutputStream();
+ handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING);
+ assertTrue(handler instanceof WriteOutContentHandler);
+ assertWriteLimitReached(p, (WriteOutContentHandler) handler);
+ assertEquals(0, os.toByteArray().length);
+ }
+
+ private void assertWriteLimitReached(Parser p, WriteOutContentHandler handler) throws Exception {
+ boolean wlr = false;
+ try {
+ p.parse(null, handler, null, null);
+ } catch (SAXException e) {
+ if (! handler.isWriteLimitReached(e)) {
+ throw e;
+ }
+ wlr = true;
+ }
+ assertTrue("WriteLimitReached", wlr);
+ }
+ //TODO: is there a better way than repeating this with a different signature?
+ private void assertWriteLimitReached(Parser p, BodyContentHandler handler) throws Exception {
+ boolean wlr = false;
+ try {
+ p.parse(null, handler, null, null);
+ } catch (SAXException e) {
+ if (! e.getClass().toString().contains("org.apache.tika.sax.WriteOutContentHandler$WriteLimitReachedException")){
+ throw e;
+ }
+
+ wlr = true;
+ }
+ assertTrue("WriteLimitReached", wlr);
+ }
+
+ private void assertNotContains(String needle, byte[] hayStack)
+ throws UnsupportedEncodingException {
+ String s = new String(hayStack, ENCODING);
+ assertFalse(s.toLowerCase().contains(needle));
+ }
+
+ private void assertContains(String needle, byte[] hayStack)
+ throws UnsupportedEncodingException {
+ String s = new String(hayStack, ENCODING);
+ assertTrue(s.contains(needle));
+ }
+
+ //Simple mock parser that writes a title element
+ //followed by a body of charsToWrite 'a' characters
+ private class MockParser implements Parser {
+ private final String XHTML = "http://www.w3.org/1999/xhtml";
+ private final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+ private final char[] TITLE = "This is the title".toCharArray();
+
+ private final int charsToWrite;
+ public MockParser(int charsToWrite) {
+ this.charsToWrite = charsToWrite;
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return null;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+ handler.startDocument();
+ handler.startPrefixMapping("", XHTML);
+ handler.startElement(XHTML, "html", "html", EMPTY_ATTRIBUTES);
+ handler.startElement(XHTML, "head", "head", EMPTY_ATTRIBUTES);
+ handler.startElement(XHTML, "title", "head", EMPTY_ATTRIBUTES);
+ handler.characters(TITLE, 0, TITLE.length);
+ handler.endElement(XHTML, "title", "head");
+
+ handler.endElement(XHTML, "head", "head");
+ handler.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES);
+ char[] body = new char[charsToWrite];
+ for (int i = 0; i < charsToWrite; i++) {
+ body[i] = 'a';
+ }
+ handler.characters(body, 0, body.length);
+ handler.endElement(XHTML, "body", "body");
+ handler.endElement(XHTML, "html", "html");
+ handler.endDocument();
+ }
+ }
+}