You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/10/27 18:00:04 UTC

svn commit: r1634594 - in /tika/trunk: tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java

Author: tallison
Date: Mon Oct 27 17:00:03 2014
New Revision: 1634594

URL: http://svn.apache.org/r1634594
Log:
TIKA-1459 fix write limit bug in BasicContentHandlerFactory when creating a BodyContentHandler

Added:
    tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java
Modified:
    tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=1634594&r1=1634593&r2=1634594&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Mon Oct 27 17:00:03 2014
@@ -121,7 +121,7 @@ public class TikaGUI extends JFrame
     }
 
     //maximum length to allow for mark for reparse to get JSON
-    private final int MAX_MARK = 20971520;//20MB
+    private final int MAX_MARK = 20*1024*1024;//20MB
     /**
      * Parsing context.
      */
@@ -379,7 +379,8 @@ public class TikaGUI extends JFrame
         }
         if (isReset) {
             RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser,
-                    new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
+                    new BasicContentHandlerFactory(
+                            BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
             wrapper.parse(input, null, new Metadata(), new ParseContext());
             StringWriter jsonBuffer = new StringWriter();
             JsonMetadataList.setPrettyPrinting(true);

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java?rev=1634594&r1=1634593&r2=1634594&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java Mon Oct 27 17:00:03 2014
@@ -15,12 +15,14 @@ package org.apache.tika.sax;
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
 
 import java.io.OutputStream;
+import java.io.OutputStreamWriter;
 import java.io.UnsupportedEncodingException;
 
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
 /**
  * Basic factory for creating common types of ContentHandlers
  */
@@ -53,12 +55,13 @@ public class BasicContentHandlerFactory 
     @Override
     public ContentHandler getNewContentHandler() {
 
+        if (type == HANDLER_TYPE.BODY) {
+            return new BodyContentHandler(writeLimit);
+        } else if (type == HANDLER_TYPE.IGNORE) {
+            return new DefaultHandler();
+        }
         if (writeLimit > -1) {
             switch(type) {
-                case BODY:
-                    return new BodyContentHandler(writeLimit);
-                case IGNORE:
-                    return new DefaultHandler();
                 case TEXT:
                     return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit);
                 case HTML:
@@ -70,10 +73,6 @@ public class BasicContentHandlerFactory 
             }
         } else {
             switch (type) {
-                case BODY:
-                    return new BodyContentHandler();
-                case IGNORE:
-                    return new DefaultHandler();
                 case TEXT:
                     return new ToTextContentHandler();
                 case HTML:
@@ -89,12 +88,17 @@ public class BasicContentHandlerFactory 
 
     @Override
     public ContentHandler getNewContentHandler(OutputStream os, String encoding) throws UnsupportedEncodingException {
+
+        if (type == HANDLER_TYPE.IGNORE) {
+            return new DefaultHandler();
+        }
+
         if (writeLimit > -1) {
             switch(type) {
                 case BODY:
-                    return new WriteOutContentHandler(new BodyContentHandler(new ToTextContentHandler(os, encoding)), writeLimit);
-                case IGNORE:
-                    return new DefaultHandler();
+                    return new WriteOutContentHandler(
+                            new BodyContentHandler(
+                                    new OutputStreamWriter(os, encoding)), writeLimit);
                 case TEXT:
                     return new WriteOutContentHandler(new ToTextContentHandler(os, encoding), writeLimit);
                 case HTML:
@@ -107,9 +111,7 @@ public class BasicContentHandlerFactory 
         } else {
             switch (type) {
                 case BODY:
-                    return new BodyContentHandler(new ToTextContentHandler(os, encoding));
-                case IGNORE:
-                    return new DefaultHandler();
+                    return new BodyContentHandler(new OutputStreamWriter(os, encoding));
                 case TEXT:
                     return new ToTextContentHandler(os, encoding);
                 case HTML:

Added: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java?rev=1634594&view=auto
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java (added)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java Mon Oct 27 17:00:03 2014
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+import static junit.framework.Assert.assertFalse;
+import static junit.framework.Assert.assertTrue;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Test cases for the {@link org.apache.tika.sax.BasicContentHandlerFactory} class.
+ */
+public class BasicContentHandlerFactoryTest {
+    private static final String ENCODING = "UTF-8";
+    //default max char len (at least in WriteOutContentHandler) is 100k
+    private static final int OVER_DEFAULT = 120000;
+
+    @Test
+    public void testIgnore() throws Exception {
+        Parser p = new MockParser(OVER_DEFAULT);
+        ContentHandler handler =
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1).getNewContentHandler();
+        assertTrue(handler instanceof DefaultHandler);
+        //nothing to inspect on the handler afterwards; the check is that parsing completes
+        p.parse(null, handler, null, null);
+
+        //tests that no write limit exception is thrown
+        p = new MockParser(100);
+        handler =
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5).getNewContentHandler();
+        assertTrue(handler instanceof DefaultHandler);
+        //the limit (5) is far below the 100 body chars written; parse must still not throw
+        p.parse(null, handler, null, null);
+    }
+
+    @Test
+    public void testText() throws Exception {
+        Parser parser = new MockParser(OVER_DEFAULT);
+        BasicContentHandlerFactory.HANDLER_TYPE handlerType =
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
+        ContentHandler handler =
+                new BasicContentHandlerFactory(handlerType, -1).getNewContentHandler();
+        assertTrue(handler instanceof ToTextContentHandler);
+        parser.parse(null, handler, null, null);
+        String text = handler.toString();
+        assertTrue(text.contains("This is the title"));
+        assertTrue(text.contains("aaaaaaaaaa"));
+        assertFalse(text.toLowerCase().contains("<body"));
+        assertFalse(text.toLowerCase().contains("<html"));
+        assertTrue(text.length() > 110000);
+        //write limit of 5: the start of the title ("This ") is kept, the body is not
+        parser = new MockParser(10);
+        handler = new BasicContentHandlerFactory(handlerType, 5).getNewContentHandler();
+        assertTrue(handler instanceof WriteOutContentHandler);
+        assertWriteLimitReached(parser, (WriteOutContentHandler) handler);
+        assertTrue(handler.toString().contains("This "));
+        assertFalse(handler.toString().toLowerCase().contains("aaaa"));
+
+        //same content checks when the handler writes to an OutputStream
+        parser = new MockParser(OVER_DEFAULT);
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        handler = new BasicContentHandlerFactory(handlerType, -1).getNewContentHandler(out, ENCODING);
+        assertTrue(handler instanceof ToTextContentHandler);
+        parser.parse(null, handler, null, null);
+        assertContains("This is the title", out.toByteArray());
+        assertContains("aaaaaaaaaa", out.toByteArray());
+        assertTrue(out.toByteArray().length > 110000);
+        assertNotContains("<body", out.toByteArray());
+        assertNotContains("<html", out.toByteArray());
+
+        parser = new MockParser(10);
+        out = new ByteArrayOutputStream();
+        handler = new BasicContentHandlerFactory(handlerType, 5).getNewContentHandler(out, ENCODING);
+        assertTrue(handler instanceof WriteOutContentHandler);
+        assertWriteLimitReached(parser, (WriteOutContentHandler) handler);
+        //If the write limit is hit while targeting an OutputStream,
+        //nothing reaches the stream (current behaviour).
+        assertEquals(0, out.toByteArray().length);
+    }
+
+
+    @Test
+    public void testHTML() throws Exception {
+        Parser parser = new MockParser(OVER_DEFAULT);
+        BasicContentHandlerFactory.HANDLER_TYPE handlerType =
+                BasicContentHandlerFactory.HANDLER_TYPE.HTML;
+        ContentHandler handler =
+                new BasicContentHandlerFactory(handlerType, -1).getNewContentHandler();
+        assertTrue(handler instanceof ToHTMLContentHandler);
+        parser.parse(null, handler, null, null);
+        String html = handler.toString();
+        assertTrue(html.contains("<head><title>This is the title"));
+        assertTrue(html.contains("aaaaaaaaaa"));
+        assertTrue(html.length() > 110000);
+
+        //write limit of 5: the start of the title ("This ") is kept, the body is not
+        parser = new MockParser(10);
+        handler = new BasicContentHandlerFactory(handlerType, 5).getNewContentHandler();
+        assertTrue(handler instanceof WriteOutContentHandler);
+        assertWriteLimitReached(parser, (WriteOutContentHandler) handler);
+        assertTrue(handler.toString().contains("This "));
+        assertFalse(handler.toString().toLowerCase().contains("aaaa"));
+
+        //same content checks when the handler writes to an OutputStream
+        parser = new MockParser(OVER_DEFAULT);
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        handler = new BasicContentHandlerFactory(handlerType, -1).getNewContentHandler(out, ENCODING);
+        assertTrue(handler instanceof ToHTMLContentHandler);
+        parser.parse(null, handler, null, null);
+        assertContains("This is the title", out.toByteArray());
+        assertContains("aaaaaaaaaa", out.toByteArray());
+        assertContains("<body", out.toByteArray());
+        assertContains("<html", out.toByteArray());
+        assertTrue(out.toByteArray().length > 110000);
+
+        //an OutputStream target receives nothing once the write limit is hit
+        parser = new MockParser(10);
+        out = new ByteArrayOutputStream();
+        handler = new BasicContentHandlerFactory(handlerType, 5).getNewContentHandler(out, ENCODING);
+        assertTrue(handler instanceof WriteOutContentHandler);
+        assertWriteLimitReached(parser, (WriteOutContentHandler) handler);
+        assertEquals(0, out.toByteArray().length);
+    }
+
+    @Test
+    public void testXML() throws Exception {
+        //uses HANDLER_TYPE.XML (not HTML) so the XML handler type itself is under test
+        Parser p = new MockParser(OVER_DEFAULT);
+        BasicContentHandlerFactory.HANDLER_TYPE type =
+                BasicContentHandlerFactory.HANDLER_TYPE.XML;
+        ContentHandler handler =
+                new BasicContentHandlerFactory(type, -1).getNewContentHandler();
+        assertTrue(handler instanceof ToXMLContentHandler);
+        p.parse(null, handler, new Metadata(), null);
+        assertTrue(handler.toString().contains("<head><title>This is the title"));
+        assertTrue(handler.toString().contains("aaaaaaaaaa"));
+        assertTrue(handler.toString().length() > 110000);
+
+        //now test write limit
+        p = new MockParser(10);
+        handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler();
+        assertTrue(handler instanceof WriteOutContentHandler);
+        assertWriteLimitReached(p, (WriteOutContentHandler) handler);
+        assertTrue(handler.toString().contains("This "));
+        assertFalse(handler.toString().toLowerCase().contains("aaaa"));
+
+        //now test outputstream call
+        p = new MockParser(OVER_DEFAULT);
+        ByteArrayOutputStream os = new ByteArrayOutputStream();
+        handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING);
+        assertTrue(handler instanceof ToXMLContentHandler);
+        p.parse(null, handler, null, null);
+        assertContains("This is the title", os.toByteArray());
+        assertContains("aaaaaaaaaa", os.toByteArray());
+        assertContains("<body", os.toByteArray());
+        assertContains("<html", os.toByteArray());
+        assertTrue(os.toByteArray().length > 110000);
+
+        //an OutputStream target receives nothing once the write limit is hit
+        p = new MockParser(10);
+        os = new ByteArrayOutputStream();
+        handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING);
+        assertTrue(handler instanceof WriteOutContentHandler);
+        assertWriteLimitReached(p, (WriteOutContentHandler) handler);
+        assertEquals(0, os.toByteArray().length);
+    }
+
+
+    @Test
+    public void testBody() throws Exception {
+        Parser parser = new MockParser(OVER_DEFAULT);
+        BasicContentHandlerFactory.HANDLER_TYPE handlerType =
+                BasicContentHandlerFactory.HANDLER_TYPE.BODY;
+        ContentHandler handler =
+                new BasicContentHandlerFactory(handlerType, -1).getNewContentHandler();
+
+        assertTrue(handler instanceof BodyContentHandler);
+        parser.parse(null, handler, null, null);
+        String bodyText = handler.toString();
+        assertFalse(bodyText.contains("title"));
+        assertTrue(bodyText.contains("aaaaaaaaaa"));
+        assertTrue(bodyText.length() > 110000);
+
+        //write limit of 5: title text must be absent, body chars present
+        parser = new MockParser(10);
+        handler = new BasicContentHandlerFactory(handlerType, 5).getNewContentHandler();
+        assertTrue(handler instanceof BodyContentHandler);
+        assertWriteLimitReached(parser, (BodyContentHandler)handler);
+        assertFalse(handler.toString().contains("This "));
+        assertTrue(handler.toString().toLowerCase().contains("aaaa"));
+
+        //same content checks when the handler writes to an OutputStream
+        parser = new MockParser(OVER_DEFAULT);
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        handler = new BasicContentHandlerFactory(handlerType, -1).getNewContentHandler(out, ENCODING);
+        assertTrue(handler instanceof BodyContentHandler);
+        parser.parse(null, handler, null, null);
+        assertNotContains("title", out.toByteArray());
+        assertContains("aaaaaaaaaa", out.toByteArray());
+        assertNotContains("<body", out.toByteArray());
+        assertNotContains("<html", out.toByteArray());
+        assertTrue(out.toByteArray().length > 110000);
+
+        parser = new MockParser(10);
+        out = new ByteArrayOutputStream();
+        handler = new BasicContentHandlerFactory(handlerType, 5).getNewContentHandler(out, ENCODING);
+        assertTrue(handler instanceof WriteOutContentHandler);
+        assertWriteLimitReached(parser, (WriteOutContentHandler) handler);
+        assertEquals(0, out.toByteArray().length);
+    }
+
+    private void assertWriteLimitReached(Parser p, WriteOutContentHandler handler) throws Exception {
+        boolean limitHit = false;
+        try {
+            p.parse(null, handler, null, null);
+        } catch (SAXException e) {
+            limitHit = handler.isWriteLimitReached(e);//any other SAXException is a real failure
+            if (!limitHit) {
+                throw e;
+            }
+        }
+        assertTrue("WriteLimitReached", limitHit);
+    }
+    //TODO: is there a better way than to repeat this with a different signature?
+    private void assertWriteLimitReached(Parser p, BodyContentHandler handler) throws Exception {
+        boolean limitHit = false;
+        try {
+            p.parse(null, handler, null, null);
+        } catch (SAXException e) {
+            //the write-limit exception is recognised here by its class name
+            limitHit = e.getClass().toString().contains("org.apache.tika.sax.WriteOutContentHandler$WriteLimitReachedException");
+            if (!limitHit) {
+                throw e;
+            }
+        }
+        assertTrue("WriteLimitReached", limitHit);
+    }
+
+    private void assertNotContains(String needle, byte[] hayStack)
+            throws UnsupportedEncodingException {
+        String lowerCased = new String(hayStack, ENCODING).toLowerCase();//haystack only; needle is used as given
+        assertFalse(lowerCased.contains(needle));
+    }
+
+    private void assertContains(String needle, byte[] hayStack)
+            throws UnsupportedEncodingException {
+        boolean found = new String(hayStack, ENCODING).contains(needle);//case-sensitive match
+        assertTrue(found);
+    }
+
+    //Simple mock parser that emits an XHTML document: a fixed title inside head,
+    //followed by a body holding charsToWrite 'a' characters
+    private class MockParser implements Parser {
+        private final String XHTML = "http://www.w3.org/1999/xhtml";
+        private final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+        private final char[] TITLE = "This is the title".toCharArray();
+
+        private final int charsToWrite;
+        public MockParser(int charsToWrite) {
+            this.charsToWrite = charsToWrite;
+        }
+
+        @Override
+        public Set<MediaType> getSupportedTypes(ParseContext context) {
+            return null;
+        }
+
+        @Override
+        public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+            handler.startDocument();
+            handler.startPrefixMapping("", XHTML);
+            handler.startElement(XHTML, "html", "html", EMPTY_ATTRIBUTES);
+            handler.startElement(XHTML, "head", "head", EMPTY_ATTRIBUTES);
+            handler.startElement(XHTML, "title", "title", EMPTY_ATTRIBUTES);//qName matches the local name
+            handler.characters(TITLE, 0, TITLE.length);
+            handler.endElement(XHTML, "title", "title");
+
+            handler.endElement(XHTML, "head", "head");
+            handler.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES);
+            char[] body = new char[charsToWrite];
+            for (int i = 0; i < charsToWrite; i++) {
+                body[i] = 'a';
+            }
+            handler.characters(body, 0, body.length);
+            handler.endElement(XHTML, "body", "body");
+            handler.endElement(XHTML, "html", "html");
+            handler.endDocument();
+        }
+    }
+}