You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/03/11 18:32:16 UTC

svn commit: r1299404 - in /tika/trunk: CHANGES.txt tika-core/src/main/java/org/apache/tika/Tika.java tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java

Author: mikemccand
Date: Sun Mar 11 17:32:16 2012
New Revision: 1299404

URL: http://svn.apache.org/viewvc?rev=1299404&view=rev
Log:
TIKA-870: allow setting maxStringLength per-call to Tika.parseToString

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1299404&r1=1299403&r2=1299404&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Mar 11 17:32:16 2012
@@ -1,6 +1,9 @@
 Apache Tika Change Log
 ======================
 
+ * Tika: parseToString now lets you specify the max string length
+   per-call, in addition to per-Tika-instance. (TIKA-870)
+
 Release 1.1 - 3/7/2012
 ---------------------------------
 

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=1299404&r1=1299403&r2=1299404&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Sun Mar 11 17:32:16 2012
@@ -392,6 +392,47 @@ public class Tika {
 
     /**
      * Parses the given document and returns the extracted text content.
+     * The given input stream is closed by this method. This overload takes
+     * the maximum string length as an argument, so it applies per call.
+     * <p>
+     * To avoid unpredictable excess memory use, the returned string contains
+     * at most the first {@code maxLength} characters extracted from the
+     * input document. Reaching that limit is not reported as an error.
+     * <p>
+     * <strong>NOTE:</strong> Unlike most other Tika methods that take an
+     * {@link InputStream}, this method will close the given stream for
+     * you as a convenience. With other methods you are still responsible
+     * for closing the stream or a wrapper instance returned by Tika.
+     *
+     * @param stream the document to be parsed; always closed on return
+     * @param metadata document metadata, passed on to the parser
+     * @param maxLength maximum length of the returned string
+     * @return extracted text content, cut off at the limit if it was reached
+     * @throws IOException if the document can not be read
+     * @throws TikaException if the document can not be parsed
+     */
+    public String parseToString(InputStream stream, Metadata metadata, int maxLength)
+        throws IOException, TikaException {
+        WriteOutContentHandler handler =
+            new WriteOutContentHandler(maxLength);
+        try {
+            ParseContext context = new ParseContext();
+            context.set(Parser.class, parser); // presumably for parsing embedded documents - confirm
+            parser.parse(
+                         stream, new BodyContentHandler(handler), metadata, context);
+        } catch (SAXException e) {
+            if (!handler.isWriteLimitReached(e)) {
+                // Any SAX failure other than the handler's own write-limit signal is unexpected here
+                throw new TikaException("Unexpected SAX processing failure", e);
+            } // otherwise the write limit was hit: fall through and return what was collected
+        } finally {
+            stream.close(); // closed on every path, as promised in the Javadoc
+        }
+        return handler.toString();
+    }
+
+    /**
+     * Parses the given document and returns the extracted text content.
      * The given input stream is closed by this method.
      * <p>
      * To avoid unpredictable excess memory use, the returned string contains

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java?rev=1299404&r1=1299403&r2=1299404&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java Sun Mar 11 17:32:16 2012
@@ -146,6 +146,24 @@ public class WriteOutContentHandler exte
         }
     }
 
+    @Override // forwards ignorable whitespace while counting it against the write limit
+    public void ignorableWhitespace(char[] ch, int start, int length)
+            throws SAXException {
+        if (writeLimit == -1 || writeCount + length <= writeLimit) { // -1 bypasses the limit check
+            super.ignorableWhitespace(ch, start, length);
+            writeCount += length;
+        } else {
+            super.ignorableWhitespace(ch, start, writeLimit - writeCount); // forward only what still fits
+            writeCount = writeLimit; // the limit is now exhausted
+            throw new WriteLimitReachedException( // NOTE(review): tag presumably ties the exception to this handler - confirm
+                    "Your document contained more than " + writeLimit
+                    + " characters, and so your requested limit has been"
+                    + " reached. To receive the full text of the document,"
+                    + " increase your limit. (Text up to the limit is"
+                    + " however available).", tag);
+        }
+    }
+
     /**
      * Checks whether the given exception (or any of it's root causes) was
      * thrown by this handler as a signal of reaching the write limit.

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1299404&r1=1299403&r2=1299404&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Sun Mar 11 17:32:16 2012
@@ -28,6 +28,7 @@ import javax.xml.transform.stream.Stream
 
 import org.apache.tika.Tika;
 import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.WriteOutContentHandler;
@@ -165,6 +166,35 @@ public class RTFParserTest extends TikaT
         assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", content);
     }
 
+    public void testMaxLength() throws Exception { // checks default, per-instance and per-call length limits
+        File file = getResourceAsFile("/test-documents/testRTFJapanese.rtf");
+        Metadata metadata = new Metadata();
+        InputStream stream = TikaInputStream.get(file, metadata);
+
+        // Default limit: the test document is expected to yield more than 500 characters.
+        Tika localTika = new Tika();
+        String content = localTika.parseToString(stream, metadata);
+        // No explicit stream.close() here: parseToString closes the
+        // stream itself as a convenience.
+        assertTrue(content.length() > 500);
+
+        // Instance-wide limit set through setMaxStringLength:
+        localTika.setMaxStringLength(200);
+        stream = TikaInputStream.get(file, metadata);
+        content = localTika.parseToString(stream, metadata);
+        
+        // Again no explicit close: parseToString has already closed
+        // the stream.
+        assertTrue(content.length() <= 200);
+        
+        // Per-call limit: the instance is still set to 200, so this checks that the argument (100) takes effect.
+        stream = TikaInputStream.get(file, metadata);
+        content = localTika.parseToString(stream, metadata, 100);
+        // As above, the stream is closed by parseToString, so there is
+        // nothing left to close here.
+        assertTrue(content.length() <= 100);
+    }
+
     public void testTextWithCurlyBraces() throws Exception {
         String content = getText("testRTFWithCurlyBraces.rtf");
         assertContains("{ some text inside curly brackets }", content);