You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/03/11 18:32:16 UTC
svn commit: r1299404 - in /tika/trunk: CHANGES.txt
tika-core/src/main/java/org/apache/tika/Tika.java
tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Author: mikemccand
Date: Sun Mar 11 17:32:16 2012
New Revision: 1299404
URL: http://svn.apache.org/viewvc?rev=1299404&view=rev
Log:
TIKA-870: allow setting maxStringLength per-call to Tika.parseToString
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1299404&r1=1299403&r2=1299404&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Mar 11 17:32:16 2012
@@ -1,6 +1,9 @@
Apache Tika Change Log
======================
+ * Tika: parseToString now lets you specify the max string length
+ per-call, in addition to per-Tika-instance. (TIKA-870)
+
Release 1.1 - 3/7/2012
---------------------------------
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=1299404&r1=1299403&r2=1299404&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Sun Mar 11 17:32:16 2012
@@ -392,6 +392,47 @@ public class Tika {
/**
* Parses the given document and returns the extracted text content.
+ * The given input stream is closed by this method. This method lets
+ * you control the maxStringLength per call.
+ * <p>
+ * To avoid unpredictable excess memory use, the returned string contains
+ * at most the first <code>maxLength</code> characters extracted
+ * from the input document.
+ * <p>
+ * <strong>NOTE:</strong> Unlike most other Tika methods that take an
+ * {@link InputStream}, this method will close the given stream for
+ * you as a convenience. With other methods you are still responsible
+ * for closing the stream or a wrapper instance returned by Tika.
+ *
+ * @param stream the document to be parsed
+ * @param metadata document metadata
+ * @param maxLength maximum length of the returned string
+ * @return extracted text content
+ * @throws IOException if the document can not be read
+ * @throws TikaException if the document can not be parsed
+ */
+ public String parseToString(InputStream stream, Metadata metadata, int maxLength)
+         throws IOException, TikaException {
+     // Sink for the extracted text, created with the per-call maxLength.
+     final WriteOutContentHandler textSink = new WriteOutContentHandler(maxLength);
+     try {
+         final ParseContext parseContext = new ParseContext();
+         parseContext.set(Parser.class, parser);
+         final BodyContentHandler bodyHandler = new BodyContentHandler(textSink);
+         parser.parse(stream, bodyHandler, metadata, parseContext);
+     } catch (SAXException e) {
+         // Reaching the write limit is signalled through a SAXException and
+         // is an expected outcome; any other SAX failure is reported.
+         final boolean limitReached = textSink.isWriteLimitReached(e);
+         if (!limitReached) {
+             // This should never happen with BodyContentHandler...
+             throw new TikaException("Unexpected SAX processing failure", e);
+         }
+     } finally {
+         // The stream is closed for the caller on every path.
+         stream.close();
+     }
+     return textSink.toString();
+ }
+
+ /**
+ * Parses the given document and returns the extracted text content.
* The given input stream is closed by this method.
* <p>
* To avoid unpredictable excess memory use, the returned string contains
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java?rev=1299404&r1=1299403&r2=1299404&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java Sun Mar 11 17:32:16 2012
@@ -146,6 +146,24 @@ public class WriteOutContentHandler exte
}
}
+ /**
+  * Forwards ignorable whitespace to the underlying handler while counting
+  * it against the write limit. When the chunk would exceed the limit, only
+  * the part that still fits is forwarded and a
+  * {@link WriteLimitReachedException} is thrown.
+  *
+  * @param ch buffer holding the whitespace characters
+  * @param start offset of the first character to forward
+  * @param length number of characters offered
+  * @throws SAXException if the write limit is reached, or if the
+  *         underlying handler fails
+  */
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length)
+         throws SAXException {
+     // A writeLimit of -1 disables the check entirely.
+     if (writeLimit != -1 && writeCount + length > writeLimit) {
+         // Forward only what still fits, then signal that the limit was hit.
+         super.ignorableWhitespace(ch, start, writeLimit - writeCount);
+         writeCount = writeLimit;
+         throw new WriteLimitReachedException(
+                 "Your document contained more than " + writeLimit
+                 + " characters, and so your requested limit has been"
+                 + " reached. To receive the full text of the document,"
+                 + " increase your limit. (Text up to the limit is"
+                 + " however available).", tag);
+     }
+     super.ignorableWhitespace(ch, start, length);
+     writeCount += length;
+ }
+
/**
* Checks whether the given exception (or any of its root causes) was
* thrown by this handler as a signal of reaching the write limit.
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1299404&r1=1299403&r2=1299404&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Sun Mar 11 17:32:16 2012
@@ -28,6 +28,7 @@ import javax.xml.transform.stream.Stream
import org.apache.tika.Tika;
import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.WriteOutContentHandler;
@@ -165,6 +166,35 @@ public class RTFParserTest extends TikaT
assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", content);
}
+ /**
+  * Checks the length of the text returned by parseToString in three
+  * scenarios: the default limit, a limit set on the Tika instance, and a
+  * limit passed per call.
+  */
+ public void testMaxLength() throws Exception {
+     final File rtfFile = getResourceAsFile("/test-documents/testRTFJapanese.rtf");
+     final Metadata metadata = new Metadata();
+     // The streams below are never closed here: parseToString closes the
+     // stream it is given.
+     InputStream input = TikaInputStream.get(rtfFile, metadata);
+
+     // Scenario 1: default limit, the document yields more than 500 characters.
+     final Tika localTika = new Tika();
+     final String defaultText = localTika.parseToString(input, metadata);
+     assertTrue(defaultText.length() > 500);
+
+     // Scenario 2: limit configured on the Tika instance.
+     localTika.setMaxStringLength(200);
+     input = TikaInputStream.get(rtfFile, metadata);
+     final String instanceLimitedText = localTika.parseToString(input, metadata);
+     assertTrue(instanceLimitedText.length() <= 200);
+
+     // Scenario 3: limit supplied for this call only.
+     input = TikaInputStream.get(rtfFile, metadata);
+     final String callLimitedText = localTika.parseToString(input, metadata, 100);
+     assertTrue(callLimitedText.length() <= 100);
+ }
+
+
public void testTextWithCurlyBraces() throws Exception {
String content = getText("testRTFWithCurlyBraces.rtf");
assertContains("{ some text inside curly brackets }", content);