You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/06/08 19:58:47 UTC
[tika] branch main updated: TIKA-3787 -- allow parse to continue after writelimit has been reached
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 7c93ddf7e TIKA-3787 -- allow parse to continue after writelimit has been reached
7c93ddf7e is described below
commit 7c93ddf7e3183fcbd811e04c1621455d961b1bb5
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jun 8 15:58:40 2022 -0400
TIKA-3787 -- allow parse to continue after writelimit has been reached
---
CHANGES.txt | 3 +
.../apache/tika/metadata/TikaCoreProperties.java | 4 ++
.../org/apache/tika/parser/CompositeParser.java | 41 +++++------
.../java/org/apache/tika/parser/ParseRecord.java | 83 ++++++++++++++++++++++
.../apache/tika/parser/RecursiveParserWrapper.java | 54 +++++++++++---
.../java/org/apache/tika/pipes/HandlerConfig.java | 20 ++++--
.../tika/pipes/pipesiterator/PipesIterator.java | 3 +-
.../tika/sax/BasicContentHandlerFactory.java | 78 +++++++++++++-------
.../java/org/apache/tika/sax/WriteLimiter.java | 22 ++++++
.../apache/tika/sax/WriteOutContentHandler.java | 51 +++++++++++--
.../apache/tika/parser/AutoDetectParserTest.java | 46 ++++++++++++
.../tika/parser/RecursiveParserWrapperTest.java | 29 +++++++-
.../metadata/serialization/JsonFetchEmitTuple.java | 4 +-
.../serialization/JsonFetchEmitTupleTest.java | 4 +-
.../core/resource/RecursiveMetadataResource.java | 2 +-
.../tika/server/core/resource/TikaResource.java | 20 +++++-
.../org/apache/tika/server/core/TikaPipesTest.java | 2 +-
.../apache/tika/server/core/TikaResourceTest.java | 17 +++++
.../standard/RecursiveMetadataResourceTest.java | 37 +++++++++-
.../apache/tika/server/standard/TikaPipesTest.java | 2 +-
.../tika/server/standard/TikaResourceTest.java | 21 +++++-
21 files changed, 464 insertions(+), 79 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 8696143f2..cb76c07f2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 2.4.1 - ???
+ * Allow continuation of parsing after write limit has
+ been reached (TIKA-3787).
+
* Allow pass-through of 'Content-Length' header to metadata
in TikaResource (TIKA-3786).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 21581a482..c4035ea31 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -76,6 +76,10 @@ public interface TikaCoreProperties {
Property EMBEDDED_EXCEPTION =
Property.internalText(TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
+ //warning raised while parsing an embedded file
+ Property EMBEDDED_WARNING =
+ Property.internalText(TIKA_META_EXCEPTION_PREFIX + "embedded_warning");
+
Property WRITE_LIMIT_REACHED =
Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "write_limit_reached");
/**
diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
index 60dfa3d97..29546546b 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
@@ -23,7 +23,6 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
-import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -40,6 +39,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.sax.TaggedContentHandler;
+import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.ParserUtils;
/**
@@ -281,17 +281,17 @@ public class CompositeParser extends AbstractParser {
ParseContext context) throws IOException, SAXException, TikaException {
Parser parser = getParser(metadata, context);
TemporaryResources tmp = new TemporaryResources();
- ParserRecord parserRecord = context.get(ParserRecord.class);
+ ParseRecord parserRecord = context.get(ParseRecord.class);
if (parserRecord == null) {
- parserRecord = new ParserRecord();
- context.set(ParserRecord.class, parserRecord);
+ parserRecord = new ParseRecord();
+ context.set(ParseRecord.class, parserRecord);
}
try {
TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
TaggedContentHandler taggedHandler =
handler != null ? new TaggedContentHandler(handler) : null;
String parserClassname = ParserUtils.getParserClassname(parser);
- parserRecord.add(parserClassname);
+ parserRecord.addParserClass(parserClassname);
ParserUtils.recordParserDetails(parserClassname, metadata);
parserRecord.beforeParse();
try {
@@ -316,32 +316,25 @@ public class CompositeParser extends AbstractParser {
parserRecord.afterParse();
if (parserRecord.getDepth() == 0) {
metadata.set(TikaCoreProperties.TIKA_PARSED_BY_FULL_SET, parserRecord.getParsers());
+ recordEmbeddedMetadata(metadata, context);
}
}
}
- private static class ParserRecord {
- int depth = 0;
- Set<String> parsers = new LinkedHashSet<>();
-
- void beforeParse() {
- depth++;
- }
-
- void afterParse() {
- depth--;
+ private void recordEmbeddedMetadata(Metadata metadata, ParseContext context) {
+ ParseRecord record = context.get(ParseRecord.class);
+ if (record == null) {
+ //this should never happen
+ return;
}
-
- int getDepth() {
- return depth;
+ for (Exception e : record.getExceptions()) {
+ metadata.add(TikaCoreProperties.EMBEDDED_EXCEPTION, ExceptionUtils.getStackTrace(e));
}
-
- String[] getParsers() {
- return parsers.toArray(new String[0]);
+ for (String msg : record.getWarnings()) {
+ metadata.add(TikaCoreProperties.EMBEDDED_WARNING, msg);
}
-
- void add(String parserClass) {
- parsers.add(parserClass);
+ if (record.isWriteLimitReached()) {
+ metadata.set(TikaCoreProperties.WRITE_LIMIT_REACHED, true);
}
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java b/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java
new file mode 100644
index 000000000..081c01920
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Use this class to store exceptions, warnings and other information
+ * during the parse. This information is added to the parent's metadata
+ * after the parse by the {@link CompositeParser}.
+ */
+public class ParseRecord {
+ private int depth = 0;
+ private final Set<String> parsers = new LinkedHashSet<>();
+
+ private final List<Exception> exceptions = new ArrayList<>();
+
+ private final List<String> warnings = new ArrayList<>();
+
+ private boolean writeLimitReached = false;
+
+ void beforeParse() {
+ depth++;
+ }
+
+ void afterParse() {
+ depth--;
+ }
+
+ public int getDepth() {
+ return depth;
+ }
+
+ public String[] getParsers() {
+ return parsers.toArray(new String[0]);
+ }
+
+ void addParserClass(String parserClass) {
+ parsers.add(parserClass);
+ }
+
+ public void addException(Exception e) {
+ exceptions.add(e);
+ }
+
+ public void addWarning(String msg) {
+ warnings.add(msg);
+ }
+
+ public void setWriteLimitReached(boolean writeLimitReached) {
+ this.writeLimitReached = writeLimitReached;
+ }
+
+ public List<Exception> getExceptions() {
+ return exceptions;
+ }
+
+ public List<String> getWarnings() {
+ return warnings;
+ }
+
+
+ public boolean isWriteLimitReached() {
+ return writeLimitReached;
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 2d1b6c5fb..59db9b3f9 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -35,9 +35,10 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
-import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.apache.tika.sax.SecureContentHandler;
+import org.apache.tika.sax.WriteLimiter;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.ParserUtils;
@@ -143,16 +144,21 @@ public class RecursiveParserWrapper extends ParserDecorator {
parserState.recursiveParserWrapperHandler.startDocument();
TemporaryResources tmp = new TemporaryResources();
int writeLimit = -1;
- //TODO -- rely on a new interface WriteLimiting...?
- //It'd be better not to tie this to a specific class
- if (recursiveParserWrapperHandler instanceof BasicContentHandlerFactory) { // TODO this cond is always false
- writeLimit =
- ((BasicContentHandlerFactory)recursiveParserWrapperHandler).getWriteLimit();
+ boolean throwOnWriteLimitReached = true;
+
+ if (recursiveParserWrapperHandler instanceof AbstractRecursiveParserWrapperHandler) {
+ ContentHandlerFactory factory =
+ ((AbstractRecursiveParserWrapperHandler)recursiveParserWrapperHandler).getContentHandlerFactory();
+ if (factory instanceof WriteLimiter) {
+ writeLimit = ((WriteLimiter)factory).getWriteLimit();
+ throwOnWriteLimitReached = ((WriteLimiter)factory).isThrowOnWriteLimitReached();
+ }
}
try {
TikaInputStream tis = TikaInputStream.get(stream, tmp);
RecursivelySecureContentHandler secureContentHandler =
- new RecursivelySecureContentHandler(localHandler, tis, writeLimit);
+ new RecursivelySecureContentHandler(localHandler, tis, writeLimit,
+ throwOnWriteLimitReached, context);
context.set(RecursivelySecureContentHandler.class, secureContentHandler);
getWrappedParser().parse(tis, secureContentHandler, metadata, context);
} catch (Throwable e) {
@@ -287,13 +293,22 @@ public class RecursiveParserWrapper extends ParserDecorator {
//total allowable chars across all handlers
private final int totalWriteLimit;
+ private final boolean throwOnWriteLimitReached;
+
+ private final ParseContext parseContext;
+
+ private boolean writeLimitReached = false;
+
//total chars written to all handlers
private int totalChars = 0;
public RecursivelySecureContentHandler(ContentHandler handler, TikaInputStream stream,
- int totalWriteLimit) {
+ int totalWriteLimit,
+ boolean throwOnWriteLimitReached, ParseContext parseContext) {
super(handler, stream);
this.handler = handler;
this.totalWriteLimit = totalWriteLimit;
+ this.throwOnWriteLimitReached = throwOnWriteLimitReached;
+ this.parseContext = parseContext;
}
public void updateContentHandler(ContentHandler handler) {
@@ -328,6 +343,10 @@ public class RecursiveParserWrapper extends ParserDecorator {
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
+ if (writeLimitReached) {
+ return;
+ }
+
if (totalWriteLimit < 0) {
super.characters(ch, start, length);
return;
@@ -335,12 +354,16 @@ public class RecursiveParserWrapper extends ParserDecorator {
int availableLength = Math.min(totalWriteLimit - totalChars, length);
super.characters(ch, start, availableLength);
if (availableLength < length) {
- throw new WriteLimitReachedException(totalWriteLimit);
+ handleWriteLimitReached();
}
}
@Override
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+ if (writeLimitReached) {
+ return;
+ }
+
if (totalWriteLimit < 0) {
super.ignorableWhitespace(ch, start, length);
return;
@@ -348,7 +371,20 @@ public class RecursiveParserWrapper extends ParserDecorator {
int availableLength = Math.min(totalWriteLimit - totalChars, length);
super.ignorableWhitespace(ch, start, availableLength);
if (availableLength < length) {
+ handleWriteLimitReached();
+ }
+ }
+
+ private void handleWriteLimitReached() throws WriteLimitReachedException {
+ writeLimitReached = true;
+
+ if (throwOnWriteLimitReached) {
throw new WriteLimitReachedException(totalWriteLimit);
+ } else {
+ ParseRecord parseRecord = parseContext.get(ParseRecord.class);
+ if (parseRecord != null) {
+ parseRecord.setWriteLimitReached(true);
+ }
}
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java
index a73e2290b..d128dcb3d 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java
@@ -31,7 +31,7 @@ public class HandlerConfig implements Serializable {
public static final HandlerConfig DEFAULT_HANDLER_CONFIG =
new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, PARSE_MODE.RMETA,
- -1, -1);
+ -1, -1, true);
/**
* {@link PARSE_MODE#RMETA} "recursive metadata" is the same as the -J option
@@ -73,16 +73,19 @@ public class HandlerConfig implements Serializable {
int writeLimit = -1;
int maxEmbeddedResources = -1;
+
+ boolean throwOnWriteLimitReached = true;
PARSE_MODE parseMode = PARSE_MODE.RMETA;
public HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE type, PARSE_MODE parseMode,
int writeLimit,
- int maxEmbeddedResources) {
+ int maxEmbeddedResources, boolean throwOnWriteLimitReached) {
this.type = type;
this.parseMode = parseMode;
this.writeLimit = writeLimit;
this.maxEmbeddedResources = maxEmbeddedResources;
+ this.throwOnWriteLimitReached = throwOnWriteLimitReached;
}
public BasicContentHandlerFactory.HANDLER_TYPE getType() {
@@ -101,6 +104,10 @@ public class HandlerConfig implements Serializable {
return parseMode;
}
+ public boolean isThrowOnWriteLimitReached() {
+ return throwOnWriteLimitReached;
+ }
+
@Override
public boolean equals(Object o) {
if (this == o) {
@@ -111,17 +118,20 @@ public class HandlerConfig implements Serializable {
}
HandlerConfig that = (HandlerConfig) o;
return writeLimit == that.writeLimit && maxEmbeddedResources == that.maxEmbeddedResources &&
- type == that.type && parseMode == that.parseMode;
+ throwOnWriteLimitReached == that.throwOnWriteLimitReached && type == that.type &&
+ parseMode == that.parseMode;
}
@Override
public int hashCode() {
- return Objects.hash(type, writeLimit, maxEmbeddedResources, parseMode);
+ return Objects.hash(type, writeLimit, maxEmbeddedResources, throwOnWriteLimitReached,
+ parseMode);
}
@Override
public String toString() {
return "HandlerConfig{" + "type=" + type + ", writeLimit=" + writeLimit +
- ", maxEmbeddedResources=" + maxEmbeddedResources + ", mode=" + parseMode + '}';
+ ", maxEmbeddedResources=" + maxEmbeddedResources + ", throwOnWriteLimitReached=" +
+ throwOnWriteLimitReached + ", parseMode=" + parseMode + '}';
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java
index 42272743e..98b766ce7 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java
@@ -167,7 +167,8 @@ public abstract class PipesIterator extends ConfigBase
}
protected HandlerConfig getHandlerConfig() {
- return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources);
+ //TODO: make throwOnWriteLimitReached configurable
+ return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources, false);
}
protected abstract void enqueue() throws IOException, TimeoutException, InterruptedException;
diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index 9bc5da41f..9de0d4071 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -26,22 +26,50 @@ import java.util.Locale;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;
+import org.apache.tika.parser.ParseContext;
+
/**
* Basic factory for creating common types of ContentHandlers
*/
-public class BasicContentHandlerFactory implements ContentHandlerFactory {
+public class BasicContentHandlerFactory implements ContentHandlerFactory, WriteLimiter {
private final HANDLER_TYPE type;
private final int writeLimit;
+ private final boolean throwOnWriteLimitReached;
+
+ private final ParseContext parseContext;
+
/**
* @param type basic type of handler
* @param writeLimit max number of characters to store; if < 0,
* the handler will store all characters
*/
public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit) {
+ this(type, writeLimit, true, null);
+ }
+
+ /**
+ *
+ * @param type basic type of handler
+ * @param writeLimit maximum number of characters to store
+ * @param throwOnWriteLimitReached whether or not to throw a
+ * {@link org.apache.tika.exception.WriteLimitReachedException}
+ * when the write limit has been reached
+ * @param parseContext to store the write-limit-reached warning if
+ * throwOnWriteLimitReached is set to <code>false</code>
+ */
+ public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit,
+ boolean throwOnWriteLimitReached, ParseContext parseContext) {
this.type = type;
this.writeLimit = writeLimit;
+ this.throwOnWriteLimitReached = throwOnWriteLimitReached;
+ this.parseContext = parseContext;
+ if (throwOnWriteLimitReached == false && parseContext == null) {
+ throw new IllegalArgumentException("parse context must not be null if " +
+ "throwOnWriteLimitReached is false");
+ }
+
}
/**
@@ -82,33 +110,30 @@ public class BasicContentHandlerFactory implements ContentHandlerFactory {
public ContentHandler getNewContentHandler() {
if (type == HANDLER_TYPE.BODY) {
- return new BodyContentHandler(writeLimit);
+ return new BodyContentHandler(
+ new WriteOutContentHandler(new ToTextContentHandler(), writeLimit,
+ throwOnWriteLimitReached, parseContext));
} else if (type == HANDLER_TYPE.IGNORE) {
return new DefaultHandler();
}
- if (writeLimit > -1) {
- switch (type) {
- case TEXT:
- return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit);
- case HTML:
- return new WriteOutContentHandler(new ToHTMLContentHandler(), writeLimit);
- case XML:
- return new WriteOutContentHandler(new ToXMLContentHandler(), writeLimit);
- default:
- return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit);
- }
- } else {
- switch (type) {
- case TEXT:
- return new ToTextContentHandler();
- case HTML:
- return new ToHTMLContentHandler();
- case XML:
- return new ToXMLContentHandler();
- default:
- return new ToTextContentHandler();
+ ContentHandler formatHandler = getFormatHandler();
+ if (writeLimit < 0) {
+ return formatHandler;
+ }
+ return new WriteOutContentHandler(formatHandler, writeLimit, throwOnWriteLimitReached,
+ parseContext);
+ }
- }
+ private ContentHandler getFormatHandler() {
+ switch (type) {
+ case TEXT:
+ return new ToTextContentHandler();
+ case HTML:
+ return new ToHTMLContentHandler();
+ case XML:
+ return new ToXMLContentHandler();
+ default:
+ return new ToTextContentHandler();
}
}
@@ -182,4 +207,9 @@ public class BasicContentHandlerFactory implements ContentHandlerFactory {
public int getWriteLimit() {
return writeLimit;
}
+
+ @Override
+ public boolean isThrowOnWriteLimitReached() {
+ return throwOnWriteLimitReached;
+ }
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/WriteLimiter.java b/tika-core/src/main/java/org/apache/tika/sax/WriteLimiter.java
new file mode 100644
index 000000000..d82895a1b
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/sax/WriteLimiter.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+public interface WriteLimiter {
+ int getWriteLimit();
+ boolean isThrowOnWriteLimitReached();
+}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
index 87a31b973..672a8bf03 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
@@ -26,6 +26,8 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.WriteLimitReachedException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ParseRecord;
/**
* SAX event handler that writes content up to an optional write
@@ -45,6 +47,12 @@ public class WriteOutContentHandler extends ContentHandlerDecorator {
*/
private int writeCount = 0;
+ private boolean throwOnWriteLimitReached = true;
+
+ private ParseContext parseContext = null;
+
+ private boolean writeLimitReached;
+
/**
* Creates a content handler that writes content up to the given
* write limit to the given content handler.
@@ -118,37 +126,70 @@ public class WriteOutContentHandler extends ContentHandlerDecorator {
* The internal string buffer is bounded at 100k characters. If this
* write limit is reached, then a {@link SAXException} is thrown. The
* {@link WriteLimitReachedException#isWriteLimitReached(Throwable)} method can be used to
- * detect
- * this case.
+ * detect this case.
*/
public WriteOutContentHandler() {
this(100 * 1000);
}
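+ /**
+ * Creates a content handler that writes content up to the given
+ * write limit to the given content handler.
+ * The default is to throw a {@link WriteLimitReachedException}
+ * @param handler content handler that receives the written content
+ * @param writeLimit maximum number of characters to write, or -1 for no limit
+ * @param throwOnWriteLimitReached whether to throw a {@link WriteLimitReachedException}
+ * when the write limit has been reached; if <code>false</code>, further content
+ * is silently dropped and the event is recorded in the {@link ParseRecord}
+ * (if any) found in the parse context
+ * @param parseContext used to look up the {@link ParseRecord}; must not be null
+ * if throwOnWriteLimitReached is <code>false</code>
+ */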
+ public WriteOutContentHandler(ContentHandler handler,
+ int writeLimit, boolean throwOnWriteLimitReached,
+ ParseContext parseContext) {
+ super(handler);
+ this.writeLimit = writeLimit;
+ this.throwOnWriteLimitReached = throwOnWriteLimitReached;
+ this.parseContext = parseContext;
+ }
+
/**
* Writes the given characters to the given character stream.
*/
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
+ if (writeLimitReached) {
+ return;
+ }
if (writeLimit == -1 || writeCount + length <= writeLimit) {
super.characters(ch, start, length);
writeCount += length;
} else {
super.characters(ch, start, writeLimit - writeCount);
- writeCount = writeLimit;
- throw new WriteLimitReachedException(writeLimit);
+ handleWriteLimitReached();
}
}
@Override
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+ if (writeLimitReached) {
+ return;
+ }
if (writeLimit == -1 || writeCount + length <= writeLimit) {
super.ignorableWhitespace(ch, start, length);
writeCount += length;
} else {
super.ignorableWhitespace(ch, start, writeLimit - writeCount);
- writeCount = writeLimit;
+ handleWriteLimitReached();
+ }
+ }
+
+ private void handleWriteLimitReached() throws WriteLimitReachedException {
+ writeLimitReached = true;
+ writeCount = writeLimit;
+ if (throwOnWriteLimitReached) {
throw new WriteLimitReachedException(writeLimit);
+ } else {
+ ParseRecord parseRecord = parseContext.get(ParseRecord.class);
+ if (parseRecord != null) {
+ parseRecord.setWriteLimitReached(true);
+ }
}
}
+
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index 8e81d603d..ec3598d8d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -41,6 +41,7 @@ import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -49,6 +50,8 @@ import org.apache.tika.metadata.XMPDM;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.external.CompositeExternalParser;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
public class AutoDetectParserTest extends TikaTest {
// Easy to read constants for the MIME types:
@@ -403,6 +406,49 @@ public class AutoDetectParserTest extends TikaTest {
assertNotNull(p);
}
+ @Test
+ public void testWriteLimit() throws Exception {
+ ContentHandler handler = new WriteOutContentHandler(500);
+ Metadata metadata = new Metadata();
+ ParseContext parseContext = new ParseContext();
+ try (InputStream stream =
+ getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
+ AUTO_DETECT_PARSER.parse(stream, handler, metadata, parseContext);
+ fail("write limit reached should have percolated to here");
+ } catch (WriteLimitReachedException e) {
+ //expected
+ }
+ String txt = handler.toString();
+ //test that the writelimit does intervene between these two
+ //pieces of text and that the first is there, but the second isn't
+ assertContains("assume among the powers", txt);
+ assertNotContained("unalienable Rights", txt);
+ //test that text from other embedded files after this one is not processed
+ assertNotContained("embed_4", txt);
+ }
+
+ @Test
+ public void testWriteLimitNoThrow() throws Exception {
+ ParseContext parseContext = new ParseContext();
+ ContentHandler handler = new WriteOutContentHandler(new ToXMLContentHandler(),
+ 500, false, parseContext);
+ Metadata metadata = new Metadata();
+ try (InputStream stream =
+ getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
+ AUTO_DETECT_PARSER.parse(stream, handler, metadata, parseContext);
+ }
+ String txt = handler.toString();
+ assertEquals("true", metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+ //test that the writelimit does intervene between these two
+ //pieces of text and that the first is there, but the second isn't
+ assertContains("assume among the powers", txt);
+ assertNotContained("unalienable Rights", txt);
+ //test that text from other embedded files after this one is not processed,
+ //but that the entry is there for the embedded file, i.e. the parse continued
+ assertContains("id=\"embed4.txt\"", txt);
+ assertNotContained("embed_4", txt);
+ }
+
//This is not the complete/correct way to look for parsers within another parser
//However, it is good enough for this unit test for now.
private Parser find(CompositeParser parser, Class clazz) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 847e5d129..03461d5f3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -93,10 +93,12 @@ public class RecursiveParserWrapperTest extends TikaTest {
Metadata metadata = new Metadata();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
- InputStream stream = getResourceAsStream("/test-documents/test_recursive_embedded.docx");
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 70));
- wrapper.parse(stream, handler, metadata, context);
+ try (InputStream stream =
+ getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
+ wrapper.parse(stream, handler, metadata, context);
+ }
List<Metadata> list = handler.getMetadataList();
assertEquals(5, list.size());
@@ -111,6 +113,29 @@ public class RecursiveParserWrapperTest extends TikaTest {
assertEquals(2, wlr);
}
+ @Test
+ public void testCharLimitNoThrowOnWriteLimit() throws Exception {
+ ParseContext context = new ParseContext();
+ Metadata metadata = new Metadata();
+
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
+ RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 500,
+ false, context));
+ try (InputStream stream = getResourceAsStream("/test-documents/test_recursive_embedded" +
+ ".docx")) {
+ wrapper.parse(stream, handler, metadata, context);
+ }
+ List<Metadata> list = handler.getMetadataList();
+
+ assertEquals(12, list.size());
+
+ assertEquals("true", list.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+
+ assertContains("them to the separation", list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
+ assertNotContained("unalienable Rights",
+ list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
+ }
@Test
public void testMaxEmbedded() throws Exception {
diff --git a/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonFetchEmitTuple.java b/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonFetchEmitTuple.java
index 73c0737fb..714610786 100644
--- a/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonFetchEmitTuple.java
+++ b/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonFetchEmitTuple.java
@@ -162,7 +162,9 @@ public class JsonFetchEmitTuple {
}
fieldName = jParser.nextFieldName();
}
- return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources);
+ //TODO: implement configuration of throwOnWriteLimitReached
+ return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources,
+ true);
}
private static String getValue(JsonParser jParser) throws IOException {
diff --git a/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonFetchEmitTupleTest.java b/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonFetchEmitTupleTest.java
index a95431e54..aeb4fefd4 100644
--- a/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonFetchEmitTupleTest.java
+++ b/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonFetchEmitTupleTest.java
@@ -46,7 +46,7 @@ public class JsonFetchEmitTupleTest {
new EmitKey("my_emitter", "emitKey1"), m,
new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML,
HandlerConfig.PARSE_MODE.CONCATENATE,
- 10000,10),
+ 10000,10, true),
FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP);
StringWriter writer = new StringWriter();
JsonFetchEmitTuple.toJson(t, writer);
@@ -69,7 +69,7 @@ public class JsonFetchEmitTupleTest {
new EmitKey("my_emitter", "emitKey1"), m,
new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML,
HandlerConfig.PARSE_MODE.CONCATENATE,
- 10000,10),
+ 10000,10, true),
FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP);
StringWriter writer = new StringWriter();
JsonFetchEmitTuple.toJson(t, writer);
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
index 545d4ae2c..76e24b926 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
@@ -141,7 +141,7 @@ public class RecursiveMetadataResource {
return new HandlerConfig(
BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE),
parseMode,
- writeLimit, maxEmbeddedResources);
+ writeLimit, maxEmbeddedResources, TikaResource.getThrowOnWriteLimitReached(httpHeaders));
}
/**
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 5a60018f2..3d4954df8 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -73,6 +73,7 @@ import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.pipes.HandlerConfig;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
@@ -590,12 +591,15 @@ public class TikaResource {
logRequest(LOG, "/tika", metadata);
int writeLimit = -1;
+ boolean throwOnWriteLimitReached = getThrowOnWriteLimitReached(httpHeaders);
if (httpHeaders.containsKey("writeLimit")) {
writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit"));
}
+
BasicContentHandlerFactory.HANDLER_TYPE type =
BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
- BasicContentHandlerFactory fact = new BasicContentHandlerFactory(type, writeLimit);
+ BasicContentHandlerFactory fact = new BasicContentHandlerFactory(type, writeLimit,
+ throwOnWriteLimitReached, context);
ContentHandler contentHandler = fact.getNewContentHandler();
try {
@@ -630,6 +634,20 @@ public class TikaResource {
}
}
+ public static boolean getThrowOnWriteLimitReached(MultivaluedMap<String, String> httpHeaders) {
+ if (httpHeaders.containsKey("throwOnWriteLimitReached")) {
+ String val = httpHeaders.getFirst("throwOnWriteLimitReached");
+ if ("true".equalsIgnoreCase(val)) {
+ return true;
+ } else if ("false".equalsIgnoreCase(val)) {
+ return false;
+ } else {
+ throw new IllegalArgumentException("'throwOnWriteLimitReached' must be either 'true' or 'false'");
+ }
+ }
+ return HandlerConfig.DEFAULT_HANDLER_CONFIG.isThrowOnWriteLimitReached();
+ }
+
private StreamingOutput produceOutput(final InputStream is, Metadata metadata,
final MultivaluedMap<String, String> httpHeaders,
final UriInfo info, final String format) {
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java
index d760dae1d..b1b73a896 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java
@@ -223,7 +223,7 @@ public class TikaPipesTest extends CXFTestBase {
new EmitKey("fse", ""),
userMetadata,
new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML,
- HandlerConfig.PARSE_MODE.RMETA, -1, -1),
+ HandlerConfig.PARSE_MODE.RMETA, -1, -1, true),
FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
StringWriter writer = new StringWriter();
JsonFetchEmitTuple.toJson(t, writer);
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
index eb5222bfe..82474b38a 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
@@ -170,6 +170,23 @@ public class TikaResourceTest extends CXFTestBase {
assertEquals("true", metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
}
+ @Test
+ public void testNoWriteLimitOnStreamingWrite() throws Exception {
+ //this test shows that the writeLimit header has no effect on
+ //text, xhtml, or any other output that is written as a stream
+ Response response = WebClient.create(endPoint + TIKA_PATH).header("writeLimit", "100")
+ .accept("text/plain")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
+ String content = getStringFromInputStream((InputStream) response.getEntity());
+ assertContains("separation.", content);
+
+ response = WebClient.create(endPoint + TIKA_PATH).header("writeLimit", "100")
+ .accept("text/html")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
+ content = getStringFromInputStream((InputStream) response.getEntity());
+ assertContains("separation.</p>", content);
+ }
+
@Test
public void testJsonHandlerType() throws Exception {
Response response = WebClient.create(endPoint + TIKA_PATH).accept("application/json")
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
index 1663fb71d..3de5c0e65 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
@@ -345,7 +345,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
assertEquals(1, metadataList.size());
assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
- //now try with a write limit of 1000
+ //now try with a write limit of 200
writeLimit = 200;
response = WebClient.create(endPoint + META_PATH).accept("application/json")
.header("writeLimit", Integer.toString(writeLimit))
@@ -378,4 +378,39 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
assertEquals("true", metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
}
+ @Test
+ public void testNoThrowOnWriteLimitReached() throws Exception {
+ int writeLimit = 100;
+ Response response = WebClient.create(endPoint + META_PATH).accept("application/json")
+ .header("writeLimit", Integer.toString(writeLimit))
+ .header("throwOnWriteLimitReached", "false")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+ assertEquals(200, response.getStatus());
+ // Check results
+ Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(10, metadataList.size());
+ assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+
+ //now try with a write limit of 200
+ writeLimit = 200;
+ response = WebClient.create(endPoint + META_PATH).accept("application/json")
+ .header("writeLimit", Integer.toString(writeLimit))
+ .header("throwOnWriteLimitReached", "false")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+ assertEquals(200, response.getStatus());
+ // Check results
+ reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(10, metadataList.size());
+ assertEquals("true", metadataList.get(6).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+ assertContains("When in the Course of human events it becomes necessary for one people",
+ metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT));
+ TikaTest.assertNotContained("We hold these truths",
+ metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT));
+
+ }
+
}
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
index e48455b43..172600ec0 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
@@ -195,7 +195,7 @@ public class TikaPipesTest extends CXFTestBase {
new EmitKey("fse", ""),
new Metadata(),
new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
- HandlerConfig.PARSE_MODE.CONCATENATE, -1, -1000),
+ HandlerConfig.PARSE_MODE.CONCATENATE, -1, -1000, true),
FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
StringWriter writer = new StringWriter();
JsonFetchEmitTuple.toJson(t, writer);
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
index a42c80f6f..a427b6e00 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
@@ -581,7 +581,7 @@ public class TikaResourceTest extends CXFTestBase {
@Test
public void testJsonWriteLimitEmbedded() throws Exception {
Response response =
- WebClient.create(endPoint + TIKA_PATH + "/text").accept("application/json")
+ WebClient.create(endPoint + TIKA_PATH + "/html").accept("application/json")
.header("writeLimit", "500")
.put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
Metadata metadata = JsonMetadata.fromJson(
@@ -594,7 +594,26 @@ public class TikaResourceTest extends CXFTestBase {
assertTrue(metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION)
.startsWith("org.apache.tika.exception.WriteLimitReachedException"));
assertNotFound("embed4.txt", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ }
+ @Test
+ public void testJsonNoThrowWriteLimitEmbedded() throws Exception {
+ Response response =
+ WebClient.create(endPoint + TIKA_PATH + "/html").accept("application/json")
+ .header("writeLimit", "500")
+ .header("throwOnWriteLimitReached", "false")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ Metadata metadata = JsonMetadata.fromJson(
+ new InputStreamReader(((InputStream) response.getEntity()),
+ StandardCharsets.UTF_8));
+ String txt = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+ assertContains("embed2a.txt", txt);
+ assertContains("When in the Course", txt);
+ assertNotFound("declare the causes", txt);
+ assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+ assertEquals("true", metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+ assertContains("<div class=\"embedded\" id=\"embed4.txt",
+ metadata.get(TikaCoreProperties.TIKA_CONTENT));
}
@Test