You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/03/28 11:12:00 UTC
(tika) branch TIKA-4207 updated: TIKA-4207 -- further refactorings to simplify class structure and bring back the default ParsingEmbeddedDocumentExtractor
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4207
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4207 by this push:
new 8cdaff4b3 TIKA-4207 -- further refactorings to simplify class structure and bring back the default ParsingEmbeddedDocumentExtractor
8cdaff4b3 is described below
commit 8cdaff4b3e2a4a477f753f3bfca751d804721a9d
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 28 07:11:46 2024 -0400
TIKA-4207 -- further refactorings to simplify class structure and bring back the default ParsingEmbeddedDocumentExtractor
---
...a => AbstractEmbeddedDocumentBytesHandler.java} | 2 +-
...java => BasicEmbeddedDocumentBytesHandler.java} | 12 ++-
...EmbeddedDocumentByteStoreExtractorFactory.java} | 24 +++---
...tore.java => EmbeddedDocumentBytesHandler.java} | 4 +-
.../tika/extractor/EmbeddedDocumentUtil.java | 2 +-
.../ParsingEmbeddedDocumentExtractor.java | 93 +---------------------
.../ParsingEmbeddedDocumentExtractorFactory.java | 74 +----------------
...ocumentExtractor.java => RUnpackExtractor.java} | 19 +++--
...orFactory.java => RUnpackExtractorFactory.java} | 11 ++-
.../org/apache/tika/parser/AutoDetectParser.java | 11 ++-
.../apache/tika/parser/AutoDetectParserConfig.java | 4 +-
.../java/org/apache/tika/pipes/PipesServer.java | 67 +++++++++++-----
.../extractor/EmbeddedDocumentBytesConfig.java | 9 +++
...a => EmittingEmbeddedDocumentBytesHandler.java} | 15 ++--
.../tika/parser/AutoDetectParserConfigTest.java | 10 +--
.../org/apache/tika/pipes/PipesServerTest.java | 17 +++-
.../config/TIKA-4207-embedded-bytes-config.xml | 2 +-
.../apache/tika/pipes/TIKA-4207-limit-bytes.xml | 2 +-
.../apache/tika/example/ExtractEmbeddedFiles.java | 2 +-
.../parser/microsoft/pst/OutlookPSTParserTest.java | 2 +-
.../apache/tika/parser/pdf/PDFRenderingTest.java | 2 +-
.../resources/configs/tika-config-no-names.xml | 2 +-
.../resources/configs/tika-config-with-names.xml | 2 +-
23 files changed, 142 insertions(+), 246 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java
similarity index 96%
rename from tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java
rename to tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java
index 15b26451a..3f2f38f94 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java
@@ -28,7 +28,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
import org.apache.tika.utils.StringUtils;
-public abstract class AbstractEmbeddedDocumentByteStore implements EmbeddedDocumentByteStore {
+public abstract class AbstractEmbeddedDocumentBytesHandler implements EmbeddedDocumentBytesHandler {
List<Integer> ids = new ArrayList<>();
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java
similarity index 80%
rename from tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java
rename to tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java
index d3aeb4507..cf6441b4f 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java
@@ -27,9 +27,16 @@ import org.apache.commons.io.input.UnsynchronizedBufferedInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
-public class BasicEmbeddedDocumentByteStore extends AbstractEmbeddedDocumentByteStore {
+/**
+ * For now, this is an EmbeddedDocumentBytesHandler that stores
+ * all the bytes in memory. Users can retrieve the documents with {@link #getDocument(int)}.
+ *
+ * We'll need to make this cache to disk at some point if there are many bytes of
+ * embedded documents.
+ */
+public class BasicEmbeddedDocumentBytesHandler extends AbstractEmbeddedDocumentBytesHandler {
private final EmbeddedDocumentBytesConfig config;
- public BasicEmbeddedDocumentByteStore(EmbeddedDocumentBytesConfig config) {
+ public BasicEmbeddedDocumentBytesHandler(EmbeddedDocumentBytesConfig config) {
this.config = config;
}
//this won't scale, but let's start fully in memory for now;
@@ -40,7 +47,6 @@ public class BasicEmbeddedDocumentByteStore extends AbstractEmbeddedDocumentByte
docBytes.put(id, IOUtils.toByteArray(is));
}
- @Override
public InputStream getDocument(int id) throws IOException {
return new UnsynchronizedBufferedInputStream.Builder().setByteArray(docBytes.get(id)).get();
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java
similarity index 59%
copy from tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
copy to tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java
index 8e1e8e325..f7237bd6a 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java
@@ -16,18 +16,18 @@
*/
package org.apache.tika.extractor;
-import java.io.Closeable;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.List;
-import org.apache.tika.metadata.Metadata;
-
-public interface EmbeddedDocumentByteStore extends Closeable {
- //we need metadata for the emitter store...can we get away without it?
- void add(int id, Metadata metadata, InputStream inputStream) throws IOException;
-
- InputStream getDocument(int id) throws IOException;
+/**
+ * Factories that create EmbeddedDocumentExtractors requiring an
+ * {@link EmbeddedDocumentBytesHandler} in the
+ * {@link org.apache.tika.parser.ParseContext} should implement this interface.
+ *
+ * This is a shim interface to signal to {@link org.apache.tika.pipes.PipesServer}
+ * to use the {@link RUnpackExtractor} if the user doesn't configure a custom
+ * EmbeddedDocumentExtractor.
+ *
+ * TODO: Figure out how to simplify this and allow for emitting of the source document.
+ */
+public interface EmbeddedDocumentByteStoreExtractorFactory extends EmbeddedDocumentExtractorFactory {
- List<Integer> getIds();
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java
similarity index 90%
rename from tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
rename to tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java
index 8e1e8e325..12357a718 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java
@@ -23,11 +23,9 @@ import java.util.List;
import org.apache.tika.metadata.Metadata;
-public interface EmbeddedDocumentByteStore extends Closeable {
+public interface EmbeddedDocumentBytesHandler extends Closeable {
//we need metadata for the emitter store...can we get away without it?
void add(int id, Metadata metadata, InputStream inputStream) throws IOException;
- InputStream getDocument(int id) throws IOException;
-
List<Integer> getIds();
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
index 99a3f3921..d6e2c28a8 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -92,7 +92,7 @@ public class EmbeddedDocumentUtil implements Serializable {
context.set(Parser.class, new AutoDetectParser(tikaConfig));
}
}
- EmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(context, 0);
+ EmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(context);
context.set(EmbeddedDocumentExtractor.class, ex);
return ex;
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
index 97cf5b57f..8391624a3 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
@@ -22,12 +22,8 @@ import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
import org.apache.commons.io.input.CloseShieldInputStream;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
@@ -35,7 +31,6 @@ import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.CorruptedFileException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -55,9 +50,6 @@ import org.apache.tika.sax.EmbeddedContentHandler;
*/
public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {
- private static final Logger LOGGER =
- LoggerFactory.getLogger(ParsingEmbeddedDocumentExtractor.class);
-
private static final File ABSTRACT_PATH = new File("");
private static final Parser DELEGATING_PARSER = new DelegatingParser();
@@ -66,14 +58,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
private final ParseContext context;
- private EmbeddedBytesSelector embeddedBytesSelector = EmbeddedBytesSelector.ACCEPT_ALL;
-
- private long bytesExtracted = 0;
- private final long maxEmbeddedBytesForExtraction;
-
- public ParsingEmbeddedDocumentExtractor(ParseContext context, long maxEmbeddedBytesForExtraction) {
+ public ParsingEmbeddedDocumentExtractor(ParseContext context) {
this.context = context;
- this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction;
}
public boolean shouldParseEmbedded(Metadata metadata) {
@@ -113,19 +99,15 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
// Use the delegate parser to parse this entry
try (TemporaryResources tmp = new TemporaryResources()) {
final TikaInputStream newStream =
- TikaInputStream.get(CloseShieldInputStream.wrap(stream), tmp, metadata);
+ TikaInputStream.get(new CloseShieldInputStream(stream), tmp, metadata);
if (stream instanceof TikaInputStream) {
final Object container = ((TikaInputStream) stream).getOpenContainer();
if (container != null) {
newStream.setOpenContainer(container);
}
}
- EmbeddedDocumentByteStore store = context.get(EmbeddedDocumentByteStore.class);
- if (store != null) {
- parseWithBytes(newStream, handler, metadata);
- } else {
- parse(newStream, handler, metadata);
- }
+ DELEGATING_PARSER.parse(newStream, new EmbeddedContentHandler(new BodyContentHandler(handler)),
+ metadata, context);
} catch (EncryptedDocumentException ede) {
recordException(ede, context);
} catch (CorruptedFileException e) {
@@ -141,65 +123,6 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
}
}
- private void parseWithBytes(TikaInputStream stream, ContentHandler handler, Metadata metadata)
- throws TikaException, IOException, SAXException {
- //TODO -- improve the efficiency of this so that we're not
- //literally writing out a file per request
- Path p = stream.getPath();
- try {
- parse(stream, handler, metadata);
- } finally {
- storeEmbeddedBytes(p, metadata);
- }
- }
-
- private void parse(TikaInputStream stream, ContentHandler handler, Metadata metadata)
- throws TikaException, IOException, SAXException {
- DELEGATING_PARSER.parse(stream,
- new EmbeddedContentHandler(new BodyContentHandler(handler)),
- metadata, context);
- }
-
- private void storeEmbeddedBytes(Path p, Metadata metadata) {
- if (! embeddedBytesSelector.select(metadata)) {
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug("skipping embedded bytes {} <-> {}",
- metadata.get(Metadata.CONTENT_TYPE),
- metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
- }
- return;
- }
- EmbeddedDocumentByteStore embeddedDocumentByteStore =
- context.get(EmbeddedDocumentByteStore.class);
- int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID);
- try (InputStream is = Files.newInputStream(p)) {
- if (bytesExtracted >= maxEmbeddedBytesForExtraction) {
- throw new IOException("Bytes extracted (" + bytesExtracted +
- ") >= max allowed (" + maxEmbeddedBytesForExtraction + ")");
- }
- long maxToRead = maxEmbeddedBytesForExtraction - bytesExtracted;
-
- try (BoundedInputStream boundedIs = new BoundedInputStream(maxToRead, is)) {
- embeddedDocumentByteStore.add(id, metadata, boundedIs);
- bytesExtracted += boundedIs.getPos();
- if (boundedIs.hasHitBound()) {
- throw new IOException("Bytes extracted (" + bytesExtracted +
- ") >= max allowed (" + maxEmbeddedBytesForExtraction + "). Truncated " +
- "bytes");
- }
- }
- } catch (IOException e) {
- LOGGER.warn("problem writing out embedded bytes", e);
- //info in metadata doesn't actually make it back to the metadata list
- //because we're filtering and cloning the metadata at the end of the parse
- //which happens before we try to copy out the files.
- //TODO fix this
- //metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION,
- // ExceptionUtils.getStackTrace(e));
- }
- }
-
-
private void recordException(Exception e, ParseContext context) {
ParseRecord record = context.get(ParseRecord.class);
if (record == null) {
@@ -215,12 +138,4 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
public void setWriteFileNameToContent(boolean writeFileNameToContent) {
this.writeFileNameToContent = writeFileNameToContent;
}
-
- public void setEmbeddedBytesSelector(EmbeddedBytesSelector embeddedBytesSelector) {
- this.embeddedBytesSelector = embeddedBytesSelector;
- }
-
- public EmbeddedBytesSelector getEmbeddedBytesSelector() {
- return embeddedBytesSelector;
- }
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
index fd8cf54b1..9136228c4 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
@@ -16,93 +16,25 @@
*/
package org.apache.tika.extractor;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
import org.apache.tika.config.Field;
-import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocumentExtractorFactory {
+public class ParsingEmbeddedDocumentExtractorFactory
+ implements EmbeddedDocumentExtractorFactory {
private boolean writeFileNameToContent = true;
- private Set<String> embeddedBytesIncludeMimeTypes = Collections.EMPTY_SET;
- private Set<String> embeddedBytesExcludeMimeTypes = Collections.EMPTY_SET;
- private Set<String> embeddedBytesIncludeEmbeddedResourceTypes = Collections.EMPTY_SET;
- private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = Collections.EMPTY_SET;
- private long maxEmbeddedBytesForExtraction = 10l * 1024l * 1024l * 1024l;//10GB
@Field
public void setWriteFileNameToContent(boolean writeFileNameToContent) {
this.writeFileNameToContent = writeFileNameToContent;
}
- @Field
- public void setEmbeddedBytesIncludeMimeTypes(List<String> includeMimeTypes) {
- embeddedBytesIncludeMimeTypes = new HashSet<>();
- embeddedBytesIncludeMimeTypes.addAll(includeMimeTypes);
- }
-
- @Field
- public void setEmbeddedBytesExcludeMimeTypes(List<String> excludeMimeTypes) {
- embeddedBytesExcludeMimeTypes = new HashSet<>();
- embeddedBytesExcludeMimeTypes.addAll(excludeMimeTypes);
-
- }
-
- @Field
- public void setEmbeddedBytesIncludeEmbeddedResourceTypes(List<String> includeAttachmentTypes) {
- embeddedBytesIncludeEmbeddedResourceTypes = new HashSet<>();
- embeddedBytesIncludeEmbeddedResourceTypes.addAll(includeAttachmentTypes);
-
- }
-
- @Field
- public void setEmbeddedBytesExcludeEmbeddedResourceTypes(List<String> excludeAttachmentTypes) {
- embeddedBytesExcludeEmbeddedResourceTypes = new HashSet<>();
- embeddedBytesExcludeEmbeddedResourceTypes.addAll(excludeAttachmentTypes);
-
- }
-
- /**
- * Total number of bytes to write out. A good zip bomb may contain petabytes
- * compressed into a few kb. Make sure that you can't fill up a disk!
- *
- * This does not include the container file in the count of bytes written out.
- * This only counts the lengths of the embedded files.
- *
- * @param maxEmbeddedBytesForExtraction
- */
- @Field
- public void setMaxEmbeddedBytesForExtraction(long maxEmbeddedBytesForExtraction) throws TikaConfigException {
- if (maxEmbeddedBytesForExtraction < 0) {
- throw new TikaConfigException("maxEmbeddedBytesForExtraction must be >= 0");
- }
- this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction;
- }
-
@Override
public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) {
ParsingEmbeddedDocumentExtractor ex =
- new ParsingEmbeddedDocumentExtractor(parseContext, maxEmbeddedBytesForExtraction);
+ new ParsingEmbeddedDocumentExtractor(parseContext);
ex.setWriteFileNameToContent(writeFileNameToContent);
- ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector());
return ex;
}
-
-
- private EmbeddedBytesSelector createEmbeddedBytesSelector() {
- if (embeddedBytesIncludeMimeTypes.size() == 0 &&
- embeddedBytesExcludeMimeTypes.size() == 0 &&
- embeddedBytesIncludeEmbeddedResourceTypes.size() == 0 &&
- embeddedBytesExcludeEmbeddedResourceTypes.size() == 0) {
- return EmbeddedBytesSelector.ACCEPT_ALL;
- }
- return new BasicEmbeddedBytesSelector(embeddedBytesIncludeMimeTypes,
- embeddedBytesExcludeMimeTypes, embeddedBytesIncludeEmbeddedResourceTypes,
- embeddedBytesExcludeEmbeddedResourceTypes);
- }
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
similarity index 92%
copy from tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
copy to tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
index 97cf5b57f..4c69d0997 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
@@ -48,12 +48,11 @@ import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
/**
- * Helper class for parsers of package archives or other compound document
- * formats that support embedded or attached component documents.
+ * Recursive Unpacker and text and metadata extractor.
*
- * @since Apache Tika 0.8
+ * @since Apache Tika 3.0.0
*/
-public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {
+public class RUnpackExtractor implements EmbeddedDocumentExtractor {
private static final Logger LOGGER =
LoggerFactory.getLogger(ParsingEmbeddedDocumentExtractor.class);
@@ -71,7 +70,7 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
private long bytesExtracted = 0;
private final long maxEmbeddedBytesForExtraction;
- public ParsingEmbeddedDocumentExtractor(ParseContext context, long maxEmbeddedBytesForExtraction) {
+ public RUnpackExtractor(ParseContext context, long maxEmbeddedBytesForExtraction) {
this.context = context;
this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction;
}
@@ -120,8 +119,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
newStream.setOpenContainer(container);
}
}
- EmbeddedDocumentByteStore store = context.get(EmbeddedDocumentByteStore.class);
- if (store != null) {
+ EmbeddedDocumentBytesHandler bytesHandler = context.get(EmbeddedDocumentBytesHandler.class);
+ if (bytesHandler != null) {
parseWithBytes(newStream, handler, metadata);
} else {
parse(newStream, handler, metadata);
@@ -169,8 +168,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
}
return;
}
- EmbeddedDocumentByteStore embeddedDocumentByteStore =
- context.get(EmbeddedDocumentByteStore.class);
+ EmbeddedDocumentBytesHandler embeddedDocumentBytesHandler =
+ context.get(EmbeddedDocumentBytesHandler.class);
int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID);
try (InputStream is = Files.newInputStream(p)) {
if (bytesExtracted >= maxEmbeddedBytesForExtraction) {
@@ -180,7 +179,7 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
long maxToRead = maxEmbeddedBytesForExtraction - bytesExtracted;
try (BoundedInputStream boundedIs = new BoundedInputStream(maxToRead, is)) {
- embeddedDocumentByteStore.add(id, metadata, boundedIs);
+ embeddedDocumentBytesHandler.add(id, metadata, boundedIs);
bytesExtracted += boundedIs.getPos();
if (boundedIs.hasHitBound()) {
throw new IOException("Bytes extracted (" + bytesExtracted +
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
similarity index 91%
copy from tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
copy to tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
index fd8cf54b1..a715ed25f 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
@@ -26,7 +26,9 @@ import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocumentExtractorFactory {
+public class RUnpackExtractorFactory implements EmbeddedDocumentByteStoreExtractorFactory {
+
+ public static long DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION = 10l * 1024l * 1024l * 1024l;
private boolean writeFileNameToContent = true;
private Set<String> embeddedBytesIncludeMimeTypes = Collections.EMPTY_SET;
@@ -34,7 +36,7 @@ public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocument
private Set<String> embeddedBytesIncludeEmbeddedResourceTypes = Collections.EMPTY_SET;
private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = Collections.EMPTY_SET;
- private long maxEmbeddedBytesForExtraction = 10l * 1024l * 1024l * 1024l;//10GB
+ private long maxEmbeddedBytesForExtraction = DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION;
@Field
public void setWriteFileNameToContent(boolean writeFileNameToContent) {
this.writeFileNameToContent = writeFileNameToContent;
@@ -86,8 +88,9 @@ public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocument
@Override
public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) {
- ParsingEmbeddedDocumentExtractor ex =
- new ParsingEmbeddedDocumentExtractor(parseContext, maxEmbeddedBytesForExtraction);
+ RUnpackExtractor ex =
+ new RUnpackExtractor(parseContext,
+ maxEmbeddedBytesForExtraction);
ex.setWriteFileNameToContent(writeFileNameToContent);
ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector());
return ex;
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index d333c2e9a..86eae692a 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -28,6 +28,8 @@ import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.HttpHeaders;
@@ -197,7 +199,6 @@ public class AutoDetectParser extends CompositeParser {
createSecureContentHandler(handler, tis, autoDetectParserConfig) : null;
initializeEmbeddedDocumentExtractor(metadata, context);
-
try {
// Parse the document
super.parse(tis, sch, metadata, context);
@@ -267,8 +268,12 @@ public class AutoDetectParser extends CompositeParser {
if (p == null) {
context.set(Parser.class, this);
}
- EmbeddedDocumentExtractor edx = autoDetectParserConfig.getEmbeddedDocumentExtractorFactory()
- .newInstance(metadata, context);
+ EmbeddedDocumentExtractorFactory edxf =
+ autoDetectParserConfig.getEmbeddedDocumentExtractorFactory();
+ if (edxf == null) {
+ edxf = new ParsingEmbeddedDocumentExtractorFactory();
+ }
+ EmbeddedDocumentExtractor edx = edxf.newInstance(metadata, context);
context.set(EmbeddedDocumentExtractor.class, edx);
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
index bc4904367..afe65b07e 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
@@ -25,7 +25,6 @@ import org.xml.sax.ContentHandler;
import org.apache.tika.config.ConfigBase;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory;
import org.apache.tika.sax.ContentHandlerDecoratorFactory;
@@ -87,8 +86,7 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
private MetadataWriteFilterFactory metadataWriteFilterFactory = null;
- private EmbeddedDocumentExtractorFactory embeddedDocumentExtractorFactory =
- new ParsingEmbeddedDocumentExtractorFactory();
+ private EmbeddedDocumentExtractorFactory embeddedDocumentExtractorFactory = null;
private ContentHandlerDecoratorFactory contentHandlerDecoratorFactory =
NOOP_CONTENT_HANDLER_DECORATOR_FACTORY;
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
index 5cc22d378..d8957368d 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
@@ -45,9 +45,14 @@ import org.apache.tika.detect.Detector;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.BasicEmbeddedDocumentByteStore;
+import org.apache.tika.extractor.BasicEmbeddedDocumentBytesHandler;
import org.apache.tika.extractor.DocumentSelector;
-import org.apache.tika.extractor.EmbeddedDocumentByteStore;
+import org.apache.tika.extractor.EmbeddedDocumentByteStoreExtractorFactory;
+import org.apache.tika.extractor.EmbeddedDocumentBytesHandler;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
+import org.apache.tika.extractor.RUnpackExtractor;
+import org.apache.tika.extractor.RUnpackExtractorFactory;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -64,7 +69,7 @@ import org.apache.tika.pipes.emitter.Emitter;
import org.apache.tika.pipes.emitter.EmitterManager;
import org.apache.tika.pipes.emitter.StreamEmitter;
import org.apache.tika.pipes.emitter.TikaEmitterException;
-import org.apache.tika.pipes.extractor.EmbeddedDocumentEmitterStore;
+import org.apache.tika.pipes.extractor.EmittingEmbeddedDocumentBytesHandler;
import org.apache.tika.pipes.fetcher.FetchKey;
import org.apache.tika.pipes.fetcher.Fetcher;
import org.apache.tika.pipes.fetcher.FetcherManager;
@@ -381,9 +386,9 @@ public class PipesServer implements Runnable {
emitParseData(t, parseData);
} finally {
if (parseData != null && parseData.hasEmbeddedDocumentByteStore() &&
- parseData.getEmbeddedDocumentByteStore() instanceof Closeable) {
+ parseData.getEmbeddedDocumentBytesHandler() instanceof Closeable) {
try {
- ((Closeable) parseData.getEmbeddedDocumentByteStore()).close();
+ ((Closeable) parseData.getEmbeddedDocumentBytesHandler()).close();
} catch (IOException e) {
LOG.warn("problem closing embedded document byte store", e);
}
@@ -536,7 +541,7 @@ public class PipesServer implements Runnable {
}
return new MetadataListAndEmbeddedBytes(metadataList,
- parseContext.get(EmbeddedDocumentByteStore.class));
+ parseContext.get(EmbeddedDocumentBytesHandler.class));
}
private ParseContext createParseContext(FetchEmitTuple fetchEmitTuple)
@@ -545,14 +550,28 @@ public class PipesServer implements Runnable {
if (! fetchEmitTuple.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes()) {
return parseContext;
}
-
- //TODO: clean this up.
+ EmbeddedDocumentExtractorFactory factory = ((AutoDetectParser)autoDetectParser)
+ .getAutoDetectParserConfig().getEmbeddedDocumentExtractorFactory();
+ if (factory == null) {
+ parseContext.set(EmbeddedDocumentExtractor.class, new RUnpackExtractor(parseContext,
+ RUnpackExtractorFactory.DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION));
+ } else {
+ if (! (factory instanceof EmbeddedDocumentByteStoreExtractorFactory)) {
+ throw new TikaConfigException("EmbeddedDocumentExtractorFactory must be an " +
+ "instance of EmbeddedDocumentByteStoreExtractorFactory if you want" +
+ "to extract embedded bytes! I see this embedded doc factory: " +
+ factory.getClass() + "and a request: " +
+ fetchEmitTuple.getEmbeddedDocumentBytesConfig());
+ }
+ }
+ //TODO: especially clean this up.
if (!StringUtils.isBlank(fetchEmitTuple.getEmbeddedDocumentBytesConfig().getEmitter())) {
- parseContext.set(EmbeddedDocumentByteStore.class,
- new EmbeddedDocumentEmitterStore(fetchEmitTuple.getEmitKey(),
+ parseContext.set(EmbeddedDocumentBytesHandler.class,
+ new EmittingEmbeddedDocumentBytesHandler(fetchEmitTuple.getEmitKey(),
fetchEmitTuple.getEmbeddedDocumentBytesConfig(), emitterManager));
} else {
- parseContext.set(EmbeddedDocumentByteStore.class, new BasicEmbeddedDocumentByteStore(
+ parseContext.set(EmbeddedDocumentBytesHandler.class,
+ new BasicEmbeddedDocumentBytesHandler(
fetchEmitTuple.getEmbeddedDocumentBytesConfig()));
}
return parseContext;
@@ -677,8 +696,8 @@ public class PipesServer implements Runnable {
if (t.getEmbeddedDocumentBytesConfig() != null &&
t.getEmbeddedDocumentBytesConfig().isIncludeOriginal()) {
- EmbeddedDocumentByteStore embeddedDocumentByteStore =
- parseContext.get(EmbeddedDocumentByteStore.class);
+ EmbeddedDocumentBytesHandler embeddedDocumentByteStore =
+ parseContext.get(EmbeddedDocumentBytesHandler.class);
try (InputStream is = Files.newInputStream(tis.getPath())) {
embeddedDocumentByteStore.add(0, metadata, is);
} catch (IOException e) {
@@ -747,6 +766,14 @@ public class PipesServer implements Runnable {
//override this value because we'll be digesting before parse
((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig().getDigesterFactory()
.setSkipContainerDocument(true);
+ //if the user hasn't configured an embedded document extractor, set up the
+ // RUnpackExtractorFactory
+ if (((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig()
+ .getEmbeddedDocumentExtractorFactory() == null) {
+ ((AutoDetectParser) autoDetectParser)
+ .getAutoDetectParserConfig().setEmbeddedDocumentExtractorFactory(
+ new RUnpackExtractorFactory());
+ }
}
this.detector = ((AutoDetectParser) this.autoDetectParser).getDetector();
this.rMetaParser = new RecursiveParserWrapper(autoDetectParser);
@@ -809,20 +836,20 @@ public class PipesServer implements Runnable {
class MetadataListAndEmbeddedBytes {
final List<Metadata> metadataList;
- final Optional<EmbeddedDocumentByteStore> embeddedDocumentByteStore;
+ final Optional<EmbeddedDocumentBytesHandler> embeddedDocumentBytesHandler;
public MetadataListAndEmbeddedBytes(List<Metadata> metadataList,
- EmbeddedDocumentByteStore embeddedDocumentByteStore) {
+ EmbeddedDocumentBytesHandler embeddedDocumentBytesHandler) {
this.metadataList = metadataList;
- this.embeddedDocumentByteStore = Optional.ofNullable(embeddedDocumentByteStore);
+ this.embeddedDocumentBytesHandler = Optional.ofNullable(embeddedDocumentBytesHandler);
}
public List<Metadata> getMetadataList() {
return metadataList;
}
- public EmbeddedDocumentByteStore getEmbeddedDocumentByteStore() {
- return embeddedDocumentByteStore.get();
+ public EmbeddedDocumentBytesHandler getEmbeddedDocumentBytesHandler() {
+ return embeddedDocumentBytesHandler.get();
}
/**
@@ -832,7 +859,7 @@ public class PipesServer implements Runnable {
* @return
*/
public boolean hasEmbeddedDocumentByteStore() {
- return embeddedDocumentByteStore.isPresent();
+ return embeddedDocumentBytesHandler.isPresent();
}
/**
@@ -844,7 +871,7 @@ public class PipesServer implements Runnable {
* @return
*/
public boolean toBePackagedForStreamEmitter() {
- return !(embeddedDocumentByteStore.get() instanceof EmbeddedDocumentEmitterStore);
+ return !(embeddedDocumentBytesHandler.get() instanceof EmittingEmbeddedDocumentBytesHandler);
}
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
index 66b7321ac..071de05c4 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
@@ -55,6 +55,15 @@ public class EmbeddedDocumentBytesConfig implements Serializable {
private boolean includeOriginal = false;
+ /**
+ * Create an EmbeddedDocumentBytesConfig with
+ * {@link EmbeddedDocumentBytesConfig#extractEmbeddedDocumentBytes}
+ * set to <code>true</code>
+ */
+ public EmbeddedDocumentBytesConfig() {
+ this.extractEmbeddedDocumentBytes = true;
+ }
+
public EmbeddedDocumentBytesConfig(boolean extractEmbeddedDocumentBytes) {
this.extractEmbeddedDocumentBytes = extractEmbeddedDocumentBytes;
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java
similarity index 83%
rename from tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java
rename to tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java
index 5d09cfe18..1132a4bc6 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java
@@ -23,7 +23,7 @@ import java.io.InputStream;
import org.apache.commons.io.IOExceptionWithCause;
import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.extractor.AbstractEmbeddedDocumentByteStore;
+import org.apache.tika.extractor.AbstractEmbeddedDocumentBytesHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.emitter.EmitKey;
import org.apache.tika.pipes.emitter.Emitter;
@@ -31,15 +31,15 @@ import org.apache.tika.pipes.emitter.EmitterManager;
import org.apache.tika.pipes.emitter.StreamEmitter;
import org.apache.tika.pipes.emitter.TikaEmitterException;
-public class EmbeddedDocumentEmitterStore extends AbstractEmbeddedDocumentByteStore {
+public class EmittingEmbeddedDocumentBytesHandler extends AbstractEmbeddedDocumentBytesHandler {
private final EmitKey containerEmitKey;
private final EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig;
private final StreamEmitter emitter;
private static final Metadata METADATA = new Metadata();
- public EmbeddedDocumentEmitterStore(EmitKey containerEmitKey,
- EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig,
- EmitterManager emitterManager) throws TikaConfigException {
+ public EmittingEmbeddedDocumentBytesHandler(EmitKey containerEmitKey,
+ EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig,
+ EmitterManager emitterManager) throws TikaConfigException {
this.containerEmitKey = containerEmitKey;
this.embeddedDocumentBytesConfig = embeddedDocumentBytesConfig;
Emitter tmpEmitter =
@@ -64,11 +64,6 @@ public class EmbeddedDocumentEmitterStore extends AbstractEmbeddedDocumentByteSt
}
}
- @Override
- public InputStream getDocument(int id) {
- throw new UnsupportedOperationException("this is emit only.");
- }
-
@Override
public void close() throws IOException {
if (emitter instanceof Closeable) {
diff --git a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
index a0d5d4896..62b061d98 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -25,8 +25,8 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.extractor.EmbeddedBytesSelector;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory;
+import org.apache.tika.extractor.RUnpackExtractor;
+import org.apache.tika.extractor.RUnpackExtractorFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.utils.StringUtils;
@@ -41,12 +41,12 @@ public class AutoDetectParserConfigTest {
config = new TikaConfig(is);
}
AutoDetectParserConfig c = config.getAutoDetectParserConfig();
- ParsingEmbeddedDocumentExtractorFactory f =
- (ParsingEmbeddedDocumentExtractorFactory) c.getEmbeddedDocumentExtractorFactory();
+ RUnpackExtractorFactory f =
+ (RUnpackExtractorFactory) c.getEmbeddedDocumentExtractorFactory();
Metadata metadata = new Metadata();
ParseContext parseContext = new ParseContext();
- ParsingEmbeddedDocumentExtractor ex = (ParsingEmbeddedDocumentExtractor) f.newInstance(metadata, parseContext);
+ RUnpackExtractor ex = (RUnpackExtractor) f.newInstance(metadata, parseContext);
EmbeddedBytesSelector selector = ex.getEmbeddedBytesSelector();
assertFalse(selector.select(getMetadata("", "")));
assertTrue(selector.select(getMetadata("application/pdf", "")));
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
index 6f55e5d11..6794f1a8f 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
@@ -32,6 +32,7 @@ import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.BasicEmbeddedDocumentBytesHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.emitter.EmitKey;
import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
@@ -117,9 +118,13 @@ public class PipesServerTest extends TikaTest {
assertEquals(2, parseData.metadataList.size());
byte[] bytes0 =
- IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(0));
+ IOUtils.toByteArray(
+ ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler())
+ .getDocument(0));
byte[] bytes1 =
- IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(1));
+ IOUtils.toByteArray(
+ ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler())
+ .getDocument(1));
assertContains("is to trigger mock on the embedded",
new String(bytes0, StandardCharsets.UTF_8));
@@ -170,9 +175,13 @@ public class PipesServerTest extends TikaTest {
assertEquals(2, parseData.metadataList.size());
byte[] bytes0 =
- IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(0));
+ IOUtils.toByteArray(
+ ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler())
+ .getDocument(0));
byte[] bytes1 =
- IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(1));
+ IOUtils.toByteArray(
+ ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler())
+ .getDocument(1));
assertContains("is to trigger mock on the embedded",
new String(bytes0, StandardCharsets.UTF_8));
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml
index d60c6b1ca..5e1339a40 100644
--- a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml
@@ -22,7 +22,7 @@
<autoDetectParserConfig>
<spoolToDisk>123450</spoolToDisk>
<outputThreshold>678900</outputThreshold>
- <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory">
+ <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.RUnpackExtractorFactory">
<writeFileNameToContent>false</writeFileNameToContent>
<embeddedBytesIncludeMimeTypes>
<mime>application/pdf</mime>
diff --git a/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml
index 610bad77b..5e46a09e9 100644
--- a/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml
+++ b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml
@@ -20,7 +20,7 @@
<digesterFactory class="org.apache.tika.pipes.async.MockDigesterFactory">
<skipContainerDocument>false</skipContainerDocument>
</digesterFactory>
- <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory">
+ <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.RUnpackExtractorFactory">
<writeFileNameToContent>false</writeFileNameToContent>
<maxEmbeddedBytesForExtraction>10</maxEmbeddedBytesForExtraction>
</embeddedDocumentExtractorFactory>
diff --git a/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java b/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
index 43c0b1d3a..091facc21 100644
--- a/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
+++ b/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
@@ -64,7 +64,7 @@ public class ExtractEmbeddedFiles {
private int fileCount = 0;
private MyEmbeddedDocumentExtractor(Path outputDir, ParseContext context) {
- super(context, 1000000l);
+ super(context);
this.outputDir = outputDir;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
index bcd45460c..c95547aee 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
@@ -150,7 +150,7 @@ public class OutlookPSTParserTest extends TikaTest {
List<Metadata> trackingMetadata = new ArrayList<>();
public EmbeddedTrackingExtrator(ParseContext context) {
- super(context, 0);
+ super(context);
}
@Override
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
index 8503e8bd8..08d18b6c1 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
@@ -112,7 +112,7 @@ public class PDFRenderingTest extends TikaTest {
Map<Integer, byte[]> embedded = new HashMap<>();
public RenderCaptureExtractor(ParseContext context) {
- super(context, 0);
+ super(context);
}
public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml
index 0e2f26bd2..9cedc9ed4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml
@@ -22,7 +22,7 @@
<autoDetectParserConfig>
<spoolToDisk>123450</spoolToDisk>
<outputThreshold>678900</outputThreshold>
- <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory">
+ <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.RUnpackExtractorFactory">
<writeFileNameToContent>false</writeFileNameToContent>
</embeddedDocumentExtractorFactory>
</autoDetectParserConfig>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.xml
index f54eb9a0a..369acafc9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.xml
@@ -22,7 +22,7 @@
<autoDetectParserConfig>
<spoolToDisk>123450</spoolToDisk>
<outputThreshold>678900</outputThreshold>
- <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory">
+ <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.RUnpackExtractorFactory">
<writeFileNameToContent>true</writeFileNameToContent>
</embeddedDocumentExtractorFactory>
</autoDetectParserConfig>