You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/03/21 21:19:52 UTC
(tika) branch TIKA-4207 updated: TIKA-4207 -- refactor to use inputstreams instead of byte arrays. add max bytes extracted
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4207
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4207 by this push:
new 59608e69b TIKA-4207 -- refactor to use inputstreams instead of byte arrays. add max bytes extracted
59608e69b is described below
commit 59608e69bdaeb8a8151e1e9f27b1ef7c3030288b
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 21 17:19:37 2024 -0400
TIKA-4207 -- refactor to use inputstreams instead of byte arrays. add max bytes extracted
---
.../AbstractEmbeddedDocumentByteStore.java | 3 +-
.../extractor/BasicEmbeddedDocumentByteStore.java | 16 ++--
.../tika/extractor/EmbeddedDocumentByteStore.java | 5 +-
.../tika/extractor/EmbeddedDocumentUtil.java | 2 +-
.../ParsingEmbeddedDocumentExtractor.java | 40 +++++++--
.../ParsingEmbeddedDocumentExtractorFactory.java | 22 ++++-
.../org/apache/tika/io/BoundedInputStream.java | 4 +
.../java/org/apache/tika/pipes/PipesServer.java | 5 +-
.../extractor/EmbeddedDocumentBytesConfig.java | 6 +-
.../extractor/EmbeddedDocumentEmitterStore.java | 9 +-
.../org/apache/tika/pipes/PipesServerTest.java | 58 ++++++++++++-
.../apache/tika/pipes/TIKA-4207-limit-bytes.xml | 34 ++++++++
.../parser/microsoft/pst/OutlookPSTParserTest.java | 2 +-
.../apache/tika/parser/pdf/PDFRenderingTest.java | 2 +-
.../apache/tika/server/standard/TikaPipesTest.java | 97 +++++++++++++++++++++-
15 files changed, 270 insertions(+), 35 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java
index 214c2ab4e..15b26451a 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java
@@ -17,6 +17,7 @@
package org.apache.tika.extractor;
import java.io.IOException;
+import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
@@ -57,7 +58,7 @@ public abstract class AbstractEmbeddedDocumentByteStore implements EmbeddedDocum
}
@Override
- public void add(int id, Metadata metadata, byte[] bytes) throws IOException {
+ public void add(int id, Metadata metadata, InputStream bytes) throws IOException {
ids.add(id);
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java
index b41285eb0..d3aeb4507 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java
@@ -17,9 +17,13 @@
package org.apache.tika.extractor;
import java.io.IOException;
+import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.UnsynchronizedBufferedInputStream;
+
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
@@ -30,13 +34,15 @@ public class BasicEmbeddedDocumentByteStore extends AbstractEmbeddedDocumentByte
}
//this won't scale, but let's start fully in memory for now;
Map<Integer, byte[]> docBytes = new HashMap<>();
- public void add(int id, Metadata metadata, byte[] bytes) throws IOException {
- super.add(id, metadata, bytes);
- docBytes.put(id, bytes);
+ @Override
+ public void add(int id, Metadata metadata, InputStream is) throws IOException {
+ super.add(id, metadata, is);
+ docBytes.put(id, IOUtils.toByteArray(is));
}
- public byte[] getDocument(int id) {
- return docBytes.get(id);
+ @Override
+ public InputStream getDocument(int id) throws IOException {
+ return new UnsynchronizedBufferedInputStream.Builder().setByteArray(docBytes.get(id)).get();
}
@Override
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
index ad1bb81f3..8e1e8e325 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
@@ -18,15 +18,16 @@ package org.apache.tika.extractor;
import java.io.Closeable;
import java.io.IOException;
+import java.io.InputStream;
import java.util.List;
import org.apache.tika.metadata.Metadata;
public interface EmbeddedDocumentByteStore extends Closeable {
//we need metadata for the emitter store...can we get away without it?
- void add(int id, Metadata metadata, byte[] bytes) throws IOException;
+ void add(int id, Metadata metadata, InputStream inputStream) throws IOException;
- byte[] getDocument(int id);
+ InputStream getDocument(int id) throws IOException;
List<Integer> getIds();
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
index d6e2c28a8..99a3f3921 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -92,7 +92,7 @@ public class EmbeddedDocumentUtil implements Serializable {
context.set(Parser.class, new AutoDetectParser(tikaConfig));
}
}
- EmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(context);
+ EmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(context, 0);
context.set(EmbeddedDocumentExtractor.class, ex);
return ex;
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
index ee15c1e22..97cf5b57f 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
@@ -35,6 +35,7 @@ import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.CorruptedFileException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -45,7 +46,6 @@ import org.apache.tika.parser.ParseRecord;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.utils.ExceptionUtils;
/**
* Helper class for parsers of package archives or other compound document
@@ -68,8 +68,12 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
private EmbeddedBytesSelector embeddedBytesSelector = EmbeddedBytesSelector.ACCEPT_ALL;
- public ParsingEmbeddedDocumentExtractor(ParseContext context) {
+ private long bytesExtracted = 0;
+ private final long maxEmbeddedBytesForExtraction;
+
+ public ParsingEmbeddedDocumentExtractor(ParseContext context, long maxEmbeddedBytesForExtraction) {
this.context = context;
+ this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction;
}
public boolean shouldParseEmbedded(Metadata metadata) {
@@ -139,6 +143,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
private void parseWithBytes(TikaInputStream stream, ContentHandler handler, Metadata metadata)
throws TikaException, IOException, SAXException {
+ //TODO -- improve the efficiency of this so that we're not
+ //literally writing out a file per request
Path p = stream.getPath();
try {
parse(stream, handler, metadata);
@@ -157,7 +163,7 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
private void storeEmbeddedBytes(Path p, Metadata metadata) {
if (! embeddedBytesSelector.select(metadata)) {
if (LOGGER.isDebugEnabled()) {
- LOGGER.debug("skipping embedded bytes {} {}",
+ LOGGER.debug("skipping embedded bytes {} <-> {}",
metadata.get(Metadata.CONTENT_TYPE),
metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
}
@@ -166,12 +172,30 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
EmbeddedDocumentByteStore embeddedDocumentByteStore =
context.get(EmbeddedDocumentByteStore.class);
int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID);
-
- try {
- embeddedDocumentByteStore.add(id, metadata, Files.readAllBytes(p));
+ try (InputStream is = Files.newInputStream(p)) {
+ if (bytesExtracted >= maxEmbeddedBytesForExtraction) {
+ throw new IOException("Bytes extracted (" + bytesExtracted +
+ ") >= max allowed (" + maxEmbeddedBytesForExtraction + ")");
+ }
+ long maxToRead = maxEmbeddedBytesForExtraction - bytesExtracted;
+
+ try (BoundedInputStream boundedIs = new BoundedInputStream(maxToRead, is)) {
+ embeddedDocumentByteStore.add(id, metadata, boundedIs);
+ bytesExtracted += boundedIs.getPos();
+ if (boundedIs.hasHitBound()) {
+ throw new IOException("Bytes extracted (" + bytesExtracted +
+ ") >= max allowed (" + maxEmbeddedBytesForExtraction + "). Truncated " +
+ "bytes");
+ }
+ }
} catch (IOException e) {
- metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION,
- ExceptionUtils.getStackTrace(e));
+ LOGGER.warn("problem writing out embedded bytes", e);
+ //info in metadata doesn't actually make it back to the metadata list
+ //because we're filtering and cloning the metadata at the end of the parse
+ //which happens before we try to copy out the files.
+ //TODO fix this
+ //metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION,
+ // ExceptionUtils.getStackTrace(e));
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
index 7632ed49c..fd8cf54b1 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
@@ -22,6 +22,7 @@ import java.util.List;
import java.util.Set;
import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
@@ -33,6 +34,7 @@ public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocument
private Set<String> embeddedBytesIncludeEmbeddedResourceTypes = Collections.EMPTY_SET;
private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = Collections.EMPTY_SET;
+ private long maxEmbeddedBytesForExtraction = 10l * 1024l * 1024l * 1024l;//10GB
@Field
public void setWriteFileNameToContent(boolean writeFileNameToContent) {
this.writeFileNameToContent = writeFileNameToContent;
@@ -65,15 +67,33 @@ public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocument
}
+ /**
+ * Sets the maximum total number of embedded-file bytes to write out. A good zip bomb
+ * may contain petabytes compressed into a few kb. Make sure that you can't fill up a disk!
+ * The container file is not counted; only the lengths of the embedded files are.
+ * The default is 10GB.
+ *
+ * @param maxEmbeddedBytesForExtraction maximum number of embedded bytes to extract; must be &gt;= 0
+ * @throws TikaConfigException if the value is negative
+ */
+ @Field
+ public void setMaxEmbeddedBytesForExtraction(long maxEmbeddedBytesForExtraction) throws TikaConfigException {
+ if (maxEmbeddedBytesForExtraction < 0) {
+ throw new TikaConfigException("maxEmbeddedBytesForExtraction must be >= 0");
+ }
+ this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction;
+ }
@Override
public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) {
- ParsingEmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(parseContext);
+ ParsingEmbeddedDocumentExtractor ex =
+ new ParsingEmbeddedDocumentExtractor(parseContext, maxEmbeddedBytesForExtraction);
ex.setWriteFileNameToContent(writeFileNameToContent);
ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector());
return ex;
}
+
private EmbeddedBytesSelector createEmbeddedBytesSelector() {
if (embeddedBytesIncludeMimeTypes.size() == 0 &&
embeddedBytesExcludeMimeTypes.size() == 0 &&
diff --git a/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java
index a80009cd2..31290cc1a 100644
--- a/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java
@@ -147,4 +147,8 @@ public class BoundedInputStream extends InputStream {
public long transferTo(OutputStream out) throws IOException {
return in.transferTo(out);
}
+
+ public long getPos() { //returns the pos field -- presumably the number of bytes read so far (TODO confirm)
+ return pos;
+ }
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
index 851805d06..5cc22d378 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
@@ -546,6 +546,7 @@ public class PipesServer implements Runnable {
return parseContext;
}
+ //TODO: clean this up.
if (!StringUtils.isBlank(fetchEmitTuple.getEmbeddedDocumentBytesConfig().getEmitter())) {
parseContext.set(EmbeddedDocumentByteStore.class,
new EmbeddedDocumentEmitterStore(fetchEmitTuple.getEmitKey(),
@@ -678,8 +679,8 @@ public class PipesServer implements Runnable {
t.getEmbeddedDocumentBytesConfig().isIncludeOriginal()) {
EmbeddedDocumentByteStore embeddedDocumentByteStore =
parseContext.get(EmbeddedDocumentByteStore.class);
- try {
- embeddedDocumentByteStore.add(0, metadata, Files.readAllBytes(tis.getPath()));
+ try (InputStream is = Files.newInputStream(tis.getPath())) {
+ embeddedDocumentByteStore.add(0, metadata, is);
} catch (IOException e) {
LOG.warn("problem reading source file into embedded document byte store", e);
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
index 42538ff80..66b7321ac 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
@@ -44,11 +44,7 @@ public class EmbeddedDocumentBytesConfig implements Serializable {
}
}
private final boolean extractEmbeddedDocumentBytes;
- //TODO -- add these at some point
- /*
- private Set<String> includeMimeTypes = new HashSet<>();
- private Set<String> excludeMimeTypes = new HashSet<>();
- */
+
private int zeroPadName = 0;
private SUFFIX_STRATEGY suffixStrategy = SUFFIX_STRATEGY.NONE;
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java
index 915b44d44..5d09cfe18 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java
@@ -18,9 +18,9 @@ package org.apache.tika.pipes.extractor;
import java.io.Closeable;
import java.io.IOException;
+import java.io.InputStream;
import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.extractor.AbstractEmbeddedDocumentByteStore;
@@ -53,20 +53,19 @@ public class EmbeddedDocumentEmitterStore extends AbstractEmbeddedDocumentByteSt
}
@Override
- public void add(int id, Metadata metadata, byte[] bytes) throws IOException {
+ public void add(int id, Metadata metadata, InputStream inputStream) throws IOException {
//intentionally do not call super.add, because we want the ids list to be empty
String emitKey = getEmitKey(containerEmitKey.getEmitKey(),
id, embeddedDocumentBytesConfig, metadata);
-
try {
- emitter.emit(emitKey, new UnsynchronizedByteArrayInputStream(bytes), METADATA);
+ emitter.emit(emitKey, inputStream, METADATA);
} catch (TikaEmitterException e) {
throw new IOExceptionWithCause(e);
}
}
@Override
- public byte[] getDocument(int id) {
+ public InputStream getDocument(int id) {
throw new UnsupportedOperationException("this is emit only.");
}
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
index 857bf485f..6f55e5d11 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
@@ -116,8 +116,10 @@ public class PipesServerTest extends TikaTest {
parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher);
assertEquals(2, parseData.metadataList.size());
- byte[] bytes0 = parseData.getEmbeddedDocumentByteStore().getDocument(0);
- byte[] bytes1 = parseData.getEmbeddedDocumentByteStore().getDocument(1);
+ byte[] bytes0 =
+ IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(0));
+ byte[] bytes1 =
+ IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(1));
assertContains("is to trigger mock on the embedded",
new String(bytes0, StandardCharsets.UTF_8));
@@ -127,4 +129,56 @@ public class PipesServerTest extends TikaTest {
assertEquals("fdaa937c96d1ed010b8d307ccddf9d11c3b48db732a8771eaafe99d59e076d0a",
parseData.metadataList.get(0).get("X-TIKA:digest:SHA-256"));
}
+
+ //Expects that, with maxEmbeddedBytesForExtraction set to 10 in TIKA-4207-limit-bytes.xml,
+ //exactly 10 bytes of embedded document 1 are stored.
+ @Test
+ public void testEmbeddedStreamEmitterLimitBytes() throws Exception {
+ //fresh temp dir instead of a developer-specific absolute path: the test runs on
+ //any machine and never deletes a directory that already existed
+ Path tmp = Files.createTempDirectory("tika-4207-limit-bytes");
+ Path tikaConfig = tmp.resolve("tika-config.xml");
+
+ String xml = IOUtils.toString(
+ PipesServerTest.class.getResourceAsStream("TIKA-4207-limit-bytes.xml"),
+ StandardCharsets.UTF_8);
+ xml = xml.replace("BASE_PATH", tmp.toAbsolutePath().toString());
+ Files.write(tikaConfig, xml.getBytes(StandardCharsets.UTF_8));
+
+ Files.copy(PipesServerTest.class.getResourceAsStream("/test-documents/basic_embedded.xml"),
+ tmp.resolve("mock.xml"));
+
+ PipesServer pipesServer = new PipesServer(tikaConfig,
+ new UnsynchronizedByteArrayInputStream(new byte[0]),
+ new PrintStream(new UnsynchronizedByteArrayOutputStream(), true,
+ StandardCharsets.UTF_8.name()),
+ -1, 30000, 30000);
+
+ pipesServer.initializeResources();
+ EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig =
+ new EmbeddedDocumentBytesConfig(true);
+ embeddedDocumentBytesConfig.setIncludeOriginal(true);
+
+ FetchEmitTuple fetchEmitTuple = new FetchEmitTuple("id",
+ new FetchKey("fs", "mock.xml"),
+ new EmitKey("", ""), new Metadata(),
+ HandlerConfig.DEFAULT_HANDLER_CONFIG, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT,
+ embeddedDocumentBytesConfig);
+ Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher();
+ PipesServer.MetadataListAndEmbeddedBytes
+ parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher);
+ assertEquals(2, parseData.metadataList.size());
+
+ byte[] bytes0 =
+ IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(0));
+ byte[] bytes1 =
+ IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(1));
+
+ assertContains("is to trigger mock on the embedded",
+ new String(bytes0, StandardCharsets.UTF_8));
+
+ assertEquals(10, bytes1.length);
+ assertEquals("fdaa937c96d1ed010b8d307ccddf9d11c3b48db732a8771eaafe99d59e076d0a",
+ parseData.metadataList.get(0).get("X-TIKA:digest:SHA-256"));
+ }
}
diff --git a/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml
new file mode 100644
index 000000000..610bad77b
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <autoDetectParserConfig>
+ <digesterFactory class="org.apache.tika.pipes.async.MockDigesterFactory">
+ <skipContainerDocument>false</skipContainerDocument>
+ </digesterFactory>
+ <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory">
+ <writeFileNameToContent>false</writeFileNameToContent>
+ <maxEmbeddedBytesForExtraction>10</maxEmbeddedBytesForExtraction>
+ </embeddedDocumentExtractorFactory>
+ </autoDetectParserConfig>
+ <fetchers>
+ <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher">
+ <name>fs</name>
+ <basePath>BASE_PATH</basePath>
+ </fetcher>
+ </fetchers>
+</properties>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
index c95547aee..bcd45460c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
@@ -150,7 +150,7 @@ public class OutlookPSTParserTest extends TikaTest {
List<Metadata> trackingMetadata = new ArrayList<>();
public EmbeddedTrackingExtrator(ParseContext context) {
- super(context);
+ super(context, 0);
}
@Override
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
index 08d18b6c1..8503e8bd8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
@@ -112,7 +112,7 @@ public class PDFRenderingTest extends TikaTest {
Map<Integer, byte[]> embedded = new HashMap<>();
public RenderCaptureExtractor(ParseContext context) {
- super(context);
+ super(context, 0);
}
public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata,
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
index 7f41e065c..110c3f7e8 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
@@ -16,20 +16,27 @@
*/
package org.apache.tika.server.standard;
+import static org.apache.tika.TikaTest.debug;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import java.io.ByteArrayInputStream;
+import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
+import java.nio.file.FileVisitResult;
+import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
+import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import jakarta.ws.rs.core.Response;
import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
@@ -49,6 +56,7 @@ import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.HandlerConfig;
import org.apache.tika.pipes.emitter.EmitKey;
+import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
import org.apache.tika.pipes.fetcher.FetchKey;
import org.apache.tika.pipes.fetcher.FetcherManager;
import org.apache.tika.sax.BasicContentHandlerFactory;
@@ -72,6 +80,7 @@ public class TikaPipesTest extends CXFTestBase {
private static Path TMP_WORKING_DIR;
private static Path TMP_OUTPUT_DIR;
private static Path TMP_OUTPUT_FILE;
+ private static Path TMP_BYTES_DIR;
private static Path TIKA_PIPES_LOG4j2_PATH;
private static Path TIKA_CONFIG_PATH;
private static String TIKA_CONFIG_XML;
@@ -81,6 +90,7 @@ public class TikaPipesTest extends CXFTestBase {
public static void setUpBeforeClass() throws Exception {
Path inputDir = TMP_WORKING_DIR.resolve("input");
TMP_OUTPUT_DIR = TMP_WORKING_DIR.resolve("output");
+ TMP_BYTES_DIR = TMP_WORKING_DIR.resolve("bytes");
TMP_OUTPUT_FILE = TMP_OUTPUT_DIR.resolve(TEST_RECURSIVE_DOC + ".json");
Files.createDirectories(inputDir);
@@ -103,7 +113,10 @@ public class TikaPipesTest extends CXFTestBase {
"<emitter class=\"org.apache.tika.pipes.emitter.fs.FileSystemEmitter\">" +
"<params>" + "<name>fse</name>" + "<basePath>" +
TMP_OUTPUT_DIR.toAbsolutePath() + "</basePath>" + "</params>" +
- "</emitter>" + "</emitters>" + "<pipes><params><tikaConfig>" +
+ "</emitter>" + "<emitter class=\"org.apache.tika.pipes.emitter.fs.FileSystemEmitter\">" +
+ "<params>" + "<name>bytes</name>" + "<basePath>" +
+ TMP_BYTES_DIR.toAbsolutePath() + "</basePath>" + "</params>" +
+ "</emitter>" +"</emitters>" + "<pipes><params><tikaConfig>" +
ProcessUtils.escapeCommandLine(
TIKA_CONFIG_PATH.toAbsolutePath().toString()) +
"</tikaConfig><numClients>10</numClients>" + "<forkedJvmArgs>" +
@@ -203,4 +216,86 @@ public class TikaPipesTest extends CXFTestBase {
assertContains("When in the Course",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
}
+
+ @Test
+ public void testBytes() throws Exception { //raw bytes go to the "bytes" emitter; checks their file names and sizes
+ EmbeddedDocumentBytesConfig config = new EmbeddedDocumentBytesConfig(true);
+ config.setEmitter("bytes");
+ config.setIncludeOriginal(true);
+ config.setEmbeddedIdPrefix("-");
+ config.setZeroPadNameLength(10);
+ config.setSuffixStrategy(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.EXISTING);
+ //fetch the docx through "fsf"; the metadata list is emitted through "fse"
+ FetchEmitTuple t =
+ new FetchEmitTuple("myId", new FetchKey("fsf", "test_recursive_embedded.docx"),
+ new EmitKey("fse", "test_recursive_embedded.docx"), new Metadata(),
+ new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+ HandlerConfig.PARSE_MODE.RMETA, -1, -1, false),
+ FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT, config);
+ StringWriter writer = new StringWriter();
+ JsonFetchEmitTuple.toJson(t, writer);
+ //post the serialized tuple to the pipes endpoint and expect HTTP 200
+ String getUrl = endPoint + PIPES_PATH;
+ Response response =
+ WebClient.create(getUrl).accept("application/json").post(writer.toString());
+ assertEquals(200, response.getStatus());
+ //read back the emitted metadata json, then compare emitted byte files to expectations
+ List<Metadata> metadataList = null;
+ try (Reader reader = Files.newBufferedReader(TMP_OUTPUT_FILE)) {
+ metadataList = JsonMetadataList.fromJson(reader);
+ }
+ assertEquals(12, metadataList.size());
+ assertContains("When in the Course",
+ metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT));
+ Map<String, Long> expected = loadExpected();
+ Map<String, Long> byteFileNames = getFileNames(TMP_BYTES_DIR);
+ assertEquals(expected, byteFileNames);
+ }
+
+ private Map<String, Long> loadExpected() { //expected emitted file name -> size in bytes
+ Map<String, Long> m = new HashMap<>();
+ m.put("test_recursive_embedded.docx-0000000009.txt", 8151L);
+ m.put("test_recursive_embedded.docx-0000000007.txt", 8L);
+ m.put("test_recursive_embedded.docx-0000000006.txt", 8L);
+ m.put("test_recursive_embedded.docx-0000000002.zip", 4827L);
+ m.put("test_recursive_embedded.docx-0000000001.emf", 4992L);
+ m.put("test_recursive_embedded.docx-0000000008.zip", 4048L);
+ m.put("test_recursive_embedded.docx-0000000004.txt", 8L);
+ m.put("test_recursive_embedded.docx-0000000000.docx", 27082L);
+ m.put("test_recursive_embedded.docx-0000000003.txt", 8L);
+ m.put("test_recursive_embedded.docx-0000000011.txt", 7L);
+ m.put("test_recursive_embedded.docx-0000000005.zip", 4492L);
+ m.put("test_recursive_embedded.docx-0000000010.zip", 163L);
+ return m;
+ }
+
+ //Walks p recursively and maps each visited file's name to its size in bytes.
+ //Entries that cannot be visited are skipped; same-named files overwrite each other.
+ private Map<String, Long> getFileNames(Path p) throws Exception {
+ final Map<String, Long> ret = new HashMap<>();
+ //walk the directory the caller passed in, not the TMP_BYTES_DIR constant
+ Files.walkFileTree(p, new FileVisitor<Path>() {
+ @Override
+ public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs)
+ throws IOException {
+ return FileVisitResult.CONTINUE;
+ }
+ @Override
+ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
+ throws IOException {
+ ret.put(file.getFileName().toString(), Files.size(file));
+ return FileVisitResult.CONTINUE;
+ }
+ @Override
+ public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
+ return FileVisitResult.CONTINUE;
+ }
+ @Override
+ public FileVisitResult postVisitDirectory(Path dir, IOException exc)
+ throws IOException {
+ return FileVisitResult.CONTINUE;
+ }
+ });
+ return ret;
+ }
}