You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/04/03 21:41:43 UTC
[tika] 01/01: TIKA-3083 -- add fuzzing module
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3083
in repository https://gitbox.apache.org/repos/asf/tika.git
commit bd56182c548b027883ceb69d29a91a6aae3c081a
Author: tallison <ta...@apache.org>
AuthorDate: Fri Apr 3 17:41:18 2020 -0400
TIKA-3083 -- add fuzzing module
---
pom.xml | 1 +
tika-fuzzing/pom.xml | 59 +
.../apache/tika/fuzzing/AutoDetectTransformer.java | 96 ++
.../java/org/apache/tika/fuzzing/Transformer.java | 41 +
.../java/org/apache/tika/fuzzing/cli/FuzzOne.java | 266 ++++
.../org/apache/tika/fuzzing/cli/FuzzingCLI.java | 240 ++++
.../apache/tika/fuzzing/cli/FuzzingCLIConfig.java | 160 +++
.../tika/fuzzing/exceptions/CantFuzzException.java | 25 +
.../apache/tika/fuzzing/general/ByteDeleter.java | 53 +
.../apache/tika/fuzzing/general/ByteFlipper.java | 67 +
.../apache/tika/fuzzing/general/ByteInjector.java | 76 ++
.../tika/fuzzing/general/GeneralTransformer.java | 95 ++
.../apache/tika/fuzzing/general/SpanSwapper.java | 84 ++
.../org/apache/tika/fuzzing/general/Truncator.java | 60 +
.../org/apache/tika/fuzzing/pdf/EvilCOSWriter.java | 1283 ++++++++++++++++++++
.../apache/tika/fuzzing/pdf/PDFTransformer.java | 52 +
.../tika/fuzzing/pdf/PDFTransformerConfig.java | 26 +
.../services/org.apache.tika.fuzzing.Transformer | 17 +
tika-fuzzing/src/main/resources/log4j.properties | 24 +
tika-fuzzing/src/test/java/TestFuzzingCLI.java | 67 +
tika-fuzzing/src/test/java/TestTransformer.java | 49 +
.../test/resources/test-documents/heavy_hang.xml | 25 +
.../test/resources/test-documents/null_pointer.xml | 25 +
.../test/resources/test-documents/system_exit.xml | 25 +
24 files changed, 2916 insertions(+)
diff --git a/pom.xml b/pom.xml
index 89ee2e2..486c789 100644
--- a/pom.xml
+++ b/pom.xml
@@ -44,6 +44,7 @@
<module>tika-batch</module>
<module>tika-app</module>
<module>tika-server</module>
+ <module>tika-fuzzing</module>
<module>tika-translate</module>
<module>tika-langdetect</module>
<module>tika-example</module>
diff --git a/tika-fuzzing/pom.xml b/tika-fuzzing/pom.xml
new file mode 100644
index 0000000..19c89ed
--- /dev/null
+++ b/tika-fuzzing/pom.xml
@@ -0,0 +1,59 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Maven build descriptor for the tika-fuzzing module: a command line tool that
+ mutates seed documents and feeds the mutated copies to the Tika parsers.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parent</artifactId>
+ <version>2.0.0-SNAPSHOT</version>
+ <relativePath>../tika-parent/pom.xml</relativePath>
+ </parent>
+
+ <artifactId>tika-fuzzing</artifactId>
+ <name>Apache Tika fuzzing</name>
+ <url>http://tika.apache.org/</url>
+
+ <modelVersion>4.0.0</modelVersion>
+
+
+ <dependencies>
+ <!-- command line parsing for FuzzingCLI and FuzzOne -->
+ <!-- NOTE(review): cli.version is presumably defined in tika-parent; confirm -->
+ <dependency>
+ <groupId>commons-cli</groupId>
+ <artifactId>commons-cli</artifactId>
+ <version>${cli.version}</version>
+ </dependency>
+ <!-- the parsers that the fuzzed files are run against -->
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <!-- logging -->
+ <!-- NOTE(review): no versions are given below; presumably managed by the parent pom; confirm -->
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>jcl-over-slf4j</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>jul-to-slf4j</artifactId>
+ </dependency>
+ <!-- test -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <!-- bring in the mock parser -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+</project>
\ No newline at end of file
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/AutoDetectTransformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/AutoDetectTransformer.java
new file mode 100644
index 0000000..f27f4a0
--- /dev/null
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/AutoDetectTransformer.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fuzzing;
+
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.fuzzing.general.GeneralTransformer;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * {@link Transformer} that detects the media type of the incoming stream and
+ * delegates to the transformer registered for that type or, failing that, for
+ * the closest registered supertype. If nothing is registered anywhere along the
+ * supertype chain, a {@link GeneralTransformer} is used.
+ */
+public class AutoDetectTransformer implements Transformer {
+
+    private static final ServiceLoader DEFAULT_LOADER =
+            new ServiceLoader(AutoDetectTransformer.class.getClassLoader());
+
+    TikaConfig config = TikaConfig.getDefaultConfig();
+    MediaTypeRegistry registry = config.getMediaTypeRegistry();
+    //take the detector from the config loaded above instead of asking
+    //TikaConfig for another default config
+    Detector detector = config.getDetector();
+
+    Transformer fallback = new GeneralTransformer();
+    Map<MediaType, Transformer> transformerMap = new HashMap<>();
+
+    /**
+     * Builds the transformer from every {@link Transformer} service provider
+     * visible to this class's class loader.
+     */
+    public AutoDetectTransformer() {
+        this(DEFAULT_LOADER.loadServiceProviders(org.apache.tika.fuzzing.Transformer.class));
+    }
+
+    /**
+     * @param transformers transformers to dispatch to; when two transformers
+     *                     declare the same media type, the later one in the
+     *                     list wins
+     */
+    public AutoDetectTransformer(List<Transformer> transformers) {
+        for (Transformer t : transformers) {
+            for (MediaType mediaType : t.getSupportedTypes()) {
+                transformerMap.put(mediaType, t);
+            }
+        }
+    }
+
+    /**
+     * @return read-only view of the media types that have a dedicated transformer
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes() {
+        //don't hand out the live key set: removing from it would silently
+        //unregister transformers
+        return java.util.Collections.unmodifiableSet(transformerMap.keySet());
+    }
+
+    /**
+     * Detects the media type of {@code is} and runs the matching transformer.
+     * The TikaInputStream wrapped around {@code is} is closed before this
+     * method returns.
+     *
+     * @param is original document
+     * @param os destination for the transformed document
+     * @throws IOException if reading, detecting or writing fails
+     * @throws TikaException if the selected transformer fails
+     */
+    @Override
+    public void transform(InputStream is, OutputStream os) throws IOException, TikaException {
+        try (TikaInputStream tis = TikaInputStream.get(is)) {
+            // Automatically detect the MIME type of the document
+            Metadata metadata = new Metadata();
+            MediaType type = detector.detect(tis, metadata);
+            Transformer transformer = getTransformer(type);
+            transformer.transform(tis, os);
+        }
+    }
+
+    /**
+     * Walks from the normalised type up through its supertypes and returns the
+     * first registered transformer, or the fallback if there is none.
+     */
+    private Transformer getTransformer(MediaType type) {
+        if (type == null) {
+            return fallback;
+        }
+        // We always work on the normalised, canonical form
+        type = registry.normalize(type);
+
+        while (type != null) {
+            // Try finding a transformer for the type
+            Transformer transformer = transformerMap.get(type);
+            if (transformer != null) {
+                return transformer;
+            }
+
+            // Failing that, try for the parent of the type
+            type = registry.getSupertype(type);
+        }
+        return fallback;
+    }
+}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/Transformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/Transformer.java
new file mode 100644
index 0000000..7e3d083
--- /dev/null
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/Transformer.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fuzzing;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Set;
+
+/**
+ * A fuzzing transformation: reads a document from an input stream and writes
+ * a modified variant of it to an output stream.
+ */
+public interface Transformer {
+
+ /**
+ * Returns the set of media types that this transformer declares it can
+ * handle.
+ *
+ * @since Apache Tika 1.24.1
+ * @return set of media types; callers should treat it as read-only
+ */
+ Set<MediaType> getSupportedTypes();
+
+
+ /**
+ * Reads the document from <code>is</code> and writes the transformed
+ * document to <code>os</code>.
+ *
+ * @param is stream to read the original document from
+ * @param os stream to write the transformed document to
+ * @throws IOException if reading or writing fails
+ * @throws TikaException if the document cannot be transformed
+ */
+ void transform(InputStream is, OutputStream os) throws IOException, TikaException;
+}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzOne.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzOne.java
new file mode 100644
index 0000000..faa1383
--- /dev/null
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzOne.java
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fuzzing.cli;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.fuzzing.AutoDetectTransformer;
+import org.apache.tika.fuzzing.Transformer;
+import org.apache.tika.fuzzing.exceptions.CantFuzzException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.utils.ExceptionUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+/**
+ * Child process that runs against a single input file.
+ * <p>
+ * For every iteration the seed file is transformed into a new target file that
+ * sits next to the output file base, and the target is then parsed under a
+ * timeout. The target is deleted again when the parse finishes normally or with
+ * a TikaException, SAXException or IOException. Any other Throwable leaves the
+ * target in place and writes a ".stacktrace" file beside it. A timeout writes an
+ * empty ".timeout" file beside the target and exits the JVM with status 1.
+ */
+public class FuzzOne {
+    private static final Logger LOG = LoggerFactory.getLogger(FuzzOne.class);
+
+    static Options OPTIONS;
+    static {
+        //every option is required: FuzzingCLI passes all of them when it
+        //spawns this class
+        OPTIONS = new Options()
+                .addOption(Option.builder("i")
+                        .longOpt("inputFile")
+                        .desc("input seed file")
+                        .hasArg(true)
+                        .required(true)
+                        .build())
+                .addOption(Option.builder("o")
+                        .longOpt("outputFile")
+                        .desc("output file base")
+                        .hasArg(true)
+                        .required(true)
+                        .build())
+                .addOption(Option.builder("m")
+                        .longOpt("timeoutMs")
+                        .desc("timeout in ms -- max time allowed to parse a file")
+                        .hasArg(true)
+                        .required(true)
+                        .build())
+                .addOption(Option.builder("n")
+                        .desc("thread id (thread number)")
+                        .hasArg(true)
+                        .required(true)
+                        .build())
+                .addOption(Option.builder("p")
+                        .longOpt("perFile")
+                        .desc("number of iterations to run per seed file")
+                        .hasArg(true)
+                        .required(true)
+                        .build())
+                .addOption(Option.builder("t")
+                        .longOpt("maxTransformers")
+                        .desc("maximum number of transformers to run per iteration")
+                        .hasArg(true)
+                        .required(true)
+                        .build())
+                .addOption(Option.builder("r")
+                        .longOpt("retryId")
+                        .desc("which retry is this")
+                        .hasArg(true)
+                        .required(true)
+                        .build());
+    }
+    Parser parser = new AutoDetectParser();
+
+    public static void main(String[] args) throws Exception {
+        FuzzOneConfig config = FuzzOneConfig.parse(args);
+        FuzzOne fuzzOne = new FuzzOne();
+        fuzzOne.execute(config);
+    }
+
+    /**
+     * Runs {@code perFileIterations} fuzz iterations against the input file.
+     * A {@link CantFuzzException} stops the remaining iterations; any other
+     * IOException or TikaException is logged and the loop moves on.
+     */
+    private void execute(FuzzOneConfig config) {
+        Path src = config.inputFile;
+        Path targetBase = config.outputFileBase;
+        AutoDetectTransformer transformer = new AutoDetectTransformer();
+        for (int i = 0; i < config.perFileIterations; i++) {
+            try {
+                String ext = "-" + config.threadNum + "-" + config.retryNum + "-" + i;
+                fuzz(ext, src, targetBase, transformer, config.timeoutMs);
+            } catch (IOException e) {
+                LOG.warn("problem transforming file", e);
+            } catch (CantFuzzException e) {
+                LOG.warn("can't fuzz this file " + src, e);
+                return;
+            } catch (TikaException e) {
+                //goes through the logger, like the other failures, not stderr
+                LOG.warn("problem transforming file " + src, e);
+            }
+        }
+    }
+
+    /**
+     * Transforms {@code src} into {@code targetFileBase + ext} and parses the
+     * result, waiting at most {@code timeoutMs} for the parse.
+     *
+     * @throws IOException if the transformation fails on I/O
+     * @throws TikaException if the transformer rejects the file
+     */
+    private void fuzz(String ext, Path src, Path targetFileBase,
+                      Transformer transformer, long timeoutMs) throws IOException, TikaException {
+
+        //resolveSibling also copes with a base path that has no parent
+        Path target = targetFileBase.resolveSibling(
+                targetFileBase.getFileName().toString() + ext);
+
+        try {
+            transformFile(transformer, src, target);
+        } catch (Throwable t) {
+            LOG.warn("failed to transform: " + src.toString());
+            //the target may never have been created; a failing cleanup must
+            //not hide the exception that brought us here
+            try {
+                Files.deleteIfExists(target);
+            } catch (IOException e) {
+                t.addSuppressed(e);
+            }
+            throw t;
+        }
+        ExecutorService executor = Executors.newSingleThreadExecutor();
+        Future<Integer> future = executor.submit(new ParseTask(target));
+
+        try {
+            int result = future.get(timeoutMs, TimeUnit.MILLISECONDS);
+            if (result == 1 && Files.exists(target)) {
+                LOG.warn("failed to delete target: " + target);
+            }
+        } catch (TimeoutException e) {
+            LOG.warn("timeout exception:" + target);
+            future.cancel(true);
+            writeErrFile(target, ".timeout");
+            System.exit(1);
+        } catch (InterruptedException | ExecutionException e) {
+            LOG.warn("problem parsing " + target, e);
+            System.exit(1);
+        } finally {
+            executor.shutdownNow();
+        }
+    }
+
+    /** Writes an empty marker file named after the target plus {@code ext}. */
+    private void writeErrFile(Path target, String ext) {
+        try {
+            Path err = target.resolveSibling(target.getFileName().toString() + ext);
+            Files.write(err, new byte[0]);
+        } catch (IOException e) {
+            LOG.warn("things aren't going right today.", e);
+        }
+    }
+
+    /** Writes the stack trace of {@code t} to "&lt;target&gt;.stacktrace". */
+    private void handleThrowable(Path target, Throwable t) {
+        try {
+            Path errMsg = target.resolveSibling(target.getFileName().toString() + ".stacktrace");
+            Files.write(errMsg, ExceptionUtils.getStackTrace(t).getBytes(StandardCharsets.UTF_8));
+        } catch (IOException e) {
+            //report why the write failed, and keep the original problem in the log
+            LOG.warn("things aren't going right today.", e);
+            LOG.warn("couldn't record stacktrace for " + target, t);
+        }
+    }
+
+    private void transformFile(Transformer transformer, Path src, Path target) throws IOException, TikaException {
+        try (InputStream is = Files.newInputStream(src);
+             OutputStream os = Files.newOutputStream(target)) {
+            transformer.transform(is, os);
+        }
+    }
+
+    /** Command line settings for one child run; every option is mandatory. */
+    private static class FuzzOneConfig {
+        static FuzzOneConfig parse(String[] args) throws ParseException {
+            CommandLineParser parser = new DefaultParser();
+            CommandLine commandLine = parser.parse(OPTIONS, args);
+            FuzzOneConfig config = new FuzzOneConfig();
+            config.inputFile = Paths.get(commandLine.getOptionValue("i"));
+            config.outputFileBase = Paths.get(commandLine.getOptionValue("o"));
+            config.perFileIterations = Integer.parseInt(commandLine.getOptionValue("p"));
+            config.maxTransformers = Integer.parseInt(commandLine.getOptionValue("t"));
+            config.threadNum = Integer.parseInt(commandLine.getOptionValue("n"));
+            config.retryNum = Integer.parseInt(commandLine.getOptionValue("r"));
+            //the field is a long, so parse it as one
+            config.timeoutMs = Long.parseLong(commandLine.getOptionValue("m"));
+            return config;
+        }
+
+        private Path inputFile;
+        private Path outputFileBase;
+        int perFileIterations;
+        //parsed but not read anywhere in this class
+        int maxTransformers;
+        int threadNum;
+        int retryNum;
+        long timeoutMs;
+
+    }
+
+    /** Parses one fuzzed target file and cleans it up if nothing unexpected happened. */
+    private class ParseTask implements Callable<Integer> {
+        private final Path target;
+        public ParseTask(Path target) {
+            this.target = target;
+        }
+
+        /**
+         * @return 1 if the parse finished normally or with a TikaException,
+         *         SAXException or IOException (the target is then deleted);
+         *         0 if any other Throwable was thrown (the target is kept and
+         *         a stack trace file is written)
+         * @throws Exception declared by Callable
+         */
+        @Override
+        public Integer call() throws Exception {
+            boolean success = false;
+            try (InputStream is = Files.newInputStream(target)) {
+                LOG.debug("parsing " + target);
+                parser.parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
+                success = true;
+            } catch (TikaException | SAXException | IOException e) {
+                //these are not treated as findings; note that this includes a
+                //TikaException whose cause is a RuntimeException
+                success = true;
+            } catch (Throwable t) {
+                handleThrowable(target, t);
+            } finally {
+                if (success) {
+                    try {
+                        Files.delete(target);
+                    } catch (IOException e) {
+                        LOG.warn("couldn't delete: " + target.toAbsolutePath());
+                    }
+                } else {
+                    LOG.info("FOUND PROBLEM: " + target);
+                }
+            }
+            return success ? 1 : 0;
+        }
+    }
+}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java
new file mode 100644
index 0000000..3857a9a
--- /dev/null
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fuzzing.cli;
+
+import org.apache.tika.utils.ProcessUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.FileVisitResult;
+import java.nio.file.FileVisitor;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Parent process of the fuzzer. One task walks the input directory and queues
+ * every file it finds; {@code numThreads} worker tasks take files off the queue
+ * and spawn a {@link FuzzOne} child JVM per file, retrying a file when the
+ * child does not finish in time or exits with a non-zero status.
+ */
+public class FuzzingCLI {
+    private static final Logger LOG = LoggerFactory.getLogger(FuzzingCLI.class);
+
+    //sentinel that tells a worker there are no more files
+    private static final Path POISON = Paths.get("");
+
+    //-1 means "no limit"; nothing in this class sets another value
+    private int maxFiles = -1;
+
+    public static void main (String[] args) throws Exception {
+        FuzzingCLIConfig config = FuzzingCLIConfig.parse(args);
+        if (config.getMaxTransformers() == 0) {
+            LOG.warn("max transformers == 0!");
+        }
+        if (! Files.isDirectory(config.getInputDirectory())) {
+            throw new IllegalArgumentException("input directory doesn't exist: " + config.getInputDirectory());
+        }
+        FuzzingCLI fuzzingCLI = new FuzzingCLI();
+        Files.createDirectories(config.getOutputDirectory());
+        fuzzingCLI.execute(config);
+    }
+
+    /**
+     * Starts the file crawler and the workers and waits until all of them have
+     * finished, or until one of them fails or this thread is interrupted.
+     */
+    private void execute(FuzzingCLIConfig config) {
+        int numThreads = config.getNumThreads();
+        ArrayBlockingQueue<Path> q = new ArrayBlockingQueue<>(10000);
+        //one extra thread for the file crawler
+        ExecutorService executorService = Executors.newFixedThreadPool(numThreads + 1);
+        ExecutorCompletionService<Integer> executorCompletionService =
+                new ExecutorCompletionService<>(executorService);
+        executorCompletionService.submit(new FileAdder(config.getInputDirectory(), numThreads, q));
+        for (int i = 0; i < numThreads; i++) {
+            executorCompletionService.submit(new Fuzzer(q, config));
+        }
+        int finished = 0;
+        try {
+            while (finished < numThreads + 1) {
+                Future<Integer> future = executorCompletionService.poll(1, TimeUnit.SECONDS);
+                if (future != null) {
+                    future.get();
+                    finished++;
+                }
+            }
+        } catch (InterruptedException e) {
+            //restore the flag so that callers can see the interruption
+            Thread.currentThread().interrupt();
+            LOG.warn("interrupted while waiting for the fuzzers to finish", e);
+        } catch (ExecutionException e) {
+            LOG.warn("a task failed; shutting down", e);
+        } finally {
+            executorService.shutdownNow();
+        }
+    }
+
+    /** Worker: takes seed files off the queue until it sees the poison path. */
+    private static class Fuzzer implements Callable<Integer> {
+        private static final AtomicInteger COUNTER = new AtomicInteger();
+        private final int threadId = COUNTER.getAndIncrement();
+        private final ArrayBlockingQueue<Path> q;
+        private final FuzzingCLIConfig config;
+        public Fuzzer(ArrayBlockingQueue<Path> q, FuzzingCLIConfig config) {
+            this.q = q;
+            this.config = config;
+        }
+
+        @Override
+        public Integer call() throws Exception {
+            while (true) {
+                Path p = q.take();
+                if (p.equals(POISON)) {
+                    LOG.debug("Thread "+threadId + " stopping");
+                    return 1;
+                }
+                boolean success = false;
+                int tries = 0;
+                //don't spawn further children once this thread has been interrupted
+                while (! success && tries < config.getRetries()
+                        && ! Thread.currentThread().isInterrupted()) {
+                    if (tries > 0) {
+                        LOG.warn("Retrying ("+tries+") "+p);
+                    }
+                    success = fuzzIt(config, p, tries);
+                    tries++;
+                }
+            }
+        }
+
+        /**
+         * Spawns one FuzzOne child JVM for {@code p} and waits for it.
+         *
+         * @return true only if the child finished within the wait budget and
+         *         exited with status 0
+         */
+        private boolean fuzzIt(FuzzingCLIConfig config, Path p, int retryId) {
+            //the target files should be flattened so that
+            //problematic files are all in one directory...may rethink this option later
+            Path target = config.getOutputDirectory().resolve(
+                    p.getFileName());
+            String cp = System.getProperty("java.class.path");
+
+            String[] args = new String[] {
+                    "java",
+                    "-ea",
+                    "-cp",
+                    ProcessUtils.escapeCommandLine(cp),
+                    "org.apache.tika.fuzzing.cli.FuzzOne",
+                    "-i",
+                    ProcessUtils.escapeCommandLine(p.toAbsolutePath().toString()),
+                    "-o",
+                    ProcessUtils.escapeCommandLine(target.toAbsolutePath().toString()),
+                    "-p",
+                    Integer.toString(config.getPerFileIterations()),
+                    "-t",
+                    Integer.toString(config.getMaxTransformers()),
+                    "-n",
+                    Integer.toString(threadId),
+                    "-r",
+                    Integer.toString(retryId),
+                    "-m",
+                    Long.toString(config.getTimeoutMs())
+            };
+            ProcessBuilder pb = new ProcessBuilder(args);
+            pb.inheritIO();
+            Process process;
+            try {
+                process = pb.start();
+            } catch (IOException e) {
+                //without a process there is nothing to wait for or to clean up
+                LOG.warn("problem starting process", e);
+                return false;
+            }
+            boolean success = false;
+            try {
+                //NOTE(review): the child runs perFileIterations parses, each allowed
+                //timeoutMs, but this budget only adds perFileIterations as
+                //milliseconds -- confirm whether a multiplication was intended
+                long totalTime = 2*config.getTimeoutMs()+config.getPerFileIterations();
+                success = process.waitFor(totalTime, TimeUnit.MILLISECONDS);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                LOG.warn("problem waiting for process to finish", e);
+            } finally {
+                if (process.isAlive()) {
+                    LOG.warn("process still alive for " + target.toAbsolutePath());
+                    process.destroyForcibly();
+                }
+                try {
+                    int exitValue = process.exitValue();
+                    if (exitValue != 0) {
+                        success = false;
+                        LOG.warn("bad exit value for " + target.toAbsolutePath());
+                    }
+                } catch (IllegalThreadStateException e) {
+                    success = false;
+                    LOG.warn("not exited");
+                    process.destroyForcibly();
+                }
+            }
+            return success;
+        }
+
+    }
+
+    /** Walks the input directory, queues every file, then queues one poison per worker. */
+    private class FileAdder implements Callable<Integer> {
+        private final Path inputDir;
+        private final int numThreads;
+        private final ArrayBlockingQueue<Path> queue;
+        private int added = 0;
+        public FileAdder(Path inputDirectory, int numThreads, ArrayBlockingQueue<Path> queue) {
+            this.inputDir = inputDirectory;
+            this.numThreads = numThreads;
+            this.queue = queue;
+        }
+
+        @Override
+        public Integer call() throws Exception {
+            Files.walkFileTree(inputDir, new DirWalker());
+            for (int i = 0; i < numThreads; i++) {
+                //put() waits for room; add() would throw IllegalStateException
+                //whenever the workers haven't drained the bounded queue yet
+                queue.put(POISON);
+            }
+            return 1;
+        }
+
+        private class DirWalker implements FileVisitor<Path> {
+
+            @Override
+            public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
+                return FileVisitResult.CONTINUE;
+            }
+
+            @Override
+            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
+                if (maxFiles > -1 && added >= maxFiles) {
+                    LOG.info("hit maxfiles; file crawler is stopping early");
+                    return FileVisitResult.TERMINATE;
+                }
+
+                try {
+                    boolean offered = queue.offer(file, 10, TimeUnit.MINUTES);
+                    if (offered) {
+                        added++;
+                        return FileVisitResult.CONTINUE;
+                    } else {
+                        LOG.error("couldn't add a file after 10 minutes!");
+                        return FileVisitResult.TERMINATE;
+                    }
+                } catch (InterruptedException e) {
+                    Thread.currentThread().interrupt();
+                    LOG.warn("interrupted while queueing " + file, e);
+                    return FileVisitResult.TERMINATE;
+                }
+            }
+
+            @Override
+            public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
+                //unreadable entries are skipped
+                return FileVisitResult.CONTINUE;
+            }
+
+            @Override
+            public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
+                return FileVisitResult.CONTINUE;
+            }
+        }
+    }
+}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java
new file mode 100644
index 0000000..324b934
--- /dev/null
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fuzzing.cli;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+/**
+ * Command line configuration for {@link FuzzingCLI}. Only the input and output
+ * directories are required; every other option falls back to a default.
+ */
+public class FuzzingCLIConfig {
+
+    private static final int DEFAULT_NUM_THREADS = 4;
+    private static final int DEFAULT_NUM_ITERATIONS = 1000;
+    //allow all transformers to operate
+    private static final int DEFAULT_MAX_TRANSFORMERS = -1;
+
+    private static final long DEFAULT_TIMEOUT_MS = 120000;
+
+    private static final int DEFAULT_RETRIES = 2;
+
+    static Options OPTIONS;
+    static {
+        OPTIONS = new Options()
+                .addOption(Option.builder("i")
+                        .longOpt("inputDir")
+                        .desc("input directory for seed files")
+                        .hasArg(true)
+                        .required(true)
+                        .build())
+                .addOption(Option.builder("o")
+                        .longOpt("outputDir")
+                        .desc("output directory for files that triggered problems")
+                        .hasArg(true)
+                        .required(true)
+                        .build())
+                .addOption(Option.builder("n")
+                        .longOpt("numThreads")
+                        .desc("number of threads")
+                        .hasArg(true)
+                        .required(false)
+                        .build())
+                .addOption(Option.builder("p")
+                        .longOpt("perFile")
+                        .desc("number of iterations to run per seed file")
+                        .hasArg(true)
+                        .required(false)
+                        .build())
+                .addOption(Option.builder("t")
+                        .longOpt("maxTransformers")
+                        .desc("maximum number of transformers to run per iteration")
+                        .hasArg(true)
+                        .required(false)
+                        .build())
+                .addOption(Option.builder("m")
+                        .longOpt("timeoutMs")
+                        .desc("timeout in ms -- max time allowed to parse a file")
+                        .hasArg(true)
+                        .required(false)
+                        .build())
+                .addOption(Option.builder("r")
+                        .longOpt("retries")
+                        .desc("number of times to retry a seed file if there's a catastrophic failure")
+                        .hasArg(true)
+                        .required(false)
+                        .build());
+
+    }
+
+    /**
+     * Builds a configuration from the command line.
+     *
+     * @param args raw command line arguments
+     * @return the populated configuration
+     * @throws ParseException if a required option is missing or the command
+     *                        line is otherwise malformed
+     * @throws NumberFormatException if a numeric option is not a number
+     */
+    public static FuzzingCLIConfig parse(String[] args) throws ParseException {
+        CommandLineParser parser = new DefaultParser();
+        CommandLine commandLine = parser.parse(OPTIONS, args);
+        FuzzingCLIConfig config = new FuzzingCLIConfig();
+        config.inputDir = Paths.get(commandLine.getOptionValue("i"));
+        config.outputDir = Paths.get(commandLine.getOptionValue("o"));
+        config.numThreads = intOption(commandLine, "n", DEFAULT_NUM_THREADS);
+        config.perFileIterations = intOption(commandLine, "p", DEFAULT_NUM_ITERATIONS);
+        config.maxTransformers = intOption(commandLine, "t", DEFAULT_MAX_TRANSFORMERS);
+        //the timeout is a long, so parse it as one
+        config.timeoutMS = commandLine.hasOption("m") ?
+                Long.parseLong(commandLine.getOptionValue("m")) :
+                DEFAULT_TIMEOUT_MS;
+        config.retries = intOption(commandLine, "r", DEFAULT_RETRIES);
+        return config;
+    }
+
+    /** Returns the int value of {@code opt}, or {@code defaultValue} if the option is absent. */
+    private static int intOption(CommandLine commandLine, String opt, int defaultValue) {
+        if (! commandLine.hasOption(opt)) {
+            return defaultValue;
+        }
+        return Integer.parseInt(commandLine.getOptionValue(opt));
+    }
+
+
+    int numThreads;
+    //number of variants tried per file
+    int perFileIterations;
+    //maxTransformers per file
+    int maxTransformers;
+
+    //max time allowed to process each file in milliseconds
+    long timeoutMS;
+
+    //times to retry a seed file after a catastrophic failure
+    int retries;
+    Path inputDir;
+    Path outputDir;
+
+
+    public int getNumThreads() {
+        return numThreads;
+    }
+
+    public Path getInputDirectory() {
+        return inputDir;
+    }
+
+    public Path getOutputDirectory() {
+        return outputDir;
+    }
+
+    public int getMaxTransformers() {
+        return maxTransformers;
+    }
+
+    public long getTimeoutMs() {
+        return timeoutMS;
+    }
+
+    public int getPerFileIterations() {
+        return perFileIterations;
+    }
+
+    public int getRetries() {
+        return retries;
+    }
+}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/exceptions/CantFuzzException.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/exceptions/CantFuzzException.java
new file mode 100644
index 0000000..3540822
--- /dev/null
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/exceptions/CantFuzzException.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fuzzing.exceptions;
+
+import org.apache.tika.exception.TikaException;
+
+/**
+ * Thrown when a file cannot be fuzzed. FuzzOne stops iterating over a seed
+ * file as soon as it catches this exception.
+ */
+public class CantFuzzException extends TikaException {
+    /**
+     * @param msg description of why the file cannot be fuzzed
+     */
+    public CantFuzzException(String msg) {
+        super(msg);
+    }
+
+    /**
+     * @param msg description of why the file cannot be fuzzed
+     * @param cause underlying failure, kept so that its stack trace is not lost
+     */
+    public CantFuzzException(String msg, Throwable cause) {
+        super(msg, cause);
+    }
+}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteDeleter.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteDeleter.java
new file mode 100644
index 0000000..ff26f7f
--- /dev/null
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteDeleter.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fuzzing.general;
+
+import org.apache.tika.fuzzing.Transformer;
+import org.apache.tika.mime.MediaType;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Collections;
+import java.util.Random;
+import java.util.Set;
+
+/**
+ * {@link Transformer} that copies the input to the output one byte at a time,
+ * randomly leaving bytes out along the way.
+ */
+public class ByteDeleter implements Transformer {
+    Random random = new Random();
+    //threshold compared against a draw from [0,1): draws below it drop the byte
+    float percentDeleted = 0.01f;
+
+    static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM);
+
+    /**
+     * @return a single-element set holding {@link MediaType#OCTET_STREAM}
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes() {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Streams {@code is} to {@code os}, making one random draw per input byte
+     * and writing the byte only when the draw is at least {@code percentDeleted}.
+     *
+     * @param is source of the bytes to copy
+     * @param os destination of the surviving bytes
+     * @throws IOException if reading or writing fails
+     */
+    @Override
+    public void transform(InputStream is, OutputStream os) throws IOException {
+        for (int current = is.read(); current != -1; current = is.read()) {
+            if (random.nextFloat() >= percentDeleted) {
+                os.write(current);
+            }
+        }
+    }
+}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteFlipper.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteFlipper.java
new file mode 100644
index 0000000..74e9b5f
--- /dev/null
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteFlipper.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fuzzing.general;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.fuzzing.Transformer;
+import org.apache.tika.mime.MediaType;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Collections;
+import java.util.Random;
+import java.util.Set;
+
+/**
+ * {@link Transformer} that overwrites randomly chosen bytes of the input with
+ * different values. At least one byte of every non-empty input is changed.
+ */
+public class ByteFlipper implements Transformer {
+
+    //TODO add something about protecting first x bytes?
+    private Random random = new Random();
+    //threshold compared against a draw from [0,1): draws at or below it corrupt the byte
+    private float percentCorrupt = 0.01f;
+
+    static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM);
+
+    /**
+     * @return a single-element set holding {@link MediaType#OCTET_STREAM}
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes() {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Copies {@code is} to {@code os}, replacing randomly selected bytes.
+     * Empty input produces no output.
+     *
+     * @param is source of the bytes to corrupt
+     * @param os destination of the corrupted copy
+     * @throws IOException if reading or writing fails
+     */
+    @Override
+    public void transform(InputStream is, OutputStream os) throws IOException {
+        //TODO -- don't load the full thing into memory
+        byte[] input = IOUtils.toByteArray(is);
+        if (input.length == 0) {
+            return;
+        }
+        //make sure that there's at least one change, even in short files
+        int atLeastOneIndex = random.nextInt(input.length);
+
+        for (int i = 0; i < input.length; i++) {
+            if (random.nextFloat() <= percentCorrupt || i == atLeastOneIndex) {
+                os.write(differentByte(input[i]));
+            } else {
+                os.write(input[i]);
+            }
+        }
+    }
+
+    /**
+     * Returns a random byte that is guaranteed to differ from {@code original}.
+     */
+    private byte differentByte(byte original) {
+        //a mask in 1..255 always changes at least one of the low eight bits,
+        //so a "corrupted" byte can never come out equal to the original
+        return (byte) (original ^ (1 + random.nextInt(255)));
+    }
+
+    /**
+     * @param percentCorrupt new per-byte corruption threshold, compared against
+     *                       draws from [0,1)
+     */
+    public void setPercentCorrupt(float percentCorrupt) {
+        //"this." is required: the parameter shadows the field
+        this.percentCorrupt = percentCorrupt;
+    }
+}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteInjector.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteInjector.java
new file mode 100644
index 0000000..2dbfec8
--- /dev/null
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteInjector.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fuzzing.general;
+
+import org.apache.tika.fuzzing.Transformer;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.mime.MediaType;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Random;
+import java.util.Set;
+
+/**
+ * {@link Transformer} that copies the input and inserts spans of random bytes
+ * after randomly chosen positions.
+ */
+public class ByteInjector implements Transformer {
+    Random random = new Random();
+    //fraction of the input length used to derive the number of injections
+    float injectionFrequency = 0.01f;
+    //upper bound (inclusive) on the length of one injected span
+    int maxSpan = 100;
+    static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM);
+
+    /**
+     * @return a single-element set holding {@link MediaType#OCTET_STREAM}
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes() {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Writes the input to {@code os} with at least one span of random bytes
+     * inserted. Empty input produces no output.
+     *
+     * @param is source of the bytes to copy
+     * @param os destination of the copy plus the injected spans
+     * @throws IOException if reading or writing fails
+     */
+    @Override
+    public void transform(InputStream is, OutputStream os) throws IOException {
+        //TODO -- don't load the full thing into memory
+        byte[] input = IOUtils.toByteArray(is);
+        if (input.length == 0) {
+            //there is no position to inject after
+            return;
+        }
+        //at least one injection
+        int numInjections = Math.max(1,
+                (int) Math.floor((double) injectionFrequency * (double) input.length));
+        int[] starts = new int[numInjections];
+        for (int i = 0; i < numInjections; i++) {
+            //any byte, including the last one, may be followed by an injection
+            starts[i] = random.nextInt(input.length);
+        }
+        Arrays.sort(starts);
+        int startIndex = 0;
+
+        for (int i = 0; i < input.length; i++) {
+            os.write(input[i]);
+            //"while", not "if": the same position can be drawn more than once, and
+            //stopping at the first duplicate would skip every later injection
+            while (startIndex < starts.length && starts[startIndex] == i) {
+                inject(os);
+                startIndex++;
+            }
+        }
+    }
+
+    /**
+     * Writes between 1 and {@code maxSpan} random bytes to {@code os}.
+     */
+    private void inject(OutputStream os) throws IOException {
+        //1 + ... so that an injection is never empty
+        int len = 1 + random.nextInt(maxSpan);
+        byte[] randBytes = new byte[len];
+        random.nextBytes(randBytes);
+        os.write(randBytes);
+    }
+}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java
new file mode 100644
index 0000000..803784e
--- /dev/null
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fuzzing.general;
+
+import org.apache.commons.compress.utils.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.fuzzing.Transformer;
+import org.apache.tika.mime.MediaType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+/**
+ * {@link Transformer} that applies a random sequence of other transformers,
+ * feeding the output of each one into the next.
+ */
+public class GeneralTransformer implements Transformer {
+
+    private static final Logger LOG = LoggerFactory.getLogger(GeneralTransformer.class);
+
+    Random random = new Random();
+
+    //upper bound on the number of transformations applied per call
+    private final int maxTransforms;
+    private final Transformer[] transformers;
+    private final Set<MediaType> supportedTypes;
+
+    /**
+     * Uses {@link ByteDeleter}, {@link ByteFlipper}, {@link ByteInjector},
+     * {@link Truncator} and {@link SpanSwapper}.
+     */
+    public GeneralTransformer() {
+        this(new ByteDeleter(), new ByteFlipper(),
+                new ByteInjector(), new Truncator(), new SpanSwapper());
+    }
+
+    /**
+     * @param transformers candidates to choose from; their count is also used
+     *                     as the maximum number of transformations per call
+     */
+    public GeneralTransformer(Transformer ... transformers) {
+        this(transformers.length, transformers);
+    }
+
+    /**
+     * @param maxTransforms maximum number of transformations per call; a negative
+     *                      value means {@code transformers.length}, and 0 turns
+     *                      {@link #transform} into a plain copy
+     * @param transformers candidates to choose from
+     */
+    public GeneralTransformer(int maxTransforms, Transformer ... transformers) {
+        this.maxTransforms = (maxTransforms < 0) ? transformers.length : maxTransforms;
+        this.transformers = transformers;
+        Set<MediaType> tmpTypes = new HashSet<>();
+        for (Transformer transformer : transformers) {
+            tmpTypes.addAll(transformer.getSupportedTypes());
+        }
+        supportedTypes = Collections.unmodifiableSet(tmpTypes);
+    }
+
+    /**
+     * @return unmodifiable union of the types supported by the wrapped transformers
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes() {
+        return supportedTypes;
+    }
+
+    /**
+     * Applies between 1 and {@code maxTransforms} randomly chosen transformers
+     * in sequence and writes the final result to {@code os}. The input is
+     * copied through unchanged when {@code maxTransforms} is 0 or when there
+     * are no transformers.
+     *
+     * @param is source of the bytes to fuzz
+     * @param os destination of the fuzzed bytes
+     * @throws IOException if reading or writing fails
+     * @throws TikaException if one of the chosen transformers fails
+     */
+    @Override
+    public void transform(InputStream is, OutputStream os) throws IOException, TikaException {
+        //TODO -- make this actually streaming
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        IOUtils.copy(is, bos);
+        byte[] bytes = bos.toByteArray();
+
+        //maxTransforms == 0 is used for debugging: nothing is applied
+        if (maxTransforms > 0 && transformers.length > 0) {
+            int transformerCount = 1 + random.nextInt(maxTransforms);
+            for (int i = 0; i < transformerCount; i++) {
+                //choose among ALL transformers; bounding the index by transformerCount
+                //favours the first ones and overflows when maxTransforms > transformers.length
+                Transformer transformer = transformers[random.nextInt(transformers.length)];
+                ByteArrayOutputStream transformed = new ByteArrayOutputStream();
+                transformer.transform(new ByteArrayInputStream(bytes), transformed);
+                if (transformed.size() == 0) {
+                    LOG.warn("zero length: {}", transformer);
+                }
+                bytes = transformed.toByteArray();
+            }
+        }
+        os.write(bytes);
+    }
+}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/SpanSwapper.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/SpanSwapper.java
new file mode 100644
index 0000000..e2bc16c
--- /dev/null
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/SpanSwapper.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fuzzing.general;
+
+import org.apache.tika.fuzzing.Transformer;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.mime.MediaType;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Collections;
+import java.util.Random;
+import java.util.Set;
+
+/**
+ * randomly swaps spans from the input
+ *
+ */
+public class SpanSwapper implements Transformer {
+
+    Random random = new Random();
+    //fraction of the input length used to derive the number of swaps
+    private float swapProbability = 0.01f;
+    //exclusive upper bound on the length of one swapped span
+    int maxSpanLength = 10000;
+
+    static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM);
+
+    /**
+     * @return a single-element set holding {@link MediaType#OCTET_STREAM}
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes() {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the whole input, performs at least one span swap on a copy of it
+     * and writes the result to {@code os}.
+     *
+     * @param is source of the bytes to shuffle
+     * @param os destination of the shuffled bytes
+     * @throws IOException if reading or writing fails
+     */
+    @Override
+    public void transform(InputStream is, OutputStream os) throws IOException {
+        byte[] original = IOUtils.toByteArray(is);
+        int swapCount = (int) Math.floor(swapProbability * original.length);
+        if (swapCount == 0) {
+            //at least one swap
+            swapCount = 1;
+        }
+        byte[] working = original.clone();
+        for (int done = 0; done < swapCount; done++) {
+            working = swap(working);
+        }
+        os.write(working);
+    }
+
+    /**
+     * Exchanges two equally long, randomly placed spans of {@code data} in place
+     * and returns the array. The spans may overlap, and the span length may be 0.
+     */
+    private byte[] swap(byte[] data) {
+        if (data.length == 0) {
+            return new byte[0];
+        }
+        int srcStart = random.nextInt(data.length);
+        int targStart = random.nextInt(data.length);
+        int len = random.nextInt(maxSpanLength);
+
+        //clip the span so that neither copy runs past the end of the array
+        int laterStart = Math.max(srcStart, targStart);
+        if (len + laterStart >= data.length) {
+            len = data.length - laterStart;
+        }
+
+        //save the target span before it is overwritten
+        byte[] savedTarget = new byte[len];
+        System.arraycopy(data, targStart, savedTarget, 0, len);
+        //source span goes onto the target position
+        System.arraycopy(data, srcStart, data, targStart, len);
+        //saved target span goes onto the source position
+        System.arraycopy(savedTarget, 0, data, srcStart, len);
+        return data;
+    }
+
+}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/Truncator.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/Truncator.java
new file mode 100644
index 0000000..209810c
--- /dev/null
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/Truncator.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fuzzing.general;
+
+import org.apache.tika.fuzzing.Transformer;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.mime.MediaType;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Collections;
+import java.util.Random;
+import java.util.Set;
+
+/**
+ * {@link Transformer} that writes only a randomly sized prefix of the input.
+ */
+public class Truncator implements Transformer {
+
+    Random random = new Random();
+    static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM);
+
+    /**
+     * @return a single-element set holding {@link MediaType#OCTET_STREAM}
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes() {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Writes a prefix of the input that is always shorter than the input.
+     * Empty input produces no output.
+     *
+     * @param is source of the bytes to truncate
+     * @param os destination of the truncated bytes
+     * @throws IOException if reading or writing fails
+     */
+    @Override
+    public void transform(InputStream is, OutputStream os) throws IOException {
+        //TODO -- redo streaming
+        byte[] input = IOUtils.toByteArray(is);
+        if (input.length == 0) {
+            return;
+        }
+        int keep = 1 + random.nextInt(input.length);
+        if (keep >= input.length) {
+            //the draw would keep everything: cut at least one byte, never below zero
+            keep = Math.max(0, input.length - 2);
+        }
+        os.write(input, 0, keep);
+    }
+}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java
new file mode 100644
index 0000000..0484c93
--- /dev/null
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java
@@ -0,0 +1,1283 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fuzzing.pdf;
+
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSBoolean;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.cos.COSFloat;
+import org.apache.pdfbox.cos.COSInteger;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSNull;
+import org.apache.pdfbox.cos.COSNumber;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.cos.COSObjectKey;
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.cos.COSUpdateInfo;
+import org.apache.pdfbox.cos.ICOSVisitor;
+import org.apache.pdfbox.io.IOUtils;
+import org.apache.pdfbox.io.RandomAccessInputStream;
+import org.apache.pdfbox.io.RandomAccessRead;
+import org.apache.pdfbox.pdfparser.PDFXRefStream;
+import org.apache.pdfbox.pdfwriter.COSStandardOutputStream;
+import org.apache.pdfbox.pdfwriter.COSWriter;
+import org.apache.pdfbox.pdfwriter.COSWriterXRefEntry;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
+import org.apache.pdfbox.pdmodel.fdf.FDFDocument;
+import org.apache.pdfbox.pdmodel.interactive.digitalsignature.COSFilterInputStream;
+import org.apache.pdfbox.pdmodel.interactive.digitalsignature.SignatureInterface;
+import org.apache.pdfbox.util.Hex;
+
+import java.io.ByteArrayOutputStream;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.SequenceInputStream;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.text.DecimalFormat;
+import java.text.DecimalFormatSymbols;
+import java.text.NumberFormat;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Deque;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Hashtable;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+
+public class EvilCOSWriter implements ICOSVisitor, Closeable {
+ /**
+ * The dictionary open token.
+ */
+ public static final byte[] DICT_OPEN = "<<".getBytes(StandardCharsets.US_ASCII);
+ /**
+ * The dictionary close token.
+ */
+ public static final byte[] DICT_CLOSE = ">>".getBytes(StandardCharsets.US_ASCII);
+ /**
+ * space character.
+ */
+ public static final byte[] SPACE = {' '};
+ /**
+ * The start to a PDF comment.
+ */
+ public static final byte[] COMMENT = {'%'};
+
+ /**
+ * The output version of the PDF.
+ */
+ public static final byte[] VERSION = "PDF-1.4".getBytes(StandardCharsets.US_ASCII);
+ /**
+ * Garbage bytes used to create the PDF header.
+ */
+ public static final byte[] GARBAGE = new byte[]{(byte) 0xf6, (byte) 0xe4, (byte) 0xfc, (byte) 0xdf};
+ /**
+ * The EOF constant.
+ */
+ public static final byte[] EOF = "%%EOF".getBytes(StandardCharsets.US_ASCII);
+ // pdf tokens
+
+ /**
+ * The reference token.
+ */
+ public static final byte[] REFERENCE = "R".getBytes(StandardCharsets.US_ASCII);
+ /**
+ * The XREF token.
+ */
+ public static final byte[] XREF = "xref".getBytes(StandardCharsets.US_ASCII);
+ /**
+ * The xref free token.
+ */
+ public static final byte[] XREF_FREE = "f".getBytes(StandardCharsets.US_ASCII);
+ /**
+ * The xref used token.
+ */
+ public static final byte[] XREF_USED = "n".getBytes(StandardCharsets.US_ASCII);
+ /**
+ * The trailer token.
+ */
+ public static final byte[] TRAILER = "trailer".getBytes(StandardCharsets.US_ASCII);
+ /**
+ * The start xref token.
+ */
+ public static final byte[] STARTXREF = "startxref".getBytes(StandardCharsets.US_ASCII);
+ /**
+ * The starting object token.
+ */
+ public static final byte[] OBJ = "obj".getBytes(StandardCharsets.US_ASCII);
+ /**
+ * The end object token.
+ */
+ public static final byte[] ENDOBJ = "endobj".getBytes(StandardCharsets.US_ASCII);
+ /**
+ * The array open token.
+ */
+ public static final byte[] ARRAY_OPEN = "[".getBytes(StandardCharsets.US_ASCII);
+ /**
+ * The array close token.
+ */
+ public static final byte[] ARRAY_CLOSE = "]".getBytes(StandardCharsets.US_ASCII);
+ /**
+ * The open stream token.
+ */
+ public static final byte[] STREAM = "stream".getBytes(StandardCharsets.US_ASCII);
+ /**
+ * The close stream token.
+ */
+ public static final byte[] ENDSTREAM = "endstream".getBytes(StandardCharsets.US_ASCII);
+
+ private final NumberFormat formatXrefOffset = new DecimalFormat("0000000000",
+ DecimalFormatSymbols.getInstance(Locale.US));
+
+ // the decimal format for the xref object generation number data
+ private final NumberFormat formatXrefGeneration = new DecimalFormat("00000",
+ DecimalFormatSymbols.getInstance(Locale.US));
+
+ // the stream where we create the pdf output
+ private OutputStream output;
+
+ // the stream used to write standard cos data
+ private COSStandardOutputStream standardOutput;
+
+ // the start position of the x ref section
+ private long startxref = 0;
+
+ // the current object number
+ private long number = 0;
+
+ // maps the object to the keys generated in the writer
+ // these are used for indirect references in other objects
+ //A hashtable is used on purpose over a hashmap
+ //so that null entries will not get added.
+ @SuppressWarnings({"squid:S1149"})
+ private final Map<COSBase, COSObjectKey> objectKeys = new Hashtable<>();
+
+ private final Map<COSObjectKey, COSBase> keyObject = new HashMap<>();
+
+ // the list of x ref entries to be made so far
+ private final List<COSWriterXRefEntry> xRefEntries = new ArrayList<>();
+ private final Set<COSBase> objectsToWriteSet = new HashSet<>();
+
+ //A list of objects to write.
+ private final Deque<COSBase> objectsToWrite = new LinkedList<>();
+
+ //a list of objects already written
+ private final Set<COSBase> writtenObjects = new HashSet<>();
+
+ //An 'actual' is any COSBase that is not a COSObject.
+ //need to keep a list of the actuals that are added
+ //as well as the objects because there is a problem
+ //when adding a COSObject and then later adding
+ //the actual for that object, so we will track
+ //actuals separately.
+ private final Set<COSBase> actualsAdded = new HashSet<>();
+
+ private COSObjectKey currentObjectKey = null;
+ private PDDocument pdDocument = null;
+ private FDFDocument fdfDocument = null;
+ private boolean willEncrypt = false;
+
+ // signing
+ private boolean incrementalUpdate = false;
+ private boolean reachedSignature = false;
+ private long signatureOffset;
+ private long signatureLength;
+ private long byteRangeOffset;
+ private long byteRangeLength;
+ private RandomAccessRead incrementalInput;
+ private OutputStream incrementalOutput;
+ private SignatureInterface signatureInterface;
+ private byte[] incrementPart;
+ private COSArray byteRangeArray;
+
+ private final PDFTransformerConfig config;
+ private final Random random = new Random();
+    /**
+     * EvilCOSWriter constructor.
+     *
+     * @param outputStream The output stream to write the PDF. It is wrapped in a
+     *                     {@link COSStandardOutputStream}, which {@link #close()} closes.
+     * @param config       settings consulted while writing, e.g.
+     *                     {@code getRandomizeObjectNumbers()} in {@link #doWriteObject(COSBase)}
+     */
+    public EvilCOSWriter(OutputStream outputStream, PDFTransformerConfig config) {
+        this.config = config;
+        this.output = outputStream;
+        this.standardOutput = new COSStandardOutputStream(outputStream);
+    }
+
+    /**
+     * Registers the objects listed in the document's xref table in
+     * {@code objectKeys}/{@code keyObject} (skipping {@link COSNumber}s) and sets
+     * the current object number to the highest number seen. Does nothing for a
+     * {@code null} document.
+     *
+     * @param doc the document to read the xref table from, may be {@code null}
+     * @throws IOException declared by the original signature
+     */
+    private void prepareIncrement(PDDocument doc) throws IOException {
+        if (doc == null) {
+            return;
+        }
+        COSDocument cosDoc = doc.getDocument();
+        Set<COSObjectKey> knownKeys = cosDoc.getXrefTable().keySet();
+        long highestNumber = doc.getDocument().getHighestXRefObjectNumber();
+        for (COSObjectKey key : knownKeys) {
+            COSBase pooled = cosDoc.getObjectFromPool(key).getObject();
+            if (key == null) {
+                continue;
+            }
+            if (pooled != null && !(pooled instanceof COSNumber)) {
+                objectKeys.put(pooled, key);
+                keyObject.put(key, pooled);
+            }
+            highestNumber = Math.max(highestNumber, key.getNumber());
+        }
+        setNumber(highestNumber);
+    }
+
+    /**
+     * Appends an entry to the list of xref entries collected so far.
+     *
+     * @param entry The new entry to add.
+     */
+    protected void addXRefEntry(COSWriterXRefEntry entry) {
+        List<COSWriterXRefEntry> collected = getXRefEntries();
+        collected.add(entry);
+    }
+
+    /**
+     * Closes the standard output stream and, if present, the incremental
+     * output stream. The incremental stream is closed even when closing the
+     * standard stream fails.
+     *
+     * @throws IOException If the underlying stream throws an exception.
+     */
+    @Override
+    public void close() throws IOException {
+        try {
+            if (getStandardOutput() != null) {
+                getStandardOutput().close();
+            }
+        } finally {
+            //must not be skipped by an exception from the first close
+            if (incrementalOutput != null) {
+                incrementalOutput.close();
+            }
+        }
+    }
+
+    /**
+     * Returns the current object number.
+     *
+     * @return The current object number.
+     */
+    protected long getNumber() {
+        return this.number;
+    }
+
+    /**
+     * Returns the map from objects to their keys. The map itself is returned,
+     * not a copy.
+     *
+     * @return A map of all object keys.
+     */
+    public Map<COSBase, COSObjectKey> getObjectKeys() {
+        return this.objectKeys;
+    }
+
+    /**
+     * Returns the raw output stream this writer was created with.
+     *
+     * @return The output stream.
+     */
+    protected java.io.OutputStream getOutput() {
+        return this.output;
+    }
+
+    /**
+     * Returns the stream used to write standard COS data.
+     *
+     * @return The standard output stream.
+     */
+    protected COSStandardOutputStream getStandardOutput() {
+        return this.standardOutput;
+    }
+
+    /**
+     * Returns the recorded start position of the xref section.
+     *
+     * @return The current start xref.
+     */
+    protected long getStartxref() {
+        return this.startxref;
+    }
+
+    /**
+     * Returns the list of xref entries collected so far. The list itself is
+     * returned, not a copy.
+     *
+     * @return All available xref entries.
+     */
+    protected List<COSWriterXRefEntry> getXRefEntries() {
+        return this.xRefEntries;
+    }
+
+    /**
+     * Replaces the current object number.
+     *
+     * @param newNumber The new object number.
+     */
+    protected void setNumber(long newNumber) {
+        this.number = newNumber;
+    }
+
+    /**
+     * Replaces the raw output stream.
+     *
+     * @param newOutput The new output stream.
+     */
+    private void setOutput(OutputStream newOutput) {
+        this.output = newOutput;
+    }
+
+    /**
+     * Replaces the stream used to write standard COS data.
+     *
+     * @param newStandardOutput The new standard output stream.
+     */
+    private void setStandardOutput(COSStandardOutputStream newStandardOutput) {
+        this.standardOutput = newStandardOutput;
+    }
+
+    /**
+     * Records the start position of the xref section.
+     *
+     * @param newStartxref The new start xref attribute.
+     */
+    protected void setStartxref(long newStartxref) {
+        this.startxref = newStartxref;
+    }
+
+    /**
+     * Writes the body of the document: first the root and info dictionaries of
+     * the trailer (with everything queued while writing them), then, after
+     * {@code willEncrypt} has been cleared, the encrypt dictionary.
+     *
+     * @param doc The document to write the body for.
+     * @throws IOException If there is an error writing the data.
+     */
+    protected void doWriteBody(COSDocument doc) throws IOException {
+        COSDictionary trailerDict = doc.getTrailer();
+        COSDictionary rootDict = trailerDict.getCOSDictionary(COSName.ROOT);
+        COSDictionary infoDict = trailerDict.getCOSDictionary(COSName.INFO);
+        COSDictionary encryptDict = trailerDict.getCOSDictionary(COSName.ENCRYPT);
+
+        for (COSDictionary candidate : new COSDictionary[]{rootDict, infoDict}) {
+            if (candidate != null) {
+                addObjectToWrite(candidate);
+            }
+        }
+        doWriteObjects();
+
+        willEncrypt = false;
+        if (encryptDict != null) {
+            addObjectToWrite(encryptDict);
+        }
+        doWriteObjects();
+    }
+
+    /**
+     * Drains the queue of pending objects, writing each one in queue order.
+     *
+     * @throws IOException if an object cannot be written
+     */
+    private void doWriteObjects() throws IOException {
+        while (!objectsToWrite.isEmpty()) {
+            COSBase pending = objectsToWrite.removeFirst();
+            objectsToWriteSet.remove(pending);
+            doWriteObject(pending);
+        }
+    }
+
+    /**
+     * Queues an object for writing unless it has already been written or
+     * queued, or unless both it and the registered object for its key report
+     * that they do not need to be updated.
+     *
+     * @param object the object (or {@link COSObject} wrapper) to queue
+     */
+    private void addObjectToWrite(COSBase object) {
+        //an 'actual' is the object itself, or the wrapped object of a COSObject
+        COSBase actual = (object instanceof COSObject)
+                ? ((COSObject) object).getObject()
+                : object;
+
+        if (writtenObjects.contains(object)
+                || objectsToWriteSet.contains(object)
+                || actualsAdded.contains(actual)) {
+            return;
+        }
+
+        COSObjectKey knownKey = (actual != null) ? objectKeys.get(actual) : null;
+        COSBase registered = (knownKey != null) ? keyObject.get(knownKey) : null;
+
+        boolean alreadyKnown = actual != null && objectKeys.containsKey(actual);
+        if (alreadyKnown
+                && object instanceof COSUpdateInfo && !((COSUpdateInfo) object).isNeedToBeUpdated()
+                && registered instanceof COSUpdateInfo && !((COSUpdateInfo) registered).isNeedToBeUpdated()) {
+            return;
+        }
+
+        objectsToWrite.add(object);
+        objectsToWriteSet.add(object);
+        if (actual != null) {
+            actualsAdded.add(actual);
+        }
+    }
+
+    /**
+     * Writes one indirect object: the {@code <number> <generation> obj} line,
+     * the object itself and the closing {@code endobj}. An xref entry with the
+     * object's real key is recorded first. When the config asks for randomized
+     * object numbers, the number written to the stream is usually replaced, so
+     * it no longer matches the recorded key.
+     *
+     * @param obj The object to write.
+     * @throws IOException if the output cannot be written
+     */
+    public void doWriteObject(COSBase obj) throws IOException {
+        writtenObjects.add(obj);
+        // find the physical reference
+        currentObjectKey = getObjectKey(obj);
+        // add a x ref entry
+        addXRefEntry(new COSWriterXRefEntry(getStandardOutput().getPos(), obj, currentObjectKey));
+
+        long objectNumber = currentObjectKey.getNumber();
+        if (config.getRandomizeObjectNumbers() && random.nextFloat() < 0.99) {
+            //TODO -- pick a genuinely random number; for now always collide on object 1
+            objectNumber = 1;
+        }
+        // write the object
+        getStandardOutput().write(String.valueOf(objectNumber).getBytes(StandardCharsets.ISO_8859_1));
+        getStandardOutput().write(SPACE);
+        getStandardOutput().write(String.valueOf(currentObjectKey.getGeneration()).getBytes(StandardCharsets.ISO_8859_1));
+        getStandardOutput().write(SPACE);
+        getStandardOutput().write(OBJ);
+        getStandardOutput().writeEOL();
+        mutate(obj);
+        // null test added to please Sonar
+        // TODO: shouldn't all public methods be guarded against passing null. Passing null to most methods will
+        // fail with an NPE
+        if (obj != null) {
+            obj.accept(this);
+        }
+        getStandardOutput().writeEOL();
+        getStandardOutput().write(ENDOBJ);
+        getStandardOutput().writeEOL();
+    }
+
+    /**
+     * Placeholder hook called for every object before it is written. It
+     * currently changes nothing.
+     *
+     * @param obj the object about to be written
+     */
+    private void mutate(COSBase obj) {
+        //stub
+        if (!(obj instanceof COSStream)) {
+            return;
+        }
+        COSStream cosStream = (COSStream) obj;
+        //manipulate filters and stream length
+    }
+
+    /**
+     * Writes the file header: a {@code %FDF-} or {@code %PDF-} line carrying
+     * the document version, followed by a comment line made of the
+     * {@code GARBAGE} bytes.
+     *
+     * @param doc The document to get the data from.
+     * @throws IOException If there is an error writing to the stream.
+     */
+    protected void doWriteHeader(COSDocument doc) throws IOException {
+        //the FDF marker is used whenever an FDF document has been set
+        String marker = (fdfDocument != null) ? "%FDF-" : "%PDF-";
+        String headerString = marker + Float.toString(doc.getVersion());
+
+        getStandardOutput().write(headerString.getBytes(StandardCharsets.ISO_8859_1));
+        getStandardOutput().writeEOL();
+
+        getStandardOutput().write(COMMENT);
+        getStandardOutput().write(GARBAGE);
+        getStandardOutput().writeEOL();
+    }
+
+
+    /**
+     * Writes the {@code trailer} keyword and the trailer dictionary. Before
+     * writing, the dictionary's size is set to the highest recorded object
+     * number plus one, entries that do not apply are removed, and the ID array
+     * is marked as direct.
+     *
+     * @param doc The document to create the trailer for.
+     * @throws IOException If there is an IOError while writing the document.
+     */
+    protected void doWriteTrailer(COSDocument doc) throws IOException {
+        getStandardOutput().write(TRAILER);
+        getStandardOutput().writeEOL();
+
+        COSDictionary trailerDict = doc.getTrailer();
+        //sort xref, needed only if object keys not regenerated
+        List<COSWriterXRefEntry> entries = getXRefEntries();
+        Collections.sort(entries);
+        long highestNumber = entries.get(entries.size() - 1).getKey().getNumber();
+        trailerDict.setLong(COSName.SIZE, highestNumber + 1);
+
+        // Only need to stay, if an incremental update will be performed
+        if (!incrementalUpdate) {
+            trailerDict.removeItem(COSName.PREV);
+        }
+        if (!doc.isXRefStream()) {
+            trailerDict.removeItem(COSName.XREF_STM);
+        }
+        // Remove a checksum if present
+        trailerDict.removeItem(COSName.DOC_CHECKSUM);
+
+        COSArray ids = trailerDict.getCOSArray(COSName.ID);
+        if (ids != null) {
+            ids.setDirect(true);
+        }
+
+        trailerDict.accept(this);
+    }
+
+ private void doWriteXRefInc(COSDocument doc, long hybridPrev) throws IOException {
+ if (doc.isXRefStream() || hybridPrev != -1) {
+ // the file uses XrefStreams, so we need to update
+ // it with an xref stream. We create a new one and fill it
+ // with data available here
+
+ // create a new XRefStrema object
+ PDFXRefStream pdfxRefStream = new PDFXRefStream(doc);
+
+ // add all entries from the incremental update.
+ List<COSWriterXRefEntry> xRefEntries2 = getXRefEntries();
+ for (COSWriterXRefEntry cosWriterXRefEntry : xRefEntries2) {
+ pdfxRefStream.addEntry(cosWriterXRefEntry);
+ }
+
+ COSDictionary trailer = doc.getTrailer();
+ if (incrementalUpdate) {
+ // use previous startXref value as new PREV value
+ trailer.setLong(COSName.PREV, doc.getStartXref());
+ } else {
+ trailer.removeItem(COSName.PREV);
+ }
+ pdfxRefStream.addTrailerInfo(trailer);
+ // the size is the highest object number+1. we add one more
+ // for the xref stream object we are going to write
+ pdfxRefStream.setSize(getNumber() + 2);
+
+ setStartxref(getStandardOutput().getPos());
+ COSStream stream2 = pdfxRefStream.getStream();
+ doWriteObject(stream2);
+ }
+
+ if (!doc.isXRefStream() || hybridPrev != -1) {
+ COSDictionary trailer = doc.getTrailer();
+ trailer.setLong(COSName.PREV, doc.getStartXref());
+ if (hybridPrev != -1) {
+ COSName xrefStm = COSName.XREF_STM;
+ trailer.removeItem(xrefStm);
+ trailer.setLong(xrefStm, getStartxref());
+ }
+ doWriteXRefTable();
+ doWriteTrailer(doc);
+ }
+ }
+
+ // Writes the "xref" table: the keyword, then for every contiguous run of object
+ // numbers a "start count" line followed by that many entries.
+ private void doWriteXRefTable() throws IOException {
+     addXRefEntry(COSWriterXRefEntry.getNullEntry());
+
+     // sort xref, needed only if object keys not regenerated
+     Collections.sort(getXRefEntries());
+
+     // remember the position where x ref was written
+     setStartxref(getStandardOutput().getPos());
+
+     getStandardOutput().write(XREF);
+     getStandardOutput().writeEOL();
+
+     // ranges arrive as (start, count) pairs; we assume starting from scratch
+     Long[] ranges = getXRefRanges(getXRefEntries());
+     if (ranges.length % 2 != 0) {
+         // an odd-length array cannot be read as pairs, so nothing is written
+         return;
+     }
+
+     int entryIndex = 0;
+     for (int pair = 0; pair < ranges.length; pair += 2) {
+         writeXrefRange(ranges[pair], ranges[pair + 1]);
+
+         long entriesInRange = ranges[pair + 1];
+         for (long written = 0; written < entriesInRange; written++) {
+             writeXrefEntry(xRefEntries.get(entryIndex++));
+         }
+     }
+ }
+
+ /**
+  * Writes an incremental update for the non-signature case (usable e.g. for augmenting
+  * signatures): the existing PDF is copied to the incremental output, then the buffered
+  * update is appended.
+  *
+  * @throws IOException if copying or writing fails
+  */
+ private void doWriteIncrement() throws IOException {
+     // the existing PDF goes out first
+     InputStream existingPdf = new RandomAccessInputStream(incrementalInput);
+     IOUtils.copy(existingPdf, incrementalOutput);
+
+     // followed by the actual incremental update held in the in-memory buffer
+     byte[] update = ((ByteArrayOutputStream) output).toByteArray();
+     incrementalOutput.write(update);
+ }
+
+ /**
+  * Finishes a signed incremental update: computes the ByteRange values, stores them in
+  * {@code byteRangeArray}, patches their textual form into the buffered increment and,
+  * when a {@code signatureInterface} is set, signs the data and writes the signature.
+  *
+  * @throws IOException if the ByteRange text does not fit into the reserved space, or if
+  *                     reading or writing fails
+  */
+ private void doWriteSignature() throws IOException {
+ // calculate the ByteRange values
+ long inLength = incrementalInput.length();
+ long beforeLength = signatureOffset;
+ long afterOffset = signatureOffset + signatureLength;
+ // algebraically: current position - signatureOffset - signatureLength
+ long afterLength = getStandardOutput().getPos() - (inLength + signatureLength) - (signatureOffset - inLength);
+
+ // only the closing bracket is part of this text; presumably the opening one already
+ // sits in the buffer just before byteRangeOffset -- TODO confirm
+ String byteRange = "0 " + beforeLength + " " + afterOffset + " " + afterLength + "]";
+
+ // Assign the values to the actual COSArray, so that the user can access it before closing
+ byteRangeArray.set(0, COSInteger.ZERO);
+ byteRangeArray.set(1, COSInteger.get(beforeLength));
+ byteRangeArray.set(2, COSInteger.get(afterOffset));
+ byteRangeArray.set(3, COSInteger.get(afterLength));
+
+ if (byteRange.length() > byteRangeLength) {
+ throw new IOException("Can't write new byteRange '" + byteRange +
+ "' not enough space: byteRange.length(): " + byteRange.length() +
+ ", byteRangeLength: " + byteRangeLength);
+ }
+
+ // copy the new incremental data into a buffer (e.g. signature dict, trailer)
+ ByteArrayOutputStream byteOut = (ByteArrayOutputStream) output;
+ byteOut.flush();
+ incrementPart = byteOut.toByteArray();
+
+ // overwrite the ByteRange in the buffer; offsets are converted to buffer indices by
+ // subtracting inLength, and unused reserved bytes are padded with spaces
+ byte[] byteRangeBytes = byteRange.getBytes(StandardCharsets.ISO_8859_1);
+ for (int i = 0; i < byteRangeLength; i++) {
+ if (i >= byteRangeBytes.length) {
+ incrementPart[(int) (byteRangeOffset + i - inLength)] = 0x20; // SPACE
+ } else {
+ incrementPart[(int) (byteRangeOffset + i - inLength)] = byteRangeBytes[i];
+ }
+ }
+
+ if (signatureInterface != null) {
+ // data to be signed
+ final InputStream dataToSign = getDataToSign();
+
+ // sign the bytes
+ byte[] signatureBytes = signatureInterface.sign(dataToSign);
+ writeExternalSignature(signatureBytes);
+ }
+ // else the signature should be created externally and set via writeExternalSignature()
+ }
+
+ /**
+  * Returns the stream of PDF data to be signed. Clients should use this method only to
+  * create signatures externally. {@link #write(PDDocument)} should have been called
+  * before, and the created signature should be set using
+  * {@link #writeExternalSignature(byte[])}.
+  * <p>
+  * When a {@link SignatureInterface} instance is used, the writer obtains and writes the
+  * signature itself.
+  * </p>
+  *
+  * @return data stream to be signed
+  * @throws IllegalStateException if PDF is not prepared for external signing
+  * @throws IOException if input data is closed
+  */
+ public InputStream getDataToSign() throws IOException {
+     if (incrementPart == null || incrementalInput == null) {
+         throw new IllegalStateException("PDF not prepared for signing");
+     }
+
+     // position of the signature inside the incremental buffer
+     int sigStart = (int) (signatureOffset - incrementalInput.length());
+     int sigEnd = sigStart + (int) signatureLength;
+
+     // (offset, length) pairs of incremental bytes to sign: everything before and
+     // everything after the signature (includes /ByteRange but not /Contents)
+     int[] range = {0, sigStart, sigEnd, incrementPart.length - sigEnd};
+
+     InputStream existingPdf = new RandomAccessInputStream(incrementalInput);
+     InputStream filteredIncrement = new COSFilterInputStream(incrementPart, range);
+     return new SequenceInputStream(existingPdf, filteredIncrement);
+ }
+
+ /**
+  * Writes an externally created signature of the PDF data obtained via
+  * {@link #getDataToSign()}. The hex form of the signature is patched into the buffered
+  * increment, which is then written out after the existing PDF.
+  *
+  * @param cmsSignature CMS signature byte array
+  * @throws IllegalStateException if PDF is not prepared for external signing
+  * @throws IOException if source data stream is closed, or if the signature does not fit
+  *                     into the reserved space
+  */
+ public void writeExternalSignature(byte[] cmsSignature) throws IOException {
+     if (incrementPart == null || incrementalInput == null) {
+         throw new IllegalStateException("PDF not prepared for setting signature");
+     }
+
+     byte[] hexSignature = Hex.getBytes(cmsSignature);
+
+     // subtract 2 bytes because of the enclosing "<>"
+     long available = signatureLength - 2;
+     if (hexSignature.length > available) {
+         throw new IOException("Can't write signature, not enough space");
+     }
+
+     // overwrite the signature Contents in the buffer, starting one byte past its offset
+     int sigStart = (int) (signatureOffset - incrementalInput.length());
+     System.arraycopy(hexSignature, 0, incrementPart, sigStart + 1, hexSignature.length);
+
+     // write the data to the incremental output stream
+     IOUtils.copy(new RandomAccessInputStream(incrementalInput), incrementalOutput);
+     incrementalOutput.write(incrementPart);
+
+     // prevent further use
+     incrementPart = null;
+ }
+
+ /**
+  * Writes one xref subsection header line: {@code x}, a space, {@code y}, end of line.
+  *
+  * @param x first value of the line (callers pass the first object number of the range)
+  * @param y second value of the line (callers pass the number of entries in the range)
+  * @throws IOException if writing fails
+  */
+ private void writeXrefRange(long x, long y) throws IOException {
+     byte[] first = String.valueOf(x).getBytes(StandardCharsets.ISO_8859_1);
+     byte[] second = String.valueOf(y).getBytes(StandardCharsets.ISO_8859_1);
+
+     getStandardOutput().write(first);
+     getStandardOutput().write(SPACE);
+     getStandardOutput().write(second);
+     getStandardOutput().writeEOL();
+ }
+
+ /**
+  * Writes a single xref table line: formatted offset, formatted generation, the
+  * free/used marker, terminated by CRLF.
+  *
+  * @param entry the xref entry to write
+  * @throws IOException if writing fails
+  */
+ private void writeXrefEntry(COSWriterXRefEntry entry) throws IOException {
+     byte[] offsetBytes =
+             formatXrefOffset.format(entry.getOffset()).getBytes(StandardCharsets.ISO_8859_1);
+     byte[] generationBytes =
+             formatXrefGeneration.format(entry.getKey().getGeneration()).getBytes(StandardCharsets.ISO_8859_1);
+
+     getStandardOutput().write(offsetBytes);
+     getStandardOutput().write(SPACE);
+     getStandardOutput().write(generationBytes);
+     getStandardOutput().write(SPACE);
+     if (entry.isFree()) {
+         getStandardOutput().write(XREF_FREE);
+     } else {
+         getStandardOutput().write(XREF_USED);
+     }
+     getStandardOutput().writeCRLF();
+ }
+
+ /**
+  * Collapses the object numbers of the given xref entries into (start, count) pairs,
+  * one pair per run of consecutive numbers. The format of the returned array is the
+  * same as in the PDF specification; see section 7.5.4 of ISO32000-1:2008, example 1
+  * (page 40) for reference.
+  * <p>
+  * Example: the object numbers 0 1 2 5 6 7 8 10
+  * <p>
+  * produce the array
+  * <p>
+  * 0 3 5 4 10 1
+  * <p>
+  * meaning that object 0 starts a run of 3 numbers, object 5 starts a run of 4 numbers
+  * and object 10 stands alone.
+  * <p>
+  * Runs are detected in iteration order, so the list is expected to be sorted by object
+  * number. Object numbers are kept as {@code long} throughout; they are no longer
+  * narrowed to {@code int}.
+  *
+  * @param xRefEntriesList list with the xRef entries that were written
+  * @return an array of alternating start numbers and run lengths; empty for an empty list
+  */
+ protected Long[] getXRefRanges(List<COSWriterXRefEntry> xRefEntriesList) {
+     // -2 marks "no entry seen yet"
+     long last = -2;
+     long count = 1;
+
+     List<Long> list = new ArrayList<>();
+     for (COSWriterXRefEntry entry : xRefEntriesList) {
+         long nr = entry.getKey().getNumber();
+         if (nr == last + 1) {
+             // continues the current run
+             ++count;
+             last = nr;
+         } else if (last == -2) {
+             // very first entry opens the first run
+             last = nr;
+         } else {
+             // gap found: emit the finished run and start a new one
+             list.add(last - count + 1);
+             list.add(count);
+             last = nr;
+             count = 1;
+         }
+     }
+     // the run still open when the loop ends has to be written out as well
+     if (!xRefEntriesList.isEmpty()) {
+         list.add(last - count + 1);
+         list.add(count);
+     }
+     return list.toArray(new Long[0]);
+ }
+
+ /**
+  * Returns the object key for the given object, assigning a fresh one (next object
+  * number, generation 0) if neither the object nor the object it wraps has a key yet.
+  *
+  * @param obj The object to get the key for.
+  * @return The object key for the object.
+  */
+ private COSObjectKey getObjectKey(COSBase obj) {
+     // a COSObject is a reference wrapper; also consider the object it points to
+     COSBase actual = obj instanceof COSObject ? ((COSObject) obj).getObject() : obj;
+
+     // PDFBOX-4540: because objectKeys is accessible from outside, it is possible
+     // that a COSObject obj is already in the objectKeys map.
+     COSObjectKey key = objectKeys.get(obj);
+     if (key == null && actual != null) {
+         key = objectKeys.get(actual);
+     }
+     if (key != null) {
+         return key;
+     }
+
+     // nothing registered yet: take the next object number and remember it for both
+     setNumber(getNumber() + 1);
+     key = new COSObjectKey(getNumber(), 0);
+     objectKeys.put(obj, key);
+     if (actual != null) {
+         objectKeys.put(actual, key);
+     }
+     return key;
+ }
+
+ /**
+  * Writes an array: the opening bracket, every element (inline or as an indirect
+  * reference), the closing bracket and an end of line. Elements are separated by a
+  * space, except that every tenth element is followed by an end of line instead.
+  *
+  * @param obj the array to write
+  * @return always {@code null}
+  * @throws IOException if writing fails
+  */
+ @Override
+ public Object visitFromArray(COSArray obj) throws IOException {
+     getStandardOutput().write(ARRAY_OPEN);
+
+     int written = 0;
+     Iterator<COSBase> elements = obj.iterator();
+     while (elements.hasNext()) {
+         COSBase element = elements.next();
+
+         if (element == null) {
+             COSNull.NULL.accept(this);
+         } else if (element instanceof COSDictionary) {
+             if (!element.isDirect()) {
+                 addObjectToWrite(element);
+                 writeReference(element);
+             } else {
+                 visitFromDictionary((COSDictionary) element);
+             }
+         } else if (element instanceof COSObject) {
+             COSBase target = ((COSObject) element).getObject();
+             // PDFBOX-4308: added willEncrypt to prevent an object
+             // that is referenced several times from being written
+             // direct and indirect, thus getting encrypted
+             // with wrong object number or getting encrypted twice
+             boolean asReference = willEncrypt || incrementalUpdate
+                     || target instanceof COSDictionary || target == null;
+             if (asReference) {
+                 addObjectToWrite(element);
+                 writeReference(element);
+             } else {
+                 target.accept(this);
+             }
+         } else {
+             element.accept(this);
+         }
+
+         written++;
+         if (elements.hasNext()) {
+             if (written % 10 != 0) {
+                 getStandardOutput().write(SPACE);
+             } else {
+                 getStandardOutput().writeEOL();
+             }
+         }
+     }
+
+     getStandardOutput().write(ARRAY_CLOSE);
+     getStandardOutput().writeEOL();
+     return null;
+ }
+
+ @Override
+ public Object visitFromBoolean(COSBoolean obj) throws IOException {
+ obj.writePDF(getStandardOutput());
+ return null;
+ }
+
+ /**
+  * Writes a dictionary: the opening delimiter, one line per entry with a non-null value,
+  * and the closing delimiter. Dictionary values are written inline when marked direct and
+  * as indirect references otherwise. While inside a signature dictionary, the output
+  * positions of the Contents and ByteRange values are recorded for later patching.
+  *
+  * @param obj the dictionary to write
+  * @return always {@code null}
+  * @throws IOException if writing fails
+  */
+ @Override
+ public Object visitFromDictionary(COSDictionary obj) throws IOException {
+ // a dictionary of type Sig or DocTimeStamp switches on signature position tracking
+ if (!reachedSignature) {
+ COSBase itemType = obj.getItem(COSName.TYPE);
+ if (COSName.SIG.equals(itemType) || COSName.DOC_TIME_STAMP.equals(itemType)) {
+ reachedSignature = true;
+ }
+ }
+ getStandardOutput().write(DICT_OPEN);
+ getStandardOutput().writeEOL();
+ for (Map.Entry<COSName, COSBase> entry : obj.entrySet()) {
+ COSBase value = entry.getValue();
+ if (value != null) {
+ entry.getKey().accept(this);
+ getStandardOutput().write(SPACE);
+ if (value instanceof COSDictionary) {
+ COSDictionary dict = (COSDictionary) value;
+
+ if (!incrementalUpdate) {
+ // write all XObjects as direct objects, this will save some size
+ // PDFBOX-3684: but avoid dictionary that references itself
+ COSBase item = dict.getItem(COSName.XOBJECT);
+ if (item != null && !COSName.XOBJECT.equals(entry.getKey())) {
+ item.setDirect(true);
+ }
+ item = dict.getItem(COSName.RESOURCES);
+ if (item != null && !COSName.RESOURCES.equals(entry.getKey())) {
+ item.setDirect(true);
+ }
+ }
+
+ if (dict.isDirect()) {
+ // If the object should be written direct, we need
+ // to pass the dictionary to the visitor again.
+ visitFromDictionary(dict);
+ } else {
+ addObjectToWrite(dict);
+ writeReference(dict);
+ }
+ } else if (value instanceof COSObject) {
+ COSBase subValue = ((COSObject) value).getObject();
+ if (willEncrypt || incrementalUpdate || subValue instanceof COSDictionary || subValue == null) {
+ // PDFBOX-4308: added willEncrypt to prevent an object
+ // that is referenced several times from being written
+ // direct and indirect, thus getting encrypted
+ // with wrong object number or getting encrypted twice
+ addObjectToWrite(value);
+ writeReference(value);
+ } else {
+ subValue.accept(this);
+ }
+ } else {
+ // If we reach the pdf signature, we need to determinate the position of the
+ // content and byterange
+ if (reachedSignature && COSName.CONTENTS.equals(entry.getKey())) {
+ // remember where the Contents value starts and how many bytes it occupies
+ signatureOffset = getStandardOutput().getPos();
+ value.accept(this);
+ signatureLength = getStandardOutput().getPos() - signatureOffset;
+ } else if (reachedSignature && COSName.BYTERANGE.equals(entry.getKey())) {
+ byteRangeArray = (COSArray) entry.getValue();
+ // offset and length are each shifted by one byte relative to the written value
+ byteRangeOffset = getStandardOutput().getPos() + 1;
+ value.accept(this);
+ byteRangeLength = getStandardOutput().getPos() - 1 - byteRangeOffset;
+ // tracking is switched off again once the ByteRange has been written
+ reachedSignature = false;
+ } else {
+ value.accept(this);
+ }
+ }
+ getStandardOutput().writeEOL();
+
+ } else {
+ //then we won't write anything, there are a couple cases
+ //were the value of an entry in the COSDictionary will
+ //be a dangling reference that points to nothing
+ //so we will just not write out the entry if that is the case
+ }
+ }
+ getStandardOutput().write(DICT_CLOSE);
+ getStandardOutput().writeEOL();
+ return null;
+ }
+
+ /**
+  * Writes the whole document: header (or a line break for incremental updates), body,
+  * cross-reference data and trailer, then the {@code startxref} / EOF footer. For
+  * incremental updates the buffered increment is finally written out, with or without
+  * a signature.
+  *
+  * @param doc the document to write
+  * @return always {@code null}
+  * @throws IOException if writing fails
+  */
+ @Override
+ public Object visitFromDocument(COSDocument doc) throws IOException {
+     if (incrementalUpdate) {
+         // Sometimes the original file will be missing a newline at the end
+         // In order to avoid having %%EOF the first object on the same line
+         // as the %%EOF, we put a newline here. If there's already one at
+         // the end of the file, an extra one won't hurt. PDFBOX-1051
+         getStandardOutput().writeCRLF();
+     } else {
+         doWriteHeader(doc);
+     }
+
+     doWriteBody(doc);
+
+     // get the previous trailer; -1 means no XRefStm value is available
+     COSDictionary trailer = doc.getTrailer();
+     long hybridPrev = trailer == null ? -1 : trailer.getLong(COSName.XREF_STM);
+
+     if (!(incrementalUpdate || doc.isXRefStream())) {
+         doWriteXRefTable();
+         doWriteTrailer(doc);
+     } else {
+         doWriteXRefInc(doc, hybridPrev);
+     }
+
+     // write the footer: startxref keyword, its value and the EOF marker
+     byte[] startXrefBytes = String.valueOf(getStartxref()).getBytes(StandardCharsets.ISO_8859_1);
+     getStandardOutput().write(STARTXREF);
+     getStandardOutput().writeEOL();
+     getStandardOutput().write(startXrefBytes);
+     getStandardOutput().writeEOL();
+     getStandardOutput().write(EOF);
+     getStandardOutput().writeEOL();
+
+     if (incrementalUpdate) {
+         // no recorded signature or byte range position means a plain increment
+         boolean unsigned = signatureOffset == 0 || byteRangeOffset == 0;
+         if (unsigned) {
+             doWriteIncrement();
+         } else {
+             doWriteSignature();
+         }
+     }
+
+     return null;
+ }
+
+ @Override
+ public Object visitFromFloat(COSFloat obj) throws IOException {
+ obj.writePDF(getStandardOutput());
+ return null;
+ }
+
+ @Override
+ public Object visitFromInt(COSInteger obj) throws IOException {
+ obj.writePDF(getStandardOutput());
+ return null;
+ }
+
+ @Override
+ public Object visitFromName(COSName obj) throws IOException {
+ obj.writePDF(getStandardOutput());
+ return null;
+ }
+
+ @Override
+ public Object visitFromNull(COSNull obj) throws IOException {
+ obj.writePDF(getStandardOutput());
+ return null;
+ }
+
+ /**
+ * visitFromObjRef method comment.
+ *
+ * @param obj The object that is being visited.
+ * @throws IOException If there is an exception while visiting this object.
+ */
+ public void writeReference(COSBase obj) throws IOException {
+ COSObjectKey key = getObjectKey(obj);
+ getStandardOutput().write(String.valueOf(key.getNumber()).getBytes(StandardCharsets.ISO_8859_1));
+ getStandardOutput().write(SPACE);
+ getStandardOutput().write(String.valueOf(key.getGeneration()).getBytes(StandardCharsets.ISO_8859_1));
+ getStandardOutput().write(SPACE);
+ getStandardOutput().write(REFERENCE);
+ }
+
+ /**
+  * Writes a stream object: its dictionary, the {@code stream} keyword, the raw
+  * (still encoded) stream bytes and the {@code endstream} keyword. When encryption is
+  * active the stream is encrypted first, using the key of the object currently being
+  * written.
+  * <p>
+  * The raw input stream is managed by try-with-resources, so it is closed as soon as its
+  * bytes are copied and a failure in {@code close()} cannot mask an earlier exception.
+  *
+  * @param obj the stream to write
+  * @return always {@code null}
+  * @throws IOException if encrypting, reading or writing fails
+  */
+ @Override
+ public Object visitFromStream(COSStream obj) throws IOException {
+     if (willEncrypt) {
+         pdDocument.getEncryption().getSecurityHandler()
+                 .encryptStream(obj, currentObjectKey.getNumber(), currentObjectKey.getGeneration());
+     }
+
+     // the dictionary part of the stream goes first
+     visitFromDictionary(obj);
+     getStandardOutput().write(STREAM);
+     getStandardOutput().writeCRLF();
+
+     // write the stream content
+     try (InputStream input = obj.createRawInputStream()) {
+         IOUtils.copy(input, getStandardOutput());
+     }
+
+     getStandardOutput().writeCRLF();
+     getStandardOutput().write(ENDSTREAM);
+     getStandardOutput().writeEOL();
+     return null;
+ }
+
+ @Override
+ public Object visitFromString(COSString obj) throws IOException {
+ if (willEncrypt) {
+ pdDocument.getEncryption().getSecurityHandler().encryptString(
+ obj,
+ currentObjectKey.getNumber(),
+ currentObjectKey.getGeneration());
+ }
+ COSWriter.writeString(obj, getStandardOutput());
+ return null;
+ }
+
+ /**
+  * Writes the pdf document by wrapping it in a new {@link PDDocument} and delegating to
+  * {@link #write(PDDocument)}.
+  *
+  * @param doc The document to write.
+  * @throws IOException If an error occurs while generating the data.
+  */
+ public void write(COSDocument doc) throws IOException {
+     write(new PDDocument(doc));
+ }
+
+ /**
+  * Writes the pdf document without a signing callback. If a signature is to be created
+  * externally, {@link #writeExternalSignature(byte[])} should be invoked to set it after
+  * calling this method.
+  *
+  * @param doc The document to write.
+  * @throws IOException If an error occurs while generating the data.
+  */
+ public void write(PDDocument doc) throws IOException {
+     final SignatureInterface noSigner = null;
+     write(doc, noSigner);
+ }
+
+ /**
+  * This will write the pdf document. If signature should be created externally,
+  * {@link #writeExternalSignature(byte[])} should be invoked to set signature after calling this method.
+  * <p>
+  * Before the document is serialized this method decides whether it will be encrypted
+  * and makes sure the trailer carries a two-element ID array.
+  *
+  * @param doc The document to write.
+  * @param signInterface class to be used for signing; {@code null} if external signing would be performed
+  * or there will be no signing at all
+  * @throws IOException If an error occurs while generating the data.
+  * @throws IllegalStateException If the document has an encryption dictionary but no protection
+  * policy.
+  */
+ public void write(PDDocument doc, SignatureInterface signInterface) throws IOException {
+ // seed for the generated ID: the document's own id, or the current time if it has none
+ Long idTime = doc.getDocumentId() == null ? System.currentTimeMillis() :
+ doc.getDocumentId();
+
+ pdDocument = doc;
+ signatureInterface = signInterface;
+
+ if (incrementalUpdate) {
+ prepareIncrement(doc);
+ }
+
+ // if the document says we should remove encryption, then we shouldn't encrypt
+ if (doc.isAllSecurityToBeRemoved()) {
+ willEncrypt = false;
+ // also need to get rid of the "Encrypt" in the trailer so readers
+ // don't try to decrypt a document which is not encrypted
+ COSDocument cosDoc = doc.getDocument();
+ COSDictionary trailer = cosDoc.getTrailer();
+ trailer.removeItem(COSName.ENCRYPT);
+ } else {
+ if (pdDocument.getEncryption() != null) {
+ // for incremental updates the security handler is not prepared again
+ if (!incrementalUpdate) {
+ SecurityHandler securityHandler = pdDocument.getEncryption().getSecurityHandler();
+ if (!securityHandler.hasProtectionPolicy()) {
+ throw new IllegalStateException("PDF contains an encryption dictionary, please remove it with "
+ + "setAllSecurityToBeRemoved() or set a protection policy with protect()");
+ }
+ securityHandler.prepareDocumentForEncryption(pdDocument);
+ }
+ willEncrypt = true;
+ } else {
+ willEncrypt = false;
+ }
+ }
+
+ COSDocument cosDoc = pdDocument.getDocument();
+ COSDictionary trailer = cosDoc.getTrailer();
+ COSArray idArray;
+ // the ID counts as missing unless the trailer holds an array of exactly two elements
+ boolean missingID = true;
+ COSBase base = trailer.getDictionaryObject(COSName.ID);
+ if (base instanceof COSArray) {
+ idArray = (COSArray) base;
+ if (idArray.size() == 2) {
+ missingID = false;
+ }
+ } else {
+ idArray = new COSArray();
+ }
+ if (missingID || incrementalUpdate) {
+ MessageDigest md5;
+ try {
+ md5 = MessageDigest.getInstance("MD5");
+ } catch (NoSuchAlgorithmException e) {
+ // should never happen
+ throw new RuntimeException(e);
+ }
+
+ // algorithm says to use time/path/size/values in doc to generate the id.
+ // we don't have path or size, so do the best we can
+ md5.update(Long.toString(idTime).getBytes(StandardCharsets.ISO_8859_1));
+
+ COSDictionary info = trailer.getCOSDictionary(COSName.INFO);
+ if (info != null) {
+ for (COSBase cosBase : info.getValues()) {
+ md5.update(cosBase.toString().getBytes(StandardCharsets.ISO_8859_1));
+ }
+ }
+ // reuse origin documentID if available as first value
+ COSString firstID = missingID ? new COSString(md5.digest()) : (COSString) idArray.get(0);
+ // it's ok to use the same ID for the second part if the ID is created for the first time
+ COSString secondID = missingID ? firstID : new COSString(md5.digest());
+ // the trailer always receives a freshly built two-element array
+ idArray = new COSArray();
+ idArray.add(firstID);
+ idArray.add(secondID);
+ trailer.setItem(COSName.ID, idArray);
+ }
+ // serialization itself is driven by the visitor methods of this writer
+ cosDoc.accept(this);
+ }
+
+ /**
+  * Writes the fdf document. FDF output is never encrypted.
+  *
+  * @param doc The document to write.
+  * @throws IOException If an error occurs while generating the data.
+  */
+ public void write(FDFDocument doc) throws IOException {
+     willEncrypt = false;
+     fdfDocument = doc;
+     fdfDocument.getDocument().accept(this);
+ }
+
+ /**
+ * This will output the given byte getString as a PDF object.
+ *
+ * @param string COSString to be written
+ * @param output The stream to write to.
+ * @throws IOException If there is an error writing to the stream.
+ */
+ public static void writeString(COSString string, OutputStream output) throws IOException {
+ writeString(string.getBytes(), string.getForceHexForm(), output);
+ }
+
+ /**
+  * Writes the given bytes as a PDF string object without forcing hex form.
+  *
+  * @param bytes byte array representation of a string to be written
+  * @param output The stream to write to.
+  * @throws IOException If there is an error writing to the stream.
+  */
+ public static void writeString(byte[] bytes, OutputStream output) throws IOException {
+     final boolean forceHex = false;
+     writeString(bytes, forceHex, output);
+ }
+
+ /**
+  * Writes the given bytes as a PDF string object, either as a literal string in
+  * parentheses or as a hex string in angle brackets. The hex form is used when it is
+  * forced, or when the bytes contain a value outside the ASCII range or an EOL marker.
+  *
+  * @param bytes    byte array representation of a string to be written
+  * @param forceHex {@code true} to write the hex form regardless of content
+  * @param output   The stream to write to.
+  * @throws IOException If there is an error writing to the stream.
+  */
+ private static void writeString(byte[] bytes, boolean forceHex, OutputStream output)
+         throws IOException {
+     boolean asHex = forceHex;
+     if (!asHex) {
+         for (byte b : bytes) {
+             // a negative byte is an eight bit byte and thus outside the ASCII range;
+             // PDFBOX-3107 EOL markers within a string are troublesome
+             if (b < 0 || b == 0x0d || b == 0x0a) {
+                 asHex = true;
+                 break;
+             }
+         }
+     }
+
+     if (asHex) {
+         // write hex string
+         output.write('<');
+         Hex.writeHexBytes(bytes, output);
+         output.write('>');
+         return;
+     }
+
+     // write ASCII string; parentheses and backslashes are escaped with a backslash
+     output.write('(');
+     for (byte b : bytes) {
+         if (b == '(' || b == ')' || b == '\\') {
+             output.write('\\');
+         }
+         output.write(b);
+     }
+     output.write(')');
+ }
+}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformer.java
new file mode 100644
index 0000000..ab7fa11
--- /dev/null
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformer.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fuzzing.pdf;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.fuzzing.Transformer;
+import org.apache.tika.fuzzing.exceptions.CantFuzzException;
+import org.apache.tika.mime.MediaType;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Collections;
+import java.util.Set;
+
+public class PDFTransformer implements Transformer {
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("pdf"));
+ @Override
+ public Set<MediaType> getSupportedTypes() {
+ return SUPPORTED_TYPES;
+ }
+
+ private PDFTransformerConfig config = new PDFTransformerConfig();
+
+ @Override
+ public void transform(InputStream is, OutputStream os) throws IOException, TikaException {
+ try (PDDocument pdDocument = PDDocument.load(is)) {
+ try (EvilCOSWriter cosWriter = new EvilCOSWriter(os, config)) {
+ cosWriter.write(pdDocument);
+ }
+ } catch (InvalidPasswordException e) {
+ throw new CantFuzzException("encrypted doc");
+ }
+ }
+}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java
new file mode 100644
index 0000000..d152878
--- /dev/null
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fuzzing.pdf;
+
+ /**
+  * Settings for {@code PDFTransformer}.
+  */
+ public class PDFTransformerConfig {
+
+     /** Whether object numbers should be randomized; defaults to {@code true}. */
+     private boolean randomizeObjectNumbers = true;
+
+     /**
+      * @return the current value of the randomize-object-numbers setting
+      */
+     public boolean getRandomizeObjectNumbers() {
+         return this.randomizeObjectNumbers;
+     }
+ }
diff --git a/tika-fuzzing/src/main/resources/META-INF/services/org.apache.tika.fuzzing.Transformer b/tika-fuzzing/src/main/resources/META-INF/services/org.apache.tika.fuzzing.Transformer
new file mode 100644
index 0000000..07390de
--- /dev/null
+++ b/tika-fuzzing/src/main/resources/META-INF/services/org.apache.tika.fuzzing.Transformer
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.tika.fuzzing.general.GeneralTransformer
+#org.apache.tika.fuzzing.pdf.PDFTransformer
\ No newline at end of file
diff --git a/tika-fuzzing/src/main/resources/log4j.properties b/tika-fuzzing/src/main/resources/log4j.properties
new file mode 100644
index 0000000..7d3b372
--- /dev/null
+++ b/tika-fuzzing/src/main/resources/log4j.properties
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#info,debug, error,fatal ...
+log4j.rootLogger=info,stderr
+
+#console
+log4j.appender.stderr=org.apache.log4j.ConsoleAppender
+log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
+log4j.appender.stderr.Target=System.err
+
+log4j.appender.stderr.layout.ConversionPattern= %-5p %m%n
diff --git a/tika-fuzzing/src/test/java/TestFuzzingCLI.java b/tika-fuzzing/src/test/java/TestFuzzingCLI.java
new file mode 100644
index 0000000..a98291b
--- /dev/null
+++ b/tika-fuzzing/src/test/java/TestFuzzingCLI.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import org.apache.commons.io.FileUtils;
+import org.apache.tika.fuzzing.cli.FuzzingCLI;
+import org.apache.tika.utils.ProcessUtils;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+public class TestFuzzingCLI {
+    /** Manual CLI run, not yet an actual unit test: fill in both directories before enabling. */
+    @Test
+    @Ignore
+    public void testBasic() throws Exception {
+        String sourceDir = ""; //fill in
+        String targetDir = ""; //fill in
+        FuzzingCLI.main(new String[] {
+                "-i", sourceDir,
+                "-o", targetDir,
+                "-n", "8", //num threads
+                "-t", "1", //max transformers
+                "-p", "100", //per file iterations
+                "-r", "3"
+        });
+    }
+
+    /** Not yet an actual unit test: runs the CLI over /test-documents into a temp dir deleted afterwards. */
+    @Test
+    @Ignore
+    public void testMock() throws Exception {
+        Path sourceDir = Paths.get(getClass().getResource("/test-documents").toURI());
+        Path targetDir = Files.createTempDirectory("tika-fuzzing-");
+        String inputArg = ProcessUtils.escapeCommandLine(sourceDir.toAbsolutePath().toString());
+        String outputArg = ProcessUtils.escapeCommandLine(targetDir.toAbsolutePath().toString());
+        String[] cliArgs = {
+                "-i", inputArg,
+                "-o", outputArg,
+                "-n", "8", //num threads
+                "-t", "0", //max transformers
+                "-p", "10", //per file iterations
+                "-m", "10000", //max ms per file
+                "-r", "3"
+        };
+        try {
+            FuzzingCLI.main(cliArgs);
+        } finally {
+            FileUtils.deleteDirectory(targetDir.toFile());
+        }
+    }
+}
diff --git a/tika-fuzzing/src/test/java/TestTransformer.java b/tika-fuzzing/src/test/java/TestTransformer.java
new file mode 100644
index 0000000..1db2e1e
--- /dev/null
+++ b/tika-fuzzing/src/test/java/TestTransformer.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import org.apache.tika.fuzzing.general.GeneralTransformer;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+
+public class TestTransformer {
+
+    /** Not yet an actual unit test: transforms the same bytes 100 times and prints SAME when unchanged. */
+    @Test
+    @Ignore
+    public void testBasic() throws Exception {
+        Path source = Paths.get(""); //put something meaningful here
+        GeneralTransformer fuzzer = new GeneralTransformer();
+        byte[] original = Files.readAllBytes(source);
+
+        int remaining = 100;
+        while (remaining-- > 0) {
+            ByteArrayOutputStream fuzzed = new ByteArrayOutputStream();
+            fuzzer.transform(new ByteArrayInputStream(original), fuzzed);
+            //flag rounds whose output is byte-identical to the input
+            boolean unchanged = Arrays.equals(fuzzed.toByteArray(), original);
+            if (unchanged) {
+                System.out.println("SAME");
+            }
+        }
+    }
+}
diff --git a/tika-fuzzing/src/test/resources/test-documents/heavy_hang.xml b/tika-fuzzing/src/test/resources/test-documents/heavy_hang.xml
new file mode 100644
index 0000000..f1f5b67
--- /dev/null
+++ b/tika-fuzzing/src/test/resources/test-documents/heavy_hang.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<mock>
+ <metadata action="add" name="author">Nikolai Lobachevsky</metadata>
+ <write element="p">some content</write>
+ <hang millis="30000" heavy="true" pulse_millis="100" />
+</mock>
\ No newline at end of file
diff --git a/tika-fuzzing/src/test/resources/test-documents/null_pointer.xml b/tika-fuzzing/src/test/resources/test-documents/null_pointer.xml
new file mode 100644
index 0000000..4561c3a
--- /dev/null
+++ b/tika-fuzzing/src/test/resources/test-documents/null_pointer.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<mock>
+ <metadata action="add" name="author">Nikolai Lobachevsky</metadata>
+ <write element="p">some content</write>
+ <throw class="java.lang.NullPointerException">another null pointer exception</throw>
+</mock>
\ No newline at end of file
diff --git a/tika-fuzzing/src/test/resources/test-documents/system_exit.xml b/tika-fuzzing/src/test/resources/test-documents/system_exit.xml
new file mode 100644
index 0000000..75d1d3b
--- /dev/null
+++ b/tika-fuzzing/src/test/resources/test-documents/system_exit.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<mock>
+ <metadata action="add" name="author">Nikolai Lobachevsky</metadata>
+ <write element="p">some content</write>
+ <system_exit />
+</mock>
\ No newline at end of file