You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/09/18 14:41:49 UTC
[tika] branch main updated: TIKA-3199 -- improve fuzzing of PDF streams; fix typos and improve documentation
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 4c4ef20 TIKA-3199 -- improve fuzzing of PDF streams; fix typos and improve documentation
4c4ef20 is described below
commit 4c4ef209ab78cc74b2b9c1779e0e1c8e06f754fb
Author: tballison <ta...@apache.org>
AuthorDate: Fri Sep 18 10:41:39 2020 -0400
TIKA-3199 -- improve fuzzing of PDF streams; fix typos and improve documentation
---
.../org/apache/tika/fuzzing/pdf/EvilCOSWriter.java | 22 ++++----
.../tika/fuzzing/pdf/PDFTransformerConfig.java | 59 +++++++++++++++++++---
2 files changed, 63 insertions(+), 18 deletions(-)
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java
index 8a576a7..15e84a2 100644
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java
@@ -32,7 +32,6 @@ import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.cos.COSUpdateInfo;
import org.apache.pdfbox.cos.ICOSVisitor;
-import org.apache.pdfbox.filter.DecodeResult;
import org.apache.pdfbox.filter.Filter;
import org.apache.pdfbox.filter.FilterFactory;
import org.apache.pdfbox.io.IOUtils;
@@ -51,7 +50,6 @@ import org.apache.pdfbox.pdmodel.interactive.digitalsignature.SignatureInterface
import org.apache.pdfbox.util.Hex;
import org.apache.tika.exception.TikaException;
import org.apache.tika.fuzzing.Transformer;
-import org.apache.tika.fuzzing.general.GeneralTransformer;
import org.apache.tika.io.IOExceptionWithCause;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
@@ -192,7 +190,7 @@ public class EvilCOSWriter implements ICOSVisitor, Closeable {
// the current object number
private long number = 0;
- private int roughtNumberOfObjects = 0;
+ private int roughNumberOfObjects = 0;
// maps the object to the keys generated in the writer
// these are used for indirect references in other objects
@@ -406,7 +404,7 @@ public class EvilCOSWriter implements ICOSVisitor, Closeable {
COSDictionary root = trailer.getCOSDictionary(COSName.ROOT);
COSDictionary info = trailer.getCOSDictionary(COSName.INFO);
COSDictionary encrypt = trailer.getCOSDictionary(COSName.ENCRYPT);
- roughtNumberOfObjects = doc.getObjects().size();
+ roughNumberOfObjects = doc.getObjects().size();
if (root != null) {
addObjectToWrite(root);
}
@@ -476,7 +474,7 @@ public class EvilCOSWriter implements ICOSVisitor, Closeable {
// write the object
long objectNumber = currentObjectKey.getNumber();
- if (config.getRandomizeObjectNumbers() > -1.0f && random.nextFloat() <
+ if (config.getRandomizeObjectNumbers() > 0.0f && random.nextFloat() <
config.getRandomizeObjectNumbers()) {
objectNumber = random.nextInt(((int)objectNumber)*2);
}
@@ -506,16 +504,16 @@ public class EvilCOSWriter implements ICOSVisitor, Closeable {
COSObject cosObject = (COSObject)obj;
COSBase underlyingObject = cosObject.getObject();
- if (underlyingObject instanceof COSStream && config.getRawStreamTransformer() != null) {
+ if (underlyingObject instanceof COSStream && config.getUnfilteredStreamTransformer() != null) {
COSStream cosStream = (COSStream)underlyingObject;
- Transformer rawStreamTransformer = config.getRawStreamTransformer();
+ Transformer unfilteredStreamTransformer = config.getUnfilteredStreamTransformer();
ByteArrayOutputStream bos = new ByteArrayOutputStream();
try (InputStream is = cosStream.createRawInputStream()) {
IOUtils.copy(is, bos);
}
ByteArrayOutputStream transformed = new ByteArrayOutputStream();
try {
- rawStreamTransformer.transform(new ByteArrayInputStream(bos.toByteArray()), transformed);
+ unfilteredStreamTransformer.transform(new ByteArrayInputStream(bos.toByteArray()), transformed);
} catch (TikaException e) {
throw new IOExceptionWithCause(e);
}
@@ -609,10 +607,10 @@ public class EvilCOSWriter implements ICOSVisitor, Closeable {
}
private TikaInputStream transformRawStream(TikaInputStream is) throws IOException {
- if (config.getRawStreamTransformer() != null) {
+ if (config.getUnfilteredStreamTransformer() != null) {
if (is.getLength() < 10000000) {
try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
- config.getRawStreamTransformer().transform(is, bos);
+ config.getUnfilteredStreamTransformer().transform(is, bos);
bos.flush();
bos.close();
return TikaInputStream.get(bos.toByteArray());
@@ -623,7 +621,7 @@ public class EvilCOSWriter implements ICOSVisitor, Closeable {
TemporaryResources tmp = new TemporaryResources();
Path p = tmp.createTempFile();
try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(p))) {
- config.getRawStreamTransformer().transform(is, os);
+ config.getUnfilteredStreamTransformer().transform(is, os);
os.flush();
} catch (TikaException e) {
throw new IOExceptionWithCause(e);
@@ -1232,7 +1230,7 @@ public class EvilCOSWriter implements ICOSVisitor, Closeable {
float r = random.nextFloat();
if (randomThreshold > 0.0f &&
r < randomThreshold) {
- long num = random.nextInt(roughtNumberOfObjects);
+ long num = random.nextInt(roughNumberOfObjects);
LOG.debug("corrupting ref number: "+key.getNumber() + " -> "+num);
getStandardOutput().write(String.valueOf(num).getBytes(StandardCharsets.ISO_8859_1));
} else {
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java
index d864ef2..aa7729c 100644
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java
@@ -19,7 +19,6 @@ package org.apache.tika.fuzzing.pdf;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.cos.COSObject;
import org.apache.tika.fuzzing.Transformer;
import org.apache.tika.fuzzing.general.ByteDeleter;
import org.apache.tika.fuzzing.general.ByteFlipper;
@@ -53,7 +52,7 @@ public class PDFTransformerConfig {
new ByteDeleter(),
new ByteFlipper(), new ByteInjector(), new SpanSwapper(), new Truncator());
- private Transformer rawStreamTransformer = new GeneralTransformer(1,
+ private Transformer unfilteredStreamTransformer = new GeneralTransformer(1,
new ByteDeleter(),
new ByteFlipper(), new ByteInjector(), new SpanSwapper(), new Truncator());
@@ -61,10 +60,20 @@ public class PDFTransformerConfig {
return randomizeObjectNumbers;
}
+ /**
+ *
+ * @param randomizeObjectNumbers probability that a given object number will be randomized.
+ * If <= 0, this will be ignored.
+ */
public void setRandomizeObjectNumbers(float randomizeObjectNumbers) {
this.randomizeObjectNumbers = randomizeObjectNumbers;
}
+ /**
+ *
+ * @param randomizeRefNumbers probability that a given reference number will be randomized.
+ * If <= 0, this will be ignored.
+ */
public void setRandomizeRefNumbers(float randomizeRefNumbers) {
this.randomizeRefNumbers = randomizeRefNumbers;
}
@@ -73,26 +82,44 @@ public class PDFTransformerConfig {
return randomizeRefNumbers;
}
- public Transformer getRawStreamTransformer() {
- return rawStreamTransformer;
+ public Transformer getUnfilteredStreamTransformer() {
+ return unfilteredStreamTransformer;
}
public Transformer getStreamTransformer() {
return streamTransformer;
}
+ /**
+ * This transformer is applied to the stream _after_ each filter has been applied.
+ *
+ * @param transformer
+ */
public void setStreamTransformer(Transformer transformer) {
this.streamTransformer = transformer;
}
- public void setRawStreamTransformer(Transformer transformer) {
- this.rawStreamTransformer = transformer;
+ /**
+ * This transformer is applied to the stream _before_ any filters
+ * are applied.
+ * @param transformer
+ */
+ public void setUnfilteredStreamTransformer(Transformer transformer) {
+ this.unfilteredStreamTransformer = transformer;
}
+ /**
+ *
+ * @param maxFilters maximum number of filters to apply
+ */
public void setMaxFilters(int maxFilters) {
this.maxFilters = maxFilters;
}
+ /**
+ * Which filters are allowed
+ * @return
+ */
public Set<COSName> getAllowableFilters() {
return allowableFilters;
}
@@ -101,6 +128,14 @@ public class PDFTransformerConfig {
this.allowableFilters = allowableFilters;
}
+ /**
+ * If {@link #maxFilters} > 0, this will randomly select filters given
+ * the {@link #maxFilters} and {@link #minFilters}. If {@link #maxFilters} < 0,
+ * this will return the existing filters.
+ *
+ * @param existingFilters
+ * @return
+ */
public List<COSName> getFilters(COSBase existingFilters) {
if (maxFilters < 0) {
List<COSName> ret = new ArrayList<>();
@@ -132,6 +167,11 @@ public class PDFTransformerConfig {
return filters;
}
+ /**
+ * Minimum number of filters to apply to streams.
+ *
+ * @param minFilters
+ */
public void setMinFilters(int minFilters) {
this.minFilters = minFilters;
}
@@ -140,6 +180,13 @@ public class PDFTransformerConfig {
return maxFilteredStreamLength;
}
+ /**
+ * Maximum filtered stream length. AsciiHex doubles the size of the stream with
+ * each encoding. This is used as a circuit breaker to stop adding filters
+ * if the stream goes above a given length.
+ *
+ * @param maxFilteredStreamLength
+ */
public void setMaxFilteredStreamLength(long maxFilteredStreamLength) {
this.maxFilteredStreamLength = maxFilteredStreamLength;
}