You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2014/03/07 23:03:59 UTC
svn commit: r1575427 -
/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
Author: tilman
Date: Fri Mar 7 22:03:59 2014
New Revision: 1575427
URL: http://svn.apache.org/r1575427
Log:
PDFBOX-1164: add heuristic by Timo Boehme to detect wrongly assumed end of inline image
Modified:
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java?rev=1575427&r1=1575426&r2=1575427&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java Fri Mar 7 22:03:59 2014
@@ -19,6 +19,7 @@ package org.apache.pdfbox.pdfparser;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.io.PushbackInputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
@@ -47,6 +48,8 @@ public class PDFStreamParser extends Bas
{
private List<Object> streamObjects = new ArrayList<Object>( 100 );
private RandomAccess file;
+ private final int maxBinCharTestLength = 5;
+ private final byte[] binCharTestArr = new byte[maxBinCharTestLength];
/**
* Constructor that takes a stream to parse.
@@ -393,7 +396,8 @@ public class PDFStreamParser extends Bas
// Be aware not all kind of whitespaces are allowed here. see PDFBOX1561
while( !(lastByte == 'E' &&
currentByte == 'I' &&
- isSpaceOrReturn()) &&
+ isSpaceOrReturn() &&
+ hasNoFollowingBinData( pdfSource )) &&
!pdfSource.isEOF() )
{
imageData.write( lastByte );
@@ -435,6 +439,37 @@ public class PDFStreamParser extends Bas
}
/**
+ * Peeks at up to 5 upcoming bytes of the stream (the length is taken from
+ * <code>maxBinCharTestLength</code>) and checks that none of them looks like
+ * binary data. A byte counts as binary if its signed value is below 0x09
+ * (this includes every byte above 0x7f, which is negative as a Java
+ * <code>byte</code>) or if it lies between 0x0b and 0x1f and is not 0x0d.
+ * Tab, line feed, carriage return and everything from 0x20 to 0x7f are
+ * therefore accepted. All bytes that were read are pushed back, so the
+ * stream position is unchanged when the method returns.
+ *
+ * @param pdfSource the stream to inspect; NOTE(review): its pushback buffer
+ * presumably has to hold at least the bytes read here - confirm where the
+ * stream is created.
+ *
+ * @return <code>true</code> if no binary byte was found among the bytes
+ * read (also when nothing could be read), otherwise <code>false</code>
+ *
+ * @throws IOException passed on from reading from or unreading to the stream
+ */
+ private boolean hasNoFollowingBinData(final PushbackInputStream pdfSource)
+ throws IOException
+ {
+ // heuristic as suggested in PDFBOX-1164; read() may deliver fewer bytes than requested
+ final int readBytes = pdfSource.read(binCharTestArr, 0, maxBinCharTestLength);
+ boolean noBinData = true;
+
+ if (readBytes > 0)
+ {
+ for (int bIdx = 0; bIdx < readBytes; bIdx++)
+ {
+ final byte b = binCharTestArr[bIdx];
+ if ((b < 0x09) || ((b > 0x0a) && (b < 0x20) && (b != 0x0d)))
+ {
+ // control character other than tab/LF/CR, or > 0x7f (negative as signed byte) -> we have binary data
+ noBinData = false;
+ break;
+ }
+ }
+ pdfSource.unread(binCharTestArr, 0, readBytes);
+ }
+ return noBinData;
+ }
+
+ /**
* This will read an operator from the stream.
*
* @return The operator that was read from the stream.