You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2014/09/07 12:11:03 UTC
svn commit: r1622993 - in
/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser:
BaseParser.java NonSequentialPDFParser.java
Author: tilman
Date: Sun Sep 7 10:11:03 2014
New Revision: 1622993
URL: http://svn.apache.org/r1622993
Log:
PDFBOX-2320: use readUntilEndStream from BaseParser, remove the method from NonSequentialPDFParser; better log output
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1622993&r1=1622992&r2=1622993&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sun Sep 7 10:11:03 2014
@@ -639,7 +639,7 @@ public abstract class BaseParser
*
* @throws IOException
*/
- private void readUntilEndStream( final OutputStream out ) throws IOException
+ protected void readUntilEndStream( final OutputStream out ) throws IOException
{
int bufSize;
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1622993&r1=1622992&r2=1622993&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Sun Sep 7 10:11:03 2014
@@ -1606,121 +1606,14 @@ public class NonSequentialPDFParser exte
{
streamLengthIsValid = false;
LOG.error("The end of the stream doesn't point to the correct offset, using workaround to read the stream");
+ LOG.error("Stream start offset: " + originOffset);
+ LOG.error("Expected endofstream offset: " + expectedEndOfStream);
}
pdfSource.seek(originOffset);
}
return streamLengthIsValid;
}
- private void readUntilEndStream(final OutputStream out) throws IOException
- {
- int bufSize;
- int charMatchCount = 0;
- byte[] keyw = ENDSTREAM;
-
- final int quickTestOffset = 5; // last character position of shortest
- // keyword ('endobj')
-
- // read next chunk into buffer; already matched chars are added to
- // beginning of buffer
- while ((bufSize = pdfSource.read(streamCopyBuf, charMatchCount, streamCopyBufLen
- - charMatchCount)) > 0)
- {
- // number of already matching chars
- int startingMatchCount = charMatchCount;
- int bIdx = charMatchCount;
- int quickTestIdx;
-
- // iterate over buffer, trying to find keyword match
- for (int maxQuicktestIdx = bufSize - quickTestOffset; bIdx < bufSize; bIdx++)
- {
- // reduce compare operations by first test last character we
- // would have to
- // match if current one matches; if it is not a character from
- // keywords
- // we can move behind the test character;
- // this shortcut is inspired by Boyer-Moore string search
- // algorithm
- // and can reduce parsing time by approx. 20%
- if ((charMatchCount == 0)
- && ((quickTestIdx = bIdx + quickTestOffset) < maxQuicktestIdx))
- {
-
- final byte ch = streamCopyBuf[quickTestIdx];
- if ((ch > 't') || (ch < 'a'))
- {
- // last character we would have to match if current
- // character would match
- // is not a character from keywords -> jump behind and
- // start over
- bIdx = quickTestIdx;
- continue;
- }
- }
-
- final byte ch = streamCopyBuf[bIdx]; // could be negative - but
- // we only compare to ASCII
- if (ch == keyw[charMatchCount])
- {
- if (++charMatchCount == keyw.length)
- {
- // match found
- bIdx++;
- break;
- }
- }
- else
- {
- if ((charMatchCount == 3) && (ch == ENDOBJ[charMatchCount]))
- {
- // maybe ENDSTREAM is missing but we could have ENDOBJ
- keyw = ENDOBJ;
- charMatchCount++;
- }
- else
- {
- // no match; incrementing match start by 1 would be dumb
- // since we already know matched chars
- // depending on current char read we may already have
- // beginning of a new match:
- // 'e': first char matched;
- // 'n': if we are at match position idx 7 we already
- // read 'e' thus 2 chars matched
- // for each other char we have to start matching first
- // keyword char beginning with next
- // read position
- charMatchCount = (ch == E) ? 1 : ((ch == N) && (charMatchCount == 7)) ? 2 : 0;
- // search again for 'endstream'
- keyw = ENDSTREAM;
- }
- }
- } // for
-
- int contentBytes = Math.max(0, bIdx - charMatchCount);
-
- // write buffer content until first matched char to output stream
- if (contentBytes > 0)
- {
- out.write(streamCopyBuf, 0, contentBytes);
- }
- if (charMatchCount == keyw.length)
- {
- // keyword matched;
- // unread matched keyword (endstream/endobj) and following buffered content
- pdfSource.unread(streamCopyBuf, contentBytes, bufSize - contentBytes - keyw.length
- + startingMatchCount);
- break;
- }
- else
- {
- // copy matched chars at start of buffer
- System.arraycopy(keyw, 0, streamCopyBuf, 0, charMatchCount);
- }
- } // while
-
- out.flush(); // this writes a lonely CR or drops trailing CR LF and LF
- }
-
/**
*
* @param startXRefOffset