You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2014/09/07 12:11:03 UTC

svn commit: r1622993 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser: BaseParser.java NonSequentialPDFParser.java

Author: tilman
Date: Sun Sep  7 10:11:03 2014
New Revision: 1622993

URL: http://svn.apache.org/r1622993
Log:
PDFBOX-2320: use readUntilEndStream from BaseParser, remove the method from NonSequentialPDFParser; better log output

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1622993&r1=1622992&r2=1622993&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sun Sep  7 10:11:03 2014
@@ -639,7 +639,7 @@ public abstract class BaseParser
      * 
      * @throws IOException
      */
-    private void readUntilEndStream( final OutputStream out ) throws IOException
+    protected void readUntilEndStream( final OutputStream out ) throws IOException
     {
 
         int bufSize;

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1622993&r1=1622992&r2=1622993&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Sun Sep  7 10:11:03 2014
@@ -1606,121 +1606,14 @@ public class NonSequentialPDFParser exte
             {
                 streamLengthIsValid = false;
                 LOG.error("The end of the stream doesn't point to the correct offset, using workaround to read the stream");
+                LOG.error("Stream start offset: " + originOffset);
+                LOG.error("Expected endofstream offset: " + expectedEndOfStream);
             }
             pdfSource.seek(originOffset);
         }
         return streamLengthIsValid;
     }
 
-    private void readUntilEndStream(final OutputStream out) throws IOException
-    {
-        int bufSize;
-        int charMatchCount = 0;
-        byte[] keyw = ENDSTREAM;
-
-        final int quickTestOffset = 5; // last character position of shortest
-                                       // keyword ('endobj')
-
-        // read next chunk into buffer; already matched chars are added to
-        // beginning of buffer
-        while ((bufSize = pdfSource.read(streamCopyBuf, charMatchCount, streamCopyBufLen
-                - charMatchCount)) > 0)
-        {
-            // number of already matching chars
-            int startingMatchCount = charMatchCount;
-            int bIdx = charMatchCount;
-            int quickTestIdx;
-
-            // iterate over buffer, trying to find keyword match
-            for (int maxQuicktestIdx = bufSize - quickTestOffset; bIdx < bufSize; bIdx++)
-            {
-                // reduce compare operations by first test last character we
-                // would have to
-                // match if current one matches; if it is not a character from
-                // keywords
-                // we can move behind the test character;
-                // this shortcut is inspired by Boyer-Moore string search
-                // algorithm
-                // and can reduce parsing time by approx. 20%
-                if ((charMatchCount == 0)
-                        && ((quickTestIdx = bIdx + quickTestOffset) < maxQuicktestIdx))
-                {
-
-                    final byte ch = streamCopyBuf[quickTestIdx];
-                    if ((ch > 't') || (ch < 'a'))
-                    {
-                        // last character we would have to match if current
-                        // character would match
-                        // is not a character from keywords -> jump behind and
-                        // start over
-                        bIdx = quickTestIdx;
-                        continue;
-                    }
-                }
-
-                final byte ch = streamCopyBuf[bIdx]; // could be negative - but
-                                                     // we only compare to ASCII
-                if (ch == keyw[charMatchCount])
-                {
-                    if (++charMatchCount == keyw.length)
-                    {
-                        // match found
-                        bIdx++;
-                        break;
-                    }
-                }
-                else
-                {
-                    if ((charMatchCount == 3) && (ch == ENDOBJ[charMatchCount]))
-                    {
-                        // maybe ENDSTREAM is missing but we could have ENDOBJ
-                        keyw = ENDOBJ;
-                        charMatchCount++;
-                    }
-                    else
-                    {
-                        // no match; incrementing match start by 1 would be dumb
-                        // since we already know matched chars
-                        // depending on current char read we may already have
-                        // beginning of a new match:
-                        // 'e': first char matched;
-                        // 'n': if we are at match position idx 7 we already
-                        // read 'e' thus 2 chars matched
-                        // for each other char we have to start matching first
-                        // keyword char beginning with next
-                        // read position
-                        charMatchCount = (ch == E) ? 1 : ((ch == N) && (charMatchCount == 7)) ? 2 : 0;
-                        // search again for 'endstream'
-                        keyw = ENDSTREAM;
-                    }
-                }
-            } // for
-
-            int contentBytes = Math.max(0, bIdx - charMatchCount);
-
-            // write buffer content until first matched char to output stream
-            if (contentBytes > 0)
-            {
-                out.write(streamCopyBuf, 0, contentBytes);
-            }
-            if (charMatchCount == keyw.length)
-            {
-                // keyword matched;
-                // unread matched keyword (endstream/endobj) and following buffered content
-                pdfSource.unread(streamCopyBuf, contentBytes, bufSize - contentBytes - keyw.length
-                        + startingMatchCount);
-                break;
-            }
-            else
-            {
-                // copy matched chars at start of buffer
-                System.arraycopy(keyw, 0, streamCopyBuf, 0, charMatchCount);
-            }
-        } // while
-
-        out.flush(); // this writes a lonely CR or drops trailing CR LF and LF
-    }
-
     /**
      * 
      * @param startXRefOffset