You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2011/12/01 15:02:51 UTC

svn commit: r1209088 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java

Author: lehmi
Date: Thu Dec  1 14:02:50 2011
New Revision: 1209088

URL: http://svn.apache.org/viewvc?rev=1209088&view=rev
Log:
PDFBOX-1175: added the improved readUntilEndStream implementation as proposed by Timo Boehme, code reformatted, added some string constants

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1209088&r1=1209087&r2=1209088&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Thu Dec  1 14:02:50 2011
@@ -55,7 +55,7 @@ public abstract class BaseParser
     /**
      * Log instance.
      */
-    private static final Log log = LogFactory.getLog(BaseParser.class);
+    private static final Log LOG = LogFactory.getLog(BaseParser.class);
 
     private static final int E = 'e';
     private static final int N = 'n';
@@ -64,7 +64,6 @@ public abstract class BaseParser
     private static final int S = 's';
     private static final int T = 't';
     private static final int R = 'r';
-    //private static final int E = 'e';
     private static final int A = 'a';
     private static final int M = 'm';
 
@@ -72,6 +71,9 @@ public abstract class BaseParser
     private static final int B = 'b';
     private static final int J = 'j';
 
+    private final int    strmBufLen = 2048;
+    private final byte[] strmBuf    = new byte[ strmBufLen ];
+
     /**
      * This is a byte array that will be used for comparisons.
      */
@@ -85,9 +87,33 @@ public abstract class BaseParser
         new byte[] { E, N, D, O, B, J };
 
     /**
-     * This is a byte array that will be used for comparisons.
+     * This is a string constant that will be used for comparisons.
      */
     public static final String DEF = "def";
+    /**
+     * The keyword "endobj" as a string constant, used when comparing parsed tokens.
+     */
+    private static final String ENDOBJ_STRING = "endobj";
+    /**
+     * The keyword "endstream" as a string constant, used when comparing parsed tokens.
+     */
+    private static final String ENDSTREAM_STRING = "endstream";
+    /**
+     * The keyword "stream" as a string constant, used when comparing parsed tokens.
+     */
+    private static final String STREAM_STRING = "stream";
+    /**
+     * The literal "true" as a string constant, used when parsing boolean values.
+     */
+    private static final String TRUE = "true";
+    /**
+     * The literal "false" as a string constant, used when parsing boolean values.
+     */
+    private static final String FALSE = "false";
+    /**
+     * The literal "null" as a string constant, used when parsing the null object.
+     */
+    private static final String NULL = "null";
 
     /**
      * Default value of the {@link #forceParsing} flag.
@@ -110,6 +136,9 @@ public abstract class BaseParser
      */
     protected final boolean forceParsing;
 
+    /**
+     * Default constructor.
+     */
     public BaseParser()
     {
         this.forceParsing = FORCE_PARSING;
@@ -120,16 +149,16 @@ public abstract class BaseParser
      *
      * @since Apache PDFBox 1.3.0
      * @param input The input stream to read the data from.
-     * @param forceParcing flag to skip malformed or otherwise unparseable
+     * @param forceParsingValue flag to skip malformed or otherwise unparseable
      *                     input where possible
      * @throws IOException If there is an error reading the input stream.
      */
-    public BaseParser(InputStream input, boolean forceParsing)
+    public BaseParser(InputStream input, boolean forceParsingValue)
             throws IOException
     {
         this.pdfSource = new PushBackInputStream(
                 new BufferedInputStream(input, 16384),  4096);
-        this.forceParsing = forceParsing;
+        this.forceParsing = forceParsingValue;
     }
 
     /**
@@ -138,7 +167,8 @@ public abstract class BaseParser
      * @param input The input stream to read the data from.
      * @throws IOException If there is an error reading the input stream.
      */
-    public BaseParser(InputStream input) throws IOException {
+    public BaseParser(InputStream input) throws IOException 
+    {
         this(input, FORCE_PARSING);
     }
 
@@ -148,7 +178,8 @@ public abstract class BaseParser
      * @param input The array to read the data from.
      * @throws IOException If there is an error reading the byte data.
      */
-    protected BaseParser(byte[] input) throws IOException {
+    protected BaseParser(byte[] input) throws IOException 
+    {
         this(new ByteArrayInputStream(input));
     }
 
@@ -240,40 +271,53 @@ public abstract class BaseParser
                 {
                     //an invalid dictionary, we are expecting
                     //the key, read until we can recover
-                    log.warn("Invalid dictionary, found: '" + c + "' but expected: '/'");
+                    LOG.warn("Invalid dictionary, found: '" + c + "' but expected: '/'");
                     int read = pdfSource.read();
                     while(read != -1 && read != '/' && read != '>')
                     {
                         // in addition to stopping when we find / or >, we also want
                         // to stop when we find endstream or endobj.
-                        if(read==E) {
+                        if(read==E) 
+                        {
                             read = pdfSource.read();
-                            if(read==N) {
+                            if(read==N) 
+                            {
                                 read = pdfSource.read();
-                                if(read==D) {
+                                if(read==D)
+                                {
                                     read = pdfSource.read();
-                                    if(read==S) {
+                                    if(read==S) 
+                                    {
                                         read = pdfSource.read();
-                                        if(read==T) {
+                                        if(read==T) 
+                                        {
                                             read = pdfSource.read();
-                                            if(read==R) {
+                                            if(read==R) 
+                                            {
                                                 read = pdfSource.read();
-                                                if(read==E) {
+                                                if(read==E) 
+                                                {
                                                     read = pdfSource.read();
-                                                    if(read==A) {
+                                                    if(read==A) 
+                                                    {
                                                         read = pdfSource.read();
-                                                        if(read==M) {
+                                                        if(read==M) 
+                                                        {
                                                             return obj; // we're done reading this object!
                                                         }
                                                     }
                                                 }
                                             }
                                         }
-                                    } else if(read==O) {
+                                    } 
+                                    else if(read==O) 
+                                    {
                                         read = pdfSource.read();
-                                        if(read==B) {
+                                        if(read==B) 
+                                        {
                                             read = pdfSource.read();
-                                            if(read==J) {
+                                            if(read==J) 
+                                            {
                                                 return obj; // we're done reading this object!
                                             }
                                         }
@@ -314,7 +358,7 @@ public abstract class BaseParser
 
                 if( value == null )
                 {
-                    log.warn("Bad Dictionary Declaration " + pdfSource );
+                    LOG.warn("Bad Dictionary Declaration " + pdfSource );
                 }
                 else
                 {
@@ -354,7 +398,7 @@ public abstract class BaseParser
             String streamString = readString();
             //long streamLength;
 
-            if (!streamString.equals("stream"))
+            if (!streamString.equals(STREAM_STRING))
             {
                 throw new IOException("expected='stream' actual='" + streamString + "'");
             }
@@ -407,14 +451,14 @@ public abstract class BaseParser
             skipSpaces();
             endStream = readString();
 
-            if (!endStream.equals("endstream"))
+            if (!endStream.equals(ENDSTREAM_STRING))
             {
                 /*
                  * Sometimes stream objects don't have an endstream tag so readUntilEndStream(out)
                  * also can stop on endobj tags. If that's the case we need to make sure to unread
                  * the endobj so parseObject() can handle that case normally.
                  */
-                if (endStream.startsWith("endobj"))
+                if (endStream.startsWith(ENDOBJ_STRING))
                 {
                     byte[] endobjarray = endStream.getBytes("ISO-8859-1");
                     pdfSource.unread(endobjarray);
@@ -425,7 +469,7 @@ public abstract class BaseParser
                  * and not part of the endstream keyword. Ex. Some files would have "endstream8"
                  * instead of "endstream"
                  */
-                else if(endStream.startsWith("endstream"))
+                else if(endStream.startsWith(ENDSTREAM_STRING))
                 {
                     String extra = endStream.substring(9, endStream.length());
                     endStream = endStream.substring(0, 9);
@@ -440,7 +484,7 @@ public abstract class BaseParser
                      */
                     readUntilEndStream( out );
                     endStream = readString();
-                    if( !endStream.equals( "endstream" ) )
+                    if( !endStream.equals( ENDSTREAM_STRING ) )
                     {
                         throw new IOException("expected='endstream' actual='" + endStream + "' " + pdfSource);
                     }
@@ -463,83 +507,111 @@ public abstract class BaseParser
      * object. Some pdf files, however, forget to write some endstream tags
      * and just close off objects with an "endobj" tag so we have to handle
      * this case as well.
-     * @param out The stream we write out to.
+     * 
+     * This method is optimized using buffered IO and a reduced number of
+     * byte compare operations.
+     * 
+     * @param out  stream we write out to.
+     * 
      * @throws IOException
      */
-    private void readUntilEndStream( OutputStream out ) throws IOException{
-        int byteRead;
-        do{ //use a fail fast test for end of stream markers
-            byteRead = pdfSource.read();
-            if(byteRead==E){//only branch if "e"
-                byteRead = pdfSource.read();
-                if(byteRead==N){ //only continue branch if "en"
-                    byteRead = pdfSource.read();
-                    if(byteRead==D){//up to "end" now
-                        byteRead = pdfSource.read();
-                        if(byteRead==S){
-                            byteRead = pdfSource.read();
-                            if(byteRead==T){
-                                byteRead = pdfSource.read();
-                                if(byteRead==R){
-                                    byteRead = pdfSource.read();
-                                    if(byteRead==E){
-                                        byteRead = pdfSource.read();
-                                        if(byteRead==A){
-                                            byteRead = pdfSource.read();
-                                            if(byteRead==M){
-                                                //found the whole marker
-                                                pdfSource.unread( ENDSTREAM );
-                                                return;
-                                            }else{
-                                                out.write(ENDSTREAM, 0, 8);
-                                            }
-                                        }else{
-                                            out.write(ENDSTREAM, 0, 7);
-                                        }
-                                    }else{
-                                        out.write(ENDSTREAM, 0, 6);
-                                    }
-                                }else{
-                                    out.write(ENDSTREAM, 0, 5);
-                                }
-                            }else{
-                                out.write(ENDSTREAM, 0, 4);
-                            }
-                        }else if(byteRead==O){
-                            byteRead = pdfSource.read();
-                            if(byteRead==B){
-                                byteRead = pdfSource.read();
-                                if(byteRead==J){
-                                    //found whole marker
-                                    pdfSource.unread( ENDOBJ );
-                                    return;
-                                }else{
-                                    out.write(ENDOBJ, 0, 5);
-                                }
-                            }else{
-                                out.write(ENDOBJ, 0, 4);
-                            }
-                        }else{
-                            out.write(E);
-                            out.write(N);
-                            out.write(D);
-                        }
-                    }else{
-                        out.write(E);
-                        out.write(N);
+    private void readUntilEndStream( final OutputStream out ) throws IOException
+    {
+
+        int bufSize;
+        int charMatchCount = 0;
+        byte[] keyw = ENDSTREAM;
+        
+        final int quickTestOffset = 5;  // last character position of shortest keyword ('endobj')
+        
+        // read next chunk into buffer; already matched chars are added to beginning of buffer
+        while ( ( bufSize = pdfSource.read( strmBuf, charMatchCount, strmBufLen - charMatchCount ) ) > 0 ) 
+        {
+            bufSize += charMatchCount;
+            
+            int bIdx = charMatchCount;
+            int quickTestIdx;
+        
+            // iterate over buffer, trying to find keyword match
+            for ( int maxQuicktestIdx = bufSize - quickTestOffset; bIdx < bufSize; bIdx++ ) 
+            {
+                // reduce compare operations by first testing the last character we would have
+                // to match if the current one matched; if it is not a character from the keywords
+                // we can jump behind the tested character;
+                // this shortcut is inspired by the Boyer–Moore string search algorithm
+                // and can reduce parsing time by approx. 20%
+                if ( ( charMatchCount == 0 ) &&
+                         ( ( quickTestIdx = bIdx + quickTestOffset ) < maxQuicktestIdx ) ) 
+                {
+                    
+                    final byte ch = strmBuf[quickTestIdx];
+                    if ( ( ch > 't' ) || ( ch < 'a' ) ) 
+                    {
+                        // last character we would have to match if current character would match
+                        // is not a character from keywords -> jump behind and start over
+                        bIdx = quickTestIdx;
+                        continue;
                     }
-                }else{
-                    out.write(E);
                 }
+                
+                final byte ch = strmBuf[bIdx];  // could be negative - but we only compare to ASCII
+            
+                if ( ch == keyw[ charMatchCount ] ) 
+                {
+                    if ( ++charMatchCount == keyw.length ) 
+                    {
+                        // match found
+                        bIdx++;
+                        break;
+                    }
+                } 
+                else 
+                {
+                    if ( ( charMatchCount == 3 ) && ( ch == ENDOBJ[ charMatchCount ] ) ) 
+                    {
+                        // maybe ENDSTREAM is missing but we could have ENDOBJ
+                        keyw = ENDOBJ;
+                        charMatchCount++;
+                        
+                    } 
+                    else 
+                    {
+                        // no match; re-scanning from match start + 1 is unnecessary since we know the matched chars;
+                        // depending on the char just read we may already have the beginning of a new match:
+                        // 'e': first char matched;
+                        // 'n': if we are at match position idx 7 the previous char was 'e', thus 2 chars are matched;
+                        // for every other char we have to start matching the first keyword char at the next
+                        // read position
+                        charMatchCount = ( ch == E ) ? 1 : ( ( ch == N ) && ( charMatchCount == 7 ) ) ? 2 : 0;
+                        // search again for 'endstream'
+                        keyw = ENDSTREAM;
+                    }
+                } 
+            }  // for
+            
+            int contentBytes = Math.max( 0, bIdx - charMatchCount );
+            
+            // write buffer content until first matched char to output stream
+            if ( contentBytes > 0 )
+            {
+                out.write( strmBuf, 0, contentBytes );
             }
-            if(byteRead!=-1)
+            if ( charMatchCount == keyw.length ) 
+            {
+                // keyword matched; unread matched keyword (endstream/endobj) and following buffered content
+                pdfSource.unread( strmBuf, contentBytes, bufSize - contentBytes );
+                break;
+                
+            } 
+            else 
             {
-                out.write(byteRead);
+                // copy already matched chars to the start of the buffer for the next read
+                System.arraycopy( keyw, 0, strmBuf, 0, charMatchCount );
             }
-
-        }while(byteRead!=-1);
+            
+        }  // while
     }
-
+    
     /**
      * This is really a bug in the Document creators code, but it caused a crash
      * in PDFBox, the first bug was in this format:
@@ -841,13 +913,13 @@ public abstract class BaseParser
             else
             {
                 //it could be a bad object in the array which is just skipped
-                log.warn("Corrupt object reference" );
+                LOG.warn("Corrupt object reference" );
 
                 // This could also be an "endobj" or "endstream" which means we can assume that
                 // the array has ended.
                 String isThisTheEnd = readString();
                 pdfSource.unread(isThisTheEnd.getBytes("ISO-8859-1"));
-                if("endobj".equals(isThisTheEnd) || "endstream".equals(isThisTheEnd))
+                if(ENDOBJ_STRING.equals(isThisTheEnd) || ENDSTREAM_STRING.equals(isThisTheEnd))
                 {
                     return po;
                 }
@@ -958,7 +1030,7 @@ public abstract class BaseParser
         if( c == 't' )
         {
             String trueString = new String( pdfSource.readFully( 4 ), "ISO-8859-1" );
-            if( !trueString.equals( "true" ) )
+            if( !trueString.equals( TRUE ) )
             {
                 throw new IOException( "Error parsing boolean: expected='true' actual='" + trueString + "'" );
             }
@@ -970,7 +1042,7 @@ public abstract class BaseParser
         else if( c == 'f' )
         {
             String falseString = new String( pdfSource.readFully( 5 ), "ISO-8859-1" );
-            if( !falseString.equals( "false" ) )
+            if( !falseString.equals( FALSE ) )
             {
                 throw new IOException( "Error parsing boolean: expected='true' actual='" + falseString + "'" );
             }
@@ -1033,7 +1105,7 @@ public abstract class BaseParser
         case 'n':   // null
         {
             String nullString = readString();
-            if( !nullString.equals( "null") )
+            if( !nullString.equals( NULL) )
             {
                 throw new IOException("Expected='null' actual='" + nullString + "'");
             }
@@ -1043,7 +1115,7 @@ public abstract class BaseParser
         case 't':
         {
             String trueString = new String( pdfSource.readFully(4), "ISO-8859-1" );
-            if( trueString.equals( "true" ) )
+            if( trueString.equals( TRUE ) )
             {
                 retval = COSBoolean.TRUE;
             }
@@ -1056,7 +1128,7 @@ public abstract class BaseParser
         case 'f':
         {
             String falseString = new String( pdfSource.readFully(5), "ISO-8859-1" );
-            if( falseString.equals( "false" ) )
+            if( falseString.equals( FALSE ) )
             {
                 retval = COSBoolean.FALSE;
             }
@@ -1113,7 +1185,7 @@ public abstract class BaseParser
                 }
 
                 // if it's an endstream/endobj, we want to put it back so the caller will see it
-                if("endobj".equals(badString) || "endstream".equals(badString))
+                if(ENDOBJ_STRING.equals(badString) || ENDSTREAM_STRING.equals(badString))
                 {
                     pdfSource.unread(badString.getBytes("ISO-8859-1"));
                 }