You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2017/06/18 13:01:32 UTC

svn commit: r1799082 - /pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java

Author: lehmi
Date: Sun Jun 18 13:01:31 2017
New Revision: 1799082

URL: http://svn.apache.org/viewvc?rev=1799082&view=rev
Log:
PDFBOX-3536: improve the the end of COSString detection for unbalanced parenthesis, some small optimizations

Modified:
    pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java

Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1799082&r1=1799081&r2=1799082&view=diff
==============================================================================
--- pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sun Jun 18 13:01:31 2017
@@ -21,7 +21,6 @@ import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.CharsetDecoder;
-import java.util.Arrays;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.pdfbox.cos.COSArray;
@@ -341,60 +340,43 @@ public abstract class BaseParser
     }
 
     /**
-     * This is really a bug in the Document creators code, but it caused a crash
-     * in PDFBox, the first bug was in this format:
-     * /Title ( (5)
-     * /Creator which was patched in 1 place.
-     * However it missed the case where the Close Paren was escaped
-     *
-     * The second bug was in this format
-     * /Title (c:\)
-     * /Producer
+     * This is really a bug in the Document creators code, but it caused a crash in PDFBox, the first bug was in this
+     * format: /Title ( (5) /Creator which was patched in 1 place.
      *
-     * This patch  moves this code out of the parseCOSString method, so it can be used twice.
+     * However it missed the case where the number of opening and closing parenthesis isn't balanced
      *
+     * The second bug was in this format /Title (c:\) /Producer
+     *
+     * This patch moves this code out of the parseCOSString method, so it can be used twice.
      *
      * @param bracesParameter the number of braces currently open.
      *
      * @return the corrected value of the brace counter
      * @throws IOException
      */
-    private int checkForMissingCloseParen(final int bracesParameter) throws IOException
+    private int checkForEndOfString(final int bracesParameter) throws IOException
     {
         int braces = bracesParameter;
         byte[] nextThreeBytes = new byte[3];
         int amountRead = seqSource.read(nextThreeBytes);
 
-        //lets handle the special case seen in Bull  River Rules and Regulations.pdf
-        //The dictionary looks like this
-        //    2 0 obj
-        //    <<
-        //        /Type /Info
-        //        /Creator (PaperPort http://www.scansoft.com)
-        //        /Producer (sspdflib 1.0 http://www.scansoft.com)
-        //        /Title ( (5)
-        //        /Author ()
-        //        /Subject ()
-        //
-        // Notice the /Title, the braces are not even but they should
-        // be.  So lets assume that if we encounter an this scenario
-        //   <end_brace><new_line><opening_slash> then that
-        // means that there is an error in the pdf and assume that
-        // was the end of the document.
-        //
-        if (amountRead == 3 &&
-               (( nextThreeBytes[0] == ASCII_CR  // Look for a carriage return
-               && nextThreeBytes[1] == ASCII_LF  // Look for a new line
-               && nextThreeBytes[2] == 0x2f ) // Look for a slash /
-                                              // Add a second case without a new line
-               || (nextThreeBytes[0] == ASCII_CR  // Look for a carriage return
-                && nextThreeBytes[1] == 0x2f )))  // Look for a slash /
+        // Check the next 3 bytes if available
+        // The following cases are valid indicators for the end of the string
+        // 1. Next line contains another COSObject: CR + LF + '/'
+        // 2. COSDictionary ends in the next line: CR + LF + '>'
+        // 3. Next line contains another COSObject: CR + '/'
+        // 4. COSDictionary ends in the next line: CR + '>'
+        if (amountRead == 3 && nextThreeBytes[0] == ASCII_CR)
+        {
+            if ( (nextThreeBytes[1] == ASCII_LF && (nextThreeBytes[2] == '/') || nextThreeBytes[2] == '>')
+                    || nextThreeBytes[1] == '/' || nextThreeBytes[1] == '>')
             {
                 braces = 0;
             }
+        }
         if (amountRead > 0)
         {
-            seqSource.unread(Arrays.copyOfRange(nextThreeBytes, 0, amountRead));
+            seqSource.unread(nextThreeBytes, 0, amountRead);
         }
         return braces;
     }
@@ -409,18 +391,11 @@ public abstract class BaseParser
     protected COSString parseCOSString() throws IOException
     {
         char nextChar = (char) seqSource.read();
-        char openBrace;
-        char closeBrace;
-        if( nextChar == '(' )
-        {
-            openBrace = '(';
-            closeBrace = ')';
-        }
-        else if( nextChar == '<' )
+        if (nextChar == '<')
         {
             return parseCOSHexString();
         }
-        else
+        else if (nextChar != '(')
         {
             throw new IOException( "parseCOSString string should start with '(' or '<' and not '" +
                     nextChar + "' " + seqSource);
@@ -428,8 +403,7 @@ public abstract class BaseParser
         
         ByteArrayOutputStream out = new ByteArrayOutputStream();
 
-        //This is the number of braces read
-        //
+        // This is the number of braces read
         int braces = 1;
         int c = seqSource.read();
         while( braces > 0 && c != -1)
@@ -437,17 +411,17 @@ public abstract class BaseParser
             char ch = (char)c;
             int nextc = -2; // not yet read
 
-            if(ch == closeBrace)
+            if (ch == ')')
             {
 
                 braces--;
-                braces = checkForMissingCloseParen(braces);
+                braces = checkForEndOfString(braces);
                 if( braces != 0 )
                 {
                     out.write(ch);
                 }
             }
-            else if( ch == openBrace )
+            else if (ch == '(')
             {
                 braces++;
                 out.write(ch);
@@ -475,7 +449,7 @@ public abstract class BaseParser
                         break;
                     case ')':
                         // PDFBox 276 /Title (c:\)
-                        braces = checkForMissingCloseParen(braces);
+                    braces = checkForEndOfString(braces);
                         if( braces != 0 )
                         {
                             out.write(next);