You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2015/04/05 18:11:19 UTC
svn commit: r1671405 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser: BaseParser.java COSParser.java

Author: tilman
Date: Sun Apr  5 16:11:18 2015
New Revision: 1671405

URL: http://svn.apache.org/r1671405
Log:
PDFBOX-2576: refactor duplicated code in parseCOSStream

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1671405&r1=1671404&r2=1671405&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sun Apr  5 16:11:18 2015
@@ -375,40 +375,7 @@ public abstract class BaseParser impleme
         {
             readExpectedString(STREAM_STRING);
 
-            //PDF Ref 3.2.7 A stream must be followed by either
-            //a CRLF or LF but nothing else.
-
-            int whitespace = pdfSource.read();
-
-            //see brother_scan_cover.pdf, it adds whitespaces
-            //after the stream but before the start of the
-            //data, so just read those first
-            while (ASCII_SPACE == whitespace)
-            {
-                whitespace = pdfSource.read();
-            }
-
-            if( ASCII_CR == whitespace )
-            {
-                whitespace = pdfSource.read();
-                if( ASCII_LF != whitespace )
-                {
-                    pdfSource.unread( whitespace );
-                    //The spec says this is invalid but it happens in the real
-                    //world so we must support it.
-                }
-            }
-            else if (ASCII_LF == whitespace)
-            {
-                //that is fine
-            }
-            else
-            {
-                //we are in an error.
-                //but again we will do a lenient parsing and just assume that everything
-                //is fine
-                pdfSource.unread( whitespace );
-            }
+            skipWhiteSpaces();
 
             // This needs to be dic.getItem because when we are parsing, the underlying object
             // might still be null.
@@ -557,6 +524,40 @@ public abstract class BaseParser impleme
         return stream;
     }
 
+    protected void skipWhiteSpaces() throws IOException
+    {
+        //PDF Ref 3.2.7 A stream must be followed by either
+        //a CRLF or LF but nothing else.
+
+        int whitespace = pdfSource.read();
+
+        //see brother_scan_cover.pdf, it adds whitespaces
+        //after the stream but before the start of the
+        //data, so just read those first
+        while (ASCII_SPACE == whitespace)
+        {
+            whitespace = pdfSource.read();
+        }
+
+        if (ASCII_CR == whitespace)
+        {
+            whitespace = pdfSource.read();
+            if (ASCII_LF != whitespace)
+            {
+                pdfSource.unread(whitespace);
+                //The spec says this is invalid but it happens in the real
+                //world so we must support it.
+            }
+        }
+        else if (ASCII_LF != whitespace)
+        {
+            //we are in an error.
+            //but again we will do a lenient parsing and just assume that everything
+            //is fine
+            pdfSource.unread(whitespace);
+        }
+    }
+
     /**
      * This method will read through the current stream object until
      * we find the keyword "endstream" meaning we're at the end of this

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1671405&r1=1671404&r2=1671405&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Sun Apr  5 16:11:18 2015
@@ -891,36 +891,8 @@ public class COSParser extends BaseParse
         {
             // read 'stream'; this was already tested in parseObjectsDynamically()
             readString(); 
-            // skip whitespaces before start of data
-            // PDF Ref 1.7, chap. 3.2.7:
-            // 'stream' should be followed by either a CRLF (0x0d 0x0a) or LF
-            // but nothing else.
-            int whitespace = pdfSource.read();
             
-            // see brother_scan_cover.pdf, it adds whitespaces
-            // after the stream but before the start of the
-            // data, so just read those first
-            while (whitespace == 0x20)
-            {
-                whitespace = pdfSource.read();
-            }
-
-            if (whitespace == 0x0D)
-            {
-                whitespace = pdfSource.read();
-                if (whitespace != 0x0A)
-                {
-                    // the spec says this is invalid but it happens in the
-                    // real world so we must support it
-                    pdfSource.unread(whitespace);
-                }
-            }
-            else if (whitespace != 0x0A)
-            {
-                // no whitespace after 'stream'; PDF ref. says 'should' so
-                // that is ok
-                pdfSource.unread(whitespace);
-            }
+            skipWhiteSpaces();
 
             /*
              * This needs to be dic.getItem because when we are parsing, the underlying object might still be null.