You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2019/06/11 17:57:42 UTC

svn commit: r1861060 - in /pdfbox/branches/issue4569: pdfbox/src/main/java/org/apache/pdfbox/cos/ pdfbox/src/main/java/org/apache/pdfbox/pdfparser/ pdfbox/src/test/java/org/apache/pdfbox/cos/ preflight/src/main/java/org/apache/pdfbox/preflight/parser/

Author: lehmi
Date: Tue Jun 11 17:57:42 2019
New Revision: 1861060

URL: http://svn.apache.org/viewvc?rev=1861060&view=rev
Log:
PDFBOX-4569: alpha version of an on-demand parser

Modified:
    pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDictionary.java
    pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java
    pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObject.java
    pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
    pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
    pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
    pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
    pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
    pdfbox/branches/issue4569/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSUpdateInfo.java
    pdfbox/branches/issue4569/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java

Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDictionary.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDictionary.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDictionary.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDictionary.java Tue Jun 11 17:57:42 2019
@@ -1576,7 +1576,10 @@ public class COSDictionary extends COSBa
         if (base instanceof COSObject)
         {
             COSObject obj = (COSObject) base;
-            return "COSObject{" + getDictionaryString(obj.getObject(), objs) + "}";
+            return "COSObject{"
+                    + getDictionaryString(
+                            obj.isObjectNull() ? COSNull.NULL : obj.getObject(), objs)
+                    + "}";
         }
         return base.toString();
     }

Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java Tue Jun 11 17:57:42 2019
@@ -27,7 +27,7 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.pdfbox.io.IOUtils;
 import org.apache.pdfbox.io.ScratchFile;
-import org.apache.pdfbox.pdfparser.PDFObjectStreamParser;
+import org.apache.pdfbox.pdfparser.COSParser;
 import org.apache.pdfbox.pdmodel.PDDocument;
 
 /**
@@ -90,6 +90,8 @@ public class COSDocument extends COSBase
      */
     private long highestXRefObjectNumber;
 
+    public COSParser parser;
+
     /**
      * Constructor. Uses main memory to buffer PDF streams.
      */
@@ -447,10 +449,14 @@ public class COSDocument extends COSBase
             // close all open I/O streams
             for (COSObject object : getObjects())
             {
-                COSBase cosObject = object.getObject();
-                if (cosObject instanceof COSStream)
+                if (!object.isObjectNull())
                 {
-                    firstException = IOUtils.closeAndLogException((COSStream) cosObject, LOG, "COSStream", firstException);
+                    COSBase cosObject = object.getObject();
+                    if (cosObject instanceof COSStream)
+                    {
+                        firstException = IOUtils.closeAndLogException((COSStream) cosObject, LOG,
+                                "COSStream", firstException);
+                    }
                 }
             }
 
@@ -513,34 +519,6 @@ public class COSDocument extends COSBase
     }
 
     /**
-     * This method will search the list of objects for types of ObjStm.  If it finds
-     * them then it will parse out all of the objects from the stream that is contains.
-     *
-     * @throws IOException If there is an error parsing the stream.
-     */
-    public void dereferenceObjectStreams() throws IOException
-    {
-        for( COSObject objStream : getObjectsByType( COSName.OBJ_STM ) )
-        {
-            COSStream stream = (COSStream)objStream.getObject();
-            PDFObjectStreamParser parser = new PDFObjectStreamParser(stream, this);
-            parser.parse();
-            for (COSObject next : parser.getObjects())
-            {
-                COSObjectKey key = new COSObjectKey(next);
-                if (objectPool.get(key) == null || objectPool.get(key).getObject() == null
-                        // xrefTable stores negated objNr of objStream for objects in objStreams
-                        || (xrefTable.containsKey(key)
-                            && xrefTable.get(key) == -objStream.getObjectNumber()))
-                {
-                    COSObject obj = getObjectFromPool(key);
-                    obj.setObject(next.getObject());
-                }
-            }
-        }
-    }
-
-    /**
      * This will get an object from the pool.
      *
      * @param key The object key.
@@ -557,7 +535,7 @@ public class COSDocument extends COSBase
         if (obj == null)
         {
             // this was a forward reference, make "proxy" object
-            obj = new COSObject(null);
+            obj = new COSObject(null, parser);
             if( key != null )
             {
                 obj.setObjectNumber(key.getNumber());

Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObject.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObject.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObject.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObject.java Tue Jun 11 17:57:42 2019
@@ -18,6 +18,9 @@ package org.apache.pdfbox.cos;
 
 import java.io.IOException;
 
+import org.apache.pdfbox.pdfparser.BaseParser;
+import org.apache.pdfbox.pdfparser.COSParser;
+
 /**
  * This class represents a PDF object.
  *
@@ -30,6 +33,7 @@ public class COSObject extends COSBase i
     private long objectNumber;
     private int generationNumber;
     private boolean needToBeUpdated;
+    private BaseParser parser;
 
     /**
      * Constructor.
@@ -37,9 +41,11 @@ public class COSObject extends COSBase i
      * @param object The object that this encapsulates.
      *
      */
-    public COSObject(COSBase object)
+    public COSObject(COSBase object, BaseParser parser)
+    // public COSObject(COSBase object)
     {
         setObject( object );
+        this.parser = parser;
     }
 
     /**
@@ -77,6 +83,10 @@ public class COSObject extends COSBase i
         return retval;
     }
 
+    public boolean isObjectNull()
+    {
+        return baseObject == null;
+    }
     /**
      * This will get the object that this object encapsulates.
      *
@@ -84,6 +94,18 @@ public class COSObject extends COSBase i
      */
     public COSBase getObject()
     {
+        if (baseObject == null || baseObject instanceof COSNull)
+        {
+            if (parser instanceof COSParser)
+            {
+                boolean returnValue = ((COSParser) parser).dereferenceCOSObject(this);
+                if (!returnValue)
+                {
+                    // remove parser to avoid endless recursions
+                    parser = null;
+                }
+            }
+        }
         return baseObject;
     }
 
@@ -97,6 +119,12 @@ public class COSObject extends COSBase i
         baseObject = object;
     }
 
+    public final void setToNull()
+    {
+        baseObject = COSNull.NULL;
+        parser = null;
+    }
+
     /**
      * {@inheritDoc}
      */

Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Tue Jun 11 17:57:42 2019
@@ -924,7 +924,7 @@ public abstract class BaseParser
         }
         case 'R':
             seqSource.read();
-            retval = new COSObject(null);
+            retval = new COSObject(null, this);
             break;
         case (char)-1:
             return null;
@@ -1158,6 +1158,17 @@ public abstract class BaseParser
     }
 
     /**
+     * This will tell if the end of the data is reached.
+     * 
+     * @return true if the end of the data is reached.
+     * @throws IOException If there is an error reading from the stream.
+     */
+    protected boolean isEOF() throws IOException
+    {
+        return seqSource.isEOF();
+    }
+
+    /**
      * This will tell if the next byte to be read is an end of line byte.
      *
      * @param c The character to check against end of line

Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Tue Jun 11 17:57:42 2019
@@ -772,6 +772,42 @@ public class COSParser extends BaseParse
         }
     }
 
+    public boolean dereferenceCOSObject(COSObject obj)
+    {
+        COSBase parsedObj = null;
+        long currentPos = 0;
+        try
+        {
+            currentPos = source.getPosition();
+            parsedObj = parseObjectDynamically(obj, false);
+        }
+        catch (IOException e)
+        {
+            // TODO Auto-generated catch block
+            e.printStackTrace();
+            // parsedObj = COSBroken.BROKEN;
+        }
+        finally
+        {
+            if (currentPos > 0)
+                try
+                {
+                    source.seek(currentPos);
+                }
+                catch (IOException e)
+                {
+                    // TODO Auto-generated catch block
+                    e.printStackTrace();
+                }
+        }
+        if (parsedObj != null)
+        {
+            obj.setObject(parsedObj);
+            return true;
+        }
+        return false;
+    }
+
     // add objects not to be parsed to list of already parsed objects
     private void addExcludedToList(COSName[] excludeObjects, COSDictionary dict, final Set<Long> parsedObjects)
     {
@@ -826,7 +862,7 @@ public class COSParser extends BaseParse
         final COSObjectKey objKey = new COSObjectKey(objNr, objGenNr);
         final COSObject pdfObject = document.getObjectFromPool(objKey);
 
-        if (pdfObject.getObject() == null)
+        if (pdfObject.isObjectNull())
         {
             // not previously parsed
             // ---- read offset or object stream object number from xref table
@@ -855,7 +891,8 @@ public class COSParser extends BaseParse
             if (offsetOrObjstmObNr == null)
             {
                 // not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
-                pdfObject.setObject(COSNull.NULL);
+                // remove parser to avoid endless recursion
+                pdfObject.setToNull();
             }
             else if (offsetOrObjstmObNr > 0)
             {
@@ -935,7 +972,14 @@ public class COSParser extends BaseParse
             securityHandler.decrypt(pb, objKey.getNumber(), objKey.getGeneration());
         }
 
-        pdfObject.setObject(pb);
+        if (pb != null)
+        {
+            pdfObject.setObject(pb);
+        }
+        else
+        {
+            pdfObject.setToNull();
+        }
 
         if (!endObjectKey.startsWith(ENDOBJ_STRING))
         {
@@ -2768,7 +2812,7 @@ public class COSParser extends BaseParse
             xrefTrailerResolver.nextXrefObj( objByteOffset, XRefType.STREAM );
             xrefTrailerResolver.setTrailer( stream );
         }        
-        PDFXrefStreamParser parser = new PDFXrefStreamParser( stream, document, xrefTrailerResolver );
+        PDFXrefStreamParser parser = new PDFXrefStreamParser(stream, document, xrefTrailerResolver);
         parser.parse();
     }
 

Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java Tue Jun 11 17:57:42 2019
@@ -87,7 +87,7 @@ public class PDFObjectStreamParser exten
             int objectCounter = 0;
             while( (cosObject = parseDirObject()) != null )
             {
-                object = new COSObject(cosObject);
+                object = new COSObject(cosObject, null);
                 object.setGenerationNumber(0);
                 if (objectCounter >= objectNumbers.size())
                 {
@@ -103,7 +103,7 @@ public class PDFObjectStreamParser exten
                 // According to the spec objects within an object stream shall not be enclosed 
                 // by obj/endobj tags, but there are some pdfs in the wild using those tags 
                 // skip endobject marker if present
-                if (!seqSource.isEOF() && seqSource.peek() == 'e')
+                if (!isEOF() && seqSource.peek() == 'e')
                 {
                     readLine();
                 }
@@ -125,4 +125,22 @@ public class PDFObjectStreamParser exten
     {
         return streamObjects;
     }
+
+    public boolean dereferenceCOSObject(COSObject obj)
+    {
+        if (streamObjects != null) 
+        {
+            long objectNumber = obj.getObjectNumber();
+            for (COSObject cosObject : streamObjects)
+            {
+                if (cosObject.getObjectNumber() == objectNumber)
+                {
+                    obj.setObject(cosObject);
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
 }

Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java Tue Jun 11 17:57:42 2019
@@ -21,10 +21,10 @@ import java.io.InputStream;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSDocument;
 import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
 import org.apache.pdfbox.io.IOUtils;
 import org.apache.pdfbox.io.RandomAccessRead;
 import org.apache.pdfbox.io.ScratchFile;
@@ -141,6 +141,7 @@ public class PDFParser extends COSParser
             }
         }
         document = new COSDocument(scratchFile);
+        document.parser = this;
     }
     
     /**
@@ -170,28 +171,21 @@ public class PDFParser extends COSParser
     {
         COSDictionary trailer = retrieveTrailer();
     
-        COSBase base = parseTrailerValuesDynamically(trailer);
-        if (!(base instanceof COSDictionary))
+        COSObject rootObj = trailer.getCOSObject(COSName.ROOT);
+        if (rootObj == null)
         {
-            throw new IOException("Expected root dictionary, but got this: " + base);
+            throw new IOException("Missing root object specification in trailer.");
         }
-        COSDictionary root = (COSDictionary) base;
+        COSDictionary root = (COSDictionary) rootObj.getObject();
         // in some pdfs the type value "Catalog" is missing in the root object
         if (isLenient() && !root.containsKey(COSName.TYPE))
         {
             root.setItem(COSName.TYPE, COSName.CATALOG);
         }
-        // parse all objects, starting at the root dictionary
-        parseDictObjects(root, (COSName[]) null);
-        // parse all objects of the info dictionary
-        COSBase infoBase = trailer.getDictionaryObject(COSName.INFO);
-        if (infoBase instanceof COSDictionary)
-        {
-            parseDictObjects((COSDictionary) infoBase, (COSName[]) null);
-        }
         // check pages dictionaries
         checkPages(root);
         document.setDecrypted();
+        document.parser = this;
         initialParseDone = true;
     }
 

Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java Tue Jun 11 17:57:42 2019
@@ -191,7 +191,7 @@ public class PDFStreamParser extends Bas
                 String line = readString();
                 if( line.equals( "R" ) )
                 {
-                    retval = new COSObject( null );
+                retval = new COSObject(null, this);
                 }
                 else
                 {
@@ -295,8 +295,8 @@ public class PDFStreamParser extends Bas
                 while( !(lastByte == 'E' &&
                          currentByte == 'I' &&
                          hasNextSpaceOrReturn() &&
-                         hasNoFollowingBinData(seqSource)) &&
-                       !seqSource.isEOF() )
+                    hasNoFollowingBinData()) &&
+                    !isEOF())
                 {
                     imageData.write( lastByte );
                     lastByte = currentByte;
@@ -344,10 +344,10 @@ public class PDFStreamParser extends Bas
      * @return <code>true</code> if next bytes are probably printable ASCII
      * characters starting with a PDF operator, otherwise <code>false</code>
      */
-    private boolean hasNoFollowingBinData(SequentialSource pdfSource) throws IOException
+    private boolean hasNoFollowingBinData() throws IOException
     {
         // as suggested in PDFBOX-1164
-        final int readBytes = pdfSource.read(binCharTestArr, 0, MAX_BIN_CHAR_TEST_LENGTH);
+        final int readBytes = seqSource.read(binCharTestArr, 0, MAX_BIN_CHAR_TEST_LENGTH);
         boolean noBinData = true;
         int startOpIdx = -1;
         int endOpIdx = -1;
@@ -399,12 +399,12 @@ public class PDFStreamParser extends Bas
                     noBinData = false;
                 }
             }
-            pdfSource.unread(binCharTestArr, 0, readBytes);
+            seqSource.unread(binCharTestArr, 0, readBytes);
         }
         if (!noBinData)
         {
             LOG.warn("ignoring 'EI' assumed to be in the middle of inline image at stream offset " + 
-                    pdfSource.getPosition());
+                    seqSource.getPosition());
         }
         return noBinData;
     }

Modified: pdfbox/branches/issue4569/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSUpdateInfo.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSUpdateInfo.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSUpdateInfo.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSUpdateInfo.java Tue Jun 11 17:57:42 2019
@@ -43,7 +43,7 @@ public class TestCOSUpdateInfo
 
         // COSObject
         COSUpdateInfo testCOSObject;
-        testCOSObject = new COSObject(null);
+        testCOSObject = new COSObject(null, null);
         testCOSObject.setNeedToBeUpdated(true);
         assertTrue(testCOSObject.isNeedToBeUpdated());
         testCOSObject.setNeedToBeUpdated(false);

Modified: pdfbox/branches/issue4569/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java (original)
+++ pdfbox/branches/issue4569/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java Tue Jun 11 17:57:42 2019
@@ -718,7 +718,7 @@ public class PreflightParser extends PDF
         final COSObjectKey objKey = new COSObjectKey(objNr, objGenNr);
         final COSObject pdfObject = document.getObjectFromPool(objKey);
 
-        if (pdfObject.getObject() == null)
+        if (pdfObject.isObjectNull())
         {
             // not previously parsed
             // ---- read offset or object stream object number from xref table
@@ -737,7 +737,9 @@ public class PreflightParser extends PDF
             if (offsetOrObjstmObNr == null)
             {
                 // not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
-                pdfObject.setObject(COSNull.NULL);
+                // remove parser to avoid endless recursion
+                pdfObject.setToNull();
+
             }
             else if (offsetOrObjstmObNr == 0)
             {