You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2019/06/11 17:57:42 UTC
svn commit: r1861060 - in /pdfbox/branches/issue4569:
pdfbox/src/main/java/org/apache/pdfbox/cos/
pdfbox/src/main/java/org/apache/pdfbox/pdfparser/
pdfbox/src/test/java/org/apache/pdfbox/cos/
preflight/src/main/java/org/apache/pdfbox/preflight/parser/
Author: lehmi
Date: Tue Jun 11 17:57:42 2019
New Revision: 1861060
URL: http://svn.apache.org/viewvc?rev=1861060&view=rev
Log:
PDFBOX-4569: alpha version of an on-demand parser
Modified:
pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDictionary.java
pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java
pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObject.java
pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
pdfbox/branches/issue4569/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSUpdateInfo.java
pdfbox/branches/issue4569/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java
Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDictionary.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDictionary.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDictionary.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDictionary.java Tue Jun 11 17:57:42 2019
@@ -1576,7 +1576,10 @@ public class COSDictionary extends COSBa
if (base instanceof COSObject)
{
COSObject obj = (COSObject) base;
- return "COSObject{" + getDictionaryString(obj.getObject(), objs) + "}";
+ return "COSObject{"
+ + getDictionaryString(
+ obj.isObjectNull() ? COSNull.NULL : obj.getObject(), objs)
+ + "}";
}
return base.toString();
}
Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java Tue Jun 11 17:57:42 2019
@@ -27,7 +27,7 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.io.ScratchFile;
-import org.apache.pdfbox.pdfparser.PDFObjectStreamParser;
+import org.apache.pdfbox.pdfparser.COSParser;
import org.apache.pdfbox.pdmodel.PDDocument;
/**
@@ -90,6 +90,8 @@ public class COSDocument extends COSBase
*/
private long highestXRefObjectNumber;
+ public COSParser parser;
+
/**
* Constructor. Uses main memory to buffer PDF streams.
*/
@@ -447,10 +449,14 @@ public class COSDocument extends COSBase
// close all open I/O streams
for (COSObject object : getObjects())
{
- COSBase cosObject = object.getObject();
- if (cosObject instanceof COSStream)
+ if (!object.isObjectNull())
{
- firstException = IOUtils.closeAndLogException((COSStream) cosObject, LOG, "COSStream", firstException);
+ COSBase cosObject = object.getObject();
+ if (cosObject instanceof COSStream)
+ {
+ firstException = IOUtils.closeAndLogException((COSStream) cosObject, LOG,
+ "COSStream", firstException);
+ }
}
}
@@ -513,34 +519,6 @@ public class COSDocument extends COSBase
}
/**
- * This method will search the list of objects for types of ObjStm. If it finds
- * them then it will parse out all of the objects from the stream that is contains.
- *
- * @throws IOException If there is an error parsing the stream.
- */
- public void dereferenceObjectStreams() throws IOException
- {
- for( COSObject objStream : getObjectsByType( COSName.OBJ_STM ) )
- {
- COSStream stream = (COSStream)objStream.getObject();
- PDFObjectStreamParser parser = new PDFObjectStreamParser(stream, this);
- parser.parse();
- for (COSObject next : parser.getObjects())
- {
- COSObjectKey key = new COSObjectKey(next);
- if (objectPool.get(key) == null || objectPool.get(key).getObject() == null
- // xrefTable stores negated objNr of objStream for objects in objStreams
- || (xrefTable.containsKey(key)
- && xrefTable.get(key) == -objStream.getObjectNumber()))
- {
- COSObject obj = getObjectFromPool(key);
- obj.setObject(next.getObject());
- }
- }
- }
- }
-
- /**
* This will get an object from the pool.
*
* @param key The object key.
@@ -557,7 +535,7 @@ public class COSDocument extends COSBase
if (obj == null)
{
// this was a forward reference, make "proxy" object
- obj = new COSObject(null);
+ obj = new COSObject(null, parser);
if( key != null )
{
obj.setObjectNumber(key.getNumber());
Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObject.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObject.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObject.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObject.java Tue Jun 11 17:57:42 2019
@@ -18,6 +18,9 @@ package org.apache.pdfbox.cos;
import java.io.IOException;
+import org.apache.pdfbox.pdfparser.BaseParser;
+import org.apache.pdfbox.pdfparser.COSParser;
+
/**
* This class represents a PDF object.
*
@@ -30,6 +33,7 @@ public class COSObject extends COSBase i
private long objectNumber;
private int generationNumber;
private boolean needToBeUpdated;
+ private BaseParser parser;
/**
* Constructor.
@@ -37,9 +41,11 @@ public class COSObject extends COSBase i
* @param object The object that this encapsulates.
*
*/
- public COSObject(COSBase object)
+ public COSObject(COSBase object, BaseParser parser)
+ // public COSObject(COSBase object)
{
setObject( object );
+ this.parser = parser;
}
/**
@@ -77,6 +83,10 @@ public class COSObject extends COSBase i
return retval;
}
+ public boolean isObjectNull()
+ {
+ return baseObject == null;
+ }
/**
* This will get the object that this object encapsulates.
*
@@ -84,6 +94,18 @@ public class COSObject extends COSBase i
*/
public COSBase getObject()
{
+ if (baseObject == null || baseObject instanceof COSNull)
+ {
+ if (parser instanceof COSParser)
+ {
+ boolean returnValue = ((COSParser) parser).dereferenceCOSObject(this);
+ if (!returnValue)
+ {
+ // remove parser to avoid endless recursions
+ parser = null;
+ }
+ }
+ }
return baseObject;
}
@@ -97,6 +119,12 @@ public class COSObject extends COSBase i
baseObject = object;
}
+ public final void setToNull()
+ {
+ baseObject = COSNull.NULL;
+ parser = null;
+ }
+
/**
* {@inheritDoc}
*/
Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Tue Jun 11 17:57:42 2019
@@ -924,7 +924,7 @@ public abstract class BaseParser
}
case 'R':
seqSource.read();
- retval = new COSObject(null);
+ retval = new COSObject(null, this);
break;
case (char)-1:
return null;
@@ -1158,6 +1158,17 @@ public abstract class BaseParser
}
/**
+ * This will tell if the end of the data is reached.
+ *
+ * @return true if the end of the data is reached.
+ * @throws IOException If there is an error reading from the stream.
+ */
+ protected boolean isEOF() throws IOException
+ {
+ return seqSource.isEOF();
+ }
+
+ /**
* This will tell if the next byte to be read is an end of line byte.
*
* @param c The character to check against end of line
Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Tue Jun 11 17:57:42 2019
@@ -772,6 +772,42 @@ public class COSParser extends BaseParse
}
}
+ public boolean dereferenceCOSObject(COSObject obj)
+ {
+ COSBase parsedObj = null;
+ long currentPos = 0;
+ try
+ {
+ currentPos = source.getPosition();
+ parsedObj = parseObjectDynamically(obj, false);
+ }
+ catch (IOException e)
+ {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ // parsedObj = COSBroken.BROKEN;
+ }
+ finally
+ {
+ if (currentPos > 0)
+ try
+ {
+ source.seek(currentPos);
+ }
+ catch (IOException e)
+ {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ if (parsedObj != null)
+ {
+ obj.setObject(parsedObj);
+ return true;
+ }
+ return false;
+ }
+
// add objects not to be parsed to list of already parsed objects
private void addExcludedToList(COSName[] excludeObjects, COSDictionary dict, final Set<Long> parsedObjects)
{
@@ -826,7 +862,7 @@ public class COSParser extends BaseParse
final COSObjectKey objKey = new COSObjectKey(objNr, objGenNr);
final COSObject pdfObject = document.getObjectFromPool(objKey);
- if (pdfObject.getObject() == null)
+ if (pdfObject.isObjectNull())
{
// not previously parsed
// ---- read offset or object stream object number from xref table
@@ -855,7 +891,8 @@ public class COSParser extends BaseParse
if (offsetOrObjstmObNr == null)
{
// not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
- pdfObject.setObject(COSNull.NULL);
+ // remove parser to avoid endless recursion
+ pdfObject.setToNull();
}
else if (offsetOrObjstmObNr > 0)
{
@@ -935,7 +972,14 @@ public class COSParser extends BaseParse
securityHandler.decrypt(pb, objKey.getNumber(), objKey.getGeneration());
}
- pdfObject.setObject(pb);
+ if (pb != null)
+ {
+ pdfObject.setObject(pb);
+ }
+ else
+ {
+ pdfObject.setToNull();
+ }
if (!endObjectKey.startsWith(ENDOBJ_STRING))
{
@@ -2768,7 +2812,7 @@ public class COSParser extends BaseParse
xrefTrailerResolver.nextXrefObj( objByteOffset, XRefType.STREAM );
xrefTrailerResolver.setTrailer( stream );
}
- PDFXrefStreamParser parser = new PDFXrefStreamParser( stream, document, xrefTrailerResolver );
+ PDFXrefStreamParser parser = new PDFXrefStreamParser(stream, document, xrefTrailerResolver);
parser.parse();
}
Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java Tue Jun 11 17:57:42 2019
@@ -87,7 +87,7 @@ public class PDFObjectStreamParser exten
int objectCounter = 0;
while( (cosObject = parseDirObject()) != null )
{
- object = new COSObject(cosObject);
+ object = new COSObject(cosObject, null);
object.setGenerationNumber(0);
if (objectCounter >= objectNumbers.size())
{
@@ -103,7 +103,7 @@ public class PDFObjectStreamParser exten
// According to the spec objects within an object stream shall not be enclosed
// by obj/endobj tags, but there are some pdfs in the wild using those tags
// skip endobject marker if present
- if (!seqSource.isEOF() && seqSource.peek() == 'e')
+ if (!isEOF() && seqSource.peek() == 'e')
{
readLine();
}
@@ -125,4 +125,22 @@ public class PDFObjectStreamParser exten
{
return streamObjects;
}
+
+ public boolean dereferenceCOSObject(COSObject obj)
+ {
+ if (streamObjects != null)
+ {
+ long objectNumber = obj.getObjectNumber();
+ for (COSObject cosObject : streamObjects)
+ {
+ if (cosObject.getObjectNumber() == objectNumber)
+ {
+ obj.setObject(cosObject);
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
}
Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java Tue Jun 11 17:57:42 2019
@@ -21,10 +21,10 @@ import java.io.InputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.io.ScratchFile;
@@ -141,6 +141,7 @@ public class PDFParser extends COSParser
}
}
document = new COSDocument(scratchFile);
+ document.parser = this;
}
/**
@@ -170,28 +171,21 @@ public class PDFParser extends COSParser
{
COSDictionary trailer = retrieveTrailer();
- COSBase base = parseTrailerValuesDynamically(trailer);
- if (!(base instanceof COSDictionary))
+ COSObject rootObj = trailer.getCOSObject(COSName.ROOT);
+ if (rootObj == null)
{
- throw new IOException("Expected root dictionary, but got this: " + base);
+ throw new IOException("Missing root object specification in trailer.");
}
- COSDictionary root = (COSDictionary) base;
+ COSDictionary root = (COSDictionary) rootObj.getObject();
// in some pdfs the type value "Catalog" is missing in the root object
if (isLenient() && !root.containsKey(COSName.TYPE))
{
root.setItem(COSName.TYPE, COSName.CATALOG);
}
- // parse all objects, starting at the root dictionary
- parseDictObjects(root, (COSName[]) null);
- // parse all objects of the info dictionary
- COSBase infoBase = trailer.getDictionaryObject(COSName.INFO);
- if (infoBase instanceof COSDictionary)
- {
- parseDictObjects((COSDictionary) infoBase, (COSName[]) null);
- }
// check pages dictionaries
checkPages(root);
document.setDecrypted();
+ document.parser = this;
initialParseDone = true;
}
Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java Tue Jun 11 17:57:42 2019
@@ -191,7 +191,7 @@ public class PDFStreamParser extends Bas
String line = readString();
if( line.equals( "R" ) )
{
- retval = new COSObject( null );
+ retval = new COSObject(null, this);
}
else
{
@@ -295,8 +295,8 @@ public class PDFStreamParser extends Bas
while( !(lastByte == 'E' &&
currentByte == 'I' &&
hasNextSpaceOrReturn() &&
- hasNoFollowingBinData(seqSource)) &&
- !seqSource.isEOF() )
+ hasNoFollowingBinData()) &&
+ !isEOF())
{
imageData.write( lastByte );
lastByte = currentByte;
@@ -344,10 +344,10 @@ public class PDFStreamParser extends Bas
* @return <code>true</code> if next bytes are probably printable ASCII
* characters starting with a PDF operator, otherwise <code>false</code>
*/
- private boolean hasNoFollowingBinData(SequentialSource pdfSource) throws IOException
+ private boolean hasNoFollowingBinData() throws IOException
{
// as suggested in PDFBOX-1164
- final int readBytes = pdfSource.read(binCharTestArr, 0, MAX_BIN_CHAR_TEST_LENGTH);
+ final int readBytes = seqSource.read(binCharTestArr, 0, MAX_BIN_CHAR_TEST_LENGTH);
boolean noBinData = true;
int startOpIdx = -1;
int endOpIdx = -1;
@@ -399,12 +399,12 @@ public class PDFStreamParser extends Bas
noBinData = false;
}
}
- pdfSource.unread(binCharTestArr, 0, readBytes);
+ seqSource.unread(binCharTestArr, 0, readBytes);
}
if (!noBinData)
{
LOG.warn("ignoring 'EI' assumed to be in the middle of inline image at stream offset " +
- pdfSource.getPosition());
+ seqSource.getPosition());
}
return noBinData;
}
Modified: pdfbox/branches/issue4569/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSUpdateInfo.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSUpdateInfo.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSUpdateInfo.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSUpdateInfo.java Tue Jun 11 17:57:42 2019
@@ -43,7 +43,7 @@ public class TestCOSUpdateInfo
// COSObject
COSUpdateInfo testCOSObject;
- testCOSObject = new COSObject(null);
+ testCOSObject = new COSObject(null, null);
testCOSObject.setNeedToBeUpdated(true);
assertTrue(testCOSObject.isNeedToBeUpdated());
testCOSObject.setNeedToBeUpdated(false);
Modified: pdfbox/branches/issue4569/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java?rev=1861060&r1=1861059&r2=1861060&view=diff
==============================================================================
--- pdfbox/branches/issue4569/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java (original)
+++ pdfbox/branches/issue4569/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java Tue Jun 11 17:57:42 2019
@@ -718,7 +718,7 @@ public class PreflightParser extends PDF
final COSObjectKey objKey = new COSObjectKey(objNr, objGenNr);
final COSObject pdfObject = document.getObjectFromPool(objKey);
- if (pdfObject.getObject() == null)
+ if (pdfObject.isObjectNull())
{
// not previously parsed
// ---- read offset or object stream object number from xref table
@@ -737,7 +737,9 @@ public class PreflightParser extends PDF
if (offsetOrObjstmObNr == null)
{
// not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
- pdfObject.setObject(COSNull.NULL);
+ // remove parser to avoid endless recursion
+ pdfObject.setToNull();
+
}
else if (offsetOrObjstmObNr == 0)
{