You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2020/06/27 12:52:39 UTC
svn commit: r1879267 - in /pdfbox/branches/issue45: ./ pdfbox/
pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java
Author: lehmi
Date: Sat Jun 27 12:52:39 2020
New Revision: 1879267
URL: http://svn.apache.org/viewvc?rev=1879267&view=rev
Log:
PDFBOX-4897: backported the optimized behaviour of the object stream parser from 3.0 as proposed by Simon Steiner
Added:
pdfbox/branches/issue45/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java
- copied unchanged from r1879261, pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java
Modified:
pdfbox/branches/issue45/ (props changed)
pdfbox/branches/issue45/pdfbox/ (props changed)
pdfbox/branches/issue45/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
Propchange: pdfbox/branches/issue45/
------------------------------------------------------------------------------
Merged /pdfbox/branches/2.0:r1879261
Propchange: pdfbox/branches/issue45/pdfbox/
------------------------------------------------------------------------------
Merged /pdfbox/branches/2.0/pdfbox:r1879261
Modified: pdfbox/branches/issue45/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue45/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java?rev=1879267&r1=1879266&r2=1879267&view=diff
==============================================================================
--- pdfbox/branches/issue45/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java (original)
+++ pdfbox/branches/issue45/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java Sat Jun 27 12:52:39 2020
@@ -18,12 +18,16 @@ package org.apache.pdfbox.pdfparser;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.LinkedHashMap;
import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
@@ -41,7 +45,8 @@ public class PDFObjectStreamParser exten
private static final Log LOG = LogFactory.getLog(PDFObjectStreamParser.class);
private List<COSObject> streamObjects = null;
- private final COSStream stream;
+ private final int numberOfObjects;
+ private final int firstObject;
/**
* Constructor.
@@ -53,8 +58,19 @@ public class PDFObjectStreamParser exten
public PDFObjectStreamParser(COSStream stream, COSDocument document) throws IOException
{
super(new InputStreamSource(stream.createInputStream()));
- this.stream = stream;
this.document = document;
+ // get mandatory number of objects
+ numberOfObjects = stream.getInt(COSName.N);
+ if (numberOfObjects == -1)
+ {
+ throw new IOException("/N entry missing in object stream");
+ }
+ // get mandatory stream offset of the first object
+ firstObject = stream.getInt(COSName.FIRST);
+ if (firstObject == -1)
+ {
+ throw new IOException("/First entry missing in object stream");
+ }
}
/**
@@ -67,47 +83,19 @@ public class PDFObjectStreamParser exten
{
try
{
- //need to first parse the header.
- int numberOfObjects = stream.getInt( "N" );
- if (numberOfObjects == -1)
- {
- throw new IOException("/N entry missing in object stream");
- }
- List<Long> objectNumbers = new ArrayList<Long>( numberOfObjects );
+ Map<Long, Integer> offsets = readOffsets();
streamObjects = new ArrayList<COSObject>( numberOfObjects );
- for( int i=0; i<numberOfObjects; i++ )
+ for (Entry<Long, Integer> offset : offsets.entrySet())
{
- long objectNumber = readObjectNumber();
- // skip offset
- readLong();
- objectNumbers.add( objectNumber);
- }
- COSObject object;
- COSBase cosObject;
- int objectCounter = 0;
- while( (cosObject = parseDirObject()) != null )
- {
- object = new COSObject(cosObject);
+ COSBase cosObject = parseObject(offset.getValue());
+ COSObject object = new COSObject(cosObject);
object.setGenerationNumber(0);
- if (objectCounter >= objectNumbers.size())
- {
- LOG.error("/ObjStm (object stream) has more objects than /N " + numberOfObjects);
- break;
- }
- object.setObjectNumber( objectNumbers.get( objectCounter) );
- streamObjects.add( object );
- if(LOG.isDebugEnabled())
- {
- LOG.debug( "parsed=" + object );
- }
- // According to the spec objects within an object stream shall not be enclosed
- // by obj/endobj tags, but there are some pdfs in the wild using those tags
- // skip endobject marker if present
- if (!seqSource.isEOF() && seqSource.peek() == 'e')
+ object.setObjectNumber(offset.getKey());
+ streamObjects.add(object);
+ if (LOG.isDebugEnabled())
{
- readLine();
+ LOG.debug("parsed=" + object);
}
- objectCounter++;
}
}
finally
@@ -125,4 +113,30 @@ public class PDFObjectStreamParser exten
{
return streamObjects;
}
+
+ private Map<Long, Integer> readOffsets() throws IOException
+ {
+ // use LinkedHashMap to preserve order for the sequential parsing
+ Map<Long, Integer> objectNumbers = new LinkedHashMap<Long, Integer>(numberOfObjects);
+ for (int i = 0; i < numberOfObjects; i++)
+ {
+ long objectNumber = readObjectNumber();
+ int offset = (int) readLong();
+ objectNumbers.put(objectNumber, offset);
+ }
+ return objectNumbers;
+ }
+
+ private COSBase parseObject(int offset) throws IOException
+ {
+ long currentPosition = seqSource.getPosition();
+ int finalPosition = firstObject + offset;
+ if (finalPosition > 0 && currentPosition < finalPosition)
+ {
+ // jump to the offset of the object to be parsed
+ seqSource.readFully(finalPosition - (int) currentPosition);
+ }
+ return parseDirObject();
+ }
+
}