You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2014/10/30 22:34:36 UTC
svn commit: r1635619 - in
/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser:
NonSequentialPDFParser.java PDFParser.java
Author: lehmi
Date: Thu Oct 30 21:34:36 2014
New Revision: 1635619
URL: http://svn.apache.org/r1635619
Log:
PDFBOX-1595: create xref table using brute force search if startxref is missing
Modified:
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (contents, props changed)
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java (contents, props changed)
Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1635619&r1=1635618&r2=1635619&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Thu Oct 30 21:34:36 2014
@@ -110,6 +110,12 @@ public class NonSequentialPDFParser exte
*/
protected static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' };
+ /**
+ * trailer-marker.
+ */
+ private static final char[] TRAILER_MARKER = new char[] { 't', 'r', 'a', 'i', 'l', 'e', 'r' };
+
+ private long trailerOffset;
private final File pdfFile;
private long fileLen;
private final RandomAccessBufferedFileInputStream raStream;
@@ -123,6 +129,7 @@ public class NonSequentialPDFParser exte
* Contains all found objects of a brute force search
*/
private HashMap<String, Long> bfSearchObjectOffsets = null;
+ private HashMap<COSObjectKey, Long> bfSearchCOSObjectKeyOffsets = null;
private Vector<Long> bfSearchXRefOffsets = null;
/**
@@ -326,7 +333,6 @@ public class NonSequentialPDFParser exte
}
}
- // ------------------------------------------------------------------------
/**
* The initial parse will first parse only the trailer, the xrefstart and
* all xref tables to have a pointer (offset) to all the pdf's objects. It
@@ -337,8 +343,140 @@ public class NonSequentialPDFParser exte
*/
protected void initialParse() throws IOException
{
+ COSDictionary trailer = null;
// ---- parse startxref
- setPdfSource(getStartxrefOffset());
+ long startXRefOffset = getStartxrefOffset();
+ if (startXRefOffset > 0)
+ {
+ trailer = parseXref(startXRefOffset);
+ }
+ else if (isFDFDocment || isLenient)
+ {
+ // signal start of new XRef
+ xrefTrailerResolver.nextXrefObj( startXRefOffset );
+ bfSearchForObjects();
+ for (COSObjectKey objectKey : bfSearchCOSObjectKeyOffsets.keySet())
+ {
+ xrefTrailerResolver.setXRef(objectKey, bfSearchCOSObjectKeyOffsets.get(objectKey));
+ }
+ // parse the last trailer.
+ pdfSource.seek(trailerOffset);
+ if (!parseTrailer())
+ {
+ throw new IOException("Expected trailer object at position: "
+ + pdfSource.getOffset());
+ }
+ xrefTrailerResolver.setStartxref(startXRefOffset);
+ trailer = xrefTrailerResolver.getCurrentTrailer();
+ document.setTrailer(trailer);
+ }
+ // ---- prepare decryption if necessary
+ prepareDecryption();
+
+ // PDFBOX-1557 - ensure that all COSObject are loaded in the trailer
+ // PDFBOX-1606 - after securityHandler has been instantiated
+ for (COSBase trailerEntry : trailer.getValues())
+ {
+ if (trailerEntry instanceof COSObject)
+ {
+ COSObject tmpObj = (COSObject) trailerEntry;
+ parseObjectDynamically(tmpObj, false);
+ }
+ }
+ // ---- parse catalog or root object
+ COSObject root = (COSObject) xrefTrailerResolver.getTrailer().getItem(COSName.ROOT);
+
+ if (root == null)
+ {
+ throw new IOException("Missing root object specification in trailer.");
+ }
+
+ parseObjectDynamically(root, false);
+
+ // ---- resolve all objects (including pages)
+ if (!parseMinimalCatalog)
+ {
+ COSObject catalogObj = document.getCatalog();
+ if (catalogObj != null)
+ {
+ if (catalogObj.getObject() instanceof COSDictionary)
+ {
+ parseDictObjects((COSDictionary) catalogObj.getObject(), (COSName[]) null);
+ allPagesParsed = true;
+ document.setDecrypted();
+ }
+ }
+ }
+
+ // PDFBOX-1922: read the version again now that all objects have been resolved
+ readVersionInTrailer(trailer);
+
+ initialParseDone = true;
+ }
+
+ /**
+ * Prepare for decryption.
+ *
+ * @throws IOException if something went wrong
+ */
+ private void prepareDecryption() throws IOException
+ {
+ COSBase trailerEncryptItem = document.getTrailer().getItem(COSName.ENCRYPT);
+ if (trailerEncryptItem != null && !(trailerEncryptItem instanceof COSNull))
+ {
+ if (trailerEncryptItem instanceof COSObject)
+ {
+ COSObject trailerEncryptObj = (COSObject) trailerEncryptItem;
+ parseObjectDynamically(trailerEncryptObj, true);
+ }
+ try
+ {
+ PDEncryptionDictionary encParameters = new PDEncryptionDictionary(document.getEncryptionDictionary());
+
+ DecryptionMaterial decryptionMaterial = null;
+ if (keyStoreFilename != null)
+ {
+ KeyStore ks = KeyStore.getInstance("PKCS12");
+ ks.load(new FileInputStream(keyStoreFilename), password.toCharArray());
+
+ decryptionMaterial = new PublicKeyDecryptionMaterial(ks, alias, password);
+ }
+ else
+ {
+ decryptionMaterial = new StandardDecryptionMaterial(password);
+ }
+
+ securityHandler = SecurityHandlersManager.getInstance().getSecurityHandler(encParameters.getFilter());
+ securityHandler.prepareForDecryption(encParameters, document.getDocumentID(), decryptionMaterial);
+
+ AccessPermission permission = securityHandler.getCurrentAccessPermission();
+ if (!permission.canExtractContent())
+ {
+ LOG.warn("PDF file '" + pdfFile.getPath() + "' does not allow extracting content.");
+ }
+
+ }
+ catch (Exception e)
+ {
+ throw new IOException("Error (" + e.getClass().getSimpleName()
+ + ") while creating security handler for decryption: " + e.getMessage() /*
+ * , e TODO: remove
+ * remark with Java 1.6
+ */);
+ }
+ }
+ }
+
+ /**
+ * Parses cross reference tables.
+ *
+ * @param startXRefOffset start offset of the first table
+ * @return the trailer dictionary
+ * @throws IOException if something went wrong
+ */
+ private COSDictionary parseXref(long startXRefOffset) throws IOException
+ {
+ setPdfSource(startXRefOffset);
parseStartXref();
long startXrefOffset = document.getStartXref();
@@ -366,7 +504,7 @@ public class NonSequentialPDFParser exte
// use existing parser to parse xref table
parseXrefTable(prev);
// parse the last trailer.
- long trailerOffset = pdfSource.getOffset();
+ trailerOffset = pdfSource.getOffset();
//PDFBOX-1739 skip extra xref entries in RegisSTAR documents
while (isLenient && pdfSource.peek() != 't')
{
@@ -426,7 +564,6 @@ public class NonSequentialPDFParser exte
}
}
}
-
// ---- build valid xrefs out of the xref chain
xrefTrailerResolver.setStartxref(startXrefOffset);
COSDictionary trailer = xrefTrailerResolver.getTrailer();
@@ -434,95 +571,9 @@ public class NonSequentialPDFParser exte
// check the offsets of all referenced objects
checkXrefOffsets();
-
- // ---- prepare encryption if necessary
- COSBase trailerEncryptItem = document.getTrailer().getItem(COSName.ENCRYPT);
- if (trailerEncryptItem != null && !(trailerEncryptItem instanceof COSNull))
- {
- if (trailerEncryptItem instanceof COSObject)
- {
- COSObject trailerEncryptObj = (COSObject) trailerEncryptItem;
- parseObjectDynamically(trailerEncryptObj, true);
- }
- try
- {
- PDEncryptionDictionary encParameters = new PDEncryptionDictionary(document.getEncryptionDictionary());
-
- DecryptionMaterial decryptionMaterial = null;
- if (keyStoreFilename != null)
- {
- KeyStore ks = KeyStore.getInstance("PKCS12");
- ks.load(new FileInputStream(keyStoreFilename), password.toCharArray());
-
- decryptionMaterial = new PublicKeyDecryptionMaterial(ks, alias, password);
- }
- else
- {
- decryptionMaterial = new StandardDecryptionMaterial(password);
- }
-
- securityHandler = SecurityHandlersManager.getInstance().getSecurityHandler(encParameters.getFilter());
- securityHandler.prepareForDecryption(encParameters, document.getDocumentID(), decryptionMaterial);
-
- AccessPermission permission = securityHandler.getCurrentAccessPermission();
- if (!permission.canExtractContent())
- {
- LOG.warn("PDF file '" + pdfFile.getPath() + "' does not allow extracting content.");
- }
-
- }
- catch (Exception e)
- {
- throw new IOException("Error (" + e.getClass().getSimpleName()
- + ") while creating security handler for decryption: " + e.getMessage() /*
- * , e TODO: remove
- * remark with Java 1.6
- */);
- }
- }
-
- // PDFBOX-1557 - ensure that all COSObject are loaded in the trailer
- // PDFBOX-1606 - after securityHandler has been instantiated
- for (COSBase trailerEntry : trailer.getValues())
- {
- if (trailerEntry instanceof COSObject)
- {
- COSObject tmpObj = (COSObject) trailerEntry;
- parseObjectDynamically(tmpObj, false);
- }
- }
- // ---- parse catalog or root object
- COSObject root = (COSObject) xrefTrailerResolver.getTrailer().getItem(COSName.ROOT);
-
- if (root == null)
- {
- throw new IOException("Missing root object specification in trailer.");
- }
-
- parseObjectDynamically(root, false);
-
- // ---- resolve all objects (including pages)
- if (!parseMinimalCatalog)
- {
- COSObject catalogObj = document.getCatalog();
- if (catalogObj != null)
- {
- if (catalogObj.getObject() instanceof COSDictionary)
- {
- parseDictObjects((COSDictionary) catalogObj.getObject(), (COSName[]) null);
- allPagesParsed = true;
- document.setDecrypted();
- }
- }
- }
-
- // PDFBOX-1922: read the version again now that all objects have been resolved
- readVersionInTrailer(trailer);
-
- initialParseDone = true;
+ return trailer;
}
- // ------------------------------------------------------------------------
/**
* Parses an xref object stream starting with indirect object id.
*
@@ -665,7 +716,19 @@ public class NonSequentialPDFParser exte
if (bufOff < 0)
{
- throw new IOException("Missing 'startxref' marker.");
+ if (isLenient)
+ {
+ trailerOffset = lastIndexOf(TRAILER_MARKER, buf, buf.length);
+ if (trailerOffset > 0)
+ {
+ trailerOffset += skipBytes;
+ }
+ return -1;
+ }
+ else
+ {
+ throw new IOException("Missing 'startxref' marker.");
+ }
}
return skipBytes + bufOff;
}
@@ -1921,6 +1984,7 @@ public class NonSequentialPDFParser exte
if (bfSearchObjectOffsets == null)
{
bfSearchObjectOffsets = new HashMap<String, Long>();
+ bfSearchCOSObjectKeyOffsets = new HashMap<COSObjectKey, Long>();
long originOffset = pdfSource.getOffset();
long currentOffset = MINIMUM_SEARCH_OFFSET;
String objString = " obj";
@@ -1971,6 +2035,7 @@ public class NonSequentialPDFParser exte
{
bfSearchObjectOffsets.put(
createObjectString(objectID, genID), ++tempOffset);
+ bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(objectID, genID), tempOffset);
}
}
}
Propchange: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
------------------------------------------------------------------------------
Merged /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java:r1635617
Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java?rev=1635619&r1=1635618&r2=1635619&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java Thu Oct 30 21:34:36 2014
@@ -61,6 +61,8 @@ public class PDFParser extends BaseParse
private static final String PDF_HEADER = "%PDF-";
private static final String FDF_HEADER = "%FDF-";
+ protected boolean isFDFDocment = false;
+
private static final String PDF_DEFAULT_VERSION = "1.4";
private static final String FDF_DEFAULT_VERSION = "1.0";
@@ -417,6 +419,7 @@ public class PDFParser extends BaseParse
}
else
{
+ isFDFDocment = true;
if (!header.matches(FDF_HEADER + "\\d.\\d"))
{
if (header.length() < FDF_HEADER.length() + 3)
Propchange: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
------------------------------------------------------------------------------
Merged /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java:r1635617