You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2022/05/01 11:39:05 UTC
svn commit: r1900449 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser: BruteForceParser.java COSParser.java FDFParser.java
Author: lehmi
Date: Sun May 1 11:39:05 2022
New Revision: 1900449
URL: http://svn.apache.org/viewvc?rev=1900449&view=rev
Log:
PDFBOX-5031: separate brute force parser into its own class
Added:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java (with props)
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/FDFParser.java
Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java?rev=1900449&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java (added)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java Sun May 1 11:39:05 2022
@@ -0,0 +1,858 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.pdfparser;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.stream.Collectors;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.cos.COSObjectKey;
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.io.RandomAccessRead;
+import org.apache.pdfbox.pdfparser.XrefTrailerResolver.XRefType;
+import org.apache.pdfbox.pdmodel.encryption.ProtectionPolicy;
+import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
+
+/**
+ * Brute force parser to be used as last resort if a malformed pdf can't be read.
+ */
+public class BruteForceParser extends COSParser
+{
+    /**
+     * Marker searched for to locate xref tables.
+     */
+    private static final char[] XREF_TABLE = new char[] { 'x', 'r', 'e', 'f' };
+
+    /**
+     * Marker searched for to locate xref streams.
+     */
+    private static final char[] XREF_STREAM = new char[] { '/', 'X', 'R', 'e', 'f' };
+
+    /**
+     * Offset at which the brute force searches of this class start.
+     */
+    private static final long MINIMUM_SEARCH_OFFSET = 6;
+
+    /**
+     * EOF-marker.
+     */
+    private static final char[] EOF_MARKER = new char[] { '%', '%', 'E', 'O', 'F' };
+
+    /**
+     * obj-marker.
+     */
+    private static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' };
+
+    /**
+     * trailer-marker.
+     */
+    private static final char[] TRAILER_MARKER = new char[] { 't', 'r', 'a', 'i', 'l', 'e', 'r' };
+
+    /**
+     * ObjStream-marker.
+     */
+    private static final char[] OBJ_STREAM = new char[] { '/', 'O', 'b', 'j', 'S', 't', 'm' };
+
+    /**
+     * Logger of this class. It has to be created for BruteForceParser itself so that the log
+     * output isn't attributed to COSParser.
+     */
+    private static final Log LOG = LogFactory.getLog(BruteForceParser.class);
+
+    /**
+     * Contains all found objects of a brute force search.
+     */
+    private final Map<COSObjectKey, Long> bfSearchCOSObjectKeyOffsets = new HashMap<>();
+
+ /**
+ * Constructor. Triggers a brute force search for all objects of the document.
+ * The search is run immediately, its result is available via getBFCOSObjectOffsets().
+ *
+ * @param source input representing the pdf.
+ * @param document the corresponding COS document
+ *
+ * @throws IOException if something went wrong during the brute force search
+ */
+ public BruteForceParser(RandomAccessRead source, COSDocument document) throws IOException
+ {
+ super(source);
+ this.document = document;
+ bfSearchForObjects();
+ }
+
+ /**
+ * Returns all found objects of a brute force search.
+ * The map is the internal map of this parser and not a copy, so that any modification is
+ * visible to this parser as well.
+ *
+ * @return map containing all found objects of a brute force search
+ */
+ protected Map<COSObjectKey, Long> getBFCOSObjectOffsets()
+ {
+ return bfSearchCOSObjectKeyOffsets;
+ }
+
+ /**
+ * Brute force search for every object in the pdf.
+ *
+ * The input is scanned from MINIMUM_SEARCH_OFFSET up to the last EOF marker for a whitespace
+ * followed by "obj". If the marker is preceded by a single digit (taken as generation number),
+ * whitespace and a run of digits (the object number), the offset of the object number is
+ * remembered. An object is added to bfSearchCOSObjectKeyOffsets as soon as a subsequent object
+ * is found. The last object is added if an EOF marker was found or if the object is terminated
+ * by "endobj" (or by "endo" directly at the end of the input). The origin position of the
+ * source is reestablished at the end.
+ *
+ * @throws IOException if something went wrong
+ */
+ private void bfSearchForObjects() throws IOException
+ {
+ long lastEOFMarker = bfSearchForLastEOFMarker();
+ long originOffset = source.getPosition();
+ long currentOffset = MINIMUM_SEARCH_OFFSET;
+ long lastObjectId = Long.MIN_VALUE;
+ int lastGenID = Integer.MIN_VALUE;
+ long lastObjOffset = Long.MIN_VALUE;
+ char[] endobjString = "ndo".toCharArray();
+ char[] endobjRemainingString = "bj".toCharArray();
+ boolean endOfObjFound = false;
+ do
+ {
+ source.seek(currentOffset);
+ int nextChar = source.read();
+ currentOffset++;
+ if (isWhitespace(nextChar) && isString(OBJ_MARKER))
+ {
+ long tempOffset = currentOffset - 2;
+ source.seek(tempOffset);
+ int genID = source.peek();
+ // is the char in front of the whitespace a digit, i.e. a single digit generation number?
+ if (isDigit(genID))
+ {
+ genID -= 48;
+ tempOffset--;
+ source.seek(tempOffset);
+ if (isWhitespace())
+ {
+ while (tempOffset > MINIMUM_SEARCH_OFFSET && isWhitespace())
+ {
+ source.seek(--tempOffset);
+ }
+ boolean objectIDFound = false;
+ while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
+ {
+ source.seek(--tempOffset);
+ objectIDFound = true;
+ }
+ if (objectIDFound)
+ {
+ source.read();
+ long objectId = readObjectNumber();
+ if (lastObjOffset > 0)
+ {
+ // add the former object ID only if there was a subsequent object ID
+ bfSearchCOSObjectKeyOffsets.put(
+ new COSObjectKey(lastObjectId, lastGenID), lastObjOffset);
+ }
+ lastObjectId = objectId;
+ lastGenID = genID;
+ lastObjOffset = tempOffset + 1;
+ currentOffset += OBJ_MARKER.length - 1;
+ endOfObjFound = false;
+ }
+ }
+ }
+ }
+ // check for "endo" as abbreviation for "endobj", as the pdf may be cut off
+ // in the middle of the keyword, see PDFBOX-3936.
+ // We could possibly implement a more intelligent algorithm if necessary
+ else if (nextChar == 'e' && isString(endobjString))
+ {
+ currentOffset += endobjString.length;
+ source.seek(currentOffset);
+ if (source.isEOF())
+ {
+ endOfObjFound = true;
+ }
+ else if (isString(endobjRemainingString))
+ {
+ currentOffset += endobjRemainingString.length;
+ endOfObjFound = true;
+ }
+ }
+ } while (currentOffset < lastEOFMarker && !source.isEOF());
+ if ((lastEOFMarker < Long.MAX_VALUE || endOfObjFound) && lastObjOffset > 0)
+ {
+ // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
+ // the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
+ bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(lastObjectId, lastGenID),
+ lastObjOffset);
+ }
+ // reestablish origin position
+ source.seek(originOffset);
+ }
+
+    /**
+     * Search for the offset of the given xref table/stream among those found by a brute force search.
+     *
+     * Both the xref tables and the xref streams are searched on every call. The candidate nearest
+     * to the given offset is returned, a table is preferred if a table and a stream have the same
+     * distance. The position of the source isn't reestablished.
+     *
+     * @param xrefOffset the given offset to be searched for
+     *
+     * @return the offset of the xref entry or -1 if neither a table nor a stream was found
+     * @throws IOException if something went wrong
+     */
+    protected long bfSearchForXRef(long xrefOffset) throws IOException
+    {
+        // both results are never null
+        List<Long> xrefTableOffsets = bfSearchForXRefTables();
+        List<Long> xrefStreamOffsets = bfSearchForXRefStreams();
+
+        // TODO to be optimized, this won't work in every case
+        long nearestTable = searchNearestValue(xrefTableOffsets, xrefOffset);
+        long nearestStream = searchNearestValue(xrefStreamOffsets, xrefOffset);
+
+        // Both lists are local to this call, so there is no need to remove the chosen offset
+        // from them.
+        if (nearestTable > -1 && nearestStream > -1)
+        {
+            // choose the nearest value
+            long distanceTable = Math.abs(xrefOffset - nearestTable);
+            long distanceStream = Math.abs(xrefOffset - nearestStream);
+            return distanceTable > distanceStream ? nearestStream : nearestTable;
+        }
+        if (nearestTable > -1)
+        {
+            return nearestTable;
+        }
+        if (nearestStream > -1)
+        {
+            return nearestStream;
+        }
+        return -1;
+    }
+
+    /**
+     * Picks the value with the smallest distance to the given offset. The first one wins if
+     * several values have the same distance.
+     *
+     * @param values the candidates
+     * @param offset the offset to be approximated
+     * @return the nearest value or -1 if there aren't any candidates
+     */
+    private long searchNearestValue(List<Long> values, long offset)
+    {
+        long nearest = -1;
+        long smallestDistance = 0;
+        boolean candidateSeen = false;
+        for (long candidate : values)
+        {
+            long distance = Math.abs(offset - candidate);
+            // only a strictly smaller distance replaces the current choice
+            if (!candidateSeen || smallestDistance > distance)
+            {
+                candidateSeen = true;
+                smallestDistance = distance;
+                nearest = candidate;
+            }
+        }
+        return nearest;
+    }
+
+    /**
+     * Brute force search for all objects streams of a pdf.
+     *
+     * Every object stream whose offset matches the offset found for the same object key by the
+     * brute force search for objects is parsed. The objects it contains are added to the brute
+     * force search result and to the xref table of the given trailer resolver, using the negated
+     * object number of the stream as value. Streams which can't be read are skipped. The origin
+     * position of the source is reestablished at the end.
+     *
+     * @param trailerResolver the trailer resolver of the document
+     * @param securityHandler security handler to be used to decrypt encrypted documents
+     * @throws IOException if something went wrong
+     */
+    protected void bfSearchForObjStreams(XrefTrailerResolver trailerResolver,
+            SecurityHandler<? extends ProtectionPolicy> securityHandler) throws IOException
+    {
+        // update security handler
+        this.securityHandler = securityHandler;
+        // save origin offset
+        long originOffset = source.getPosition();
+
+        // sort out the candidates in one pass: unknown streams are reported, streams found at
+        // the very same offset as the corresponding object are collected
+        List<Long> objStreamOffsets = new ArrayList<>();
+        for (Entry<Long, COSObjectKey> candidate : bfSearchForObjStreamOffsets().entrySet())
+        {
+            Long knownOffset = bfSearchCOSObjectKeyOffsets.get(candidate.getValue());
+            if (knownOffset == null)
+            {
+                // log warning about skipped stream
+                LOG.warn("Skipped incomplete object stream:" + candidate.getValue() + " at "
+                        + candidate.getKey());
+            }
+            else if (candidate.getKey().equals(knownOffset))
+            {
+                objStreamOffsets.add(candidate.getKey());
+            }
+        }
+        // add all found compressed objects to the brute force search result
+        for (Long offset : objStreamOffsets)
+        {
+            source.seek(offset);
+            long stmObjNumber = readObjectNumber();
+            int stmGenNumber = readGenerationNumber();
+            readExpectedString(OBJ_MARKER, true);
+            COSStream stream = null;
+            try
+            {
+                COSDictionary dict = parseCOSDictionary(false);
+                stream = parseCOSStream(dict);
+                if (securityHandler != null)
+                {
+                    securityHandler.decryptStream(stream, stmObjNumber, stmGenNumber);
+                }
+                PDFObjectStreamParser objStreamParser = new PDFObjectStreamParser(stream, document);
+                Map<Long, Integer> objectNumbers = objStreamParser.readObjectNumbers();
+                Map<COSObjectKey, Long> xrefOffset = trailerResolver.getXrefTable();
+                for (Long objNumber : objectNumbers.keySet())
+                {
+                    COSObjectKey objKey = new COSObjectKey(objNumber, 0);
+                    Long existingOffset = bfSearchCOSObjectKeyOffsets.get(objKey);
+                    if (existingOffset != null && existingOffset < 0)
+                    {
+                        // translate stream object key to its offset
+                        COSObjectKey objStmKey = new COSObjectKey(Math.abs(existingOffset), 0);
+                        existingOffset = bfSearchCOSObjectKeyOffsets.get(objStmKey);
+                    }
+                    // an object stream located further behind replaces a previously found entry
+                    if (existingOffset == null || offset > existingOffset)
+                    {
+                        bfSearchCOSObjectKeyOffsets.put(objKey, -stmObjNumber);
+                        xrefOffset.put(objKey, -stmObjNumber);
+                    }
+                }
+            }
+            catch (IOException exception)
+            {
+                LOG.debug("Skipped corrupt stream: (" + stmObjNumber + " " + stmGenNumber
+                        + ") at offset " + offset, exception);
+            }
+            finally
+            {
+                if (stream != null)
+                {
+                    stream.close();
+                }
+            }
+        }
+        // restore origin offset
+        source.seek(originOffset);
+    }
+
+ /**
+ * Brute force search for all trailer marker.
+ *
+ * The dictionary following each "trailer" marker is parsed. The first one providing both a root
+ * entry resolving to a catalog and an info entry resolving to an info dictionary is used: its
+ * root and info entries are copied to the given trailer, the same goes for the encrypt entry if
+ * it resolves to a dictionary and for the ID entry if it is an array. Exceptions thrown for a
+ * single candidate are logged and the search goes on with the next marker.
+ *
+ * The origin position of the source is reestablished only if no suitable trailer was found.
+ *
+ * @param trailer dictionary to be used as trailer dictionary
+ *
+ * @return true if a trailer providing a valid root and info entry was found, false if not
+ *
+ * @throws IOException if something went wrong
+ */
+ private boolean bfSearchForTrailer(COSDictionary trailer) throws IOException
+ {
+ long originOffset = source.getPosition();
+ source.seek(MINIMUM_SEARCH_OFFSET);
+ // search for trailer marker
+ long trailerOffset = findString(TRAILER_MARKER);
+ while (trailerOffset != -1)
+ {
+ try
+ {
+ boolean rootFound = false;
+ boolean infoFound = false;
+ skipSpaces();
+ COSDictionary trailerDict = parseCOSDictionary(true);
+ COSObject rootObj = trailerDict.getCOSObject(COSName.ROOT);
+ if (rootObj != null)
+ {
+ // check if the dictionary can be dereferenced and is the one we are looking for
+ COSBase rootDict = rootObj.getObject();
+ if (rootDict instanceof COSDictionary && isCatalog((COSDictionary) rootDict))
+ {
+ rootFound = true;
+ }
+ }
+ COSObject infoObj = trailerDict.getCOSObject(COSName.INFO);
+ if (infoObj != null)
+ {
+ // check if the dictionary can be dereferenced and is the one we are looking for
+ COSBase infoDict = infoObj.getObject();
+ if (infoDict instanceof COSDictionary && isInfo((COSDictionary) infoDict))
+ {
+ infoFound = true;
+ }
+ }
+ if (rootFound && infoFound)
+ {
+ trailer.setItem(COSName.ROOT, rootObj);
+ trailer.setItem(COSName.INFO, infoObj);
+ if (trailerDict.containsKey(COSName.ENCRYPT))
+ {
+ COSObject encObj = trailerDict.getCOSObject(COSName.ENCRYPT);
+ // check if the dictionary can be dereferenced
+ // TODO check if the dictionary is an encryption dictionary?
+ if (encObj != null && encObj.getObject() instanceof COSDictionary)
+ {
+ trailer.setItem(COSName.ENCRYPT, encObj);
+ }
+ }
+ if (trailerDict.containsKey(COSName.ID))
+ {
+ COSBase idObj = trailerDict.getItem(COSName.ID);
+ if (idObj instanceof COSArray)
+ {
+ trailer.setItem(COSName.ID, idObj);
+ }
+ }
+ return true;
+ }
+ }
+ catch (IOException exception)
+ {
+ LOG.debug("An exception occurred during brute force search for trailer - ignoring",
+ exception);
+ }
+ trailerOffset = findString(TRAILER_MARKER);
+ }
+ source.seek(originOffset);
+ return false;
+ }
+
+    /**
+     * Search for the different parts of the trailer dictionary among the objects found by the
+     * brute force search. The catalog and the info dictionary, if found, are set as root and
+     * info entry of the given trailer.
+     *
+     * @param trailer dictionary to be used as trailer dictionary
+     * @return true if the root was found, false if not.
+     * @throws IOException if something went wrong
+     */
+    private boolean searchForTrailerItems(COSDictionary trailer) throws IOException
+    {
+        COSObject bestRoot = null;
+        COSObject bestInfo = null;
+        for (Entry<COSObjectKey, Long> found : bfSearchCOSObjectKeyOffsets.entrySet())
+        {
+            COSObject candidate = document.getObjectFromPool(found.getKey());
+            COSBase resolved = candidate.getObject();
+            // only dictionaries are of interest
+            if (resolved instanceof COSDictionary)
+            {
+                COSDictionary dictionary = (COSDictionary) resolved;
+                if (isCatalog(dictionary))
+                {
+                    // document catalog
+                    bestRoot = compareCOSObjects(candidate, found.getValue(), bestRoot);
+                }
+                else if (isInfo(dictionary))
+                {
+                    // info dictionary
+                    bestInfo = compareCOSObjects(candidate, found.getValue(), bestInfo);
+                }
+                // encryption dictionary, if existing, is lost
+                // We can't run "Algorithm 2" from PDF specification because of missing ID
+            }
+        }
+        boolean rootFound = bestRoot != null;
+        if (rootFound)
+        {
+            trailer.setItem(COSName.ROOT, bestRoot);
+        }
+        if (bestInfo != null)
+        {
+            trailer.setItem(COSName.INFO, bestInfo);
+        }
+        return rootFound;
+    }
+
+    /**
+     * Decides which of two objects is most likely the newer one.
+     *
+     * @param newObject the newly found object
+     * @param newOffset the offset of the newly found object
+     * @param currentObject the object chosen so far, may be null
+     * @return the object to be used
+     */
+    private COSObject compareCOSObjects(COSObject newObject, Long newOffset,
+            COSObject currentObject)
+    {
+        // nothing to compare with
+        if (currentObject == null || currentObject.getKey() == null)
+        {
+            return newObject;
+        }
+        COSObjectKey currentKey = currentObject.getKey();
+        COSObjectKey newKey = newObject.getKey();
+        // check if the current object is an updated version of the previous found object
+        if (currentKey.getNumber() == newKey.getNumber())
+        {
+            if (currentKey.getGeneration() < newKey.getGeneration())
+            {
+                return newObject;
+            }
+            return currentObject;
+        }
+        // most likely the object with the bigger offset is the newer one
+        Long currentOffset = document.getXrefTable().get(currentKey);
+        if (currentOffset != null && newOffset > currentOffset)
+        {
+            return newObject;
+        }
+        return currentObject;
+    }
+
+    /**
+     * Brute force search for the last EOF marker. An EOF marker only counts if the data following
+     * it can't be read as the start of an xref table or of an object. The origin position of the
+     * source is reestablished at the end.
+     *
+     * @return the offset of the last EOF marker or Long.MAX_VALUE if there isn't any
+     * @throws IOException if something went wrong
+     */
+    private long bfSearchForLastEOFMarker() throws IOException
+    {
+        long originOffset = source.getPosition();
+        long lastMarker = -1;
+        source.seek(MINIMUM_SEARCH_OFFSET);
+        for (long marker = findString(EOF_MARKER); marker != -1; marker = findString(EOF_MARKER))
+        {
+            try
+            {
+                // check if the following data is some valid pdf content
+                // which most likely indicates that the pdf is linearized,
+                // updated or just cut off somewhere in the middle
+                skipSpaces();
+                if (!isString(XREF_TABLE))
+                {
+                    readObjectNumber();
+                    readGenerationNumber();
+                }
+            }
+            catch (IOException exception)
+            {
+                // save the EOF marker as the following data is most likely some garbage
+                LOG.debug("An exception occurred during brute force for last EOF - ignoring",
+                        exception);
+                lastMarker = marker;
+            }
+        }
+        source.seek(originOffset);
+        // no EOF marker found -> Long.MAX_VALUE
+        return lastMarker == -1 ? Long.MAX_VALUE : lastMarker;
+    }
+
+ /**
+ * Search for all offsets of object streams within the given pdf
+ *
+ * For every "/ObjStm" marker found the data in front of it is searched for " obj", going
+ * backwards in steps of 10 bytes (39 steps at most). If " obj" is preceded by a digit, a space
+ * and a run of digits, the object number and the generation number are read and stored using
+ * the offset of the object number as key. The position of the source isn't reestablished.
+ *
+ * @return a map of all offsets for object streams, mapping the offset to the object key
+ * @throws IOException if something went wrong
+ */
+ private Map<Long, COSObjectKey> bfSearchForObjStreamOffsets() throws IOException
+ {
+ HashMap<Long, COSObjectKey> bfSearchObjStreamsOffsets = new HashMap<>();
+ source.seek(MINIMUM_SEARCH_OFFSET);
+ char[] string = " obj".toCharArray();
+ // search for object stream marker
+ long positionObjStream = findString(OBJ_STREAM);
+ while (positionObjStream != -1)
+ {
+ // search backwards for the beginning of the object, 10 bytes per step
+ long newOffset = -1;
+ boolean objFound = false;
+ for (int i = 1; i < 40 && !objFound; i++)
+ {
+ long currentOffset = positionObjStream - (i * 10);
+ if (currentOffset > 0)
+ {
+ source.seek(currentOffset);
+ for (int j = 0; j < 10; j++)
+ {
+ if (isString(string))
+ {
+ long tempOffset = currentOffset - 1;
+ source.seek(tempOffset);
+ int genID = source.peek();
+ // is the char in front of " obj" a digit, i.e. the end of a generation number?
+ if (isDigit(genID))
+ {
+ tempOffset--;
+ source.seek(tempOffset);
+ if (isSpace())
+ {
+ int length = 0;
+ source.seek(--tempOffset);
+ while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
+ {
+ source.seek(--tempOffset);
+ length++;
+ }
+ if (length > 0)
+ {
+ source.read();
+ newOffset = source.getPosition();
+ long objNumber = readObjectNumber();
+ int genNumber = readGenerationNumber();
+ COSObjectKey streamObjectKey = new COSObjectKey(objNumber,
+ genNumber);
+ bfSearchObjStreamsOffsets.put(newOffset, streamObjectKey);
+ }
+ }
+ }
+ LOG.debug("Dictionary start for object stream -> " + newOffset);
+ objFound = true;
+ break;
+ }
+ else
+ {
+ currentOffset++;
+ source.read();
+ }
+ }
+ }
+ }
+ source.seek(positionObjStream + OBJ_STREAM.length);
+ positionObjStream = findString(OBJ_STREAM);
+ }
+ return bfSearchObjStreamsOffsets;
+ }
+
+    /**
+     * Brute force search for all xref entries (tables). Only occurrences of "xref" preceded by a
+     * whitespace are taken into account. The position of the source isn't reestablished.
+     *
+     * @return the offsets of all xref tables found, never null
+     * @throws IOException if something went wrong
+     */
+    private List<Long> bfSearchForXRefTables() throws IOException
+    {
+        List<Long> tableOffsets = new ArrayList<>();
+        // a pdf may contain more than one xref entry
+        source.seek(MINIMUM_SEARCH_OFFSET);
+        // search for xref tables
+        for (long hit = findString(XREF_TABLE); hit != -1; hit = findString(XREF_TABLE))
+        {
+            // ensure that we don't read "startxref" instead of "xref"
+            source.seek(hit - 1);
+            if (isWhitespace())
+            {
+                tableOffsets.add(hit);
+            }
+            // go on right behind the marker
+            source.seek(hit + 4);
+        }
+        return tableOffsets;
+    }
+
+ /**
+ * Brute force search for all /XRef entries (streams).
+ *
+ * For every "/XRef" marker found the data in front of it is searched for " obj", going
+ * backwards in steps of 10 bytes (39 steps at most). If " obj" is preceded by a digit, a space
+ * and a run of digits, the offset of the first of those digits is added to the result. The
+ * position of the source isn't reestablished.
+ *
+ * @return the offsets of the objects containing the xref streams found, never null
+ * @throws IOException if something went wrong
+ */
+ private List<Long> bfSearchForXRefStreams() throws IOException
+ {
+ List<Long> bfSearchXRefStreamsOffsets = new ArrayList<>();
+ // a pdf may contain more than one /XRef entry
+ source.seek(MINIMUM_SEARCH_OFFSET);
+ // search for XRef streams
+ String objString = " obj";
+ char[] string = objString.toCharArray();
+ long xrefOffset = findString(XREF_STREAM);
+ while (xrefOffset != -1)
+ {
+ // search backwards for the beginning of the stream, 10 bytes per step
+ long newOffset = -1;
+ boolean objFound = false;
+ for (int i = 1; i < 40 && !objFound; i++)
+ {
+ long currentOffset = xrefOffset - (i * 10);
+ if (currentOffset > 0)
+ {
+ source.seek(currentOffset);
+ for (int j = 0; j < 10; j++)
+ {
+ if (isString(string))
+ {
+ long tempOffset = currentOffset - 1;
+ source.seek(tempOffset);
+ int genID = source.peek();
+ // is the char in front of " obj" a digit, i.e. the end of a generation number?
+ if (isDigit(genID))
+ {
+ tempOffset--;
+ source.seek(tempOffset);
+ if (isSpace())
+ {
+ int length = 0;
+ source.seek(--tempOffset);
+ while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
+ {
+ source.seek(--tempOffset);
+ length++;
+ }
+ if (length > 0)
+ {
+ source.read();
+ newOffset = source.getPosition();
+ }
+ }
+ }
+ LOG.debug("Fixed reference for xref stream " + xrefOffset + " -> "
+ + newOffset);
+ objFound = true;
+ break;
+ }
+ else
+ {
+ currentOffset++;
+ source.read();
+ }
+ }
+ }
+ }
+ if (newOffset > -1)
+ {
+ bfSearchXRefStreamsOffsets.add(newOffset);
+ }
+ source.seek(xrefOffset + 5);
+ xrefOffset = findString(XREF_STREAM);
+ }
+ return bfSearchXRefStreamsOffsets;
+ }
+
+    /**
+     * Tell if the dictionary is an info dictionary. A dictionary providing a parent, an action or
+     * a destination entry isn't accepted, otherwise at least one of the typical entries of an info
+     * dictionary has to be present.
+     *
+     * @param dictionary the dictionary to be checked
+     * @return true if the given dictionary is an info dictionary
+     */
+    private boolean isInfo(COSDictionary dictionary)
+    {
+        boolean excluded = dictionary.containsKey(COSName.PARENT)
+                || dictionary.containsKey(COSName.A)
+                || dictionary.containsKey(COSName.DEST);
+        if (excluded)
+        {
+            return false;
+        }
+        return dictionary.containsKey(COSName.MOD_DATE)
+                || dictionary.containsKey(COSName.TITLE)
+                || dictionary.containsKey(COSName.AUTHOR)
+                || dictionary.containsKey(COSName.SUBJECT)
+                || dictionary.containsKey(COSName.KEYWORDS)
+                || dictionary.containsKey(COSName.CREATOR)
+                || dictionary.containsKey(COSName.PRODUCER)
+                || dictionary.containsKey(COSName.CREATION_DATE);
+    }
+
+    /**
+     * Tell if the dictionary is a PDF or FDF catalog.
+     *
+     * @param dictionary the dictionary to be checked
+     * @return true if the given dictionary is a root dictionary
+     */
+    private boolean isCatalog(COSDictionary dictionary)
+    {
+        // PDF catalog
+        if (COSName.CATALOG.equals(dictionary.getCOSName(COSName.TYPE)))
+        {
+            return true;
+        }
+        // FDF catalog
+        return dictionary.containsKey(COSName.FDF);
+    }
+
+    /**
+     * Checks if the given string can be found at the current offset. The position of the source
+     * is the same before and after the check.
+     *
+     * @param string the bytes of the string to look for
+     * @return true if the bytes are in place, false if not
+     * @throws IOException if something went wrong
+     */
+    private boolean isString(char[] string) throws IOException
+    {
+        long startPosition = source.getPosition();
+        boolean matching = true;
+        // stop reading at the first mismatch
+        for (int i = 0; matching && i < string.length; i++)
+        {
+            matching = source.read() == string[i];
+        }
+        source.seek(startPosition);
+        return matching;
+    }
+
+    /**
+     * Search for the given string. The search starts at the current position and returns the start position if the
+     * string was found. -1 is returned if there isn't any further occurrence of the given string. After returning the
+     * current position is either the end of the string or the end of the input.
+     *
+     * An incomplete occurrence at the very end of the input doesn't count as a match. After a
+     * failed partial match the search restarts right behind the start of that partial match, so
+     * that overlapping occurrences such as "%%EOF" within "%%%EOF" aren't missed.
+     *
+     * @param string the string to be searched
+     * @return the start position of the found string or -1 if it wasn't found
+     * @throws IOException if something went wrong
+     */
+    private long findString(char[] string) throws IOException
+    {
+        long position = -1L;
+        int stringLength = string.length;
+        int counter = 0;
+        int readChar = source.read();
+        while (readChar != -1)
+        {
+            if (readChar == string[counter])
+            {
+                if (counter == 0)
+                {
+                    // remember the start of the potential match
+                    position = source.getPosition() - 1;
+                }
+                counter++;
+                if (counter == stringLength)
+                {
+                    return position;
+                }
+            }
+            else if (counter > 0)
+            {
+                // partial match failed, go on with the char following the start of that match
+                source.seek(position + 1);
+                counter = 0;
+                position = -1L;
+            }
+            readChar = source.read();
+        }
+        // end of input reached, a partial match at the end isn't an occurrence
+        return -1L;
+    }
+
+    /**
+     * Rebuild the trailer dictionary if startxref can't be found.
+     *
+     * The given trailer resolver is reset and filled with the objects found by the brute force
+     * search. The resulting trailer is set as trailer of the document and completed by searching
+     * for a trailer dictionary within the pdf or, if that fails, for a catalog and an info
+     * dictionary among the objects found. The object streams are searched in any case, either to
+     * find the root entry or after the decryption was prepared.
+     *
+     * @param trailerResolver the trailer resolver of the document
+     * @param securityHandler security handler to be used to decrypt encrypted documents
+     * @return the rebuild trailer dictionary
+     *
+     * @throws IOException if something went wrong
+     */
+    protected COSDictionary rebuildTrailer(XrefTrailerResolver trailerResolver,
+            SecurityHandler<? extends ProtectionPolicy> securityHandler) throws IOException
+    {
+        // update security handler
+        this.securityHandler = securityHandler;
+        // reset trailer resolver
+        trailerResolver.reset();
+        // use the found objects to rebuild the trailer resolver
+        trailerResolver.nextXrefObj(0, XRefType.TABLE);
+        // the entries have to be added to the given resolver and not to the resolver inherited
+        // from COSParser, as the trailer and the xref table are taken from the given one
+        bfSearchCOSObjectKeyOffsets.forEach(trailerResolver::setXRef);
+        trailerResolver.setStartxref(0);
+        COSDictionary trailer = trailerResolver.getTrailer();
+        document.setTrailer(trailer);
+        boolean searchForObjStreamsDone = false;
+        if (!bfSearchForTrailer(trailer) && !searchForTrailerItems(trailer))
+        {
+            // root entry wasn't found, maybe it is part of an object stream
+            // brute force search for all object streams.
+            bfSearchForObjStreams(trailerResolver, securityHandler);
+            searchForObjStreamsDone = true;
+            // search again for the root entry
+            searchForTrailerItems(trailer);
+        }
+        // prepare decryption if necessary
+        prepareDecryption();
+        if (!searchForObjStreamsDone)
+        {
+            // brute force search for all object streams.
+            bfSearchForObjStreams(trailerResolver, securityHandler);
+        }
+        return trailer;
+    }
+
+}
Propchange: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1900449&r1=1900448&r2=1900449&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Sun May 1 11:39:05 2022
@@ -21,7 +21,6 @@ import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.security.GeneralSecurityException;
import java.security.KeyStore;
-import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -29,7 +28,6 @@ import java.util.Map;
import java.util.Optional;
import java.util.Map.Entry;
import java.util.Set;
-import java.util.stream.Collectors;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -71,7 +69,6 @@ public class COSParser extends BaseParse
private static final String FDF_DEFAULT_VERSION = "1.0";
private static final char[] XREF_TABLE = new char[] { 'x', 'r', 'e', 'f' };
- private static final char[] XREF_STREAM = new char[] { '/', 'X', 'R', 'e', 'f' };
private static final char[] STARTXREF = new char[] { 's','t','a','r','t','x','r','e','f' };
private static final byte[] ENDSTREAM = new byte[] { E, N, D, S, T, R, E, A, M };
@@ -112,16 +109,6 @@ public class COSParser extends BaseParse
protected static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' };
/**
- * trailer-marker.
- */
- private static final char[] TRAILER_MARKER = new char[] { 't', 'r', 'a', 'i', 'l', 'e', 'r' };
-
- /**
- * ObjStream-marker.
- */
- private static final char[] OBJ_STREAM = new char[] { '/', 'O', 'b', 'j', 'S', 't', 'm' };
-
- /**
* file length.
*/
protected long fileLen;
@@ -134,13 +121,11 @@ public class COSParser extends BaseParse
protected boolean initialParseDone = false;
private boolean trailerWasRebuild = false;
- /**
- * Contains all found objects of a brute force search.
- */
- private Map<COSObjectKey, Long> bfSearchCOSObjectKeyOffsets = null;
- private boolean bruteForceSearchTriggered = false;
+
+ private BruteForceParser bruteForceParser = null;
+ private boolean bruteForceSearchSuccessful = false;
private PDEncryption encryption = null;
-
+
/**
* Intermediate cache. Contains all objects of already read compressed object streams. Objects are removed after
* dereferencing them.
@@ -266,15 +251,16 @@ public class COSParser extends BaseParse
}
if (rebuildTrailer)
{
- trailer = rebuildTrailer();
+ trailer = getBruteForceParser().rebuildTrailer(xrefTrailerResolver, securityHandler);
+ trailerWasRebuild = true;
}
else
{
// prepare decryption if necessary
prepareDecryption();
- if (bruteForceSearchTriggered && !getBFCOSObjectOffsets().isEmpty())
+ if (bruteForceSearchSuccessful)
{
- bfSearchForObjStreams();
+ getBruteForceParser().bfSearchForObjStreams(xrefTrailerResolver, securityHandler);
}
}
if (resetTrailerResolver())
@@ -412,7 +398,10 @@ public class COSParser extends BaseParse
document.setTrailer(trailer);
document.setIsXRefStream(XRefType.STREAM == xrefTrailerResolver.getXrefType());
// check the offsets of all referenced objects
- checkXrefOffsets();
+ if (isLenient)
+ {
+ checkXrefOffsets();
+ }
// copy xref table
document.addXRefTable(xrefTrailerResolver.getXrefTable());
@@ -651,8 +640,7 @@ public class COSParser extends BaseParse
// maybe something is wrong with the xref table -> perform brute force search for all objects
if (offsetOrObjstmObNr == null && isLenient)
{
- Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = getBFCOSObjectOffsets();
- offsetOrObjstmObNr = bfCOSObjectKeyOffsets.get(objKey);
+ offsetOrObjstmObNr = getBruteForceParser().getBFCOSObjectOffsets().get(objKey);
if (offsetOrObjstmObNr != null)
{
LOG.debug("Set missing offset " + offsetOrObjstmObNr + " for object " + objKey);
@@ -1160,8 +1148,8 @@ public class COSParser extends BaseParse
LOG.error("Invalid object offset " + objectOffset + " when searching for a xref table/stream");
return 0;
}
- // start a brute force search for all xref tables and try to find the offset we are looking for
- long newOffset = bfSearchForXRef(objectOffset);
+ // search for the offset of the given xref table/stream among those found by a brute force search.
+ long newOffset = getBruteForceParser().bfSearchForXRef(objectOffset);
if (newOffset > -1)
{
LOG.debug("Fixed reference for xref table/stream " + objectOffset + " -> " + newOffset);
@@ -1211,14 +1199,15 @@ public class COSParser extends BaseParse
if (!validKeys.contains(correctedKeyEntry.getValue()))
{
// Only replace entries, if the original entry does not point to a valid object
- correctedPointers.put(correctedKeyEntry.getValue(), xrefOffset.get(correctedKeyEntry.getKey()));
+ correctedPointers.put(correctedKeyEntry.getValue(),
+ xrefOffset.get(correctedKeyEntry.getKey()));
}
}
correctedKeys.entrySet().forEach(
// remove old invalid, as some might not be replaced
correctedKeyEntry -> xrefOffset.remove(correctedKeyEntry.getKey()));
- correctedPointers.entrySet().forEach(
- pointer -> xrefOffset.put(pointer.getKey(), pointer.getValue()));
+ correctedPointers.entrySet()
+ .forEach(pointer -> xrefOffset.put(pointer.getKey(), pointer.getValue()));
return true;
}
@@ -1229,15 +1218,11 @@ public class COSParser extends BaseParse
*/
private void checkXrefOffsets() throws IOException
{
- // repair mode isn't available in non-lenient mode
- if (!isLenient)
- {
- return;
- }
Map<COSObjectKey, Long> xrefOffset = xrefTrailerResolver.getXrefTable();
if (!validateXrefOffsets(xrefOffset))
{
- Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = getBFCOSObjectOffsets();
+ Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = getBruteForceParser()
+ .getBFCOSObjectOffsets();
if (!bfCOSObjectKeyOffsets.isEmpty())
{
LOG.debug("Replaced read xref table with the results of a brute force search");
@@ -1258,7 +1243,8 @@ public class COSParser extends BaseParse
*
* @throws IOException if something went wrong
*/
- private COSObjectKey findObjectKey(COSObjectKey objectKey, long offset, Map<COSObjectKey, Long> xrefOffset) throws IOException
+ private COSObjectKey findObjectKey(COSObjectKey objectKey, long offset,
+ Map<COSObjectKey, Long> xrefOffset) throws IOException
{
// there can't be any object at the very beginning of a pdf
if (offset < MINIMUM_SEARCH_OFFSET)
@@ -1340,661 +1326,17 @@ public class COSParser extends BaseParse
return null;
}
- private Map<COSObjectKey, Long> getBFCOSObjectOffsets() throws IOException
- {
- if (bfSearchCOSObjectKeyOffsets == null)
- {
- bfSearchCOSObjectKeyOffsets = bfSearchForObjects();
- bruteForceSearchTriggered = true;
- }
- return bfSearchCOSObjectKeyOffsets;
- }
-
- /**
- * Brute force search for every object in the pdf.
- *
- * @throws IOException if something went wrong
- */
- private Map<COSObjectKey, Long> bfSearchForObjects() throws IOException
- {
- Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = new HashMap<>();
- long lastEOFMarker = bfSearchForLastEOFMarker();
- long originOffset = source.getPosition();
- long currentOffset = MINIMUM_SEARCH_OFFSET;
- long lastObjectId = Long.MIN_VALUE;
- int lastGenID = Integer.MIN_VALUE;
- long lastObjOffset = Long.MIN_VALUE;
- char[] endobjString = "ndo".toCharArray();
- char[] endobjRemainingString = "bj".toCharArray();
- boolean endOfObjFound = false;
- do
- {
- source.seek(currentOffset);
- int nextChar = source.read();
- currentOffset++;
- if (isWhitespace(nextChar) && isString(OBJ_MARKER))
- {
- long tempOffset = currentOffset - 2;
- source.seek(tempOffset);
- int genID = source.peek();
- // is the next char a digit?
- if (isDigit(genID))
- {
- genID -= 48;
- tempOffset--;
- source.seek(tempOffset);
- if (isWhitespace())
- {
- while (tempOffset > MINIMUM_SEARCH_OFFSET && isWhitespace())
- {
- source.seek(--tempOffset);
- }
- boolean objectIDFound = false;
- while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
- {
- source.seek(--tempOffset);
- objectIDFound = true;
- }
- if (objectIDFound)
- {
- source.read();
- long objectId = readObjectNumber();
- if (lastObjOffset > 0)
- {
- // add the former object ID only if there was a subsequent object ID
- bfCOSObjectKeyOffsets.put(
- new COSObjectKey(lastObjectId, lastGenID), lastObjOffset);
- }
- lastObjectId = objectId;
- lastGenID = genID;
- lastObjOffset = tempOffset + 1;
- currentOffset += OBJ_MARKER.length - 1;
- endOfObjFound = false;
- }
- }
- }
- }
- // check for "endo" as abbreviation for "endobj", as the pdf may be cut off
- // in the middle of the keyword, see PDFBOX-3936.
- // We could possibly implement a more intelligent algorithm if necessary
- else if (nextChar == 'e' && isString(endobjString))
- {
- currentOffset += endobjString.length;
- source.seek(currentOffset);
- if (source.isEOF())
- {
- endOfObjFound = true;
- }
- else if (isString(endobjRemainingString))
- {
- currentOffset += endobjRemainingString.length;
- endOfObjFound = true;
- }
- }
- } while (currentOffset < lastEOFMarker && !source.isEOF());
- if ((lastEOFMarker < Long.MAX_VALUE || endOfObjFound) && lastObjOffset > 0)
- {
- // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
- // the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
- bfCOSObjectKeyOffsets.put(new COSObjectKey(lastObjectId, lastGenID),
- lastObjOffset);
- }
- // reestablish origin position
- source.seek(originOffset);
- return bfCOSObjectKeyOffsets;
- }
-
- /**
- * Search for the offset of the given xref table/stream among those found by a brute force search.
- *
- * @return the offset of the xref entry
- * @throws IOException if something went wrong
- */
- private long bfSearchForXRef(long xrefOffset) throws IOException
- {
- long newOffset = -1;
-
- // initialize bfSearchXRefTablesOffsets -> not null
- List<Long> bfSearchXRefTablesOffsets = bfSearchForXRefTables();
- // initialize bfSearchXRefStreamsOffsets -> not null
- List<Long> bfSearchXRefStreamsOffsets = bfSearchForXRefStreams();
-
- // TODO to be optimized, this won't work in every case
- long newOffsetTable = searchNearestValue(bfSearchXRefTablesOffsets, xrefOffset);
-
- // TODO to be optimized, this won't work in every case
- long newOffsetStream = searchNearestValue(bfSearchXRefStreamsOffsets, xrefOffset);
-
- // choose the nearest value
- if (newOffsetTable > -1 && newOffsetStream > -1)
- {
- long differenceTable = xrefOffset - newOffsetTable;
- long differenceStream = xrefOffset - newOffsetStream;
- if (Math.abs(differenceTable) > Math.abs(differenceStream))
- {
- newOffset = newOffsetStream;
- bfSearchXRefStreamsOffsets.remove(newOffsetStream);
- }
- else
- {
- newOffset = newOffsetTable;
- bfSearchXRefTablesOffsets.remove(newOffsetTable);
- }
- }
- else if (newOffsetTable > -1)
- {
- newOffset = newOffsetTable;
- bfSearchXRefTablesOffsets.remove(newOffsetTable);
- }
- else if (newOffsetStream > -1)
- {
- newOffset = newOffsetStream;
- bfSearchXRefStreamsOffsets.remove(newOffsetStream);
- }
- return newOffset;
- }
-
- private long searchNearestValue(List<Long> values, long offset)
- {
- long newValue = -1;
- Long currentDifference = null;
- int currentOffsetIndex = -1;
- int numberOfOffsets = values.size();
- // find the nearest value
- for (int i = 0; i < numberOfOffsets; i++)
- {
- long newDifference = offset - values.get(i);
- // find the nearest offset
- if (currentDifference == null
- || (Math.abs(currentDifference) > Math.abs(newDifference)))
- {
- currentDifference = newDifference;
- currentOffsetIndex = i;
- }
- }
- if (currentOffsetIndex > -1)
- {
- newValue = values.get(currentOffsetIndex);
- }
- return newValue;
- }
-
- /**
- * Brute force search for all trailer marker.
- *
- * @throws IOException if something went wrong
- */
- private boolean bfSearchForTrailer(COSDictionary trailer) throws IOException
- {
- long originOffset = source.getPosition();
- source.seek(MINIMUM_SEARCH_OFFSET);
- // search for trailer marker
- long trailerOffset = findString(TRAILER_MARKER);
- while (trailerOffset != -1)
- {
- try
- {
- boolean rootFound = false;
- boolean infoFound = false;
- skipSpaces();
- COSDictionary trailerDict = parseCOSDictionary(true);
- COSObject rootObj = trailerDict.getCOSObject(COSName.ROOT);
- if (rootObj != null)
- {
- // check if the dictionary can be dereferenced and is the one we are looking for
- COSBase rootDict = rootObj.getObject();
- if (rootDict instanceof COSDictionary && isCatalog((COSDictionary) rootDict))
- {
- rootFound = true;
- }
- }
- COSObject infoObj = trailerDict.getCOSObject(COSName.INFO);
- if (infoObj != null)
- {
- // check if the dictionary can be dereferenced and is the one we are looking for
- COSBase infoDict = infoObj.getObject();
- if (infoDict instanceof COSDictionary && isInfo((COSDictionary) infoDict))
- {
- infoFound = true;
- }
- }
- if (rootFound && infoFound)
- {
- trailer.setItem(COSName.ROOT, rootObj);
- trailer.setItem(COSName.INFO, infoObj);
- if (trailerDict.containsKey(COSName.ENCRYPT))
- {
- COSObject encObj = trailerDict.getCOSObject(COSName.ENCRYPT);
- // check if the dictionary can be dereferenced
- // TODO check if the dictionary is an encryption dictionary?
- if (encObj != null && encObj.getObject() instanceof COSDictionary)
- {
- trailer.setItem(COSName.ENCRYPT, encObj);
- }
- }
- if (trailerDict.containsKey(COSName.ID))
- {
- COSBase idObj = trailerDict.getItem(COSName.ID);
- if (idObj instanceof COSArray)
- {
- trailer.setItem(COSName.ID, idObj);
- }
- }
- return true;
- }
- }
- catch (IOException exception)
- {
- LOG.debug("An exception occurred during brute force search for trailer - ignoring",
- exception);
- }
- trailerOffset = findString(TRAILER_MARKER);
- }
- source.seek(originOffset);
- return false;
- }
-
- /**
- * Brute force search for the last EOF marker.
- *
- * @throws IOException if something went wrong
- */
- private long bfSearchForLastEOFMarker() throws IOException
- {
- long lastEOFMarker = -1;
- long originOffset = source.getPosition();
- source.seek(MINIMUM_SEARCH_OFFSET);
- long tempMarker = findString(EOF_MARKER);
- while (tempMarker != -1)
- {
- try
- {
- // check if the following data is some valid pdf content
- // which most likely indicates that the pdf is linearized,
- // updated or just cut off somewhere in the middle
- skipSpaces();
- if (!isString(XREF_TABLE))
- {
- readObjectNumber();
- readGenerationNumber();
- }
- }
- catch (IOException exception)
- {
- // save the EOF marker as the following data is most likely some garbage
- LOG.debug("An exception occurred during brute force for last EOF - ignoring",
- exception);
- lastEOFMarker = tempMarker;
- }
- tempMarker = findString(EOF_MARKER);
- }
- source.seek(originOffset);
- // no EOF marker found
- if (lastEOFMarker == -1)
- {
- lastEOFMarker = Long.MAX_VALUE;
- }
- return lastEOFMarker;
- }
-
- /**
- * Brute force search for all object streams.
- *
- * @throws IOException if something went wrong
- */
- private void bfSearchForObjStreams() throws IOException
- {
- // save origin offset
- long originOffset = source.getPosition();
-
- Map<Long, COSObjectKey> bfSearchForObjStreamOffsets = bfSearchForObjStreamOffsets();
- Map<COSObjectKey, Long> bfCOSObjectOffsets = getBFCOSObjectOffsets();
- // log warning about skipped stream
- bfSearchForObjStreamOffsets.entrySet().stream() //
- .filter(o -> bfCOSObjectOffsets.get(o.getValue()) == null) //
- .forEach(o -> LOG.warn(
- "Skipped incomplete object stream:" + o.getValue() + " at " + o.getKey()));
-
- // collect all stream offsets
- List<Long> objStreamOffsets = bfSearchForObjStreamOffsets.entrySet().stream() //
- .filter(o -> bfCOSObjectOffsets.get(o.getValue()) != null) //
- .filter(o -> o.getKey().equals(bfCOSObjectOffsets.get(o.getValue()))) //
- .map(Map.Entry::getKey) //
- .collect(Collectors.toList());
- // add all found compressed objects to the brute force search result
- for (Long offset : objStreamOffsets)
- {
- source.seek(offset);
- long stmObjNumber = readObjectNumber();
- int stmGenNumber = readGenerationNumber();
- readExpectedString(OBJ_MARKER, true);
- COSStream stream = null;
- try
- {
- COSDictionary dict = parseCOSDictionary(false);
- stream = parseCOSStream(dict);
- if (securityHandler != null)
- {
- securityHandler.decryptStream(stream, stmObjNumber, stmGenNumber);
- }
- PDFObjectStreamParser objStreamParser = new PDFObjectStreamParser(stream, document);
- Map<Long, Integer> objectNumbers = objStreamParser.readObjectNumbers();
- Map<COSObjectKey, Long> xrefOffset = xrefTrailerResolver.getXrefTable();
- for (Long objNumber : objectNumbers.keySet())
- {
- COSObjectKey objKey = new COSObjectKey(objNumber, 0);
- Long existingOffset = bfCOSObjectOffsets.get(objKey);
- if (existingOffset != null && existingOffset < 0)
- {
- // translate stream object key to its offset
- COSObjectKey objStmKey = new COSObjectKey(Math.abs(existingOffset), 0);
- existingOffset = bfCOSObjectOffsets.get(objStmKey);
- }
- if (existingOffset == null || offset > existingOffset)
- {
- bfCOSObjectOffsets.put(objKey, -stmObjNumber);
- xrefOffset.put(objKey, -stmObjNumber);
- }
- }
- }
- catch (IOException exception)
- {
- LOG.debug("Skipped corrupt stream: (" + stmObjNumber + " 0 at offset " + offset,
- exception);
- }
- finally
- {
- if (stream != null)
- {
- stream.close();
- }
- }
- }
- // restore origin offset
- source.seek(originOffset);
- }
-
- /**
- * Search for all offsets of object streams within the given pdf
- *
- * @return a map of all offsets for object streams
- * @throws IOException if something went wrong
- */
- private Map<Long, COSObjectKey> bfSearchForObjStreamOffsets() throws IOException
+ private BruteForceParser getBruteForceParser() throws IOException
{
- HashMap<Long, COSObjectKey> bfSearchObjStreamsOffsets = new HashMap<>();
- source.seek(MINIMUM_SEARCH_OFFSET);
- char[] string = " obj".toCharArray();
- // search for object stream marker
- long positionObjStream = findString(OBJ_STREAM);
- while (positionObjStream != -1)
- {
- // search backwards for the beginning of the object
- long newOffset = -1;
- boolean objFound = false;
- for (int i = 1; i < 40 && !objFound; i++)
- {
- long currentOffset = positionObjStream - (i * 10);
- if (currentOffset > 0)
- {
- source.seek(currentOffset);
- for (int j = 0; j < 10; j++)
- {
- if (isString(string))
- {
- long tempOffset = currentOffset - 1;
- source.seek(tempOffset);
- int genID = source.peek();
- // is the next char a digit?
- if (isDigit(genID))
- {
- tempOffset--;
- source.seek(tempOffset);
- if (isSpace())
- {
- int length = 0;
- source.seek(--tempOffset);
- while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
- {
- source.seek(--tempOffset);
- length++;
- }
- if (length > 0)
- {
- source.read();
- newOffset = source.getPosition();
- long objNumber = readObjectNumber();
- int genNumber = readGenerationNumber();
- COSObjectKey streamObjectKey = new COSObjectKey(objNumber,
- genNumber);
- bfSearchObjStreamsOffsets.put(newOffset, streamObjectKey);
- }
- }
- }
- LOG.debug("Dictionary start for object stream -> " + newOffset);
- objFound = true;
- break;
- }
- else
- {
- currentOffset++;
- source.read();
- }
- }
- }
- }
- source.seek(positionObjStream + OBJ_STREAM.length);
- positionObjStream = findString(OBJ_STREAM);
- }
- return bfSearchObjStreamsOffsets;
- }
-
- /**
- * Brute force search for all xref entries (tables).
- *
- * @throws IOException if something went wrong
- */
- private List<Long> bfSearchForXRefTables() throws IOException
- {
- List<Long> bfSearchXRefTablesOffsets = new ArrayList<>();
- // a pdf may contain more than one xref entry
- source.seek(MINIMUM_SEARCH_OFFSET);
- // search for xref tables
- long newOffset = findString(XREF_TABLE);
- while (newOffset != -1)
- {
- source.seek(newOffset - 1);
- // ensure that we don't read "startxref" instead of "xref"
- if (isWhitespace())
- {
- bfSearchXRefTablesOffsets.add(newOffset);
- }
- source.seek(newOffset + 4);
- newOffset = findString(XREF_TABLE);
- }
- return bfSearchXRefTablesOffsets;
- }
-
- /**
- * Brute force search for all /XRef entries (streams).
- *
- * @throws IOException if something went wrong
- */
- private List<Long> bfSearchForXRefStreams() throws IOException
- {
- List<Long> bfSearchXRefStreamsOffsets = new ArrayList<>();
- // a pdf may contain more than one /XRef entry
- source.seek(MINIMUM_SEARCH_OFFSET);
- // search for XRef streams
- String objString = " obj";
- char[] string = objString.toCharArray();
- long xrefOffset = findString(XREF_STREAM);
- while (xrefOffset != -1)
- {
- // search backwards for the beginning of the stream
- long newOffset = -1;
- boolean objFound = false;
- for (int i = 1; i < 40 && !objFound; i++)
- {
- long currentOffset = xrefOffset - (i * 10);
- if (currentOffset > 0)
- {
- source.seek(currentOffset);
- for (int j = 0; j < 10; j++)
- {
- if (isString(string))
- {
- long tempOffset = currentOffset - 1;
- source.seek(tempOffset);
- int genID = source.peek();
- // is the next char a digit?
- if (isDigit(genID))
- {
- tempOffset--;
- source.seek(tempOffset);
- if (isSpace())
- {
- int length = 0;
- source.seek(--tempOffset);
- while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
- {
- source.seek(--tempOffset);
- length++;
- }
- if (length > 0)
- {
- source.read();
- newOffset = source.getPosition();
- }
- }
- }
- LOG.debug("Fixed reference for xref stream " + xrefOffset + " -> "
- + newOffset);
- objFound = true;
- break;
- }
- else
- {
- currentOffset++;
- source.read();
- }
- }
- }
- }
- if (newOffset > -1)
- {
- bfSearchXRefStreamsOffsets.add(newOffset);
- }
- source.seek(xrefOffset + 5);
- xrefOffset = findString(XREF_STREAM);
- }
- return bfSearchXRefStreamsOffsets;
+ if (bruteForceParser == null)
+ {
+ bruteForceParser = new BruteForceParser(source, document);
+ bruteForceSearchSuccessful = !bruteForceParser.getBFCOSObjectOffsets().isEmpty();
+ }
+ return bruteForceParser;
}
/**
- * Rebuild the trailer dictionary if startxref can't be found.
- *
- * @return the rebuild trailer dictionary
- *
- * @throws IOException if something went wrong
- */
- private COSDictionary rebuildTrailer() throws IOException
- {
- Map<COSObjectKey, Long> bfCOSObjectOffsets = getBFCOSObjectOffsets();
- // reset trailer resolver
- xrefTrailerResolver.reset();
- // use the found objects to rebuild the trailer resolver
- xrefTrailerResolver.nextXrefObj(0, XRefType.TABLE);
- bfCOSObjectOffsets.forEach(xrefTrailerResolver::setXRef);
- xrefTrailerResolver.setStartxref(0);
- COSDictionary trailer = xrefTrailerResolver.getTrailer();
- document.setTrailer(trailer);
- boolean searchForObjStreamsDone = false;
- if (!bfSearchForTrailer(trailer) && !searchForTrailerItems(trailer))
- {
- // root entry wasn't found, maybe it is part of an object stream
- bfSearchForObjStreams();
- searchForObjStreamsDone = true;
- // search again for the root entry
- searchForTrailerItems(trailer);
- }
- // prepare decryption if necessary
- prepareDecryption();
- if (!searchForObjStreamsDone)
- {
- bfSearchForObjStreams();
- }
- trailerWasRebuild = true;
- return trailer;
- }
-
- /**
- * Search for the different parts of the trailer dictionary.
- *
- * @param trailer
- * @return true if the root was found, false if not.
- * @throws java.io.IOException if the page tree root is null
- */
- private boolean searchForTrailerItems(COSDictionary trailer) throws IOException
- {
- COSObject rootObject = null;
- COSObject infoObject = null;
- for (Entry<COSObjectKey, Long> entrySet : getBFCOSObjectOffsets().entrySet())
- {
- COSObjectKey currentKey = entrySet.getKey();
- COSObject cosObject = document.getObjectFromPool(currentKey);
- COSBase baseObject = cosObject.getObject();
-
- if (!(baseObject instanceof COSDictionary))
- {
- continue;
- }
- COSDictionary dictionary = (COSDictionary) baseObject;
- // document catalog
- if (isCatalog(dictionary))
- {
- rootObject = compareCOSObjects(cosObject, entrySet.getValue(), rootObject);
- }
- // info dictionary
- else if (isInfo(dictionary))
- {
- infoObject = compareCOSObjects(cosObject, entrySet.getValue(), infoObject);
- }
- // encryption dictionary, if existing, is lost
- // We can't run "Algorithm 2" from PDF specification because of missing ID
- }
- if (rootObject != null)
- {
- trailer.setItem(COSName.ROOT, rootObject);
- }
- if (infoObject != null)
- {
- trailer.setItem(COSName.INFO, infoObject);
- }
- return rootObject != null;
- }
-
- private COSObject compareCOSObjects(COSObject newObject, Long newOffset,
- COSObject currentObject)
- {
- if (currentObject != null && currentObject.getKey() != null)
- {
- COSObjectKey currentKey = currentObject.getKey();
- COSObjectKey newKey = newObject.getKey();
- // check if the current object is an updated version of the previous found object
- if (currentKey.getNumber() == newKey.getNumber())
- {
- return currentKey.getGeneration() < newKey.getGeneration() ? newObject
- : currentObject;
- }
- // most likely the object with the bigger offset is the newer one
- Long currentOffset = document.getXrefTable().get(currentKey);
- return currentOffset != null && newOffset > currentOffset ? newObject : currentObject;
- }
- return newObject;
- }
-
- /**
* Check if all entries of the pages dictionary are present. Those which can't be dereferenced are removed.
*
* @param root the root dictionary of the pdf
@@ -2064,43 +1406,6 @@ public class COSParser extends BaseParse
}
/**
- * Tell if the dictionary is a PDF catalog. Override this for an FDF catalog.
- *
- * @param dictionary
- * @return true if the given dictionary is a root dictionary
- */
- protected boolean isCatalog(COSDictionary dictionary)
- {
- return COSName.CATALOG.equals(dictionary.getCOSName(COSName.TYPE));
- }
-
- /**
- * Tell if the dictionary is an info dictionary.
- *
- * @param dictionary
- * @return true if the given dictionary is an info dictionary
- */
- private boolean isInfo(COSDictionary dictionary)
- {
- if (dictionary.containsKey(COSName.PARENT) || dictionary.containsKey(COSName.A)
- || dictionary.containsKey(COSName.DEST))
- {
- return false;
- }
- if (!dictionary.containsKey(COSName.MOD_DATE) && !dictionary.containsKey(COSName.TITLE)
- && !dictionary.containsKey(COSName.AUTHOR)
- && !dictionary.containsKey(COSName.SUBJECT)
- && !dictionary.containsKey(COSName.KEYWORDS)
- && !dictionary.containsKey(COSName.CREATOR)
- && !dictionary.containsKey(COSName.PRODUCER)
- && !dictionary.containsKey(COSName.CREATION_DATE))
- {
- return false;
- }
- return true;
- }
-
- /**
* This will parse the startxref section from the stream. The startxref value is ignored.
*
* @return the startxref value or -1 on parsing error
@@ -2166,46 +1471,6 @@ public class COSParser extends BaseParse
}
/**
- * Search for the given string. The search starts at the current position and returns the start position if the
- * string was found. -1 is returned if there isn't any further occurrence of the given string. After returning the
- * current position is either the end of the string or the end of the input.
- *
- * @param string the string to be searched
- * @return the start position of the found string
- * @throws IOException if something went wrong
- */
- private long findString(char[] string) throws IOException
- {
- long position = -1L;
- int stringLength = string.length;
- int counter = 0;
- int readChar = source.read();
- while (readChar != -1)
- {
- if (readChar == string[counter])
- {
- if (counter == 0)
- {
- position = source.getPosition() - 1;
- }
- counter++;
- if (counter == stringLength)
- {
- return position;
- }
- }
- else if (counter > 0)
- {
- counter = 0;
- position = -1L;
- continue;
- }
- readChar = source.read();
- }
- return position;
- }
-
- /**
* This will parse the trailer from the stream and add it to the state.
*
* @return false on parsing error
@@ -2524,7 +1789,7 @@ public class COSParser extends BaseParse
*
* @throws IOException If there is an error getting the document.
*/
- public PDEncryption getEncryption() throws IOException
+ protected PDEncryption getEncryption() throws IOException
{
if (document == null)
{
@@ -2541,7 +1806,7 @@ public class COSParser extends BaseParse
*
* @throws IOException If there is an error getting the document.
*/
- public AccessPermission getAccessPermission() throws IOException
+ protected AccessPermission getAccessPermission() throws IOException
{
if (document == null)
{
@@ -2557,7 +1822,7 @@ public class COSParser extends BaseParse
* @throws InvalidPasswordException If the password is incorrect.
* @throws IOException if something went wrong
*/
- private void prepareDecryption() throws IOException
+ protected void prepareDecryption() throws IOException
{
if (encryption != null)
{
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/FDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/FDFParser.java?rev=1900449&r1=1900448&r2=1900449&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/FDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/FDFParser.java Sun May 1 11:39:05 2022
@@ -44,18 +44,6 @@ public class FDFParser extends COSParser
init();
}
- /**
- * Tell if the dictionary is a FDF catalog.
- *
- * @param dictionary
- * @return true if the dictionary is a FDF catalog
- */
- @Override
- protected final boolean isCatalog(COSDictionary dictionary)
- {
- return dictionary.containsKey(COSName.FDF);
- }
-
private void init()
{
String eofLookupRangeStr = System.getProperty(SYSPROP_EOFLOOKUPRANGE);