You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2022/05/01 11:39:05 UTC

svn commit: r1900449 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser: BruteForceParser.java COSParser.java FDFParser.java

Author: lehmi
Date: Sun May  1 11:39:05 2022
New Revision: 1900449

URL: http://svn.apache.org/viewvc?rev=1900449&view=rev
Log:
PDFBOX-5031: separate brute force parser into its own class

Added:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java   (with props)
Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/FDFParser.java

Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java?rev=1900449&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java (added)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java Sun May  1 11:39:05 2022
@@ -0,0 +1,858 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.pdfparser;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.stream.Collectors;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.cos.COSObjectKey;
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.io.RandomAccessRead;
+import org.apache.pdfbox.pdfparser.XrefTrailerResolver.XRefType;
+import org.apache.pdfbox.pdmodel.encryption.ProtectionPolicy;
+import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
+
+/**
+ * Brute force parser to be used as last resort if a malformed pdf can't be read.
+ */
+public class BruteForceParser extends COSParser
+{
+    /**
+     * Keyword introducing a cross reference table.
+     */
+    private static final char[] XREF_TABLE = new char[] { 'x', 'r', 'e', 'f' };
+    /**
+     * Name used as marker when searching for cross reference streams.
+     */
+    private static final char[] XREF_STREAM = new char[] { '/', 'X', 'R', 'e', 'f' };
+
+    /**
+     * Offset at which every brute force scan of the input starts. It is also used as lower bound when scanning
+     * backwards for an object number.
+     */
+    private static final long MINIMUM_SEARCH_OFFSET = 6;
+
+    /**
+     * EOF-marker.
+     */
+    private static final char[] EOF_MARKER = new char[] { '%', '%', 'E', 'O', 'F' };
+    /**
+     * obj-marker.
+     */
+    private static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' };
+
+    /**
+     * trailer-marker.
+     */
+    private static final char[] TRAILER_MARKER = new char[] { 't', 'r', 'a', 'i', 'l', 'e', 'r' };
+
+    /**
+     * ObjStream-marker.
+     */
+    private static final char[] OBJ_STREAM = new char[] { '/', 'O', 'b', 'j', 'S', 't', 'm' };
+
+    // use this class as logging category so that messages are attributed to the brute force parser
+    private static final Log LOG = LogFactory.getLog(BruteForceParser.class);
+
+    /**
+     * Contains all found objects of a brute force search. Compressed objects are stored with the negated object
+     * number of the object stream containing them instead of a file offset.
+     */
+    private final Map<COSObjectKey, Long> bfSearchCOSObjectKeyOffsets = new HashMap<>();
+
+    /**
+     * Constructor. Triggers a brute force search for all objects of the document.
+     *
+     * @param source input representing the pdf.
+     * @param document the corresponding COS document
+     * @throws IOException if something went wrong, e.g. while searching the input for objects
+     */
+    public BruteForceParser(RandomAccessRead source, COSDocument document) throws IOException
+    {
+        super(source);
+        this.document = document;
+        // scan the input once, the result is kept in bfSearchCOSObjectKeyOffsets
+        bfSearchForObjects();
+    }
+
+    /**
+     * Returns all found objects of a brute force search.
+     * <p>
+     * The returned map is the internal map of this parser, not a copy. Values are the offsets of the objects within
+     * the input. After {@link #bfSearchForObjStreams} has been run, compressed objects are mapped to the negated
+     * object number of the object stream containing them.
+     * 
+     * @return map containing all found objects of a brute force search
+     */
+    protected Map<COSObjectKey, Long> getBFCOSObjectOffsets()
+    {
+        return bfSearchCOSObjectKeyOffsets;
+    }
+
+    /**
+     * Brute force search for every object in the pdf.
+     * <p>
+     * The input is scanned character by character, starting at MINIMUM_SEARCH_OFFSET, for the pattern
+     * "&lt;object number&gt; &lt;generation digit&gt; obj". Every hit is stored in bfSearchCOSObjectKeyOffsets
+     * together with the offset of the first digit of its object number. The scan stops at the last EOF marker or at
+     * the end of the input. The position of the source is restored afterwards.
+     * 
+     * @throws IOException if something went wrong
+     */
+    private void bfSearchForObjects() throws IOException
+    {
+        // Long.MAX_VALUE if there isn't any usable EOF marker
+        long lastEOFMarker = bfSearchForLastEOFMarker();
+        long originOffset = source.getPosition();
+        long currentOffset = MINIMUM_SEARCH_OFFSET;
+        // the most recently found object, it is stored as soon as the next object or the end of the scan is reached
+        long lastObjectId = Long.MIN_VALUE;
+        int lastGenID = Integer.MIN_VALUE;
+        long lastObjOffset = Long.MIN_VALUE;
+        // "endobj" without the leading 'e' is split into two parts to detect a truncated keyword
+        char[] endobjString = "ndo".toCharArray();
+        char[] endobjRemainingString = "bj".toCharArray();
+        boolean endOfObjFound = false;
+        do
+        {
+            source.seek(currentOffset);
+            int nextChar = source.read();
+            currentOffset++;
+            // a whitespace followed by "obj" is a candidate for the end of an object header
+            if (isWhitespace(nextChar) && isString(OBJ_MARKER))
+            {
+                // currentOffset - 1 is the whitespace, currentOffset - 2 the character in front of it
+                long tempOffset = currentOffset - 2;
+                source.seek(tempOffset);
+                int genID = source.peek();
+                // is the next char a digit?
+                if (isDigit(genID))
+                {
+                    // convert the ASCII digit to its numeric value, only a single digit is taken as generation
+                    genID -= 48;
+                    tempOffset--;
+                    source.seek(tempOffset);
+                    if (isWhitespace())
+                    {
+                        // walk backwards over the whitespace between object number and generation
+                        while (tempOffset > MINIMUM_SEARCH_OFFSET && isWhitespace())
+                        {
+                            source.seek(--tempOffset);
+                        }
+                        // walk backwards over the digits of the object number
+                        boolean objectIDFound = false;
+                        while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
+                        {
+                            source.seek(--tempOffset);
+                            objectIDFound = true;
+                        }
+                        if (objectIDFound)
+                        {
+                            // step forward again, the backward walk stopped one position in front of the number
+                            source.read();
+                            long objectId = readObjectNumber();
+                            if (lastObjOffset > 0)
+                            {
+                                // add the former object ID only if there was a subsequent object ID
+                                bfSearchCOSObjectKeyOffsets.put(
+                                        new COSObjectKey(lastObjectId, lastGenID), lastObjOffset);
+                            }
+                            lastObjectId = objectId;
+                            lastGenID = genID;
+                            lastObjOffset = tempOffset + 1;
+                            // continue the scan near the end of the "obj" keyword
+                            currentOffset += OBJ_MARKER.length - 1;
+                            endOfObjFound = false;
+                        }
+                    }
+                }
+            }
+            // check for "endo" as abbreviation for "endobj", as the pdf may be cut off
+            // in the middle of the keyword, see PDFBOX-3936.
+            // We could possibly implement a more intelligent algorithm if necessary
+            else if (nextChar == 'e' && isString(endobjString))
+            {
+                currentOffset += endobjString.length;
+                source.seek(currentOffset);
+                if (source.isEOF())
+                {
+                    // the input ends right after "endo"
+                    endOfObjFound = true;
+                }
+                else if (isString(endobjRemainingString))
+                {
+                    // complete "endobj" keyword
+                    currentOffset += endobjRemainingString.length;
+                    endOfObjFound = true;
+                }
+            }
+        } while (currentOffset < lastEOFMarker && !source.isEOF());
+        if ((lastEOFMarker < Long.MAX_VALUE || endOfObjFound) && lastObjOffset > 0)
+        {
+            // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
+            // the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
+            bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(lastObjectId, lastGenID),
+                    lastObjOffset);
+        }
+        // reestablish origin position
+        source.seek(originOffset);
+    }
+
+    /**
+     * Search for the offset of the given xref table/stream among those found by a brute force search.
+     * <p>
+     * Both xref tables and xref streams are searched for. The candidate nearest to the given offset is returned, a
+     * table is preferred if a table and a stream are equally near. The position of the source isn't restored.
+     * 
+     * @param xrefOffset the given offset to be searched for
+     * 
+     * @return the offset of the xref entry or -1 if neither an xref table nor an xref stream was found
+     * @throws IOException if something went wrong
+     */
+    protected long bfSearchForXRef(long xrefOffset) throws IOException
+    {
+        // both searches return freshly created lists, so there is no need to remove the chosen offset from them
+        // TODO to be optimized, this won't work in every case
+        long newOffsetTable = searchNearestValue(bfSearchForXRefTables(), xrefOffset);
+        // TODO to be optimized, this won't work in every case
+        long newOffsetStream = searchNearestValue(bfSearchForXRefStreams(), xrefOffset);
+
+        if (newOffsetTable > -1 && newOffsetStream > -1)
+        {
+            // choose the nearest value
+            long differenceTable = Math.abs(xrefOffset - newOffsetTable);
+            long differenceStream = Math.abs(xrefOffset - newOffsetStream);
+            return differenceTable > differenceStream ? newOffsetStream : newOffsetTable;
+        }
+        if (newOffsetTable > -1)
+        {
+            return newOffsetTable;
+        }
+        if (newOffsetStream > -1)
+        {
+            return newOffsetStream;
+        }
+        return -1;
+    }
+
+    /**
+     * Determines the value of the given list which is nearest to the given offset. The first one wins if several
+     * values are equally near.
+     * 
+     * @param values the candidates
+     * @param offset the offset the candidates are compared to
+     * @return the nearest value or -1 if the list is empty
+     */
+    private long searchNearestValue(List<Long> values, long offset)
+    {
+        long nearestValue = -1;
+        long smallestDistance = 0;
+        boolean firstCandidate = true;
+        for (Long candidate : values)
+        {
+            long distance = Math.abs(offset - candidate);
+            // only a strictly smaller distance replaces the current choice
+            if (firstCandidate || smallestDistance > distance)
+            {
+                smallestDistance = distance;
+                nearestValue = candidate;
+                firstCandidate = false;
+            }
+        }
+        return nearestValue;
+    }
+
+    /**
+     * Brute force search for all objects streams of a pdf.
+     * <p>
+     * Every object stream whose offset matches an object found by the initial brute force search is parsed. The
+     * objects it contains are added to the brute force search result and to the xref table of the given trailer
+     * resolver, both mapped to the negated object number of the stream. An already known object is only replaced if
+     * the stream is located behind it. Streams which can't be parsed are skipped.
+     * 
+     * @param trailerResolver the trailer resolver of the document
+     * @param securityHandler security handler to be used to decrypt encrypted documents
+     * @throws IOException if something went wrong
+     */
+    protected void bfSearchForObjStreams(XrefTrailerResolver trailerResolver,
+            SecurityHandler<? extends ProtectionPolicy> securityHandler) throws IOException
+    {
+        // update security handler
+        this.securityHandler = securityHandler;
+        // save origin offset
+        long originOffset = source.getPosition();
+
+        // collect all stream offsets, a stream is only usable if the object it belongs to
+        // was found at the very same offset by the brute force search for objects
+        List<Long> objStreamOffsets = new ArrayList<>();
+        for (Entry<Long, COSObjectKey> candidate : bfSearchForObjStreamOffsets().entrySet())
+        {
+            Long knownOffset = bfSearchCOSObjectKeyOffsets.get(candidate.getValue());
+            if (knownOffset == null)
+            {
+                // log warning about skipped stream
+                LOG.warn("Skipped incomplete object stream:" + candidate.getValue() + " at "
+                        + candidate.getKey());
+            }
+            else if (candidate.getKey().equals(knownOffset))
+            {
+                objStreamOffsets.add(candidate.getKey());
+            }
+        }
+        // add all found compressed objects to the brute force search result
+        for (Long offset : objStreamOffsets)
+        {
+            source.seek(offset);
+            long stmObjNumber = readObjectNumber();
+            int stmGenNumber = readGenerationNumber();
+            readExpectedString(OBJ_MARKER, true);
+            COSStream stream = null;
+            try
+            {
+                COSDictionary dict = parseCOSDictionary(false);
+                stream = parseCOSStream(dict);
+                if (securityHandler != null)
+                {
+                    securityHandler.decryptStream(stream, stmObjNumber, stmGenNumber);
+                }
+                PDFObjectStreamParser objStreamParser = new PDFObjectStreamParser(stream, document);
+                Map<Long, Integer> objectNumbers = objStreamParser.readObjectNumbers();
+                Map<COSObjectKey, Long> xrefOffset = trailerResolver.getXrefTable();
+                for (Long objNumber : objectNumbers.keySet())
+                {
+                    COSObjectKey objKey = new COSObjectKey(objNumber, 0);
+                    Long existingOffset = bfSearchCOSObjectKeyOffsets.get(objKey);
+                    if (existingOffset != null && existingOffset < 0)
+                    {
+                        // translate stream object key to its offset
+                        COSObjectKey objStmKey = new COSObjectKey(Math.abs(existingOffset), 0);
+                        existingOffset = bfSearchCOSObjectKeyOffsets.get(objStmKey);
+                    }
+                    // the object located at the bigger offset wins
+                    if (existingOffset == null || offset > existingOffset)
+                    {
+                        bfSearchCOSObjectKeyOffsets.put(objKey, -stmObjNumber);
+                        xrefOffset.put(objKey, -stmObjNumber);
+                    }
+                }
+            }
+            catch (IOException exception)
+            {
+                // report the real generation number of the skipped stream
+                LOG.debug("Skipped corrupt stream: (" + stmObjNumber + " " + stmGenNumber
+                        + ") at offset " + offset, exception);
+            }
+            finally
+            {
+                if (stream != null)
+                {
+                    stream.close();
+                }
+            }
+        }
+        // restore origin offset
+        source.seek(originOffset);
+    }
+
+    /**
+     * Brute force search for a usable trailer. Every dictionary following a trailer marker is checked, the first
+     * one referencing both a catalog and an info dictionary is used to fill the given trailer dictionary.
+     * <p>
+     * The position of the source is only restored if no usable trailer was found.
+     * 
+     * @param trailer dictionary to be used as trailer dictionary
+     * @return true if a trailer with a valid root and info entry was found, false if not
+     * @throws IOException if something went wrong
+     */
+    private boolean bfSearchForTrailer(COSDictionary trailer) throws IOException
+    {
+        long startPosition = source.getPosition();
+        source.seek(MINIMUM_SEARCH_OFFSET);
+        // visit every trailer marker until a usable trailer is found
+        for (long markerOffset = findString(TRAILER_MARKER); markerOffset != -1;
+                markerOffset = findString(TRAILER_MARKER))
+        {
+            try
+            {
+                skipSpaces();
+                COSDictionary candidate = parseCOSDictionary(true);
+
+                // the root entry has to be dereferenceable and has to point to a catalog
+                COSObject rootObj = candidate.getCOSObject(COSName.ROOT);
+                COSBase rootBase = rootObj != null ? rootObj.getObject() : null;
+                boolean rootFound = rootBase instanceof COSDictionary
+                        && isCatalog((COSDictionary) rootBase);
+
+                // the info entry has to be dereferenceable and has to point to an info dictionary
+                COSObject infoObj = candidate.getCOSObject(COSName.INFO);
+                COSBase infoBase = infoObj != null ? infoObj.getObject() : null;
+                boolean infoFound = infoBase instanceof COSDictionary
+                        && isInfo((COSDictionary) infoBase);
+
+                if (rootFound && infoFound)
+                {
+                    trailer.setItem(COSName.ROOT, rootObj);
+                    trailer.setItem(COSName.INFO, infoObj);
+                    if (candidate.containsKey(COSName.ENCRYPT))
+                    {
+                        // take over the encryption dictionary if it can be dereferenced
+                        // TODO check if the dictionary is an encryption dictionary?
+                        COSObject encObj = candidate.getCOSObject(COSName.ENCRYPT);
+                        if (encObj != null && encObj.getObject() instanceof COSDictionary)
+                        {
+                            trailer.setItem(COSName.ENCRYPT, encObj);
+                        }
+                    }
+                    if (candidate.containsKey(COSName.ID))
+                    {
+                        // the ID is only taken over if it is an array
+                        COSBase idObj = candidate.getItem(COSName.ID);
+                        if (idObj instanceof COSArray)
+                        {
+                            trailer.setItem(COSName.ID, idObj);
+                        }
+                    }
+                    return true;
+                }
+            }
+            catch (IOException exception)
+            {
+                LOG.debug("An exception occurred during brute force search for trailer - ignoring",
+                        exception);
+            }
+        }
+        source.seek(startPosition);
+        return false;
+    }
+
+    /**
+     * Search for the different parts of the trailer dictionary among all objects found by the brute force search.
+     * The catalog and the info dictionary are added to the given trailer if present.
+     *
+     * @param trailer dictionary to be used as trailer dictionary
+     * @return true if the root was found, false if not.
+     * @throws IOException if something went wrong
+     */
+    private boolean searchForTrailerItems(COSDictionary trailer) throws IOException
+    {
+        COSObject rootObject = null;
+        COSObject infoObject = null;
+        for (Entry<COSObjectKey, Long> candidate : bfSearchCOSObjectKeyOffsets.entrySet())
+        {
+            COSObject pooledObject = document.getObjectFromPool(candidate.getKey());
+            COSBase resolved = pooledObject.getObject();
+            // only dictionaries are of interest
+            if (resolved instanceof COSDictionary)
+            {
+                COSDictionary dictionary = (COSDictionary) resolved;
+                if (isCatalog(dictionary))
+                {
+                    // document catalog
+                    rootObject = compareCOSObjects(pooledObject, candidate.getValue(), rootObject);
+                }
+                else if (isInfo(dictionary))
+                {
+                    // info dictionary
+                    infoObject = compareCOSObjects(pooledObject, candidate.getValue(), infoObject);
+                }
+                // encryption dictionary, if existing, is lost
+                // We can't run "Algorithm 2" from PDF specification because of missing ID
+            }
+        }
+        boolean rootFound = rootObject != null;
+        if (rootFound)
+        {
+            trailer.setItem(COSName.ROOT, rootObject);
+        }
+        if (infoObject != null)
+        {
+            trailer.setItem(COSName.INFO, infoObject);
+        }
+        return rootFound;
+    }
+
+    /**
+     * Decides which of two candidates for the same kind of dictionary shall be used.
+     * 
+     * @param newObject the newly found object
+     * @param newOffset the offset of the newly found object
+     * @param currentObject the object chosen so far, may be null
+     * @return the object to be used
+     */
+    private COSObject compareCOSObjects(COSObject newObject, Long newOffset,
+            COSObject currentObject)
+    {
+        // nothing to compare with, take the new one
+        if (currentObject == null || currentObject.getKey() == null)
+        {
+            return newObject;
+        }
+        COSObjectKey currentKey = currentObject.getKey();
+        COSObjectKey newKey = newObject.getKey();
+        if (currentKey.getNumber() == newKey.getNumber())
+        {
+            // same object number, the higher generation is the updated version
+            if (currentKey.getGeneration() < newKey.getGeneration())
+            {
+                return newObject;
+            }
+            return currentObject;
+        }
+        // most likely the object with the bigger offset is the newer one
+        Long currentOffset = document.getXrefTable().get(currentKey);
+        if (currentOffset != null && newOffset > currentOffset)
+        {
+            return newObject;
+        }
+        return currentObject;
+    }
+
+    /**
+     * Brute force search for the last EOF marker. An EOF marker is only taken into account if the data following it
+     * can be read neither as an xref table nor as the beginning of an object. The position of the source is
+     * restored afterwards.
+     * 
+     * @return the offset of the last of those EOF markers or Long.MAX_VALUE if there isn't any
+     * @throws IOException if something went wrong
+     */
+    private long bfSearchForLastEOFMarker() throws IOException
+    {
+        long startPosition = source.getPosition();
+        source.seek(MINIMUM_SEARCH_OFFSET);
+        long lastMarker = -1;
+        for (long marker = findString(EOF_MARKER); marker != -1; marker = findString(EOF_MARKER))
+        {
+            try
+            {
+                // check if the following data is some valid pdf content
+                // which most likely indicates that the pdf is linearized,
+                // updated or just cut off somewhere in the middle
+                skipSpaces();
+                if (!isString(XREF_TABLE))
+                {
+                    readObjectNumber();
+                    readGenerationNumber();
+                }
+            }
+            catch (IOException exception)
+            {
+                // save the EOF marker as the following data is most likely some garbage
+                LOG.debug("An exception occurred during brute force for last EOF - ignoring",
+                        exception);
+                lastMarker = marker;
+            }
+        }
+        source.seek(startPosition);
+        // Long.MAX_VALUE signals that no EOF marker was found
+        return lastMarker == -1 ? Long.MAX_VALUE : lastMarker;
+    }
+
+    /**
+     * Search for all offsets of object streams within the given pdf.
+     * <p>
+     * For every "/ObjStm" marker the input is searched backwards, in steps of 10 bytes and at most 39 steps, for
+     * the " obj" keyword of the object containing the marker. If an object number and a generation digit precede
+     * the keyword, the offset of the object number and the key of the object are stored. The position of the
+     * source isn't restored.
+     * 
+     * @return a map of all offsets for object streams
+     * @throws IOException if something went wrong
+     */
+    private Map<Long, COSObjectKey> bfSearchForObjStreamOffsets() throws IOException
+    {
+        HashMap<Long, COSObjectKey> bfSearchObjStreamsOffsets = new HashMap<>();
+        source.seek(MINIMUM_SEARCH_OFFSET);
+        char[] string = " obj".toCharArray();
+        // search for object stream marker
+        long positionObjStream = findString(OBJ_STREAM);
+        while (positionObjStream != -1)
+        {
+            // search backwards for the beginning of the object
+            long newOffset = -1;
+            boolean objFound = false;
+            for (int i = 1; i < 40 && !objFound; i++)
+            {
+                // start of the i-th block of 10 bytes in front of the marker
+                long currentOffset = positionObjStream - (i * 10);
+                if (currentOffset > 0)
+                {
+                    source.seek(currentOffset);
+                    // check every position of the block for " obj"
+                    for (int j = 0; j < 10; j++)
+                    {
+                        if (isString(string))
+                        {
+                            // the character in front of the space should be the generation digit
+                            long tempOffset = currentOffset - 1;
+                            source.seek(tempOffset);
+                            int genID = source.peek();
+                            // is the next char a digit?
+                            if (isDigit(genID))
+                            {
+                                tempOffset--;
+                                source.seek(tempOffset);
+                                if (isSpace())
+                                {
+                                    // walk backwards over the digits of the object number
+                                    int length = 0;
+                                    source.seek(--tempOffset);
+                                    while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
+                                    {
+                                        source.seek(--tempOffset);
+                                        length++;
+                                    }
+                                    if (length > 0)
+                                    {
+                                        // step forward again, the backward walk stopped one position in front of the number
+                                        source.read();
+                                        newOffset = source.getPosition();
+                                        long objNumber = readObjectNumber();
+                                        int genNumber = readGenerationNumber();
+                                        COSObjectKey streamObjectKey = new COSObjectKey(objNumber,
+                                                genNumber);
+                                        bfSearchObjStreamsOffsets.put(newOffset, streamObjectKey);
+                                    }
+                                }
+                            }
+                            LOG.debug("Dictionary start for object stream -> " + newOffset);
+                            // stop the backward search even if no valid object header precedes the keyword
+                            objFound = true;
+                            break;
+                        }
+                        else
+                        {
+                            // advance to the next position within the block
+                            currentOffset++;
+                            source.read();
+                        }
+                    }
+                }
+            }
+            // continue behind the current marker
+            source.seek(positionObjStream + OBJ_STREAM.length);
+            positionObjStream = findString(OBJ_STREAM);
+        }
+        return bfSearchObjStreamsOffsets;
+    }
+
+    /**
+     * Brute force search for all xref entries (tables). Only "xref" keywords preceded by a whitespace are taken
+     * into account.
+     * 
+     * @return the offsets of all xref tables found, never null
+     * @throws IOException if something went wrong
+     */
+    private List<Long> bfSearchForXRefTables() throws IOException
+    {
+        List<Long> tableOffsets = new ArrayList<>();
+        // a pdf may contain more than one xref entry
+        source.seek(MINIMUM_SEARCH_OFFSET);
+        for (long candidate = findString(XREF_TABLE); candidate != -1;
+                candidate = findString(XREF_TABLE))
+        {
+            // ensure that we don't read "startxref" instead of "xref"
+            source.seek(candidate - 1);
+            if (isWhitespace())
+            {
+                tableOffsets.add(candidate);
+            }
+            // go on behind the keyword
+            source.seek(candidate + XREF_TABLE.length);
+        }
+        return tableOffsets;
+    }
+
+    /**
+     * Brute force search for all /XRef entries (streams).
+     * <p>
+     * For every "/XRef" marker the input is searched backwards, in steps of 10 bytes and at most 39 steps, for the
+     * " obj" keyword of the object containing the marker. If an object number and a generation digit precede the
+     * keyword, the offset of the object number is added to the result. The position of the source isn't restored.
+     * 
+     * @return the offsets of all xref streams found, never null
+     * @throws IOException if something went wrong
+     */
+    private List<Long> bfSearchForXRefStreams() throws IOException
+    {
+        List<Long> bfSearchXRefStreamsOffsets = new ArrayList<>();
+        // a pdf may contain more than one /XRef entry
+        source.seek(MINIMUM_SEARCH_OFFSET);
+        // search for XRef streams
+        String objString = " obj";
+        char[] string = objString.toCharArray();
+        long xrefOffset = findString(XREF_STREAM);
+        while (xrefOffset != -1)
+        {
+            // search backwards for the beginning of the stream
+            long newOffset = -1;
+            boolean objFound = false;
+            for (int i = 1; i < 40 && !objFound; i++)
+            {
+                // start of the i-th block of 10 bytes in front of the marker
+                long currentOffset = xrefOffset - (i * 10);
+                if (currentOffset > 0)
+                {
+                    source.seek(currentOffset);
+                    // check every position of the block for " obj"
+                    for (int j = 0; j < 10; j++)
+                    {
+                        if (isString(string))
+                        {
+                            // the character in front of the space should be the generation digit
+                            long tempOffset = currentOffset - 1;
+                            source.seek(tempOffset);
+                            int genID = source.peek();
+                            // is the next char a digit?
+                            if (isDigit(genID))
+                            {
+                                tempOffset--;
+                                source.seek(tempOffset);
+                                if (isSpace())
+                                {
+                                    // walk backwards over the digits of the object number
+                                    int length = 0;
+                                    source.seek(--tempOffset);
+                                    while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
+                                    {
+                                        source.seek(--tempOffset);
+                                        length++;
+                                    }
+                                    if (length > 0)
+                                    {
+                                        // step forward again, the backward walk stopped one position in front of the number
+                                        source.read();
+                                        newOffset = source.getPosition();
+                                    }
+                                }
+                            }
+                            LOG.debug("Fixed reference for xref stream " + xrefOffset + " -> "
+                                    + newOffset);
+                            // stop the backward search even if no valid object header precedes the keyword
+                            objFound = true;
+                            break;
+                        }
+                        else
+                        {
+                            // advance to the next position within the block
+                            currentOffset++;
+                            source.read();
+                        }
+                    }
+                }
+            }
+            if (newOffset > -1)
+            {
+                bfSearchXRefStreamsOffsets.add(newOffset);
+            }
+            // continue behind the current marker, 5 is the length of "/XRef"
+            source.seek(xrefOffset + 5);
+            xrefOffset = findString(XREF_STREAM);
+        }
+        return bfSearchXRefStreamsOffsets;
+    }
+
+    /**
+     * Tell if the dictionary is an info dictionary. A dictionary is taken as info dictionary if it holds at least
+     * one of the typical info entries and none of the entries Parent, A or Dest.
+     * 
+     * @param dictionary the dictionary to be checked
+     * @return true if the given dictionary is an info dictionary
+     */
+    private boolean isInfo(COSDictionary dictionary)
+    {
+        // those entries never occur within an info dictionary
+        boolean hasForeignEntry = dictionary.containsKey(COSName.PARENT)
+                || dictionary.containsKey(COSName.A) //
+                || dictionary.containsKey(COSName.DEST);
+        if (hasForeignEntry)
+        {
+            return false;
+        }
+        // at least one of the typical entries has to be present
+        return dictionary.containsKey(COSName.MOD_DATE) //
+                || dictionary.containsKey(COSName.TITLE)
+                || dictionary.containsKey(COSName.AUTHOR)
+                || dictionary.containsKey(COSName.SUBJECT)
+                || dictionary.containsKey(COSName.KEYWORDS)
+                || dictionary.containsKey(COSName.CREATOR)
+                || dictionary.containsKey(COSName.PRODUCER)
+                || dictionary.containsKey(COSName.CREATION_DATE);
+    }
+
+    /**
+     * Tell if the dictionary is a PDF or FDF catalog.
+     * 
+     * @param dictionary the dictionary to be checked
+     * @return true if the given dictionary is a root dictionary
+     */
+    private boolean isCatalog(COSDictionary dictionary)
+    {
+        // PDF catalog
+        if (COSName.CATALOG.equals(dictionary.getCOSName(COSName.TYPE)))
+        {
+            return true;
+        }
+        // FDF catalog
+        return dictionary.containsKey(COSName.FDF);
+    }
+
+    /**
+     * Checks if the given string can be found at the current offset. The position of the source is the same as
+     * before after returning.
+     * 
+     * @param string the bytes of the string to look for
+     * @return true if the bytes are in place, false if not
+     * @throws IOException if something went wrong
+     */
+    private boolean isString(char[] string) throws IOException
+    {
+        long startPosition = source.getPosition();
+        int matched = 0;
+        // stop reading at the first character which doesn't match
+        while (matched < string.length && source.read() == string[matched])
+        {
+            matched++;
+        }
+        source.seek(startPosition);
+        return matched == string.length;
+    }
+
+    /**
+     * Search for the given string. The search starts at the current position and returns the start position if the
+     * string was found. -1 is returned if there isn't any further complete occurrence of the given string. After
+     * returning the current position is either the end of the string or the end of the input.
+     * 
+     * @param string the string to be searched, must not be empty
+     * @return the start position of the found string or -1 if the end of the input was reached before
+     * @throws IOException if something went wrong
+     */
+    private long findString(char[] string) throws IOException
+    {
+        long position = -1L;
+        int counter = 0;
+        int readChar = source.read();
+        while (readChar != -1)
+        {
+            if (readChar == string[counter])
+            {
+                if (counter == 0)
+                {
+                    position = source.getPosition() - 1;
+                }
+                counter++;
+                if (counter == string.length)
+                {
+                    return position;
+                }
+            }
+            else if (counter > 0)
+            {
+                // retry right behind the start of the failed partial match to catch overlapping occurrences
+                source.seek(position + 1);
+                counter = 0;
+            }
+            readChar = source.read();
+        }
+        // end of input reached, a possibly started partial match is incomplete and doesn't count
+        return -1L;
+    }
+
+    /**
+     * Rebuild the trailer dictionary if startxref can't be found.
+     * 
+     * @param trailerResolver the trailer resolver of the document
+     * @param securityHandler security handler to be used to decrypt encrypted documents
+     * @return the rebuilt trailer dictionary
+     * 
+     * @throws IOException if something went wrong
+     */
+    protected COSDictionary rebuildTrailer(XrefTrailerResolver trailerResolver,
+            SecurityHandler<? extends ProtectionPolicy> securityHandler) throws IOException
+    {
+        // update security handler
+        this.securityHandler = securityHandler;
+        // reset trailer resolver
+        trailerResolver.reset();
+        // use the found objects to rebuild the given trailer resolver
+        trailerResolver.nextXrefObj(0, XRefType.TABLE);
+        bfSearchCOSObjectKeyOffsets.forEach(trailerResolver::setXRef);
+        trailerResolver.setStartxref(0);
+        COSDictionary trailer = trailerResolver.getTrailer();
+        document.setTrailer(trailer);
+        boolean searchForObjStreamsDone = false;
+        if (!bfSearchForTrailer(trailer) && !searchForTrailerItems(trailer))
+        {
+            // root entry wasn't found, maybe it is part of an object stream
+            // brute force search for all object streams.
+            bfSearchForObjStreams(trailerResolver, securityHandler);
+            searchForObjStreamsDone = true;
+            // search again for the root entry
+            searchForTrailerItems(trailer);
+        }
+        // prepare decryption if necessary
+        prepareDecryption();
+        if (!searchForObjStreamsDone)
+        {
+            // brute force search for all object streams.
+            bfSearchForObjStreams(trailerResolver, securityHandler);
+        }
+        return trailer;
+    }
+
+}

Propchange: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1900449&r1=1900448&r2=1900449&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Sun May  1 11:39:05 2022
@@ -21,7 +21,6 @@ import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.security.GeneralSecurityException;
 import java.security.KeyStore;
-import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
@@ -29,7 +28,6 @@ import java.util.Map;
 import java.util.Optional;
 import java.util.Map.Entry;
 import java.util.Set;
-import java.util.stream.Collectors;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -71,7 +69,6 @@ public class COSParser extends BaseParse
     private static final String FDF_DEFAULT_VERSION = "1.0";
 
     private static final char[] XREF_TABLE = new char[] { 'x', 'r', 'e', 'f' };
-    private static final char[] XREF_STREAM = new char[] { '/', 'X', 'R', 'e', 'f' };
     private static final char[] STARTXREF = new char[] { 's','t','a','r','t','x','r','e','f' };
 
     private static final byte[] ENDSTREAM = new byte[] { E, N, D, S, T, R, E, A, M };
@@ -112,16 +109,6 @@ public class COSParser extends BaseParse
     protected static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' };
 
     /**
-     * trailer-marker.
-     */
-    private static final char[] TRAILER_MARKER = new char[] { 't', 'r', 'a', 'i', 'l', 'e', 'r' };
-
-    /**
-     * ObjStream-marker.
-     */
-    private static final char[] OBJ_STREAM = new char[] { '/', 'O', 'b', 'j', 'S', 't', 'm' };
-
-    /**
      * file length.
      */
     protected long fileLen;
@@ -134,13 +121,11 @@ public class COSParser extends BaseParse
     protected boolean initialParseDone = false;
 
     private boolean trailerWasRebuild = false;
-    /**
-     * Contains all found objects of a brute force search.
-     */
-    private Map<COSObjectKey, Long> bfSearchCOSObjectKeyOffsets = null;
-    private boolean bruteForceSearchTriggered = false;
+    
+    private BruteForceParser bruteForceParser = null;
+    private boolean bruteForceSearchSuccessful = false;
     private PDEncryption encryption = null;
-
+    
     /**
      * Intermediate cache. Contains all objects of already read compressed object streams. Objects are removed after
      * dereferencing them.
@@ -266,15 +251,16 @@ public class COSParser extends BaseParse
         }
         if (rebuildTrailer)
         {
-            trailer = rebuildTrailer();
+            trailer = getBruteForceParser().rebuildTrailer(xrefTrailerResolver, securityHandler);
+            trailerWasRebuild = true;
         }
         else
         {
             // prepare decryption if necessary
             prepareDecryption();
-            if (bruteForceSearchTriggered && !getBFCOSObjectOffsets().isEmpty())
+            if (bruteForceSearchSuccessful)
             {
-                bfSearchForObjStreams();
+                getBruteForceParser().bfSearchForObjStreams(xrefTrailerResolver, securityHandler);
             }
         }
         if (resetTrailerResolver())
@@ -412,7 +398,10 @@ public class COSParser extends BaseParse
         document.setTrailer(trailer);
         document.setIsXRefStream(XRefType.STREAM == xrefTrailerResolver.getXrefType());
         // check the offsets of all referenced objects
-        checkXrefOffsets();
+        if (isLenient)
+        {
+            checkXrefOffsets();
+        }
         // copy xref table
         document.addXRefTable(xrefTrailerResolver.getXrefTable());
 
@@ -651,8 +640,7 @@ public class COSParser extends BaseParse
         // maybe something is wrong with the xref table -> perform brute force search for all objects
         if (offsetOrObjstmObNr == null && isLenient)
         {
-            Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = getBFCOSObjectOffsets();
-            offsetOrObjstmObNr = bfCOSObjectKeyOffsets.get(objKey);
+            offsetOrObjstmObNr =  getBruteForceParser().getBFCOSObjectOffsets().get(objKey);
             if (offsetOrObjstmObNr != null)
             {
                 LOG.debug("Set missing offset " + offsetOrObjstmObNr + " for object " + objKey);
@@ -1160,8 +1148,8 @@ public class COSParser extends BaseParse
             LOG.error("Invalid object offset " + objectOffset + " when searching for a xref table/stream");
             return 0;
         }
-        // start a brute force search for all xref tables and try to find the offset we are looking for
-        long newOffset = bfSearchForXRef(objectOffset);
+        // search for the offset of the given xref table/stream among those found by a brute force search.
+        long newOffset = getBruteForceParser().bfSearchForXRef(objectOffset);
         if (newOffset > -1)
         {
             LOG.debug("Fixed reference for xref table/stream " + objectOffset + " -> " + newOffset);
@@ -1211,14 +1199,15 @@ public class COSParser extends BaseParse
             if (!validKeys.contains(correctedKeyEntry.getValue()))
             {
                 // Only replace entries, if the original entry does not point to a valid object
-                correctedPointers.put(correctedKeyEntry.getValue(), xrefOffset.get(correctedKeyEntry.getKey()));
+                correctedPointers.put(correctedKeyEntry.getValue(),
+                        xrefOffset.get(correctedKeyEntry.getKey()));
             }
         }
         correctedKeys.entrySet().forEach(
                 // remove old invalid, as some might not be replaced
                 correctedKeyEntry -> xrefOffset.remove(correctedKeyEntry.getKey()));
-        correctedPointers.entrySet().forEach(
-                pointer -> xrefOffset.put(pointer.getKey(), pointer.getValue()));
+        correctedPointers.entrySet()
+                .forEach(pointer -> xrefOffset.put(pointer.getKey(), pointer.getValue()));
         return true;
     }
 
@@ -1229,15 +1218,11 @@ public class COSParser extends BaseParse
      */
     private void checkXrefOffsets() throws IOException
     {
-        // repair mode isn't available in non-lenient mode
-        if (!isLenient)
-        {
-            return;
-        }
         Map<COSObjectKey, Long> xrefOffset = xrefTrailerResolver.getXrefTable();
         if (!validateXrefOffsets(xrefOffset))
         {
-            Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = getBFCOSObjectOffsets();
+            Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = getBruteForceParser()
+                    .getBFCOSObjectOffsets();
             if (!bfCOSObjectKeyOffsets.isEmpty())
             {
                 LOG.debug("Replaced read xref table with the results of a brute force search");
@@ -1258,7 +1243,8 @@ public class COSParser extends BaseParse
      * 
      * @throws IOException if something went wrong
      */
-    private COSObjectKey findObjectKey(COSObjectKey objectKey, long offset, Map<COSObjectKey, Long> xrefOffset) throws IOException
+    private COSObjectKey findObjectKey(COSObjectKey objectKey, long offset,
+            Map<COSObjectKey, Long> xrefOffset) throws IOException
     {
         // there can't be any object at the very beginning of a pdf
         if (offset < MINIMUM_SEARCH_OFFSET)
@@ -1340,661 +1326,17 @@ public class COSParser extends BaseParse
         return null;
     }
 
-    private Map<COSObjectKey, Long> getBFCOSObjectOffsets() throws IOException
-    {
-        if (bfSearchCOSObjectKeyOffsets == null)
-        {
-            bfSearchCOSObjectKeyOffsets = bfSearchForObjects();
-            bruteForceSearchTriggered = true;
-        }
-        return bfSearchCOSObjectKeyOffsets;
-    }
-
-    /**
-     * Brute force search for every object in the pdf.
-     *   
-     * @throws IOException if something went wrong
-     */
-    private Map<COSObjectKey, Long> bfSearchForObjects() throws IOException
-    {
-        Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = new HashMap<>();
-        long lastEOFMarker = bfSearchForLastEOFMarker();
-        long originOffset = source.getPosition();
-        long currentOffset = MINIMUM_SEARCH_OFFSET;
-        long lastObjectId = Long.MIN_VALUE;
-        int lastGenID = Integer.MIN_VALUE;
-        long lastObjOffset = Long.MIN_VALUE;
-        char[] endobjString = "ndo".toCharArray();
-        char[] endobjRemainingString = "bj".toCharArray();
-        boolean endOfObjFound = false;
-        do
-        {
-            source.seek(currentOffset);
-            int nextChar = source.read();
-            currentOffset++;
-            if (isWhitespace(nextChar) && isString(OBJ_MARKER))
-            {
-                long tempOffset = currentOffset - 2;
-                source.seek(tempOffset);
-                int genID = source.peek();
-                // is the next char a digit?
-                if (isDigit(genID))
-                {
-                    genID -= 48;
-                    tempOffset--;
-                    source.seek(tempOffset);
-                    if (isWhitespace())
-                    {
-                        while (tempOffset > MINIMUM_SEARCH_OFFSET && isWhitespace())
-                        {
-                            source.seek(--tempOffset);
-                        }
-                        boolean objectIDFound = false;
-                        while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
-                        {
-                            source.seek(--tempOffset);
-                            objectIDFound = true;
-                        }
-                        if (objectIDFound)
-                        {
-                            source.read();
-                            long objectId = readObjectNumber();
-                            if (lastObjOffset > 0)
-                            {
-                                // add the former object ID only if there was a subsequent object ID
-                                bfCOSObjectKeyOffsets.put(
-                                        new COSObjectKey(lastObjectId, lastGenID), lastObjOffset);
-                            }
-                            lastObjectId = objectId;
-                            lastGenID = genID;
-                            lastObjOffset = tempOffset + 1;
-                            currentOffset += OBJ_MARKER.length - 1;
-                            endOfObjFound = false;
-                        }
-                    }
-                }
-            }
-            // check for "endo" as abbreviation for "endobj", as the pdf may be cut off
-            // in the middle of the keyword, see PDFBOX-3936.
-            // We could possibly implement a more intelligent algorithm if necessary
-            else if (nextChar == 'e' && isString(endobjString))
-            {
-                currentOffset += endobjString.length;
-                source.seek(currentOffset);
-                if (source.isEOF())
-                {
-                    endOfObjFound = true;
-                }
-                else if (isString(endobjRemainingString))
-                {
-                    currentOffset += endobjRemainingString.length;
-                    endOfObjFound = true;
-                }
-            }
-        } while (currentOffset < lastEOFMarker && !source.isEOF());
-        if ((lastEOFMarker < Long.MAX_VALUE || endOfObjFound) && lastObjOffset > 0)
-        {
-            // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
-            // the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
-            bfCOSObjectKeyOffsets.put(new COSObjectKey(lastObjectId, lastGenID),
-                    lastObjOffset);
-        }
-        // reestablish origin position
-        source.seek(originOffset);
-        return bfCOSObjectKeyOffsets;
-    }
-
-    /**
-     * Search for the offset of the given xref table/stream among those found by a brute force search.
-     * 
-     * @return the offset of the xref entry
-     * @throws IOException if something went wrong
-     */
-    private long bfSearchForXRef(long xrefOffset) throws IOException
-    {
-        long newOffset = -1;
-
-        // initialize bfSearchXRefTablesOffsets -> not null
-        List<Long> bfSearchXRefTablesOffsets = bfSearchForXRefTables();
-        // initialize bfSearchXRefStreamsOffsets -> not null
-        List<Long> bfSearchXRefStreamsOffsets = bfSearchForXRefStreams();
-
-        // TODO to be optimized, this won't work in every case
-        long newOffsetTable = searchNearestValue(bfSearchXRefTablesOffsets, xrefOffset);
-        
-        // TODO to be optimized, this won't work in every case
-        long newOffsetStream = searchNearestValue(bfSearchXRefStreamsOffsets, xrefOffset);
-
-        // choose the nearest value
-        if (newOffsetTable > -1 && newOffsetStream > -1)
-        {
-            long differenceTable = xrefOffset - newOffsetTable;
-            long differenceStream = xrefOffset - newOffsetStream;
-            if (Math.abs(differenceTable) > Math.abs(differenceStream))
-            {
-                newOffset = newOffsetStream;
-                bfSearchXRefStreamsOffsets.remove(newOffsetStream);
-            }
-            else
-            {
-                newOffset = newOffsetTable;
-                bfSearchXRefTablesOffsets.remove(newOffsetTable);
-            }
-        }
-        else if (newOffsetTable > -1)
-        {
-            newOffset = newOffsetTable;
-            bfSearchXRefTablesOffsets.remove(newOffsetTable);
-        }
-        else if (newOffsetStream > -1)
-        {
-            newOffset = newOffsetStream;
-            bfSearchXRefStreamsOffsets.remove(newOffsetStream);
-        }
-        return newOffset;
-    }
-
-    private long searchNearestValue(List<Long> values, long offset)
-    {
-        long newValue = -1;
-        Long currentDifference = null;
-        int currentOffsetIndex = -1;
-        int numberOfOffsets = values.size();
-        // find the nearest value
-        for (int i = 0; i < numberOfOffsets; i++)
-        {
-            long newDifference = offset - values.get(i);
-            // find the nearest offset
-            if (currentDifference == null
-                    || (Math.abs(currentDifference) > Math.abs(newDifference)))
-            {
-                currentDifference = newDifference;
-                currentOffsetIndex = i;
-            }
-        }
-        if (currentOffsetIndex > -1)
-        {
-            newValue = values.get(currentOffsetIndex);
-        }
-        return newValue;
-    }
-
-    /**
-     * Brute force search for all trailer marker.
-     * 
-     * @throws IOException if something went wrong
-     */
-    private boolean bfSearchForTrailer(COSDictionary trailer) throws IOException
-    {
-        long originOffset = source.getPosition();
-        source.seek(MINIMUM_SEARCH_OFFSET);
-        // search for trailer marker
-        long trailerOffset = findString(TRAILER_MARKER);
-        while (trailerOffset != -1)
-        {
-            try
-            {
-                boolean rootFound = false;
-                boolean infoFound = false;
-                skipSpaces();
-                COSDictionary trailerDict = parseCOSDictionary(true);
-                COSObject rootObj = trailerDict.getCOSObject(COSName.ROOT);
-                if (rootObj != null)
-                {
-                    // check if the dictionary can be dereferenced and is the one we are looking for
-                    COSBase rootDict = rootObj.getObject();
-                    if (rootDict instanceof COSDictionary && isCatalog((COSDictionary) rootDict))
-                    {
-                        rootFound = true;
-                    }
-                }
-                COSObject infoObj = trailerDict.getCOSObject(COSName.INFO);
-                if (infoObj != null)
-                {
-                    // check if the dictionary can be dereferenced and is the one we are looking for
-                    COSBase infoDict = infoObj.getObject();
-                    if (infoDict instanceof COSDictionary && isInfo((COSDictionary) infoDict))
-                    {
-                        infoFound = true;
-                    }
-                }
-                if (rootFound && infoFound)
-                {
-                    trailer.setItem(COSName.ROOT, rootObj);
-                    trailer.setItem(COSName.INFO, infoObj);
-                    if (trailerDict.containsKey(COSName.ENCRYPT))
-                    {
-                        COSObject encObj = trailerDict.getCOSObject(COSName.ENCRYPT);
-                        // check if the dictionary can be dereferenced
-                        // TODO check if the dictionary is an encryption dictionary?
-                        if (encObj != null && encObj.getObject() instanceof COSDictionary)
-                        {
-                            trailer.setItem(COSName.ENCRYPT, encObj);
-                        }
-                    }
-                    if (trailerDict.containsKey(COSName.ID))
-                    {
-                        COSBase idObj = trailerDict.getItem(COSName.ID);
-                        if (idObj instanceof COSArray)
-                        {
-                            trailer.setItem(COSName.ID, idObj);
-                        }
-                    }
-                    return true;
-                }
-            }
-            catch (IOException exception)
-            {
-                LOG.debug("An exception occurred during brute force search for trailer - ignoring",
-                        exception);
-            }
-            trailerOffset = findString(TRAILER_MARKER);
-        }
-        source.seek(originOffset);
-        return false;
-    }
-
-    /**
-     * Brute force search for the last EOF marker.
-     * 
-     * @throws IOException if something went wrong
-     */
-    private long bfSearchForLastEOFMarker() throws IOException
-    {
-        long lastEOFMarker = -1;
-        long originOffset = source.getPosition();
-        source.seek(MINIMUM_SEARCH_OFFSET);
-        long tempMarker = findString(EOF_MARKER);
-        while (tempMarker != -1)
-        {
-            try
-            {
-                // check if the following data is some valid pdf content
-                // which most likely indicates that the pdf is linearized,
-                // updated or just cut off somewhere in the middle
-                skipSpaces();
-                if (!isString(XREF_TABLE))
-                {
-                    readObjectNumber();
-                    readGenerationNumber();
-                }
-            }
-            catch (IOException exception)
-            {
-                // save the EOF marker as the following data is most likely some garbage
-                LOG.debug("An exception occurred during brute force for last EOF - ignoring",
-                        exception);
-                lastEOFMarker = tempMarker;
-            }
-            tempMarker = findString(EOF_MARKER);
-        }
-        source.seek(originOffset);
-        // no EOF marker found
-        if (lastEOFMarker == -1)
-        {
-            lastEOFMarker = Long.MAX_VALUE;
-        }
-        return lastEOFMarker;
-    }
-
-    /**
-     * Brute force search for all object streams.
-     * 
-     * @throws IOException if something went wrong
-     */
-    private void bfSearchForObjStreams() throws IOException
-    {
-        // save origin offset
-        long originOffset = source.getPosition();
-
-        Map<Long, COSObjectKey> bfSearchForObjStreamOffsets = bfSearchForObjStreamOffsets();
-        Map<COSObjectKey, Long> bfCOSObjectOffsets = getBFCOSObjectOffsets();
-        // log warning about skipped stream
-        bfSearchForObjStreamOffsets.entrySet().stream() //
-                .filter(o -> bfCOSObjectOffsets.get(o.getValue()) == null) //
-                .forEach(o -> LOG.warn(
-                        "Skipped incomplete object stream:" + o.getValue() + " at " + o.getKey()));
-
-        // collect all stream offsets
-        List<Long> objStreamOffsets = bfSearchForObjStreamOffsets.entrySet().stream() //
-                .filter(o -> bfCOSObjectOffsets.get(o.getValue()) != null) //
-                .filter(o -> o.getKey().equals(bfCOSObjectOffsets.get(o.getValue()))) //
-                .map(Map.Entry::getKey) //
-                .collect(Collectors.toList());
-        // add all found compressed objects to the brute force search result
-        for (Long offset : objStreamOffsets)
-        {
-            source.seek(offset);
-            long stmObjNumber = readObjectNumber();
-            int stmGenNumber = readGenerationNumber();
-            readExpectedString(OBJ_MARKER, true);
-            COSStream stream = null;
-            try
-            {
-                COSDictionary dict = parseCOSDictionary(false);
-                stream = parseCOSStream(dict);
-                if (securityHandler != null)
-                {
-                    securityHandler.decryptStream(stream, stmObjNumber, stmGenNumber);
-                }
-                PDFObjectStreamParser objStreamParser = new PDFObjectStreamParser(stream, document);
-                Map<Long, Integer> objectNumbers = objStreamParser.readObjectNumbers();
-                Map<COSObjectKey, Long> xrefOffset = xrefTrailerResolver.getXrefTable();
-                for (Long objNumber : objectNumbers.keySet())
-                {
-                    COSObjectKey objKey = new COSObjectKey(objNumber, 0);
-                    Long existingOffset = bfCOSObjectOffsets.get(objKey);
-                    if (existingOffset != null && existingOffset < 0)
-                    {
-                        // translate stream object key to its offset
-                        COSObjectKey objStmKey = new COSObjectKey(Math.abs(existingOffset), 0);
-                        existingOffset = bfCOSObjectOffsets.get(objStmKey);
-                    }
-                    if (existingOffset == null || offset > existingOffset)
-                    {
-                        bfCOSObjectOffsets.put(objKey, -stmObjNumber);
-                        xrefOffset.put(objKey, -stmObjNumber);
-                    }
-                }
-            }
-            catch (IOException exception)
-            {
-                LOG.debug("Skipped corrupt stream: (" + stmObjNumber + " 0 at offset " + offset,
-                        exception);
-            }
-            finally
-            {
-                if (stream != null)
-                {
-                    stream.close();
-                }
-            }
-        }
-        // restore origin offset
-        source.seek(originOffset);
-    }
-
-    /**
-     * Search for all offsets of object streams within the given pdf
-     * 
-     * @return a map of all offsets for object streams
-     * @throws IOException if something went wrong
-     */
-    private Map<Long, COSObjectKey> bfSearchForObjStreamOffsets() throws IOException
+    private BruteForceParser getBruteForceParser() throws IOException
     {
-        HashMap<Long, COSObjectKey> bfSearchObjStreamsOffsets = new HashMap<>();
-        source.seek(MINIMUM_SEARCH_OFFSET);
-        char[] string = " obj".toCharArray();
-        // search for object stream marker
-        long positionObjStream = findString(OBJ_STREAM);
-        while (positionObjStream != -1)
-        {
-            // search backwards for the beginning of the object
-            long newOffset = -1;
-            boolean objFound = false;
-            for (int i = 1; i < 40 && !objFound; i++)
-            {
-                long currentOffset = positionObjStream - (i * 10);
-                if (currentOffset > 0)
-                {
-                    source.seek(currentOffset);
-                    for (int j = 0; j < 10; j++)
-                    {
-                        if (isString(string))
-                        {
-                            long tempOffset = currentOffset - 1;
-                            source.seek(tempOffset);
-                            int genID = source.peek();
-                            // is the next char a digit?
-                            if (isDigit(genID))
-                            {
-                                tempOffset--;
-                                source.seek(tempOffset);
-                                if (isSpace())
-                                {
-                                    int length = 0;
-                                    source.seek(--tempOffset);
-                                    while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
-                                    {
-                                        source.seek(--tempOffset);
-                                        length++;
-                                    }
-                                    if (length > 0)
-                                    {
-                                        source.read();
-                                        newOffset = source.getPosition();
-                                        long objNumber = readObjectNumber();
-                                        int genNumber = readGenerationNumber();
-                                        COSObjectKey streamObjectKey = new COSObjectKey(objNumber,
-                                                genNumber);
-                                        bfSearchObjStreamsOffsets.put(newOffset, streamObjectKey);
-                                    }
-                                }
-                            }
-                            LOG.debug("Dictionary start for object stream -> " + newOffset);
-                            objFound = true;
-                            break;
-                        }
-                        else
-                        {
-                            currentOffset++;
-                            source.read();
-                        }
-                    }
-                }
-            }
-            source.seek(positionObjStream + OBJ_STREAM.length);
-            positionObjStream = findString(OBJ_STREAM);
-        }
-        return bfSearchObjStreamsOffsets;
-    }
-
-    /**
-     * Brute force search for all xref entries (tables).
-     * 
-     * @throws IOException if something went wrong
-     */
-    private List<Long> bfSearchForXRefTables() throws IOException
-    {
-        List<Long> bfSearchXRefTablesOffsets = new ArrayList<>();
-        // a pdf may contain more than one xref entry
-        source.seek(MINIMUM_SEARCH_OFFSET);
-        // search for xref tables
-        long newOffset = findString(XREF_TABLE);
-        while (newOffset != -1)
-        {
-            source.seek(newOffset - 1);
-            // ensure that we don't read "startxref" instead of "xref"
-            if (isWhitespace())
-            {
-                bfSearchXRefTablesOffsets.add(newOffset);
-            }
-            source.seek(newOffset + 4);
-            newOffset = findString(XREF_TABLE);
-        }
-        return bfSearchXRefTablesOffsets;
-    }
-
-    /**
-     * Brute force search for all /XRef entries (streams).
-     * 
-     * @throws IOException if something went wrong
-     */
-    private List<Long> bfSearchForXRefStreams() throws IOException
-    {
-        List<Long> bfSearchXRefStreamsOffsets = new ArrayList<>();
-        // a pdf may contain more than one /XRef entry
-        source.seek(MINIMUM_SEARCH_OFFSET);
-        // search for XRef streams
-        String objString = " obj";
-        char[] string = objString.toCharArray();
-        long xrefOffset = findString(XREF_STREAM);
-        while (xrefOffset != -1)
-        {
-            // search backwards for the beginning of the stream
-            long newOffset = -1;
-            boolean objFound = false;
-            for (int i = 1; i < 40 && !objFound; i++)
-            {
-                long currentOffset = xrefOffset - (i * 10);
-                if (currentOffset > 0)
-                {
-                    source.seek(currentOffset);
-                    for (int j = 0; j < 10; j++)
-                    {
-                        if (isString(string))
-                        {
-                            long tempOffset = currentOffset - 1;
-                            source.seek(tempOffset);
-                            int genID = source.peek();
-                            // is the next char a digit?
-                            if (isDigit(genID))
-                            {
-                                tempOffset--;
-                                source.seek(tempOffset);
-                                if (isSpace())
-                                {
-                                    int length = 0;
-                                    source.seek(--tempOffset);
-                                    while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
-                                    {
-                                        source.seek(--tempOffset);
-                                        length++;
-                                    }
-                                    if (length > 0)
-                                    {
-                                        source.read();
-                                        newOffset = source.getPosition();
-                                    }
-                                }
-                            }
-                            LOG.debug("Fixed reference for xref stream " + xrefOffset + " -> "
-                                    + newOffset);
-                            objFound = true;
-                            break;
-                        }
-                        else
-                        {
-                            currentOffset++;
-                            source.read();
-                        }
-                    }
-                }
-            }
-            if (newOffset > -1)
-            {
-                bfSearchXRefStreamsOffsets.add(newOffset);
-            }
-            source.seek(xrefOffset + 5);
-            xrefOffset = findString(XREF_STREAM);
-        }
-        return bfSearchXRefStreamsOffsets;
+    	if (bruteForceParser == null)
+    	{
+    		bruteForceParser = new BruteForceParser(source, document);
+    		bruteForceSearchSuccessful = !bruteForceParser.getBFCOSObjectOffsets().isEmpty();
+    	}
+    	return bruteForceParser;
     }
     
     /**
-     * Rebuild the trailer dictionary if startxref can't be found.
-     *  
-     * @return the rebuild trailer dictionary
-     * 
-     * @throws IOException if something went wrong
-     */
-    private COSDictionary rebuildTrailer() throws IOException
-    {
-        Map<COSObjectKey, Long> bfCOSObjectOffsets = getBFCOSObjectOffsets();
-        // reset trailer resolver
-        xrefTrailerResolver.reset();
-        // use the found objects to rebuild the trailer resolver
-        xrefTrailerResolver.nextXrefObj(0, XRefType.TABLE);
-        bfCOSObjectOffsets.forEach(xrefTrailerResolver::setXRef);
-        xrefTrailerResolver.setStartxref(0);
-        COSDictionary trailer = xrefTrailerResolver.getTrailer();
-        document.setTrailer(trailer);
-        boolean searchForObjStreamsDone = false;
-        if (!bfSearchForTrailer(trailer) && !searchForTrailerItems(trailer))
-        {
-            // root entry wasn't found, maybe it is part of an object stream
-            bfSearchForObjStreams();
-            searchForObjStreamsDone = true;
-            // search again for the root entry
-            searchForTrailerItems(trailer);
-        }
-        // prepare decryption if necessary
-        prepareDecryption();
-        if (!searchForObjStreamsDone)
-        {
-            bfSearchForObjStreams();
-        }
-        trailerWasRebuild = true;
-        return trailer;
-    }
-
-    /**
-     * Search for the different parts of the trailer dictionary.
-     *
-     * @param trailer
-     * @return true if the root was found, false if not.
-     * @throws java.io.IOException if the page tree root is null
-     */
-    private boolean searchForTrailerItems(COSDictionary trailer) throws IOException
-    {
-        COSObject rootObject = null;
-        COSObject infoObject = null;
-        for (Entry<COSObjectKey, Long> entrySet : getBFCOSObjectOffsets().entrySet())
-        {
-            COSObjectKey currentKey = entrySet.getKey();
-            COSObject cosObject = document.getObjectFromPool(currentKey);
-            COSBase baseObject = cosObject.getObject();
-
-            if (!(baseObject instanceof COSDictionary))
-            {
-                continue;
-            }
-            COSDictionary dictionary = (COSDictionary) baseObject;
-            // document catalog
-            if (isCatalog(dictionary))
-            {
-                rootObject = compareCOSObjects(cosObject, entrySet.getValue(), rootObject);
-            }
-            // info dictionary
-            else if (isInfo(dictionary))
-            {
-                infoObject = compareCOSObjects(cosObject, entrySet.getValue(), infoObject);
-            }
-            // encryption dictionary, if existing, is lost
-            // We can't run "Algorithm 2" from PDF specification because of missing ID
-        }
-        if (rootObject != null)
-        {
-            trailer.setItem(COSName.ROOT, rootObject);
-        }
-        if (infoObject != null)
-        {
-            trailer.setItem(COSName.INFO, infoObject);
-        }
-        return rootObject != null;
-    }
-
-    private COSObject compareCOSObjects(COSObject newObject, Long newOffset,
-            COSObject currentObject)
-    {
-        if (currentObject != null && currentObject.getKey() != null)
-        {
-            COSObjectKey currentKey = currentObject.getKey();
-            COSObjectKey newKey = newObject.getKey();
-            // check if the current object is an updated version of the previous found object
-            if (currentKey.getNumber() == newKey.getNumber())
-            {
-                return currentKey.getGeneration() < newKey.getGeneration() ? newObject
-                        : currentObject;
-            }
-            // most likely the object with the bigger offset is the newer one
-            Long currentOffset = document.getXrefTable().get(currentKey);
-            return currentOffset != null && newOffset > currentOffset ? newObject : currentObject;
-        }
-        return newObject;
-    }
-
-    /**
      * Check if all entries of the pages dictionary are present. Those which can't be dereferenced are removed.
      * 
      * @param root the root dictionary of the pdf
@@ -2064,43 +1406,6 @@ public class COSParser extends BaseParse
     }
 
     /**
-     * Tell if the dictionary is a PDF catalog. Override this for an FDF catalog.
-     * 
-     * @param dictionary
-     * @return true if the given dictionary is a root dictionary
-     */
-    protected boolean isCatalog(COSDictionary dictionary)
-    {
-        return COSName.CATALOG.equals(dictionary.getCOSName(COSName.TYPE));
-    }
-
-    /**
-     * Tell if the dictionary is an info dictionary.
-     * 
-     * @param dictionary
-     * @return true if the given dictionary is an info dictionary
-     */
-    private boolean isInfo(COSDictionary dictionary)
-    {
-        if (dictionary.containsKey(COSName.PARENT) || dictionary.containsKey(COSName.A)
-                || dictionary.containsKey(COSName.DEST))
-        {
-            return false;
-        }
-        if (!dictionary.containsKey(COSName.MOD_DATE) && !dictionary.containsKey(COSName.TITLE)
-                && !dictionary.containsKey(COSName.AUTHOR)
-                && !dictionary.containsKey(COSName.SUBJECT)
-                && !dictionary.containsKey(COSName.KEYWORDS)
-                && !dictionary.containsKey(COSName.CREATOR)
-                && !dictionary.containsKey(COSName.PRODUCER)
-                && !dictionary.containsKey(COSName.CREATION_DATE))
-        {
-            return false;
-        }
-        return true;
-    }
-
-    /**
      * This will parse the startxref section from the stream. The startxref value is ignored.
      *
      * @return the startxref value or -1 on parsing error
@@ -2166,46 +1471,6 @@ public class COSParser extends BaseParse
     }
 
     /**
-     * Search for the given string. The search starts at the current position and returns the start position if the
-     * string was found. -1 is returned if there isn't any further occurrence of the given string. After returning the
-     * current position is either the end of the string or the end of the input.
-     * 
-     * @param string the string to be searched
-     * @return the start position of the found string
-     * @throws IOException if something went wrong
-     */
-    private long findString(char[] string) throws IOException
-    {
-        long position = -1L;
-        int stringLength = string.length;
-        int counter = 0;
-        int readChar = source.read();
-        while (readChar != -1)
-        {
-            if (readChar == string[counter])
-            {
-                if (counter == 0)
-                {
-                    position = source.getPosition() - 1;
-                }
-                counter++;
-                if (counter == stringLength)
-                {
-                    return position;
-                }
-            }
-            else if (counter > 0)
-            {
-                counter = 0;
-                position = -1L;
-                continue;
-            }
-            readChar = source.read();
-        }
-        return position;
-    }
-
-    /**
      * This will parse the trailer from the stream and add it to the state.
      *
      * @return false on parsing error
@@ -2524,7 +1789,7 @@ public class COSParser extends BaseParse
      *
      * @throws IOException If there is an error getting the document.
      */
-    public PDEncryption getEncryption() throws IOException
+    protected PDEncryption getEncryption() throws IOException
     {
         if (document == null)
         {
@@ -2541,7 +1806,7 @@ public class COSParser extends BaseParse
      *
      * @throws IOException If there is an error getting the document.
      */
-    public AccessPermission getAccessPermission() throws IOException
+    protected AccessPermission getAccessPermission() throws IOException
     {
         if (document == null)
         {
@@ -2557,7 +1822,7 @@ public class COSParser extends BaseParse
      * @throws InvalidPasswordException If the password is incorrect.
      * @throws IOException if something went wrong
      */
-    private void prepareDecryption() throws IOException
+    protected void prepareDecryption() throws IOException
     {
         if (encryption != null)
         {

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/FDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/FDFParser.java?rev=1900449&r1=1900448&r2=1900449&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/FDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/FDFParser.java Sun May  1 11:39:05 2022
@@ -44,18 +44,6 @@ public class FDFParser extends COSParser
         init();
     }
 
-    /**
-     * Tell if the dictionary is a FDF catalog.
-     *
-     * @param dictionary
-     * @return true if the dictionary is a FDF catalog
-     */
-    @Override
-    protected final boolean isCatalog(COSDictionary dictionary)
-    {
-        return dictionary.containsKey(COSName.FDF);
-    }
-
     private void init()
     {
         String eofLookupRangeStr = System.getProperty(SYSPROP_EOFLOOKUPRANGE);