You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2015/02/22 14:10:25 UTC

svn commit: r1661474 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java

Author: lehmi
Date: Sun Feb 22 13:10:24 2015
New Revision: 1661474

URL: http://svn.apache.org/r1661474
Log:
PDFBOX-2527: splitted up the brute force search for xref tables and xref streams

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1661474&r1=1661473&r2=1661474&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Sun Feb 22 13:10:24 2015
@@ -131,7 +131,8 @@ public class COSParser extends BaseParse
      */
     private Map<String, Long> bfSearchObjectOffsets = null;
     private Map<COSObjectKey, Long> bfSearchCOSObjectKeyOffsets = null;
-    private List<Long> bfSearchXRefOffsets = null;
+    private List<Long> bfSearchXRefTablesOffsets = null;
+    private List<Long> bfSearchXRefStreamsOffsets = null;
 
     /**
      * The security handler.
@@ -270,7 +271,7 @@ public class COSParser extends BaseParse
                 {
                     int streamOffset = trailer.getInt(COSName.XREF_STM);
                     // check the xref stream reference
-                    fixedOffset = checkXRefOffset(streamOffset);
+                    fixedOffset = checkXRefStreamOffset(streamOffset);
                     if (fixedOffset > -1 && fixedOffset != streamOffset)
                     {
                         streamOffset = (int)fixedOffset;
@@ -1070,24 +1071,47 @@ public class COSParser extends BaseParse
         {
             return startXRefOffset;
         }
-        pdfSource.seek(startXRefOffset-1);
-        // save the previous character
-        int previous = pdfSource.read();
+        pdfSource.seek(startXRefOffset);
         if (pdfSource.peek() == X && isString(XREF_TABLE))
         {
             return startXRefOffset;
         }
-        // the previous character has to be a whitespace
-        if (isWhitespace(previous))
+        long fixedOffset = checkXRefStreamOffset(startXRefOffset);
+        if (fixedOffset > -1)
         {
-            int nextValue = pdfSource.peek();
-            // maybe there isn't a xref table but a xref stream
+        	return fixedOffset;
+        }
+        // try to find a fixed offset
+        return calculateXRefFixedOffset(startXRefOffset, false);
+    }
+
+    /**
+     * Check if the cross reference stream can be found at the current offset.
+     * 
+     * @param startXRefOffset
+     * @return the revised offset
+     * @throws IOException
+     */
+    private long checkXRefStreamOffset(long startXRefOffset) throws IOException
+    {
+        // repair mode isn't available in non-lenient mode
+        if (!isLenient)
+        {
+            return startXRefOffset;
+        }
+        // seek to offset-1 
+        pdfSource.seek(startXRefOffset-1);
+        int nextValue = pdfSource.read();
+        // the first character has to be a whitespace
+        if (isWhitespace(nextValue))
+        {
+        	nextValue = pdfSource.peek();
             // is the next character a digit?
             if (nextValue > 47 && nextValue < 58)
             {
                 try
                 {
-                    // Maybe it's a XRef stream
+                    // it's a XRef stream
                     readObjectNumber();
                     readGenerationNumber();
                     readExpectedString(OBJ_MARKER, true);
@@ -1103,18 +1127,18 @@ public class COSParser extends BaseParse
             }
         }
         // try to find a fixed offset
-        return calculateXRefFixedOffset(startXRefOffset);
+        return calculateXRefFixedOffset(startXRefOffset, true);
     }
-
     /**
      * Try to find a fixed offset for the given xref table/stream.
      * 
      * @param objectOffset the given offset where to look at
+     * @param streamsOnly search for xref streams only
      * @return the fixed offset
      * 
      * @throws IOException if something went wrong
      */
-    private long calculateXRefFixedOffset(long objectOffset) throws IOException
+    private long calculateXRefFixedOffset(long objectOffset, boolean streamsOnly) throws IOException
     {
         if (objectOffset < 0)
         {
@@ -1122,7 +1146,7 @@ public class COSParser extends BaseParse
             return 0;
         }
         // start a brute force search for all xref tables and try to find the offset we are looking for
-        long newOffset = bfSearchForXRef(objectOffset);
+        long newOffset = bfSearchForXRef(objectOffset, streamsOnly);
         if (newOffset > -1)
         {
             LOG.debug("Fixed reference for xref table/stream " + objectOffset + " -> " + newOffset);
@@ -1303,50 +1327,94 @@ public class COSParser extends BaseParse
     /**
      * Search for the offset of the given xref table/stream among those found by a brute force search.
      * 
+     * @param streamsOnly search for xref streams only
      * @return the offset of the xref entry
      * @throws IOException if something went wrong
      */
-    private long bfSearchForXRef(long xrefOffset) throws IOException
+    private long bfSearchForXRef(long xrefOffset, boolean streamsOnly) throws IOException
     {
         long newOffset = -1;
-        bfSearchForXRefs();
-        if (bfSearchXRefOffsets != null)
+        long newOffsetTable = -1;
+        long newOffsetStream = -1;
+        if (!streamsOnly)
+        {
+        	bfSearchForXRefTables();
+        }
+        bfSearchForXRefStreams();
+        if (!streamsOnly && bfSearchXRefTablesOffsets != null)
         {
-            long currentDifference = -1;
-            int currentOffsetIndex = -1;
-            int numberOfOffsets = bfSearchXRefOffsets.size();
-            // find the most likely value
             // TODO to be optimized, this won't work in every case
-            for (int i = 0; i < numberOfOffsets; i++)
-            {
-                long newDifference = xrefOffset - bfSearchXRefOffsets.get(i);
-                // find the nearest offset
-                if (currentDifference == -1
-                        || (Math.abs(currentDifference) > Math.abs(newDifference)))
-                {
-                    currentDifference = newDifference;
-                    currentOffsetIndex = i;
-                }
-            }
-            if (currentOffsetIndex > -1)
-            {
-                newOffset = bfSearchXRefOffsets.remove(currentOffsetIndex);
-            }
+        	newOffsetTable = searchNearestValue(bfSearchXRefTablesOffsets, xrefOffset);
+        }
+        if (bfSearchXRefStreamsOffsets != null)
+        {
+            // TODO to be optimized, this won't work in every case
+        	newOffsetStream = searchNearestValue(bfSearchXRefStreamsOffsets, xrefOffset);
+        }
+        // choose the nearest value
+        if (newOffsetTable > -1 && newOffsetStream > -1)
+        {
+            long differenceTable = xrefOffset - newOffsetTable;
+            long differenceStream = xrefOffset - newOffsetStream;
+        	if (Math.abs(differenceTable) > Math.abs(differenceStream))
+        	{
+        		newOffset = differenceStream;
+        		bfSearchXRefStreamsOffsets.remove(newOffsetStream);
+        	}
+        	else
+        	{
+        		newOffset = differenceTable;
+        		bfSearchXRefTablesOffsets.remove(newOffsetTable);
+        	}
+        }
+        else if (newOffsetTable > -1)
+        {
+        	newOffset = newOffsetTable;
+    		bfSearchXRefTablesOffsets.remove(newOffsetTable);
+        }
+        else if (newOffsetStream > -1)
+        {
+        	newOffset = newOffsetStream;
+    		bfSearchXRefStreamsOffsets.remove(newOffsetStream);
         }
         return newOffset;
     }
 
+    private long searchNearestValue(List<Long> values, long offset)
+    {
+    	long newValue = -1;
+        long currentDifference = -1;
+        int currentOffsetIndex = -1;
+        int numberOfOffsets = values.size();
+        // find the nearest value
+        for (int i = 0; i < numberOfOffsets; i++)
+        {
+            long newDifference = offset - values.get(i);
+            // find the nearest offset
+            if (currentDifference == -1
+                    || (Math.abs(currentDifference) > Math.abs(newDifference)))
+            {
+                currentDifference = newDifference;
+                currentOffsetIndex = i;
+            }
+        }
+        if (currentOffsetIndex > -1)
+        {
+        	newValue = values.get(currentOffsetIndex);
+        }
+        return newValue;
+    }
     /**
-     * Brute force search for all xref entries.
+     * Brute force search for all xref entries (tables).
      * 
      * @throws IOException if something went wrong
      */
-    private void bfSearchForXRefs() throws IOException
+    private void bfSearchForXRefTables() throws IOException
     {
-        if (bfSearchXRefOffsets == null)
+        if (bfSearchXRefTablesOffsets == null)
         {
             // a pdf may contain more than one xref entry
-            bfSearchXRefOffsets = new Vector<Long>();
+        	bfSearchXRefTablesOffsets = new Vector<Long>();
             long originOffset = pdfSource.getOffset();
             pdfSource.seek(MINIMUM_SEARCH_OFFSET);
             // search for xref tables
@@ -1359,12 +1427,28 @@ public class COSParser extends BaseParse
                     // ensure that we don't read "startxref" instead of "xref"
                     if (isWhitespace())
                     {
-                        bfSearchXRefOffsets.add(newOffset);
+                    	bfSearchXRefTablesOffsets.add(newOffset);
                     }
                     pdfSource.seek(newOffset + 4);
                 }
                 pdfSource.read();
             }
+            pdfSource.seek(originOffset);
+        }
+    }
+
+    /**
+     * Brute force search for all /XRef entries (streams).
+     * 
+     * @throws IOException if something went wrong
+     */
+    private void bfSearchForXRefStreams() throws IOException
+    {
+        if (bfSearchXRefStreamsOffsets == null)
+        {
+            // a pdf may contain more than one /XRef entry
+        	bfSearchXRefStreamsOffsets = new Vector<Long>();
+            long originOffset = pdfSource.getOffset();
             pdfSource.seek(MINIMUM_SEARCH_OFFSET);
             // search for XRef streams
             String objString = " obj";
@@ -1427,7 +1511,7 @@ public class COSParser extends BaseParse
                     }
                     if (newOffset > -1)
                     {
-                        bfSearchXRefOffsets.add(newOffset);
+                    	bfSearchXRefStreamsOffsets.add(newOffset);
                     }
                     pdfSource.seek(xrefOffset + 5);
                 }
@@ -1436,7 +1520,6 @@ public class COSParser extends BaseParse
             pdfSource.seek(originOffset);
         }
     }
-
     /**
      * This will parse the startxref section from the stream.
      * The startxref value is ignored.