You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2014/10/19 13:58:40 UTC

svn commit: r1632895 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java

Author: lehmi
Date: Sun Oct 19 11:58:39 2014
New Revision: 1632895

URL: http://svn.apache.org/r1632895
Log:
PDFBOX-2250: optimized the xref repair mechanism, lower the minimum start offset

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1632895&r1=1632894&r2=1632895&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Sun Oct 19 11:58:39 2014
@@ -77,8 +77,10 @@ import org.apache.pdfbox.persistence.uti
  */
 public class NonSequentialPDFParser extends PDFParser
 {
-    private static final byte[] XREF = new byte[] { 'x', 'r', 'e', 'f' };
-
+    private static final byte[] XREF_TABLE = new byte[] { 'x', 'r', 'e', 'f' };
+    private static final byte[] XREF_STREAM = new byte[] { '/','X', 'R', 'e', 'f' };
+    private static final long MINIMUM_SEARCH_OFFSET = 6;
+    
     private static final int X = 'x';
 
     public static final String SYSPROP_PARSEMINIMAL = "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.parseMinimal";
@@ -369,7 +371,11 @@ public class NonSequentialPDFParser exte
         // check the startxref offset
         if (isLenient)
         {
-            startXrefOffset -= calculateXRefFixingOffset(startXrefOffset);
+            long fixedOffset = checkXRefOffset(startXrefOffset);
+            if (fixedOffset > -1)
+            {
+            	startXrefOffset = fixedOffset;
+            }
             document.setStartXref(startXrefOffset);
         }
         long prev = startXrefOffset;
@@ -419,10 +425,10 @@ public class NonSequentialPDFParser exte
                 if (isLenient && prev > -1)
                 {
                     // check the xref table reference
-                    long fixingOffset = calculateXRefFixingOffset(prev);
-                    if (fixingOffset != 0)
+                    long fixedOffset = checkXRefOffset(prev);
+                    if (fixedOffset > -1 && fixedOffset != prev)
                     {
-                        prev -= fixingOffset;
+                        prev = fixedOffset;
                         trailer.setLong(COSName.PREV, prev);
                     }
                 }
@@ -434,10 +440,10 @@ public class NonSequentialPDFParser exte
                 if (isLenient && prev > -1)
                 {
                     // check the xref table reference
-                    long fixingOffset = calculateXRefFixingOffset(prev);
-                    if (fixingOffset != 0)
+                    long fixedOffset = checkXRefOffset(prev);
+                    if (fixedOffset > -1 && fixedOffset != prev)
                     {
-                        prev -= fixingOffset;
+                        prev = fixedOffset;
                         COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer();
                         trailer.setLong(COSName.PREV, prev);
                     }
@@ -1671,15 +1677,15 @@ public class NonSequentialPDFParser exte
      * Check if the cross reference table/stream can be found at the current offset.
      * 
      * @param startXRefOffset
-     * @return the calculated offset
+     * @return the revised offset
      * @throws IOException
      */
-    private long calculateXRefFixingOffset(long startXRefOffset) throws IOException
+    private long checkXRefOffset(long startXRefOffset) throws IOException
     {
         setPdfSource(startXRefOffset);
-        if (pdfSource.peek() == X && calculateFixingOffset(startXRefOffset, XREF) == 0)
+        if (pdfSource.peek() == X && checkBytesAtOffset(XREF_TABLE))
         {
-            return 0;
+            return startXRefOffset;
         }
         int nextValue = pdfSource.peek();
         // maybe there isn't a xref table but a xref stream
@@ -1693,7 +1699,7 @@ public class NonSequentialPDFParser exte
                 readGenerationNumber();
                 readPattern(OBJ_MARKER);
                 setPdfSource(startXRefOffset);
-                return 0;
+                return startXRefOffset;
             }
             catch (IOException exception)
             {
@@ -1702,8 +1708,8 @@ public class NonSequentialPDFParser exte
                 pdfSource.seek(startXRefOffset);
             }
         }
-        // TODO try to repair XRef streams
-        return calculateFixingOffset(startXRefOffset, XREF);
+        // try to find a fixed offset
+        return calculateXRefFixedOffset(startXRefOffset);
     }
 
     /**
@@ -1740,39 +1746,28 @@ public class NonSequentialPDFParser exte
     }
 
     /**
-     * Check if the given bytes can be found at the given offset. The method seeks 200 bytes backward/forward if the
-     * given string can't be found at the given offset and returns the difference of the new offset to the origin one.
+     * Try to find a fixed offset for the given xref table/stream.
      * 
      * @param objectOffset the given offset where to look at
-     * @param string the bytes to look for
-     * @return the difference to the origin one
+     * @return the fixed offset
+     * 
      * @throws IOException if something went wrong
      */
-    private long calculateFixingOffset(long objectOffset, byte[] string) throws IOException
+    private long calculateXRefFixedOffset(long objectOffset) throws IOException
     {
         if (objectOffset < 0)
         {
-            LOG.error("Invalid object offset " + objectOffset + " for object " + new String(string));
-            return 0;
-        }
-        long originOffset = pdfSource.getOffset();
-        pdfSource.seek(objectOffset);
-        // most likely the object can be found at the given offset
-        if (checkBytesAtOffset(string))
-        {
-            pdfSource.seek(originOffset);
+            LOG.error("Invalid object offset " + objectOffset + " when searching for a xref table/stream");
             return 0;
         }
-        // the offset seems to be wrong -> seek to find the object we are looking for
-        long newOffset = bfSearchXRef(objectOffset);
+        // start a brute force search for all xref tables and try to find the offset we are looking for
+        long newOffset = bfSearchForXRef(objectOffset);
         if (newOffset > -1)
         {
-            LOG.debug("Fixed reference for object " + new String(string) + " "
-                    + objectOffset + " -> " + (objectOffset - newOffset));
-            return objectOffset - newOffset;
+            LOG.debug("Fixed reference for xref table/stream " + objectOffset + " -> " + newOffset);
+            return newOffset;
         }
-        pdfSource.seek(originOffset);
-        LOG.error("Can't find the object " + new String(string) + " at offset " + objectOffset);
+        LOG.error("Can't find the object axref table/stream at offset " + objectOffset);
         return 0;
     }
 
@@ -1805,6 +1800,11 @@ public class NonSequentialPDFParser exte
                             LOG.debug("Fixed reference for object " + objectNr + " " + objectGen
                                     + " " + objectOffset + " -> " + newOffset);
                         }
+                        else
+                        {
+                            LOG.error("Can't find the object " + objectNr + " " + objectGen
+                                    + " (origin offset " + objectOffset + ")");
+                        }
                     }
                 }
             }
@@ -1870,7 +1870,7 @@ public class NonSequentialPDFParser exte
         {
             bfSearchObjectOffsets = new HashMap<String, Long>();
             long originOffset = pdfSource.getOffset();
-            int currentOffset = 15;
+            long currentOffset = MINIMUM_SEARCH_OFFSET;
             String objString = " obj";
             byte[] string = objString.getBytes("ISO-8859-1");
             do
@@ -1891,7 +1891,7 @@ public class NonSequentialPDFParser exte
                         {
                             int length = 0;
                             pdfSource.seek(--tempOffset);
-                            while (tempOffset > 14 && pdfSource.peek() > 47
+                            while (tempOffset > MINIMUM_SEARCH_OFFSET && pdfSource.peek() > 47
                                     && pdfSource.peek() < 58)
                             {
                                 pdfSource.seek(--tempOffset);
@@ -1929,37 +1929,15 @@ public class NonSequentialPDFParser exte
     }
 
     /**
-     * Brute force search for the xref entry.
+     * Search for the offset of the given xref table/stream among those found by a brute force search.
      * 
      * @return the offset of the xref entry
      * @throws IOException if something went wrong
      */
-    private long bfSearchXRef(long xrefOffset) throws IOException
+    private long bfSearchForXRef(long xrefOffset) throws IOException
     {
-        // a pdf may contain more than one xref entry
-    	if (bfSearchXRefOffsets == null)
-    	{
-    		bfSearchXRefOffsets = new Vector<Long>();
-    		long originOffset = pdfSource.getOffset();
-	        pdfSource.seek(15);
-	        while(!pdfSource.isEOF())
-	        {
-	            if (checkBytesAtOffset(XREF))
-	            {
-	                long newOffset = pdfSource.getOffset(); 
-	                pdfSource.seek(newOffset-1);
-	                // ensure that we don't read "startxref" instead of "xref"
-	                if (isWhitespace())
-	                {
-	                    bfSearchXRefOffsets.add(newOffset);
-	                }
-	                pdfSource.seek(newOffset+4);
-	            }
-	            pdfSource.read();
-	        }
-	        pdfSource.seek(originOffset);
-    	}
     	long newOffset = -1;
+    	bfSearchForXRefs();
     	if (bfSearchXRefOffsets != null)
     	{
 	    	long currentDifference = -1;
@@ -1984,4 +1962,106 @@ public class NonSequentialPDFParser exte
     	}
         return newOffset;
     }
+
+    /**
+     * Brute force search for all xref entries.
+     * 
+     * @throws IOException if something went wrong
+     */
+    private void bfSearchForXRefs() throws IOException
+    {
+    	if (bfSearchXRefOffsets == null)
+    	{
+            // a pdf may contain more than one xref entry
+    		bfSearchXRefOffsets = new Vector<Long>();
+    		long originOffset = pdfSource.getOffset();
+	        pdfSource.seek(MINIMUM_SEARCH_OFFSET);
+	        // search for xref tables
+	        while(!pdfSource.isEOF())
+	        {
+	            if (checkBytesAtOffset(XREF_TABLE))
+	            {
+	                long newOffset = pdfSource.getOffset(); 
+	                pdfSource.seek(newOffset-1);
+	                // ensure that we don't read "startxref" instead of "xref"
+	                if (isWhitespace())
+	                {
+	                    bfSearchXRefOffsets.add(newOffset);
+	                }
+	                pdfSource.seek(newOffset+4);
+	            }
+	            pdfSource.read();
+	        }
+	        pdfSource.seek(MINIMUM_SEARCH_OFFSET);
+	        // search for XRef streams
+            String objString = " obj";
+            byte[] string = objString.getBytes("ISO-8859-1");
+	        while(!pdfSource.isEOF())
+	        {
+	            if (checkBytesAtOffset(XREF_STREAM))
+	            {
+	            	// search backwards for the beginning of the stream
+	                long newOffset = -1;
+	            	long xrefOffset = pdfSource.getOffset();
+	            	long currentOffset = xrefOffset;
+	            	boolean objFound = false;
+	            	for (int i=1; i<30 && !objFound;i++)
+	            	{
+	            		currentOffset = xrefOffset - (i*10);
+	            		if (currentOffset > 0)
+	            		{
+	        	    		pdfSource.seek(currentOffset);
+	        	    		for (int j=0; j<10;j++)
+	        	    		{
+	        	    			if (checkBytesAtOffset(string))
+	        	    			{
+	        	                    long tempOffset = currentOffset - 1;
+	        	                    pdfSource.seek(tempOffset);
+	        	                    int genID = pdfSource.peek();
+	        	                    // is the next char a digit?
+	        	                    if (genID > 47 && genID < 58)
+	        	                    {
+	        	                        genID -= 48;
+	        	                        tempOffset--;
+	        	                        pdfSource.seek(tempOffset);
+	        	                        if (pdfSource.peek() == 32)
+	        	                        {
+	        	                            int length = 0;
+	        	                            pdfSource.seek(--tempOffset);
+	        	                            while (tempOffset > MINIMUM_SEARCH_OFFSET && pdfSource.peek() > 47
+	        	                                    && pdfSource.peek() < 58)
+	        	                            {
+	        	                                pdfSource.seek(--tempOffset);
+	        	                                length++;
+	        	                            }
+	        	                            if (length > 0)
+	        	                            {
+	        	                                pdfSource.read();
+	        	        		            	newOffset = pdfSource.getOffset();
+	        	                            }
+	        	                        }
+	        	                    }
+	        						LOG.debug("Fixed reference for xref stream "+xrefOffset + " -> "+newOffset);
+	        		            	objFound = true;
+	        		            	break;
+	        	    			}
+	        	    			else
+	        	    			{
+	        	    				currentOffset++;
+	        	    				pdfSource.read();
+	        	    			}
+	        	    		}
+	            		}
+	            	}
+	            	if (newOffset > -1)
+	            	{
+		                bfSearchXRefOffsets.add(newOffset);
+	            	}
+    				pdfSource.seek(xrefOffset+5);
+	            }
+	            pdfSource.read();
+	        }
+	        pdfSource.seek(originOffset);
+    	}
+    }
 }