You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2014/06/09 16:27:16 UTC

svn commit: r1601392 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java

Author: tilman
Date: Mon Jun  9 14:27:16 2014
New Revision: 1601392

URL: http://svn.apache.org/r1601392
Log:
PDFBOX-2120: Don't filter if ASCII stream, i.e. keep a final CR LF or LF

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java?rev=1601392&r1=1601391&r2=1601392&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java Mon Jun  9 14:27:16 2014
@@ -22,10 +22,11 @@ import java.io.OutputStream;
 
 /**
  * This class is only for the readUntilEndStream methods, to prevent a
- * final CR LF or LF (but not a final CR!) from being written to the output.
- * Because of that, only the 3-param write() method is implemented. This solves
- * PDFBOX-2079 and avoids making readUntilEndStream() even more complex than it
- * already is.
+ * final CR LF or LF (but not a final CR!) from being written to the output,
+ * unless the beginning of the stream is assumed to be ASCII.
+ * Only the 3-param write() method is implemented. This solves
+ * PDFBOX-2079 and PDFBOX-2120 and avoids making readUntilEndStream() 
+ * even more complex than it already is.
  *
  * @author Tilman Hausherr
  */
@@ -35,7 +36,9 @@ class EndstreamOutputStream extends Buff
     
     private boolean hasCR = false;
     private boolean hasLF = false;
-    
+    private int pos = 0;
+    private boolean mustFilter = true;
+
     public EndstreamOutputStream(OutputStream out)
     {
         super(out);
@@ -44,7 +47,8 @@ class EndstreamOutputStream extends Buff
     /**
      * Writes CR and/or LF that were kept, then writes len bytes from the 
      * specified byte array starting at offset off to this output stream,
-     * except trailing CR, CR LF, or LF.
+     * except trailing CR, CR LF, or LF. No filtering will be done for the
+     * entire stream if the beginning is assumed to be ASCII.
      * @param b byte array.
      * @param off offset.
      * @param len length of segment to write.
@@ -53,46 +57,65 @@ class EndstreamOutputStream extends Buff
     @Override
     public void write(byte[] b, int off, int len) throws IOException
     {
-        // first write what we kept last time
-        if (hasCR)
+        if (pos == 0 && len > 10)
         {
-            if (!hasLF && len == 1 && b[off] == '\n')
+            // PDFBOX-2120 Don't filter if ASCII, i.e. keep a final CR LF or LF
+            mustFilter = false;
+            for (int i = 0; i < 10; ++i)
             {
-                // previous buffer ended with CR
-                // actual buffer contains only LF so it will be the last one
-                // => we're done
-                hasCR = false; // to avoid this getting written in the flush
-                return;                    
+                // Heuristic approach, taken from PDFStreamParser, PDFBOX-1164
+                if ((b[i] < 0x09) || ((b[i] > 0x0a) && (b[i] < 0x20) && (b[i] != 0x0d)))
+                {
+                    // control character or > 0x7f -> we have binary data
+                    mustFilter = true;
+                    break;
+                }
             }
-            super.write('\r');
-            hasCR = false;
         }
-        if (hasLF)
-        {
-            super.write('\n');
-            hasLF = false;
-        } 
-        // don't write CR, LF, or CR LF if at the end of the buffer
-        if (len > 0)
+        if (mustFilter)
         {
-            if (b[off + len - 1] == '\r')
+            // first write what we kept last time
+            if (hasCR)
             {
-                hasCR = true;
-                --len;
+                if (!hasLF && len == 1 && b[off] == '\n')
+                {
+                    // previous buffer ended with CR
+                    // actual buffer contains only LF so it will be the last one
+                    // => we're done
+                    hasCR = false; // to avoid this getting written in the flush
+                    return;
+                }
+                super.write('\r');
+                hasCR = false;
             }
-            else if (b[off + len - 1] == '\n')
+            if (hasLF)
             {
-                hasLF = true;
-                --len;
-                if (len > 0 && b[off + len - 1] == '\r')
+                super.write('\n');
+                hasLF = false;
+            }
+            // don't write CR, LF, or CR LF if at the end of the buffer
+            if (len > 0)
+            {
+                if (b[off + len - 1] == '\r')
                 {
                     hasCR = true;
                     --len;
                 }
+                else if (b[off + len - 1] == '\n')
+                {
+                    hasLF = true;
+                    --len;
+                    if (len > 0 && b[off + len - 1] == '\r')
+                    {
+                        hasCR = true;
+                        --len;
+                    }
+                }
             }
-        }        
+        }
         super.write(b, off, len);
-    } 
+        pos += len;
+    }
 
     /**
      * write out a single CR if one was kept. Don't write kept CR LF or LF, 
@@ -107,6 +130,7 @@ class EndstreamOutputStream extends Buff
         if (hasCR && !hasLF)
         {
             super.write('\r');
+            ++pos;
         }
         hasCR = false;
         hasLF = false;