You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2014/05/25 16:58:51 UTC
svn commit: r1597435 - in /pdfbox/branches/1.8/pdfbox/src: main/java/org/apache/pdfbox/pdfparser/ test/java/org/apache/pdfbox/

Author: tilman
Date: Sun May 25 14:58:51 2014
New Revision: 1597435

URL: http://svn.apache.org/r1597435
Log:
PDFBOX-2079: remove final CR LF or LF from output of readUntilEndStream()

Added:
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java   (with props)
Modified:
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
    pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java

Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1597435&r1=1597434&r2=1597435&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sun May 25 14:58:51 2014
@@ -503,7 +503,7 @@ public abstract class BaseParser
             {
                 // Couldn't determine length from dict: just
                 // scan until we find endstream:
-                readUntilEndStream( out );
+                readUntilEndStream(new EndstreamOutputStream(out));
             }
             else
             {
@@ -583,7 +583,7 @@ public abstract class BaseParser
                       	IOUtils.closeQuietly(out);
                         out = stream.createFilteredStream( streamLength );
                         // scan until we find endstream:
-                        readUntilEndStream( out );
+                        readUntilEndStream(new EndstreamOutputStream(out));
                     }
                 }
             }
@@ -621,7 +621,7 @@ public abstract class BaseParser
                      * If for some reason we get something else here, Read until we find the next
                      * "endstream"
                      */
-                    readUntilEndStream( out );
+                    readUntilEndStream(new EndstreamOutputStream(out));
                     endStream = readString();
                     if( !endStream.equals( ENDSTREAM_STRING ) )
                     {
@@ -747,6 +747,8 @@ public abstract class BaseParser
             }
             
         }  // while
+        
+        out.flush(); // this writes a lonely CR or drops trailing CR LF and LF
     }
     
     /**

Added: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java?rev=1597435&view=auto
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java (added)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java Sun May 25 14:58:51 2014
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pdfbox.pdfparser;
+
+import java.io.BufferedOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+
+/**
+ * This class is only for the readUntilEndStream methods, to prevent a
+ * final CR LF or LF (but not a final CR!) from being written to the output.
+ * Because of that, only the 3-param write() method is implemented. This solves
+ * PDFBOX-2079 and avoids making readUntilEndStream() even more complex than it
+ * already is.
+ *
+ * @author Tilman Hausherr
+ */
+class EndstreamOutputStream extends BufferedOutputStream
+{
+    //TODO: replace this class with a PullBackOutputStream class if there ever is one
+    
+    private boolean hasCR = false;
+    private boolean hasLF = false;
+    
+    public EndstreamOutputStream(OutputStream out)
+    {
+        super(out);
+    }
+
+    /**
+     * Write CR and/or LF that were kept, then writes len bytes from the 
+     * specified byte array starting at offset off to this output stream,
+     * except trailing CR, CR LF, or LF.
+     * @param b byte array.
+     * @param off offset.
+     * @param len length of segment to write.
+     * @throws IOException 
+     */
+    @Override
+    public void write(byte[] b, int off, int len) throws IOException
+    {
+        // first write what we kept last time
+        if (hasCR)
+        {
+            if (!hasLF && len == 1 && b[off] == '\n')
+            {
+                // previous buffer ended with CR
+                // actual buffer contains only LF so it will be the last one
+                // => we're done
+                hasCR = false; // to avoid this getting written in the flush
+                return;                    
+            }
+            super.write('\r');
+            hasCR = false;
+        }
+        if (hasLF)
+        {
+            super.write('\n');
+            hasLF = false;
+        } 
+        // don't write CR, LF, or CR LF if at the end of the buffer
+        if (len > 0)
+        {
+            if (b[off + len - 1] == '\r')
+            {
+                hasCR = true;
+                --len;
+            }
+            else if (b[off + len - 1] == '\n')
+            {
+                hasLF = true;
+                --len;
+                if (len > 0 && b[off + len - 1] == '\r')
+                {
+                    hasCR = true;
+                    --len;
+                }
+            }
+        }        
+        super.write(b, off, len);
+    } 
+
+    /**
+     * write out a single CR if one was kept. Don't write kept CR LF or LF, 
+     * and then call the base method to flush.
+     * 
+     * @throws IOException 
+     */
+    @Override
+    public void flush() throws IOException
+    {
+        // if there is only a CR and no LF, write it
+        if (hasCR && !hasLF)
+        {
+            super.write('\r');
+        }
+        hasCR = false;
+        hasLF = false;
+        super.flush();
+    }
+}

Propchange: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1597435&r1=1597434&r2=1597435&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Sun May 25 14:58:51 2014
@@ -1604,7 +1604,7 @@ public class NonSequentialPDFParser exte
             if (useReadUntilEnd)
             {
                 out = stream.createFilteredStream(streamLengthObj);
-                readUntilEndStream(out);
+                readUntilEndStream(new EndstreamOutputStream(out));
             }
             String endStream = readString();
             if (!endStream.equals("endstream"))
@@ -1646,9 +1646,9 @@ public class NonSequentialPDFParser exte
     	}
     	return streamLengthIsValid;
     }
+    
     private void readUntilEndStream(final OutputStream out) throws IOException
     {
-
         int bufSize;
         int charMatchCount = 0;
         byte[] keyw = ENDSTREAM;
@@ -1660,7 +1660,7 @@ public class NonSequentialPDFParser exte
         // beginning of buffer
         while ((bufSize = pdfSource.read(streamCopyBuf, charMatchCount, streamCopyBufLen - charMatchCount)) > 0)
         {
-        	// number of already matching chars
+            // number of already matching chars
             int startingMatchCount = charMatchCount;
             int bIdx = charMatchCount;
             int quickTestIdx;
@@ -1693,7 +1693,6 @@ public class NonSequentialPDFParser exte
 
                 final byte ch = streamCopyBuf[bIdx]; // could be negative - but
                                                      // we only compare to ASCII
-
                 if (ch == keyw[charMatchCount])
                 {
                     if (++charMatchCount == keyw.length)
@@ -1710,7 +1709,6 @@ public class NonSequentialPDFParser exte
                         // maybe ENDSTREAM is missing but we could have ENDOBJ
                         keyw = ENDOBJ;
                         charMatchCount++;
-
                     }
                     else
                     {
@@ -1742,17 +1740,17 @@ public class NonSequentialPDFParser exte
             {
                 // keyword matched; 
             	// unread matched keyword (endstream/endobj) and following buffered content
-           		pdfSource.unread(streamCopyBuf, contentBytes, bufSize - contentBytes - keyw.length + startingMatchCount);
+       		pdfSource.unread(streamCopyBuf, contentBytes, bufSize - contentBytes - keyw.length + startingMatchCount);
                 break;
-
             }
             else
             {
                 // copy matched chars at start of buffer
                 System.arraycopy(keyw, 0, streamCopyBuf, 0, charMatchCount);
             }
-
         } // while
+
+        out.flush(); // this writes a lonely CR or drops trailing CR LF and LF
     }
     
     /**

Modified: pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java?rev=1597435&r1=1597434&r2=1597435&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java Sun May 25 14:58:51 2014
@@ -29,6 +29,7 @@ import org.apache.pdfbox.io.TestRandomAc
 import org.apache.pdfbox.io.TestRandomAccessFileOutputStream;
 import org.apache.pdfbox.io.ccitt.TestCCITTFaxG31DDecodeInputStream;
 import org.apache.pdfbox.io.ccitt.TestPackedBitArray;
+import org.apache.pdfbox.pdfparser.EndstreamOutputStreamTest;
 import org.apache.pdfbox.pdmodel.TestFDF;
 import org.apache.pdfbox.pdmodel.TestPDDocument;
 import org.apache.pdfbox.pdmodel.TestPDDocumentCatalog;
@@ -125,6 +126,9 @@ public class TestAll extends TestCase
         suite.addTestSuite(TestPDFText2HTML.class);
         suite.addTestSuite(PDColorStateTest.class);
 
+        suite.addTestSuite(EndstreamOutputStreamTest.class);
+        
+
         return suite;
     }
 }