You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2014/05/25 16:58:51 UTC
svn commit: r1597435 - in /pdfbox/branches/1.8/pdfbox/src:
main/java/org/apache/pdfbox/pdfparser/ test/java/org/apache/pdfbox/
Author: tilman
Date: Sun May 25 14:58:51 2014
New Revision: 1597435
URL: http://svn.apache.org/r1597435
Log:
PDFBOX-2079: remove final CR LF or LF from output of readUntilEndStream()
Added:
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java (with props)
Modified:
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java
Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1597435&r1=1597434&r2=1597435&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sun May 25 14:58:51 2014
@@ -503,7 +503,7 @@ public abstract class BaseParser
{
// Couldn't determine length from dict: just
// scan until we find endstream:
- readUntilEndStream( out );
+ readUntilEndStream(new EndstreamOutputStream(out));
}
else
{
@@ -583,7 +583,7 @@ public abstract class BaseParser
IOUtils.closeQuietly(out);
out = stream.createFilteredStream( streamLength );
// scan until we find endstream:
- readUntilEndStream( out );
+ readUntilEndStream(new EndstreamOutputStream(out));
}
}
}
@@ -621,7 +621,7 @@ public abstract class BaseParser
* If for some reason we get something else here, Read until we find the next
* "endstream"
*/
- readUntilEndStream( out );
+ readUntilEndStream(new EndstreamOutputStream(out));
endStream = readString();
if( !endStream.equals( ENDSTREAM_STRING ) )
{
@@ -747,6 +747,8 @@ public abstract class BaseParser
}
} // while
+
+ out.flush(); // this writes a lonely CR or drops trailing CR LF and LF
}
/**
Added: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java?rev=1597435&view=auto
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java (added)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java Sun May 25 14:58:51 2014
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pdfbox.pdfparser;
+
+import java.io.BufferedOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+
+/**
+ * Output stream wrapper used only by the readUntilEndStream methods to keep a
+ * final CR LF or LF (but not a final lone CR!) from reaching the output.
+ * Only the 3-param write() and flush() are overridden, so data must be written
+ * through write(byte[], int, int). This solves PDFBOX-2079 without making
+ * readUntilEndStream() even more complex than it already is.
+ *
+ * @author Tilman Hausherr
+ */
+class EndstreamOutputStream extends BufferedOutputStream
+{
+ //TODO: replace this class with a PullBackOutputStream class if there ever is one
+
+ private boolean hasCR = false;
+ private boolean hasLF = false;
+
+ public EndstreamOutputStream(OutputStream out)
+ {
+ super(out);
+ }
+
+ /**
+ * Writes any CR and/or LF held back by the previous call, then writes len
+ * bytes from b starting at offset off, holding back a trailing CR, CR LF,
+ * or LF until the next write() or flush() decides whether to emit it.
+ * @param b byte array.
+ * @param off offset of the first byte to write.
+ * @param len length of the segment to write.
+ * @throws IOException if writing to the underlying stream fails.
+ */
+ @Override
+ public void write(byte[] b, int off, int len) throws IOException
+ {
+ // first emit what was held back by the previous call
+ if (hasCR)
+ {
+ if (!hasLF && len == 1 && b[off] == '\n')
+ {
+ // previous buffer ended with a lone CR
+ // current buffer contains only LF, so it is assumed to be the last one
+ // => drop the CR LF pair and stop
+ hasCR = false; // so that flush() does not write the CR
+ return;
+ }
+ super.write('\r');
+ hasCR = false;
+ }
+ if (hasLF)
+ {
+ super.write('\n');
+ hasLF = false;
+ }
+ // hold back a CR, LF, or CR LF found at the end of the buffer
+ if (len > 0)
+ {
+ if (b[off + len - 1] == '\r')
+ {
+ hasCR = true;
+ --len;
+ }
+ else if (b[off + len - 1] == '\n')
+ {
+ hasLF = true;
+ --len;
+ if (len > 0 && b[off + len - 1] == '\r')
+ {
+ hasCR = true;
+ --len;
+ }
+ }
+ }
+ super.write(b, off, len);
+ }
+
+ /**
+ * Writes a held-back lone CR, discards a held-back CR LF or LF, and then
+ * calls the base method to flush.
+ *
+ * @throws IOException if writing to or flushing the underlying stream fails.
+ */
+ @Override
+ public void flush() throws IOException
+ {
+ // if there is only a CR and no LF, write it
+ if (hasCR && !hasLF)
+ {
+ super.write('\r');
+ }
+ hasCR = false;
+ hasLF = false;
+ super.flush();
+ }
+}
Propchange: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1597435&r1=1597434&r2=1597435&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Sun May 25 14:58:51 2014
@@ -1604,7 +1604,7 @@ public class NonSequentialPDFParser exte
if (useReadUntilEnd)
{
out = stream.createFilteredStream(streamLengthObj);
- readUntilEndStream(out);
+ readUntilEndStream(new EndstreamOutputStream(out));
}
String endStream = readString();
if (!endStream.equals("endstream"))
@@ -1646,9 +1646,9 @@ public class NonSequentialPDFParser exte
}
return streamLengthIsValid;
}
+
private void readUntilEndStream(final OutputStream out) throws IOException
{
-
int bufSize;
int charMatchCount = 0;
byte[] keyw = ENDSTREAM;
@@ -1660,7 +1660,7 @@ public class NonSequentialPDFParser exte
// beginning of buffer
while ((bufSize = pdfSource.read(streamCopyBuf, charMatchCount, streamCopyBufLen - charMatchCount)) > 0)
{
- // number of already matching chars
+ // number of already matching chars
int startingMatchCount = charMatchCount;
int bIdx = charMatchCount;
int quickTestIdx;
@@ -1693,7 +1693,6 @@ public class NonSequentialPDFParser exte
final byte ch = streamCopyBuf[bIdx]; // could be negative - but
// we only compare to ASCII
-
if (ch == keyw[charMatchCount])
{
if (++charMatchCount == keyw.length)
@@ -1710,7 +1709,6 @@ public class NonSequentialPDFParser exte
// maybe ENDSTREAM is missing but we could have ENDOBJ
keyw = ENDOBJ;
charMatchCount++;
-
}
else
{
@@ -1742,17 +1740,17 @@ public class NonSequentialPDFParser exte
{
// keyword matched;
// unread matched keyword (endstream/endobj) and following buffered content
- pdfSource.unread(streamCopyBuf, contentBytes, bufSize - contentBytes - keyw.length + startingMatchCount);
+ pdfSource.unread(streamCopyBuf, contentBytes, bufSize - contentBytes - keyw.length + startingMatchCount);
break;
-
}
else
{
// copy matched chars at start of buffer
System.arraycopy(keyw, 0, streamCopyBuf, 0, charMatchCount);
}
-
} // while
+
+ out.flush(); // this writes a lonely CR or drops trailing CR LF and LF
}
/**
Modified: pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java?rev=1597435&r1=1597434&r2=1597435&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java Sun May 25 14:58:51 2014
@@ -29,6 +29,7 @@ import org.apache.pdfbox.io.TestRandomAc
import org.apache.pdfbox.io.TestRandomAccessFileOutputStream;
import org.apache.pdfbox.io.ccitt.TestCCITTFaxG31DDecodeInputStream;
import org.apache.pdfbox.io.ccitt.TestPackedBitArray;
+import org.apache.pdfbox.pdfparser.EndstreamOutputStreamTest;
import org.apache.pdfbox.pdmodel.TestFDF;
import org.apache.pdfbox.pdmodel.TestPDDocument;
import org.apache.pdfbox.pdmodel.TestPDDocumentCatalog;
@@ -125,6 +126,9 @@ public class TestAll extends TestCase
suite.addTestSuite(TestPDFText2HTML.class);
suite.addTestSuite(PDColorStateTest.class);
+ suite.addTestSuite(EndstreamOutputStreamTest.class);
+
+
return suite;
}
}