You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2014/02/25 20:44:10 UTC

svn commit: r1571801 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox: filter/LZWDictionary.java filter/LZWFilter.java filter/LZWNode.java io/NBitInputStream.java io/NBitOutputStream.java

Author: tilman
Date: Tue Feb 25 19:44:10 2014
New Revision: 1571801

URL: http://svn.apache.org/r1571801
Log:
PDFBOX-1147: rewrote LZW filter after failure to find bug; PDFBOX-205: catch EOF if EOD marker is missing; delete files that are no longer needed.

Removed:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWDictionary.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWNode.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/io/NBitInputStream.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/io/NBitOutputStream.java
Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWFilter.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWFilter.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWFilter.java?rev=1571801&r1=1571800&r2=1571801&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWFilter.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWFilter.java Tue Feb 25 19:44:10 2014
@@ -1,10 +1,9 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
@@ -16,183 +15,259 @@
  */
 package org.apache.pdfbox.filter;
 
-import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
-import java.io.PushbackInputStream;
-import java.io.StreamCorruptedException;
-
+import java.util.ArrayList;
+import java.util.Arrays;
+import javax.imageio.stream.MemoryCacheImageInputStream;
+import javax.imageio.stream.MemoryCacheImageOutputStream;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.pdfbox.cos.COSDictionary;
 
-import org.apache.pdfbox.io.NBitInputStream;
-import org.apache.pdfbox.io.NBitOutputStream;
-
 /**
- * This is the used for the LZWDecode filter.
  *
- * @author Ben Litchfield
+ * This is the implementation of the LZWDecode filter.
+ *
+ * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
+ * @author Tilman Hausherr
  */
-final class LZWFilter extends Filter
+public class LZWFilter extends Filter
 {
+    /**
+     * Log instance.
+     */
+    private static final Log LOG = LogFactory.getLog(LZWFilter.class);
+
+    /**
+     * The LZW clear table code.
+     */
     public static final long CLEAR_TABLE = 256;
+
+    /**
+     * The LZW end of data code.
+     */
     public static final long EOD = 257;
 
+    /**
+     * The LZW code table.
+     */
+    private ArrayList<byte[]> codeTable = null;
+
+    /**
+     * {@inheritDoc}
+     */
     @Override
     protected final DecodeResult decode(InputStream encoded, OutputStream decoded,
-                                         COSDictionary parameters) throws IOException
+            COSDictionary parameters) throws IOException
     {
-        //log.debug("decode()");
-        NBitInputStream in = new NBitInputStream(encoded);
-        in.setBitsInChunk(9);
-        LZWDictionary dic = new LZWDictionary();
-        byte firstByte = 0;
-        long nextCommand;
-        while ((nextCommand = in.read()) != EOD)
-        {
-            // log.debug("decode - nextCommand=" + nextCommand + ", bitsInChunk: " + in.getBitsInChunk());
+        codeTable = null;
+        int chunk = 9;
+        MemoryCacheImageInputStream in = new MemoryCacheImageInputStream(encoded);
+        long nextCommand = 0;
+        long prevCommand = -1;
 
-            if (nextCommand == CLEAR_TABLE)
-            {
-                in.setBitsInChunk(9);
-                dic = new LZWDictionary();
-            }
-            else
+        try
+        {
+            while ((nextCommand = in.readBits(chunk)) != EOD)
             {
-                byte[] data = dic.getData(nextCommand);
-                if (data == null)
-                {
-                    dic.visit(firstByte);
-                    data = dic.getData(nextCommand);
-                    dic.clear();
-                }
-                if (data == null)
-                {
-                    throw new StreamCorruptedException("Error: data is null");
-                }
-                dic.visit(data);
-
-                //log.debug("decode - dic.getNextCode(): " + dic.getNextCode());
-
-                if (dic.getNextCode() >= 2047)
+                if (nextCommand == CLEAR_TABLE)
                 {
-                    in.setBitsInChunk(12);
-                }
-                else if (dic.getNextCode() >= 1023)
-                {
-                    in.setBitsInChunk(11);
-                }
-                else if (dic.getNextCode() >= 511)
-                {
-                    in.setBitsInChunk(10);
+                    chunk = 9;
+                    initCodeTable();
+                    prevCommand = -1;
                 }
                 else
                 {
-                    in.setBitsInChunk(9);
+                    if (nextCommand < codeTable.size())
+                    {
+                        byte[] data = codeTable.get((int) nextCommand);
+                        byte firstByte = data[0];
+                        decoded.write(data);
+                        if (prevCommand != -1)
+                        {
+                            data = codeTable.get((int) prevCommand);
+                            byte[] newData = Arrays.copyOf(data, data.length + 1);
+                            newData[data.length] = firstByte;
+                            codeTable.add(newData);
+                        }
+                    }
+                    else
+                    {
+                        byte[] data = codeTable.get((int) prevCommand);
+                        byte[] newData = Arrays.copyOf(data, data.length + 1);
+                        newData[data.length] = data[0];
+                        decoded.write(newData);
+                        codeTable.add(newData);
+                    }
+                    if (codeTable.size() >= 2047)
+                    {
+                        chunk = 12;
+                    }
+                    else if (codeTable.size() >= 1023)
+                    {
+                        chunk = 11;
+                    }
+                    else if (codeTable.size() >= 511)
+                    {
+                        chunk = 10;
+                    }
+                    else
+                    {
+                        chunk = 9;
+                    }
+                    prevCommand = nextCommand;
                 }
-                /**
-                if (in.getBitsInChunk() != dic.getCodeSize())
-                {
-                    in.unread(nextCommand);
-                    in.setBitsInChunk(dic.getCodeSize());
-                    System.out.print("Switching " + nextCommand + " to ");
-                    nextCommand = in.read();
-                    System.out.println("" +  nextCommand);
-                    data = dic.getData(nextCommand);
-                }**/
-                firstByte = data[0];
-                decoded.write(data);
             }
         }
+        catch (EOFException ex)
+        {
+            LOG.warn("Premature EOF in LZW stream, EOD code missing");
+        }
         decoded.flush();
         return new DecodeResult(parameters);
     }
 
+    /**
+     * {@inheritDoc}
+     */
     @Override
     protected final void encode(InputStream rawData, OutputStream encoded, COSDictionary parameters)
             throws IOException
     {
-        //log.debug("encode()");
-        PushbackInputStream input = new PushbackInputStream(rawData, 4096);
-        LZWDictionary dic = new LZWDictionary();
-        NBitOutputStream out = new NBitOutputStream(encoded);
-        out.setBitsInChunk(9); //initially nine
-        out.write(CLEAR_TABLE);
-        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
-        int byteRead = 0;
-        for (int i = 0; (byteRead = input.read()) != -1; i++)
-        {
-            //log.debug("byteRead = '" + (char)byteRead + "' (0x" + Integer.toHexString(byteRead) + "), i=" + i);
-            buffer.write(byteRead);
-            dic.visit((byte)byteRead);
-            out.setBitsInChunk(dic.getCodeSize());
-
-            //log.debug("Getting node '" + new String(buffer.toByteArray()) + "', buffer.size = " + buffer.size());
-            LZWNode node = dic.getNode(buffer.toByteArray());
-            int nextByte = input.read();
-            if (nextByte != -1)
-            {
-                //log.debug("nextByte = '" + (char)nextByte + "' (0x" + Integer.toHexString(nextByte) + ")");
-                LZWNode next = node.getNode((byte)nextByte);
-                if (next == null)
-                {
-                    //log.debug("encode - No next node, writing node and resetting buffer (" +
-                    //          " node.getCode: " + node.getCode() + ")" +
-                    //          " bitsInChunk: " + out.getBitsInChunk() +
-                    //          ")");
-                    out.write(node.getCode());
-                    buffer.reset();
-                }
+        initCodeTable();
+        int chunk = 9;
 
-                input.unread(nextByte);
+        byte[] inputPattern = null;
+        MemoryCacheImageOutputStream out = new MemoryCacheImageOutputStream(encoded);
+        out.writeBits(CLEAR_TABLE, chunk);
+        int foundCode = -1;
+        int r;
+        while ((r = rawData.read()) != -1)
+        {
+            byte by = (byte) r;
+            if (inputPattern == null)
+            {
+                inputPattern = new byte[]
+                {
+                    by
+                };
+                foundCode = by & 0xff;
             }
             else
             {
-                //log.debug("encode - EOF on lookahead: writing node, resetting buffer, and terminating read loop (" +
-                //          " node.getCode: " + node.getCode() + ")" +
-                //          " bitsInChunk: " + out.getBitsInChunk() +
-                //          ")");
-                out.write(node.getCode());
-                buffer.reset();
-                break;
+                inputPattern = Arrays.copyOf(inputPattern, inputPattern.length + 1);
+                inputPattern[inputPattern.length - 1] = by;
+                int newFoundCode = findPatternCode(codeTable, inputPattern);
+                if (newFoundCode == -1)
+                {
+                    // use previous
+                    out.writeBits(foundCode, chunk);
+                    // create new table entry
+                    codeTable.add(inputPattern);
+
+                    if (codeTable.size() == 4096)
+                    {
+                        // code table is full
+                        out.writeBits(CLEAR_TABLE, chunk);
+                        chunk = 9;
+                        initCodeTable();
+                    }
+
+                    inputPattern = new byte[]
+                    {
+                        by
+                    };
+                    foundCode = by & 0xff;
+                }
+                else
+                {
+                    foundCode = newFoundCode;
+                }
             }
-
-            if (dic.getNextCode() == 4096)
+            if (codeTable.size() - 1 >= 2047)
             {
-                //log.debug("encode - Clearing dictionary and unreading pending buffer data (" +
-                //          " bitsInChunk: " + out.getBitsInChunk() +
-                //          ")");
-                out.write(CLEAR_TABLE);
-                dic = new LZWDictionary();
-                input.unread(buffer.toByteArray());
-                buffer.reset();
+                chunk = 12;
+            }
+            else if (codeTable.size() - 1 >= 1023)
+            {
+                chunk = 11;
+            }
+            else if (codeTable.size() - 1 >= 511)
+            {
+                chunk = 10;
+            }
+            else
+            {
+                chunk = 9;
             }
         }
-
-        // Fix the code size based on the fact that we are writing the EOD
-        //
-        if (dic.getNextCode() >= 2047)
-        {
-            out.setBitsInChunk(12);
-        }
-        else if (dic.getNextCode() >= 1023)
+        if (foundCode != -1)
         {
-            out.setBitsInChunk(11);
+            out.writeBits(foundCode, chunk);
         }
-        else if (dic.getNextCode() >= 511)
+        out.writeBits(EOD, chunk);
+        out.writeBits(0, 7);
+        out.flush(); // must do or file will be empty :-(
+        codeTable.clear();
+    }
+
+    /**
+     * Find the code table entry that exactly equals the given pattern.
+     *
+     * @param codeTable The LZW code table.
+     * @param pattern The pattern to be searched for.
+     * @return The index of the entry equal to the pattern, or -1 if nothing is
+     * found.
+     */
+    private int findPatternCode(ArrayList<byte[]> codeTable, byte[] pattern)
+    {
+        int foundCode = -1;
+        int foundLen = 0;
+        for (int i = codeTable.size() - 1; i >= 0; --i)
         {
-            out.setBitsInChunk(10);
+            if (i <= EOD)
+            {
+                // we're in the single byte area
+                if (foundCode != -1)
+                {
+                    return foundCode; // we already found pattern with size > 1
+                }
+                else if (pattern.length > 1)
+                {
+                    return -1; // we won't find anything here anyway
+                }
+            }
+            byte[] tryPattern = codeTable.get(i);
+            if (foundCode != -1 || tryPattern.length > foundLen)
+            {
+                if (Arrays.equals(tryPattern, pattern))
+                {
+                    foundCode = i;
+                    foundLen = tryPattern.length;
+                }
+            }
         }
-        else
+        return foundCode;
+    }
+
+    /**
+     * Init the code table with 1 byte entries and the EOD and CLEAR_TABLE
+     * markers.
+     */
+    private void initCodeTable()
+    {
+        codeTable = new ArrayList<byte[]>(4096);
+        for (int i = 0; i < 256; ++i)
         {
-            out.setBitsInChunk(9);
+            codeTable.add(new byte[]
+            {
+                (byte) (i & 0xFF)
+            });
         }
-
-        //log.debug("encode - Writing EOD (" +
-        //          " bitsInChunk: " + out.getBitsInChunk() +
-        //          ")");
-        out.write(EOD);
-        out.close();
-        encoded.flush();
+        codeTable.add(null); // 256 CLEAR_TABLE
+        codeTable.add(null); // 257 EOD
     }
 }