You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by tb...@apache.org on 2012/04/06 16:25:10 UTC

svn commit: r1310338 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/io: PushBackInputStream.java RandomAccessBufferedFileInputStream.java

Author: tboehme
Date: Fri Apr  6 14:25:10 2012
New Revision: 1310338

URL: http://svn.apache.org/viewvc?rev=1310338&view=rev
Log:
As announced on PDFBOX-1199, this adds a new input stream class with buffering and seek functionality;
PushBackInputStream is extended to support the seek operation if the underlying stream implements RandomAccessRead.

Added:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/io/RandomAccessBufferedFileInputStream.java   (with props)
Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/io/PushBackInputStream.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/io/PushBackInputStream.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/io/PushBackInputStream.java?rev=1310338&r1=1310337&r2=1310338&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/io/PushBackInputStream.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/io/PushBackInputStream.java Fri Apr  6 14:25:10 2012
@@ -33,6 +33,10 @@ public class PushBackInputStream extends
      */
     private long offset = 0;
     
+    /** In case provided input stream implements {@link RandomAccessRead} we hold
+     *  a typed reference to it in order to support seek operations. */
+    private final RandomAccessRead raInput;
+    
     /**
      * Constructor.
      *
@@ -48,6 +52,9 @@ public class PushBackInputStream extends
         {
             throw new IOException( "Error: input was null" );
         }
+        
+        raInput = ( input instanceof RandomAccessRead ) ?
+										(RandomAccessRead) input : null;
     }
 
     /**
@@ -198,4 +205,33 @@ public class PushBackInputStream extends
         return data;
     }
 
+    /**
+     * Repositions the stream to the given offset. This only works if the
+     * underlying stream implements {@link RandomAccessRead}.
+     *
+     * All bytes still held in the pushback buffer are skipped over first.
+     *
+     * @param newOffset  new position within stream from which to read next
+     *
+     * @throws IOException if underlying stream is not a {@link RandomAccessRead} or seeking failed
+     */
+    public void seek( long newOffset ) throws IOException
+    {
+        if ( raInput == null )
+        {
+            final String streamType = in.getClass().getSimpleName();
+            throw new IOException( "Provided stream of type " + streamType + " is not seekable." );
+        }
+
+        // clear the unread buffer by skipping over all of its bytes
+        final int pendingBytes = buf.length - pos;
+        if ( pendingBytes > 0 )
+        {
+            skip( pendingBytes );
+        }
+
+        raInput.seek( newOffset );
+        offset = newOffset;
+    }
+
 }

Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/io/RandomAccessBufferedFileInputStream.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/io/RandomAccessBufferedFileInputStream.java?rev=1310338&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/io/RandomAccessBufferedFileInputStream.java (added)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/io/RandomAccessBufferedFileInputStream.java Fri Apr  6 14:25:10 2012
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.io;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.RandomAccessFile;
+import java.util.LinkedHashMap;
+
+/**
+ * Provides {@link InputStream} access to a file combined with page-wise
+ * buffering of its content. The position of the next byte to read can be set
+ * via {@link #seek(long)}.
+ *
+ * The file is accessed via {@link RandomAccessFile} and is read in fixed-size
+ * pages which are kept in an LRU cache. No synchronization is performed.
+ *
+ * @author Timo Boehme (timo.boehme at ontochem com)
+ */
+public class RandomAccessBufferedFileInputStream extends InputStream implements RandomAccessRead
+{
+
+    private int  pageSizeShift  = 12;
+    private int  pageSize       = 1 << pageSizeShift;
+    private long pageOffsetMask = -1L << pageSizeShift;
+    private int  maxCachedPages = 1000;
+
+    /** Buffer of the page last evicted from the cache; reused by {@link #readPage()}. */
+    private byte[] lastRemovedCachePage = null;
+
+    /** LRU (access-ordered) page cache, keyed by the file offset of the page start. */
+    private final LinkedHashMap<Long,byte[]> pageCache = new LinkedHashMap<Long, byte[]>( maxCachedPages, 0.75f, true )
+    {
+        private static final long serialVersionUID = -6302488539257741101L;
+
+        @Override
+        protected boolean removeEldestEntry( java.util.Map.Entry<Long, byte[]> _eldest )
+        {
+            final boolean doRemove = size() > maxCachedPages;
+            if ( doRemove )
+            {
+                lastRemovedCachePage = _eldest.getValue();
+            }
+            return doRemove;
+        }
+    };
+
+    private long   curPageOffset    = -1;
+    private byte[] curPage          = new byte[ pageSize ];
+    private int    offsetWithinPage = 0;
+
+    private final RandomAccessFile raFile;
+    private final long             fileLength;
+    private long                   fileOffset = 0;
+
+    /**
+     * Creates an input stream instance for the given file, positioned at offset 0.
+     *
+     * @param _file  file to read from
+     *
+     * @throws FileNotFoundException if the file cannot be opened for reading
+     * @throws IOException if the first page cannot be read; the file is closed again then
+     */
+    public RandomAccessBufferedFileInputStream( File _file )
+    throws FileNotFoundException, IOException
+    {
+        raFile     = new RandomAccessFile( _file, "r" );
+        fileLength = _file.length();
+
+        try
+        {
+            seek( 0 );
+        }
+        catch ( IOException e )
+        {
+            // do not leak the open file handle when initialization fails
+            try
+            {
+                raFile.close();
+            }
+            catch ( IOException ignored )
+            {
+                // the initial failure is the one worth reporting
+            }
+            throw e;
+        }
+    }
+
+    /** Returns offset in file at which next byte would be read. */
+    public final long getFilePointer()
+    {
+        return fileOffset;
+    }
+
+    /**
+     * Seeks to new position. If new position is outside of current page the new
+     * page is either taken from cache or read from file and added to cache.
+     *
+     * @param newOffset  file offset from which to read next
+     *
+     * @throws IOException if positioning within or reading from the file fails
+     */
+    public final void seek( final long newOffset ) throws IOException
+    {
+        final long newPageOffset = newOffset & pageOffsetMask;
+        if ( newPageOffset != curPageOffset )
+        {
+            byte[] newPage = pageCache.get( newPageOffset );
+            if ( newPage == null )
+            {
+                raFile.seek( newPageOffset );
+                newPage = readPage();
+                pageCache.put( newPageOffset, newPage );
+            }
+            curPageOffset = newPageOffset;
+            curPage       = newPage;
+        }
+
+        offsetWithinPage = (int) (newOffset - curPageOffset);
+        fileOffset       = newOffset;
+    }
+
+    /**
+     * Reads a page with data from current file position. If we have a previously
+     * removed page from cache the buffer of this page is reused. Otherwise a new
+     * byte buffer is created.
+     */
+    private final byte[] readPage() throws IOException
+    {
+        byte[] page;
+
+        if ( lastRemovedCachePage != null )
+        {
+            page = lastRemovedCachePage;
+            lastRemovedCachePage = null;
+        }
+        else
+        {
+            page = new byte[ pageSize ];
+        }
+
+        int readBytes = 0;
+        while ( readBytes < pageSize )
+        {
+            int curBytesRead = raFile.read( page, readBytes, pageSize - readBytes );
+            if ( curBytesRead < 0 )
+            {
+                // EOF
+                break;
+            }
+            readBytes += curBytesRead;
+        }
+
+        return page;
+    }
+
+    /** Reads the next byte; returns -1 if the end of the file has been reached. */
+    @Override
+    public int read() throws IOException
+    {
+        if ( fileOffset >= fileLength )
+        {
+            return -1;
+        }
+
+        if ( offsetWithinPage == pageSize )
+        {
+            seek( fileOffset );
+        }
+
+        fileOffset++;
+        return curPage[ offsetWithinPage++ ] & 0xff;
+    }
+
+    /**
+     * Reads up to len bytes, but never beyond the end of the current page or
+     * of the file. Returns the number of bytes read, 0 if len is 0, or -1 at EOF.
+     */
+    @Override
+    public int read( byte[] b, int off, int len ) throws IOException
+    {
+        if ( len == 0 )
+        {
+            // InputStream contract: a zero-length request never signals EOF
+            return 0;
+        }
+        if ( fileOffset >= fileLength )
+        {
+            return -1;
+        }
+
+        if ( offsetWithinPage == pageSize )
+        {
+            seek( fileOffset );
+        }
+
+        int commonLen = Math.min( pageSize - offsetWithinPage, len );
+        if ( ( fileLength - fileOffset ) < pageSize )
+        {
+            commonLen = Math.min( commonLen, (int) ( fileLength - fileOffset ) );
+        }
+
+        System.arraycopy( curPage, offsetWithinPage, b, off, commonLen );
+
+        offsetWithinPage += commonLen;
+        fileOffset       += commonLen;
+
+        return commonLen;
+    }
+
+    /** Returns the number of bytes between the current position and the end of the file. */
+    @Override
+    public int available() throws IOException
+    {
+        // the position may have been moved past EOF by seek(); never report a negative count
+        final long remaining = Math.max( 0L, fileLength - fileOffset );
+        return (int) Math.min( remaining, Integer.MAX_VALUE );
+    }
+
+    /**
+     * Skips over up to n bytes, stopping at the end of the file.
+     *
+     * @param n  number of bytes to skip; nothing is skipped if it is not positive
+     *
+     * @return number of bytes actually skipped, never negative
+     */
+    @Override
+    public long skip( long n ) throws IOException
+    {
+        // test if we have to reduce skip count because of EOF
+        final long toSkip = Math.min( n, fileLength - fileOffset );
+        if ( toSkip <= 0 )
+        {
+            // negative request or position at/after EOF: moving backwards here
+            // would push offsetWithinPage below zero and break later reads
+            return 0;
+        }
+
+        if ( ( toSkip < pageSize ) && ( ( offsetWithinPage + toSkip ) <= pageSize ) )
+        {
+            // we can skip within current page
+            offsetWithinPage += toSkip;
+            fileOffset       += toSkip;
+        }
+        else
+        {
+            // seek to the page we will get after skipping
+            seek( fileOffset + toSkip );
+        }
+
+        return toSkip;
+    }
+
+    /** Returns the length of the file as determined when this stream was created. */
+    @Override
+    public long length() throws IOException
+    {
+        return fileLength;
+    }
+
+    /** Closes the underlying file and drops all cached pages. */
+    @Override
+    public void close() throws IOException
+    {
+        raFile.close();
+        pageCache.clear();
+    }
+}

Propchange: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/io/RandomAccessBufferedFileInputStream.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/io/RandomAccessBufferedFileInputStream.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain