You are viewing a plain text version of this content. The canonical link for it is here.
Posted to users@pdfbox.apache.org by Xavier Sudan <xs...@gmail.com> on 2009/08/25 09:26:19 UTC
Split a PDF with iText and read it with PDFBox
Hello,
I am trying to split a PDF with iText and read its content afterwards. I can do
this process once (read, split, read), but if I split a second time,
PDFBox is unable to read the content (it just reads "\r\n"). The PDF that
PDFBox cannot read is readable with Acrobat (I can copy and paste from it)...
I would like to be able to split and read a PDF multiple times... I have written
an application that reproduces this problem.
Does anyone have an idea about this problem ?
Thank you :)
-----------------------------------------------------------------------
Code to reproduce the problem :
-----------------------------------------------------------------------
Here is an application that reproduces the problem... I hope that someone can
find the source of the problem. It happens with original PDFs from different
sources (printed with pdfFactory, OpenOffice, ...); the original PDF is called
PDF_A below.
*To use this application, just place a PDF named "PDF_A.pdf" at the base of
the project. The PDFBox and iText libraries are required.*
------------------------------------------------------------------------
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
import com.lowagie.text.pdf.PdfContentByte;
import com.lowagie.text.pdf.PdfImportedPage;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfWriter;
/**
* Problem using iText or PDFBox.
* http://article.gmane.org/gmane.comp.java.lib.itext.general/46543
*/
public class TestReuseSameFile {
/**
* @param args
*/
public static void main(String[] args) {
System.out
.println("The problem is that after the creation of two
files,"
+ " the last is unreadable and I don't know what I
do wrong..."
+ " For my tests, I take one PDF (PDF_A) from 1
page, and I "
+ "create the PDF_B. After I take PDF_B to create
PDF_C and "
+ "there PDF_C is unreadable (with Code using
PDFBox, but I can open"
+ " and read it by the OS system). Test with IText
2.1.4 and 2.1.7."
+ " \n !!! USAGE : put the file : PDF_A.pdf at the
base of the project.");
TestReuseSameFile reuse = new TestReuseSameFile();
// Put the original PDF at the base of the project.
String PDF_A = "PDF_A.pdf";
System.out.println("--------------- " + PDF_A + "\n"
+ reuse.pdftoText(PDF_A));
// Creation of PDF_B base on PDF_A
String PDF_B = "PDF_B.pdf";
reuse.splittPDF(PDF_A, PDF_B);
System.out.println("--------------- " + PDF_B + "\n"
+ reuse.pdftoText(PDF_B));
// Creation of PDF_C base on PDF_B
String PDF_C = "PDF_C.pdf";
reuse.splittPDF(PDF_B, PDF_C);
System.out.println("--------------- " + PDF_C + "\n"
+ reuse.pdftoText(PDF_C));
// There is bug because PDF_C has no content.... :(
}
/**
* Take the file (just the first page) and recreate another (same
content).
* Test with iText 2.1.4 and 2.1.7.
*
* @param fileName Source
* @param nameOfNewFile Destination
* @return
*/
public boolean splittPDF(String fileName, String nameOfNewFile) {
try {
PdfReader reader = new PdfReader(fileName);
com.lowagie.text.Document document = new
com.lowagie.text.Document(
reader.getPageSizeWithRotation(1));
PdfWriter writer;
try {
File f = new File(nameOfNewFile);
if (f.exists()) {
System.out.println("The file : " + nameOfNewFile
+ " already exist. It will be overwrite");
}
writer = PdfWriter.getInstance(document, new
FileOutputStream(
nameOfNewFile));
} catch (Exception e) {
System.out.println(e);
return false;
}
document.open();
PdfContentByte cb = writer.getDirectContent();
PdfImportedPage page;
int rotation;
int pageNumber = 1;
document.setPageSize(reader.getPageSizeWithRotation(pageNumber));
document.newPage();
page = writer.getImportedPage(reader, pageNumber);
rotation = reader.getPageRotation(pageNumber);
if (rotation == 90 || rotation == 270) {
cb.addTemplate(page, 0, -1f, 1f, 0, 0, reader
.getPageSizeWithRotation(pageNumber).getHeight());
} else {
cb.addTemplate(page, 1f, 0, 0, 1f, 0, 0);
}
document.close();
} catch (Exception e) {
e.printStackTrace();
return false;
}
return true;
}
/**
* Get content of PDF in String (Use PDFBOX (tested with PDFBox-0.7.3)
*
* @param fileName
* the PDF File name
* @return Content of PDF
*/
public String pdftoText(String fileName) {
File f = new File(fileName);
if (!f.isFile()) {
System.out.println("File " + fileName + " does not exist.");
return null;
}
PDFParser parser;
PDFTextStripper pdfStripper;
PDDocument pdDoc = null;
COSDocument cosDoc = null;
try {
parser = new PDFParser(new FileInputStream(f));
} catch (Exception e) {
System.out.println("Unable to open PDF Parser.");
return null;
}
try {
parser.parse();
cosDoc = parser.getDocument();
pdfStripper = new PDFTextStripper();
pdDoc = new PDDocument(cosDoc);
int numberOfPage = pdDoc.getNumberOfPages();
// Get Content JUST ONE PAGE FOR THE EXEMPLE
for (int i = 1; i <= numberOfPage; i++) {
pdfStripper.setStartPage(i);
pdfStripper.setEndPage(i);
String out = pdfStripper.getText(pdDoc);
return out;
}
} catch (Exception e) {
System.err
.println("An exception occured in parsing the PDF
Document.");
e.printStackTrace();
} finally {
try {
if (cosDoc != null)
cosDoc.close();
if (pdDoc != null)
pdDoc.close();
} catch (Exception e1) {
e1.printStackTrace();
}
}
return null;
}
}