You are viewing a plain text version of this content. The canonical link for it is here.
Posted to users@pdfbox.apache.org by Xavier Sudan <xs...@gmail.com> on 2009/08/25 09:26:19 UTC
Split a PDF with iText and read with PDFBox

Hello,

I am trying to split a PDF with iText and read its content afterwards. The
process works once (read, split, read), but if I split the result a second
time, PDFBox is unable to read the content (it just reads "/r/n"). The PDF
that PDFBox cannot read is readable with Acrobat (I can copy and paste its
text)...

I would like to be able to split and read a PDF multiple times... I have
written an application that reproduces this problem.

Does anyone have an idea about this problem ?

Thank you :)


-----------------------------------------------------------------------
Code to reproduce the problem :
-----------------------------------------------------------------------

Here is an application that reproduces the problem... I hope that someone can
find its cause. It occurs with original PDFs from different sources (printed
with pdfFactory, OpenOffice, ...); the original PDF is called PDF_A below.

*To use this application, just place a PDF named "PDF_A.pdf" at the base of
the project. It requires the PDFBox and iText libraries.*

------------------------------------------------------------------------
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;

import org.pdfbox.cos.COSDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

import com.lowagie.text.pdf.PdfContentByte;
import com.lowagie.text.pdf.PdfImportedPage;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfWriter;

/**
 * Problem using iText or PDFBox.
 * http://article.gmane.org/gmane.comp.java.lib.itext.general/46543
 */
public class TestReuseSameFile {

    /**
     * @param args
     */
    public static void main(String[] args) {
        System.out
                .println("The problem is that after the creation of two
files,"
                        + " the last is unreadable and I don't know what I
do wrong..."
                        + " For my tests, I take one PDF (PDF_A) from 1
page, and I "
                        + "create the PDF_B. After I take PDF_B to create
PDF_C and "
                        + "there PDF_C is unreadable (with Code using
PDFBox, but I can open"
                        + " and read it by the OS system). Test with IText
2.1.4 and 2.1.7."
                        + " \n !!! USAGE : put the file : PDF_A.pdf at the
base of the project.");
        TestReuseSameFile reuse = new TestReuseSameFile();

        // Put the original PDF at the base of the project.
        String PDF_A = "PDF_A.pdf";
        System.out.println("--------------- " + PDF_A + "\n"
                + reuse.pdftoText(PDF_A));

        // Creation of PDF_B base on PDF_A
        String PDF_B = "PDF_B.pdf";
        reuse.splittPDF(PDF_A, PDF_B);
        System.out.println("--------------- " + PDF_B + "\n"
                + reuse.pdftoText(PDF_B));

        // Creation of PDF_C base on PDF_B
        String PDF_C = "PDF_C.pdf";
        reuse.splittPDF(PDF_B, PDF_C);
        System.out.println("--------------- " + PDF_C + "\n"
                + reuse.pdftoText(PDF_C));

        // There is bug because PDF_C has no content.... :(

    }

    /**
     * Take the file (just the first page) and recreate another (same
content).
     * Test with iText 2.1.4 and 2.1.7.
     *
     * @param fileName Source
     * @param nameOfNewFile Destination
     * @return
     */
    public boolean splittPDF(String fileName, String nameOfNewFile) {

        try {
            PdfReader reader = new PdfReader(fileName);

            com.lowagie.text.Document document = new
com.lowagie.text.Document(
                    reader.getPageSizeWithRotation(1));
            PdfWriter writer;
            try {
                File f = new File(nameOfNewFile);
                if (f.exists()) {
                    System.out.println("The file : " + nameOfNewFile
                            + " already exist. It will be overwrite");
                }
                writer = PdfWriter.getInstance(document, new
FileOutputStream(
                        nameOfNewFile));
            } catch (Exception e) {
                System.out.println(e);
                return false;
            }
            document.open();
            PdfContentByte cb = writer.getDirectContent();
            PdfImportedPage page;
            int rotation;
            int pageNumber = 1;


document.setPageSize(reader.getPageSizeWithRotation(pageNumber));
            document.newPage();
            page = writer.getImportedPage(reader, pageNumber);
            rotation = reader.getPageRotation(pageNumber);
            if (rotation == 90 || rotation == 270) {
                cb.addTemplate(page, 0, -1f, 1f, 0, 0, reader
                        .getPageSizeWithRotation(pageNumber).getHeight());
            } else {
                cb.addTemplate(page, 1f, 0, 0, 1f, 0, 0);
            }

            document.close();

        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }

        return true;
    }

    /**
     * Get content of PDF in String (Use PDFBOX (tested with PDFBox-0.7.3)
     *
     * @param fileName
     *            the PDF File name
     * @return Content of PDF
     */
    public String pdftoText(String fileName) {

        File f = new File(fileName);
        if (!f.isFile()) {
            System.out.println("File " + fileName + " does not exist.");
            return null;
        }
        PDFParser parser;
        PDFTextStripper pdfStripper;
        PDDocument pdDoc = null;
        COSDocument cosDoc = null;

        try {
            parser = new PDFParser(new FileInputStream(f));
        } catch (Exception e) {
            System.out.println("Unable to open PDF Parser.");
            return null;
        }
        try {
            parser.parse();
            cosDoc = parser.getDocument();
            pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            int numberOfPage = pdDoc.getNumberOfPages();
            // Get Content JUST ONE PAGE FOR THE EXEMPLE
            for (int i = 1; i <= numberOfPage; i++) {
                pdfStripper.setStartPage(i);
                pdfStripper.setEndPage(i);
                String out = pdfStripper.getText(pdDoc);
                return out;
            }
        } catch (Exception e) {
            System.err
                    .println("An exception occured in parsing the PDF
Document.");
            e.printStackTrace();
        } finally {
            try {
                if (cosDoc != null)
                    cosDoc.close();

                if (pdDoc != null)
                    pdDoc.close();
            } catch (Exception e1) {
                e1.printStackTrace();
            }
        }
        return null;
    }
}