You are viewing a plain text version of this content. The canonical link for it is here.
Posted to users@pdfbox.apache.org by Mark Davidson <uk...@googlemail.com> on 2009/11/28 01:02:52 UTC

Updating metadata

Hi there

I've been experimenting with Tika and PDFBox for metadata extraction and
update. When I run the program below, I get the following output:

Reading source file with Tika...
Title=Title
Format=application/pdf
Reading source file with PDFBox...
Title=Title
Writing target file with PDFBox...
Reading target file with Tika...
Title=null
Format=application/rdf+xml
Reading target file with PDFBox...
Title=Updated Title
Finished

Tika and PDFBox both read the title property correctly from the source file.
I then update the title property using PDFBox. PDFBox reads the updated title
property correctly, but Tika doesn't. I also noticed that the reported format
has changed. How do I set the format to be application/pdf? Could the format
change have affected Tika's ability to read the title property?

Many thanks

Mark

------------

public class UpdatePdfMetadata {

    final String NEW_TITLE = "Updated Title";
    final String OLD_TITLE = "Title";

    public UpdatePdfMetadata() {}

    public void execute(final String sourceFile,
                        final String targetFile) {

        System.out.println("Reading source file with Tika...");
        readTitleWithTika(sourceFile, OLD_TITLE);

        System.out.println("Reading source file with PDFBox...");
        readTitleWithPdfBox(sourceFile, OLD_TITLE);

        System.out.println("Writing target file with PDFBox...");
        writeTitleWithPdfBox(sourceFile, targetFile);

        System.out.println("Reading target file with Tika...");
        readTitleWithTika(targetFile, NEW_TITLE);

        System.out.println("Reading target file with PDFBox...");
        readTitleWithPdfBox(targetFile, NEW_TITLE);
    }

    private void writeTitleWithPdfBox(final String sourceFile,
                                      final String targetFile) {
        PDDocument pdf = null;
        PDDocumentInformation info = null;

        try {
            pdf = PDDocument.load(sourceFile);
            info = pdf.getDocumentInformation();
            info.setTitle(NEW_TITLE);
            pdf.save(targetFile);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (COSVisitorException e) {
            e.printStackTrace();
        } finally {
            if (pdf != null) {
                try {
                    pdf.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

    }

    private void readTitleWithPdfBox(final String filename,
                                     final String expectedTitle) {
        PDDocument pdf = null;
        PDDocumentInformation info = null;
        try {
            pdf = PDDocument.load(filename);
            info = pdf.getDocumentInformation();
            System.out.println("Title=" + info.getTitle());
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (pdf != null) {
                try {
                    pdf.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    private void readTitleWithTika(final String filename,
                                   final String expectedTitle) {
        InputStream stream = null;
        final Metadata meta = new Metadata();
        final Tika tika = new Tika();

        try {
            stream = new FileInputStream(filename);
            tika.parse(stream, meta);
            System.out.println("Title=" + meta.get(Metadata.TITLE));
            System.out.println("Format=" + meta.get(Metadata.CONTENT_TYPE));
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    public static void main(final String[] args) {
        if (args == null || args.length != 2) {
            System.out.println("Usage: UpdatePdfMetadata source_file
target_file");
            System.exit(1);
        } else {
            new UpdatePdfMetadata().execute(args[0], args[1]);
            System.out.println("Finished");
            System.exit(2);
        }
    }

}