You are viewing a plain text version of this content. The canonical link for it is here.
Posted to users@pdfbox.apache.org by Mark Davidson <uk...@googlemail.com> on 2009/11/28 01:02:52 UTC
Updating metadata
Hi there
I've been playing about with Tika and PDFBox for metadata extraction and
update. On running the program below, I get the following output:
Reading source file with Tika...
Title=Title
Format=application/pdf
Reading source file with PDFBox...
Title=Title
Writing target file with PDFBox...
Reading target file with Tika...
Title=null
Format=application/rdf+xml
Reading target file with PDFBox...
Title=Updated Title
Finished
Tika and PDFBox read the title property OK. I then update the title property
using PDFBox. PDFBox reads the updated title property correctly but Tika
doesn't. Also I noticed that the format has changed. How do I set the format
to be application/pdf? Could the format change have affected Tika's ability
to read the title property?
Many thanks
Mark
------------
public class UpdatePdfMetadata {
final String NEW_TITLE = "Updated Title";
final String OLD_TITLE = "Title";
public UpdatePdfMetadata() {}
public void execute(final String sourceFile,
final String targetFile) {
System.out.println("Reading source file with Tika...");
readTitleWithTika(sourceFile, OLD_TITLE);
System.out.println("Reading source file with PDFBox...");
readTitleWithPdfBox(sourceFile, OLD_TITLE);
System.out.println("Writing target file with PDFBox...");
writeTitleWithPdfBox(sourceFile, targetFile);
System.out.println("Reading target file with Tika...");
readTitleWithTika(targetFile, NEW_TITLE);
System.out.println("Reading target file with PDFBox...");
readTitleWithPdfBox(targetFile, NEW_TITLE);
}
private void writeTitleWithPdfBox(final String sourceFile,
final String targetFile) {
PDDocument pdf = null;
PDDocumentInformation info = null;
try {
pdf = PDDocument.load(sourceFile);
info = pdf.getDocumentInformation();
info.setTitle(NEW_TITLE);
pdf.save(targetFile);
} catch (IOException e) {
e.printStackTrace();
} catch (COSVisitorException e) {
e.printStackTrace();
} finally {
if (pdf != null) {
try {
pdf.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
private void readTitleWithPdfBox(final String filename,
final String expectedTitle) {
PDDocument pdf = null;
PDDocumentInformation info = null;
try {
pdf = PDDocument.load(filename);
info = pdf.getDocumentInformation();
System.out.println("Title=" + info.getTitle());
} catch (IOException e) {
e.printStackTrace();
} finally {
if (pdf != null) {
try {
pdf.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
private void readTitleWithTika(final String filename,
final String expectedTitle) {
InputStream stream = null;
final Metadata meta = new Metadata();
final Tika tika = new Tika();
try {
stream = new FileInputStream(filename);
tika.parse(stream, meta);
System.out.println("Title=" + meta.get(Metadata.TITLE));
System.out.println("Format=" + meta.get(Metadata.CONTENT_TYPE));
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (stream != null) {
try {
stream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
public static void main(final String[] args) {
if (args == null || args.length != 2) {
System.out.println("Usage: UpdatePdfMetadata source_file
target_file");
System.exit(1);
} else {
new UpdatePdfMetadata().execute(args[0], args[1]);
System.out.println("Finished");
System.exit(2);
}
}
}