You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/02/12 18:13:35 UTC
[tika] branch branch_1x updated: TIKA-3006 -- fix regression in metadata extraction from PDFs in 1.23.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new e09c4b3 TIKA-3006 -- fix regression in metadata extraction from PDFs in 1.23.
e09c4b3 is described below
commit e09c4b3974837b5c7b627899ca462e1f05336144
Author: tallison <ta...@apache.org>
AuthorDate: Wed Feb 12 13:12:49 2020 -0500
TIKA-3006 -- fix regression in metadata extraction from PDFs in 1.23.
---
.../java/org/apache/tika/parser/pdf/PDFParser.java | 53 ++++++++++++----------
.../org/apache/tika/parser/pdf/PDFParserTest.java | 16 +++++++
2 files changed, 46 insertions(+), 23 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index d2839fa..5591b5f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -61,6 +61,8 @@ import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import static org.apache.tika.parser.pdf.PDMetadataExtractor.addMetadata;
+
/**
* PDF parser.
* <p/>
@@ -228,34 +230,39 @@ public class PDFParser extends AbstractParser implements Initializable {
PDDocumentInformation info = document.getDocumentInformation();
metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
- PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle());
- PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor());
+ addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle());
+ addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor());
//if this wasn't already set by xmp, use doc info
if (metadata.get(TikaCoreProperties.CREATOR) == null) {
- PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor());
+ addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor());
}
if (metadata.get(TikaCoreProperties.TITLE) == null) {
- PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle());
+ addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle());
}
- PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator());
- PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
- PDMetadataExtractor.addMetadata(metadata, Office.KEYWORDS, info.getKeywords());
- PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords());
- PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer());
-
- PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject());
-
- PDMetadataExtractor.addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
- PDMetadataExtractor.addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
- PDMetadataExtractor.addMetadata(metadata, OfficeOpenXMLCore.SUBJECT, info.getSubject());
-
- PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped());
+ addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle());
+ addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor());
+ addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
+ addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator());
+ addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
+ addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords());
+ addMetadata(metadata, "producer", info.getProducer());
+ addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer());
+
+ addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject());
+
+ // TODO: Move to description in Tika 2.0
+ addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
+ addMetadata(metadata, "trapped", info.getTrapped());
+ addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped());
+ // TODO Remove these in Tika 2.0
Calendar created = info.getCreationDate();
- PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_CREATED, created);
- PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.CREATED, created);
+ addMetadata(metadata, DEPRECATED_CREATED, created);
+ addMetadata(metadata, PDF.DOC_INFO_CREATED, created);
+ addMetadata(metadata, TikaCoreProperties.CREATED, created);
Calendar modified = info.getModificationDate();
- PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
- PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, modified);
+ addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
+ addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
+ addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, modified);
// All remaining metadata is custom
// Copy this over as-is
@@ -264,8 +271,8 @@ public class PDFParser extends AbstractParser implements Initializable {
for (COSName key : info.getCOSObject().keySet()) {
String name = key.getName();
if (!handledMetadata.contains(name)) {
- PDMetadataExtractor.addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
- PDMetadataExtractor.addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
+ addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
+ addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
info.getCOSObject().getDictionaryObject(key));
}
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index fa6e962..6fa268d 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -26,6 +26,8 @@ import static org.junit.Assert.fail;
import static org.junit.Assume.assumeTrue;
import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -1506,6 +1508,20 @@ public class PDFParserTest extends TikaTest {
assertEquals(35, max);
}
+ @Test //TIKA-3006
+ public void testBranch1xMetadata() throws Exception {
+ //assert that we don't break legacy metadata keys in branch_1x
+ Metadata m = getXML("testPDF_1x_metadata.pdf").metadata;
+ for (String k : new String[]{
+ "Keywords", "dc:subject",
+ "pdf:docinfo:keywords", "meta:keyword"
+ }) {
+ assertEquals("fails on "+k, "keyword1, keyword2", m.get(k));
+ assertEquals(1, m.getValues(k).length);
+ }
+ assertEquals("2016-07-07T08:37:42Z", m.get("created"));
+
+ }
/**
* Simple class to count end of document events. If functionality is useful,