You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/02/07 17:04:24 UTC
[tika] 01/02: TIKA-2559: Extract language metadata item from PDF files via Matt Sheppard.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 0e5fded007e65dce35dad52b29dc6d4bd8a550cc
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Feb 7 12:00:53 2018 -0500
TIKA-2559: Extract language metadata item from PDF files via Matt Sheppard.
---
.../src/main/java/org/apache/tika/parser/pdf/PDFParser.java | 3 +++
.../src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java | 8 ++++++++
2 files changed, 11 insertions(+)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 303d8e6..4d5202d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -222,6 +222,9 @@ public class PDFParser extends AbstractParser implements Initializable {
metadata.set(AccessPermissions.CAN_PRINT_DEGRADED,
Boolean.toString(ap.canPrintDegraded()));
+ if (document.getDocumentCatalog().getLanguage() != null) {
+ metadata.set(TikaCoreProperties.LANGUAGE, document.getDocumentCatalog().getLanguage());
+ }
//now go for the XMP
Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 52814ac..537a7ff 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1370,6 +1370,14 @@ public class PDFParserTest extends TikaTest {
assertFalse(path + " should have thrown exception", noEx);
}
+ @Test
+ public void testLanguageMetadata() throws Exception {
+ assertEquals("de-CH", getXML("testPDF-custommetadata.pdf")
+ .metadata.get(TikaCoreProperties.LANGUAGE));
+ assertEquals("zh-CN", getXML("testPDFFileEmbInAnnotation.pdf")
+ .metadata.get(TikaCoreProperties.LANGUAGE));
+ }
+
/**
* Simple class to count end of document events. If functionality is useful,
* move to org.apache.tika in src/test
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.