You are viewing a plain text version of this content. The canonical link for it is not included in this plain-text copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/02/07 17:04:24 UTC

[tika] 01/02: TIKA-2559: Extract language metadata item from PDF files via Matt Sheppard.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 0e5fded007e65dce35dad52b29dc6d4bd8a550cc
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Feb 7 12:00:53 2018 -0500

    TIKA-2559: Extract language metadata item from PDF files via Matt Sheppard.
---
 .../src/main/java/org/apache/tika/parser/pdf/PDFParser.java       | 3 +++
 .../src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java   | 8 ++++++++
 2 files changed, 11 insertions(+)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 303d8e6..4d5202d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -222,6 +222,9 @@ public class PDFParser extends AbstractParser implements Initializable {
         metadata.set(AccessPermissions.CAN_PRINT_DEGRADED,
                 Boolean.toString(ap.canPrintDegraded()));
 
+        if (document.getDocumentCatalog().getLanguage() != null) {
+            metadata.set(TikaCoreProperties.LANGUAGE, document.getDocumentCatalog().getLanguage());
+        }
 
         //now go for the XMP
         Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 52814ac..537a7ff 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1370,6 +1370,14 @@ public class PDFParserTest extends TikaTest {
         assertFalse(path + " should have thrown exception", noEx);
     }
 
+    @Test
+    public void testLanguageMetadata() throws Exception {
+        assertEquals("de-CH", getXML("testPDF-custommetadata.pdf")
+                .metadata.get(TikaCoreProperties.LANGUAGE));
+        assertEquals("zh-CN", getXML("testPDFFileEmbInAnnotation.pdf")
+                .metadata.get(TikaCoreProperties.LANGUAGE));
+    }
+
     /**
      * Simple class to count end of document events.  If functionality is useful,
      * move to org.apache.tika in src/test

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.