You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/03/07 02:27:41 UTC

svn commit: r1575112 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java

Author: tallison
Date: Fri Mar  7 01:27:41 2014
New Revision: 1575112

URL: http://svn.apache.org/r1575112
Log:
TIKA-1252 small clean up

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1575112&r1=1575111&r2=1575112&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Fri Mar  7 01:27:41 2014
@@ -177,12 +177,14 @@ public class PDFParser extends AbstractP
         }
         PDDocumentInformation info = document.getDocumentInformation();
         metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
-        extractDublinCoreListItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
+        extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), (XMPSchema)dcSchema);
         extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
         extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
         addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
         addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
         addMetadata(metadata, "producer", info.getProducer());
+        extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, (XMPSchema)dcSchema);
+
         // TODO: Move to description in Tika 2.0
         addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
         addMetadata(metadata, "trapped", info.getTrapped());
@@ -274,6 +276,37 @@ public class PDFParser extends AbstractP
         }
     }
 
+    /**
+     * Try to extract all multilingual items from the XMPSchema.
+     * Non-empty XMP values that differ from the baseline are added first; the
+     * non-empty baseline is added last.
+     * <p>
+     * This relies on the property having a valid xmp getName()
+     * @param metadata metadata to add the values to
+     * @param property property to populate; its getName() is the XMP lookup key
+     * @param pdfBoxBaseline value from PDFBox's document info; may be null or empty
+     * @param schema XMP schema to read from; if null only the baseline is added
+     */
+    private void extractMultilingualItems(Metadata metadata, Property property,
+            String pdfBoxBaseline, XMPSchema schema) {
+        boolean hasBaseline = pdfBoxBaseline != null && pdfBoxBaseline.length() > 0;
+        if (schema != null) {
+            for (String lang : schema.getLanguagePropertyLanguages(property.getName())) {
+                String value = schema.getLanguageProperty(property.getName(), lang);
+                // A null baseline (e.g. DESCRIPTION) must not suppress the XMP values;
+                // only skip a value that duplicates the baseline added below.
+                if (value != null && value.length() > 0 && !value.equals(pdfBoxBaseline)) {
+                    metadata.add(property, value);
+                }
+            }
+        }
+        // The PDFBox value always goes last, after any distinct XMP variants.
+        if (hasBaseline) {
+            metadata.add(property, pdfBoxBaseline);
+        }
+    }
+
+
     /**
      * This tries to read a list from a particular property in
      * XMPSchemaDublinCore.
@@ -285,6 +318,8 @@ public class PDFParser extends AbstractP
      * <p>
      * Until PDFBOX-1803/TIKA-1233 are fixed, do not call this
      * on dates!
+     * <p>
+     * This relies on the property having a DublinCore compliant getName()
      * 
      * @param property
      * @param pdfBoxBaseline