You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/03/07 02:27:41 UTC
svn commit: r1575112 -
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Author: tallison
Date: Fri Mar 7 01:27:41 2014
New Revision: 1575112
URL: http://svn.apache.org/r1575112
Log:
TIKA-1252 small clean up
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1575112&r1=1575111&r2=1575112&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Fri Mar 7 01:27:41 2014
@@ -177,12 +177,14 @@ public class PDFParser extends AbstractP
}
PDDocumentInformation info = document.getDocumentInformation();
metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
- extractDublinCoreListItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
+ extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), (XMPSchema)dcSchema);
extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
addMetadata(metadata, "producer", info.getProducer());
+ extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, (XMPSchema)dcSchema);
+
// TODO: Move to description in Tika 2.0
addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
addMetadata(metadata, "trapped", info.getTrapped());
@@ -274,6 +276,37 @@ public class PDFParser extends AbstractP
}
}
+    /**
+     * Adds every non-empty language variant of an XMP property to the metadata,
+     * then adds the PDFBox baseline value when that is non-empty.
+     * <p>
+     * A language variant that equals the baseline is skipped inside the loop so
+     * that the same string is not added twice. A null baseline does not
+     * suppress the XMP values: callers that have no PDFBox-side value (and so
+     * pass null) still get everything the schema holds.
+     * <p>
+     * This relies on the property having a valid xmp getName()
+     *
+     * @param metadata metadata object the values are added to
+     * @param property property to add under; its getName() is used as the
+     *                 XMP property name for the schema lookups
+     * @param pdfBoxBaseline value obtained from PDFBox; may be null or empty,
+     *                       in which case no baseline value is added
+     * @param schema XMP schema to read language variants from; may be null,
+     *               in which case only the baseline (if any) is added
+     */
+    private void extractMultilingualItems(Metadata metadata, Property property,
+            String pdfBoxBaseline, XMPSchema schema) {
+        if (schema != null) {
+            String name = property.getName();
+            for (String lang : schema.getLanguagePropertyLanguages(name)) {
+                String value = schema.getLanguageProperty(name, lang);
+                // Only the duplicate of the baseline is held back here (it is
+                // added below); equals(null) is false, so a null baseline
+                // filters nothing.
+                if (value != null && value.length() > 0
+                        && !value.equals(pdfBoxBaseline)) {
+                    metadata.add(property, value);
+                }
+            }
+        }
+        if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
+            metadata.add(property, pdfBoxBaseline);
+        }
+    }
+
+
/**
* This tries to read a list from a particular property in
* XMPSchemaDublinCore.
@@ -285,6 +318,8 @@ public class PDFParser extends AbstractP
* <p>
* Until PDFBOX-1803/TIKA-1233 are fixed, do not call this
* on dates!
+ * <p>
+ * This relies on the property having a DublinCore compliant getName()
*
* @param property
* @param pdfBoxBaseline