You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/05/15 17:48:56 UTC
svn commit: r1594957 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/pdf/PDFParser.java
test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Author: tallison
Date: Thu May 15 15:48:55 2014
New Revision: 1594957
URL: http://svn.apache.org/r1594957
Log:
Temporary bug fix until TIKA-1295 is resolved: for properties that do not permit multiple values, extract only the first language value.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1594957&r1=1594956&r2=1594957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Thu May 15 15:48:55 2014
@@ -274,6 +274,8 @@ public class PDFParser extends AbstractP
* Try to extract all multilingual items from the XMPSchema
* <p>
* This relies on the property having a valid xmp getName()
+ * <p>
+ * For now, this only extracts the first language if the property does not allow multiple values (see TIKA-1295)
* @param metadata
* @param property
* @param pdfBoxBaseline
@@ -281,21 +283,37 @@ public class PDFParser extends AbstractP
*/
private void extractMultilingualItems(Metadata metadata, Property property,
String pdfBoxBaseline, XMPSchema schema) {
+ //if schema is null, just go with pdfBoxBaseline
if (schema == null) {
if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
- metadata.add(property, pdfBoxBaseline);
+ metadata.set(property, pdfBoxBaseline);
}
return;
}
-
+
for (String lang : schema.getLanguagePropertyLanguages(property.getName())) {
String value = schema.getLanguageProperty(property.getName(), lang);
- if (value != null && pdfBoxBaseline != null
- && ! value.equals(pdfBoxBaseline) && value.length() > 0) {
- metadata.add(property, value);
+
+ if (value != null && value.length() > 0) {
+ //skip a value identical to the baseline here; the baseline is handled after the loop
+ if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)){
+ continue;
+ }
+ metadata.add(property, value);
+ if (! property.isMultiValuePermitted()){
+ return;
+ }
}
}
+
if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
+ //if multiple values are not permitted and the property already holds a value,
+ //return without adding the baseline.
+ if (! property.isMultiValuePermitted()){
+ if (metadata.get(property) != null){
+ return;
+ }
+ }
metadata.add(property, pdfBoxBaseline);
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1594957&r1=1594956&r2=1594957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Thu May 15 15:48:55 2014
@@ -743,6 +743,7 @@ public class PDFParserTest extends TikaT
assertEquals("pdfaid:part", m.get("pdfaid:part"), "1");
}
+ @Test
public void testMultipleAuthors() throws Exception {
String fName = "testPDF_twoAuthors.pdf";
InputStream is = PDFParserTest.class.getResourceAsStream(
@@ -772,6 +773,24 @@ public class PDFParserTest extends TikaT
}
}
+ //STUB test for once TIKA-1295 is fixed
+ @Test
+ public void testMultipleTitles() throws Exception {
+ InputStream is = PDFParserTest.class.getResourceAsStream(
+ "/test-documents/testPDFTripleLangTitle.pdf");
+ Parser p = new AutoDetectParser();
+ Metadata m = new Metadata();
+ ParseContext c = new ParseContext();
+ ContentHandler h = new BodyContentHandler();
+ p.parse(is, h, m, c);
+ is.close();
+ //TODO: add other tests as part of TIKA-1295
+ //dc:title-fr-ca (or whatever we decide) should be "Bonjour World"
+ //dc:title-zh-ch is currently hosed...bug in PDFBox while injecting xmp?
+ //
+ assertEquals("Hello World", m.get("dc:title"));
+ }
+
/**
* This is a workaround until PDFBox-1922 is fixed.
* The goal is to test for equality but skip the version issue.