You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/05/15 17:48:56 UTC

svn commit: r1594957 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/pdf/PDFParser.java test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Author: tallison
Date: Thu May 15 15:48:55 2014
New Revision: 1594957

URL: http://svn.apache.org/r1594957
Log:
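Temporary bug fix until TIKA-1295 is resolved: when a property does not permit multiple values, only the first language value is extracted from the XMP schema.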

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1594957&r1=1594956&r2=1594957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Thu May 15 15:48:55 2014
@@ -274,6 +274,8 @@ public class PDFParser extends AbstractP
      * Try to extract all multilingual items from the XMPSchema
      * <p>
      * This relies on the property having a valid xmp getName()
+     * <p>
+     * For now, this only extracts the first language if the property does not allow multiple values (see TIKA-1295)
      * @param metadata
      * @param property
      * @param pdfBoxBaseline
@@ -281,21 +283,37 @@ public class PDFParser extends AbstractP
      */
     private void extractMultilingualItems(Metadata metadata, Property property,
             String pdfBoxBaseline, XMPSchema schema) {
+        //if schema is null, just go with pdfBoxBaseline
         if (schema == null) {
             if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
-                metadata.add(property, pdfBoxBaseline);
+                metadata.set(property, pdfBoxBaseline);
             }
             return;
         }
-        
+
         for (String lang : schema.getLanguagePropertyLanguages(property.getName())) {
             String value = schema.getLanguageProperty(property.getName(), lang);
-            if (value != null && pdfBoxBaseline != null 
-                    && ! value.equals(pdfBoxBaseline) && value.length() > 0) {
-                metadata.add(property, value);
+
+            if (value != null && value.length() > 0) {
+                //if you're going to add it below in the baseline addition, don't add it now
+                if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)){
+                    continue;
+                }
+                metadata.add(property, value); 
+                if (! property.isMultiValuePermitted()){
+                    return;
+                }
             }
         }
+
         if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
+            //if we've already added something above and multivalue is not permitted
+            //return.
+            if (! property.isMultiValuePermitted()){
+                if (metadata.get(property) != null){
+                    return;
+                }
+            }
             metadata.add(property,  pdfBoxBaseline);
         }
     }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1594957&r1=1594956&r2=1594957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Thu May 15 15:48:55 2014
@@ -743,6 +743,7 @@ public class PDFParserTest extends TikaT
         assertEquals("pdfaid:part", m.get("pdfaid:part"), "1");
     }
 
+    @Test
     public void testMultipleAuthors() throws Exception {
         String fName = "testPDF_twoAuthors.pdf";
         InputStream is = PDFParserTest.class.getResourceAsStream(
@@ -772,6 +773,24 @@ public class PDFParserTest extends TikaT
         }
     }
 
+    //STUB test for once TIKA-1295 is fixed
+    @Test
+    public void testMultipleTitles() throws Exception {
+        InputStream is = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDFTripleLangTitle.pdf");
+        Parser p = new AutoDetectParser();
+        Metadata m = new Metadata();
+        ParseContext c = new ParseContext();
+        ContentHandler h = new BodyContentHandler();
+        p.parse(is, h, m, c);
+        is.close();
+        //TODO: add other tests as part of TIKA-1295
+        //dc:title-fr-ca (or whatever we decide) should be "Bonjour World"
+        //dc:title-zh-ch is currently hosed...bug in PDFBox while injecting xmp?
+        //
+        assertEquals("Hello World", m.get("dc:title"));
+    }
+
     /**
      * This is a workaround until PDFBox-1922 is fixed.
      * The goal is to test for equality but skip the version issue.