You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/06/24 03:12:46 UTC

svn commit: r1604989 - in /tika/trunk/tika-parsers: pom.xml src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java src/main/java/org/apache/tika/parser/pdf/PDFParser.java src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Author: tallison
Date: Tue Jun 24 01:12:45 2014
New Revision: 1604989

URL: http://svn.apache.org/r1604989
Log:
TIKA-1352 upgrade to PDFBox 1.8.6

Modified:
    tika/trunk/tika-parsers/pom.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1604989&r1=1604988&r2=1604989&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Tue Jun 24 01:12:45 2014
@@ -39,7 +39,7 @@
     <codec.version>1.5</codec.version> <!-- NOTE: sync with POI -->
     <mime4j.version>0.7.2</mime4j.version>
     <vorbis.version>0.6</vorbis.version>
-    <pdfbox.version>1.8.5</pdfbox.version>
+    <pdfbox.version>1.8.6</pdfbox.version>
   </properties>
 
   <dependencies>

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1604989&r1=1604988&r2=1604989&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Tue Jun 24 01:12:45 2014
@@ -92,7 +92,7 @@ class PDF2XHTML extends PDFTextStripper 
     private final static int MAX_ACROFORM_RECURSIONS = 10;
 
 
-    // TODO: remove once PDFBOX-1130 is fixed:
+    // TODO: remove once PDFBOX-2160 is fixed:
     private boolean inParagraph = false;
 
     /**
@@ -353,7 +353,7 @@ class PDF2XHTML extends PDFTextStripper 
 
     @Override
     protected void writeParagraphStart() throws IOException {
-        // TODO: remove once PDFBOX-1130 is fixed
+        // TODO: remove once PDFBOX-2160 is fixed
         if (inParagraph) {
             // Close last paragraph
             writeParagraphEnd();
@@ -369,7 +369,7 @@ class PDF2XHTML extends PDFTextStripper 
 
     @Override
     protected void writeParagraphEnd() throws IOException {
-        // TODO: remove once PDFBOX-1130 is fixed
+        // TODO: remove once PDFBOX-2160 is fixed
         if (!inParagraph) {
             writeParagraphStart();
         }
@@ -535,29 +535,22 @@ class PDF2XHTML extends PDFTextStripper 
         handler.endElement("div");
     }
 
-    private void processAcroField(PDField field, XHTMLContentHandler handler, final int recurseDepth)
+    private void processAcroField(PDField field, XHTMLContentHandler handler, final int currentRecursiveDepth)
             throws SAXException, IOException { 
 
-        if (recurseDepth >= MAX_ACROFORM_RECURSIONS) {
+        if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
             return;
         }
 
         addFieldString(field, handler);
 
-        @SuppressWarnings("rawtypes")
-        List kids = field.getKids();
+        List<COSObjectable> kids = field.getKids();
         if(kids != null) {
 
-            @SuppressWarnings("rawtypes")
-            Iterator kidsIter = kids.iterator();
-            if (kidsIter == null) {
-                return;
-            }
-            int r = recurseDepth+1;
+            int r = currentRecursiveDepth+1;
             handler.startElement("ol");
             //TODO: can generate <ol/>. Rework to avoid that.
-            while(kidsIter.hasNext()) {
-                Object pdfObj = kidsIter.next();
+            for(COSObjectable pdfObj : kids) {
                 if(pdfObj != null && pdfObj instanceof PDField) {
                     PDField kid = (PDField)pdfObj;
                     //recurse
@@ -596,6 +589,8 @@ class PDF2XHTML extends PDFTextStripper 
             }
         } catch (IOException e) {
             //swallow
+        } catch (NullPointerException e) {
+            //TODO: remove once PDFBOX-2161 is fixed
         }
 
         if (attrs.getLength() > 0 || sb.length() > 0) {

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1604989&r1=1604988&r2=1604989&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Tue Jun 24 01:12:45 2014
@@ -227,10 +227,11 @@ public class PDFParser extends AbstractP
         //Caveats:
         //    there is currently a fair amount of redundancy
         //    TikaCoreProperties.FORMAT can be multivalued
-        //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
+        //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion        
         metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
         metadata.add(TikaCoreProperties.FORMAT.getName(), 
-            MEDIA_TYPE.toString()+"; version="+Float.toString(document.getDocument().getVersion()));
+            MEDIA_TYPE.toString()+"; version="+
+            Float.toString(document.getDocument().getVersion()));
 
         try {           
             if( xmp != null ) {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1604989&r1=1604988&r2=1604989&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Tue Jun 24 01:12:45 2014
@@ -467,9 +467,6 @@ public class PDFParserTest extends TikaT
     }
     
     //TIKA-1124
-    //IGNORE until TIKA-1298/PDFBOX 2079 is fixed or we all 
-    //move to Java 1.7
-    @Ignore 
     @Test
     public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
        /* format of test doc:
@@ -527,20 +524,26 @@ public class PDFParserTest extends TikaT
      */
     @Test
     public void testSequentialParser() throws Exception{
-        Parser defaultParser = new AutoDetectParser();
+
         Parser sequentialParser = new AutoDetectParser();
-        ParseContext context = new ParseContext();
-        PDFParserConfig config = new PDFParserConfig();
-        config.setUseNonSequentialParser(true);
-        context.set(PDFParserConfig.class, config);
+        Parser nonSequentialParser = new AutoDetectParser();
+
+        ParseContext seqContext = new ParseContext();
+        PDFParserConfig seqConfig = new PDFParserConfig();
+        seqConfig.setUseNonSequentialParser(false);
+        seqContext.set(PDFParserConfig.class, seqConfig);
+
+        ParseContext nonSeqContext = new ParseContext();
+        PDFParserConfig nonSeqConfig = new PDFParserConfig();
+        nonSeqConfig.setUseNonSequentialParser(true);
+        nonSeqContext.set(PDFParserConfig.class, nonSeqConfig);
 
         File testDocs = new File(this.getClass().getResource("/test-documents").toURI());
         int pdfs = 0;
         Set<String> knownMetadataDiffs = new HashSet<String>();
         //PDFBox-1792/Tika-1203
         knownMetadataDiffs.add("testAnnotations.pdf");
-        //PDFBox-1792
-        knownMetadataDiffs.add("test_acroForm2.pdf");
+
         //empty for now
         Set<String> knownContentDiffs = new HashSet<String>();
 
@@ -550,25 +553,25 @@ public class PDFParserTest extends TikaT
             }
 
             pdfs++;
-            Metadata defaultMetadata = new Metadata();
-            String defaultContent = getText(new FileInputStream(f), defaultParser, defaultMetadata);
-
             Metadata sequentialMetadata = new Metadata();
-            String sequentialContent = getText(new FileInputStream(f), sequentialParser, context, sequentialMetadata);
+            String sequentialContent = getText(new FileInputStream(f), 
+                sequentialParser, seqContext, sequentialMetadata);
+
+            Metadata nonSequentialMetadata = new Metadata();
+            String nonSequentialContent = getText(new FileInputStream(f), 
+                nonSequentialParser, nonSeqContext, nonSequentialMetadata);
 
             if (knownContentDiffs.contains(f.getName())) {
-                assertFalse(f.getName(), defaultContent.equals(sequentialContent));
+                assertFalse(f.getName(), sequentialContent.equals(nonSequentialContent));
             } else {
-                assertEquals(f.getName(), defaultContent, sequentialContent);
+                assertEquals(f.getName(), sequentialContent, nonSequentialContent);
             }
 
             //skip this one file.
             if (knownMetadataDiffs.contains(f.getName())) {
-                //turn back on once PDFBOX-1922 is fixed
-                //assertFalse(f.getName(), defaultMetadata.equals(sequentialMetadata));
+                assertFalse(f.getName(), sequentialMetadata.equals(nonSequentialMetadata));
             } else {
-                //assertEquals(f.getName(), defaultMetadata, sequentialMetadata);
-                testMetadataEquality(f.getName(), defaultMetadata, sequentialMetadata);
+                assertEquals(f.getName(), sequentialMetadata, nonSequentialMetadata);
             }
         }
         //make sure nothing went wrong with getting the resource to test-documents
@@ -816,55 +819,6 @@ public class PDFParserTest extends TikaT
         assertEquals("Hello World", m.get("dc:title"));
     }
 
-    /**
-     * This is a workaround until PDFBox-1922 is fixed.
-     * The goal is to test for equality but skip the version issue.
-     * TODO: get rid of this asap and revert back to this.Metadata.equals(thatMetadata)!
-     * @return equal or not (ignore version differences)
-     */
-    private void testMetadataEquality(String fName, Metadata thisMetadata,
-            Metadata thatMetadata) {
-        String[] thisNames = thisMetadata.names();
-        String[] thatNames = thatMetadata.names();
-
-        assertTrue("metadata null test: "+fName, 
-         (thisNames == null && thatNames == null) ||
-         (thisNames != null && thatNames != null));
-        
-        assertEquals("metadata length: "+fName, thisNames.length, thatMetadata.names().length);
-        
-        for (String n : thisNames) {
-            //don't pay attention to differences here for now
-            if (n.equals("pdf:PDFVersion") || n.equals("dc:format")) {
-                continue;
-            }
-            if (thisMetadata.isMultiValued(n) && thatMetadata.isMultiValued(n)) {
-                String[] thisValues = thisMetadata.getValues(n);
-                String[] thatValues = thatMetadata.getValues(n);
-                testEqualMetadataValue(fName, thisValues, thatValues);
-            } else if (! thisMetadata.isMultiValued(n) && ! thatMetadata.isMultiValued(n)) {
-                assertEquals("unequal multivalued values: " + fName, thisMetadata.get(n), thatMetadata.get(n));
-            } else {
-                //one is multivalued and the other isn't
-                assertTrue("one multivalued, other isn't: "+fName, false);
-            }
-        }
-    }
-    
-    private void testEqualMetadataValue(String fName, String[] thisValues, String[] thatValues) {
-        assertTrue("null equality of metadata values: "+fName, 
-                (thisValues == null && thatValues == null) ||
-                (thisValues != null && thatValues != null));
-
-        assertEquals("metadata values length: "+fName, thisValues.length, thatValues.length);
-        List<String> list = Arrays.asList(thatValues);
-        for (String v : thisValues) {
-            if (! list.contains(v)) {
-                assertTrue("metadata value; that doesn't contain" + v, false);
-            }
-        }
-    }
-
     @Test
     public void testInlineSelector() throws Exception {