You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/09/02 14:58:22 UTC

svn commit: r1379960 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/pdf/ tika-parsers/src/test/java/org/apache/tika/parser/pdf/ tika-parsers/src/test/resources/test-documents/

Author: mikemccand
Date: Sun Sep  2 12:58:21 2012
New Revision: 1379960

URL: http://svn.apache.org/viewvc?rev=1379960&view=rev
Log:
TIKA-981: also extract from PDF pop-up annotations

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPopupAnnotation.pdf   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1379960&r1=1379959&r2=1379960&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Sep  2 12:58:21 2012
@@ -5,6 +5,8 @@ Release 1.3 - Current Development
     you can see where in the main text the embedded document
     occurred. (TIKA-956)
 
+  * PDF: Text from pop-up annotations is now extracted (TIKA-981)
+
 Release 1.2 - 07/10/2012
 ---------------------------------
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1379960&r1=1379959&r2=1379960&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Sun Sep  2 12:58:21 2012
@@ -24,7 +24,6 @@ import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.util.PDFTextStripper;
 import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
 import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
 import org.apache.pdfbox.util.TextPosition;
@@ -159,12 +158,11 @@ class PDF2XHTML extends PDFTextStripper 
                              }
                         }
                     }
-                
-                    if ((o instanceof PDAnnotation) && PDAnnotationMarkup.SUB_TYPE_FREETEXT.equals(((PDAnnotation) o).getSubtype())) {
-                        // It's a text annotation:
+
+                    if (o instanceof PDAnnotationMarkup) {
                         PDAnnotationMarkup annot = (PDAnnotationMarkup) o;
                         String title = annot.getTitlePopup();
-                        String subject = annot.getTitlePopup();
+                        String subject = annot.getSubject();
                         String contents = annot.getContents();
                         // TODO: maybe also annot.getRichContents()?
                         if (title != null || subject != null || contents != null) {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1379960&r1=1379959&r2=1379960&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Sun Sep  2 12:58:21 2012
@@ -297,6 +297,23 @@ public class PDFParserTest extends TikaT
                      substringCount("</p>", xml));
     }
 
+    // TIKA-981
+    public void testPopupAnnotation() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+        InputStream stream = getResourceAsStream("/test-documents/testPopupAnnotation.pdf");
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+        String content = handler.toString();
+        assertContains("this is the note", content);
+        assertContains("igalsh", content);
+    }
+
     public void testEmbeddedPDFs() throws Exception {
         String xml = getXML("testPDFPackage.pdf").xml;
         assertContains("PDF1", xml);

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPopupAnnotation.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPopupAnnotation.pdf?rev=1379960&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPopupAnnotation.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/pdf