You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/09/02 14:58:22 UTC
svn commit: r1379960 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/pdf/
tika-parsers/src/test/java/org/apache/tika/parser/pdf/
tika-parsers/src/test/resources/test-documents/
Author: mikemccand
Date: Sun Sep 2 12:58:21 2012
New Revision: 1379960
URL: http://svn.apache.org/viewvc?rev=1379960&view=rev
Log:
TIKA-981: also extract from PDF pop-up annotations
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPopupAnnotation.pdf (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1379960&r1=1379959&r2=1379960&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Sep 2 12:58:21 2012
@@ -5,6 +5,8 @@ Release 1.3 - Current Development
you can see where in the main text the embedded document
occurred. (TIKA-956)
+ * PDF: Text from pop-up annotations is now extracted (TIKA-981)
+
Release 1.2 - 07/10/2012
---------------------------------
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1379960&r1=1379959&r2=1379960&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Sun Sep 2 12:58:21 2012
@@ -24,7 +24,6 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
import org.apache.pdfbox.util.TextPosition;
@@ -159,12 +158,11 @@ class PDF2XHTML extends PDFTextStripper
}
}
}
-
- if ((o instanceof PDAnnotation) && PDAnnotationMarkup.SUB_TYPE_FREETEXT.equals(((PDAnnotation) o).getSubtype())) {
- // It's a text annotation:
+
+ if (o instanceof PDAnnotationMarkup) {
PDAnnotationMarkup annot = (PDAnnotationMarkup) o;
String title = annot.getTitlePopup();
- String subject = annot.getTitlePopup();
+ String subject = annot.getSubject();
String contents = annot.getContents();
// TODO: maybe also annot.getRichContents()?
if (title != null || subject != null || contents != null) {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1379960&r1=1379959&r2=1379960&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Sun Sep 2 12:58:21 2012
@@ -297,6 +297,23 @@ public class PDFParserTest extends TikaT
substringCount("</p>", xml));
}
+ // TIKA-981
+ public void testPopupAnnotation() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ InputStream stream = getResourceAsStream("/test-documents/testPopupAnnotation.pdf");
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } finally {
+ stream.close();
+ }
+ String content = handler.toString();
+ assertContains("this is the note", content);
+ assertContains("igalsh", content);
+ }
+
public void testEmbeddedPDFs() throws Exception {
String xml = getXML("testPDFPackage.pdf").xml;
assertContains("PDF1", xml);
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPopupAnnotation.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPopupAnnotation.pdf?rev=1379960&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPopupAnnotation.pdf
------------------------------------------------------------------------------
svn:mime-type = application/pdf