You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ju...@apache.org on 2011/12/19 15:52:56 UTC
svn commit: r1220781 - in /pdfbox/trunk/pdfbox: download/
src/main/java/org/apache/pdfbox/tika/ src/test/java/org/apache/pdfbox/tika/
src/test/resources/org/apache/pdfbox/tika/
Author: jukka
Date: Mon Dec 19 14:52:55 2011
New Revision: 1220781
URL: http://svn.apache.org/viewvc?rev=1220781&view=rev
Log:
PDFBOX-1132: Add Tika parser classes
Merge recent changes from Tika trunk. Full details below.
Revision: 1206568
Author: Michael McCandless <mi...@apache.org>
Date: Sat Nov 26 19:57:15 2011 +0000
TIKA-778: fix cases where PDFParser produced too many </p> tags
Revision: 1203287
Author: Michael McCandless <mi...@apache.org>
Date: Thu Nov 17 17:24:26 2011 +0000
TIKA-612: enable controlling PDFBox's setSortByPosition from PDFParser
Revision: 1197630
Author: Michael McCandless <mi...@apache.org>
Date: Fri Nov 4 16:28:38 2011 +0000
TIKA-767: allow controlling whether PDFBox should try to remove overlapped duplicated text; default to disabled
Revision: 1195596
Author: Jukka Zitting <ju...@apache.org>
Date: Mon Oct 31 18:22:06 2011 +0000
TIKA-565: Improved OSGi bundling
Use central OSGiParser and OSGiDetector classes in tika-parsers to better handle issues with missing dependencies and t
Revision: 1195500
Author: Jukka Zitting <ju...@apache.org>
Date: Mon Oct 31 15:01:33 2011 +0000
TIKA-565: Improved OSGi bundling
Mark all Parser services with SCR annotations
Revision: 1186775
Author: Michael McCandless <mi...@apache.org>
Date: Thu Oct 20 12:55:46 2011 +0000
TIKA-724: add option to PDFParser to control auto-space behavior
Revision: 1186771
Author: Michael McCandless <mi...@apache.org>
Date: Thu Oct 20 12:49:50 2011 +0000
TIKA-738: optionally extract PDF annotations
Added:
pdfbox/trunk/pdfbox/download/
pdfbox/trunk/pdfbox/download/pcfi-2010.08.09.jar
pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testExtraSpaces.pdf (with props)
pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testOverlappingText.pdf (with props)
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java
Added: pdfbox/trunk/pdfbox/download/pcfi-2010.08.09.jar
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/download/pcfi-2010.08.09.jar?rev=1220781&view=auto
==============================================================================
Files pdfbox/trunk/pdfbox/download/pcfi-2010.08.09.jar (added) and pdfbox/trunk/pdfbox/download/pcfi-2010.08.09.jar Mon Dec 19 14:52:55 2011 differ
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java?rev=1220781&r1=1220780&r2=1220781&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java Mon Dec 19 14:52:55 2011
@@ -21,6 +21,8 @@ import java.io.Writer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import org.apache.tika.exception.TikaException;
@@ -51,12 +53,16 @@ class PDF2XHTML extends PDFTextStripper
* @throws TikaException if the PDF document can not be processed
*/
public static void process(
- PDDocument document, ContentHandler handler, Metadata metadata)
+ PDDocument document, ContentHandler handler, Metadata metadata,
+ boolean extractAnnotationText, boolean enableAutoSpace,
+ boolean suppressDuplicateOverlappingText, boolean sortByPosition)
throws SAXException, TikaException {
try {
// Extract text using a dummy Writer as we override the
// key methods to output to the given content handler.
- new PDF2XHTML(handler, metadata).writeText(document, new Writer() {
+ new PDF2XHTML(handler, metadata,
+ extractAnnotationText, enableAutoSpace,
+ suppressDuplicateOverlappingText, sortByPosition).writeText(document, new Writer() {
@Override
public void write(char[] cbuf, int off, int len) {
}
@@ -77,12 +83,25 @@ class PDF2XHTML extends PDFTextStripper
}
private final XHTMLContentHandler handler;
+ private final boolean extractAnnotationText;
- private PDF2XHTML(ContentHandler handler, Metadata metadata)
+ private PDF2XHTML(ContentHandler handler, Metadata metadata,
+ boolean extractAnnotationText, boolean enableAutoSpace,
+ boolean suppressDuplicateOverlappingText, boolean sortByPosition)
throws IOException {
this.handler = new XHTMLContentHandler(handler, metadata);
+ this.extractAnnotationText = extractAnnotationText;
setForceParsing(true);
- setSortByPosition(false);
+ setSortByPosition(sortByPosition);
+ if (enableAutoSpace) {
+ setWordSeparator(" ");
+ } else {
+ setWordSeparator("");
+ }
+ // TODO: maybe expose setting these too:
+ //setAverageCharTolerance(1.0f);
+ //setSpacingTolerance(1.0f);
+ setSuppressDuplicateOverlappingText(suppressDuplicateOverlappingText);
}
@Override
@@ -107,16 +126,53 @@ class PDF2XHTML extends PDFTextStripper
protected void startPage(PDPage page) throws IOException {
try {
handler.startElement("div", "class", "page");
- handler.startElement("p");
} catch (SAXException e) {
throw new IOExceptionWithCause("Unable to start a page", e);
}
+ writeParagraphStart();
}
@Override
protected void endPage(PDPage page) throws IOException {
+
try {
- handler.endElement("p");
+ writeParagraphEnd();
+ // TODO: remove once PDFBOX-1143 is fixed:
+ if (extractAnnotationText) {
+ for(Object o : page.getAnnotations()) {
+ if ((o instanceof PDAnnotation) && PDAnnotationMarkup.SUB_TYPE_FREETEXT.equals(((PDAnnotation) o).getSubtype())) {
+ // It's a text annotation:
+ PDAnnotationMarkup annot = (PDAnnotationMarkup) o;
+ String title = annot.getTitlePopup();
+ String subject = annot.getSubject();
+ String contents = annot.getContents();
+ // TODO: maybe also annot.getRichContents()?
+ if (title != null || subject != null || contents != null) {
+ handler.startElement("div", "class", "annotation");
+
+ if (title != null) {
+ handler.startElement("div", "class", "annotationTitle");
+ handler.characters(title);
+ handler.endElement("div");
+ }
+
+ if (subject != null) {
+ handler.startElement("div", "class", "annotationSubject");
+ handler.characters(subject);
+ handler.endElement("div");
+ }
+
+ if (contents != null) {
+ handler.startElement("div", "class", "annotationContents");
+ handler.characters(contents);
+ handler.endElement("div");
+ }
+
+ handler.endElement("div");
+ }
+ }
+ }
+ }
handler.endElement("div");
} catch (SAXException e) {
throw new IOExceptionWithCause("Unable to end a page", e);
@@ -177,7 +233,7 @@ class PDF2XHTML extends PDFTextStripper
@Override
protected void writeWordSeparator() throws IOException {
try {
- handler.characters(" ");
+ handler.characters(getWordSeparator());
} catch (SAXException e) {
throw new IOExceptionWithCause(
"Unable to write a space character", e);
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java?rev=1220781&r1=1220780&r2=1220781&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java Mon Dec 19 14:52:55 2011
@@ -56,6 +56,20 @@ public class PDFParser extends AbstractP
/** Serial version UID */
private static final long serialVersionUID = -752276948656079347L;
+ // True if we let PDFBox "guess" where spaces should go:
+ private boolean enableAutoSpace = true;
+
+ // True if we let PDFBox remove duplicate overlapping text:
+ private boolean suppressDuplicateOverlappingText;
+
+ // True if we extract annotation text ourselves
+ // (workaround for PDFBOX-1143):
+ private boolean extractAnnotationText = true;
+
+ // True if we should sort text tokens by position
+ // (necessary for some PDFs, but messes up other PDFs):
+ private boolean sortByPosition = false;
+
/**
* Metadata key for giving the document password to the parser.
*/
@@ -99,7 +113,9 @@ public class PDFParser extends AbstractP
}
metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
extractMetadata(pdfDocument, metadata);
- PDF2XHTML.process(pdfDocument, handler, metadata);
+ PDF2XHTML.process(pdfDocument, handler, metadata,
+ extractAnnotationText, enableAutoSpace,
+ suppressDuplicateOverlappingText, sortByPosition);
} finally {
pdfDocument.close();
}
@@ -178,4 +194,70 @@ public class PDFParser extends AbstractP
addMetadata(metadata, name, value.toString());
}
}
+
+ /**
+ * If true (the default), the parser should estimate
+ * where spaces should be inserted between words. For
+ * many PDFs this is necessary as they do not include
+ * explicit whitespace characters.
+ */
+ public void setEnableAutoSpace(boolean v) {
+ enableAutoSpace = v;
+ }
+
+ /** @see #setEnableAutoSpace. */
+ public boolean getEnableAutoSpace() {
+ return enableAutoSpace;
+ }
+
+ /**
+ * If true (the default), text in annotations will be
+ * extracted.
+ */
+ public void setExtractAnnotationText(boolean v) {
+ extractAnnotationText = v;
+ }
+
+ /**
+ * If true, text in annotations will be extracted.
+ */
+ public boolean getExtractAnnotationText() {
+ return extractAnnotationText;
+ }
+
+ /**
+ * If true, the parser should try to remove duplicated
+ * text over the same region. This is needed for some
+ * PDFs that achieve bolding by re-writing the same
+ * text in the same area. Note that this can
+ * slow down extraction substantially (PDFBOX-956) and
+ * sometimes remove characters that were not in fact
+ * duplicated (PDFBOX-1155). By default this is disabled.
+ */
+ public void setSuppressDuplicateOverlappingText(boolean v) {
+ suppressDuplicateOverlappingText = v;
+ }
+
+ /** @see #setSuppressDuplicateOverlappingText. */
+ public boolean getSuppressDuplicateOverlappingText() {
+ return suppressDuplicateOverlappingText;
+ }
+
+ /**
+ * If true, sort text tokens by their x/y position
+ * before extracting text. This may be necessary for
+ * some PDFs (if the text tokens are not rendered "in
+ * order"), while for other PDFs it can produce the
+ * wrong result (for example if there are 2 columns,
+ * the text will be interleaved). Default is false.
+ */
+ public void setSortByPosition(boolean v) {
+ sortByPosition = v;
+ }
+
+ /** @see #setSortByPosition. */
+ public boolean getSortByPosition() {
+ return sortByPosition;
+ }
+
}
Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java?rev=1220781&r1=1220780&r2=1220781&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java Mon Dec 19 14:52:55 2011
@@ -219,8 +219,7 @@ public class PDFParserTest extends TestC
//assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
}
- // TIKA-738: re-enable this
- public void IGNOREtestAnnotations() throws Exception {
+ public void testAnnotations() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
@@ -236,15 +235,160 @@ public class PDFParserTest extends TestC
content = content.replaceAll("[\\s\u00a0]+"," ");
assertContains("Here is some text", content);
assertContains("Here is a comment", content);
+
+ // Test w/ annotation text disabled:
+ PDFParser pdfParser = new PDFParser();
+ pdfParser.setExtractAnnotationText(false);
+ handler = new BodyContentHandler();
+ metadata = new Metadata();
+ context = new ParseContext();
+ stream = PDFParserTest.class.getResourceAsStream("testAnnotations.pdf");
+ try {
+ pdfParser.parse(stream, handler, metadata, context);
+ } finally {
+ stream.close();
+ }
+ content = handler.toString();
+ content = content.replaceAll("[\\s\u00a0]+"," ");
+ assertContains("Here is some text", content);
+ assertEquals(-1, content.indexOf("Here is a comment"));
+
+ // TIKA-738: make sure no extra </p> tags
+ String xml = getXML("testAnnotations.pdf").xml;
+ assertEquals(substringCount("<p>", xml),
+ substringCount("</p>", xml));
+ }
+
+ private static int substringCount(String needle, String haystack) {
+ int upto = -1;
+ int count = 0;
+ while(true) {
+ final int next = haystack.indexOf(needle, upto);
+ if (next == -1) {
+ break;
+ }
+ count++;
+ upto = next+1;
+ }
+
+ return count;
}
public void testPageNumber() throws Exception {
- String result = getXML("testPageNumber.pdf");
+ String result = getXML("testPageNumber.pdf").xml;
String content = result.replaceAll("\\s+","");
assertContains("<p>1</p>", content);
}
- private String getXML(String filename) throws Exception {
+ public void testDisableAutoSpace() throws Exception {
+ PDFParser parser = new PDFParser();
+ parser.setEnableAutoSpace(false);
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ InputStream stream = PDFParserTest.class.getResourceAsStream("testExtraSpaces.pdf");
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } finally {
+ stream.close();
+ }
+ String content = handler.toString();
+ content = content.replaceAll("[\\s\u00a0]+"," ");
+ // Text is correct when autoSpace is off:
+ assertContains("Here is some formatted text", content);
+
+ parser.setEnableAutoSpace(true);
+ handler = new BodyContentHandler();
+ metadata = new Metadata();
+ context = new ParseContext();
+ stream = PDFParserTest.class.getResourceAsStream("testExtraSpaces.pdf");
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } finally {
+ stream.close();
+ }
+ content = handler.toString();
+ content = content.replaceAll("[\\s\u00a0]+"," ");
+ // Same document re-parsed with autoSpace turned back on:
+
+ // Text has extra spaces when autoSpace is on
+ assertEquals(-1, content.indexOf("Here is some formatted text"));
+ }
+
+ public void testDuplicateOverlappingText() throws Exception {
+ PDFParser parser = new PDFParser();
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ InputStream stream =PDFParserTest.class.getResourceAsStream("testOverlappingText.pdf");
+ // Default is false (keep overlapping text):
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } finally {
+ stream.close();
+ }
+ String content = handler.toString();
+ assertContains("Text the first timeText the second time", content);
+
+ parser.setSuppressDuplicateOverlappingText(true);
+ handler = new BodyContentHandler();
+ metadata = new Metadata();
+ context = new ParseContext();
+ stream = PDFParserTest.class.getResourceAsStream("testOverlappingText.pdf");
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } finally {
+ stream.close();
+ }
+ content = handler.toString();
+ // "Text the first" was dedup'd:
+ assertContains("Text the first timesecond time", content);
+ }
+
+ public void testSortByPosition() throws Exception {
+ PDFParser parser = new PDFParser();
+ parser.setEnableAutoSpace(false);
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ InputStream stream = PDFParserTest.class.getResourceAsStream("testPDFTwoTextBoxes.pdf");
+ // Default is false (do not sort):
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } finally {
+ stream.close();
+ }
+ String content = handler.toString();
+ content = content.replaceAll("\\s+", " ");
+ assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);
+
+ parser.setSortByPosition(true);
+ handler = new BodyContentHandler();
+ metadata = new Metadata();
+ context = new ParseContext();
+ stream = PDFParserTest.class.getResourceAsStream("testPDFTwoTextBoxes.pdf");
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } finally {
+ stream.close();
+ }
+ content = handler.toString();
+ content = content.replaceAll("\\s+", " ");
+ // Column text is now interleaved:
+ assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
+ }
+
+ private static class XMLResult {
+ public final String xml;
+ public final Metadata metadata;
+
+ public XMLResult(String xml, Metadata metadata) {
+ this.xml = xml;
+ this.metadata = metadata;
+ }
+ }
+
+ private XMLResult getXML(String filename) throws Exception {
Metadata metadata = new Metadata();
Parser parser = new AutoDetectParser(); // Should auto-detect!
StringWriter sw = new StringWriter();
@@ -255,12 +399,11 @@ public class PDFParserTest extends TestC
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
handler.setResult(new StreamResult(sw));
- // Try with a document containing various tables and formatting
- InputStream input = PDFParserTest.class.getResourceAsStream(
- filename);
+ // Try with a document containing various tables and formattings
+ InputStream input = PDFParserTest.class.getResourceAsStream(filename);
try {
parser.parse(input, handler, metadata, new ParseContext());
- return sw.toString();
+ return new XMLResult(sw.toString(), metadata);
} finally {
input.close();
}
Added: pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testExtraSpaces.pdf
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testExtraSpaces.pdf?rev=1220781&view=auto
==============================================================================
Binary file - no diff available.
Propchange: pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testExtraSpaces.pdf
------------------------------------------------------------------------------
svn:executable = *
Propchange: pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testExtraSpaces.pdf
------------------------------------------------------------------------------
svn:mime-type = application/pdf
Added: pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testOverlappingText.pdf
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testOverlappingText.pdf?rev=1220781&view=auto
==============================================================================
Binary file - no diff available.
Propchange: pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testOverlappingText.pdf
------------------------------------------------------------------------------
svn:mime-type = application/pdf