You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2011/11/04 17:28:39 UTC
svn commit: r1197630 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/pdf/
tika-parsers/src/test/java/org/apache/tika/parser/pdf/
tika-parsers/src/test/resources/test-documents/
Author: mikemccand
Date: Fri Nov 4 16:28:38 2011
New Revision: 1197630
URL: http://svn.apache.org/viewvc?rev=1197630&view=rev
Log:
TIKA-767: allow controlling whether PDFBox should try to remove overlapped duplicated text; default to disabled
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testOverlappingText.pdf (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1197630&r1=1197629&r2=1197630&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Nov 4 16:28:38 2011
@@ -1,6 +1,14 @@
Apache Tika Change Log
======================
+Release 1.1 - Current Development
+---------------------------------
+
+ * PDF: Allow controlling whether overlapping duplicated text should
+ be removed. Disabling this (the default) can give big
+ speedups to text extraction and may work around cases where
+ non-duplicated characters were incorrectly removed. (TIKA-767)
+
Release 1.0 - 11/4/2011
---------------------------------
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1197630&r1=1197629&r2=1197630&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Fri Nov 4 16:28:38 2011
@@ -53,12 +53,16 @@ class PDF2XHTML extends PDFTextStripper
* @throws TikaException if the PDF document can not be processed
*/
public static void process(
- PDDocument document, ContentHandler handler, Metadata metadata, boolean extractAnnotationText, boolean enableAutoSpace)
+ PDDocument document, ContentHandler handler, Metadata metadata,
+ boolean extractAnnotationText, boolean enableAutoSpace,
+ boolean suppressDuplicateOverlappingText)
throws SAXException, TikaException {
try {
// Extract text using a dummy Writer as we override the
// key methods to output to the given content handler.
- new PDF2XHTML(handler, metadata, extractAnnotationText, enableAutoSpace).writeText(document, new Writer() {
+ new PDF2XHTML(handler, metadata,
+ extractAnnotationText, enableAutoSpace,
+ suppressDuplicateOverlappingText).writeText(document, new Writer() {
@Override
public void write(char[] cbuf, int off, int len) {
}
@@ -81,7 +85,9 @@ class PDF2XHTML extends PDFTextStripper
private final XHTMLContentHandler handler;
private final boolean extractAnnotationText;
- private PDF2XHTML(ContentHandler handler, Metadata metadata, boolean extractAnnotationText, boolean enableAutoSpace)
+ private PDF2XHTML(ContentHandler handler, Metadata metadata,
+ boolean extractAnnotationText, boolean enableAutoSpace,
+ boolean suppressDuplicateOverlappingText)
throws IOException {
this.handler = new XHTMLContentHandler(handler, metadata);
this.extractAnnotationText = extractAnnotationText;
@@ -95,6 +101,7 @@ class PDF2XHTML extends PDFTextStripper
// TODO: maybe expose setting these too:
//setAverageCharTolerance(1.0f);
//setSpacingTolerance(1.0f);
+ setSuppressDuplicateOverlappingText(suppressDuplicateOverlappingText);
}
@Override
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1197630&r1=1197629&r2=1197630&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Fri Nov 4 16:28:38 2011
@@ -57,6 +57,9 @@ public class PDFParser extends AbstractP
// True if we let PDFBox "guess" where spaces should go:
private boolean enableAutoSpace = true;
+ // True if we let PDFBox remove duplicate overlapping text:
+ private boolean suppressDuplicateOverlappingText;
+
/**
* Metadata key for giving the document password to the parser.
*
@@ -93,7 +96,7 @@ public class PDFParser extends AbstractP
}
metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
extractMetadata(pdfDocument, metadata);
- PDF2XHTML.process(pdfDocument, handler, metadata, extractAnnotationText, enableAutoSpace);
+ PDF2XHTML.process(pdfDocument, handler, metadata, extractAnnotationText, enableAutoSpace, suppressDuplicateOverlappingText);
} finally {
pdfDocument.close();
}
@@ -200,4 +203,23 @@ public class PDFParser extends AbstractP
public boolean getExtractAnnotationText() {
return extractAnnotationText;
}
+
+ /**
+ * If true, the parser should try to remove duplicated
+ * text over the same region. This is needed for some
+ * PDFs that achieve bolding by re-writing the same
+ * text in the same area. Note that this can
+ * slow down extraction substantially (PDFBOX-956) and
+ * sometimes remove characters that were not in fact
+ * duplicated (PDFBOX-1155). By default this is disabled.
+ */
+ public void setSuppressDuplicateOverlappingText(boolean v) {
+ suppressDuplicateOverlappingText = v;
+ }
+
+ /** @see #setSuppressDuplicateOverlappingText(boolean) */
+ public boolean getSuppressDuplicateOverlappingText() {
+ return suppressDuplicateOverlappingText;
+ }
+
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1197630&r1=1197629&r2=1197630&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri Nov 4 16:28:38 2011
@@ -293,6 +293,36 @@ public class PDFParserTest extends TikaT
assertEquals(-1, content.indexOf("Here is some formatted text"));
}
+ public void testDuplicateOverlappingText() throws Exception {
+ PDFParser parser = new PDFParser();
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ InputStream stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
+ // Default is false (keep overlapping text):
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } finally {
+ stream.close();
+ }
+ String content = handler.toString();
+ assertContains("Text the first timeText the second time", content);
+
+ parser.setSuppressDuplicateOverlappingText(true);
+ handler = new BodyContentHandler();
+ metadata = new Metadata();
+ context = new ParseContext();
+ stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } finally {
+ stream.close();
+ }
+ content = handler.toString();
+ // The repeated "Text the " before "second time" was dedup'd:
+ assertContains("Text the first timesecond time", content);
+ }
+
private static class XMLResult {
public final String xml;
public final Metadata metadata;
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testOverlappingText.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testOverlappingText.pdf?rev=1197630&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testOverlappingText.pdf
------------------------------------------------------------------------------
svn:mime-type = application/pdf