You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2011/11/26 20:57:16 UTC
svn commit: r1206568 - in /tika/trunk: CHANGES.txt
tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Author: mikemccand
Date: Sat Nov 26 19:57:15 2011
New Revision: 1206568
URL: http://svn.apache.org/viewvc?rev=1206568&view=rev
Log:
TIKA-778: fix cases where PDFParser produced too many </p> tags
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1206568&r1=1206567&r2=1206568&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat Nov 26 19:57:15 2011
@@ -13,7 +13,10 @@ Release 1.1 - Current Development
non-duplicated characters were incorrectly removed (TIKA-767).
Allow controlling whether text tokens should be sorted by their x/y
position before extracting text (TIKA-612); this is necessary for
- certain PDFs.
+ certain PDFs. Fixed cases where too many </p> tags appear in the
+ XHTML output, causing NPE when opening some PDFs with the GUI
+ (TIKA-778).
+
* RTF: Fixed case where a font change would result in processing
bytes in the wrong font's charset, producing bogus text output
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1206568&r1=1206567&r2=1206568&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Sat Nov 26 19:57:15 2011
@@ -126,20 +126,19 @@ class PDF2XHTML extends PDFTextStripper
protected void startPage(PDPage page) throws IOException {
try {
handler.startElement("div", "class", "page");
- handler.startElement("p");
} catch (SAXException e) {
throw new IOExceptionWithCause("Unable to start a page", e);
}
+ writeParagraphStart();
}
@Override
protected void endPage(PDPage page) throws IOException {
try {
+ writeParagraphEnd();
// TODO: remove once PDFBOX-1143 is fixed:
- handler.endElement("p");
if (extractAnnotationText) {
- boolean foundTextAnnots = false;
for(Object o : page.getAnnotations()) {
if ((o instanceof PDAnnotation) && PDAnnotationMarkup.SUB_TYPE_FREETEXT.equals(((PDAnnotation) o).getSubtype())) {
// It's a text annotation:
@@ -149,11 +148,6 @@ class PDF2XHTML extends PDFTextStripper
String contents = annot.getContents();
// TODO: maybe also annot.getRichContents()?
if (title != null || subject != null || contents != null) {
- if (!foundTextAnnots) {
- handler.endElement("p");
- foundTextAnnots = true;
- }
-
handler.startElement("div", "class", "annotation");
if (title != null) {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1206568&r1=1206567&r2=1206568&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Sat Nov 26 19:57:15 2011
@@ -250,6 +250,26 @@ public class PDFParserTest extends TikaT
content = content.replaceAll("[\\s\u00a0]+"," ");
assertContains("Here is some text", content);
assertEquals(-1, content.indexOf("Here is a comment"));
+
+ // TIKA-778: make sure no extra </p> tags
+ String xml = getXML("testAnnotations.pdf").xml;
+ assertEquals(substringCount("<p>", xml),
+ substringCount("</p>", xml));
+ }
+
+ private static int substringCount(String needle, String haystack) {
+ int upto = -1;
+ int count = 0;
+ while(true) {
+ final int next = haystack.indexOf(needle, upto);
+ if (next == -1) {
+ break;
+ }
+ count++;
+ upto = next+1;
+ }
+
+ return count;
}
public void testPageNumber() throws Exception {