You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/11/18 19:10:30 UTC
svn commit: r1036562 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Author: jukka
Date: Thu Nov 18 18:10:29 2010
New Revision: 1036562
URL: http://svn.apache.org/viewvc?rev=1036562&view=rev
Log:
TIKA-548: PDF content extracted as single line
With test assertions by Staffan Olsson
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1036562&r1=1036561&r2=1036562&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Thu Nov 18 18:10:29 2010
@@ -17,6 +17,7 @@
package org.apache.tika.parser.pdf;
import java.io.IOException;
+import java.io.Writer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
@@ -50,7 +51,19 @@ class PDF2XHTML extends PDFTextStripper
PDDocument document, ContentHandler handler, Metadata metadata)
throws SAXException, TikaException {
try {
- new PDF2XHTML(handler, metadata).getText(document);
+ // Extract text using a dummy Writer as we override the
+ // key methods to output to the given content handler.
+ new PDF2XHTML(handler, metadata).writeText(document, new Writer() {
+ @Override
+ public void write(char[] cbuf, int off, int len) {
+ }
+ @Override
+ public void flush() {
+ }
+ @Override
+ public void close() {
+ }
+ });
} catch (IOException e) {
if (e.getCause() instanceof SAXException) {
throw (SAXException) e.getCause();
@@ -127,4 +140,14 @@ class PDF2XHTML extends PDFTextStripper
}
}
+ @Override
+ protected void writeLineSeparator() throws IOException {
+ try {
+ handler.characters("\n");
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause(
+ "Unable to write a newline character", e);
+ }
+ }
+
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1036562&r1=1036561&r2=1036562&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Thu Nov 18 18:10:29 2010
@@ -47,7 +47,7 @@ public class PDFParserTest extends TestC
}
assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Bertrand Delacrétaz", metadata.get(Metadata.AUTHOR));
+ assertEquals("Bertrand Delacr\u00e9taz", metadata.get(Metadata.AUTHOR));
assertEquals("Apache Tika - Apache Tika", metadata.get(Metadata.TITLE));
// Can't reliably test dates yet - see TIKA-451
@@ -59,6 +59,11 @@ public class PDFParserTest extends TestC
assertTrue(content.contains("Tika - Content Analysis Toolkit"));
assertTrue(content.contains("incubator"));
assertTrue(content.contains("Apache Software Foundation"));
+ // testing how the end of one paragraph is separated from start of the next one
+ assertTrue("should have word boundary after headline",
+ !content.contains("ToolkitApache"));
+ assertTrue("should have word boundary between paragraphs",
+ !content.contains("libraries.Apache"));
}
public void testCustomMetadata() throws Exception {