You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/09/14 21:02:32 UTC
svn commit: r695266 - in /incubator/tika/trunk: CHANGES.txt
src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
Author: jukka
Date: Sun Sep 14 12:02:32 2008
New Revision: 695266
URL: http://svn.apache.org/viewvc?rev=695266&view=rev
Log:
TIKA-114: PDFParser : Getting content of the document using "writer.ToString ()" , some words are stuck together
Patch by Dave Meikle.
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=695266&r1=695265&r2=695266&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Sep 14 12:02:32 2008
@@ -81,6 +81,10 @@
34. TIKA-54 - Outlook msg parser
(Rida Benjelloun, Dave Meikle & Jukka Zitting)
+35. TIKA-114 - PDFParser : Getting content of the document using
+ "writer.ToString ()" , some words are stuck together
+ (Dave Meikle)
+
Release 0.1-incubating - 12/27/2007
1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=695266&r1=695265&r2=695266&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Sun Sep 14 12:02:32 2008
@@ -39,7 +39,7 @@
/**
* Converts the given PDF document (and related metadata) to a stream
* of XHTML SAX events sent to the given content handler.
- *
+ *
* @param document PDF document
* @param handler SAX content handler
* @param metadata PDF metadata
@@ -124,21 +124,45 @@
}
}
- protected void processLineSeparator(TextPosition p) throws IOException {
- try {
- handler.characters("\n");
- } catch (SAXException e) {
- throw new IOExceptionWithCause("Unable to write a newline", e);
+ // The two methods below work around the lack of support for processWordSeparator
+ // and processLineSeparator in PDFBox 0.7.3. This is fixed in CVS HEAD (PDFBox 0.7.4).
+ public String getWordSeparator()
+ {
+ try
+ {
+ handler.characters(" ");
+ } catch(SAXException e) {
+
}
+ return super.getWordSeparator(); // Delegate to the superclass for the actual separator string.
}
- protected void processWordSeparator(TextPosition a, TextPosition b)
- throws IOException {
- try {
- handler.characters(" ");
- } catch (SAXException e) {
- throw new IOExceptionWithCause("Unable to write a space", e);
+ public String getLineSeparator()
+ {
+ try
+ {
+ handler.characters("\n");
+ } catch(SAXException e) {
+
}
+ return super.getLineSeparator();
}
+// protected void processLineSeparator(TextPosition p) throws IOException {
+// try {
+// handler.characters("\n");
+// } catch (SAXException e) {
+// throw new IOExceptionWithCause("Unable to write a newline", e);
+// }
+// }
+//
+// protected void processWordSeparator(TextPosition a, TextPosition b)
+// throws IOException {
+// try {
+// handler.characters(" ");
+// } catch (SAXException e) {
+// throw new IOExceptionWithCause("Unable to write a space", e);
+// }
+// }
+
}