You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/09/14 21:02:32 UTC

svn commit: r695266 - in /incubator/tika/trunk: CHANGES.txt src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java

Author: jukka
Date: Sun Sep 14 12:02:32 2008
New Revision: 695266

URL: http://svn.apache.org/viewvc?rev=695266&view=rev
Log:
TIKA-114: PDFParser: Getting content of the document using "writer.ToString()", some words are stuck together

Patch by Dave Meikle.

Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=695266&r1=695265&r2=695266&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Sep 14 12:02:32 2008
@@ -81,6 +81,10 @@
 34. TIKA-54  - Outlook msg parser
                (Rida Benjelloun, Dave Meikle & Jukka Zitting)
 
+35. TIKA-114 - PDFParser : Getting content of the document using
+               "writer.ToString ()" , some words are stuck together
+               (Dave Meikle)
+
 Release 0.1-incubating - 12/27/2007
 
 1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=695266&r1=695265&r2=695266&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Sun Sep 14 12:02:32 2008
@@ -39,7 +39,7 @@
     /**
      * Converts the given PDF document (and related metadata) to a stream
      * of XHTML SAX events sent to the given content handler.
-     * 
+     *
      * @param document PDF document
      * @param handler SAX content handler
      * @param metadata PDF metadata
@@ -124,21 +124,45 @@
         }
     }
 
-    protected void processLineSeparator(TextPosition p) throws IOException {
-        try {
-            handler.characters("\n");
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to write a newline", e);
+    // Two methods added to work around lack of support for processWordSeparator
+    // and processLineSeparator in PDFBox-0.7.3. This is fixed in CVS Head (PDFBox-0.7.4)
+    public String getWordSeparator()
+    {
+        try
+        {
+            handler.characters(" ");
+        } catch(SAXException e) {
+
         }
+        return super.getWordSeparator();    //To change body of overridden methods use File | Settings | File Templates.
     }
 
-    protected void processWordSeparator(TextPosition a, TextPosition b)
-            throws IOException {
-        try {
-            handler.characters(" ");
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to write a space", e);
+    public String getLineSeparator()
+    {
+        try
+        {
+            handler.characters("\n");
+        } catch(SAXException e) {
+
         }
+        return super.getLineSeparator();
     }
 
+//    protected void processLineSeparator(TextPosition p) throws IOException {
+//        try {
+//            handler.characters("\n");
+//        } catch (SAXException e) {
+//            throw new IOExceptionWithCause("Unable to write a newline", e);
+//        }
+//    }
+//
+//    protected void processWordSeparator(TextPosition a, TextPosition b)
+//            throws IOException {
+//        try {
+//            handler.characters(" ");
+//        } catch (SAXException e) {
+//            throw new IOExceptionWithCause("Unable to write a space", e);
+//        }
+//    }
+
 }