You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ca...@apache.org on 2009/02/25 17:50:41 UTC

svn commit: r747858 - in /incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox: ExtractText.java util/PDFText2HTML.java util/PDFTextStripper.java

Author: carrier
Date: Wed Feb 25 16:50:41 2009
New Revision: 747858

URL: http://svn.apache.org/viewvc?rev=747858&view=rev
Log:
Patch for PDFBOX-434 to add new HTML output features for text extraction. Patch by Justin LeFebvre <justinl at basistech dot com>

Modified:
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/ExtractText.java
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/ExtractText.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/ExtractText.java?rev=747858&r1=747857&r2=747858&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/ExtractText.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/ExtractText.java Wed Feb 25 16:50:41 2009
@@ -39,20 +39,6 @@
  */
 public class ExtractText
 {
-    /**
-     * This is the default encoding of the text to be output.
-     */
-    public static final String DEFAULT_ENCODING =
-        null;
-        //"ISO-8859-1";
-        //"ISO-8859-6"; //arabic
-        //"US-ASCII";
-        //"UTF-8";
-        //"UTF-16";
-        //"UTF-16BE";
-        //"UTF-16LE";
-
-
     private static final String PASSWORD = "-password";
     private static final String ENCODING = "-encoding";
     private static final String CONSOLE = "-console";
@@ -82,9 +68,11 @@
         boolean toHTML = false;
         boolean sort = false;
         String password = "";
-        String encoding = DEFAULT_ENCODING;
+        String encoding = null;
         String pdfFile = null;
-        String textFile = null;
+        String outputFile = null;
+        // Defaults to text files
+        String ext = ".txt";
         int startPage = 1;
         int endPage = Integer.MAX_VALUE;
         for( int i=0; i<args.length; i++ )
@@ -119,6 +107,7 @@
             else if( args[i].equals( HTML ) )
             {
                 toHTML = true;
+                ext = ".html";
             }
             else if( args[i].equals( SORT ) )
             {
@@ -145,7 +134,7 @@
                 }
                 else
                 {
-                    textFile = args[i];
+                    outputFile = args[i];
                 }
             }
         }
@@ -168,19 +157,17 @@
                     URL url = new URL( pdfFile );
                     document = PDDocument.load( url );
                     String fileName = url.getFile();
-                    if( textFile == null && fileName.length() >4 )
+                    if( outputFile == null && fileName.length() >4 )
                     {
-                        File outputFile =
-                            new File( fileName.substring( 0, fileName.length() -4 ) + ".txt" );
-                        textFile = outputFile.getName();
+                        outputFile = new File( fileName.substring( 0, fileName.length() -4 ) + ext ).getName();
                     }
                 }
                 catch( MalformedURLException e )
                 {
                     document = PDDocument.load( pdfFile );
-                    if( textFile == null && pdfFile.length() >4 )
+                    if( outputFile == null && pdfFile.length() >4 )
                     {
-                        textFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ".txt";
+                        outputFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ext;
                     }
                 }
 
@@ -196,6 +183,10 @@
                         throw new IOException( "You do not have permission to extract text" );
                     }
                 }
+
+                if ((encoding == null) && (toHTML))
+                    encoding = "UTF-8";
+
                 if( toConsole )
                 {
                     output = new OutputStreamWriter( System.out );
@@ -205,24 +196,24 @@
                     if( encoding != null )
                     {
                         output = new OutputStreamWriter(
-                            new FileOutputStream( textFile ), encoding );
+                                new FileOutputStream( outputFile ), encoding );
                     }
                     else
                     {
                         //use default encoding
                         output = new OutputStreamWriter(
-                            new FileOutputStream( textFile ) );
+                                new FileOutputStream( outputFile ) );
                     }
                 }
 
                 PDFTextStripper stripper = null;
                 if(toHTML)
                 {
-                   stripper = new PDFText2HTML();
+                    stripper = new PDFText2HTML(encoding);
                 }
                 else
                 {
-                   stripper = new PDFTextStripper();
+                    stripper = new PDFTextStripper();
                 }
                 stripper.setSortByPosition( sort );
                 stripper.setStartPage( startPage );

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java?rev=747858&r1=747857&r2=747858&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java Wed Feb 25 16:50:41 2009
@@ -24,74 +24,55 @@
 import org.apache.pdfbox.pdmodel.PDDocument;
 
 /**
- * Wrap stripped text in simple HTML, trying to form HTML paragraphs.
- * Paragraphs broken by pages, columns, or figures are not mended.
- *
- *
+ * Wrap stripped text in simple HTML, trying to form HTML paragraphs. Paragraphs
+ * broken by pages, columns, or figures are not mended.
+ * 
+ * 
  * @author jjb - http://www.johnjbarton.com
- * @version  $Revision: 1.3 $
+ * @version $Revision: 1.3 $
  */
-public class PDFText2HTML extends PDFTextStripper
-{
+public class PDFText2HTML extends PDFTextStripper {
     private static final int INITIAL_PDF_TO_HTML_BYTES = 8192;
 
-    private TextPosition beginTitle;
-    private TextPosition afterEndTitle;
-    private String titleGuess;
-    private boolean suppressParagraphs;
     private boolean onFirstPage = true;
+    private String encoding;
 
     /**
      * Constructor.
-     *
-     * @throws IOException If there is an error during initialization.
+     * 
+     * @throws IOException
+     *             If there is an error during initialization.
      */
-    public PDFText2HTML() throws IOException
-    {
-        titleGuess = "";
-        beginTitle = null;
-        afterEndTitle = null;
-        suppressParagraphs = false;
+    public PDFText2HTML(String encoding) throws IOException {
+        this.encoding = encoding;
+        this.lineSeparator = "<br>" + System.getProperty("line.separator");
     }
 
     /**
-     * Write the header to the output document.
-     *
-     * @throws IOException If there is a problem writing out the header to the document.
+     * Write the header to the output document. Now also writes the tag defining
+     * the character encoding.
+     * 
+     * @throws IOException
+     *             If there is a problem writing out the header to the document.
      */
-    protected void writeHeader() throws IOException
-    {
+    protected void writeHeader() throws IOException {
         StringBuffer buf = new StringBuffer(INITIAL_PDF_TO_HTML_BYTES);
+        buf.append("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"" + "\n" + "\"http://www.w3.org/TR/html4/loose.dtd\">\n");
         buf.append("<html><head>");
-        buf.append("<title>");
-        buf.append(getTitleGuess());
-        buf.append("</title>");
-        buf.append("</head>");
+        buf.append("<title>" + getTitle() + "</title>\n");
+        if(encoding != null){
+            buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + this.encoding + "\">\n");
+        }
+        buf.append("</head>\n");
         buf.append("<body>\n");
         super.writeString(buf.toString());
     }
 
     /**
-     * The guess to the document title.
-     *
-     * @return A string that is the title of this document.
-     */
-    protected String getTitleGuess()
-    {
-        return titleGuess;
-    }
-
-
-    /**
      * {@inheritDoc}
      */
-    protected void writePage() throws IOException
-    {
-        Iterator textIter = getCharactersByArticle().iterator();
-
-        if (onFirstPage)
-        {
-            guessTitle(textIter);
+    protected void writePage() throws IOException {
+        if (onFirstPage) {
             writeHeader();
             onFirstPage = false;
         }
@@ -101,109 +82,91 @@
     /**
      * {@inheritDoc}
      */
-    public void endDocument(PDDocument pdf) throws IOException
-    {
+    public void endDocument(PDDocument pdf) throws IOException {
         super.writeString("</body></html>");
     }
 
     /**
-     * This method will attempt to guess the title of the document.
-     *
-     * @param textIter The characters on the first page.
-     * @return The text position that is guessed to be the title.
-     */
-    protected TextPosition guessTitle(Iterator textIter)
-    {
-        float lastFontSize = -1.0f;
-        int stringsInFont = 0;
-        StringBuffer titleText = new StringBuffer();
-        while (textIter.hasNext())
-        {
-            Iterator textByArticle = ((List)textIter.next()).iterator();
-            while( textByArticle.hasNext() )
-            {
-                TextPosition position = (TextPosition) textByArticle.next();
-                float currentFontSize = position.getFontSize();
-                if (currentFontSize != lastFontSize)
-                {
-                    if (beginTitle != null)
-                    { // font change in candidate title.
-                        if (stringsInFont == 0)
-                        {
-                            beginTitle = null; // false alarm
-                            titleText.setLength(0);
-                        }
-                        else
-                        {
-                            // had a significant font with some words: call it a title
-                            titleGuess = titleText.toString();
-                            afterEndTitle = position;
-                            return beginTitle;
+     * This method will attempt to guess the title of the document using
+     * either the document properties or the first lines of text.
+     * 
+     * @return returns the title.
+     */
+    protected String getTitle() {
+        String titleGuess = document.getDocumentInformation().getTitle();
+        if(titleGuess != null && titleGuess.length() > 0){
+            return titleGuess;
+        }
+        else {
+            Iterator textIter = getCharactersByArticle().iterator();
+            float lastFontSize = -1.0f;
+
+            StringBuffer titleText = new StringBuffer();
+            while (textIter.hasNext()) {
+
+                Iterator textByArticle = ((List) textIter.next()).iterator();
+                while (textByArticle.hasNext()) {
+                    TextPosition position = (TextPosition) textByArticle.next();
+
+                    float currentFontSize = position.getFontSize();
+                    //If we're past 64 chars we will assume that we're past the title
+                    //64 is arbitrary 
+                    if (currentFontSize != lastFontSize || titleText.length() > 64) {
+                        if (titleText.length() > 0) {
+                            return titleText.toString();
                         }
+                        lastFontSize = currentFontSize;
                     }
-                    else
-                    { // font change and begin == null
-                        if (currentFontSize > 13.0f)
-                        { // most body text is 12pt max I guess
-                            beginTitle = position;
-                        }
+                    if (currentFontSize > 13.0f) { // most body text is 12pt
+                        titleText.append(position.getCharacter());
                     }
-
-                    lastFontSize = currentFontSize;
-                    stringsInFont = 0;
-                }
-                stringsInFont++;
-                if (beginTitle != null)
-                {
-                    titleText.append(position.getCharacter()+" ");
                 }
             }
         }
-        return beginTitle; // null
+        return "";
     }
 
+
     /**
-     * Write out the paragraph separator.
-     *
-     * @throws IOException If there is an error writing to the stream.
+     * Write out the article separator (div tag) with proper text direction
+     * information.
+     * 
+     * @param isltr true if direction of text is left to right
+     * @throws IOException
+     *             If there is an error writing to the stream.
      */
-    protected void startParagraph() throws IOException
-    {
-        if (! suppressParagraphs)
-        {
-            super.writeString("<p>");
+    protected void startArticle(boolean isltr) throws IOException {
+        if (isltr) {
+            super.writeString("<div>");
+        } 
+        else {
+            super.writeString("<div dir=\"RTL\">");
         }
     }
+
     /**
-     * Write out the paragraph separator.
-     *
-     * @throws IOException If there is an error writing to the stream.
+     * Write out the article separator.
+     * 
+     * @throws IOException
+     *             If there is an error writing to the stream.
      */
-    protected void endParagraph() throws IOException
-    {
-        if (! suppressParagraphs)
-        {
-            super.writeString("</p>");
-        }
+    protected void endArticle() throws IOException {
+        super.writeString("</div>");
     }
 
     /**
      * Write a string to the output stream and escape some HTML characters
      */
-    protected void writeString(String chars) throws IOException
-    {
-        for (int i = 0; i < chars.length(); i++)
-        {
+    protected void writeString(String chars) throws IOException {
+        for (int i = 0; i < chars.length(); i++) {
             char c = chars.charAt(i);
-            if ((c < 32) || (c > 126))
-            {
+            // write control and non-ASCII characters as numeric character references
+            if ((c < 32) || (c > 126)) {
                 int charAsInt = c;
                 super.writeString("&#" + charAsInt + ";");
-            }
-            else
-            {
-                switch (c)
-                {
+            } 
+            else {
+                switch (c) {
                 case 34:
                     super.writeString("&quot;");
                     break;
@@ -222,39 +185,4 @@
             }
         }
     }
-    
-    /**
-     * {@inheritDoc}
-     */
-    protected void writeCharacters(TextPosition position ) throws IOException
-    {
-        if (position == beginTitle)
-        {
-            super.writeString("<H1>");
-            suppressParagraphs = true;
-        }
-        if (position == afterEndTitle)
-        {
-            super.writeString("</H1>");  // end title and start first paragraph
-            suppressParagraphs = false;
-        }
-
-        writeString(position.getCharacter());
-    }
-    
-
-    /**
-     * @return Returns the suppressParagraphs.
-     */
-    public boolean isSuppressParagraphs()
-    {
-        return suppressParagraphs;
-    }
-    /**
-     * @param shouldSuppressParagraphs The suppressParagraphs to set.
-     */
-    public void setSuppressParagraphs(boolean shouldSuppressParagraphs)
-    {
-        this.suppressParagraphs = shouldSuppressParagraphs;
-    }
 }

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=747858&r1=747857&r2=747858&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Wed Feb 25 16:50:41 2009
@@ -63,7 +63,7 @@
     private int startBookmarkPageNumber = -1;
     private PDOutlineItem endBookmark = null;
     private int endBookmarkPageNumber = -1;
-    private PDDocument document;
+    protected PDDocument document;
     private boolean suppressDuplicateOverlappingText = true;
     private boolean shouldSeparateByBeads = true;
     private boolean sortByPosition = false;
@@ -88,7 +88,7 @@
 
     private Map characterListMapping = new HashMap();
 
-    private String lineSeparator = System.getProperty("line.separator");
+    protected String lineSeparator = System.getProperty("line.separator");
     private String pageSeparator = System.getProperty("line.separator");
     private String wordSeparator = " ";
 
@@ -335,23 +335,40 @@
     }
 
     /**
-     * Start a new paragraph.  Default implementation is to do nothing.  Subclasses
+     * Start a new article, which is typically defined as a column
+     * on a single page (also referred to as a bead).  This assumes
+     * that the primary direction of text is left to right.  
+     * Default implementation is to do nothing.  Subclasses
      * may provide additional information.
      *
      * @throws IOException If there is any error writing to the stream.
      */
-    protected void startParagraph() throws IOException
+    protected void startArticle() throws IOException
+    {
+        startArticle(true);
+    }
+    
+    /**
+     * Start a new article, which is typically defined as a column
+     * on a single page (also referred to as a bead).  
+     * Default implementation is to do nothing.  Subclasses
+     * may provide additional information.
+     *
+     * @param isltr true if primary direction of text is left to right
+     * @throws IOException If there is any error writing to the stream.
+     */
+    protected void startArticle(boolean isltr) throws IOException
     {
         //default is to do nothing.
     }
 
     /**
-     * End a paragraph.  Default implementation is to do nothing.  Subclasses
+     * End an article.  Default implementation is to do nothing.  Subclasses
      * may provide additional information.
      *
      * @throws IOException If there is any error writing to the stream.
      */
-    protected void endParagraph() throws IOException
+    protected void endArticle() throws IOException
     {
         //default is to do nothing
     }
@@ -395,13 +412,10 @@
     {
         float maxYForLine = -1;
         float minYTopForLine = Float.MAX_VALUE;
-        //float lastBaselineFontSize = -1;
         float endOfLastTextX = -1;
-        //float endOfLastTextY = -1;
         float expectedStartOfNextWordX = -1;
         float lastWordSpacing = -1;
         float maxHeightForLine = -1;
-        //float lastHeightForLine = -1;
         TextPosition lastPosition = null;
         
         if (normalize == null) {
@@ -410,7 +424,6 @@
         
         for( int i = 0; i < charactersByArticle.size(); i++)
         {
-            startParagraph();
             List textList = (List)charactersByArticle.get( i );
             if( sortByPosition )
             {
@@ -464,7 +477,9 @@
             if (rtlCnt > ltrCnt) {
                 isRtlDominant = true;
             }
-
+            
+            startArticle(!isRtlDominant);
+            
             // we will later use this to skip reordering
             boolean hasRtl = false;
             if (rtlCnt > 0)
@@ -501,22 +516,14 @@
                 	positionHeight = position.getHeight();
                 }
 
-
-                float wordSpacing = 0;
-                /* float wordSpacing = position.getWordSpacing();	BC: When I re-enabled this for a a test, lots of extra spaces were added
+                //try to get width of a space character
+                float wordSpacing = position.getWidthOfSpace();
+                //if still zero fall back to getting the width of the current
+                //character
                 if( wordSpacing == 0 )
                 {
-                */
-                    //try to get width of a space character
-                    wordSpacing = position.getWidthOfSpace();
-                    //if still zero fall back to getting the width of the current
-                    //character
-                    if( wordSpacing == 0 )
-                    {
-                      wordSpacing = positionWidth;
-                    }
-                //}
-
+                    wordSpacing = positionWidth;
+                }
 
                 // RDD - We add a conservative approximation for space determination.
                 // basically if there is a blank area between two characters that is
@@ -531,27 +538,6 @@
                     expectedStartOfNextWordX = endOfLastTextX + (((wordSpacing+lastWordSpacing)/2f)* 0.50f);
                 }
 
-                // RDD - We will suppress text that is very close to the current line
-                // and which overwrites previously rendered text on this line.
-                // This is done specifically to handle a reasonably common situation
-                // where an application (MS Word, in the case of my examples) renders
-                // text four times at small (1 point) offsets in order to accomplish
-                // bold printing.  You would not want to do this step if you were
-                // going to render the TextPosition objects graphically.
-                //
-                /*if ((endOfLastTextX != -1 && position.getX() < endOfLastTextX) &&
-                    (currentY != -1 && Math.abs(position.getY() - currentY) < 1))
-                {
-                    if (log.isDebugEnabled())
-                    {
-                        log.debug("Suppressing text overwrite" +
-                                  " x: " + position.getX() +
-                                  " endOfLastTextX: " + endOfLastTextX +
-                                  " string: " + position.getCharacter());
-                    }
-                    continue;
-                }*/
-
                 // RDD - Here we determine whether this text object is on the current
                 // line.  We use the lastBaselineFontSize to handle the superscript
                 // case, and the size of the current font to handle the subscript case.
@@ -559,24 +545,11 @@
                 // a small amount in order to be considered as being on the same line.
                 //
 
-                //int verticalScaling = 1;
-                //if( lastBaselineFontSize < 0 || position.getFontSize() < 0 )
-                //{
-                //    verticalScaling = -1;
-                //}
-
-                if( lastPosition != null )
-                {
-                    //if (currentY != -1 &&
-                    //    ((position.getY() < (currentY - (lastBaselineFontSize * 0.9f * verticalScaling))) ||
-                    //     (position.getY() > (currentY + (position.getFontSize() * 0.9f * verticalScaling)))))
-                    //{
+                if( lastPosition != null ){  
                     /* XXX BC: In theory, this check should really check if the next char is in full range
                      * seen in this line. This is what I tried to do with minYTopForLine, but this caused a lot
                      * of regression test failures.  So, I'm leaving it be for now. */
-                    if( ( !overlap( positionY, positionHeight, maxYForLine, maxHeightForLine ) ))
-                    		//maxYForLine - minYTopForLine)))
-                    {
+                    if(!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)){
                         // If we have RTL text on the page, change the direction
                         if (hasRtl)
                             lineStr = normalize.makeLineLogicalOrder(lineStr, isRtlDominant);
@@ -597,9 +570,7 @@
                         expectedStartOfNextWordX = -1;
                         maxYForLine = -1;
                         maxHeightForLine = -1;
-                        //lastBaselineFontSize = -1;
                         minYTopForLine = Float.MAX_VALUE;
-                        //lastHeightForLine = -1;
                     }
 
 
@@ -610,35 +581,25 @@
                     {
 	                    lineStr += getWordSeparator();
 	                }
-	                else
-	                {
-	                    //System.out.println( "Not a word separator " + position.getCharacter() +  " start=" + startOfNextWordX + " x=" + position.getX() );
-	                }
+
                 }
 
                 if (positionY >= maxYForLine) {
                 	maxYForLine = positionY;
-                    //lastBaselineFontSize = position.getFontSize();
                 }
 
                 // RDD - endX is what PDF considers to be the x coordinate of the
                 // end position of the text.  We use it in computing our metrics below.
                 endOfLastTextX = positionX + positionWidth;
-                //endOfLastTextY = positionY;
 
                 // add it to the list
                 if (characterValue != null)
                 {
                     lineStr += characterValue;
                 }
-                else
-                {
-                    //Position.getString() is null so not writing anything
-                }
                 maxHeightForLine = Math.max( maxHeightForLine, positionHeight );
                 minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
                 lastPosition = position;
-                //lastHeightForLine = position.getHeight();
                 lastWordSpacing = wordSpacing;
             }
             
@@ -653,7 +614,7 @@
                 writeString(lineStr);
             }
             
-            endParagraph();
+            endArticle();
         }
 
         writePageSeperator();;