You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2013/11/24 14:55:24 UTC

svn commit: r1544975 - in /pdfbox/branches/1.8: ./ pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Author: lehmi
Date: Sun Nov 24 13:55:24 2013
New Revision: 1544975

URL: http://svn.apache.org/r1544975
Log:
PDFBOX-1213: added some style information to the PDF2HTML converter as proposed by Axel Dörfler

Modified:
    pdfbox/branches/1.8/   (props changed)
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Propchange: pdfbox/branches/1.8/
------------------------------------------------------------------------------
  Merged /pdfbox/trunk:r1544818

Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java?rev=1544975&r1=1544974&r2=1544975&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java Sun Nov 24 13:55:24 2013
@@ -17,25 +17,28 @@
 package org.apache.pdfbox.util;
 
 import java.io.IOException;
-
+import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Set;
 
 import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
 
 /**
  * Wrap stripped text in simple HTML, trying to form HTML paragraphs. Paragraphs
  * broken by pages, columns, or figures are not mended.
  *
- *
  * @author jjb - http://www.johnjbarton.com
- * @version $Revision: 1.3 $
+ * 
  */
 public class PDFText2HTML extends PDFTextStripper
 {
     private static final int INITIAL_PDF_TO_HTML_BYTES = 8192;
 
     private boolean onFirstPage = true;
+    private FontState fontState = new FontState();
 
     /**
      * Constructor.
@@ -180,6 +183,19 @@ public class PDFText2HTML extends PDFTex
     }
 
     /**
+     * Write a string to the output stream, maintain font state, and escape some HTML characters.
+     * The font state is only preserved per word.
+     *
+     * @param text The text to write to the stream.
+     * @param textPositions the corresponding text positions
+     * @throws IOException If there is an error writing to the stream.
+     */
+    protected void writeString(String text, List<TextPosition> textPositions) throws IOException
+    {
+        super.writeString(fontState.push(text, textPositions));
+    }
+
+    /**
      * Write a string to the output stream and escape some HTML characters.
      *
      * @param chars String to be written to the stream
@@ -192,44 +208,215 @@ public class PDFText2HTML extends PDFTex
     }
 
     /**
+     * Writes the paragraph end "</p>" to the output. Furthermore, it will also clear the font state.
+     * 
+     * {@inheritDoc}
+     */
+    @Override
+    protected void writeParagraphEnd() throws IOException
+    {
+        writeString(fontState.clear());
+        super.writeParagraphEnd();
+    }
+
+    /**
      * Escape some HTML characters.
      *
      * @param chars String to be escaped
      * @return returns escaped String.
      */
-    private String escape(String chars)
+    private static String escape(String chars)
     {
         StringBuilder builder = new StringBuilder(chars.length());
         for (int i = 0; i < chars.length(); i++)
         {
-            char c = chars.charAt(i);
-            // write non-ASCII as named entities
-            if ((c < 32) || (c > 126))
+            appendEscaped(builder, chars.charAt(i));
+        }
+        return builder.toString();
+    }
+
+    private static void appendEscaped(StringBuilder builder, char character)
+    {
+        // write non-ASCII as named entities
+        if ((character < 32) || (character > 126))
+        {
+            int charAsInt = character;
+            builder.append("&#").append(charAsInt).append(";");
+        }
+        else
+        {
+            switch (character)
             {
-                int charAsInt = c;
-                builder.append("&#").append(charAsInt).append(";");
+            case 34:
+                builder.append("&quot;");
+                break;
+            case 38:
+                builder.append("&amp;");
+                break;
+            case 60:
+                builder.append("&lt;");
+                break;
+            case 62:
+                builder.append("&gt;");
+                break;
+            default:
+                builder.append(String.valueOf(character));
             }
-            else
+        }
+    }
+
+    /**
+     * A helper class to maintain the current font state. Its public methods will emit opening and
+     * closing tags as needed, and in the correct order.
+     *
+     * @author Axel Dörfler
+     */
+    private static class FontState
+    {
+        protected List<String> stateList = new ArrayList<String>();
+        protected Set<String> stateSet = new HashSet<String>();
+
+        /**
+         * Pushes new {@link TextPosition TextPositions} into the font state. The state is only
+         * preserved correctly for each letter if the number of letters in <code>text</code> matches
+         * the number of {@link TextPosition} objects. Otherwise, it's done once for the complete
+         * array (just by looking at its first entry).
+         *
+         * @return A string that contains the text including tag changes caused by its font state.
+         */
+        public String push(String text, List<TextPosition> textPositions)
+        {
+            StringBuilder buffer = new StringBuilder();
+
+            if (text.length() == textPositions.size())
             {
-                switch (c)
+                // There is a 1:1 mapping, and we can use the TextPositions directly
+                for (int i = 0; i < text.length(); i++)
                 {
-                case 34:
-                    builder.append("&quot;");
-                    break;
-                case 38:
-                    builder.append("&amp;");
-                    break;
-                case 60:
-                    builder.append("&lt;");
-                    break;
-                case 62:
-                    builder.append("&gt;");
-                    break;
-                default:
-                    builder.append(String.valueOf(c));
+                    push(buffer, text.charAt(i), textPositions.get(i));
                 }
             }
+            else if (!text.isEmpty())
+            {
+                // The normalized text does not match the number of TextPositions, so we'll just
+                // have a look at its first entry.
+                // TODO change PDFTextStripper.normalize() such that it maintains the 1:1 relation
+                if (textPositions.isEmpty())
+                {
+                    return text;
+                }
+                push(buffer, text.charAt(0), textPositions.get(0));
+                buffer.append(escape(text.substring(1)));
+            }
+            return buffer.toString();
+        }
+
+        /**
+         * Closes all open states.
+         * @return A string that contains the closing tags of all currently open states.
+         */
+        public String clear()
+        {
+            StringBuilder buffer = new StringBuilder();
+            closeUntil(buffer, null);
+            stateList.clear();
+            stateSet.clear();
+            return buffer.toString();
+        }
+
+        protected String push(StringBuilder buffer, char character, TextPosition textPosition)
+        {
+            boolean bold = false;
+            boolean italics = false;
+
+            PDFontDescriptor descriptor = textPosition.getFont().getFontDescriptor();
+            if (descriptor != null)
+            {
+                bold = isBold(descriptor);
+                italics = isItalic(descriptor);
+            }
+            
+            buffer.append(bold ? open("b") : close("b"));
+            buffer.append(italics ? open("i") : close("i"));
+            appendEscaped(buffer, character);
+
+            return buffer.toString();
+        }
+
+        private String open(String tag)
+        {
+            if (stateSet.contains(tag))
+            {
+                return "";
+            }
+            stateList.add(tag);
+            stateSet.add(tag);
+
+            return openTag(tag);
+        }
+
+        private String close(String tag)
+        {
+            if (!stateSet.contains(tag))
+            {
+                return "";
+            }
+            // Close all tags up to and including the one we should close
+            StringBuilder tagsBuilder = new StringBuilder();
+            int index = closeUntil(tagsBuilder, tag);
+
+            // Remove from state
+            stateList.remove(index);
+            stateSet.remove(tag);
+
+            // Now open the states that were closed but should remain open again
+            for (; index < stateList.size(); index++)
+            {
+                tagsBuilder.append(openTag(stateList.get(index)));
+            }
+            return tagsBuilder.toString();
+        }
+
+        private int closeUntil(StringBuilder tagsBuilder, String endTag)
+        {
+            for (int i = stateList.size(); i-- > 0;)
+            {
+                String tag = stateList.get(i);
+                tagsBuilder.append(closeTag(tag));
+                if (endTag != null && tag.equals(endTag))
+                {
+                    return i;
+                }
+            }
+            return -1;
+        }
+
+        private String openTag(String tag)
+        {
+            return "<" + tag + ">";
+        }
+
+        private String closeTag(String tag)
+        {
+            return "</" + tag + ">";
+        }
+
+        private boolean isBold(PDFontDescriptor descriptor)
+        {
+            if (descriptor.isForceBold())
+            {
+                return true;
+            }
+            return descriptor.getFontName().contains("Bold");
+        }
+
+        private boolean isItalic(PDFontDescriptor descriptor)
+        {
+            if (descriptor.isItalic())
+            {
+                return true;
+            }
+            return descriptor.getFontName().contains("Italic");
         }
-        return builder.toString();
     }
 }

Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1544975&r1=1544974&r2=1544975&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Sun Nov 24 13:55:24 2013
@@ -59,7 +59,7 @@ import org.apache.pdfbox.pdmodel.interac
  * Eventually, we fully process each page and then print it. 
  *
  * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
- * @version $Revision: 1.70 $
+ * 
  */
 public class PDFTextStripper extends PDFStreamEngine
 {
@@ -120,7 +120,6 @@ public class PDFTextStripper extends PDF
     private String articleStart = "";
     private String articleEnd = "";
 
-
     private int currentPageNo = 0;
     private int startPage = 1;
     private int endPage = Integer.MAX_VALUE;
@@ -188,7 +187,6 @@ public class PDFTextStripper extends PDF
      */
     private boolean inParagraph;
 
-
     /**
      * Instantiate a new PDFTextStripper object. This object will load
      * properties from PDFTextStripper.properties and will not do
@@ -205,7 +203,6 @@ public class PDFTextStripper extends PDF
         normalize = new TextNormalize(this.outputEncoding);
     }
 
-
     /**
      * Instantiate a new PDFTextStripper object.  Loading all of the operator mappings
      * from the properties object that is passed in.  Does not convert the text
@@ -243,9 +240,7 @@ public class PDFTextStripper extends PDF
      * NOTE: The document must not be encrypted when coming into this method.
      *
      * @param doc The document to get the text from.
-     *
      * @return The text of the PDF document.
-     *
      * @throws IOException if the doc state is invalid or it is encrypted.
      */
     public String getText( PDDocument doc ) throws IOException
@@ -331,7 +326,6 @@ public class PDFTextStripper extends PDF
                 throw new WrappedIOException("Error: document is encrypted", e);
             }
         }
-
         processPages( document.getDocumentCatalog().getAllPages() );
         endDocument(document);
     }
@@ -349,7 +343,6 @@ public class PDFTextStripper extends PDF
         {
             startBookmarkPageNumber = getPageNumber( startBookmark, pages );
         }
-
         if( endBookmark != null )
         {
             endBookmarkPageNumber = getPageNumber( endBookmark, pages );
@@ -365,8 +358,6 @@ public class PDFTextStripper extends PDF
             startBookmarkPageNumber = 0;
             endBookmarkPageNumber = 0;
         }
-
-
         Iterator<COSObjectable> pageIter = pages.iterator();
         while( pageIter.hasNext() )
         {
@@ -450,13 +441,11 @@ public class PDFTextStripper extends PDF
                     charactersByArticle.set( i, new ArrayList<TextPosition>() );
                 }
             }
-
             characterListMapping.clear();
             processStream( page, page.findResources(), content );
             writePage();
             endPage( page );
         }
-
     }
 
     /**
@@ -564,9 +553,7 @@ public class PDFTextStripper extends PDF
                 TextPositionComparator comparator = new TextPositionComparator();
                 Collections.sort( textList, comparator );
             }
-
             Iterator<TextPosition> textIter = textList.iterator();
-
             /* Before we can display the text, we need to do some normalizing.
              * Arabic and Hebrew text is right to left and is typically stored
              * in its logical format, which means that the rightmost character is
@@ -607,7 +594,6 @@ public class PDFTextStripper extends PDF
                     }
                 }
             }
-
             // choose the dominant direction
             boolean isRtlDominant = rtlCnt > ltrCnt;
 
@@ -742,17 +728,14 @@ public class PDFTextStripper extends PDF
                     {
                         writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
                         line.clear();
-
                         lastLineStartPosition = 
                             handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
-
                         endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
                         expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
                         maxYForLine = MAXYFORLINE_RESET_VALUE;
                         maxHeightForLine = MAXHEIGHTFORLINE_RESET_VALUE;
                         minYTopForLine = MINYTOPFORLINE_RESET_VALUE;
                     }
-
                     //Test if our TextPosition starts after a new word would be expected to start.
                     if (expectedStartOfNextWordX != EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE 
                             && expectedStartOfNextWordX < positionX &&
@@ -763,12 +746,10 @@ public class PDFTextStripper extends PDF
                         line.add(WordSeparator.getSeparator());
                     }
                 }
-
                 if (positionY >= maxYForLine)
                 {
                     maxYForLine = positionY;
                 }
-
                 // RDD - endX is what PDF considers to be the x coordinate of the
                 // end position of the text.  We use it in computing our metrics below.
                 endOfLastTextX = positionX + positionWidth;
@@ -795,14 +776,12 @@ public class PDFTextStripper extends PDF
                 lastWordSpacing = wordSpacing;
                 previousAveCharWidth = averageCharWidth;
             }
-
             // print the final line
             if (line.size() > 0)
             {
                 writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
                 writeParagraphEnd();
             }
-
             endArticle();
         }
         writePageEnd();
@@ -823,7 +802,6 @@ public class PDFTextStripper extends PDF
     {
         // RDD - newline at end of flush - required for end of page (so that the top
         // of the next page starts on its own line.
-        //
         output.write(getPageSeparator());
         output.flush();
     }
@@ -861,6 +839,19 @@ public class PDFTextStripper extends PDF
     }
 
     /**
+     * Write a Java string to the output stream. The default implementation ignores the <code>textPositions</code>
+     * and just calls {@link #writeString(String)}.
+     *
+     * @param text The text to write to the stream.
+     * @param textPositions The TextPositions belonging to the text.
+     * @throws IOException If there is an error when writing the text.
+     */
+    protected void writeString(String text, List<TextPosition> textPositions) throws IOException
+    {
+        writeString(text);
+    }
+
+    /**
      * Write a Java string to the output stream.
      *
      * @param text The text to write to the stream.
@@ -905,7 +896,6 @@ public class PDFTextStripper extends PDF
                 sameTextCharacters = new TreeMap<Float, TreeSet<Float>>();
                 characterListMapping.put( textCharacter, sameTextCharacters );
             }
-
             // RDD - Here we compute the value that represents the end of the rendered
             // text.  This value is used to determine whether subsequent text rendered
             // on the same line overwrites the current text.
@@ -932,7 +922,6 @@ public class PDFTextStripper extends PDF
                     break;
                 }
             }
-
             if( !suppressCharacter )
             {
                 TreeSet<Float> ySet = sameTextCharacters.get(textX);
@@ -945,7 +934,6 @@ public class PDFTextStripper extends PDF
                 showCharacter = true;
             }
         }
-
         if( showCharacter )
         {
             //if we are showing the character then we need to determine which
@@ -1220,7 +1208,7 @@ public class PDFTextStripper extends PDF
     public void setSuppressDuplicateOverlappingText(
             boolean suppressDuplicateOverlappingTextValue)
     {
-        this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
+        suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
     }
 
     /**
@@ -1240,7 +1228,7 @@ public class PDFTextStripper extends PDF
      */
     public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
     {
-        this.shouldSeparateByBeads = aShouldSeparateByBeads;
+        shouldSeparateByBeads = aShouldSeparateByBeads;
     }
 
     /**
@@ -1353,7 +1341,7 @@ public class PDFTextStripper extends PDF
      */
     public void setSpacingTolerance(float spacingToleranceValue)
     {
-        this.spacingTolerance = spacingToleranceValue;
+        spacingTolerance = spacingToleranceValue;
     }
 
     /**
@@ -1378,7 +1366,7 @@ public class PDFTextStripper extends PDF
      */
     public void setAverageCharTolerance(float averageCharToleranceValue) 
     {
-        this.averageCharTolerance = averageCharToleranceValue;
+        averageCharTolerance = averageCharToleranceValue;
     }
 
 
@@ -1455,7 +1443,7 @@ public class PDFTextStripper extends PDF
      */
     public void setParagraphStart(String s)
     {
-        this.paragraphStart = s;
+        paragraphStart = s;
     }
 
     /**
@@ -1473,7 +1461,7 @@ public class PDFTextStripper extends PDF
      */
     public void setParagraphEnd(String s)
     {
-        this.paragraphEnd = s;
+        paragraphEnd = s;
     }
 
 
@@ -1576,7 +1564,6 @@ public class PDFTextStripper extends PDF
                 return str;
             }
         }
-
         StringBuilder reversed = new StringBuilder(str.length());
         for (int i = str.length() - 1; i >= 0; --i)
         {
@@ -1784,8 +1771,7 @@ public class PDFTextStripper extends PDF
     {
         TextPosition tp = pw.getTextPosition();
         String txt = tp.getCharacter();
-        Pattern p = matchPattern(txt,getListItemPatterns());
-        return p;
+        return matchPattern(txt,getListItemPatterns());
     }
 
     /**
@@ -1820,7 +1806,6 @@ public class PDFTextStripper extends PDF
             listOfPatterns = patterns;
     }
 
-
     /**
      * returns a list of regular expression Patterns representing
      * different common list item formats.  For example
@@ -1887,12 +1872,13 @@ public class PDFTextStripper extends PDF
      * @param isRtlDominant determines if rtl or ltl is dominant
      * @throws IOException if something went wrong
      */
-    private void writeLine(List<String> line, boolean isRtlDominant)throws IOException
+    private void writeLine(List<WordWithTextPositions> line, boolean isRtlDominant) throws IOException
     {
         int numberOfStrings = line.size();
         for(int i=0; i<numberOfStrings; i++)
         {
-            writeString(line.get(i));
+            WordWithTextPositions word = line.get(i);
+            writeString(word.getText(), word.getTextPositions());
             if (i < numberOfStrings-1)
             {
                 writeWordSeparator();
@@ -1907,55 +1893,65 @@ public class PDFTextStripper extends PDF
      * @param hasRtl determines if lines contains rtl formatted text(parts)
      * @return a list of strings, one string for every word
      */
-    private List<String> normalize(List<TextPosition> line, boolean isRtlDominant, boolean hasRtl)
+    private List<WordWithTextPositions> normalize(List<TextPosition> line, boolean isRtlDominant, boolean hasRtl)
     {
-        LinkedList<String> normalized = new LinkedList<String>();
+        LinkedList<WordWithTextPositions> normalized = new LinkedList<WordWithTextPositions>();
         StringBuilder lineBuilder = new StringBuilder();
+        List<TextPosition> wordPositions = new ArrayList<TextPosition>();
         // concatenate the pieces of text in opposite order if RTL is dominant
         if (isRtlDominant)
         {
             int numberOfPositions = line.size();
             for(int i = numberOfPositions-1;i>=0;i--)
             {
-                TextPosition text = line.get(i);
-                if (text instanceof WordSeparator) 
-                {
-                    normalized.add(normalize.normalizePres(lineBuilder.toString()));
-                    lineBuilder = new StringBuilder();
-                }
-                else 
-                {
-                    lineBuilder.append(text.getCharacter());
-                }
-            }
-            if (lineBuilder.length() > 0) 
-            {
-                normalized.add(normalize.normalizePres(lineBuilder.toString()));
+                lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, line.get(i));
             }
         }
         else
         {
             for(TextPosition text : line)
             {
-                if (text instanceof WordSeparator) 
-                {
-                    normalized.add(normalize.normalizePres(lineBuilder.toString()));
-                    lineBuilder = new StringBuilder();
-                }
-                else 
-                {
-                    lineBuilder.append(text.getCharacter());
-                }
-            }
-            if (lineBuilder.length() > 0) 
-            {
-                normalized.add(normalize.normalizePres(lineBuilder.toString()));
+                lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, text);
             }
         }
+        if (lineBuilder.length() > 0) 
+        {
+            normalized.add(createWord(lineBuilder.toString(), wordPositions));
+        }
         return normalized;
     }
 
     /**
+     * Used within {@link #normalize(List, boolean, boolean)} to create a single {@link WordWithTextPositions}
+     * entry.
+     */
+    private WordWithTextPositions createWord(String word, List<TextPosition> wordPositions)
+    {
+        return new WordWithTextPositions(normalize.normalizePres(word), wordPositions);
+    }
+
+    /**
+     * Used within {@link #normalize(List, boolean, boolean)} to handle a {@link TextPosition}.
+     * @return The StringBuilder that must be used when calling this method.
+     */
+    private StringBuilder normalizeAdd(LinkedList<WordWithTextPositions> normalized,
+            StringBuilder lineBuilder, List<TextPosition> wordPositions, TextPosition text)
+    {
+        if (text instanceof WordSeparator) 
+        {
+            normalized.add(createWord(lineBuilder.toString(), wordPositions));
+            lineBuilder = new StringBuilder();
+            wordPositions.clear();
+        }
+        else 
+        {
+            lineBuilder.append(text.getCharacter());
+            wordPositions.add(text);
+        }
+        return lineBuilder;
+    }
+
+    /**
      * internal marker class.  Used as a place holder in
      * a line of TextPositions.
      * @author ME21969
@@ -1973,7 +1969,34 @@ public class PDFTextStripper extends PDF
         {
             return separator;
         }
-
     }
 
+    /**
+     * Internal class that pairs a string with a list of {@link TextPosition} objects.
+     * Note that the number of entries in that list may differ from the number of characters in the
+     * string due to normalization.
+     *
+     * @author Axel Dörfler
+     */
+    private static final class WordWithTextPositions
+    {
+        protected String text;
+        protected List<TextPosition> textPositions;
+        
+        public WordWithTextPositions(String word, List<TextPosition> positions)
+        {
+            text = word;
+            textPositions = positions;
+        }
+        
+        public String getText()
+        {
+            return text;
+        }
+
+        public List<TextPosition> getTextPositions()
+        {
+            return textPositions;
+        }
+    }
 }