You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2010/07/26 19:35:28 UTC
svn commit: r979379 - in /pdfbox/trunk/pdfbox/src: main/java/org/apache/pdfbox/util/ test/resources/input/

Author: lehmi
Date: Mon Jul 26 17:35:27 2010
New Revision: 979379

URL: http://svn.apache.org/viewvc?rev=979379&view=rev
Log:
PDFBOX-521: added improved text extraction algos. Based on a patch by Mel Martinez (m dot martinez at ll dot mit dot edu)

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
    pdfbox/trunk/pdfbox/src/test/resources/input/FC60_Times.pdf-sorted.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/FC60_Times.pdf.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/Liste732004001452_001_0.pdf_0_.pdf-sorted.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/Liste732004001452_001_0.pdf_0_.pdf.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/allah2.pdf-sorted.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/allah2.pdf.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/data-000001.pdf-sorted.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/data-000001.pdf.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/hello3.pdf-sorted.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/hello3.pdf.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/openoffice-test-document.pdf-sorted.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/openoffice-test-document.pdf.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/rotation.pdf-sorted.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/rotation.pdf.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/sampleForSpec.pdf-sorted.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/sampleForSpec.pdf.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf-sorted.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/simple-openoffice.pdf-sorted.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/simple-openoffice.pdf.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/yaddatest.pdf-sorted.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/yaddatest.pdf.txt

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java Mon Jul 26 17:35:27 2010
@@ -44,8 +44,14 @@ public class PDFText2HTML extends PDFTex
      */
     public PDFText2HTML(String encoding) throws IOException 
     {
-        this.outputEncoding = encoding;
-        this.lineSeparator = "<br>" + System.getProperty("line.separator");
+        super(encoding);
+        setLineSeparator(systemLineSeparator);
+        setParagraphStart("<p>");
+        setParagraphEnd("</p>"+systemLineSeparator);
+        setPageStart("<div style=\"page-break-before:always; page-break-after:always\">");
+        setPageEnd("</div>"+systemLineSeparator);
+        setArticleStart(systemLineSeparator);
+        setArticleEnd(systemLineSeparator);
     }
 
     /**
@@ -108,16 +114,16 @@ public class PDFText2HTML extends PDFTex
         }
         else 
         {
-            Iterator textIter = getCharactersByArticle().iterator();
+            Iterator<List<TextPosition>> textIter = getCharactersByArticle().iterator();
             float lastFontSize = -1.0f;
 
             StringBuffer titleText = new StringBuffer();
             while (textIter.hasNext()) 
             {
-                Iterator textByArticle = ((List) textIter.next()).iterator();
+                Iterator<TextPosition> textByArticle = textIter.next().iterator();
                 while (textByArticle.hasNext()) 
                 {
-                    TextPosition position = (TextPosition) textByArticle.next();
+                    TextPosition position = textByArticle.next();
 
                     float currentFontSize = position.getFontSize();
                     //If we're past 64 chars we will assume that we're past the title
@@ -169,6 +175,7 @@ public class PDFText2HTML extends PDFTex
      */
     protected void endArticle() throws IOException 
     {
+        super.endArticle();
         super.writeString("</div>");
     }
 

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Mon Jul 26 17:35:27 2010
@@ -23,10 +23,12 @@ import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Properties;
 import java.util.Vector;
+import java.util.regex.Pattern;
 
 import org.apache.pdfbox.cos.COSDocument;
 import org.apache.pdfbox.cos.COSStream;
@@ -57,6 +59,60 @@ import org.apache.pdfbox.pdmodel.interac
  */
 public class PDFTextStripper extends PDFStreamEngine
 {
+
+    private static final String thisClassName = PDFTextStripper.class.getSimpleName().toLowerCase(java.util.Locale.ENGLISH); // fixed locale keeps the property prefix stable
+
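+    //enable the ability to set the default indent/drop thresholds
+    //with -D system properties pdftextstripper.indent and pdftextstripper.drop.
+    //NOTE(review): the DEFAULT_*_THRESHOLD initializers are declared after this block and
+    //run after it in textual order, so they appear to overwrite the values set here - confirm.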
+    static
+    {
+        String prop = thisClassName+".indent";
+        String s = System.getProperty(prop);
+        if(s!=null && s.length()>0)
+        {
+            try
+            {
+                float f = Float.parseFloat(s);
+                DEFAULT_INDENT_THRESHOLD = f;
+            }
+            catch(NumberFormatException nfe)
+            {
+                        //ignore and use default
+            }
+        }
+        prop = thisClassName+".drop";
+        s = System.getProperty(prop);
+        if(s!=null && s.length()>0)
+        {
+            try
+            {
+                float f = Float.parseFloat(s);
+                DEFAULT_DROP_THRESHOLD = f;
+            }
+            catch(NumberFormatException nfe){
+                        //ignore and use default
+            }
+        }
+    }
+
+    /**
+     * The platform's line separator.
+     */
+    protected final String systemLineSeparator = System.getProperty("line.separator"); 
+
+    private String lineSeparator = systemLineSeparator;
+    private String pageSeparator = systemLineSeparator;
+    private String wordSeparator = " ";
+    private String paragraphStart = "";
+    private String paragraphEnd = "";
+    private String pageStart = "";
+    private String pageEnd = pageSeparator;
+    private String articleStart = "";
+    private String articleEnd = "";
+
+
     private int currentPageNo = 0;
     private int startPage = 1;
     private int endPage = Integer.MAX_VALUE;
@@ -67,7 +123,14 @@ public class PDFTextStripper extends PDF
     private boolean suppressDuplicateOverlappingText = true;
     private boolean shouldSeparateByBeads = true;
     private boolean sortByPosition = false;
+    private boolean addMoreFormatting = false;
     
+    private static float DEFAULT_INDENT_THRESHOLD = 2.0f;
+    private static float DEFAULT_DROP_THRESHOLD = 2.5f;
+
+    private float indentThreshold = DEFAULT_INDENT_THRESHOLD;
+    private float dropThreshold = DEFAULT_DROP_THRESHOLD;
+
     // We will need to estimate where to add spaces.  
     // These are used to help guess. 
     private float spacingTolerance = .5f;
@@ -94,12 +157,6 @@ public class PDFTextStripper extends PDF
     private Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
 
     /**
-     * The platforms lineseparator.
-     */
-    protected String lineSeparator = System.getProperty("line.separator");
-    private String pageSeparator = System.getProperty("line.separator");
-    private String wordSeparator = " ";
-    /**
      * encoding that text will be written in (or null).
      */
     protected String outputEncoding; 
@@ -231,6 +288,12 @@ public class PDFTextStripper extends PDF
         resetEngine();
         document = doc;
         output = outputStream;
+        if (getAddMoreFormatting()) {
+            paragraphEnd = lineSeparator;
+            pageStart = lineSeparator;
+            articleStart = lineSeparator;
+            articleEnd = lineSeparator;
+        }
         startDocument(document);
 
         if( document.isEncrypted() )
@@ -407,7 +470,7 @@ public class PDFTextStripper extends PDF
      */
     protected void startArticle(boolean isltr) throws IOException
     {
-        //default is to do nothing.
+        output.write(getArticleStart());
     }
 
     /**
@@ -418,7 +481,7 @@ public class PDFTextStripper extends PDF
      */
     protected void endArticle() throws IOException
     {
-        //default is to do nothing
+        output.write(getArticleEnd());
     }
 
     /**
@@ -462,20 +525,26 @@ public class PDFTextStripper extends PDF
      *
      * @throws IOException If there is an error writing the text.
      */
-    protected void writePage() throws IOException    
+    protected void writePage() throws IOException
     {
         float maxYForLine = MAXYFORLINE_RESET_VALUE;
         float minYTopForLine = MINYTOPFORLINE_RESET_VALUE;
         float endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
         float lastWordSpacing = LASTWORDSPACING_RESET_VALUE;
         float maxHeightForLine = MAXHEIGHTFORLINE_RESET_VALUE;
+        PositionWrapper lastPosition = null;
+        PositionWrapper lastLineStartPosition = null;
 
-        TextPosition lastPosition = null;
+        boolean startOfPage = true;//flag to indicate start of page
+        boolean startOfArticle = true;
+        if(charactersByArticle.size() > 0) { 
+            writePageStart();
+        }
 
         for( int i = 0; i < charactersByArticle.size(); i++)
         {
-            List<TextPosition> textList = (List<TextPosition>)charactersByArticle.get( i );
-            if( sortByPosition )
+            List<TextPosition> textList = charactersByArticle.get( i );
+            if( getSortByPosition() )
             {
                 TextPositionComparator comparator = new TextPositionComparator();
                 Collections.sort( textList, comparator );
@@ -485,40 +554,39 @@ public class PDFTextStripper extends PDF
 
             /* Before we can display the text, we need to do some normalizing.
              * Arabic and Hebrew text is right to left and is typically stored
-             * in its logical format, which means that the rightmost character is 
+             * in its logical format, which means that the rightmost character is
              * stored first, followed by the second character from the right etc.
-             * However, PDF stores the text in presentation form, which is left to 
+             * However, PDF stores the text in presentation form, which is left to
              * right.  We need to do some normalization to convert the PDF data to
-             * the proper logical output format. 
-             * 
+             * the proper logical output format.
+             *
              * Note that if we did not sort the text, then the output of reversing the
              * text is undefined and can sometimes produce worse output then not trying
              * to reverse the order.  Sorting should be done for these languages.
              * */
 
-            /* First step is to determine if we have any right to left text, and 
-             * if so, is it dominant. */ 
+            /* First step is to determine if we have any right to left text, and
+             * if so, is it dominant. */
             int ltrCnt = 0;
             int rtlCnt = 0;
 
             while( textIter.hasNext() )
             {
-                TextPosition position = textIter.next();
+                TextPosition position = (TextPosition)textIter.next();
                 String stringValue = position.getCharacter();
-
-                for (int a = 0; a < stringValue.length(); a++) 
+                for (int a = 0; a < stringValue.length(); a++)
                 {
                     byte dir = Character.getDirectionality(stringValue.charAt(a));
-                    if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ) || 
+                    if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ) ||
                             (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING) ||
-                            (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE )) 
+                            (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE ))
                     {
                         ltrCnt++;
                     }
                     else if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT ) ||
                             (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC) ||
                             (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING) ||
-                            (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE )) 
+                            (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE ))
                     {
                         rtlCnt++;
                     }
@@ -526,47 +594,39 @@ public class PDFTextStripper extends PDF
             }
 
             // choose the dominant direction
-            boolean isRtlDominant = false; 
-            if (rtlCnt > ltrCnt) 
-            {
-                isRtlDominant = true;
-            }
+            boolean isRtlDominant = rtlCnt > ltrCnt;
 
             startArticle(!isRtlDominant);
-
+            startOfArticle = true;
             // we will later use this to skip reordering
-            boolean hasRtl = false;
-            if (rtlCnt > 0)
-            {
-                hasRtl = true;
-            }
+            boolean hasRtl = rtlCnt > 0;
 
-            /* Now cycle through to print the text.  
+            /* Now cycle through to print the text.
              * We queue up a line at a time before we print so that we can convert
-             * the line from presentation form to logical form (if needed). */
-            String lineStr = "";
+             * the line from presentation form to logical form (if needed). 
+             */
+            List<TextPosition> line = new ArrayList<TextPosition>();
 
             textIter = textList.iterator();    // start from the beginning again
-
             /* PDF files don't always store spaces. We will need to guess where we should add
              * spaces based on the distances between TextPositions. Historically, this was done
              * based on the size of the space character provided by the font. In general, this worked
              * but there were cases where it did not work. Calculating the average character width
-             * and using that as a metric works better in some cases but fails in some cases where the 
-             * spacing worked. So we use both. NOTE: Adobe reader also fails on some of these examples. 
+             * and using that as a metric works better in some cases but fails in some cases where the
+             * spacing worked. So we use both. NOTE: Adobe reader also fails on some of these examples.
              */
-
             //Keeps track of the previous average character width
             float previousAveCharWidth = -1;
             while( textIter.hasNext() )
             {
-                TextPosition position = textIter.next();
+                TextPosition position = (TextPosition)textIter.next();
+                PositionWrapper current = new PositionWrapper(position);
                 String characterValue = position.getCharacter();
 
-                //Resets the average character width when we see a change in font 
+                //Resets the average character width when we see a change in font
                 // or a change in the font size
-                if(lastPosition != null && ((position.getFont() != lastPosition.getFont()) 
-                        || (position.getFontSize() != lastPosition.getFontSize())))
+                if(lastPosition != null && ((position.getFont() != lastPosition.getTextPosition().getFont())
+                        || (position.getFontSize() != lastPosition.getTextPosition().getFontSize())))
                 {
                     previousAveCharWidth = -1;
                 }
@@ -578,14 +638,14 @@ public class PDFTextStripper extends PDF
 
                 /* If we are sorting, then we need to use the text direction
                  * adjusted coordinates, because they were used in the sorting. */
-                if (sortByPosition) 
+                if (getSortByPosition())
                 {
                     positionX = position.getXDirAdj();
                     positionY = position.getYDirAdj();
                     positionWidth = position.getWidthDirAdj();
                     positionHeight = position.getHeightDir();
                 }
-                else 
+                else
                 {
                     positionX = position.getX();
                     positionY = position.getY();
@@ -596,31 +656,31 @@ public class PDFTextStripper extends PDF
                 //The current amount of characters in a word
                 int wordCharCount = position.getIndividualWidths().length;
 
-                /* Estimate the expected width of the space based on the 
+                /* Estimate the expected width of the space based on the
                  * space character with some margin. */
                 float wordSpacing = position.getWidthOfSpace();
                 float deltaSpace = 0;
-                if ((wordSpacing == 0) || Float.isNaN(wordSpacing)) 
+                if ((wordSpacing == 0) || Float.isNaN(wordSpacing)) // x == Float.NaN is always false
                 {
                     deltaSpace = Float.MAX_VALUE;
                 }
-                else 
+                else
                 {
                     if( lastWordSpacing < 0 )
                     {
-                        deltaSpace = (wordSpacing * spacingTolerance);
+                        deltaSpace = (wordSpacing * getSpacingTolerance());
                     }
                     else
                     {
-                        deltaSpace = (((wordSpacing+lastWordSpacing)/2f)* spacingTolerance);
+                        deltaSpace = (((wordSpacing+lastWordSpacing)/2f)* getSpacingTolerance());
                     }
                 }
 
-                /* Estimate the expected width of the space based on the 
+                /* Estimate the expected width of the space based on the
                  * average character width with some margin. This calculation does not
                  * make a true average (average of averages) but we found that it gave the
                  * best results after numerous experiments. Based on experiments we also found that
-                 * .3 worked well. */                       
+                 * .3 worked well. */
                 float averageCharWidth = -1;
                 if(previousAveCharWidth < 0)
                 {
@@ -630,10 +690,10 @@ public class PDFTextStripper extends PDF
                 {
                     averageCharWidth = (previousAveCharWidth + (positionWidth/wordCharCount))/2f;
                 }
-                float deltaCharWidth = (averageCharWidth * averageCharTolerance);
-                
+                float deltaCharWidth = (averageCharWidth * getAverageCharTolerance());
+
                 //Compares the values obtained by the average method and the wordSpacing method and picks
-                //the smaller number. 
+                //the smaller number.
                 float expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
                 if(endOfLastTextX != ENDOFLASTTEXTX_RESET_VALUE)
                 {
@@ -645,10 +705,14 @@ public class PDFTextStripper extends PDF
                     {
                         expectedStartOfNextWordX = endOfLastTextX + deltaCharWidth;
                     }
-                }   
+                }
 
                 if( lastPosition != null )
-                {  
+                {
+                    if(startOfArticle){
+                        lastPosition.setArticleStart();
+                        startOfArticle = false;
+                    }
                     // RDD - Here we determine whether this text object is on the current
                     // line.  We use the lastBaselineFontSize to handle the superscript
                     // case, and the size of the current font to handle the subscript case.
@@ -660,23 +724,10 @@ public class PDFTextStripper extends PDF
                      * of regression test failures.  So, I'm leaving it be for now. */
                     if(!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine))
                     {
-                        // If we have RTL text on the page, change the direction
-                        if (hasRtl)
-                        {
-                            lineStr = normalize.makeLineLogicalOrder(lineStr, isRtlDominant);
-                        }
+                        writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
+                        line.clear();
 
-                        /* normalize string to remove presentation forms.
-                         * Note that this must come after the line direction 
-                         * conversion because the process looks ahead to the next
-                         * logical character. 
-                         */
-                        lineStr = normalize.normalizePres(lineStr);
-
-                        writeString(lineStr);
-                        lineStr = "";
-
-                        writeLineSeparator( );
+                        lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition);
 
                         endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
                         expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
@@ -685,17 +736,17 @@ public class PDFTextStripper extends PDF
                         minYTopForLine = MINYTOPFORLINE_RESET_VALUE;
                     }
 
-                    //Test if our TextPosition starts after a new word would be expected to start. 
-                    if (expectedStartOfNextWordX != -1 && expectedStartOfNextWordX < positionX &&
+                    //Test if our TextPosition starts after a new word would be expected to start.
+                    if (expectedStartOfNextWordX != EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE && expectedStartOfNextWordX < positionX &&
                             //only bother adding a space if the last character was not a space
-                            lastPosition.getCharacter() != null &&
-                            !lastPosition.getCharacter().endsWith( " " ) ) 
+                             lastPosition.getTextPosition().getCharacter() != null &&
+                            !lastPosition.getTextPosition().getCharacter().endsWith( " " ) )
                     {
-                        lineStr += getWordSeparator();
+                        line.add(WordSeparator.getSeparator());
                     }
                 }
 
-                if (positionY >= maxYForLine) 
+                if (positionY >= maxYForLine)
                 {
                     maxYForLine = positionY;
                 }
@@ -705,34 +756,36 @@ public class PDFTextStripper extends PDF
                 endOfLastTextX = positionX + positionWidth;
 
                 // add it to the list
-                if (characterValue != null) 
+                if (characterValue != null)
                 {
-                    lineStr += characterValue;
+                    if(startOfPage && lastPosition==null){
+                        writeParagraphStart();//not sure this is correct for RTL?
+                    }
+                    line.add(position);
                 }
                 maxHeightForLine = Math.max( maxHeightForLine, positionHeight );
-                minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
-                lastPosition = position;
+                minYTopForLine = Math.min(minYTopForLine,positionY - positionHeight);
+                lastPosition = current;
+                if(startOfPage){
+                    lastPosition.setParagraphStart();
+                    lastPosition.setLineStart();
+                    lastLineStartPosition = lastPosition;
+                    startOfPage=false;
+                }
                 lastWordSpacing = wordSpacing;
                 previousAveCharWidth = averageCharWidth;
             }
 
             // print the final line
-            if (lineStr.length() > 0) 
+            if (line.size() > 0)
             {
-                if (hasRtl)
-                {
-                    lineStr = normalize.makeLineLogicalOrder(lineStr, isRtlDominant);
-                }
-
-                // normalize string to remove presentation forms
-                lineStr = normalize.normalizePres(lineStr);
-
-                writeString(lineStr);
+                writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
+                writeParagraphEnd();
             }
 
             endArticle();
         }
-        writePageSeperator();
+        writePageEnd();
     }
 
     private boolean overlap( float y1, float height1, float y2, float height2 )
@@ -810,7 +863,6 @@ public class PDFTextStripper extends PDF
         return second < first + variance && second > first - variance;
     }
 
-
     /**
      * This will process a TextPosition object and add the
      * text to the list of characters on a page.  It takes care of
@@ -1098,7 +1150,7 @@ public class PDFTextStripper extends PDF
     /**
      * @return Returns the suppressDuplicateOverlappingText.
      */
-    public boolean shouldSuppressDuplicateOverlappingText()
+    public boolean getSuppressDuplicateOverlappingText()
     {
         return suppressDuplicateOverlappingText;
     }
@@ -1154,7 +1206,7 @@ public class PDFTextStripper extends PDF
      *
      * @return If the text will be grouped by beads.
      */
-    public boolean shouldSeparateByBeads()
+    public boolean getSeparateByBeads()
     {
         return shouldSeparateByBeads;
     }
@@ -1210,12 +1262,31 @@ public class PDFTextStripper extends PDF
     }
 
     /**
+     * This will tell if the text stripper should add some more text formatting.
+     * @return true if some more text formatting will be added
+     */
+    public boolean getAddMoreFormatting()
+    {
+        return addMoreFormatting;
+    }
+    
+    /**
+     * Some additional text formatting will be added if addMoreFormatting
+     * is set to true. Default is false. 
+     * @param newAddMoreFormatting Tell PDFBox to add some more text formatting
+     */
+    public void setAddMoreFormatting(boolean newAddMoreFormatting)
+    {
+        addMoreFormatting = newAddMoreFormatting;
+    }
+
+    /**
      * This will tell if the text stripper should sort the text tokens
      * before writing to the stream.
      *
      * @return true If the text tokens will be sorted before being written.
      */
-    public boolean shouldSortByPosition()
+    public boolean getSortByPosition()
     {
         return sortByPosition;
     }
@@ -1288,9 +1359,174 @@ public class PDFTextStripper extends PDF
         this.averageCharTolerance = averageCharToleranceValue;
     }
 
+
+    /**
+     * returns the multiple of whitespace character widths
+     * for the current text which the current
+     * line start can be indented from the previous line start
+     * beyond which the current line start is considered
+     * to be a paragraph start.
+     * @return the number of whitespace character widths to use
+     * when detecting paragraph indents.
+     */
+    public float getIndentThreshold() 
+    {
+        return indentThreshold;
+    }
+
+    /**
+     * sets the multiple of whitespace character widths
+     * for the current text which the current
+     * line start can be indented from the previous line start
+     * beyond which the current line start is considered
+     * to be a paragraph start.  The default value is 2.0.
+     *
+     * @param indentThreshold the number of whitespace character widths to use
+     * when detecting paragraph indents.
+     */
+    public void setIndentThreshold(float indentThreshold) 
+    {
+        this.indentThreshold = indentThreshold;
+    }
+
+    /**
+     * the minimum whitespace, as a multiple
+     * of the max height of the current characters
+     * beyond which the current line start is considered
+     * to be a paragraph start.
+     * @return the character height multiple for
+     * max allowed whitespace between lines in
+     * the same paragraph.
+     */
+    public float getDropThreshold() 
+    {
+        return dropThreshold;
+    }
+
+    /**
+     * sets the minimum whitespace, as a multiple
+     * of the max height of the current characters
+     * beyond which the current line start is considered
+     * to be a paragraph start.  The default value is 2.5.
+     *
+     * @param dropThreshold the character height multiple for
+     * max allowed whitespace between lines in
+     * the same paragraph.
+     */
+    public void setDropThreshold(float dropThreshold) 
+    {
+        this.dropThreshold = dropThreshold;
+    }
+
+    /**
+     * Returns the string which will be used at the beginning of a paragraph.
+     * @return the paragraph start string
+     */
+    public String getParagraphStart()
+    {
+        return paragraphStart;
+    }
+
+    /**
+     * Sets the string which will be used at the beginning of a paragraph.
+     * @param s the paragraph start string
+     */
+    public void setParagraphStart(String s)
+    {
+        this.paragraphStart = s;
+    }
+
+    /**
+     * Returns the string which will be used at the end of a paragraph.
+     * @return the paragraph end string
+     */
+    public String getParagraphEnd()
+    {
+        return paragraphEnd;
+    }
+
+    /**
+     * Sets the string which will be used at the end of a paragraph.
+     * @param s the paragraph end string
+     */
+    public void setParagraphEnd(String s)
+    {
+        this.paragraphEnd = s;
+    }
+
+
+    /**
+     * Returns the string which will be used at the beginning of a page.
+     * @return the page start string
+     */
+    public String getPageStart() 
+    {
+        return pageStart;
+    }
+
+    /**
+     * Sets the string which will be used at the beginning of a page.
+     * @param s the page start string
+     */
+    public void setPageStart(String pageStart) 
+    {
+        this.pageStart = pageStart;
+    }
+
+    /**
+     * Returns the string which will be used at the end of a page.
+     * @return the page end string
+     */
+    public String getPageEnd() 
+    {
+        return pageEnd;
+    }
+
+    /**
+     * Sets the string which will be used at the end of a page.
+     * @param s the page end string
+     */
+    public void setPageEnd(String pageEnd) 
+    {
+        this.pageEnd = pageEnd;
+    }
+
+    /**
+     * Returns the string which will be used at the beginning of an article.
+     * @return the article start string
+     */
+    public String getArticleStart() {
+        return articleStart;
+    }
+
+    /**
+     * Sets the string which will be used at the beginning of an article.
+     * @param articleStart the article start string
+     */
+    public void setArticleStart(String articleStart) {
+        this.articleStart = articleStart;
+    }
+
+    /**
+     * Returns the string which will be used at the end of an article.
+     * @return the article end string
+     */
+    public String getArticleEnd(){
+        return articleEnd;
+    }
+
+    /**
+     * Sets the string which will be used at the end of an article.
+     * @param articleEnd the article end string
+     */
+    public void setArticleEnd(String articleEnd){
+        this.articleEnd = articleEnd;
+    }
+
+
     /**
      * Reverse characters of a compound Arabic glyph.
-     * When shouldSortByPosition() is true, inspect the sequence encoded
+     * When getSortByPosition() is true, inspect the sequence encoded
      * by one glyph. If the glyph encodes two or more Arabic characters,
      * reverse these characters from a logical order to a visual order.
      * This ensures that the bidirectional algorithm that runs later will
@@ -1318,4 +1554,326 @@ public class PDFTextStripper extends PDF
         return reversed.toString();
     }
 
+    /**
+     * Handles the line separator for a new line given the specified current
+     * and previous TextPositions: flags the current position as a line start,
+     * classifies it via isParagraphSeparation and writes the separators.
+     * @param current the current text position
+     * @param lastPosition the previous text position
+     * @param lastLineStartPosition the last text position that followed a line
+     *        separator.
+     * @return the new line start position, which is always <code>current</code>
+     * @throws IOException if one of the write calls fails
+     */
+    protected PositionWrapper handleLineSeparation(PositionWrapper current,
+            PositionWrapper lastPosition, PositionWrapper lastLineStartPosition)
+            throws IOException {
+        current.setLineStart();
+        isParagraphSeparation(current, lastPosition, lastLineStartPosition);
+        if (!current.isParagraphStart()) {
+            writeLineSeparator();
+        } else if (lastPosition.isArticleStart()) {
+            writeParagraphStart();
+        } else {
+            writeLineSeparator();
+            writeParagraphSeparator();
+        }
+        // the current position becomes the reference line start for the caller
+        return current;
+    }
+    
+    /**
+     * tests the relationship between the last text position, the current text
+     * position and the last text position that followed a line separator to
+     * decide if the gap represents a paragraph separation. This should
+     * <i>only</i> be called for consecutive text positions that first pass the
+     * line separation test.
+     * <p>
+     * This base implementation flags a paragraph start if the lastLineStartPosition
+     * is null OR if the vertical distance to the last text position exceeds
+     * getDropThreshold() times the current text height OR if the current horizontal
+     * position is indented by more than getIndentThreshold() times the current
+     * width of a space character (unless this looks like a hanging indent).</p>
+     * <p>
+     * This also attempts to identify hanging indents and consecutive list items.</p>
+     * <p>
+     * This method sets the isParagraphStart and isHangingIndent flags on the current
+     * position object; it does not return a value.</p>
+     *
+     * @param position the current text position.  This may have its isParagraphStart
+     * or isHangingIndent flags set upon return.
+     * @param lastPosition the previous text position (should not be null).
+     * @param lastLineStartPosition the last text position that followed a line
+     *            separator. May be null.
+     */
+    protected void isParagraphSeparation(PositionWrapper position,  
+            PositionWrapper lastPosition, PositionWrapper lastLineStartPosition){
+        boolean result = false;
+        if(lastLineStartPosition == null) {
+            result = true;
+        }else{
+            float yGap = Math.abs(position.getTextPosition().getYDirAdj()-
+                    lastPosition.getTextPosition().getYDirAdj());
+            float xGap = (position.getTextPosition().getXDirAdj()-
+                    lastLineStartPosition.getTextPosition().getXDirAdj());//do we need to flip this for rtl?
+            if(yGap > (getDropThreshold()*position.getTextPosition().getHeightDir())){
+                        result = true;
+            }else if(xGap > (getIndentThreshold()*position.getTextPosition().getWidthOfSpace())){
+                //text is indented: a new paragraph, unless the previous line
+                //started one, in which case this is taken as a hanging indent
+                if(!lastLineStartPosition.isParagraphStart()){
+                     result = true;
+                }else{
+                     position.setHangingIndent();
+                }
+            }else if(xGap < -position.getTextPosition().getWidthOfSpace()){
+                //text starts more than one space width left of the previous line start
+                if(!lastLineStartPosition.isParagraphStart()){
+                            result = true;
+                }
+            }else if(Math.abs(xGap) < (0.25 * position.getTextPosition().getWidth())){
+                //current horizontal position is within 1/4 a char of the last
+                //linestart.  We'll treat them as lined up.
+                if(lastLineStartPosition.isHangingIndent()){
+                    position.setHangingIndent();
+                }else if(lastLineStartPosition.isParagraphStart()){
+                    //check to see if the previous line looks like
+                    //any of a number of standard list item formats
+                    Pattern liPattern = matchListItemPattern(lastLineStartPosition);
+                    if(liPattern!=null){
+                        Pattern currentPattern = matchListItemPattern(position);
+                        if(liPattern == currentPattern){ //identity; assumes both lookups share one pattern list
+                                    result = true;
+                        }
+                    }
+               }
+           }
+        }
+        if(result){
+            position.setParagraphStart();
+        }
+    }
+
+    /**
+     * writes a paragraph end followed by a paragraph start to the output.
+     * @throws IOException if one of the two writes fails
+     */
+    protected void writeParagraphSeparator()throws IOException{
+        writeParagraphEnd();
+        writeParagraphStart();
+    }
+
+    /**
+     * Writes the string returned by getParagraphStart() to the output.
+     * @throws IOException if something went wrong
+     */
+    protected void writeParagraphStart() throws IOException{
+        output.write(getParagraphStart());
+    }
+
+    /**
+     * Writes the string returned by getParagraphEnd() to the output.
+     * @throws IOException if something went wrong
+     */
+    protected void writeParagraphEnd() throws IOException{
+        output.write(getParagraphEnd());
+    }
+
+    /**
+     * Writes the string returned by getPageStart() to the output.
+     * @throws IOException if something went wrong
+     */
+    protected void writePageStart()throws IOException{
+        output.write(getPageStart());
+    }
+
+    /**
+     * Writes the string returned by getPageEnd() to the output.
+     * @throws IOException if something went wrong
+     */
+    protected void writePageEnd()throws IOException{
+        output.write(getPageEnd());
+    }
+
+    /**
+     * returns the list item Pattern object that matches
+     * the text at the specified PositionWrapper or null
+     * if the text does not match such a pattern.  The list
+     * of Patterns tested against is given by the
+     * {@link #getListItemPatterns()} method.  To add to
+     * the list, simply override that method (if sub-classing)
+     * or explicitly supply your own list using
+     * {@link #setListItemPatterns(List)}.
+     * @param pw wraps the TextPosition whose character string is
+     *        tested; the whole string has to match a pattern
+     * @return the first matching Pattern of the list, or null
+     *         if no pattern matches
+     */
+    protected Pattern matchListItemPattern(PositionWrapper pw) {
+        String text = pw.getTextPosition().getCharacter();
+        return matchPattern(text, getListItemPatterns());
+    }
+
+    /**
+     * a list of regular expressions that match commonly used
+     * list item formats, i.e. dots, numbers, letters,
+     * Roman numerals, etc.  Not meant to be comprehensive.
+     * NOTE(review): no expression matches a bullet glyph - confirm intent.
+     */
+    private static final String[] LIST_ITEM_EXPRESSIONS = {
+            "\\.",          // a single dot
+            "\\d+\\.",      // digits followed by a dot, e.g. "1."
+            "\\[\\d+\\]",   // digits in square brackets, e.g. "[1]"
+            "\\d+\\)",      // digits followed by a parenthesis, e.g. "1)"
+            "[A-Z]\\.",     // one upper case letter and a dot, e.g. "A."
+            "[a-z]\\.",     // one lower case letter and a dot, e.g. "a."
+            "[A-Z]\\)",     // one upper case letter and a parenthesis, e.g. "A)"
+            "[a-z]\\)",     // one lower case letter and a parenthesis, e.g. "a)"
+            "[IVXL]+\\.",   // upper case Roman numeral (I, V, X, L only) and a dot
+            "[ivxl]+\\.",   // lower case Roman numeral (i, v, x, l only) and a dot
+
+    };
+
+    private List<Pattern> liPatterns = null; // filled on demand by getListItemPatterns()
+    /**
+     * use to supply a different set of regular expression
+     * patterns for matching list item starts.
+     *
+     * @param patterns the patterns to use; null makes getListItemPatterns() rebuild the defaults
+     */
+    protected void setListItemPatterns(List<Pattern> patterns){
+            liPatterns = patterns;
+    }
+
+
+    /**
+     * returns a list of regular expression Patterns representing
+     * different common list item formats.  For example
+     * numbered items of form:
+     * <ol>
+     * <li>some text</li>
+     * <li>more text</li>
+     * </ol>
+     * or
+     * <ul>
+     * <li>some text</li>
+     * <li>more text</li>
+     * </ul>
+     * etc., all begin with some character pattern, e.g. "\\d+\\." (matches "1.", "2.", ...)
+     * or "\\[\\d+\\]" (matches "[1]", "[2]", ...).
+     * <p>
+     * The list is compiled from LIST_ITEM_EXPRESSIONS on first use unless one was supplied.
+     * @return a list of Pattern objects.
+     */
+    protected List<Pattern> getListItemPatterns(){
+        if(liPatterns != null){
+            return liPatterns;
+        }
+        liPatterns = new ArrayList<Pattern>();
+        for(int i = 0; i < LIST_ITEM_EXPRESSIONS.length; i++){
+            liPatterns.add(Pattern.compile(LIST_ITEM_EXPRESSIONS[i]));
+        }
+        return liPatterns;
+    }
+
+    /**
+     * iterates over the specified list of Patterns until
+     * it finds one that matches the whole of the specified
+     * string.  Then returns that Pattern.
+     * <p>
+     * Order of the supplied list of patterns is important as
+     * most common patterns should come first.  Patterns
+     * should be strict in general; each is applied exactly
+     * as it was compiled.</p>
+     * @param s the string to test
+     * @param patterns the candidate patterns, in order of priority
+     * @return the first matching Pattern, or null if none matches
+     */
+    protected static final Pattern matchPattern(String s, List<Pattern> patterns){
+        Pattern firstMatch = null;
+        for(Pattern candidate : patterns){
+            if(candidate.matcher(s).matches()){
+                firstMatch = candidate;
+                break;
+            }
+        }
+        return firstMatch;
+    }
+
+    /**
+     * Writes the words of one line, with a word separator between each adjacent pair.
+     * @param line a list with the words of the given line
+     * @param isRtlDominant if true (rtl is dominant) the words are written from last to first
+     * @throws IOException if something went wrong
+     */
+    private void writeLine(List<String> line, boolean isRtlDominant)throws IOException{
+        int numberOfStrings = line.size();
+        if (isRtlDominant) {
+            for(int i=numberOfStrings-1; i>=0; i--){
+                if (i < numberOfStrings-1) // every word but the first one written
+                    writeWordSeparator();
+                writeString(line.get(i));
+            }
+        }
+        else {
+            for(int i=0; i<numberOfStrings; i++){
+                writeString(line.get(i));
+                if (i < numberOfStrings-1)
+                    writeWordSeparator();
+            }
+        }
+    }
+
+    /**
+     * Normalize the given list of TextPositions into words.  Characters are
+     * collected until a WordSeparator is met; each collected word is normalized.
+     * @param line list of TextPositions
+     * @param isRtlDominant determines if rtl or ltr is dominant
+     * @param hasRtl determines if lines contains rtl formatted text(parts)
+     * @return a list of strings, one string for every word
+     */
+    private List<String> normalize(List<TextPosition> line, boolean isRtlDominant, boolean hasRtl){
+        LinkedList<String> words = new LinkedList<String>();
+        StringBuilder pending = new StringBuilder();
+        for(TextPosition text : line){
+            if (!(text instanceof WordSeparator)) {
+                pending.append(text.getCharacter());
+                continue;
+            }
+            // a separator closes the pending word, even when that word is empty
+            String word = pending.toString();
+            pending = new StringBuilder();
+            if (hasRtl) {
+                word = normalize.makeLineLogicalOrder(word,isRtlDominant);
+            }
+            words.add(normalize.normalizePres(word));
+        }
+        // flush the trailing word, which has no separator after it
+        if (pending.length() > 0) {
+            String word = pending.toString();
+            if (hasRtl) {
+                word = normalize.makeLineLogicalOrder(word,isRtlDominant);
+            }
+            words.add(normalize.normalizePres(word));
+        }
+        return words;
+    }
+
+    /**
+     * internal marker class.  Used as a place holder for a word
+     * boundary in a line of TextPositions; see normalize().
+     * @author ME21969
+     *
+     */
+    private static final class WordSeparator extends TextPosition{
+        private static final WordSeparator separator = new WordSeparator();
+        
+        private WordSeparator(){
+        }
+
+        public static final WordSeparator getSeparator(){
+            return separator;
+        }
+
+    }
+
 }

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/FC60_Times.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/FC60_Times.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/FC60_Times.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/FC60_Times.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/Liste732004001452_001_0.pdf_0_.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/Liste732004001452_001_0.pdf_0_.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/Liste732004001452_001_0.pdf_0_.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/Liste732004001452_001_0.pdf_0_.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/allah2.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/allah2.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/allah2.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/allah2.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/data-000001.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/data-000001.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/data-000001.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/data-000001.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/hello3.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/hello3.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/hello3.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/hello3.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/openoffice-test-document.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/openoffice-test-document.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/openoffice-test-document.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/openoffice-test-document.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/rotation.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/rotation.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/rotation.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/rotation.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/sampleForSpec.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/sampleForSpec.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/sampleForSpec.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/sampleForSpec.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/simple-openoffice.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/simple-openoffice.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/simple-openoffice.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/simple-openoffice.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/yaddatest.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/yaddatest.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/yaddatest.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/yaddatest.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.