You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ms...@apache.org on 2015/09/24 09:57:16 UTC

svn commit: r1705010 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java

Author: msahyoun
Date: Thu Sep 24 07:57:15 2015
New Revision: 1705010

URL: http://svn.apache.org/viewvc?rev=1705010&view=rev
Log:
PDFBOX-2252: reformat source to match new conventions

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java?rev=1705010&r1=1705009&r2=1705010&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java Thu Sep 24 07:57:15 2015
@@ -42,13 +42,12 @@ import org.apache.pdfbox.pdmodel.interac
 import org.apache.pdfbox.util.QuickSort;
 
 /**
- * This class will take a pdf document and strip out all of the text and ignore the
- * formatting and such.  Please note; it is up to clients of this class to verify that
- * a specific user has the correct permissions to extract text from the PDF document.
+ * This class will take a pdf document and strip out all of the text and ignore the formatting and such. Please note; it
+ * is up to clients of this class to verify that a specific user has the correct permissions to extract text from the
+ * PDF document.
  * 
- * The basic flow of this process is that we get a document and use a series of 
- * processXXX() functions that work on smaller and smaller chunks of the page.  
- * Eventually, we fully process each page and then print it. 
+ * The basic flow of this process is that we get a document and use a series of processXXX() functions that work on
+ * smaller and smaller chunks of the page. Eventually, we fully process each page and then print it.
  *
  * @author Ben Litchfield
  */
@@ -57,11 +56,11 @@ public class PDFTextStripper extends PDF
     private static float defaultIndentThreshold = 2.0f;
     private static float defaultDropThreshold = 2.5f;
     private static final boolean useCustomQuickSort;
-    
+
     // enable the ability to set the default indent/drop thresholds
     // with -D system properties:
-    //    pdftextstripper.indent
-    //    pdftextstripper.drop
+    // pdftextstripper.indent
+    // pdftextstripper.drop
     static
     {
         String strDrop = null, strIndent = null;
@@ -100,8 +99,8 @@ public class PDFTextStripper extends PDF
                 // ignore and use default
             }
         }
-        
-        // check if we need to use the custom quicksort algorithm as a 
+
+        // check if we need to use the custom quicksort algorithm as a
         // workaround to the transitivity issue of TextPositionComparator:
         // https://issues.apache.org/jira/browse/PDFBOX-1512
         boolean is16orLess = false;
@@ -138,17 +137,17 @@ public class PDFTextStripper extends PDF
     private int startPage = 1;
     private int endPage = Integer.MAX_VALUE;
     private PDOutlineItem startBookmark = null;
-    
+
     // 1-based bookmark pages
     private int startBookmarkPageNumber = -1;
     private int endBookmarkPageNumber = -1;
-    
+
     private PDOutlineItem endBookmark = null;
     private boolean suppressDuplicateOverlappingText = true;
     private boolean shouldSeparateByBeads = true;
     private boolean sortByPosition = false;
     private boolean addMoreFormatting = false;
-    
+
     private float indentThreshold = defaultIndentThreshold;
     private float dropThreshold = defaultDropThreshold;
 
@@ -159,24 +158,19 @@ public class PDFTextStripper extends PDF
     private List<PDThreadBead> pageArticles = null;
 
     /**
-     * The charactersByArticle is used to extract text by article divisions.  For example
-     * a PDF that has two columns like a newspaper, we want to extract the first column and
-     * then the second column.  In this example the PDF would have 2 beads(or articles), one for
-     * each column.  The size of the charactersByArticle would be 5, because not all text on the
-     * screen will fall into one of the articles.  The five divisions are shown below
-     *
-     * Text before first article
-     * first article text
-     * text between first article and second article
-     * second article text
+     * The charactersByArticle is used to extract text by article divisions. For example a PDF that has two columns like
+     * a newspaper, we want to extract the first column and then the second column. In this example the PDF would have 2
+     * beads(or articles), one for each column. The size of the charactersByArticle would be 5, because not all text on
+     * the screen will fall into one of the articles. The five divisions are shown below
+     *
+     * Text before first article; first article text; text between first article and second article; second article text;
      * text after second article
      *
      * Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
      */
     protected Vector<List<TextPosition>> charactersByArticle = new Vector<List<TextPosition>>();
 
-    private Map<String, TreeMap<Float, TreeSet<Float>>> characterListMapping =
-        new HashMap<String, TreeMap<Float, TreeSet<Float>>>();
+    private Map<String, TreeMap<Float, TreeSet<Float>>> characterListMapping = new HashMap<String, TreeMap<Float, TreeSet<Float>>>();
 
     protected PDDocument document;
     protected Writer output;
@@ -196,7 +190,7 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * This will return the text of a document.  See writeText. <br />
+     * This will return the text of a document. See writeText. <br />
      * NOTE: The document must not be encrypted when coming into this method.
      *
      * @param doc The document to get the text from.
@@ -223,7 +217,7 @@ public class PDFTextStripper extends PDF
             characterListMapping.clear();
         }
     }
-    
+
     /**
      * This will take a PDDocument and write the text of that document to the print writer.
      *
@@ -237,7 +231,7 @@ public class PDFTextStripper extends PDF
         resetEngine();
         document = doc;
         output = outputStream;
-        if (getAddMoreFormatting()) 
+        if (getAddMoreFormatting())
         {
             paragraphEnd = lineSeparator;
             pageStart = lineSeparator;
@@ -259,9 +253,9 @@ public class PDFTextStripper extends PDF
     protected void processPages(PDPageTree pages) throws IOException
     {
         PDPageTree pagesTree = document.getPages();
-        
+
         PDPage startBookmarkPage = startBookmark == null ? null
-                           : startBookmark.findDestinationPage(document);
+                : startBookmark.findDestinationPage(document);
         if (startBookmarkPage != null)
         {
             startBookmarkPageNumber = pagesTree.indexOf(startBookmarkPage) + 1;
@@ -273,7 +267,7 @@ public class PDFTextStripper extends PDF
         }
 
         PDPage endBookmarkPage = endBookmark == null ? null
-                         : endBookmark.findDestinationPage(document);
+                : endBookmark.findDestinationPage(document);
         if (endBookmarkPage != null)
         {
             endBookmarkPageNumber = pagesTree.indexOf(endBookmarkPage) + 1;
@@ -284,12 +278,12 @@ public class PDFTextStripper extends PDF
             endBookmarkPageNumber = -1;
         }
 
-        if (startBookmarkPageNumber == -1 && startBookmark != null &&
-            endBookmarkPageNumber == -1 && endBookmark != null &&
-            startBookmark.getCOSObject() == endBookmark.getCOSObject())
+        if (startBookmarkPageNumber == -1 && startBookmark != null && endBookmarkPageNumber == -1
+                && endBookmark != null
+                && startBookmark.getCOSObject() == endBookmark.getCOSObject())
         {
             // this is a special case where both the start and end bookmark
-            // are the same but point to nothing.  In this case
+            // are the same but point to nothing. In this case
             // we will not extract any text.
             startBookmarkPageNumber = 0;
             endBookmarkPageNumber = 0;
@@ -306,8 +300,7 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * This method is available for subclasses of this class. It will be called before processing
-     * of the document start.
+     * This method is available for subclasses of this class. It will be called before processing of the document start.
      *
      * @param document The PDF document that is being processed.
      * @throws IOException If an IO error occurs.
@@ -318,8 +311,8 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * This method is available for subclasses of this class. It will be called after processing
-     * of the document finishes.
+     * This method is available for subclasses of this class. It will be called after processing of the document
+     * finishes.
      *
      * @param document The PDF document that is being processed.
      * @throws IOException If an IO error occurs.
@@ -339,9 +332,9 @@ public class PDFTextStripper extends PDF
     @Override
     public void processPage(PDPage page) throws IOException
     {
-        if (currentPageNo >= startPage && currentPageNo <= endPage &&
-                (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) &&
-                (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber))
+        if (currentPageNo >= startPage && currentPageNo <= endPage
+                && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber)
+                && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber))
         {
             startPage(page);
             pageArticles = page.getThreadBeads();
@@ -371,10 +364,8 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Start a new article, which is typically defined as a column
-     * on a single page (also referred to as a bead).  This assumes
-     * that the primary direction of text is left to right.  
-     * Default implementation is to do nothing.  Subclasses
+     * Start a new article, which is typically defined as a column on a single page (also referred to as a bead). This
+     * assumes that the primary direction of text is left to right. Default implementation is to do nothing. Subclasses
      * may provide additional information.
      *
      * @throws IOException If there is any error writing to the stream.
@@ -385,10 +376,8 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Start a new article, which is typically defined as a column
-     * on a single page (also referred to as a bead).  
-     * Default implementation is to do nothing.  Subclasses
-     * may provide additional information.
+     * Start a new article, which is typically defined as a column on a single page (also referred to as a bead).
+     * Default implementation is to do nothing. Subclasses may provide additional information.
      *
      * @param isLTR true if primary direction of text is left to right.
      * @throws IOException If there is any error writing to the stream.
@@ -399,8 +388,7 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * End an article.  Default implementation is to do nothing.  Subclasses
-     * may provide additional information.
+     * End an article. Default implementation is to do nothing. Subclasses may provide additional information.
      *
      * @throws IOException If there is any error writing to the stream.
      */
@@ -410,8 +398,7 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Start a new page.  Default implementation is to do nothing.  Subclasses
-     * may provide additional information.
+     * Start a new page. Default implementation is to do nothing. Subclasses may provide additional information.
      *
      * @param page The page we are about to process.
      *
@@ -423,8 +410,7 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * End a page.  Default implementation is to do nothing.  Subclasses
-     * may provide additional information.
+     * End a page. Default implementation is to do nothing. Subclasses may provide additional information.
      *
      * @param page The page we are about to process.
      *
@@ -443,10 +429,9 @@ public class PDFTextStripper extends PDF
     private static final float LAST_WORD_SPACING_RESET_VALUE = -1;
 
     /**
-     * This will print the text of the processed page to "output".
-     * It will estimate, based on the coordinates of the text, where
-     * newlines and word spacings should be placed. The text will be
-     * sorted only if that feature was enabled. 
+     * This will print the text of the processed page to "output". It will estimate, based on the coordinates of the
+     * text, where newlines and word spacings should be placed. The text will be sorted only if that feature was
+     * enabled.
      *
      * @throws IOException If there is an error writing the text.
      */
@@ -462,8 +447,8 @@ public class PDFTextStripper extends PDF
 
         boolean startOfPage = true; // flag to indicate start of page
         boolean startOfArticle;
-        if (charactersByArticle.size() > 0) 
-        { 
+        if (charactersByArticle.size() > 0)
+        {
             writePageStart();
         }
 
@@ -473,16 +458,16 @@ public class PDFTextStripper extends PDF
             {
                 TextPositionComparator comparator = new TextPositionComparator();
 
-                // because the TextPositionComparator is not transitive, but 
+                // because the TextPositionComparator is not transitive, but
                 // JDK7+ enforces transitivity on comparators, we need to use
                 // a custom quicksort implementation (which is slower, unfortunately).
-                if (useCustomQuickSort) 
+                if (useCustomQuickSort)
                 {
                     QuickSort.sort(textList, comparator);
                 }
-                else 
+                else
                 {
-                    Collections.sort( textList, comparator );
+                    Collections.sort(textList, comparator);
                 }
             }
             Iterator<TextPosition> textIter = textList.iterator();
@@ -491,7 +476,7 @@ public class PDFTextStripper extends PDF
             // in its logical format, which means that the rightmost character is
             // stored first, followed by the second character from the right etc.
             // However, PDF stores the text in presentation form, which is left to
-            // right.  We need to do some normalization to convert the PDF data to
+            // right. We need to do some normalization to convert the PDF data to
             // the proper logical output format.
             //
             // Note that if we did not sort the text, then the output of reversing the
@@ -510,16 +495,16 @@ public class PDFTextStripper extends PDF
                 for (int a = 0; a < stringValue.length(); a++)
                 {
                     byte dir = Character.getDirectionality(stringValue.charAt(a));
-                    if (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ||
-                            dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING ||
-                            dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE)
+                    if (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT
+                            || dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
+                            || dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE)
                     {
                         ltrCount++;
                     }
-                    else if (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT ||
-                            dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC ||
-                            dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING ||
-                            dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)
+                    else if (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT
+                            || dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
+                            || dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
+                            || dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)
                     {
                         rtlCount++;
                     }
@@ -538,7 +523,7 @@ public class PDFTextStripper extends PDF
             // the line from presentation form to logical form (if needed).
             List<LineItem> line = new ArrayList<LineItem>();
 
-            textIter = textList.iterator();    // start from the beginning again
+            textIter = textList.iterator(); // start from the beginning again
             // PDF files don't always store spaces. We will need to guess where we should add
             // spaces based on the distances between TextPositions. Historically, this was done
             // based on the size of the space character provided by the font. In general, this
@@ -557,9 +542,9 @@ public class PDFTextStripper extends PDF
 
                 // Resets the average character width when we see a change in font
                 // or a change in the font size
-                if (lastPosition != null &&
-                        (position.getFont() != lastPosition.getTextPosition().getFont() ||
-                                position.getFontSize() != lastPosition.getTextPosition().getFontSize()))
+                if (lastPosition != null && (position.getFont() != lastPosition.getTextPosition()
+                        .getFont()
+                        || position.getFontSize() != lastPosition.getTextPosition().getFontSize()))
                 {
                     previousAveCharWidth = -1;
                 }
@@ -647,33 +632,32 @@ public class PDFTextStripper extends PDF
                         startOfArticle = false;
                     }
                     // RDD - Here we determine whether this text object is on the current
-                    // line.  We use the lastBaselineFontSize to handle the superscript
+                    // line. We use the lastBaselineFontSize to handle the superscript
                     // case, and the size of the current font to handle the subscript case.
                     // Text must overlap with the last rendered baseline text by at least
                     // a small amount in order to be considered as being on the same line.
 
                     // XXX BC: In theory, this check should really check if the next char is in
                     // full range seen in this line. This is what I tried to do with minYTopForLine,
-                    // but this caused a lot of regression test failures.  So, I'm leaving it be for
+                    // but this caused a lot of regression test failures. So, I'm leaving it be for
                     // now
                     if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine))
                     {
                         writeLine(normalize(line, isRtlDominant, hasRtl), isRtlDominant);
                         line.clear();
-                        lastLineStartPosition =
-                                handleLineSeparation(current, lastPosition, lastLineStartPosition,
-                                        maxHeightForLine);
+                        lastLineStartPosition = handleLineSeparation(current, lastPosition,
+                                lastLineStartPosition, maxHeightForLine);
                         expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
                         maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
                         maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
                         minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
                     }
                     // test if our TextPosition starts after a new word would be expected to start
-                    if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE &&
-                        expectedStartOfNextWordX < positionX &&
-                        // only bother adding a space if the last character was not a space
-                        lastPosition.getTextPosition().getUnicode() != null &&
-                        !lastPosition.getTextPosition().getUnicode().endsWith(" "))
+                    if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE
+                            && expectedStartOfNextWordX < positionX &&
+                            // only bother adding a space if the last character was not a space
+                            lastPosition.getTextPosition().getUnicode() != null
+                            && !lastPosition.getTextPosition().getUnicode().endsWith(" "))
                     {
                         line.add(LineItem.getWordSeparator());
                     }
@@ -683,7 +667,7 @@ public class PDFTextStripper extends PDF
                     maxYForLine = positionY;
                 }
                 // RDD - endX is what PDF considers to be the x coordinate of the
-                // end position of the text.  We use it in computing our metrics below.
+                // end position of the text. We use it in computing our metrics below.
                 endOfLastTextX = positionX + positionWidth;
 
                 // add it to the list
@@ -691,7 +675,7 @@ public class PDFTextStripper extends PDF
                 {
                     if (startOfPage && lastPosition == null)
                     {
-                        writeParagraphStart();//not sure this is correct for RTL?
+                        writeParagraphStart();// not sure this is correct for RTL?
                     }
                     line.add(new LineItem(position));
                 }
@@ -721,13 +705,13 @@ public class PDFTextStripper extends PDF
 
     private boolean overlap(float y1, float height1, float y2, float height2)
     {
-        return within(y1, y2, .1f) ||
-                y2 <= y1 && y2 >= y1 - height1 ||
-                y1 <= y2 && y1 >= y2 - height2;
+        return within(y1, y2, .1f) || y2 <= y1 && y2 >= y1 - height1
+                || y1 <= y2 && y1 >= y2 - height2;
     }
 
     /**
      * Write the line separator value to the output stream.
+     * 
      * @throws IOException If there is a problem writing out the lineseparator to the document.
      */
     protected void writeLineSeparator() throws IOException
@@ -737,6 +721,7 @@ public class PDFTextStripper extends PDF
 
     /**
      * Write the word separator value to the output stream.
+     * 
      * @throws IOException If there is a problem writing out the wordseparator to the document.
      */
     protected void writeWordSeparator() throws IOException
@@ -756,8 +741,8 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Write a Java string to the output stream. The default implementation will ignore the
-     * <code>textPositions</code> and just calls {@link #writeString(String)}.
+     * Write a Java string to the output stream. The default implementation will ignore the <code>textPositions</code>
+     * and just calls {@link #writeString(String)}.
      *
      * @param text The text to write to the stream.
      * @param textPositions The TextPositions belonging to the text.
@@ -792,8 +777,8 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * This will process a TextPosition object and add the text to the list of characters on a page.
-     * It takes care of overlapping text.
+     * This will process a TextPosition object and add the text to the list of characters on a page. It takes care of
+     * overlapping text.
      *
      * @param text The text to process.
      */
@@ -807,32 +792,32 @@ public class PDFTextStripper extends PDF
             String textCharacter = text.getUnicode();
             float textX = text.getX();
             float textY = text.getY();
-            TreeMap<Float, TreeSet<Float>> sameTextCharacters =
-                    characterListMapping.get(textCharacter);
+            TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping
+                    .get(textCharacter);
             if (sameTextCharacters == null)
             {
                 sameTextCharacters = new TreeMap<Float, TreeSet<Float>>();
                 characterListMapping.put(textCharacter, sameTextCharacters);
             }
             // RDD - Here we compute the value that represents the end of the rendered
-            // text.  This value is used to determine whether subsequent text rendered
+            // text. This value is used to determine whether subsequent text rendered
             // on the same line overwrites the current text.
             //
             // We subtract any positive padding to handle cases where extreme amounts
             // of padding are applied, then backed off (not sure why this is done, but there
             // are cases where the padding is on the order of 10x the character width, and
-            // the TJ just backs up to compensate after each character).  Also, we subtract
+            // the TJ just backs up to compensate after each character). Also, we subtract
             // an amount to allow for kerning (a percentage of the width of the last
             // character).
             boolean suppressCharacter = false;
-            float tolerance = text.getWidth()/textCharacter.length() / 3.0f;
-            
+            float tolerance = text.getWidth() / textCharacter.length() / 3.0f;
+
             SortedMap<Float, TreeSet<Float>> xMatches = sameTextCharacters.subMap(textX - tolerance,
                     textX + tolerance);
-            for (TreeSet<Float> xMatch : xMatches.values()) 
+            for (TreeSet<Float> xMatch : xMatches.values())
             {
-                SortedSet<Float> yMatches = xMatch.subSet(textY - tolerance , textY + tolerance);
-                if (!yMatches.isEmpty()) 
+                SortedSet<Float> yMatches = xMatch.subSet(textY - tolerance, textY + tolerance);
+                if (!yMatches.isEmpty())
                 {
                     suppressCharacter = true;
                     break;
@@ -841,10 +826,10 @@ public class PDFTextStripper extends PDF
             if (!suppressCharacter)
             {
                 TreeSet<Float> ySet = sameTextCharacters.get(textX);
-                if (ySet == null) 
+                if (ySet == null)
                 {
                     ySet = new TreeSet<Float>();
-                    sameTextCharacters.put(textX,  ySet);
+                    sameTextCharacters.put(textX, ySet);
                 }
                 ySet.add(textY);
                 showCharacter = true;
@@ -871,19 +856,18 @@ public class PDFTextStripper extends PDF
                         {
                             foundArticleDivisionIndex = i * 2 + 1;
                         }
-                        else if ((x < rect.getLowerLeftX() ||
-                                y < rect.getUpperRightY()) &&
-                                notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
+                        else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY())
+                                && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
                         {
                             notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
                         }
-                        else if (x < rect.getLowerLeftX() &&
-                                notFoundButFirstLeftArticleDivisionIndex == -1)
+                        else if (x < rect.getLowerLeftX()
+                                && notFoundButFirstLeftArticleDivisionIndex == -1)
                         {
                             notFoundButFirstLeftArticleDivisionIndex = i * 2;
                         }
-                        else if (y < rect.getUpperRightY() &&
-                                notFoundButFirstAboveArticleDivisionIndex == -1)
+                        else if (y < rect.getUpperRightY()
+                                && notFoundButFirstAboveArticleDivisionIndex == -1)
                         {
                             notFoundButFirstAboveArticleDivisionIndex = i * 2;
                         }
@@ -923,8 +907,8 @@ public class PDFTextStripper extends PDF
             List<TextPosition> textList = charactersByArticle.get(articleDivisionIndex);
 
             // In the wild, some PDF encoded documents put diacritics (accents on
-            // top of characters) into a separate Tj element.  When displaying them
-            // graphically, the two chunks get overlayed.  With text output though,
+            // top of characters) into a separate Tj element. When displaying them
+            // graphically, the two chunks get overlaid. With text output though,
             // we need to do the overlay. This code recombines the diacritic with
             // its associated character if the two are consecutive.
             if (textList.isEmpty())
@@ -947,7 +931,7 @@ public class PDFTextStripper extends PDF
                 else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
                 {
                     text.mergeDiacritic(previousTextPosition);
-                    textList.remove(textList.size()-1);
+                    textList.remove(textList.size() - 1);
                     textList.add(text);
                 }
                 else
@@ -959,10 +943,9 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * This is the page that the text extraction will start on.  The pages start
-     * at page 1.  For example in a 5 page PDF document, if the start page is 1
-     * then all pages will be extracted.  If the start page is 4 then pages 4 and 5
-     * will be extracted.  The default value is 1.
+     * This is the page that the text extraction will start on. The pages start at page 1. For example in a 5 page PDF
+     * document, if the start page is 1 then all pages will be extracted. If the start page is 4 then pages 4 and 5 will
+     * be extracted. The default value is 1.
      *
      * @return Value of property startPage.
      */
@@ -982,10 +965,9 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * This will get the last page that will be extracted.  This is inclusive,
-     * for example if a 5 page PDF an endPage value of 5 would extract the
-     * entire document, an end page of 2 would extract pages 1 and 2.  This defaults
-     * to Integer.MAX_VALUE such that all pages of the pdf will be extracted.
+     * This will get the last page that will be extracted. This is inclusive, for example if a 5 page PDF an endPage
+     * value of 5 would extract the entire document, an end page of 2 would extract pages 1 and 2. This defaults to
+     * Integer.MAX_VALUE such that all pages of the pdf will be extracted.
      *
      * @return Value of property endPage.
      */
@@ -1005,9 +987,8 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Set the desired line separator for output text.  The line.separator
-     * system property is used if the line separator preference is not set
-     * explicitly using this method.
+     * Set the desired line separator for output text. The line.separator system property is used if the line separator
+     * preference is not set explicitly using this method.
      *
      * @param separator The desired line separator string.
      */
@@ -1037,11 +1018,10 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Set the desired word separator for output text.  The PDFBox text extraction
-     * algorithm will output a space character if there is enough space between
-     * two words.  By default a space character is used.  If you need and accurate
-     * count of characters that are found in a PDF document then you might want to
-     * set the word separator to the empty string.
+     * Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space
+     * character if there is enough space between two words. By default a space character is used. If you need an
+     * accurate count of characters that are found in a PDF document then you might want to set the word separator to
+     * the empty string.
      *
      * @param separator The desired page separator string.
      */
@@ -1079,9 +1059,8 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Character strings are grouped by articles.  It is quite common that there
-     * will only be a single article.  This returns a List that contains List objects,
-     * the inner lists will contain TextPosition objects.
+     * Character strings are grouped by articles. It is quite common that there will only be a single article. This
+     * returns a List that contains List objects, the inner lists will contain TextPosition objects.
      *
      * @return A double List of TextPositions for all text strings on the page.
      */
@@ -1091,10 +1070,9 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * By default the text stripper will attempt to remove text that overlapps each other.
-     * Word paints the same character several times in order to make it look bold.  By setting
-     * this to false all text will be extracted, which means that certain sections will be
-     * duplicated, but better performance will be noticed.
+     * By default the text stripper will attempt to remove text that overlaps each other. Word paints the same
+     * character several times in order to make it look bold. By setting this to false all text will be extracted, which
+     * means that certain sections will be duplicated, but better performance will be noticed.
      *
      * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set.
      */
@@ -1114,8 +1092,7 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Set if the text stripper should group the text output by a list of beads.
-     * The default value is true!
+     * Set if the text stripper should group the text output by a list of beads. The default value is true!
      *
      * @param aShouldSeparateByBeads The new grouping of beads.
      */
@@ -1145,7 +1122,7 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Get the bookmark where text extraction should start, inclusive.  Default is null.
+     * Get the bookmark where text extraction should start, inclusive. Default is null.
      *
      * @return The starting bookmark.
      */
@@ -1166,16 +1143,17 @@ public class PDFTextStripper extends PDF
 
     /**
      * This will tell if the text stripper should add some more text formatting.
+     * 
      * @return true if some more text formatting will be added
      */
     public boolean getAddMoreFormatting()
     {
         return addMoreFormatting;
     }
-    
+
     /**
-     * There will some additional text formatting be added if addMoreFormatting
-     * is set to true. Default is false. 
+     * Some additional text formatting will be added if addMoreFormatting is set to true. Default is false.
+     * 
      * @param newAddMoreFormatting Tell PDFBox to add some more text formatting
      */
     public void setAddMoreFormatting(boolean newAddMoreFormatting)
@@ -1184,8 +1162,7 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * This will tell if the text stripper should sort the text tokens
-     * before writing to the stream.
+     * This will tell if the text stripper should sort the text tokens before writing to the stream.
      *
      * @return true If the text tokens will be sorted before being written.
      */
@@ -1195,15 +1172,13 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * The order of the text tokens in a PDF file may not be in the same
-     * as they appear visually on the screen.  For example, a PDF writer may
-     * write out all text by font, so all bold or larger text, then make a second
-     * pass and write out the normal text.<br/>
+     * The text tokens in a PDF file may not be in the same order as they appear visually on the screen. For
+     * example, a PDF writer may write out all text by font, so all bold or larger text, then make a second pass and
+     * write out the normal text.<br/>
      * The default is to <b>not</b> sort by position.<br/>
      * <br/>
-     * A PDF writer could choose to write each character in a different order.  By
-     * default PDFBox does <b>not</b> sort the text tokens before processing them due to
-     * performance reasons.
+     * A PDF writer could choose to write each character in a different order. By default PDFBox does <b>not</b> sort
+     * the text tokens before processing them due to performance reasons.
      *
      * @param newSortByPosition Tell PDFBox to sort the text positions.
      */
@@ -1213,22 +1188,20 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Get the current space width-based tolerance value that is being used
-     * to estimate where spaces in text should be added.  Note that the
-     * default value for this has been determined from trial and error. 
+     * Get the current space width-based tolerance value that is being used to estimate where spaces in text should be
+     * added. Note that the default value for this has been determined from trial and error.
      * 
      * @return The current tolerance / scaling factor
      */
-    public float getSpacingTolerance() 
+    public float getSpacingTolerance()
     {
         return spacingTolerance;
     }
 
     /**
-     * Set the space width-based tolerance value that is used
-     * to estimate where spaces in text should be added.  Note that the
-     * default value for this has been determined from trial and error.
-     * Setting this value larger will reduce the number of spaces added. 
+     * Set the space width-based tolerance value that is used to estimate where spaces in text should be added. Note
+     * that the default value for this has been determined from trial and error. Setting this value larger will reduce
+     * the number of spaces added.
      * 
      * @param spacingToleranceValue tolerance / scaling factor to use
      */
@@ -1238,91 +1211,77 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Get the current character width-based tolerance value that is being used
-     * to estimate where spaces in text should be added.  Note that the
-     * default value for this has been determined from trial and error.
+     * Get the current character width-based tolerance value that is being used to estimate where spaces in text should
+     * be added. Note that the default value for this has been determined from trial and error.
      * 
      * @return The current tolerance / scaling factor
      */
-    public float getAverageCharTolerance() 
+    public float getAverageCharTolerance()
     {
         return averageCharTolerance;
     }
 
     /**
-     * Set the character width-based tolerance value that is used
-     * to estimate where spaces in text should be added.  Note that the
-     * default value for this has been determined from trial and error.
-     * Setting this value larger will reduce the number of spaces added. 
+     * Set the character width-based tolerance value that is used to estimate where spaces in text should be added. Note
+     * that the default value for this has been determined from trial and error. Setting this value larger will reduce
+     * the number of spaces added.
      * 
      * @param averageCharToleranceValue average tolerance / scaling factor to use
      */
-    public void setAverageCharTolerance(float averageCharToleranceValue) 
+    public void setAverageCharTolerance(float averageCharToleranceValue)
     {
         averageCharTolerance = averageCharToleranceValue;
     }
 
-
     /**
-     * returns the multiple of whitespace character widths
-     * for the current text which the current
-     * line start can be indented from the previous line start
-     * beyond which the current line start is considered
-     * to be a paragraph start.
-     * @return the number of whitespace character widths to use
-     * when detecting paragraph indents.
+     * returns the multiple of whitespace character widths for the current text which the current line start can be
+     * indented from the previous line start beyond which the current line start is considered to be a paragraph start.
+     * 
+     * @return the number of whitespace character widths to use when detecting paragraph indents.
      */
-    public float getIndentThreshold() 
+    public float getIndentThreshold()
     {
         return indentThreshold;
     }
 
     /**
-     * sets the multiple of whitespace character widths
-     * for the current text which the current
-     * line start can be indented from the previous line start
-     * beyond which the current line start is considered
-     * to be a paragraph start.  The default value is 2.0.
+     * sets the multiple of whitespace character widths for the current text which the current line start can be
+     * indented from the previous line start beyond which the current line start is considered to be a paragraph start.
+     * The default value is 2.0.
      *
-     * @param indentThresholdValue the number of whitespace character widths to use
-     * when detecting paragraph indents.
+     * @param indentThresholdValue the number of whitespace character widths to use when detecting paragraph indents.
      */
-    public void setIndentThreshold(float indentThresholdValue) 
+    public void setIndentThreshold(float indentThresholdValue)
     {
         indentThreshold = indentThresholdValue;
     }
 
     /**
-     * the minimum whitespace, as a multiple
-     * of the max height of the current characters
-     * beyond which the current line start is considered
-     * to be a paragraph start.
-     * @return the character height multiple for
-     * max allowed whitespace between lines in
-     * the same paragraph.
+     * the minimum whitespace, as a multiple of the max height of the current characters beyond which the current line
+     * start is considered to be a paragraph start.
+     * 
+     * @return the character height multiple for max allowed whitespace between lines in the same paragraph.
      */
-    public float getDropThreshold() 
+    public float getDropThreshold()
     {
         return dropThreshold;
     }
 
     /**
-     * sets the minimum whitespace, as a multiple
-     * of the max height of the current characters
-     * beyond which the current line start is considered
-     * to be a paragraph start.  The default value is 2.5.
-     *
-     * @param dropThresholdValue the character height multiple for
-     * max allowed whitespace between lines in
-     * the same paragraph.
+     * sets the minimum whitespace, as a multiple of the max height of the current characters beyond which the current
+     * line start is considered to be a paragraph start. The default value is 2.5.
+     *
+     * @param dropThresholdValue the character height multiple for max allowed whitespace between lines in the same
+     * paragraph.
      */
-    public void setDropThreshold(float dropThresholdValue) 
+    public void setDropThreshold(float dropThresholdValue)
     {
         dropThreshold = dropThresholdValue;
     }
 
     /**
      * Returns the string which will be used at the beginning of a paragraph.
+     * 
      * @return the paragraph start string
      */
     public String getParagraphStart()
@@ -1332,6 +1291,7 @@ public class PDFTextStripper extends PDF
 
     /**
      * Sets the string which will be used at the beginning of a paragraph.
+     * 
      * @param s the paragraph start string
      */
     public void setParagraphStart(String s)
@@ -1341,6 +1301,7 @@ public class PDFTextStripper extends PDF
 
     /**
      * Returns the string which will be used at the end of a paragraph.
+     * 
      * @return the paragraph end string
      */
     public String getParagraphEnd()
@@ -1350,6 +1311,7 @@ public class PDFTextStripper extends PDF
 
     /**
      * Sets the string which will be used at the end of a paragraph.
+     * 
      * @param s the paragraph end string
      */
     public void setParagraphEnd(String s)
@@ -1357,63 +1319,69 @@ public class PDFTextStripper extends PDF
         paragraphEnd = s;
     }
 
-
     /**
      * Returns the string which will be used at the beginning of a page.
+     * 
      * @return the page start string
      */
-    public String getPageStart() 
+    public String getPageStart()
     {
         return pageStart;
     }
 
     /**
      * Sets the string which will be used at the beginning of a page.
+     * 
      * @param pageStartValue the page start string
      */
-    public void setPageStart(String pageStartValue) 
+    public void setPageStart(String pageStartValue)
     {
         pageStart = pageStartValue;
     }
 
     /**
      * Returns the string which will be used at the end of a page.
+     * 
      * @return the page end string
      */
-    public String getPageEnd() 
+    public String getPageEnd()
     {
         return pageEnd;
     }
 
     /**
      * Sets the string which will be used at the end of a page.
+     * 
      * @param pageEndValue the page end string
      */
-    public void setPageEnd(String pageEndValue) 
+    public void setPageEnd(String pageEndValue)
     {
         pageEnd = pageEndValue;
     }
 
     /**
      * Returns the string which will be used at the beginning of an article.
+     * 
      * @return the article start string
      */
-    public String getArticleStart() 
+    public String getArticleStart()
     {
         return articleStart;
     }
 
     /**
      * Sets the string which will be used at the beginning of an article.
+     * 
      * @param articleStartValue the article start string
      */
-    public void setArticleStart(String articleStartValue) 
+    public void setArticleStart(String articleStartValue)
     {
         articleStart = articleStartValue;
     }
 
     /**
      * Returns the string which will be used at the end of an article.
+     * 
      * @return the article end string
      */
     public String getArticleEnd()
@@ -1423,6 +1391,7 @@ public class PDFTextStripper extends PDF
 
     /**
      * Sets the string which will be used at the end of an article.
+     * 
      * @param articleEndValue the article end string
      */
     public void setArticleEnd(String articleEndValue)
@@ -1431,8 +1400,8 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * handles the line separator for a new line given
-     * the specified current and previous TextPositions.
+     * handles the line separator for a new line given the specified current and previous TextPositions.
+     * 
      * @param current the current text position
      * @param lastPosition the previous text position
      * @param lastLineStartPosition the last text position that followed a line separator.
@@ -1447,67 +1416,65 @@ public class PDFTextStripper extends PDF
         current.setLineStart();
         isParagraphSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
         lastLineStartPosition = current;
-        if (current.isParagraphStart())  
+        if (current.isParagraphStart())
         {
-            if (lastPosition.isArticleStart()) 
+            if (lastPosition.isArticleStart())
             {
                 writeParagraphStart();
             }
-            else 
+            else
             {
                 writeLineSeparator();
                 writeParagraphSeparator();
             }
         }
-        else 
+        else
         {
             writeLineSeparator();
         }
         return lastLineStartPosition;
     }
-    
+
     /**
-     * tests the relationship between the last text position, the current text
-     * position and the last text position that followed a line separator to
-     * decide if the gap represents a paragraph separation. This should
-     * <i>only</i> be called for consecutive text positions that first pass the
-     * line separation test.
+     * tests the relationship between the last text position, the current text position and the last text position that
+     * followed a line separator to decide if the gap represents a paragraph separation. This should <i>only</i> be
+     * called for consecutive text positions that first pass the line separation test.
      * <p>
-     * This base implementation tests to see if the lastLineStartPosition is
-     * null OR if the current vertical position has dropped below the last text
-     * vertical position by at least 2.5 times the current text height OR if the
-     * current horizontal position is indented by at least 2 times the current
-     * width of a space character.</p>
+     * This base implementation tests to see if the lastLineStartPosition is null OR if the current vertical position
+     * has dropped below the last text vertical position by at least 2.5 times the current text height OR if the current
+     * horizontal position is indented by at least 2 times the current width of a space character.
+     * </p>
      * <p>
-     * This also attempts to identify text that is indented under a hanging indent.</p>
+     * This also attempts to identify text that is indented under a hanging indent.
+     * </p>
      * <p>
-     * This method sets the isParagraphStart and isHangingIndent flags on the current
-     * position object.</p>
+     * This method sets the isParagraphStart and isHangingIndent flags on the current position object.
+     * </p>
      *
-     * @param position the current text position.  This may have its isParagraphStart
-     * or isHangingIndent flags set upon return.
+     * @param position the current text position. This may have its isParagraphStart or isHangingIndent flags set upon
+     * return.
      * @param lastPosition the previous text position (should not be null).
      * @param lastLineStartPosition the last text position that followed a line separator, or null.
      * @param maxHeightForLine max height for text positions since lasLineStartPosition.
      */
-    private void isParagraphSeparation(PositionWrapper position,  
-            PositionWrapper lastPosition, PositionWrapper lastLineStartPosition,
-            float maxHeightForLine)
+    private void isParagraphSeparation(PositionWrapper position, PositionWrapper lastPosition,
+            PositionWrapper lastLineStartPosition, float maxHeightForLine)
     {
         boolean result = false;
-        if (lastLineStartPosition == null) 
+        if (lastLineStartPosition == null)
         {
             result = true;
         }
         else
         {
-            float yGap = Math.abs(position.getTextPosition().getYDirAdj() -
-                    lastPosition.getTextPosition().getYDirAdj());
+            float yGap = Math.abs(position.getTextPosition().getYDirAdj()
+                    - lastPosition.getTextPosition().getYDirAdj());
             float newYVal = multiplyFloat(getDropThreshold(), maxHeightForLine);
             // do we need to flip this for rtl?
-            float xGap = position.getTextPosition().getXDirAdj() -
-                    lastLineStartPosition.getTextPosition().getXDirAdj();
-            float newXVal = multiplyFloat(getIndentThreshold(), position.getTextPosition().getWidthOfSpace());
+            float xGap = position.getTextPosition().getXDirAdj()
+                    - lastLineStartPosition.getTextPosition().getXDirAdj();
+            float newXVal = multiplyFloat(getIndentThreshold(),
+                    position.getTextPosition().getWidthOfSpace());
             float positionWidth = multiplyFloat(0.25f, position.getTextPosition().getWidth());
 
             if (yGap > newYVal)
@@ -1519,11 +1486,11 @@ public class PDFTextStripper extends PDF
                 // text is indented, but try to screen for hanging indent
                 if (!lastLineStartPosition.isParagraphStart())
                 {
-                     result = true;
+                    result = true;
                 }
                 else
                 {
-                     position.setHangingIndent();
+                    position.setHangingIndent();
                 }
             }
             else if (xGap < -position.getTextPosition().getWidthOfSpace())
@@ -1547,7 +1514,7 @@ public class PDFTextStripper extends PDF
                     // check to see if the previous line looks like
                     // any of a number of standard list item formats
                     Pattern liPattern = matchListItemPattern(lastLineStartPosition);
-                    if (liPattern!=null)
+                    if (liPattern != null)
                     {
                         Pattern currentPattern = matchListItemPattern(position);
                         if (liPattern == currentPattern)
@@ -1555,8 +1522,8 @@ public class PDFTextStripper extends PDF
                             result = true;
                         }
                     }
-               }
-           }
+                }
+            }
         }
         if (result)
         {
@@ -1570,11 +1537,13 @@ public class PDFTextStripper extends PDF
         // to avoid wrong results when comparing with another float
         return Math.round(value1 * value2 * 1000) / 1000f;
     }
+
     /**
      * writes the paragraph separator string to the output.
+     * 
      * @throws IOException if something went wrong
      */
-    protected void writeParagraphSeparator()throws IOException
+    protected void writeParagraphSeparator() throws IOException
     {
         writeParagraphEnd();
         writeParagraphStart();
@@ -1582,11 +1551,12 @@ public class PDFTextStripper extends PDF
 
     /**
      * Write something (if defined) at the start of a paragraph.
+     * 
      * @throws IOException if something went wrong
      */
     protected void writeParagraphStart() throws IOException
     {
-        if (inParagraph) 
+        if (inParagraph)
         {
             writeParagraphEnd();
             inParagraph = false;
@@ -1597,6 +1567,7 @@ public class PDFTextStripper extends PDF
 
     /**
      * Write something (if defined) at the end of a paragraph.
+     * 
      * @throws IOException if something went wrong
      */
     protected void writeParagraphEnd() throws IOException
@@ -1611,64 +1582,52 @@ public class PDFTextStripper extends PDF
 
     /**
      * Write something (if defined) at the start of a page.
+     * 
      * @throws IOException if something went wrong
      */
-    protected void writePageStart()throws IOException
+    protected void writePageStart() throws IOException
     {
         output.write(getPageStart());
     }
 
     /**
      * Write something (if defined) at the end of a page.
+     * 
      * @throws IOException if something went wrong
      */
-    protected void writePageEnd()throws IOException
+    protected void writePageEnd() throws IOException
     {
         output.write(getPageEnd());
     }
 
     /**
-     * returns the list item Pattern object that matches
-     * the text at the specified PositionWrapper or null
-     * if the text does not match such a pattern.  The list
-     * of Patterns tested against is given by the
-     * {@link #getListItemPatterns()} method.  To add to
-     * the list, simply override that method (if sub-classing)
-     * or explicitly supply your own list using
-     * {@link #setListItemPatterns(List)}.
+     * returns the list item Pattern object that matches the text at the specified PositionWrapper or null if the text
+     * does not match such a pattern. The list of Patterns tested against is given by the {@link #getListItemPatterns()}
+     * method. To add to the list, simply override that method (if sub-classing) or explicitly supply your own list
+     * using {@link #setListItemPatterns(List)}.
+     * 
      * @param pw position
      * @return the matching pattern
      */
-    private Pattern matchListItemPattern(PositionWrapper pw) 
+    private Pattern matchListItemPattern(PositionWrapper pw)
     {
         TextPosition tp = pw.getTextPosition();
         String txt = tp.getUnicode();
-        return matchPattern(txt,getListItemPatterns());
+        return matchPattern(txt, getListItemPatterns());
     }
 
     /**
-     * a list of regular expressions that match commonly used
-     * list item formats, i.e. bullets, numbers, letters,
-     * Roman numerals, etc. Not meant to be
-     * comprehensive.
-     */
-    private static final String[] LIST_ITEM_EXPRESSIONS = {
-            "\\.",
-            "\\d+\\.",
-            "\\[\\d+\\]",
-            "\\d+\\)",
-            "[A-Z]\\.",
-            "[a-z]\\.",
-            "[A-Z]\\)",
-            "[a-z]\\)",
-            "[IVXL]+\\.",
-            "[ivxl]+\\.",
-    };
+     * a list of regular expressions that match commonly used list item formats, i.e. bullets, numbers, letters, Roman
+     * numerals, etc. Not meant to be comprehensive.
+     */
+    private static final String[] LIST_ITEM_EXPRESSIONS = { "\\.", "\\d+\\.", "\\[\\d+\\]",
+            "\\d+\\)", "[A-Z]\\.", "[a-z]\\.", "[A-Z]\\)", "[a-z]\\)", "[IVXL]+\\.",
+            "[ivxl]+\\.", };
 
     private List<Pattern> listOfPatterns = null;
+
     /**
-     * use to supply a different set of regular expression
-     * patterns for matching list item starts.
+     * use to supply a different set of regular expression patterns for matching list item starts.
      *
      * @param patterns list of patterns
      */
@@ -1678,8 +1637,7 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * returns a list of regular expression Patterns representing
-     * different common list item formats.  For example
+     * returns a list of regular expression Patterns representing different common list item formats. For example
      * numbered items of form:
      * <ol>
      * <li>some text</li>
@@ -1690,10 +1648,11 @@ public class PDFTextStripper extends PDF
      * <li>some text</li>
      * <li>more text</li>
      * </ul>
-     * etc., all begin with some character pattern. The pattern "\\d+\." (matches "1.", "2.", ...)
-     * or "\[\\d+\]" (matches "[1]", "[2]", ...).
+     * etc., all begin with some character pattern, e.g. the pattern "\\d+\\." (matches "1.", "2.", ...) or
+     * "\\[\\d+\\]" (matches "[1]", "[2]", ...).
      * <p>
      * This method returns a list of such regular expression Patterns.
+     * 
      * @return a list of Pattern objects.
      */
     protected List<Pattern> getListItemPatterns()
@@ -1711,16 +1670,14 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * iterates over the specified list of Patterns until
-     * it finds one that matches the specified string.  Then
-     * returns the Pattern.
+     * iterates over the specified list of Patterns until it finds one that matches the specified string. Then returns
+     * the Pattern.
      * <p>
-     * Order of the supplied list of patterns is important as
-     * most common patterns should come first.  Patterns
-     * should be strict in general, and all will be
-     * used with case sensitivity on.
+     * Order of the supplied list of patterns is important as most common patterns should come first. Patterns should be
+     * strict in general, and all will be used with case sensitivity on.
      * </p>
-     * @param string the string to be searched 
+     * 
+     * @param string the string to be searched
      * @param patterns list of patterns
      * @return matching pattern
      */
@@ -1738,6 +1695,7 @@ public class PDFTextStripper extends PDF
 
     /**
      * Write a list of string containing a whole line of a document.
+     * 
      * @param line a list with the words of the given line
      * @param isRtlDominant determines if rtl or ltl is dominant
      * @throws IOException if something went wrong
@@ -1759,13 +1717,14 @@ public class PDFTextStripper extends PDF
 
     /**
      * Normalize the given list of TextPositions.
+     * 
      * @param line list of TextPositions
-     * @param isRtlDominant determines if rtl or ltl is dominant 
+     * @param isRtlDominant determines if rtl or ltr is dominant
      * @param hasRtl determines if lines contains rtl formatted text(parts)
      * @return a list of strings, one string for every word
      */
     private List<WordWithTextPositions> normalize(List<LineItem> line, boolean isRtlDominant,
-                                                  boolean hasRtl)
+            boolean hasRtl)
     {
         List<WordWithTextPositions> normalized = new LinkedList<WordWithTextPositions>();
         StringBuilder lineBuilder = new StringBuilder();
@@ -1786,7 +1745,7 @@ public class PDFTextStripper extends PDF
                 lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, item);
             }
         }
-        if (lineBuilder.length() > 0) 
+        if (lineBuilder.length() > 0)
         {
             normalized.add(createWord(lineBuilder.toString(), wordPositions));
         }
@@ -1794,8 +1753,7 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Used within {@link #normalize(List, boolean, boolean)} to create a single
-     * {@link WordWithTextPositions} entry.
+     * Used within {@link #normalize(List, boolean, boolean)} to create a single {@link WordWithTextPositions} entry.
      */
     private WordWithTextPositions createWord(String word, List<TextPosition> wordPositions)
     {
@@ -1803,8 +1761,8 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Normalize certain Unicode characters. For example, convert the
-     * single "fi" ligature to "f" and "i". Also normalises Arabic and Hebrew presentation forms.
+     * Normalize certain Unicode characters. For example, convert the single "fi" ligature to "f" and "i". Also
+     * normalises Arabic and Hebrew presentation forms.
      *
      * @param word Word to normalize
      * @return Normalized word
@@ -1833,14 +1791,16 @@ public class PDFTextStripper extends PDF
                 // Some fonts map U+FDF2 differently than the Unicode spec.
                 // They add an extra U+0627 character to compensate.
                 // This removes the extra character for those fonts.
-                if(c == 0xFDF2 && q > 0 && (word.charAt(q-1) == 0x0627 || word.charAt(q-1) == 0xFE8D))
+                if (c == 0xFDF2 && q > 0
+                        && (word.charAt(q - 1) == 0x0627 || word.charAt(q - 1) == 0xFE8D))
                 {
                     builder.append("\u0644\u0644\u0647");
                 }
                 else
                 {
                     // Trim because some decompositions have an extra space, such as U+FC5E
-                    builder.append(Normalizer.normalize(word.substring(q, q + 1), Normalizer.Form.NFKC).trim());
+                    builder.append(Normalizer
+                            .normalize(word.substring(q, q + 1), Normalizer.Form.NFKC).trim());
                 }
                 p = q + 1;
             }
@@ -1858,6 +1818,7 @@ public class PDFTextStripper extends PDF
 
     /**
      * Used within {@link #normalize(List, boolean, boolean)} to handle a {@link TextPosition}.
+     * 
      * @return The StringBuilder that must be used when calling this method.
      */
     private StringBuilder normalizeAdd(List<WordWithTextPositions> normalized,
@@ -1865,12 +1826,12 @@ public class PDFTextStripper extends PDF
     {
         if (item.isWordSeparator())
         {
-            normalized.add(createWord(lineBuilder.toString(),
-                    new ArrayList<TextPosition>(wordPositions)));
+            normalized.add(
+                    createWord(lineBuilder.toString(), new ArrayList<TextPosition>(wordPositions)));
             lineBuilder = new StringBuilder();
             wordPositions.clear();
         }
-        else 
+        else
         {
             TextPosition text = item.getTextPosition();
             lineBuilder.append(text.getUnicode());
@@ -1915,9 +1876,8 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Internal class that maps strings to lists of {@link TextPosition} arrays.
-     * Note that the number of entries in that list may differ from the number of characters in the
-     * string due to normalization.
+     * Internal class that pairs a string with the list of its {@link TextPosition} objects. Note that the number of
+     * entries in that list may differ from the number of characters in the string due to normalization.
      *
      * @author Axel Dörfler
      */
@@ -1925,13 +1885,13 @@ public class PDFTextStripper extends PDF
     {
         String text;
         List<TextPosition> textPositions;
-        
+
         WordWithTextPositions(String word, List<TextPosition> positions)
         {
             text = word;
             textPositions = positions;
         }
-        
+
         public String getText()
         {
             return text;
@@ -1944,15 +1904,13 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * wrapper of TextPosition that adds flags to track
-     * status as linestart and paragraph start positions.
+     * Wrapper of TextPosition that adds flags to track its status as a line start, paragraph start or article start.
      * <p>
-     * This is implemented as a wrapper since the TextPosition
-     * class doesn't provide complete access to its
-     * state fields to subclasses.  Also, conceptually TextPosition is
-     * immutable while these flags need to be set post-creation so
-     * it makes sense to put these flags in this separate class.
+     * This is implemented as a wrapper since the TextPosition class doesn't provide complete access to its state fields
+     * to subclasses. Also, conceptually TextPosition is immutable while these flags need to be set post-creation so it
+     * makes sense to put these flags in this separate class.
      * </p>
+     * 
      * @author m.martinez@ll.mit.edu
      */
     private static final class PositionWrapper
@@ -1977,6 +1935,7 @@ public class PDFTextStripper extends PDF
 
         /**
          * Returns the underlying TextPosition object.
+         * 
          * @return the text position
          */
         public TextPosition getTextPosition()
@@ -1997,7 +1956,6 @@ public class PDFTextStripper extends PDF
             this.isLineStart = true;
         }
 
-
         public boolean isParagraphStart()
         {
             return isParagraphStart;
@@ -2011,13 +1969,11 @@ public class PDFTextStripper extends PDF
             this.isParagraphStart = true;
         }
 
-
         public boolean isArticleStart()
         {
             return isArticleStart;
         }
 
-
         /**
          * Sets the isArticleStart() flag to true.
          */