You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ms...@apache.org on 2015/09/24 09:57:16 UTC
svn commit: r1705010 -
/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
Author: msahyoun
Date: Thu Sep 24 07:57:15 2015
New Revision: 1705010
URL: http://svn.apache.org/viewvc?rev=1705010&view=rev
Log:
PDFBOX-2252: reformat source to match new conventions
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java?rev=1705010&r1=1705009&r2=1705010&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java Thu Sep 24 07:57:15 2015
@@ -42,13 +42,12 @@ import org.apache.pdfbox.pdmodel.interac
import org.apache.pdfbox.util.QuickSort;
/**
- * This class will take a pdf document and strip out all of the text and ignore the
- * formatting and such. Please note; it is up to clients of this class to verify that
- * a specific user has the correct permissions to extract text from the PDF document.
+ * This class will take a pdf document and strip out all of the text and ignore the formatting and such. Please note; it
+ * is up to clients of this class to verify that a specific user has the correct permissions to extract text from the
+ * PDF document.
*
- * The basic flow of this process is that we get a document and use a series of
- * processXXX() functions that work on smaller and smaller chunks of the page.
- * Eventually, we fully process each page and then print it.
+ * The basic flow of this process is that we get a document and use a series of processXXX() functions that work on
+ * smaller and smaller chunks of the page. Eventually, we fully process each page and then print it.
*
* @author Ben Litchfield
*/
@@ -57,11 +56,11 @@ public class PDFTextStripper extends PDF
private static float defaultIndentThreshold = 2.0f;
private static float defaultDropThreshold = 2.5f;
private static final boolean useCustomQuickSort;
-
+
// enable the ability to set the default indent/drop thresholds
// with -D system properties:
- // pdftextstripper.indent
- // pdftextstripper.drop
+ // pdftextstripper.indent
+ // pdftextstripper.drop
static
{
String strDrop = null, strIndent = null;
@@ -100,8 +99,8 @@ public class PDFTextStripper extends PDF
// ignore and use default
}
}
-
- // check if we need to use the custom quicksort algorithm as a
+
+ // check if we need to use the custom quicksort algorithm as a
// workaround to the transitivity issue of TextPositionComparator:
// https://issues.apache.org/jira/browse/PDFBOX-1512
boolean is16orLess = false;
@@ -138,17 +137,17 @@ public class PDFTextStripper extends PDF
private int startPage = 1;
private int endPage = Integer.MAX_VALUE;
private PDOutlineItem startBookmark = null;
-
+
// 1-based bookmark pages
private int startBookmarkPageNumber = -1;
private int endBookmarkPageNumber = -1;
-
+
private PDOutlineItem endBookmark = null;
private boolean suppressDuplicateOverlappingText = true;
private boolean shouldSeparateByBeads = true;
private boolean sortByPosition = false;
private boolean addMoreFormatting = false;
-
+
private float indentThreshold = defaultIndentThreshold;
private float dropThreshold = defaultDropThreshold;
@@ -159,24 +158,19 @@ public class PDFTextStripper extends PDF
private List<PDThreadBead> pageArticles = null;
/**
- * The charactersByArticle is used to extract text by article divisions. For example
- * a PDF that has two columns like a newspaper, we want to extract the first column and
- * then the second column. In this example the PDF would have 2 beads(or articles), one for
- * each column. The size of the charactersByArticle would be 5, because not all text on the
- * screen will fall into one of the articles. The five divisions are shown below
- *
- * Text before first article
- * first article text
- * text between first article and second article
- * second article text
+ * The charactersByArticle is used to extract text by article divisions. For example a PDF that has two columns like
+ * a newspaper, we want to extract the first column and then the second column. In this example the PDF would have 2
+ * beads(or articles), one for each column. The size of the charactersByArticle would be 5, because not all text on
+ * the screen will fall into one of the articles. The five divisions are shown below
+ *
+ * Text before first article first article text text between first article and second article second article text
* text after second article
*
* Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
*/
protected Vector<List<TextPosition>> charactersByArticle = new Vector<List<TextPosition>>();
- private Map<String, TreeMap<Float, TreeSet<Float>>> characterListMapping =
- new HashMap<String, TreeMap<Float, TreeSet<Float>>>();
+ private Map<String, TreeMap<Float, TreeSet<Float>>> characterListMapping = new HashMap<String, TreeMap<Float, TreeSet<Float>>>();
protected PDDocument document;
protected Writer output;
@@ -196,7 +190,7 @@ public class PDFTextStripper extends PDF
}
/**
- * This will return the text of a document. See writeText. <br />
+ * This will return the text of a document. See writeText. <br />
* NOTE: The document must not be encrypted when coming into this method.
*
* @param doc The document to get the text from.
@@ -223,7 +217,7 @@ public class PDFTextStripper extends PDF
characterListMapping.clear();
}
}
-
+
/**
* This will take a PDDocument and write the text of that document to the print writer.
*
@@ -237,7 +231,7 @@ public class PDFTextStripper extends PDF
resetEngine();
document = doc;
output = outputStream;
- if (getAddMoreFormatting())
+ if (getAddMoreFormatting())
{
paragraphEnd = lineSeparator;
pageStart = lineSeparator;
@@ -259,9 +253,9 @@ public class PDFTextStripper extends PDF
protected void processPages(PDPageTree pages) throws IOException
{
PDPageTree pagesTree = document.getPages();
-
+
PDPage startBookmarkPage = startBookmark == null ? null
- : startBookmark.findDestinationPage(document);
+ : startBookmark.findDestinationPage(document);
if (startBookmarkPage != null)
{
startBookmarkPageNumber = pagesTree.indexOf(startBookmarkPage) + 1;
@@ -273,7 +267,7 @@ public class PDFTextStripper extends PDF
}
PDPage endBookmarkPage = endBookmark == null ? null
- : endBookmark.findDestinationPage(document);
+ : endBookmark.findDestinationPage(document);
if (endBookmarkPage != null)
{
endBookmarkPageNumber = pagesTree.indexOf(endBookmarkPage) + 1;
@@ -284,12 +278,12 @@ public class PDFTextStripper extends PDF
endBookmarkPageNumber = -1;
}
- if (startBookmarkPageNumber == -1 && startBookmark != null &&
- endBookmarkPageNumber == -1 && endBookmark != null &&
- startBookmark.getCOSObject() == endBookmark.getCOSObject())
+ if (startBookmarkPageNumber == -1 && startBookmark != null && endBookmarkPageNumber == -1
+ && endBookmark != null
+ && startBookmark.getCOSObject() == endBookmark.getCOSObject())
{
// this is a special case where both the start and end bookmark
- // are the same but point to nothing. In this case
+ // are the same but point to nothing. In this case
// we will not extract any text.
startBookmarkPageNumber = 0;
endBookmarkPageNumber = 0;
@@ -306,8 +300,7 @@ public class PDFTextStripper extends PDF
}
/**
- * This method is available for subclasses of this class. It will be called before processing
- * of the document start.
+ * This method is available for subclasses of this class. It will be called before processing of the document start.
*
* @param document The PDF document that is being processed.
* @throws IOException If an IO error occurs.
@@ -318,8 +311,8 @@ public class PDFTextStripper extends PDF
}
/**
- * This method is available for subclasses of this class. It will be called after processing
- * of the document finishes.
+ * This method is available for subclasses of this class. It will be called after processing of the document
+ * finishes.
*
* @param document The PDF document that is being processed.
* @throws IOException If an IO error occurs.
@@ -339,9 +332,9 @@ public class PDFTextStripper extends PDF
@Override
public void processPage(PDPage page) throws IOException
{
- if (currentPageNo >= startPage && currentPageNo <= endPage &&
- (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) &&
- (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber))
+ if (currentPageNo >= startPage && currentPageNo <= endPage
+ && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber)
+ && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber))
{
startPage(page);
pageArticles = page.getThreadBeads();
@@ -371,10 +364,8 @@ public class PDFTextStripper extends PDF
}
/**
- * Start a new article, which is typically defined as a column
- * on a single page (also referred to as a bead). This assumes
- * that the primary direction of text is left to right.
- * Default implementation is to do nothing. Subclasses
+ * Start a new article, which is typically defined as a column on a single page (also referred to as a bead). This
+ * assumes that the primary direction of text is left to right. Default implementation is to do nothing. Subclasses
* may provide additional information.
*
* @throws IOException If there is any error writing to the stream.
@@ -385,10 +376,8 @@ public class PDFTextStripper extends PDF
}
/**
- * Start a new article, which is typically defined as a column
- * on a single page (also referred to as a bead).
- * Default implementation is to do nothing. Subclasses
- * may provide additional information.
+ * Start a new article, which is typically defined as a column on a single page (also referred to as a bead).
+ * Default implementation is to do nothing. Subclasses may provide additional information.
*
* @param isLTR true if primary direction of text is left to right.
* @throws IOException If there is any error writing to the stream.
@@ -399,8 +388,7 @@ public class PDFTextStripper extends PDF
}
/**
- * End an article. Default implementation is to do nothing. Subclasses
- * may provide additional information.
+ * End an article. Default implementation is to do nothing. Subclasses may provide additional information.
*
* @throws IOException If there is any error writing to the stream.
*/
@@ -410,8 +398,7 @@ public class PDFTextStripper extends PDF
}
/**
- * Start a new page. Default implementation is to do nothing. Subclasses
- * may provide additional information.
+ * Start a new page. Default implementation is to do nothing. Subclasses may provide additional information.
*
* @param page The page we are about to process.
*
@@ -423,8 +410,7 @@ public class PDFTextStripper extends PDF
}
/**
- * End a page. Default implementation is to do nothing. Subclasses
- * may provide additional information.
+ * End a page. Default implementation is to do nothing. Subclasses may provide additional information.
*
* @param page The page we are about to process.
*
@@ -443,10 +429,9 @@ public class PDFTextStripper extends PDF
private static final float LAST_WORD_SPACING_RESET_VALUE = -1;
/**
- * This will print the text of the processed page to "output".
- * It will estimate, based on the coordinates of the text, where
- * newlines and word spacings should be placed. The text will be
- * sorted only if that feature was enabled.
+ * This will print the text of the processed page to "output". It will estimate, based on the coordinates of the
+ * text, where newlines and word spacings should be placed. The text will be sorted only if that feature was
+ * enabled.
*
* @throws IOException If there is an error writing the text.
*/
@@ -462,8 +447,8 @@ public class PDFTextStripper extends PDF
boolean startOfPage = true; // flag to indicate start of page
boolean startOfArticle;
- if (charactersByArticle.size() > 0)
- {
+ if (charactersByArticle.size() > 0)
+ {
writePageStart();
}
@@ -473,16 +458,16 @@ public class PDFTextStripper extends PDF
{
TextPositionComparator comparator = new TextPositionComparator();
- // because the TextPositionComparator is not transitive, but
+ // because the TextPositionComparator is not transitive, but
// JDK7+ enforces transitivity on comparators, we need to use
// a custom quicksort implementation (which is slower, unfortunately).
- if (useCustomQuickSort)
+ if (useCustomQuickSort)
{
QuickSort.sort(textList, comparator);
}
- else
+ else
{
- Collections.sort( textList, comparator );
+ Collections.sort(textList, comparator);
}
}
Iterator<TextPosition> textIter = textList.iterator();
@@ -491,7 +476,7 @@ public class PDFTextStripper extends PDF
// in its logical format, which means that the rightmost character is
// stored first, followed by the second character from the right etc.
// However, PDF stores the text in presentation form, which is left to
- // right. We need to do some normalization to convert the PDF data to
+ // right. We need to do some normalization to convert the PDF data to
// the proper logical output format.
//
// Note that if we did not sort the text, then the output of reversing the
@@ -510,16 +495,16 @@ public class PDFTextStripper extends PDF
for (int a = 0; a < stringValue.length(); a++)
{
byte dir = Character.getDirectionality(stringValue.charAt(a));
- if (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ||
- dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING ||
- dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE)
+ if (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT
+ || dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
+ || dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE)
{
ltrCount++;
}
- else if (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT ||
- dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC ||
- dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING ||
- dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)
+ else if (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT
+ || dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
+ || dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
+ || dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)
{
rtlCount++;
}
@@ -538,7 +523,7 @@ public class PDFTextStripper extends PDF
// the line from presentation form to logical form (if needed).
List<LineItem> line = new ArrayList<LineItem>();
- textIter = textList.iterator(); // start from the beginning again
+ textIter = textList.iterator(); // start from the beginning again
// PDF files don't always store spaces. We will need to guess where we should add
// spaces based on the distances between TextPositions. Historically, this was done
// based on the size of the space character provided by the font. In general, this
@@ -557,9 +542,9 @@ public class PDFTextStripper extends PDF
// Resets the average character width when we see a change in font
// or a change in the font size
- if (lastPosition != null &&
- (position.getFont() != lastPosition.getTextPosition().getFont() ||
- position.getFontSize() != lastPosition.getTextPosition().getFontSize()))
+ if (lastPosition != null && (position.getFont() != lastPosition.getTextPosition()
+ .getFont()
+ || position.getFontSize() != lastPosition.getTextPosition().getFontSize()))
{
previousAveCharWidth = -1;
}
@@ -647,33 +632,32 @@ public class PDFTextStripper extends PDF
startOfArticle = false;
}
// RDD - Here we determine whether this text object is on the current
- // line. We use the lastBaselineFontSize to handle the superscript
+ // line. We use the lastBaselineFontSize to handle the superscript
// case, and the size of the current font to handle the subscript case.
// Text must overlap with the last rendered baseline text by at least
// a small amount in order to be considered as being on the same line.
// XXX BC: In theory, this check should really check if the next char is in
// full range seen in this line. This is what I tried to do with minYTopForLine,
- // but this caused a lot of regression test failures. So, I'm leaving it be for
+ // but this caused a lot of regression test failures. So, I'm leaving it be for
// now
if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine))
{
writeLine(normalize(line, isRtlDominant, hasRtl), isRtlDominant);
line.clear();
- lastLineStartPosition =
- handleLineSeparation(current, lastPosition, lastLineStartPosition,
- maxHeightForLine);
+ lastLineStartPosition = handleLineSeparation(current, lastPosition,
+ lastLineStartPosition, maxHeightForLine);
expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
}
// test if our TextPosition starts after a new word would be expected to start
- if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE &&
- expectedStartOfNextWordX < positionX &&
- // only bother adding a space if the last character was not a space
- lastPosition.getTextPosition().getUnicode() != null &&
- !lastPosition.getTextPosition().getUnicode().endsWith(" "))
+ if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE
+ && expectedStartOfNextWordX < positionX &&
+ // only bother adding a space if the last character was not a space
+ lastPosition.getTextPosition().getUnicode() != null
+ && !lastPosition.getTextPosition().getUnicode().endsWith(" "))
{
line.add(LineItem.getWordSeparator());
}
@@ -683,7 +667,7 @@ public class PDFTextStripper extends PDF
maxYForLine = positionY;
}
// RDD - endX is what PDF considers to be the x coordinate of the
- // end position of the text. We use it in computing our metrics below.
+ // end position of the text. We use it in computing our metrics below.
endOfLastTextX = positionX + positionWidth;
// add it to the list
@@ -691,7 +675,7 @@ public class PDFTextStripper extends PDF
{
if (startOfPage && lastPosition == null)
{
- writeParagraphStart();//not sure this is correct for RTL?
+ writeParagraphStart();// not sure this is correct for RTL?
}
line.add(new LineItem(position));
}
@@ -721,13 +705,13 @@ public class PDFTextStripper extends PDF
private boolean overlap(float y1, float height1, float y2, float height2)
{
- return within(y1, y2, .1f) ||
- y2 <= y1 && y2 >= y1 - height1 ||
- y1 <= y2 && y1 >= y2 - height2;
+ return within(y1, y2, .1f) || y2 <= y1 && y2 >= y1 - height1
+ || y1 <= y2 && y1 >= y2 - height2;
}
/**
* Write the line separator value to the output stream.
+ *
* @throws IOException If there is a problem writing out the lineseparator to the document.
*/
protected void writeLineSeparator() throws IOException
@@ -737,6 +721,7 @@ public class PDFTextStripper extends PDF
/**
* Write the word separator value to the output stream.
+ *
* @throws IOException If there is a problem writing out the wordseparator to the document.
*/
protected void writeWordSeparator() throws IOException
@@ -756,8 +741,8 @@ public class PDFTextStripper extends PDF
}
/**
- * Write a Java string to the output stream. The default implementation will ignore the
- * <code>textPositions</code> and just calls {@link #writeString(String)}.
+ * Write a Java string to the output stream. The default implementation will ignore the <code>textPositions</code>
+ * and just calls {@link #writeString(String)}.
*
* @param text The text to write to the stream.
* @param textPositions The TextPositions belonging to the text.
@@ -792,8 +777,8 @@ public class PDFTextStripper extends PDF
}
/**
- * This will process a TextPosition object and add the text to the list of characters on a page.
- * It takes care of overlapping text.
+ * This will process a TextPosition object and add the text to the list of characters on a page. It takes care of
+ * overlapping text.
*
* @param text The text to process.
*/
@@ -807,32 +792,32 @@ public class PDFTextStripper extends PDF
String textCharacter = text.getUnicode();
float textX = text.getX();
float textY = text.getY();
- TreeMap<Float, TreeSet<Float>> sameTextCharacters =
- characterListMapping.get(textCharacter);
+ TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping
+ .get(textCharacter);
if (sameTextCharacters == null)
{
sameTextCharacters = new TreeMap<Float, TreeSet<Float>>();
characterListMapping.put(textCharacter, sameTextCharacters);
}
// RDD - Here we compute the value that represents the end of the rendered
- // text. This value is used to determine whether subsequent text rendered
+ // text. This value is used to determine whether subsequent text rendered
// on the same line overwrites the current text.
//
// We subtract any positive padding to handle cases where extreme amounts
// of padding are applied, then backed off (not sure why this is done, but there
// are cases where the padding is on the order of 10x the character width, and
- // the TJ just backs up to compensate after each character). Also, we subtract
+ // the TJ just backs up to compensate after each character). Also, we subtract
// an amount to allow for kerning (a percentage of the width of the last
// character).
boolean suppressCharacter = false;
- float tolerance = text.getWidth()/textCharacter.length() / 3.0f;
-
+ float tolerance = text.getWidth() / textCharacter.length() / 3.0f;
+
SortedMap<Float, TreeSet<Float>> xMatches = sameTextCharacters.subMap(textX - tolerance,
textX + tolerance);
- for (TreeSet<Float> xMatch : xMatches.values())
+ for (TreeSet<Float> xMatch : xMatches.values())
{
- SortedSet<Float> yMatches = xMatch.subSet(textY - tolerance , textY + tolerance);
- if (!yMatches.isEmpty())
+ SortedSet<Float> yMatches = xMatch.subSet(textY - tolerance, textY + tolerance);
+ if (!yMatches.isEmpty())
{
suppressCharacter = true;
break;
@@ -841,10 +826,10 @@ public class PDFTextStripper extends PDF
if (!suppressCharacter)
{
TreeSet<Float> ySet = sameTextCharacters.get(textX);
- if (ySet == null)
+ if (ySet == null)
{
ySet = new TreeSet<Float>();
- sameTextCharacters.put(textX, ySet);
+ sameTextCharacters.put(textX, ySet);
}
ySet.add(textY);
showCharacter = true;
@@ -871,19 +856,18 @@ public class PDFTextStripper extends PDF
{
foundArticleDivisionIndex = i * 2 + 1;
}
- else if ((x < rect.getLowerLeftX() ||
- y < rect.getUpperRightY()) &&
- notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
+ else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY())
+ && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
{
notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
}
- else if (x < rect.getLowerLeftX() &&
- notFoundButFirstLeftArticleDivisionIndex == -1)
+ else if (x < rect.getLowerLeftX()
+ && notFoundButFirstLeftArticleDivisionIndex == -1)
{
notFoundButFirstLeftArticleDivisionIndex = i * 2;
}
- else if (y < rect.getUpperRightY() &&
- notFoundButFirstAboveArticleDivisionIndex == -1)
+ else if (y < rect.getUpperRightY()
+ && notFoundButFirstAboveArticleDivisionIndex == -1)
{
notFoundButFirstAboveArticleDivisionIndex = i * 2;
}
@@ -923,8 +907,8 @@ public class PDFTextStripper extends PDF
List<TextPosition> textList = charactersByArticle.get(articleDivisionIndex);
// In the wild, some PDF encoded documents put diacritics (accents on
- // top of characters) into a separate Tj element. When displaying them
- // graphically, the two chunks get overlaid. With text output though,
+ // top of characters) into a separate Tj element. When displaying them
+ // graphically, the two chunks get overlaid. With text output though,
// we need to do the overlay. This code recombines the diacritic with
// its associated character if the two are consecutive.
if (textList.isEmpty())
@@ -947,7 +931,7 @@ public class PDFTextStripper extends PDF
else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
{
text.mergeDiacritic(previousTextPosition);
- textList.remove(textList.size()-1);
+ textList.remove(textList.size() - 1);
textList.add(text);
}
else
@@ -959,10 +943,9 @@ public class PDFTextStripper extends PDF
}
/**
- * This is the page that the text extraction will start on. The pages start
- * at page 1. For example in a 5 page PDF document, if the start page is 1
- * then all pages will be extracted. If the start page is 4 then pages 4 and 5
- * will be extracted. The default value is 1.
+ * This is the page that the text extraction will start on. The pages start at page 1. For example in a 5 page PDF
+ * document, if the start page is 1 then all pages will be extracted. If the start page is 4 then pages 4 and 5 will
+ * be extracted. The default value is 1.
*
* @return Value of property startPage.
*/
@@ -982,10 +965,9 @@ public class PDFTextStripper extends PDF
}
/**
- * This will get the last page that will be extracted. This is inclusive,
- * for example if a 5 page PDF an endPage value of 5 would extract the
- * entire document, an end page of 2 would extract pages 1 and 2. This defaults
- * to Integer.MAX_VALUE such that all pages of the pdf will be extracted.
+ * This will get the last page that will be extracted. This is inclusive, for example if a 5 page PDF an endPage
+ * value of 5 would extract the entire document, an end page of 2 would extract pages 1 and 2. This defaults to
+ * Integer.MAX_VALUE such that all pages of the pdf will be extracted.
*
* @return Value of property endPage.
*/
@@ -1005,9 +987,8 @@ public class PDFTextStripper extends PDF
}
/**
- * Set the desired line separator for output text. The line.separator
- * system property is used if the line separator preference is not set
- * explicitly using this method.
+ * Set the desired line separator for output text. The line.separator system property is used if the line separator
+ * preference is not set explicitly using this method.
*
* @param separator The desired line separator string.
*/
@@ -1037,11 +1018,10 @@ public class PDFTextStripper extends PDF
}
/**
- * Set the desired word separator for output text. The PDFBox text extraction
- * algorithm will output a space character if there is enough space between
- * two words. By default a space character is used. If you need an accurate
- * count of characters that are found in a PDF document then you might want to
- * set the word separator to the empty string.
+ * Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space
+ * character if there is enough space between two words. By default a space character is used. If you need an
+ * accurate count of characters that are found in a PDF document then you might want to set the word separator to
+ * the empty string.
*
* @param separator The desired word separator string.
*/
@@ -1079,9 +1059,8 @@ public class PDFTextStripper extends PDF
}
/**
- * Character strings are grouped by articles. It is quite common that there
- * will only be a single article. This returns a List that contains List objects,
- * the inner lists will contain TextPosition objects.
+ * Character strings are grouped by articles. It is quite common that there will only be a single article. This
+ * returns a List that contains List objects, the inner lists will contain TextPosition objects.
*
* @return A double List of TextPositions for all text strings on the page.
*/
@@ -1091,10 +1070,9 @@ public class PDFTextStripper extends PDF
}
/**
- * By default the text stripper will attempt to remove text that overlaps each other.
- * Word paints the same character several times in order to make it look bold. By setting
- * this to false all text will be extracted, which means that certain sections will be
- * duplicated, but better performance will be noticed.
+ * By default the text stripper will attempt to remove text that overlaps each other. Word paints the same
+ * character several times in order to make it look bold. By setting this to false all text will be extracted, which
+ * means that certain sections will be duplicated, but better performance will be noticed.
*
* @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set.
*/
@@ -1114,8 +1092,7 @@ public class PDFTextStripper extends PDF
}
/**
- * Set if the text stripper should group the text output by a list of beads.
- * The default value is true!
+ * Set if the text stripper should group the text output by a list of beads. The default value is true!
*
* @param aShouldSeparateByBeads The new grouping of beads.
*/
@@ -1145,7 +1122,7 @@ public class PDFTextStripper extends PDF
}
/**
- * Get the bookmark where text extraction should start, inclusive. Default is null.
+ * Get the bookmark where text extraction should start, inclusive. Default is null.
*
* @return The starting bookmark.
*/
@@ -1166,16 +1143,17 @@ public class PDFTextStripper extends PDF
/**
* This will tell if the text stripper should add some more text formatting.
+ *
* @return true if some more text formatting will be added
*/
public boolean getAddMoreFormatting()
{
return addMoreFormatting;
}
-
+
/**
- * Some additional text formatting will be added if addMoreFormatting
- * is set to true. Default is false.
+ * Some additional text formatting will be added if addMoreFormatting is set to true. Default is false.
+ *
* @param newAddMoreFormatting Tell PDFBox to add some more text formatting
*/
public void setAddMoreFormatting(boolean newAddMoreFormatting)
@@ -1184,8 +1162,7 @@ public class PDFTextStripper extends PDF
}
/**
- * This will tell if the text stripper should sort the text tokens
- * before writing to the stream.
+ * This will tell if the text stripper should sort the text tokens before writing to the stream.
*
* @return true If the text tokens will be sorted before being written.
*/
@@ -1195,15 +1172,13 @@ public class PDFTextStripper extends PDF
}
/**
- * The order of the text tokens in a PDF file may not be in the same order
- * as they appear visually on the screen. For example, a PDF writer may
- * write out all text by font, so all bold or larger text, then make a second
- * pass and write out the normal text.<br/>
+ * The order of the text tokens in a PDF file may not be in the same order as they appear visually on the screen. For
+ * example, a PDF writer may write out all text by font, so all bold or larger text, then make a second pass and
+ * write out the normal text.<br/>
* The default is to <b>not</b> sort by position.<br/>
* <br/>
- * A PDF writer could choose to write each character in a different order. By
- * default PDFBox does <b>not</b> sort the text tokens before processing them due to
- * performance reasons.
+ * A PDF writer could choose to write each character in a different order. By default PDFBox does <b>not</b> sort
+ * the text tokens before processing them due to performance reasons.
*
* @param newSortByPosition Tell PDFBox to sort the text positions.
*/
@@ -1213,22 +1188,20 @@ public class PDFTextStripper extends PDF
}
/**
- * Get the current space width-based tolerance value that is being used
- * to estimate where spaces in text should be added. Note that the
- * default value for this has been determined from trial and error.
+ * Get the current space width-based tolerance value that is being used to estimate where spaces in text should be
+ * added. Note that the default value for this has been determined from trial and error.
*
* @return The current tolerance / scaling factor
*/
- public float getSpacingTolerance()
+ public float getSpacingTolerance()
{
return spacingTolerance;
}
/**
- * Set the space width-based tolerance value that is used
- * to estimate where spaces in text should be added. Note that the
- * default value for this has been determined from trial and error.
- * Setting this value larger will reduce the number of spaces added.
+ * Set the space width-based tolerance value that is used to estimate where spaces in text should be added. Note
+ * that the default value for this has been determined from trial and error. Setting this value larger will reduce
+ * the number of spaces added.
*
* @param spacingToleranceValue tolerance / scaling factor to use
*/
@@ -1238,91 +1211,77 @@ public class PDFTextStripper extends PDF
}
/**
- * Get the current character width-based tolerance value that is being used
- * to estimate where spaces in text should be added. Note that the
- * default value for this has been determined from trial and error.
+ * Get the current character width-based tolerance value that is being used to estimate where spaces in text should
+ * be added. Note that the default value for this has been determined from trial and error.
*
* @return The current tolerance / scaling factor
*/
- public float getAverageCharTolerance()
+ public float getAverageCharTolerance()
{
return averageCharTolerance;
}
/**
- * Set the character width-based tolerance value that is used
- * to estimate where spaces in text should be added. Note that the
- * default value for this has been determined from trial and error.
- * Setting this value larger will reduce the number of spaces added.
+ * Set the character width-based tolerance value that is used to estimate where spaces in text should be added. Note
+ * that the default value for this has been determined from trial and error. Setting this value larger will reduce
+ * the number of spaces added.
*
* @param averageCharToleranceValue average tolerance / scaling factor to use
*/
- public void setAverageCharTolerance(float averageCharToleranceValue)
+ public void setAverageCharTolerance(float averageCharToleranceValue)
{
averageCharTolerance = averageCharToleranceValue;
}
-
/**
- * returns the multiple of whitespace character widths
- * for the current text which the current
- * line start can be indented from the previous line start
- * beyond which the current line start is considered
- * to be a paragraph start.
- * @return the number of whitespace character widths to use
- * when detecting paragraph indents.
+ * returns the multiple of whitespace character widths for the current text which the current line start can be
+ * indented from the previous line start beyond which the current line start is considered to be a paragraph start.
+ *
+ * @return the number of whitespace character widths to use when detecting paragraph indents.
*/
- public float getIndentThreshold()
+ public float getIndentThreshold()
{
return indentThreshold;
}
/**
- * sets the multiple of whitespace character widths
- * for the current text which the current
- * line start can be indented from the previous line start
- * beyond which the current line start is considered
- * to be a paragraph start. The default value is 2.0.
+ * sets the multiple of whitespace character widths for the current text which the current line start can be
+ * indented from the previous line start beyond which the current line start is considered to be a paragraph start.
+ * The default value is 2.0.
*
- * @param indentThresholdValue the number of whitespace character widths to use
- * when detecting paragraph indents.
+ * @param indentThresholdValue the number of whitespace character widths to use when detecting paragraph indents.
*/
- public void setIndentThreshold(float indentThresholdValue)
+ public void setIndentThreshold(float indentThresholdValue)
{
indentThreshold = indentThresholdValue;
}
/**
- * the minimum whitespace, as a multiple
- * of the max height of the current characters
- * beyond which the current line start is considered
- * to be a paragraph start.
- * @return the character height multiple for
- * max allowed whitespace between lines in
- * the same paragraph.
+ * the minimum whitespace, as a multiple of the max height of the current characters beyond which the current line
+ * start is considered to be a paragraph start.
+ *
+ * @return the character height multiple for max allowed whitespace between lines in the same paragraph.
*/
- public float getDropThreshold()
+ public float getDropThreshold()
{
return dropThreshold;
}
/**
- * sets the minimum whitespace, as a multiple
- * of the max height of the current characters
- * beyond which the current line start is considered
- * to be a paragraph start. The default value is 2.5.
- *
- * @param dropThresholdValue the character height multiple for
- * max allowed whitespace between lines in
- * the same paragraph.
+ * sets the minimum whitespace, as a multiple of the max height of the current characters beyond which the current
+ * line start is considered to be a paragraph start. The default value is 2.5.
+ *
+ * @param dropThresholdValue the character height multiple for max allowed whitespace between lines in the same
+ * paragraph.
*/
- public void setDropThreshold(float dropThresholdValue)
+ public void setDropThreshold(float dropThresholdValue)
{
dropThreshold = dropThresholdValue;
}
/**
* Returns the string which will be used at the beginning of a paragraph.
+ *
* @return the paragraph start string
*/
public String getParagraphStart()
@@ -1332,6 +1291,7 @@ public class PDFTextStripper extends PDF
/**
* Sets the string which will be used at the beginning of a paragraph.
+ *
* @param s the paragraph start string
*/
public void setParagraphStart(String s)
@@ -1341,6 +1301,7 @@ public class PDFTextStripper extends PDF
/**
* Returns the string which will be used at the end of a paragraph.
+ *
* @return the paragraph end string
*/
public String getParagraphEnd()
@@ -1350,6 +1311,7 @@ public class PDFTextStripper extends PDF
/**
* Sets the string which will be used at the end of a paragraph.
+ *
* @param s the paragraph end string
*/
public void setParagraphEnd(String s)
@@ -1357,63 +1319,69 @@ public class PDFTextStripper extends PDF
paragraphEnd = s;
}
-
/**
* Returns the string which will be used at the beginning of a page.
+ *
* @return the page start string
*/
- public String getPageStart()
+ public String getPageStart()
{
return pageStart;
}
/**
* Sets the string which will be used at the beginning of a page.
+ *
* @param pageStartValue the page start string
*/
- public void setPageStart(String pageStartValue)
+ public void setPageStart(String pageStartValue)
{
pageStart = pageStartValue;
}
/**
* Returns the string which will be used at the end of a page.
+ *
* @return the page end string
*/
- public String getPageEnd()
+ public String getPageEnd()
{
return pageEnd;
}
/**
* Sets the string which will be used at the end of a page.
+ *
* @param pageEndValue the page end string
*/
- public void setPageEnd(String pageEndValue)
+ public void setPageEnd(String pageEndValue)
{
pageEnd = pageEndValue;
}
/**
* Returns the string which will be used at the beginning of an article.
+ *
* @return the article start string
*/
- public String getArticleStart()
+ public String getArticleStart()
{
return articleStart;
}
/**
* Sets the string which will be used at the beginning of an article.
+ *
* @param articleStartValue the article start string
*/
- public void setArticleStart(String articleStartValue)
+ public void setArticleStart(String articleStartValue)
{
articleStart = articleStartValue;
}
/**
* Returns the string which will be used at the end of an article.
+ *
* @return the article end string
*/
public String getArticleEnd()
@@ -1423,6 +1391,7 @@ public class PDFTextStripper extends PDF
/**
* Sets the string which will be used at the end of an article.
+ *
* @param articleEndValue the article end string
*/
public void setArticleEnd(String articleEndValue)
@@ -1431,8 +1400,8 @@ public class PDFTextStripper extends PDF
}
/**
- * handles the line separator for a new line given
- * the specified current and previous TextPositions.
+ * handles the line separator for a new line given the specified current and previous TextPositions.
+ *
* @param current the current text position
* @param lastPosition the previous text position
* @param lastLineStartPosition the last text position that followed a line separator.
@@ -1447,67 +1416,65 @@ public class PDFTextStripper extends PDF
current.setLineStart();
isParagraphSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
lastLineStartPosition = current;
- if (current.isParagraphStart())
+ if (current.isParagraphStart())
{
- if (lastPosition.isArticleStart())
+ if (lastPosition.isArticleStart())
{
writeParagraphStart();
}
- else
+ else
{
writeLineSeparator();
writeParagraphSeparator();
}
}
- else
+ else
{
writeLineSeparator();
}
return lastLineStartPosition;
}
-
+
/**
- * tests the relationship between the last text position, the current text
- * position and the last text position that followed a line separator to
- * decide if the gap represents a paragraph separation. This should
- * <i>only</i> be called for consecutive text positions that first pass the
- * line separation test.
+ * tests the relationship between the last text position, the current text position and the last text position that
+ * followed a line separator to decide if the gap represents a paragraph separation. This should <i>only</i> be
+ * called for consecutive text positions that first pass the line separation test.
* <p>
- * This base implementation tests to see if the lastLineStartPosition is
- * null OR if the current vertical position has dropped below the last text
- * vertical position by at least 2.5 times the current text height OR if the
- * current horizontal position is indented by at least 2 times the current
- * width of a space character.</p>
+ * This base implementation tests to see if the lastLineStartPosition is null OR if the current vertical position
+ * has dropped below the last text vertical position by at least 2.5 times the current text height OR if the current
+ * horizontal position is indented by at least 2 times the current width of a space character.
+ * </p>
* <p>
- * This also attempts to identify text that is indented under a hanging indent.</p>
+ * This also attempts to identify text that is indented under a hanging indent.
+ * </p>
* <p>
- * This method sets the isParagraphStart and isHangingIndent flags on the current
- * position object.</p>
+ * This method sets the isParagraphStart and isHangingIndent flags on the current position object.
+ * </p>
*
- * @param position the current text position. This may have its isParagraphStart
- * or isHangingIndent flags set upon return.
+ * @param position the current text position. This may have its isParagraphStart or isHangingIndent flags set upon
+ * return.
* @param lastPosition the previous text position (should not be null).
* @param lastLineStartPosition the last text position that followed a line separator, or null.
* @param maxHeightForLine max height for text positions since lastLineStartPosition.
*/
- private void isParagraphSeparation(PositionWrapper position,
- PositionWrapper lastPosition, PositionWrapper lastLineStartPosition,
- float maxHeightForLine)
+ private void isParagraphSeparation(PositionWrapper position, PositionWrapper lastPosition,
+ PositionWrapper lastLineStartPosition, float maxHeightForLine)
{
boolean result = false;
- if (lastLineStartPosition == null)
+ if (lastLineStartPosition == null)
{
result = true;
}
else
{
- float yGap = Math.abs(position.getTextPosition().getYDirAdj() -
- lastPosition.getTextPosition().getYDirAdj());
+ float yGap = Math.abs(position.getTextPosition().getYDirAdj()
+ - lastPosition.getTextPosition().getYDirAdj());
float newYVal = multiplyFloat(getDropThreshold(), maxHeightForLine);
// do we need to flip this for rtl?
- float xGap = position.getTextPosition().getXDirAdj() -
- lastLineStartPosition.getTextPosition().getXDirAdj();
- float newXVal = multiplyFloat(getIndentThreshold(), position.getTextPosition().getWidthOfSpace());
+ float xGap = position.getTextPosition().getXDirAdj()
+ - lastLineStartPosition.getTextPosition().getXDirAdj();
+ float newXVal = multiplyFloat(getIndentThreshold(),
+ position.getTextPosition().getWidthOfSpace());
float positionWidth = multiplyFloat(0.25f, position.getTextPosition().getWidth());
if (yGap > newYVal)
@@ -1519,11 +1486,11 @@ public class PDFTextStripper extends PDF
// text is indented, but try to screen for hanging indent
if (!lastLineStartPosition.isParagraphStart())
{
- result = true;
+ result = true;
}
else
{
- position.setHangingIndent();
+ position.setHangingIndent();
}
}
else if (xGap < -position.getTextPosition().getWidthOfSpace())
@@ -1547,7 +1514,7 @@ public class PDFTextStripper extends PDF
// check to see if the previous line looks like
// any of a number of standard list item formats
Pattern liPattern = matchListItemPattern(lastLineStartPosition);
- if (liPattern!=null)
+ if (liPattern != null)
{
Pattern currentPattern = matchListItemPattern(position);
if (liPattern == currentPattern)
@@ -1555,8 +1522,8 @@ public class PDFTextStripper extends PDF
result = true;
}
}
- }
- }
+ }
+ }
}
if (result)
{
@@ -1570,11 +1537,13 @@ public class PDFTextStripper extends PDF
// to avoid wrong results when comparing with another float
return Math.round(value1 * value2 * 1000) / 1000f;
}
+
/**
* writes the paragraph separator string to the output.
+ *
* @throws IOException if something went wrong
*/
- protected void writeParagraphSeparator()throws IOException
+ protected void writeParagraphSeparator() throws IOException
{
writeParagraphEnd();
writeParagraphStart();
@@ -1582,11 +1551,12 @@ public class PDFTextStripper extends PDF
/**
* Write something (if defined) at the start of a paragraph.
+ *
* @throws IOException if something went wrong
*/
protected void writeParagraphStart() throws IOException
{
- if (inParagraph)
+ if (inParagraph)
{
writeParagraphEnd();
inParagraph = false;
@@ -1597,6 +1567,7 @@ public class PDFTextStripper extends PDF
/**
* Write something (if defined) at the end of a paragraph.
+ *
* @throws IOException if something went wrong
*/
protected void writeParagraphEnd() throws IOException
@@ -1611,64 +1582,52 @@ public class PDFTextStripper extends PDF
/**
* Write something (if defined) at the start of a page.
+ *
* @throws IOException if something went wrong
*/
- protected void writePageStart()throws IOException
+ protected void writePageStart() throws IOException
{
output.write(getPageStart());
}
/**
* Write something (if defined) at the end of a page.
+ *
* @throws IOException if something went wrong
*/
- protected void writePageEnd()throws IOException
+ protected void writePageEnd() throws IOException
{
output.write(getPageEnd());
}
/**
- * returns the list item Pattern object that matches
- * the text at the specified PositionWrapper or null
- * if the text does not match such a pattern. The list
- * of Patterns tested against is given by the
- * {@link #getListItemPatterns()} method. To add to
- * the list, simply override that method (if sub-classing)
- * or explicitly supply your own list using
- * {@link #setListItemPatterns(List)}.
+ * returns the list item Pattern object that matches the text at the specified PositionWrapper or null if the text
+ * does not match such a pattern. The list of Patterns tested against is given by the {@link #getListItemPatterns()}
+ * method. To add to the list, simply override that method (if sub-classing) or explicitly supply your own list
+ * using {@link #setListItemPatterns(List)}.
+ *
* @param pw position
* @return the matching pattern
*/
- private Pattern matchListItemPattern(PositionWrapper pw)
+ private Pattern matchListItemPattern(PositionWrapper pw)
{
TextPosition tp = pw.getTextPosition();
String txt = tp.getUnicode();
- return matchPattern(txt,getListItemPatterns());
+ return matchPattern(txt, getListItemPatterns());
}
/**
- * a list of regular expressions that match commonly used
- * list item formats, i.e. bullets, numbers, letters,
- * Roman numerals, etc. Not meant to be
- * comprehensive.
- */
- private static final String[] LIST_ITEM_EXPRESSIONS = {
- "\\.",
- "\\d+\\.",
- "\\[\\d+\\]",
- "\\d+\\)",
- "[A-Z]\\.",
- "[a-z]\\.",
- "[A-Z]\\)",
- "[a-z]\\)",
- "[IVXL]+\\.",
- "[ivxl]+\\.",
- };
+ * a list of regular expressions that match commonly used list item formats, i.e. bullets, numbers, letters, Roman
+ * numerals, etc. Not meant to be comprehensive.
+ */
+ private static final String[] LIST_ITEM_EXPRESSIONS = { "\\.", "\\d+\\.", "\\[\\d+\\]",
+ "\\d+\\)", "[A-Z]\\.", "[a-z]\\.", "[A-Z]\\)", "[a-z]\\)", "[IVXL]+\\.",
+ "[ivxl]+\\.", };
private List<Pattern> listOfPatterns = null;
+
/**
- * use to supply a different set of regular expression
- * patterns for matching list item starts.
+ * use to supply a different set of regular expression patterns for matching list item starts.
*
* @param patterns list of patterns
*/
@@ -1678,8 +1637,7 @@ public class PDFTextStripper extends PDF
}
/**
- * returns a list of regular expression Patterns representing
- * different common list item formats. For example
+ * returns a list of regular expression Patterns representing different common list item formats. For example
* numbered items of form:
* <ol>
* <li>some text</li>
@@ -1690,10 +1648,11 @@ public class PDFTextStripper extends PDF
* <li>some text</li>
* <li>more text</li>
* </ul>
- * etc., all begin with some character pattern. The pattern "\\d+\." (matches "1.", "2.", ...)
- * or "\[\\d+\]" (matches "[1]", "[2]", ...).
+ * etc., all begin with some character pattern. For example, the pattern "\\d+\\." matches "1.", "2.", ... and
+ * "\\[\\d+\\]" matches "[1]", "[2]", ...
* <p>
* This method returns a list of such regular expression Patterns.
+ *
* @return a list of Pattern objects.
*/
protected List<Pattern> getListItemPatterns()
@@ -1711,16 +1670,14 @@ public class PDFTextStripper extends PDF
}
/**
- * iterates over the specified list of Patterns until
- * it finds one that matches the specified string. Then
- * returns the Pattern.
+ * iterates over the specified list of Patterns until it finds one that matches the specified string. Then returns
+ * the Pattern.
* <p>
- * Order of the supplied list of patterns is important as
- * most common patterns should come first. Patterns
- * should be strict in general, and all will be
- * used with case sensitivity on.
+ * Order of the supplied list of patterns is important as most common patterns should come first. Patterns should be
+ * strict in general, and all will be used with case sensitivity on.
* </p>
- * @param string the string to be searched
+ *
+ * @param string the string to be searched
* @param patterns list of patterns
* @return matching pattern
*/
@@ -1738,6 +1695,7 @@ public class PDFTextStripper extends PDF
/**
* Write a list of strings containing a whole line of a document.
+ *
* @param line a list with the words of the given line
* @param isRtlDominant determines if rtl or ltr is dominant
* @throws IOException if something went wrong
@@ -1759,13 +1717,14 @@ public class PDFTextStripper extends PDF
/**
* Normalize the given list of TextPositions.
+ *
* @param line list of TextPositions
- * @param isRtlDominant determines if rtl or ltl is dominant
+ * @param isRtlDominant determines if rtl or ltr is dominant
* @param hasRtl determines if lines contains rtl formatted text(parts)
* @return a list of words with their text positions, one entry for every word
*/
private List<WordWithTextPositions> normalize(List<LineItem> line, boolean isRtlDominant,
- boolean hasRtl)
+ boolean hasRtl)
{
List<WordWithTextPositions> normalized = new LinkedList<WordWithTextPositions>();
StringBuilder lineBuilder = new StringBuilder();
@@ -1786,7 +1745,7 @@ public class PDFTextStripper extends PDF
lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, item);
}
}
- if (lineBuilder.length() > 0)
+ if (lineBuilder.length() > 0)
{
normalized.add(createWord(lineBuilder.toString(), wordPositions));
}
@@ -1794,8 +1753,7 @@ public class PDFTextStripper extends PDF
}
/**
- * Used within {@link #normalize(List, boolean, boolean)} to create a single
- * {@link WordWithTextPositions} entry.
+ * Used within {@link #normalize(List, boolean, boolean)} to create a single {@link WordWithTextPositions} entry.
*/
private WordWithTextPositions createWord(String word, List<TextPosition> wordPositions)
{
@@ -1803,8 +1761,8 @@ public class PDFTextStripper extends PDF
}
/**
- * Normalize certain Unicode characters. For example, convert the
- * single "fi" ligature to "f" and "i". Also normalises Arabic and Hebrew presentation forms.
+ * Normalize certain Unicode characters. For example, convert the single "fi" ligature to "f" and "i". Also
+ * normalises Arabic and Hebrew presentation forms.
*
* @param word Word to normalize
* @return Normalized word
@@ -1833,14 +1791,16 @@ public class PDFTextStripper extends PDF
// Some fonts map U+FDF2 differently than the Unicode spec.
// They add an extra U+0627 character to compensate.
// This removes the extra character for those fonts.
- if(c == 0xFDF2 && q > 0 && (word.charAt(q-1) == 0x0627 || word.charAt(q-1) == 0xFE8D))
+ if (c == 0xFDF2 && q > 0
+ && (word.charAt(q - 1) == 0x0627 || word.charAt(q - 1) == 0xFE8D))
{
builder.append("\u0644\u0644\u0647");
}
else
{
// Trim because some decompositions have an extra space, such as U+FC5E
- builder.append(Normalizer.normalize(word.substring(q, q + 1), Normalizer.Form.NFKC).trim());
+ builder.append(Normalizer
+ .normalize(word.substring(q, q + 1), Normalizer.Form.NFKC).trim());
}
p = q + 1;
}
@@ -1858,6 +1818,7 @@ public class PDFTextStripper extends PDF
/**
* Used within {@link #normalize(List, boolean, boolean)} to handle a {@link TextPosition}.
+ *
* @return The StringBuilder that must be used when calling this method.
*/
private StringBuilder normalizeAdd(List<WordWithTextPositions> normalized,
@@ -1865,12 +1826,12 @@ public class PDFTextStripper extends PDF
{
if (item.isWordSeparator())
{
- normalized.add(createWord(lineBuilder.toString(),
- new ArrayList<TextPosition>(wordPositions)));
+ normalized.add(
+ createWord(lineBuilder.toString(), new ArrayList<TextPosition>(wordPositions)));
lineBuilder = new StringBuilder();
wordPositions.clear();
}
- else
+ else
{
TextPosition text = item.getTextPosition();
lineBuilder.append(text.getUnicode());
@@ -1915,9 +1876,8 @@ public class PDFTextStripper extends PDF
}
/**
- * Internal class that maps strings to lists of {@link TextPosition} arrays.
- * Note that the number of entries in that list may differ from the number of characters in the
- * string due to normalization.
+ * Internal class that maps a string to a list of {@link TextPosition}s. Note that the number of entries in that
+ * list may differ from the number of characters in the string due to normalization.
*
* @author Axel Dörfler
*/
@@ -1925,13 +1885,13 @@ public class PDFTextStripper extends PDF
{
String text;
List<TextPosition> textPositions;
-
+
WordWithTextPositions(String word, List<TextPosition> positions)
{
text = word;
textPositions = positions;
}
-
+
public String getText()
{
return text;
@@ -1944,15 +1904,13 @@ public class PDFTextStripper extends PDF
}
/**
- * wrapper of TextPosition that adds flags to track
- * status as linestart and paragraph start positions.
+ * wrapper of TextPosition that adds flags to track status as line start and paragraph start positions.
* <p>
- * This is implemented as a wrapper since the TextPosition
- * class doesn't provide complete access to its
- * state fields to subclasses. Also, conceptually TextPosition is
- * immutable while these flags need to be set post-creation so
- * it makes sense to put these flags in this separate class.
+ * This is implemented as a wrapper since the TextPosition class doesn't provide complete access to its state fields
+ * to subclasses. Also, conceptually TextPosition is immutable while these flags need to be set post-creation so it
+ * makes sense to put these flags in this separate class.
* </p>
+ *
* @author m.martinez@ll.mit.edu
*/
private static final class PositionWrapper
@@ -1977,6 +1935,7 @@ public class PDFTextStripper extends PDF
/**
* Returns the underlying TextPosition object.
+ *
* @return the text position
*/
public TextPosition getTextPosition()
@@ -1997,7 +1956,6 @@ public class PDFTextStripper extends PDF
this.isLineStart = true;
}
-
public boolean isParagraphStart()
{
return isParagraphStart;
@@ -2011,13 +1969,11 @@ public class PDFTextStripper extends PDF
this.isParagraphStart = true;
}
-
public boolean isArticleStart()
{
return isArticleStart;
}
-
/**
* Sets the isArticleStart() flag to true.
*/