You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ja...@apache.org on 2014/06/17 07:16:35 UTC
svn commit: r1603057 -
/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Author: jahewson
Date: Tue Jun 17 05:16:34 2014
New Revision: 1603057
URL: http://svn.apache.org/r1603057
Log:
PDFBOX-2145: Clean up PDFStreamEngine and PDFTextStripper
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1603057&r1=1603056&r2=1603057&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Tue Jun 17 05:16:34 2014
@@ -34,7 +34,6 @@ import java.util.TreeSet;
import java.util.Vector;
import java.util.regex.Pattern;
-import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
@@ -190,8 +189,8 @@ public class PDFTextStripper extends PDF
*/
public PDFTextStripper() throws IOException
{
- super( ResourceLoader.loadProperties(
- "org/apache/pdfbox/resources/PDFTextStripper.properties", true ) );
+ super(ResourceLoader.loadProperties(
+ "org/apache/pdfbox/resources/PDFTextStripper.properties", true));
this.outputEncoding = null;
normalize = new TextNormalize(this.outputEncoding);
}
@@ -206,9 +205,9 @@ public class PDFTextStripper extends PDF
*
* @throws IOException If there is an error reading the properties.
*/
- public PDFTextStripper( Properties props ) throws IOException
+ public PDFTextStripper(Properties props) throws IOException
{
- super( props );
+ super(props);
this.outputEncoding = null;
normalize = new TextNormalize(this.outputEncoding);
}
@@ -220,10 +219,10 @@ public class PDFTextStripper extends PDF
* @param encoding The encoding that the output will be written in.
* @throws IOException If there is an error reading the properties.
*/
- public PDFTextStripper( String encoding ) throws IOException
+ public PDFTextStripper(String encoding) throws IOException
{
- super( ResourceLoader.loadProperties(
- "org/apache/pdfbox/resources/PDFTextStripper.properties", true ));
+ super(ResourceLoader.loadProperties(
+ "org/apache/pdfbox/resources/PDFTextStripper.properties", true));
this.outputEncoding = encoding;
normalize = new TextNormalize(this.outputEncoding);
}
@@ -236,10 +235,10 @@ public class PDFTextStripper extends PDF
* @return The text of the PDF document.
* @throws IOException if the doc state is invalid or it is encrypted.
*/
- public String getText( PDDocument doc ) throws IOException
+ public String getText(PDDocument doc) throws IOException
{
StringWriter outputStream = new StringWriter();
- writeText( doc, outputStream );
+ writeText(doc, outputStream);
return outputStream.toString();
}
@@ -268,7 +267,7 @@ public class PDFTextStripper extends PDF
*
* @throws IOException If the doc is in an invalid state.
*/
- public void writeText( PDDocument doc, Writer outputStream ) throws IOException
+ public void writeText(PDDocument doc, Writer outputStream) throws IOException
{
resetEngine();
document = doc;
@@ -282,7 +281,7 @@ public class PDFTextStripper extends PDF
}
startDocument(document);
- if( document.isEncrypted() )
+ if (document.isEncrypted())
{
// We are expecting non-encrypted documents here, but it is common
// for users to pass in a document that is encrypted with an empty
@@ -299,7 +298,7 @@ public class PDFTextStripper extends PDF
throw new IOException("Invalid password for encrypted document", e);
}
}
- processPages( document.getDocumentCatalog().getAllPages() );
+ processPages(document.getDocumentCatalog().getAllPages());
endDocument(document);
}
@@ -310,20 +309,20 @@ public class PDFTextStripper extends PDF
*
* @throws IOException If there is an error parsing the text.
*/
- protected void processPages( List<COSObjectable> pages ) throws IOException
+ protected void processPages(List<COSObjectable> pages) throws IOException
{
- if( startBookmark != null )
+ if (startBookmark != null)
{
- startBookmarkPageNumber = getPageNumber( startBookmark, pages );
+ startBookmarkPageNumber = getPageNumber(startBookmark, pages);
}
- if( endBookmark != null )
+ if (endBookmark != null)
{
- endBookmarkPageNumber = getPageNumber( endBookmark, pages );
+ endBookmarkPageNumber = getPageNumber(endBookmark, pages);
}
- if( startBookmarkPageNumber == -1 && startBookmark != null &&
- endBookmarkPageNumber == -1 && endBookmark != null &&
- startBookmark.getCOSObject() == endBookmark.getCOSObject() )
+ if (startBookmarkPageNumber == -1 && startBookmark != null &&
+ endBookmarkPageNumber == -1 && endBookmark != null &&
+ startBookmark.getCOSObject() == endBookmark.getCOSObject())
{
// this is a special case where both the start and end bookmark
// are the same but point to nothing. In this case
@@ -332,27 +331,27 @@ public class PDFTextStripper extends PDF
endBookmarkPageNumber = 0;
}
Iterator<COSObjectable> pageIter = pages.iterator();
- while( pageIter.hasNext() )
+ while (pageIter.hasNext())
{
PDPage nextPage = (PDPage)pageIter.next();
PDStream contentStream = nextPage.getContents();
currentPageNo++;
- if( contentStream != null )
+ if (contentStream != null)
{
COSStream contents = contentStream.getStream();
- processPage( nextPage, contents );
+ processPage(nextPage, contents);
}
}
}
- private int getPageNumber( PDOutlineItem bookmark, List<COSObjectable> allPages )
+ private int getPageNumber(PDOutlineItem bookmark, List<COSObjectable> allPages)
throws IOException
{
int pageNumber = -1;
- PDPage page = bookmark.findDestinationPage( document );
- if( page != null )
+ PDPage page = bookmark.findDestinationPage(document);
+ if (page != null)
{
- pageNumber = allPages.indexOf( page ) + 1; // use one based indexing
+ pageNumber = allPages.indexOf(page) + 1; // use one based indexing
}
return pageNumber;
}
@@ -376,7 +375,7 @@ public class PDFTextStripper extends PDF
* @param pdf The PDF document that is being processed.
* @throws IOException If an IO error occurs.
*/
- protected void endDocument(PDDocument pdf ) throws IOException
+ protected void endDocument(PDDocument pdf) throws IOException
{
// no default implementation, but available for subclasses
}
@@ -389,36 +388,36 @@ public class PDFTextStripper extends PDF
*
* @throws IOException If there is an error processing the page.
*/
- protected void processPage( PDPage page, COSStream content ) throws IOException
+ protected void processPage(PDPage page, COSStream content) throws IOException
{
- if( currentPageNo >= startPage && currentPageNo <= endPage &&
- (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) &&
- (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber ))
+ if (currentPageNo >= startPage && currentPageNo <= endPage &&
+ (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) &&
+ (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber))
{
- startPage( page );
+ startPage(page);
pageArticles = page.getThreadBeads();
int numberOfArticleSections = 1 + pageArticles.size() * 2;
- if( !shouldSeparateByBeads )
+ if (!shouldSeparateByBeads)
{
numberOfArticleSections = 1;
}
int originalSize = charactersByArticle.size();
- charactersByArticle.setSize( numberOfArticleSections );
- for( int i=0; i<numberOfArticleSections; i++ )
+ charactersByArticle.setSize(numberOfArticleSections);
+ for (int i=0; i<numberOfArticleSections; i++)
{
- if( numberOfArticleSections < originalSize )
+ if (numberOfArticleSections < originalSize)
{
- charactersByArticle.get( i ).clear();
+ charactersByArticle.get(i).clear();
}
else
{
- charactersByArticle.set( i, new ArrayList<TextPosition>() );
+ charactersByArticle.set(i, new ArrayList<TextPosition>());
}
}
characterListMapping.clear();
- processStream( page.findResources(), content, page.findCropBox(), page.findRotation() );
+ processStream(page.findResources(), content, page.findCropBox(), page.findRotation());
writePage();
- endPage( page );
+ endPage(page);
}
}
@@ -469,7 +468,7 @@ public class PDFTextStripper extends PDF
*
* @throws IOException If there is any error writing to the stream.
*/
- protected void startPage( PDPage page ) throws IOException
+ protected void startPage(PDPage page) throws IOException
{
// default is to do nothing
}
@@ -482,7 +481,7 @@ public class PDFTextStripper extends PDF
*
* @throws IOException If there is any error writing to the stream.
*/
- protected void endPage( PDPage page ) throws IOException
+ protected void endPage(PDPage page) throws IOException
{
// default is to do nothing
}
@@ -514,18 +513,17 @@ public class PDFTextStripper extends PDF
boolean startOfPage = true; // flag to indicate start of page
boolean startOfArticle;
- if(charactersByArticle.size() > 0)
+ if (charactersByArticle.size() > 0)
{
writePageStart();
}
- for( int i = 0; i < charactersByArticle.size(); i++)
+ for (List<TextPosition> textList : charactersByArticle)
{
- List<TextPosition> textList = charactersByArticle.get( i );
- if( getSortByPosition() )
+ if (getSortByPosition())
{
TextPositionComparator comparator = new TextPositionComparator();
- Collections.sort( textList, comparator );
+ Collections.sort(textList, comparator);
}
Iterator<TextPosition> textIter = textList.iterator();
// Before we can display the text, we need to do some normalizing.
@@ -545,7 +543,7 @@ public class PDFTextStripper extends PDF
int ltrCount = 0;
int rtlCount = 0;
- while( textIter.hasNext() )
+ while (textIter.hasNext())
{
TextPosition position = textIter.next();
String stringValue = position.getCharacter();
@@ -553,15 +551,14 @@ public class PDFTextStripper extends PDF
{
byte dir = Character.getDirectionality(stringValue.charAt(a));
if (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ||
- dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING ||
- dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE)
+ dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING ||
+ dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE)
{
ltrCount++;
- }
- else if (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT ||
- dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC ||
- dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING ||
- dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)
+ } else if (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT ||
+ dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC ||
+ dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING ||
+ dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)
{
rtlCount++;
}
@@ -591,17 +588,17 @@ public class PDFTextStripper extends PDF
// Keeps track of the previous average character width
float previousAveCharWidth = -1;
- while( textIter.hasNext() )
+ while (textIter.hasNext())
{
TextPosition position = textIter.next();
PositionWrapper current = new PositionWrapper(position);
String characterValue = position.getCharacter();
- //Resets the average character width when we see a change in font
+ // Resets the average character width when we see a change in font
// or a change in the font size
- if(lastPosition != null &&
+ if (lastPosition != null &&
(position.getFont() != lastPosition.getTextPosition().getFont() ||
- position.getFontSize() != lastPosition.getTextPosition().getFontSize()))
+ position.getFontSize() != lastPosition.getTextPosition().getFontSize()))
{
previousAveCharWidth = -1;
}
@@ -619,8 +616,7 @@ public class PDFTextStripper extends PDF
positionY = position.getYDirAdj();
positionWidth = position.getWidthDirAdj();
positionHeight = position.getHeightDir();
- }
- else
+ } else
{
positionX = position.getX();
positionY = position.getY();
@@ -628,7 +624,7 @@ public class PDFTextStripper extends PDF
positionHeight = position.getHeight();
}
- //The current amount of characters in a word
+ // The current amount of characters in a word
int wordCharCount = position.getIndividualWidths().length;
// Estimate the expected width of the space based on the
@@ -638,14 +634,12 @@ public class PDFTextStripper extends PDF
if (wordSpacing == 0 || wordSpacing == Float.NaN)
{
deltaSpace = Float.MAX_VALUE;
- }
- else
+ } else
{
- if( lastWordSpacing < 0 )
+ if (lastWordSpacing < 0)
{
deltaSpace = wordSpacing * getSpacingTolerance();
- }
- else
+ } else
{
deltaSpace = (wordSpacing + lastWordSpacing) / 2f * getSpacingTolerance();
}
@@ -656,11 +650,10 @@ public class PDFTextStripper extends PDF
// averages) but we found that it gave the best results after numerous experiments.
// Based on experiments we also found that .3 worked well.
float averageCharWidth = -1;
- if(previousAveCharWidth < 0)
+ if (previousAveCharWidth < 0)
{
averageCharWidth = positionWidth / wordCharCount;
- }
- else
+ } else
{
averageCharWidth = (previousAveCharWidth + positionWidth / wordCharCount) / 2f;
}
@@ -669,21 +662,20 @@ public class PDFTextStripper extends PDF
// Compares the values obtained by the average method and the wordSpacing method
// and picks the smaller number.
float expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
- if(endOfLastTextX != END_OF_LAST_TEXT_X_RESET_VALUE)
+ if (endOfLastTextX != END_OF_LAST_TEXT_X_RESET_VALUE)
{
- if(deltaCharWidth > deltaSpace)
+ if (deltaCharWidth > deltaSpace)
{
expectedStartOfNextWordX = endOfLastTextX + deltaSpace;
- }
- else
+ } else
{
expectedStartOfNextWordX = endOfLastTextX + deltaCharWidth;
}
}
- if( lastPosition != null )
+ if (lastPosition != null)
{
- if(startOfArticle)
+ if (startOfArticle)
{
lastPosition.setArticleStart();
startOfArticle = false;
@@ -698,13 +690,13 @@ public class PDFTextStripper extends PDF
// full range seen in this line. This is what I tried to do with minYTopForLine,
// but this caused a lot of regression test failures. So, I'm leaving it be for
// now
- if(!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine))
+ if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine))
{
- writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
+ writeLine(normalize(line, isRtlDominant, hasRtl), isRtlDominant);
line.clear();
- lastLineStartPosition =
- handleLineSeparation(current, lastPosition, lastLineStartPosition,
- maxHeightForLine);
+ lastLineStartPosition =
+ handleLineSeparation(current, lastPosition, lastLineStartPosition,
+ maxHeightForLine);
endOfLastTextX = END_OF_LAST_TEXT_X_RESET_VALUE;
expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
@@ -712,11 +704,11 @@ public class PDFTextStripper extends PDF
minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
}
// test if our TextPosition starts after a new word would be expected to start
- if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE
- && expectedStartOfNextWordX < positionX &&
- // only bother adding a space if the last character was not a space
- lastPosition.getTextPosition().getCharacter() != null &&
- !lastPosition.getTextPosition().getCharacter().endsWith( " " ) )
+ if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE &&
+ expectedStartOfNextWordX < positionX &&
+ // only bother adding a space if the last character was not a space
+ lastPosition.getTextPosition().getCharacter() != null &&
+ !lastPosition.getTextPosition().getCharacter().endsWith(" "))
{
line.add(WordSeparator.getSeparator());
}
@@ -732,21 +724,21 @@ public class PDFTextStripper extends PDF
// add it to the list
if (characterValue != null)
{
- if(startOfPage && lastPosition==null)
+ if (startOfPage && lastPosition == null)
{
writeParagraphStart();//not sure this is correct for RTL?
}
line.add(position);
}
- maxHeightForLine = Math.max( maxHeightForLine, positionHeight );
- minYTopForLine = Math.min(minYTopForLine,positionY - positionHeight);
+ maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
+ minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
lastPosition = current;
- if(startOfPage)
+ if (startOfPage)
{
lastPosition.setParagraphStart();
lastPosition.setLineStart();
lastLineStartPosition = lastPosition;
- startOfPage=false;
+ startOfPage = false;
}
lastWordSpacing = wordSpacing;
previousAveCharWidth = averageCharWidth;
@@ -754,7 +746,7 @@ public class PDFTextStripper extends PDF
// print the final line
if (line.size() > 0)
{
- writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
+ writeLine(normalize(line, isRtlDominant, hasRtl), isRtlDominant);
writeParagraphEnd();
}
endArticle();
@@ -762,9 +754,10 @@ public class PDFTextStripper extends PDF
writePageEnd();
}
- private boolean overlap( float y1, float height1, float y2, float height2 )
+ private boolean overlap(float y1, float height1, float y2, float height2)
{
- return within( y1, y2, .1f) || y2 <= y1 && y2 >= y1 - height1 ||
+ return within(y1, y2, .1f) ||
+ y2 <= y1 && y2 >= y1 - height1 ||
y1 <= y2 && y1 >= y2 - height2;
}
@@ -786,7 +779,7 @@ public class PDFTextStripper extends PDF
* @throws IOException
* If there is a problem writing out the lineseparator to the document.
*/
- protected void writeLineSeparator( ) throws IOException
+ protected void writeLineSeparator() throws IOException
{
output.write(getLineSeparator());
}
@@ -808,9 +801,9 @@ public class PDFTextStripper extends PDF
* @param text The text to write to the stream.
* @throws IOException If there is an error when writing the text.
*/
- protected void writeCharacters( TextPosition text ) throws IOException
+ protected void writeCharacters(TextPosition text) throws IOException
{
- output.write( text.getCharacter() );
+ output.write(text.getCharacter());
}
/**
@@ -832,9 +825,9 @@ public class PDFTextStripper extends PDF
* @param text The text to write to the stream.
* @throws IOException If there is an error when writing the text.
*/
- protected void writeString( String text ) throws IOException
+ protected void writeString(String text) throws IOException
{
- output.write( text );
+ output.write(text);
}
/**
@@ -844,7 +837,7 @@ public class PDFTextStripper extends PDF
* @param second The second number to compare to.
* @param variance The allowed variance.
*/
- private boolean within( float first, float second, float variance )
+ private boolean within(float first, float second, float variance)
{
return second < first + variance && second > first - variance;
}
@@ -857,21 +850,21 @@ public class PDFTextStripper extends PDF
* @param text The text to process.
*/
@Override
- protected void processTextPosition( TextPosition text )
+ protected void processTextPosition(TextPosition text)
{
boolean showCharacter = true;
- if( suppressDuplicateOverlappingText )
+ if (suppressDuplicateOverlappingText)
{
showCharacter = false;
String textCharacter = text.getCharacter();
float textX = text.getX();
float textY = text.getY();
TreeMap<Float, TreeSet<Float>> sameTextCharacters =
- characterListMapping.get( textCharacter );
- if( sameTextCharacters == null )
+ characterListMapping.get(textCharacter);
+ if (sameTextCharacters == null)
{
sameTextCharacters = new TreeMap<Float, TreeSet<Float>>();
- characterListMapping.put( textCharacter, sameTextCharacters );
+ characterListMapping.put(textCharacter, sameTextCharacters);
}
// RDD - Here we compute the value that represents the end of the rendered
// text. This value is used to determine whether subsequent text rendered
@@ -886,31 +879,30 @@ public class PDFTextStripper extends PDF
boolean suppressCharacter = false;
float tolerance = text.getWidth()/textCharacter.length() / 3.0f;
- SortedMap<Float, TreeSet<Float>> xMatches =
- sameTextCharacters.subMap(textX - tolerance, textX + tolerance );
+ SortedMap<Float, TreeSet<Float>> xMatches = sameTextCharacters.subMap(textX - tolerance,
+ textX + tolerance);
for (TreeSet<Float> xMatch : xMatches.values())
{
- SortedSet<Float> yMatches =
- xMatch.subSet(textY - tolerance , textY + tolerance );
+ SortedSet<Float> yMatches = xMatch.subSet(textY - tolerance , textY + tolerance);
if (!yMatches.isEmpty())
{
suppressCharacter = true;
break;
}
}
- if( !suppressCharacter )
+ if (!suppressCharacter)
{
TreeSet<Float> ySet = sameTextCharacters.get(textX);
if (ySet == null)
{
ySet = new TreeSet<Float>();
- sameTextCharacters.put( textX, ySet );
+ sameTextCharacters.put(textX, ySet);
}
- ySet.add( textY );
+ ySet.add(textY);
showCharacter = true;
}
}
- if( showCharacter )
+ if (showCharacter)
{
// if we are showing the character then we need to determine which article it belongs to
int foundArticleDivisionIndex = -1;
@@ -919,30 +911,30 @@ public class PDFTextStripper extends PDF
int notFoundButFirstAboveArticleDivisionIndex = -1;
float x = text.getX();
float y = text.getY();
- if( shouldSeparateByBeads )
+ if (shouldSeparateByBeads)
{
- for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
+ for (int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++)
{
- PDThreadBead bead = pageArticles.get( i );
- if( bead != null )
+ PDThreadBead bead = pageArticles.get(i);
+ if (bead != null)
{
PDRectangle rect = bead.getRectangle();
- if( rect.contains( x, y ) )
+ if (rect.contains(x, y))
{
foundArticleDivisionIndex = i*2+1;
}
- else if( (x < rect.getLowerLeftX() ||
+ else if ((x < rect.getLowerLeftX() ||
y < rect.getUpperRightY()) &&
notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
{
notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2;
}
- else if( x < rect.getLowerLeftX() &&
+ else if (x < rect.getLowerLeftX() &&
notFoundButFirstLeftArticleDivisionIndex == -1)
{
notFoundButFirstLeftArticleDivisionIndex = i*2;
}
- else if( y < rect.getUpperRightY() &&
+ else if (y < rect.getUpperRightY() &&
notFoundButFirstAboveArticleDivisionIndex == -1)
{
notFoundButFirstAboveArticleDivisionIndex = i*2;
@@ -959,19 +951,19 @@ public class PDFTextStripper extends PDF
foundArticleDivisionIndex = 0;
}
int articleDivisionIndex = -1;
- if( foundArticleDivisionIndex != -1 )
+ if (foundArticleDivisionIndex != -1)
{
articleDivisionIndex = foundArticleDivisionIndex;
}
- else if( notFoundButFirstLeftAndAboveArticleDivisionIndex != -1 )
+ else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1)
{
articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
}
- else if( notFoundButFirstLeftArticleDivisionIndex != -1 )
+ else if (notFoundButFirstLeftArticleDivisionIndex != -1)
{
articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
}
- else if( notFoundButFirstAboveArticleDivisionIndex != -1 )
+ else if (notFoundButFirstAboveArticleDivisionIndex != -1)
{
articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
}
@@ -980,14 +972,14 @@ public class PDFTextStripper extends PDF
articleDivisionIndex = charactersByArticle.size()-1;
}
- List<TextPosition> textList = charactersByArticle.get( articleDivisionIndex );
+ List<TextPosition> textList = charactersByArticle.get(articleDivisionIndex);
// In the wild, some PDF encoded documents put diacritics (accents on
// top of characters) into a separate Tj element. When displaying them
// graphically, the two chunks get overlayed. With text output though,
// we need to do the overlay. This code recombines the diacritic with
// its associated character if the two are consecutive.
- if(textList.isEmpty())
+ if (textList.isEmpty())
{
textList.add(text);
}
@@ -998,13 +990,13 @@ public class PDFTextStripper extends PDF
// one TextPosition to find what we are overlapping.
// This may not always be true. */
TextPosition previousTextPosition = textList.get(textList.size()-1);
- if(text.isDiacritic() && previousTextPosition.contains(text))
+ if (text.isDiacritic() && previousTextPosition.contains(text))
{
previousTextPosition.mergeDiacritic(text, normalize);
}
// If the previous TextPosition was the diacritic, merge it into this
// one and remove it from the list.
- else if(previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
+ else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
{
text.mergeDiacritic(previousTextPosition, normalize);
textList.remove(textList.size()-1);
@@ -1531,7 +1523,7 @@ public class PDFTextStripper extends PDF
lastLineStartPosition = current;
if (current.isParagraphStart())
{
- if(lastPosition.isArticleStart())
+ if (lastPosition.isArticleStart())
{
writeParagraphStart();
}
@@ -1569,8 +1561,7 @@ public class PDFTextStripper extends PDF
* @param position the current text position. This may have its isParagraphStart
* or isHangingIndent flags set upon return.
* @param lastPosition the previous text position (should not be null).
- * @param lastLineStartPosition the last text position that followed a line
- * separator. May be null.
+ * @param lastLineStartPosition the last text position that followed a line separator, or null.
* @param maxHeightForLine max height for text positions since lasLineStartPosition.
*/
protected void isParagraphSeparation(PositionWrapper position,
@@ -1578,24 +1569,25 @@ public class PDFTextStripper extends PDF
float maxHeightForLine)
{
boolean result = false;
- if(lastLineStartPosition == null)
+ if (lastLineStartPosition == null)
{
result = true;
}
else
{
- float yGap = Math.abs(position.getTextPosition().getYDirAdj()-
+ float yGap = Math.abs(position.getTextPosition().getYDirAdj() -
lastPosition.getTextPosition().getYDirAdj());
- float xGap = position.getTextPosition().getXDirAdj()-
- lastLineStartPosition.getTextPosition().getXDirAdj();//do we need to flip this for rtl?
- if(yGap > getDropThreshold()*maxHeightForLine)
+ // do we need to flip this for rtl?
+ float xGap = position.getTextPosition().getXDirAdj() -
+ lastLineStartPosition.getTextPosition().getXDirAdj();
+ if (yGap > getDropThreshold()*maxHeightForLine)
{
result = true;
}
- else if(xGap > getIndentThreshold()*position.getTextPosition().getWidthOfSpace())
+ else if (xGap > getIndentThreshold()*position.getTextPosition().getWidthOfSpace())
{
// text is indented, but try to screen for hanging indent
- if(!lastLineStartPosition.isParagraphStart())
+ if (!lastLineStartPosition.isParagraphStart())
{
result = true;
}
@@ -1604,31 +1596,31 @@ public class PDFTextStripper extends PDF
position.setHangingIndent();
}
}
- else if(xGap < -position.getTextPosition().getWidthOfSpace())
+ else if (xGap < -position.getTextPosition().getWidthOfSpace())
{
// text is left of previous line. Was it a hanging indent?
- if(!lastLineStartPosition.isParagraphStart())
+ if (!lastLineStartPosition.isParagraphStart())
{
result = true;
}
}
- else if(Math.abs(xGap) < 0.25 * position.getTextPosition().getWidth())
+ else if (Math.abs(xGap) < 0.25 * position.getTextPosition().getWidth())
{
// current horizontal position is within 1/4 a char of the last
// linestart. We'll treat them as lined up.
- if(lastLineStartPosition.isHangingIndent())
+ if (lastLineStartPosition.isHangingIndent())
{
position.setHangingIndent();
}
- else if(lastLineStartPosition.isParagraphStart())
+ else if (lastLineStartPosition.isParagraphStart())
{
// check to see if the previous line looks like
// any of a number of standard list item formats
Pattern liPattern = matchListItemPattern(lastLineStartPosition);
- if(liPattern!=null)
+ if (liPattern!=null)
{
Pattern currentPattern = matchListItemPattern(position);
- if(liPattern == currentPattern)
+ if (liPattern == currentPattern)
{
result = true;
}
@@ -1636,7 +1628,7 @@ public class PDFTextStripper extends PDF
}
}
}
- if(result)
+ if (result)
{
position.setParagraphStart();
}
@@ -1766,10 +1758,10 @@ public class PDFTextStripper extends PDF
*/
protected List<Pattern> getListItemPatterns()
{
- if(listOfPatterns == null)
+ if (listOfPatterns == null)
{
listOfPatterns = new ArrayList<Pattern>();
- for(String expression : LIST_ITEM_EXPRESSIONS)
+ for (String expression : LIST_ITEM_EXPRESSIONS)
{
Pattern p = Pattern.compile(expression);
listOfPatterns.add(p);
@@ -1795,9 +1787,9 @@ public class PDFTextStripper extends PDF
protected static Pattern matchPattern(String string, List<Pattern> patterns)
{
Pattern matchedPattern = null;
- for(Pattern p : patterns)
+ for (Pattern p : patterns)
{
- if(p.matcher(string).matches())
+ if (p.matcher(string).matches())
{
return p;
}
@@ -1815,7 +1807,7 @@ public class PDFTextStripper extends PDF
throws IOException
{
int numberOfStrings = line.size();
- for(int i=0; i<numberOfStrings; i++)
+ for (int i=0; i<numberOfStrings; i++)
{
WordWithTextPositions word = line.get(i);
writeString(word.getText(), word.getTextPositions());
@@ -1843,14 +1835,14 @@ public class PDFTextStripper extends PDF
if (isRtlDominant)
{
int numberOfPositions = line.size();
- for(int i = numberOfPositions-1;i>=0;i--)
+ for (int i = numberOfPositions - 1; i >= 0; i--)
{
lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, line.get(i));
}
}
else
{
- for(TextPosition text : line)
+ for (TextPosition text : line)
{
lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, text);
}