You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ja...@apache.org on 2014/06/17 07:10:23 UTC
svn commit: r1603056 - in
/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox:
pdmodel/graphics/state/PDTextState.java util/PDFStreamEngine.java
util/PDFTextStripper.java util/operator/SetHorizontalTextScaling.java
util/operator/ShowTextGlyph.java
Author: jahewson
Date: Tue Jun 17 05:10:23 2014
New Revision: 1603056
URL: http://svn.apache.org/r1603056
Log:
PDFBOX-2145: Clean up PDFStreamEngine and PDFTextStripper
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/state/PDTextState.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/SetHorizontalTextScaling.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/state/PDTextState.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/state/PDTextState.java?rev=1603056&r1=1603055&r2=1603056&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/state/PDTextState.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/state/PDTextState.java Tue Jun 17 05:10:23 2014
@@ -119,7 +119,7 @@ public class PDTextState implements Clon
*
* @return The horizontalScaling.
*/
- public float getHorizontalScalingPercent()
+ public float getHorizontalScaling()
{
return horizontalScaling;
}
@@ -129,7 +129,7 @@ public class PDTextState implements Clon
*
* @param value The horizontalScaling.
*/
- public void setHorizontalScalingPercent(float value)
+ public void setHorizontalScaling(float value)
{
horizontalScaling = value;
}
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java?rev=1603056&r1=1603055&r2=1603056&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java Tue Jun 17 05:10:23 2014
@@ -42,7 +42,6 @@ import org.apache.pdfbox.pdmodel.font.PD
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState;
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
-import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.operator.OperatorProcessor;
@@ -51,6 +50,7 @@ import org.apache.pdfbox.util.operator.P
/**
* Processes a PDF content stream and executes certain operations.
* Provides a callback interface for clients that want to do things with the stream.
+ *
* @see org.apache.pdfbox.util.PDFTextStripper
* @author Ben Litchfield
*/
@@ -69,9 +69,6 @@ public class PDFStreamEngine
private Stack<PDResources> streamResourcesStack = new Stack<PDResources>();
- private int validCharCnt;
- private int totalCharCnt;
-
private int pageRotation;
private PDRectangle drawingRectangle;
@@ -86,8 +83,9 @@ public class PDFStreamEngine
}
/**
- * Constructor with engine properties. The property keys are all PDF operators, the values are class names used to
- * execute those operators. An empty value means that the operator will be silently ignored.
+ * Constructor with engine properties. The property keys are all PDF operators, the values are
+ * class names used to execute those operators. An empty value means that the operator will be
+ * silently ignored.
*
* @param properties The engine properties.
*/
@@ -166,15 +164,13 @@ public class PDFStreamEngine
}
/**
- * This method must be called between processing documents. The PDFStreamEngine caches information for the document
- * between pages and this will release the cached information. This only needs to be called if processing a new
- * document.
- *
+ * This method must be called between processing documents. The PDFStreamEngine caches
+ * information for the document between pages and this will release the cached information.
+ * This only needs to be called if processing a new document.
*/
public void resetEngine()
{
- validCharCnt = 0;
- totalCharCnt = 0;
+ // overridden in subclasses
}
/**
@@ -201,11 +197,10 @@ public class PDFStreamEngine
* @param cosStream the Stream to execute.
* @param drawingSize the size of the page
* @param rotation the page rotation
- *
* @throws IOException if there is an error accessing the stream.
*/
- public void processStream(PDResources resources, COSStream cosStream, PDRectangle drawingSize, int rotation)
- throws IOException
+ public void processStream(PDResources resources, COSStream cosStream, PDRectangle drawingSize,
+ int rotation) throws IOException
{
initStream(drawingSize, rotation);
processSubStream(resources, cosStream);
@@ -216,7 +211,6 @@ public class PDFStreamEngine
*
* @param resources The resources used when processing the stream.
* @param cosStream The stream to process.
- *
* @throws IOException If there is an exception while processing the stream.
*/
public void processSubStream(PDResources resources, COSStream cosStream) throws IOException
@@ -275,8 +269,8 @@ public class PDFStreamEngine
}
/**
- * A method provided as an event interface to allow a subclass to perform some specific functionality when text
- * needs to be processed.
+ * A method provided as an event interface to allow a subclass to perform some specific
+ * functionality when text needs to be processed.
*
* @param text The text to be processed.
*/
@@ -286,37 +280,22 @@ public class PDFStreamEngine
}
/**
- * A method provided as an event interface to allow a subclass to perform some specific functionality on the string
- * encoded by a glyph.
- *
- * @param str The string to be processed.
- *
- * @return the altered string
- */
- protected String inspectFontEncoding(String str)
- {
- return str;
- }
-
- /**
- * Process encoded text from the PDF Stream. You should override this method if you want to perform an action when
- * encoded text is being processed.
+ * Process encoded text from the PDF Stream. You should override this method if you want to
+ * perform an action when encoded text is being processed.
*
* @param string The encoded text
- *
* @throws IOException If there is an error processing the string
*/
public void processEncodedText(byte[] string) throws IOException
{
- /*
- * Note on variable names. There are three different units being used in this code. Character sizes are given in
- * glyph units, text locations are initially given in text units, and we want to save the data in display units.
- * The variable names should end with Text or Disp to represent if the values are in text or disp units (no
- * glyph units are saved).
- */
+ // Note on variable names. There are three different units being used in this code.
+ // Character sizes are given in glyph units, text locations are initially given in text
+ // units, and we want to save the data in display units. The variable names should end with
+ // Text or Disp to represent if the values are in text or disp units (no glyph units are
+ // saved).
+
final float fontSizeText = graphicsState.getTextState().getFontSize();
- final float horizontalScalingText = graphicsState.getTextState().getHorizontalScalingPercent() / 100f;
- // float verticalScalingText = horizontalScaling;//not sure if this is right but what else to do???
+ final float horizontalScalingText = graphicsState.getTextState().getHorizontalScaling() / 100f;
final float riseText = graphicsState.getTextState().getRise();
final float wordSpacingText = graphicsState.getTextState().getWordSpacing();
final float characterSpacingText = graphicsState.getTextState().getCharacterSpacing();
@@ -327,7 +306,7 @@ public class PDFStreamEngine
// where a single byte will result in two output characters "fi"
final PDFont font = graphicsState.getTextState().getFont();
- // all fonts are providing the width/height of a character in thousandths of a unit of text space
+ // all fonts have the width/height of a character in thousandths of a unit of text space
float fontMatrixXScaling = 1 / 1000f;
float fontMatrixYScaling = 1 / 1000f;
float glyphSpaceToTextSpaceFactor = 1 / 1000f;
@@ -344,9 +323,8 @@ public class PDFStreamEngine
float spaceWidthText = 0;
try
{
- // to avoid crash as described in PDFBOX-614
- // lets see what the space displacement should be
- spaceWidthText = (font.getSpaceWidth() * glyphSpaceToTextSpaceFactor);
+ // to avoid crash as described in PDFBOX-614, see what the space displacement should be
+ spaceWidthText = font.getSpaceWidth() * glyphSpaceToTextSpaceFactor;
}
catch (Throwable exception)
{
@@ -355,9 +333,8 @@ public class PDFStreamEngine
if (spaceWidthText == 0)
{
- spaceWidthText = (font.getAverageFontWidth() * glyphSpaceToTextSpaceFactor);
- // The average space width appears to be higher than necessary
- // so lets make it a little bit smaller.
+ spaceWidthText = font.getAverageFontWidth() * glyphSpaceToTextSpaceFactor;
+ // the average space width appears to be higher than necessary so make it smaller
spaceWidthText *= .80f;
}
if (spaceWidthText == 0)
@@ -380,13 +357,13 @@ public class PDFStreamEngine
Matrix td = new Matrix();
Matrix tempMatrix = new Matrix();
- int codeLength = 1;
+ int codeLength;
for (int i = 0; i < string.length; i += codeLength)
{
// Decode the value to a Unicode character
codeLength = 1;
String c = font.encode(string, i, codeLength);
- int[] codePoints = null;
+ int[] codePoints;
if (c == null && i + 1 < string.length)
{
// maybe a multibyte encoding
@@ -400,19 +377,20 @@ public class PDFStreamEngine
}
// the space width has to be transformed into display units
- float spaceWidthDisp = spaceWidthText * fontSizeText * horizontalScalingText * textMatrix.getXScale()
- * ctm.getXScale();
+ float spaceWidthDisp = spaceWidthText * fontSizeText * horizontalScalingText *
+ textMatrix.getXScale() * ctm.getXScale();
- // todo, handle horizontal displacement
+ // TODO: handle horizontal displacement
// get the width and height of this character in text units
- float characterHorizontalDisplacementText = font.getFontWidth(string, i, codeLength);
- float characterVerticalDisplacementText = font.getFontHeight(string, i, codeLength);
+ float charHorizontalDisplacementText = font.getFontWidth(string, i, codeLength);
+ float charVerticalDisplacementText = font.getFontHeight(string, i, codeLength);
// multiply the width/height with the scaling factor
- characterHorizontalDisplacementText = characterHorizontalDisplacementText * fontMatrixXScaling;
- characterVerticalDisplacementText = characterVerticalDisplacementText * fontMatrixYScaling;
+ charHorizontalDisplacementText = charHorizontalDisplacementText * fontMatrixXScaling;
+ charVerticalDisplacementText = charVerticalDisplacementText * fontMatrixYScaling;
- maxVerticalDisplacementText = Math.max(maxVerticalDisplacementText, characterVerticalDisplacementText);
+ maxVerticalDisplacementText = Math.max(maxVerticalDisplacementText,
+ charVerticalDisplacementText);
// PDF Spec - 5.5.2 Word Spacing
//
@@ -433,18 +411,19 @@ public class PDFStreamEngine
// code 32 non-space resulted in errors consistent with this interpretation.
//
float spacingText = 0;
- if ((string[i] == 0x20) && codeLength == 1)
+ if (string[i] == 0x20 && codeLength == 1)
{
spacingText += wordSpacingText;
}
textMatrix.multiply(ctm, textXctm);
// Convert textMatrix to display units
- // We need to instantiate a new Matrix instance here as it is passed to the TextPosition constructor below.
+ // We need to instantiate a new Matrix instance here as it is passed to the TextPosition
+ // constructor below
Matrix textMatrixStart = textStateParameters.multiply(textXctm);
- // TODO : tx should be set for horizontal text and ty for vertical text
+ // TODO: tx should be set for horizontal text and ty for vertical text
// which seems to be specified in the font (not the direction in the matrix).
- float tx = ((characterHorizontalDisplacementText) * fontSizeText) * horizontalScalingText;
+ float tx = charHorizontalDisplacementText * fontSizeText * horizontalScalingText;
float ty = 0;
// reset the matrix instead of creating a new one
td.reset();
@@ -462,8 +441,8 @@ public class PDFStreamEngine
final float endYPosition = textMatrixEnd.getYPosition();
// add some spacing to the text matrix (see comment above)
- tx = ((characterHorizontalDisplacementText) * fontSizeText + characterSpacingText + spacingText)
- * horizontalScalingText;
+ tx = (charHorizontalDisplacementText * fontSizeText + characterSpacingText +
+ spacingText) * horizontalScalingText;
td.setValue(2, 0, tx);
td.multiply(textMatrix, textMatrix);
@@ -472,27 +451,20 @@ public class PDFStreamEngine
float startXPosition = textMatrixStart.getXPosition();
float widthText = endXPosition - startXPosition;
- // there are several cases where one character code will
- // output multiple characters. For example "fi" or a
- // glyphname that has no mapping like "visiblespace"
- if (c != null)
- {
- validCharCnt++;
- }
- else
+ // PDFBOX-373: Replace a null entry with "?" so it is not printed as "(null)"
+ if (c == null)
{
- // PDFBOX-373: Replace a null entry with "?" so it is
- // not printed as "(null)"
c = "?";
}
- totalCharCnt++;
- float totalVerticalDisplacementDisp = maxVerticalDisplacementText * fontSizeText * textXctm.getYScale();
+ float totalVerticalDisplacementDisp = maxVerticalDisplacementText * fontSizeText *
+ textXctm.getYScale();
// process the decoded text
- processTextPosition(new TextPosition(pageRotation, pageWidth, pageHeight, textMatrixStart, endXPosition,
- endYPosition, totalVerticalDisplacementDisp, widthText, spaceWidthDisp, c, codePoints, font,
- fontSizeText, (int) (fontSizeText * textMatrix.getXScale())));
+ processTextPosition(new TextPosition(pageRotation, pageWidth, pageHeight,
+ textMatrixStart, endXPosition, endYPosition, totalVerticalDisplacementDisp,
+ widthText, spaceWidthDisp, c, codePoints, font, fontSizeText,
+ (int)(fontSizeText * textMatrix.getXScale())));
}
}
@@ -501,15 +473,14 @@ public class PDFStreamEngine
*
* @param operation The operation to perform.
* @param arguments The list of arguments.
- *
* @throws IOException If there is an error processing the operation.
*/
public void processOperator(String operation, List<COSBase> arguments) throws IOException
{
try
{
- PDFOperator oper = PDFOperator.getOperator(operation);
- processOperator(oper, arguments);
+ PDFOperator operator = PDFOperator.getOperator(operation);
+ processOperator(operator, arguments);
}
catch (IOException e)
{
@@ -522,7 +493,6 @@ public class PDFStreamEngine
*
* @param operator The operation to perform.
* @param arguments The list of arguments.
- *
* @throws IOException If there is an error processing the operation.
*/
protected void processOperator(PDFOperator operator, List<COSBase> arguments) throws IOException
@@ -545,35 +515,7 @@ public class PDFStreamEngine
}
/**
- * Transforms the given point using the current transformation matrix
- *
- * @param x x-coordinate of the point to be transformed
- * @param y y-coordinate of the point to be transformed
- * @return the transformed point
- */
- /* public Point2D.Double transformPoint(double x, double y)
- {
- double[] position = { x, y };
- Matrix ctm = graphicsState.getCurrentTransformationMatrix();
- ctm.createAffineTransform().transform(position, 0, position, 0, 1);
- return new Point2D.Double(position[0], position[1]);
- }*/
-
- /**
- * Transforms the given width using the current transformation matrix
- *
- * @param width the width to be transformed
- * @return the transformed width
- */
- /*public double transformWidth(double width) {
- Matrix ctm = graphicsState.getCurrentTransformationMatrix();
- double x = ctm.getValue(0, 0) + ctm.getValue(1, 0);
- double y = ctm.getValue(0, 1) + ctm.getValue(1, 1);
- return width * Math.sqrt(0.5 * (x * x + y * y));
- }*/
-
- /**
- * @return Returns the colorSpaces.
+ * @return Returns the XObjects.
*/
public Map<String, PDXObject> getXObjects()
{
@@ -581,14 +523,6 @@ public class PDFStreamEngine
}
/**
- * @param value The colorSpaces to set.
- */
- public void setColorSpaces(Map<String, PDColorSpace> value)
- {
- streamResourcesStack.peek().setColorSpaces(value);
- }
-
- /**
* @return Returns the fonts.
*/
public Map<String, PDFont> getFonts()
@@ -706,31 +640,10 @@ public class PDFStreamEngine
}
/**
- * Get the total number of valid characters in the doc that could be decoded in processEncodedText().
- *
- * @return The number of valid characters.
- */
- public int getValidCharCnt()
- {
- return validCharCnt;
- }
-
- /**
- * Get the total number of characters in the doc (including ones that could not be mapped).
- *
- * @return The number of characters.
- */
- public int getTotalCharCnt()
- {
- return totalCharCnt;
- }
-
- /**
* Remove all cached resources.
*/
public void dispose()
{
- resetEngine();
drawingRectangle = null;
graphicsState = null;
textLineMatrix = null;
@@ -750,9 +663,6 @@ public class PDFStreamEngine
operators.clear();
operators = null;
}
- if (unsupportedOperators != null)
- {
- unsupportedOperators.clear();
- }
+ unsupportedOperators.clear();
}
}
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1603056&r1=1603055&r2=1603056&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Tue Jun 17 05:10:23 2014
@@ -53,71 +53,65 @@ import org.apache.pdfbox.text.TextPositi
/**
* This class will take a pdf document and strip out all of the text and ignore the
* formatting and such. Please note; it is up to clients of this class to verify that
- * a specific user has the correct permissions to extract text from the
- * PDF document.
+ * a specific user has the correct permissions to extract text from the PDF document.
*
* The basic flow of this process is that we get a document and use a series of
* processXXX() functions that work on smaller and smaller chunks of the page.
* Eventually, we fully process each page and then print it.
*
- * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
- *
+ * @author Ben Litchfield
*/
public class PDFTextStripper extends PDFStreamEngine
{
-
- private static final String thisClassName = PDFTextStripper.class.getSimpleName().toLowerCase();
-
private static float DEFAULT_INDENT_THRESHOLD = 2.0f;
private static float DEFAULT_DROP_THRESHOLD = 2.5f;
- //enable the ability to set the default indent/drop thresholds
- //with -D system properties:
+ // enable the ability to set the default indent/drop thresholds
+ // with -D system properties:
// pdftextstripper.indent
// pdftextstripper.drop
static
{
- String sdrop = null, sindent = null;
+ String strDrop = null, strIndent = null;
try
{
- String prop = thisClassName + ".indent";
- sindent = System.getProperty(prop);
- prop = thisClassName + ".drop";
- sdrop = System.getProperty(prop);
+ String className = PDFTextStripper.class.getSimpleName().toLowerCase();
+ String prop = className + ".indent";
+ strIndent = System.getProperty(prop);
+ prop = className + ".drop";
+ strDrop = System.getProperty(prop);
}
catch (SecurityException e)
{
// PDFBOX-1946 when run in an applet
// ignore and use default
}
- if (sindent != null && sindent.length() > 0)
+ if (strIndent != null && strIndent.length() > 0)
{
try
{
- float f = Float.parseFloat(sindent);
- DEFAULT_INDENT_THRESHOLD = f;
+ DEFAULT_INDENT_THRESHOLD = Float.parseFloat(strIndent);
}
catch (NumberFormatException nfe)
{
- //ignore and use default
+ // ignore and use default
}
}
- if (sdrop != null && sdrop.length() > 0)
+ if (strDrop != null && strDrop.length() > 0)
{
try
{
- float f = Float.parseFloat(sdrop);
- DEFAULT_DROP_THRESHOLD = f;
+ DEFAULT_DROP_THRESHOLD = Float.parseFloat(strDrop);
}
catch (NumberFormatException nfe)
{
- //ignore and use default
+ // ignore and use default
}
}
}
/**
- * The platforms line separator.
+ * The platform's line separator.
*/
protected final String systemLineSeparator = System.getProperty("line.separator");
@@ -146,12 +140,12 @@ public class PDFTextStripper extends PDF
private float indentThreshold = DEFAULT_INDENT_THRESHOLD;
private float dropThreshold = DEFAULT_DROP_THRESHOLD;
- // We will need to estimate where to add spaces.
- // These are used to help guess.
+ // We will need to estimate where to add spaces. These are used to help guess.
private float spacingTolerance = .5f;
private float averageCharTolerance = .3f;
private List<PDThreadBead> pageArticles = null;
+
/**
* The charactersByArticle is used to extract text by article divisions. For example
* a PDF that has two columns like a newspaper, we want to extract the first column and
@@ -172,18 +166,8 @@ public class PDFTextStripper extends PDF
private Map<String, TreeMap<Float, TreeSet<Float>>> characterListMapping =
new HashMap<String, TreeMap<Float, TreeSet<Float>>>();
- /**
- * encoding that text will be written in (or null).
- */
- protected String outputEncoding;
-
- /**
- * The document to read.
- */
+ protected String outputEncoding;
protected PDDocument document;
- /**
- * The stream to write the output to.
- */
protected Writer output;
/**
@@ -193,16 +177,14 @@ public class PDFTextStripper extends PDF
private TextNormalize normalize = null;
/**
- * True if we started a paragraph but haven't ended it
- * yet.
+ * True if we started a paragraph but haven't ended it yet.
*/
private boolean inParagraph;
/**
* Instantiate a new PDFTextStripper object. This object will load
* properties from PDFTextStripper.properties and will not do
- * anything special to convert the text to a more encoding-specific
- * output.
+ * anything special to convert the text to a more encoding-specific output.
*
* @throws IOException If there is an error loading the properties.
*/
@@ -261,25 +243,9 @@ public class PDFTextStripper extends PDF
return outputStream.toString();
}
- /**
- * @deprecated
- * @see PDFTextStripper#getText( PDDocument )
- * @param doc The document to extract the text from.
- * @return The document text.
- * @throws IOException If there is an error extracting the text.
- */
- public String getText( COSDocument doc ) throws IOException
- {
- return getText( new PDDocument( doc ) );
- }
-
- /**
- * {@inheritDoc}
- */
@Override
public void resetEngine()
{
- super.resetEngine();
currentPageNo = 0;
document = null;
if (charactersByArticle != null)
@@ -323,7 +289,6 @@ public class PDFTextStripper extends PDF
// password (such a document appears to not be encrypted by
// someone viewing the document, thus the confusion). We will
// attempt to decrypt with the empty password to handle this case.
- //
try
{
StandardDecryptionMaterial sdm = new StandardDecryptionMaterial("");
@@ -360,9 +325,9 @@ public class PDFTextStripper extends PDF
endBookmarkPageNumber == -1 && endBookmark != null &&
startBookmark.getCOSObject() == endBookmark.getCOSObject() )
{
- //this is a special case where both the start and end bookmark
- //are the same but point to nothing. In this case
- //we will not extract any text.
+ // this is a special case where both the start and end bookmark
+ // are the same but point to nothing. In this case
+ // we will not extract any text.
startBookmarkPageNumber = 0;
endBookmarkPageNumber = 0;
}
@@ -380,19 +345,20 @@ public class PDFTextStripper extends PDF
}
}
- private int getPageNumber( PDOutlineItem bookmark, List<COSObjectable> allPages ) throws IOException
+ private int getPageNumber( PDOutlineItem bookmark, List<COSObjectable> allPages )
+ throws IOException
{
int pageNumber = -1;
PDPage page = bookmark.findDestinationPage( document );
if( page != null )
{
- pageNumber = allPages.indexOf( page )+1;//use one based indexing
+ pageNumber = allPages.indexOf( page ) + 1; // use one based indexing
}
return pageNumber;
}
/**
- * This method is available for subclasses of this class. It will be called before processing
+ * This method is available for subclasses of this class. It will be called before processing
* of the document starts.
*
* @param pdf The PDF document that is being processed.
@@ -404,7 +370,7 @@ public class PDFTextStripper extends PDF
}
/**
- * This method is available for subclasses of this class. It will be called after processing
+ * This method is available for subclasses of this class. It will be called after processing
* of the document finishes.
*
* @param pdf The PDF document that is being processed.
@@ -442,7 +408,7 @@ public class PDFTextStripper extends PDF
{
if( numberOfArticleSections < originalSize )
{
- ((List<TextPosition>)charactersByArticle.get( i )).clear();
+ charactersByArticle.get( i ).clear();
}
else
{
@@ -505,7 +471,7 @@ public class PDFTextStripper extends PDF
*/
protected void startPage( PDPage page ) throws IOException
{
- //default is to do nothing.
+ // default is to do nothing
}
/**
@@ -518,15 +484,15 @@ public class PDFTextStripper extends PDF
*/
protected void endPage( PDPage page ) throws IOException
{
- //default is to do nothing
+ // default is to do nothing
}
- private static final float ENDOFLASTTEXTX_RESET_VALUE = -1;
- private static final float MAXYFORLINE_RESET_VALUE = -Float.MAX_VALUE;
- private static final float EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE = -Float.MAX_VALUE;
- private static final float MAXHEIGHTFORLINE_RESET_VALUE = -1;
- private static final float MINYTOPFORLINE_RESET_VALUE = Float.MAX_VALUE;
- private static final float LASTWORDSPACING_RESET_VALUE = -1;
+ private static final float END_OF_LAST_TEXT_X_RESET_VALUE = -1;
+ private static final float MAX_Y_FOR_LINE_RESET_VALUE = -Float.MAX_VALUE;
+ private static final float EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE = -Float.MAX_VALUE;
+ private static final float MAX_HEIGHT_FOR_LINE_RESET_VALUE = -1;
+ private static final float MIN_Y_TOP_FOR_LINE_RESET_VALUE = Float.MAX_VALUE;
+ private static final float LAST_WORD_SPACING_RESET_VALUE = -1;
/**
* This will print the text of the processed page to "output".
@@ -538,16 +504,16 @@ public class PDFTextStripper extends PDF
*/
protected void writePage() throws IOException
{
- float maxYForLine = MAXYFORLINE_RESET_VALUE;
- float minYTopForLine = MINYTOPFORLINE_RESET_VALUE;
- float endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
- float lastWordSpacing = LASTWORDSPACING_RESET_VALUE;
- float maxHeightForLine = MAXHEIGHTFORLINE_RESET_VALUE;
+ float maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
+ float minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
+ float endOfLastTextX = END_OF_LAST_TEXT_X_RESET_VALUE;
+ float lastWordSpacing = LAST_WORD_SPACING_RESET_VALUE;
+ float maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
PositionWrapper lastPosition = null;
PositionWrapper lastLineStartPosition = null;
- boolean startOfPage = true;//flag to indicate start of page
- boolean startOfArticle = true;
+ boolean startOfPage = true; // flag to indicate start of page
+ boolean startOfArticle;
if(charactersByArticle.size() > 0)
{
writePageStart();
@@ -562,80 +528,80 @@ public class PDFTextStripper extends PDF
Collections.sort( textList, comparator );
}
Iterator<TextPosition> textIter = textList.iterator();
- /* Before we can display the text, we need to do some normalizing.
- * Arabic and Hebrew text is right to left and is typically stored
- * in its logical format, which means that the rightmost character is
- * stored first, followed by the second character from the right etc.
- * However, PDF stores the text in presentation form, which is left to
- * right. We need to do some normalization to convert the PDF data to
- * the proper logical output format.
- *
- * Note that if we did not sort the text, then the output of reversing the
- * text is undefined and can sometimes produce worse output then not trying
- * to reverse the order. Sorting should be done for these languages.
- * */
-
- /* First step is to determine if we have any right to left text, and
- * if so, is it dominant. */
- int ltrCnt = 0;
- int rtlCnt = 0;
+ // Before we can display the text, we need to do some normalizing.
+ // Arabic and Hebrew text is right to left and is typically stored
+ // in its logical format, which means that the rightmost character is
+ // stored first, followed by the second character from the right etc.
+ // However, PDF stores the text in presentation form, which is left to
+ // right. We need to do some normalization to convert the PDF data to
+ // the proper logical output format.
+ //
+ // Note that if we did not sort the text, then the output of reversing the
+ // text is undefined and can sometimes produce worse output than not trying
+ // to reverse the order. Sorting should be done for these languages.
+
+ // First step is to determine if we have any right to left text, and
+ // if so, is it dominant.
+ int ltrCount = 0;
+ int rtlCount = 0;
while( textIter.hasNext() )
{
- TextPosition position = (TextPosition)textIter.next();
+ TextPosition position = textIter.next();
String stringValue = position.getCharacter();
for (int a = 0; a < stringValue.length(); a++)
{
byte dir = Character.getDirectionality(stringValue.charAt(a));
- if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ) ||
- (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING) ||
- (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE ))
+ if (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ||
+ dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING ||
+ dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE)
{
- ltrCnt++;
+ ltrCount++;
}
- else if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT ) ||
- (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC) ||
- (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING) ||
- (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE ))
+ else if (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT ||
+ dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC ||
+ dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING ||
+ dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)
{
- rtlCnt++;
+ rtlCount++;
}
}
}
// choose the dominant direction
- boolean isRtlDominant = rtlCnt > ltrCnt;
+ boolean isRtlDominant = rtlCount > ltrCount;
startArticle(!isRtlDominant);
startOfArticle = true;
// we will later use this to skip reordering
- boolean hasRtl = rtlCnt > 0;
+ boolean hasRtl = rtlCount > 0;
- /* Now cycle through to print the text.
- * We queue up a line at a time before we print so that we can convert
- * the line from presentation form to logical form (if needed).
- */
+ // Now cycle through to print the text.
+ // We queue up a line at a time before we print so that we can convert
+ // the line from presentation form to logical form (if needed).
List<TextPosition> line = new ArrayList<TextPosition>();
textIter = textList.iterator(); // start from the beginning again
- /* PDF files don't always store spaces. We will need to guess where we should add
- * spaces based on the distances between TextPositions. Historically, this was done
- * based on the size of the space character provided by the font. In general, this worked
- * but there were cases where it did not work. Calculating the average character width
- * and using that as a metric works better in some cases but fails in some cases where the
- * spacing worked. So we use both. NOTE: Adobe reader also fails on some of these examples.
- */
- //Keeps track of the previous average character width
+ // PDF files don't always store spaces. We will need to guess where we should add
+ // spaces based on the distances between TextPositions. Historically, this was done
+ // based on the size of the space character provided by the font. In general, this
+ // worked but there were cases where it did not work. Calculating the average character
+ // width and using that as a metric works better in some cases but fails in some cases
+ // where the spacing worked. So we use both. NOTE: Adobe reader also fails on some of
+ // these examples.
+
+ // Keeps track of the previous average character width
float previousAveCharWidth = -1;
while( textIter.hasNext() )
{
- TextPosition position = (TextPosition)textIter.next();
+ TextPosition position = textIter.next();
PositionWrapper current = new PositionWrapper(position);
String characterValue = position.getCharacter();
//Resets the average character width when we see a change in font
// or a change in the font size
- if(lastPosition != null && ((position.getFont() != lastPosition.getTextPosition().getFont())
- || (position.getFontSize() != lastPosition.getTextPosition().getFontSize())))
+ if(lastPosition != null &&
+ (position.getFont() != lastPosition.getTextPosition().getFont() ||
+ position.getFontSize() != lastPosition.getTextPosition().getFontSize()))
{
previousAveCharWidth = -1;
}
@@ -645,8 +611,8 @@ public class PDFTextStripper extends PDF
float positionWidth;
float positionHeight;
- /* If we are sorting, then we need to use the text direction
- * adjusted coordinates, because they were used in the sorting. */
+ // If we are sorting, then we need to use the text direction
+ // adjusted coordinates, because they were used in the sorting.
if (getSortByPosition())
{
positionX = position.getXDirAdj();
@@ -665,11 +631,11 @@ public class PDFTextStripper extends PDF
//The current amount of characters in a word
int wordCharCount = position.getIndividualWidths().length;
- /* Estimate the expected width of the space based on the
- * space character with some margin. */
+ // Estimate the expected width of the space based on the
+ // space character with some margin.
float wordSpacing = position.getWidthOfSpace();
- float deltaSpace = 0;
- if ((wordSpacing == 0) || (wordSpacing == Float.NaN))
+ float deltaSpace;
+ if (wordSpacing == 0 || wordSpacing == Float.NaN)
{
deltaSpace = Float.MAX_VALUE;
}
@@ -677,34 +643,33 @@ public class PDFTextStripper extends PDF
{
if( lastWordSpacing < 0 )
{
- deltaSpace = (wordSpacing * getSpacingTolerance());
+ deltaSpace = wordSpacing * getSpacingTolerance();
}
else
{
- deltaSpace = (((wordSpacing+lastWordSpacing)/2f)* getSpacingTolerance());
+ deltaSpace = (wordSpacing + lastWordSpacing) / 2f * getSpacingTolerance();
}
}
- /* Estimate the expected width of the space based on the
- * average character width with some margin. This calculation does not
- * make a true average (average of averages) but we found that it gave the
- * best results after numerous experiments. Based on experiments we also found that
- * .3 worked well. */
+ // Estimate the expected width of the space based on the average character width
+ // with some margin. This calculation does not make a true average (average of
+ // averages) but we found that it gave the best results after numerous experiments.
+ // Based on experiments we also found that .3 worked well.
float averageCharWidth = -1;
if(previousAveCharWidth < 0)
{
- averageCharWidth = (positionWidth/wordCharCount);
+ averageCharWidth = positionWidth / wordCharCount;
}
else
{
- averageCharWidth = (previousAveCharWidth + (positionWidth/wordCharCount))/2f;
+ averageCharWidth = (previousAveCharWidth + positionWidth / wordCharCount) / 2f;
}
- float deltaCharWidth = (averageCharWidth * getAverageCharTolerance());
+ float deltaCharWidth = averageCharWidth * getAverageCharTolerance();
- //Compares the values obtained by the average method and the wordSpacing method and picks
- //the smaller number.
- float expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
- if(endOfLastTextX != ENDOFLASTTEXTX_RESET_VALUE)
+ // Compares the values obtained by the average method and the wordSpacing method
+ // and picks the smaller number.
+ float expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
+ if(endOfLastTextX != END_OF_LAST_TEXT_X_RESET_VALUE)
{
if(deltaCharWidth > deltaSpace)
{
@@ -729,25 +694,27 @@ public class PDFTextStripper extends PDF
// Text must overlap with the last rendered baseline text by at least
// a small amount in order to be considered as being on the same line.
- /* XXX BC: In theory, this check should really check if the next char is in full range
- * seen in this line. This is what I tried to do with minYTopForLine, but this caused a lot
- * of regression test failures. So, I'm leaving it be for now. */
+ // XXX BC: In theory, this check should really check if the next char is in
+ // the full range seen in this line. This is what I tried to do with minYTopForLine,
+ // but this caused a lot of regression test failures. So, I'm leaving it be for
+ // now.
if(!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine))
{
writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
line.clear();
lastLineStartPosition =
- handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
- endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
- expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
- maxYForLine = MAXYFORLINE_RESET_VALUE;
- maxHeightForLine = MAXHEIGHTFORLINE_RESET_VALUE;
- minYTopForLine = MINYTOPFORLINE_RESET_VALUE;
+ handleLineSeparation(current, lastPosition, lastLineStartPosition,
+ maxHeightForLine);
+ endOfLastTextX = END_OF_LAST_TEXT_X_RESET_VALUE;
+ expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
+ maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
+ maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
+ minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
}
- //Test if our TextPosition starts after a new word would be expected to start.
- if (expectedStartOfNextWordX != EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE
+ // test if our TextPosition starts after a new word would be expected to start
+ if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE
&& expectedStartOfNextWordX < positionX &&
- //only bother adding a space if the last character was not a space
+ // only bother adding a space if the last character was not a space
lastPosition.getTextPosition().getCharacter() != null &&
!lastPosition.getTextPosition().getCharacter().endsWith( " " ) )
{
@@ -797,8 +764,8 @@ public class PDFTextStripper extends PDF
private boolean overlap( float y1, float height1, float y2, float height2 )
{
- return within( y1, y2, .1f) || (y2 <= y1 && y2 >= y1-height1) ||
- (y1 <= y2 && y1 >= y2-height2);
+ return within( y1, y2, .1f) || y2 <= y1 && y2 >= y1 - height1 ||
+ y1 <= y2 && y1 >= y2 - height2;
}
/**
@@ -847,8 +814,8 @@ public class PDFTextStripper extends PDF
}
/**
- * Write a Java string to the output stream. The default implementation will ignore the <code>textPositions</code>
- * and just calls {@link #writeString(String)}.
+ * Write a Java string to the output stream. The default implementation will ignore the
+ * <code>textPositions</code> and just calls {@link #writeString(String)}.
*
* @param text The text to write to the stream.
* @param textPositions The TextPositions belonging to the text.
@@ -899,7 +866,8 @@ public class PDFTextStripper extends PDF
String textCharacter = text.getCharacter();
float textX = text.getX();
float textY = text.getY();
- TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping.get( textCharacter );
+ TreeMap<Float, TreeSet<Float>> sameTextCharacters =
+ characterListMapping.get( textCharacter );
if( sameTextCharacters == null )
{
sameTextCharacters = new TreeMap<Float, TreeSet<Float>>();
@@ -915,9 +883,8 @@ public class PDFTextStripper extends PDF
// the TJ just backs up to compensate after each character). Also, we subtract
// an amount to allow for kerning (a percentage of the width of the last
// character).
- //
boolean suppressCharacter = false;
- float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
+ float tolerance = text.getWidth()/textCharacter.length() / 3.0f;
SortedMap<Float, TreeSet<Float>> xMatches =
sameTextCharacters.subMap(textX - tolerance, textX + tolerance );
@@ -945,8 +912,7 @@ public class PDFTextStripper extends PDF
}
if( showCharacter )
{
- //if we are showing the character then we need to determine which
- //article it belongs to.
+ // if we are showing the character then we need to determine which article it belongs to
int foundArticleDivisionIndex = -1;
int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
int notFoundButFirstLeftArticleDivisionIndex = -1;
@@ -957,7 +923,7 @@ public class PDFTextStripper extends PDF
{
for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
{
- PDThreadBead bead = (PDThreadBead)pageArticles.get( i );
+ PDThreadBead bead = pageArticles.get( i );
if( bead != null )
{
PDRectangle rect = bead.getRectangle();
@@ -1014,31 +980,30 @@ public class PDFTextStripper extends PDF
articleDivisionIndex = charactersByArticle.size()-1;
}
- List<TextPosition> textList = (List<TextPosition>) charactersByArticle.get( articleDivisionIndex );
+ List<TextPosition> textList = charactersByArticle.get( articleDivisionIndex );
- /* In the wild, some PDF encoded documents put diacritics (accents on
- * top of characters) into a separate Tj element. When displaying them
- * graphically, the two chunks get overlayed. With text output though,
- * we need to do the overlay. This code recombines the diacritic with
- * its associated character if the two are consecutive.
- */
+ // In the wild, some PDF encoded documents put diacritics (accents on
+ // top of characters) into a separate Tj element. When displaying them
+ // graphically, the two chunks get overlaid. With text output though,
+ // we need to do the overlay. This code recombines the diacritic with
+ // its associated character if the two are consecutive.
if(textList.isEmpty())
{
textList.add(text);
}
else
{
- /* test if we overlap the previous entry.
- * Note that we are making an assumption that we need to only look back
- * one TextPosition to find what we are overlapping.
- * This may not always be true. */
- TextPosition previousTextPosition = (TextPosition)textList.get(textList.size()-1);
+ // test if we overlap the previous entry.
+ // Note that we are making an assumption that we need to only look back
+ // one TextPosition to find what we are overlapping.
+ // This may not always be true.
+ TextPosition previousTextPosition = textList.get(textList.size()-1);
if(text.isDiacritic() && previousTextPosition.contains(text))
{
previousTextPosition.mergeDiacritic(text, normalize);
}
- /* If the previous TextPosition was the diacritic, merge it into this
- * one and remove it from the list. */
+ // If the previous TextPosition was the diacritic, merge it into this
+ // one and remove it from the list.
else if(previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
{
text.mergeDiacritic(previousTextPosition, normalize);
@@ -1231,7 +1196,8 @@ public class PDFTextStripper extends PDF
}
/**
- * Set if the text stripper should group the text output by a list of beads. The default value is true!
+ * Set if the text stripper should group the text output by a list of beads.
+ * The default value is true!
*
* @param aShouldSeparateByBeads The new grouping of beads.
*/
@@ -1241,7 +1207,7 @@ public class PDFTextStripper extends PDF
}
/**
- * Get the bookmark where text extraction should end, inclusive. Default is null.
+ * Get the bookmark where text extraction should end, inclusive. Default is null.
*
* @return The ending bookmark.
*/
@@ -1546,57 +1512,20 @@ public class PDFTextStripper extends PDF
articleEnd = articleEndValue;
}
-
- /**
- * Reverse characters of a compound Arabic glyph.
- * When getSortByPosition() is true, inspect the sequence encoded
- * by one glyph. If the glyph encodes two or more Arabic characters,
- * reverse these characters from a logical order to a visual order.
- * This ensures that the bidirectional algorithm that runs later will
- * convert them back to a logical order.
- *
- * @param str a string obtained from font.encoding()
- *
- * @return the reversed string
- */
- @Override
- public String inspectFontEncoding(String str)
- {
- if (!sortByPosition || str == null || str.length() < 2)
- {
- return str;
- }
- for (int i = 0; i < str.length(); ++i)
- {
- if (Character.getDirectionality(str.charAt(i))
- != Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
- {
- return str;
- }
- }
- StringBuilder reversed = new StringBuilder(str.length());
- for (int i = str.length() - 1; i >= 0; --i)
- {
- reversed.append(str.charAt(i));
- }
- return reversed.toString();
- }
-
/**
* handles the line separator for a new line given
* the specified current and previous TextPositions.
* @param current the current text position
* @param lastPosition the previous text position
- * @param lastLineStartPosition the last text position that followed a line
- * separator.
+ * @param lastLineStartPosition the last text position that followed a line separator.
* @param maxHeightForLine max height for positions since lastLineStartPosition
* @return start position of the last line
* @throws IOException if something went wrong
*/
protected PositionWrapper handleLineSeparation(PositionWrapper current,
- PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, float maxHeightForLine)
- throws IOException
- {
+ PositionWrapper lastPosition, PositionWrapper lastLineStartPosition,
+ float maxHeightForLine) throws IOException
+ {
current.setLineStart();
isParagraphSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
lastLineStartPosition = current;
@@ -1645,7 +1574,8 @@ public class PDFTextStripper extends PDF
* @param maxHeightForLine max height for text positions since lasLineStartPosition.
*/
protected void isParagraphSeparation(PositionWrapper position,
- PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, float maxHeightForLine)
+ PositionWrapper lastPosition, PositionWrapper lastLineStartPosition,
+ float maxHeightForLine)
{
boolean result = false;
if(lastLineStartPosition == null)
@@ -1656,15 +1586,15 @@ public class PDFTextStripper extends PDF
{
float yGap = Math.abs(position.getTextPosition().getYDirAdj()-
lastPosition.getTextPosition().getYDirAdj());
- float xGap = (position.getTextPosition().getXDirAdj()-
- lastLineStartPosition.getTextPosition().getXDirAdj());//do we need to flip this for rtl?
- if(yGap > (getDropThreshold()*maxHeightForLine))
+ float xGap = position.getTextPosition().getXDirAdj()-
+ lastLineStartPosition.getTextPosition().getXDirAdj();//do we need to flip this for rtl?
+ if(yGap > getDropThreshold()*maxHeightForLine)
{
- result = true;
+ result = true;
}
- else if(xGap > (getIndentThreshold()*position.getTextPosition().getWidthOfSpace()))
+ else if(xGap > getIndentThreshold()*position.getTextPosition().getWidthOfSpace())
{
- //text is indented, but try to screen for hanging indent
+ // text is indented, but try to screen for hanging indent
if(!lastLineStartPosition.isParagraphStart())
{
result = true;
@@ -1676,24 +1606,24 @@ public class PDFTextStripper extends PDF
}
else if(xGap < -position.getTextPosition().getWidthOfSpace())
{
- //text is left of previous line. Was it a hanging indent?
+ // text is left of previous line. Was it a hanging indent?
if(!lastLineStartPosition.isParagraphStart())
{
- result = true;
+ result = true;
}
}
- else if(Math.abs(xGap) < (0.25 * position.getTextPosition().getWidth()))
+ else if(Math.abs(xGap) < 0.25 * position.getTextPosition().getWidth())
{
- //current horizontal position is within 1/4 a char of the last
- //linestart. We'll treat them as lined up.
+ // current horizontal position is within 1/4 a char of the last
+ // linestart. We'll treat them as lined up.
if(lastLineStartPosition.isHangingIndent())
{
position.setHangingIndent();
}
else if(lastLineStartPosition.isParagraphStart())
{
- //check to see if the previous line looks like
- //any of a number of standard list item formats
+ // check to see if the previous line looks like
+ // any of a number of standard list item formats
Pattern liPattern = matchListItemPattern(lastLineStartPosition);
if(liPattern!=null)
{
@@ -1801,7 +1731,6 @@ public class PDFTextStripper extends PDF
"[a-z]\\)",
"[IVXL]+\\.",
"[ivxl]+\\.",
-
};
private List<Pattern> listOfPatterns = null;
@@ -1863,7 +1792,7 @@ public class PDFTextStripper extends PDF
* @param patterns list of patterns
* @return matching pattern
*/
- protected static final Pattern matchPattern(String string, List<Pattern> patterns)
+ protected static Pattern matchPattern(String string, List<Pattern> patterns)
{
Pattern matchedPattern = null;
for(Pattern p : patterns)
@@ -1882,7 +1811,8 @@ public class PDFTextStripper extends PDF
* @param isRtlDominant determines if rtl or ltl is dominant
* @throws IOException if something went wrong
*/
- private void writeLine(List<WordWithTextPositions> line, boolean isRtlDominant) throws IOException
+ private void writeLine(List<WordWithTextPositions> line, boolean isRtlDominant)
+ throws IOException
{
int numberOfStrings = line.size();
for(int i=0; i<numberOfStrings; i++)
@@ -1903,7 +1833,8 @@ public class PDFTextStripper extends PDF
* @param hasRtl determines if lines contains rtl formatted text(parts)
* @return a list of strings, one string for every word
*/
- private List<WordWithTextPositions> normalize(List<TextPosition> line, boolean isRtlDominant, boolean hasRtl)
+ private List<WordWithTextPositions> normalize(List<TextPosition> line, boolean isRtlDominant,
+ boolean hasRtl)
{
LinkedList<WordWithTextPositions> normalized = new LinkedList<WordWithTextPositions>();
StringBuilder lineBuilder = new StringBuilder();
@@ -1932,8 +1863,8 @@ public class PDFTextStripper extends PDF
}
/**
- * Used within {@link #normalize(List, boolean, boolean)} to create a single {@link WordWithTextPositions}
- * entry.
+ * Used within {@link #normalize(List, boolean, boolean)} to create a single
+ * {@link WordWithTextPositions} entry.
*/
private WordWithTextPositions createWord(String word, List<TextPosition> wordPositions)
{
@@ -1949,7 +1880,8 @@ public class PDFTextStripper extends PDF
{
if (text instanceof WordSeparator)
{
- normalized.add(createWord(lineBuilder.toString(), new ArrayList<TextPosition>(wordPositions)));
+ normalized.add(createWord(lineBuilder.toString(),
+ new ArrayList<TextPosition>(wordPositions)));
lineBuilder = new StringBuilder();
wordPositions.clear();
}
@@ -1962,10 +1894,7 @@ public class PDFTextStripper extends PDF
}
/**
- * internal marker class. Used as a place holder in
- * a line of TextPositions.
- * @author ME21969
- *
+ * internal marker class. Used as a place holder in a line of TextPositions.
*/
private static final class WordSeparator extends TextPosition
{
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/SetHorizontalTextScaling.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/SetHorizontalTextScaling.java?rev=1603056&r1=1603055&r2=1603056&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/SetHorizontalTextScaling.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/SetHorizontalTextScaling.java Tue Jun 17 05:10:23 2014
@@ -43,6 +43,6 @@ public class SetHorizontalTextScaling ex
public void process(PDFOperator operator, List<COSBase> arguments) throws IOException
{
COSNumber scaling = (COSNumber)arguments.get(0);
- context.getGraphicsState().getTextState().setHorizontalScalingPercent( scaling.floatValue() );
+ context.getGraphicsState().getTextState().setHorizontalScaling(scaling.floatValue());
}
}
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java?rev=1603056&r1=1603055&r2=1603056&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java Tue Jun 17 05:10:23 2014
@@ -43,7 +43,7 @@ public class ShowTextGlyph extends Opera
COSArray array = (COSArray)arguments.get( 0 );
int arraySize = array.size();
float fontsize = context.getGraphicsState().getTextState().getFontSize();
- float horizontalScaling = context.getGraphicsState().getTextState().getHorizontalScalingPercent()/100;
+ float horizontalScaling = context.getGraphicsState().getTextState().getHorizontalScaling()/100;
for( int i=0; i<arraySize; i++ )
{
COSBase next = array.get( i );