You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2010/07/26 19:35:28 UTC
svn commit: r979379 - in /pdfbox/trunk/pdfbox/src:
main/java/org/apache/pdfbox/util/ test/resources/input/
Author: lehmi
Date: Mon Jul 26 17:35:27 2010
New Revision: 979379
URL: http://svn.apache.org/viewvc?rev=979379&view=rev
Log:
PDFBOX-521: added improved text extraction algos. Based on a patch by Mel Martinez (m dot martinez at ll dot mit dot edu)
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
pdfbox/trunk/pdfbox/src/test/resources/input/FC60_Times.pdf-sorted.txt
pdfbox/trunk/pdfbox/src/test/resources/input/FC60_Times.pdf.txt
pdfbox/trunk/pdfbox/src/test/resources/input/Liste732004001452_001_0.pdf_0_.pdf-sorted.txt
pdfbox/trunk/pdfbox/src/test/resources/input/Liste732004001452_001_0.pdf_0_.pdf.txt
pdfbox/trunk/pdfbox/src/test/resources/input/allah2.pdf-sorted.txt
pdfbox/trunk/pdfbox/src/test/resources/input/allah2.pdf.txt
pdfbox/trunk/pdfbox/src/test/resources/input/data-000001.pdf-sorted.txt
pdfbox/trunk/pdfbox/src/test/resources/input/data-000001.pdf.txt
pdfbox/trunk/pdfbox/src/test/resources/input/hello3.pdf-sorted.txt
pdfbox/trunk/pdfbox/src/test/resources/input/hello3.pdf.txt
pdfbox/trunk/pdfbox/src/test/resources/input/openoffice-test-document.pdf-sorted.txt
pdfbox/trunk/pdfbox/src/test/resources/input/openoffice-test-document.pdf.txt
pdfbox/trunk/pdfbox/src/test/resources/input/rotation.pdf-sorted.txt
pdfbox/trunk/pdfbox/src/test/resources/input/rotation.pdf.txt
pdfbox/trunk/pdfbox/src/test/resources/input/sampleForSpec.pdf-sorted.txt
pdfbox/trunk/pdfbox/src/test/resources/input/sampleForSpec.pdf.txt
pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf-sorted.txt
pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf.txt
pdfbox/trunk/pdfbox/src/test/resources/input/simple-openoffice.pdf-sorted.txt
pdfbox/trunk/pdfbox/src/test/resources/input/simple-openoffice.pdf.txt
pdfbox/trunk/pdfbox/src/test/resources/input/yaddatest.pdf-sorted.txt
pdfbox/trunk/pdfbox/src/test/resources/input/yaddatest.pdf.txt
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java Mon Jul 26 17:35:27 2010
@@ -44,8 +44,14 @@ public class PDFText2HTML extends PDFTex
*/
public PDFText2HTML(String encoding) throws IOException
{
- this.outputEncoding = encoding;
- this.lineSeparator = "<br>" + System.getProperty("line.separator");
+ super(encoding);
+ setLineSeparator(systemLineSeparator);
+ setParagraphStart("<p>");
+ setParagraphEnd("</p>"+systemLineSeparator);
+ setPageStart("<div style=\"page-break-before:always; page-break-after:always\">");
+ setPageEnd("</div>"+systemLineSeparator);
+ setArticleStart(systemLineSeparator);
+ setArticleEnd(systemLineSeparator);
}
/**
@@ -108,16 +114,16 @@ public class PDFText2HTML extends PDFTex
}
else
{
- Iterator textIter = getCharactersByArticle().iterator();
+ Iterator<List<TextPosition>> textIter = getCharactersByArticle().iterator();
float lastFontSize = -1.0f;
StringBuffer titleText = new StringBuffer();
while (textIter.hasNext())
{
- Iterator textByArticle = ((List) textIter.next()).iterator();
+ Iterator<TextPosition> textByArticle = textIter.next().iterator();
while (textByArticle.hasNext())
{
- TextPosition position = (TextPosition) textByArticle.next();
+ TextPosition position = textByArticle.next();
float currentFontSize = position.getFontSize();
//If we're past 64 chars we will assume that we're past the title
@@ -169,6 +175,7 @@ public class PDFText2HTML extends PDFTex
*/
protected void endArticle() throws IOException
{
+ super.endArticle();
super.writeString("</div>");
}
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Mon Jul 26 17:35:27 2010
@@ -23,10 +23,12 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
+import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Vector;
+import java.util.regex.Pattern;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSStream;
@@ -57,6 +59,60 @@ import org.apache.pdfbox.pdmodel.interac
*/
public class PDFTextStripper extends PDFStreamEngine
{
+
+ private static final String thisClassName = PDFTextStripper.class.getSimpleName().toLowerCase();
+
+ //enable the ability to set the default indent/drop thresholds
+ //with -D system properties:
+ // pdftextstripper.indent
+ // pdftextstripper.drop
+ static
+ {
+ String prop = thisClassName+".indent";
+ String s = System.getProperty(prop);
+ if(s!=null && s.length()>0)
+ {
+ try
+ {
+ float f = Float.parseFloat(s);
+ DEFAULT_INDENT_THRESHOLD = f;
+ }
+ catch(NumberFormatException nfe)
+ {
+ //ignore and use default
+ }
+ }
+ prop = thisClassName+".drop";
+ s = System.getProperty(prop);
+ if(s!=null && s.length()>0)
+ {
+ try
+ {
+ float f = Float.parseFloat(s);
+ DEFAULT_DROP_THRESHOLD = f;
+ }
+ catch(NumberFormatException nfe){
+ //ignore and use default
+ }
+ }
+ }
+
+ /**
+ * The platforms line separator.
+ */
+ protected final String systemLineSeparator = System.getProperty("line.separator");
+
+ private String lineSeparator = systemLineSeparator;
+ private String pageSeparator = systemLineSeparator;
+ private String wordSeparator = " ";
+ private String paragraphStart = "";
+ private String paragraphEnd = "";
+ private String pageStart = "";
+ private String pageEnd = pageSeparator;
+ private String articleStart = "";
+ private String articleEnd = "";
+
+
private int currentPageNo = 0;
private int startPage = 1;
private int endPage = Integer.MAX_VALUE;
@@ -67,7 +123,14 @@ public class PDFTextStripper extends PDF
private boolean suppressDuplicateOverlappingText = true;
private boolean shouldSeparateByBeads = true;
private boolean sortByPosition = false;
+ private boolean addMoreFormatting = false;
+ private static float DEFAULT_INDENT_THRESHOLD = 2.0f;
+ private static float DEFAULT_DROP_THRESHOLD = 2.5f;
+
+ private float indentThreshold = DEFAULT_INDENT_THRESHOLD;
+ private float dropThreshold = DEFAULT_DROP_THRESHOLD;
+
// We will need to estimate where to add spaces.
// These are used to help guess.
private float spacingTolerance = .5f;
@@ -94,12 +157,6 @@ public class PDFTextStripper extends PDF
private Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
/**
- * The platforms lineseparator.
- */
- protected String lineSeparator = System.getProperty("line.separator");
- private String pageSeparator = System.getProperty("line.separator");
- private String wordSeparator = " ";
- /**
* encoding that text will be written in (or null).
*/
protected String outputEncoding;
@@ -231,6 +288,12 @@ public class PDFTextStripper extends PDF
resetEngine();
document = doc;
output = outputStream;
+ if (getAddMoreFormatting()) {
+ paragraphEnd = lineSeparator;
+ pageStart = lineSeparator;
+ articleStart = lineSeparator;
+ articleEnd = lineSeparator;
+ }
startDocument(document);
if( document.isEncrypted() )
@@ -407,7 +470,7 @@ public class PDFTextStripper extends PDF
*/
protected void startArticle(boolean isltr) throws IOException
{
- //default is to do nothing.
+ output.write(getArticleStart());
}
/**
@@ -418,7 +481,7 @@ public class PDFTextStripper extends PDF
*/
protected void endArticle() throws IOException
{
- //default is to do nothing
+ output.write(getArticleEnd());
}
/**
@@ -462,20 +525,26 @@ public class PDFTextStripper extends PDF
*
* @throws IOException If there is an error writing the text.
*/
- protected void writePage() throws IOException
+ protected void writePage() throws IOException
{
float maxYForLine = MAXYFORLINE_RESET_VALUE;
float minYTopForLine = MINYTOPFORLINE_RESET_VALUE;
float endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
float lastWordSpacing = LASTWORDSPACING_RESET_VALUE;
float maxHeightForLine = MAXHEIGHTFORLINE_RESET_VALUE;
+ PositionWrapper lastPosition = null;
+ PositionWrapper lastLineStartPosition = null;
- TextPosition lastPosition = null;
+ boolean startOfPage = true;//flag to indicate start of page
+ boolean startOfArticle = true;
+ if(charactersByArticle.size() > 0) {
+ writePageStart();
+ }
for( int i = 0; i < charactersByArticle.size(); i++)
{
- List<TextPosition> textList = (List<TextPosition>)charactersByArticle.get( i );
- if( sortByPosition )
+ List<TextPosition> textList = charactersByArticle.get( i );
+ if( getSortByPosition() )
{
TextPositionComparator comparator = new TextPositionComparator();
Collections.sort( textList, comparator );
@@ -485,40 +554,39 @@ public class PDFTextStripper extends PDF
/* Before we can display the text, we need to do some normalizing.
* Arabic and Hebrew text is right to left and is typically stored
- * in its logical format, which means that the rightmost character is
+ * in its logical format, which means that the rightmost character is
* stored first, followed by the second character from the right etc.
- * However, PDF stores the text in presentation form, which is left to
+ * However, PDF stores the text in presentation form, which is left to
* right. We need to do some normalization to convert the PDF data to
- * the proper logical output format.
- *
+ * the proper logical output format.
+ *
* Note that if we did not sort the text, then the output of reversing the
* text is undefined and can sometimes produce worse output then not trying
* to reverse the order. Sorting should be done for these languages.
* */
- /* First step is to determine if we have any right to left text, and
- * if so, is it dominant. */
+ /* First step is to determine if we have any right to left text, and
+ * if so, is it dominant. */
int ltrCnt = 0;
int rtlCnt = 0;
while( textIter.hasNext() )
{
- TextPosition position = textIter.next();
+ TextPosition position = (TextPosition)textIter.next();
String stringValue = position.getCharacter();
-
- for (int a = 0; a < stringValue.length(); a++)
+ for (int a = 0; a < stringValue.length(); a++)
{
byte dir = Character.getDirectionality(stringValue.charAt(a));
- if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ) ||
+ if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ) ||
(dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING) ||
- (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE ))
+ (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE ))
{
ltrCnt++;
}
else if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT ) ||
(dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC) ||
(dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING) ||
- (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE ))
+ (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE ))
{
rtlCnt++;
}
@@ -526,47 +594,39 @@ public class PDFTextStripper extends PDF
}
// choose the dominant direction
- boolean isRtlDominant = false;
- if (rtlCnt > ltrCnt)
- {
- isRtlDominant = true;
- }
+ boolean isRtlDominant = rtlCnt > ltrCnt;
startArticle(!isRtlDominant);
-
+ startOfArticle = true;
// we will later use this to skip reordering
- boolean hasRtl = false;
- if (rtlCnt > 0)
- {
- hasRtl = true;
- }
+ boolean hasRtl = rtlCnt > 0;
- /* Now cycle through to print the text.
+ /* Now cycle through to print the text.
* We queue up a line at a time before we print so that we can convert
- * the line from presentation form to logical form (if needed). */
- String lineStr = "";
+ * the line from presentation form to logical form (if needed).
+ */
+ List<TextPosition> line = new ArrayList<TextPosition>();
textIter = textList.iterator(); // start from the beginning again
-
/* PDF files don't always store spaces. We will need to guess where we should add
* spaces based on the distances between TextPositions. Historically, this was done
* based on the size of the space character provided by the font. In general, this worked
* but there were cases where it did not work. Calculating the average character width
- * and using that as a metric works better in some cases but fails in some cases where the
- * spacing worked. So we use both. NOTE: Adobe reader also fails on some of these examples.
+ * and using that as a metric works better in some cases but fails in some cases where the
+ * spacing worked. So we use both. NOTE: Adobe reader also fails on some of these examples.
*/
-
//Keeps track of the previous average character width
float previousAveCharWidth = -1;
while( textIter.hasNext() )
{
- TextPosition position = textIter.next();
+ TextPosition position = (TextPosition)textIter.next();
+ PositionWrapper current = new PositionWrapper(position);
String characterValue = position.getCharacter();
- //Resets the average character width when we see a change in font
+ //Resets the average character width when we see a change in font
// or a change in the font size
- if(lastPosition != null && ((position.getFont() != lastPosition.getFont())
- || (position.getFontSize() != lastPosition.getFontSize())))
+ if(lastPosition != null && ((position.getFont() != lastPosition.getTextPosition().getFont())
+ || (position.getFontSize() != lastPosition.getTextPosition().getFontSize())))
{
previousAveCharWidth = -1;
}
@@ -578,14 +638,14 @@ public class PDFTextStripper extends PDF
/* If we are sorting, then we need to use the text direction
* adjusted coordinates, because they were used in the sorting. */
- if (sortByPosition)
+ if (getSortByPosition())
{
positionX = position.getXDirAdj();
positionY = position.getYDirAdj();
positionWidth = position.getWidthDirAdj();
positionHeight = position.getHeightDir();
}
- else
+ else
{
positionX = position.getX();
positionY = position.getY();
@@ -596,31 +656,31 @@ public class PDFTextStripper extends PDF
//The current amount of characters in a word
int wordCharCount = position.getIndividualWidths().length;
- /* Estimate the expected width of the space based on the
+ /* Estimate the expected width of the space based on the
* space character with some margin. */
float wordSpacing = position.getWidthOfSpace();
float deltaSpace = 0;
- if ((wordSpacing == 0) || Float.isNaN(wordSpacing))
+ if ((wordSpacing == 0) || (wordSpacing == Float.NaN))
{
deltaSpace = Float.MAX_VALUE;
}
- else
+ else
{
if( lastWordSpacing < 0 )
{
- deltaSpace = (wordSpacing * spacingTolerance);
+ deltaSpace = (wordSpacing * getSpacingTolerance());
}
else
{
- deltaSpace = (((wordSpacing+lastWordSpacing)/2f)* spacingTolerance);
+ deltaSpace = (((wordSpacing+lastWordSpacing)/2f)* getSpacingTolerance());
}
}
- /* Estimate the expected width of the space based on the
+ /* Estimate the expected width of the space based on the
* average character width with some margin. This calculation does not
* make a true average (average of averages) but we found that it gave the
* best results after numerous experiments. Based on experiments we also found that
- * .3 worked well. */
+ * .3 worked well. */
float averageCharWidth = -1;
if(previousAveCharWidth < 0)
{
@@ -630,10 +690,10 @@ public class PDFTextStripper extends PDF
{
averageCharWidth = (previousAveCharWidth + (positionWidth/wordCharCount))/2f;
}
- float deltaCharWidth = (averageCharWidth * averageCharTolerance);
-
+ float deltaCharWidth = (averageCharWidth * getAverageCharTolerance());
+
//Compares the values obtained by the average method and the wordSpacing method and picks
- //the smaller number.
+ //the smaller number.
float expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
if(endOfLastTextX != ENDOFLASTTEXTX_RESET_VALUE)
{
@@ -645,10 +705,14 @@ public class PDFTextStripper extends PDF
{
expectedStartOfNextWordX = endOfLastTextX + deltaCharWidth;
}
- }
+ }
if( lastPosition != null )
- {
+ {
+ if(startOfArticle){
+ lastPosition.setArticleStart();
+ startOfArticle = false;
+ }
// RDD - Here we determine whether this text object is on the current
// line. We use the lastBaselineFontSize to handle the superscript
// case, and the size of the current font to handle the subscript case.
@@ -660,23 +724,10 @@ public class PDFTextStripper extends PDF
* of regression test failures. So, I'm leaving it be for now. */
if(!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine))
{
- // If we have RTL text on the page, change the direction
- if (hasRtl)
- {
- lineStr = normalize.makeLineLogicalOrder(lineStr, isRtlDominant);
- }
+ writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
+ line.clear();
- /* normalize string to remove presentation forms.
- * Note that this must come after the line direction
- * conversion because the process looks ahead to the next
- * logical character.
- */
- lineStr = normalize.normalizePres(lineStr);
-
- writeString(lineStr);
- lineStr = "";
-
- writeLineSeparator( );
+ lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition);
endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
@@ -685,17 +736,17 @@ public class PDFTextStripper extends PDF
minYTopForLine = MINYTOPFORLINE_RESET_VALUE;
}
- //Test if our TextPosition starts after a new word would be expected to start.
- if (expectedStartOfNextWordX != -1 && expectedStartOfNextWordX < positionX &&
+ //Test if our TextPosition starts after a new word would be expected to start.
+ if (expectedStartOfNextWordX != EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE && expectedStartOfNextWordX < positionX &&
//only bother adding a space if the last character was not a space
- lastPosition.getCharacter() != null &&
- !lastPosition.getCharacter().endsWith( " " ) )
+ lastPosition.getTextPosition().getCharacter() != null &&
+ !lastPosition.getTextPosition().getCharacter().endsWith( " " ) )
{
- lineStr += getWordSeparator();
+ line.add(WordSeparator.getSeparator());
}
}
- if (positionY >= maxYForLine)
+ if (positionY >= maxYForLine)
{
maxYForLine = positionY;
}
@@ -705,34 +756,36 @@ public class PDFTextStripper extends PDF
endOfLastTextX = positionX + positionWidth;
// add it to the list
- if (characterValue != null)
+ if (characterValue != null)
{
- lineStr += characterValue;
+ if(startOfPage && lastPosition==null){
+ writeParagraphStart();//not sure this is correct for RTL?
+ }
+ line.add(position);
}
maxHeightForLine = Math.max( maxHeightForLine, positionHeight );
- minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
- lastPosition = position;
+ minYTopForLine = Math.min(minYTopForLine,positionY - positionHeight);
+ lastPosition = current;
+ if(startOfPage){
+ lastPosition.setParagraphStart();
+ lastPosition.setLineStart();
+ lastLineStartPosition = lastPosition;
+ startOfPage=false;
+ }
lastWordSpacing = wordSpacing;
previousAveCharWidth = averageCharWidth;
}
// print the final line
- if (lineStr.length() > 0)
+ if (line.size() > 0)
{
- if (hasRtl)
- {
- lineStr = normalize.makeLineLogicalOrder(lineStr, isRtlDominant);
- }
-
- // normalize string to remove presentation forms
- lineStr = normalize.normalizePres(lineStr);
-
- writeString(lineStr);
+ writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
+ writeParagraphEnd();
}
endArticle();
}
- writePageSeperator();
+ writePageEnd();
}
private boolean overlap( float y1, float height1, float y2, float height2 )
@@ -810,7 +863,6 @@ public class PDFTextStripper extends PDF
return second < first + variance && second > first - variance;
}
-
/**
* This will process a TextPosition object and add the
* text to the list of characters on a page. It takes care of
@@ -1098,7 +1150,7 @@ public class PDFTextStripper extends PDF
/**
* @return Returns the suppressDuplicateOverlappingText.
*/
- public boolean shouldSuppressDuplicateOverlappingText()
+ public boolean getSuppressDuplicateOverlappingText()
{
return suppressDuplicateOverlappingText;
}
@@ -1154,7 +1206,7 @@ public class PDFTextStripper extends PDF
*
* @return If the text will be grouped by beads.
*/
- public boolean shouldSeparateByBeads()
+ public boolean getSeparateByBeads()
{
return shouldSeparateByBeads;
}
@@ -1210,12 +1262,31 @@ public class PDFTextStripper extends PDF
}
/**
+ * This will tell if the text stripper should add some more text formatting.
+ * @return true if some more text formatting will be added
+ */
+ public boolean getAddMoreFormatting()
+ {
+ return addMoreFormatting;
+ }
+
+ /**
+ * There will some additional text formatting be added if addMoreFormatting
+ * is set to true. Default is false.
+ * @param newAddMoreFormatting Tell PDFBox to add some more text formatting
+ */
+ public void setAddMoreFormatting(boolean newAddMoreFormatting)
+ {
+ addMoreFormatting = newAddMoreFormatting;
+ }
+
+ /**
* This will tell if the text stripper should sort the text tokens
* before writing to the stream.
*
* @return true If the text tokens will be sorted before being written.
*/
- public boolean shouldSortByPosition()
+ public boolean getSortByPosition()
{
return sortByPosition;
}
@@ -1288,9 +1359,174 @@ public class PDFTextStripper extends PDF
this.averageCharTolerance = averageCharToleranceValue;
}
+
+ /**
+ * returns the multiple of whitespace character widths
+ * for the current text which the current
+ * line start can be indented from the previous line start
+ * beyond which the current line start is considered
+ * to be a paragraph start.
+ * @return the number of whitespace character widths to use
+ * when detecting paragraph indents.
+ */
+ public float getIndentThreshold()
+ {
+ return indentThreshold;
+ }
+
+ /**
+ * sets the multiple of whitespace character widths
+ * for the current text which the current
+ * line start can be indented from the previous line start
+ * beyond which the current line start is considered
+ * to be a paragraph start. The default value is 2.0.
+ *
+ * @param indentThreshold the number of whitespace character widths to use
+ * when detecting paragraph indents.
+ */
+ public void setIndentThreshold(float indentThreshold)
+ {
+ this.indentThreshold = indentThreshold;
+ }
+
+ /**
+ * the minimum whitespace, as a multiple
+ * of the max height of the current characters
+ * beyond which the current line start is considered
+ * to be a paragraph start.
+ * @return the character height multiple for
+ * max allowed whitespace between lines in
+ * the same paragraph.
+ */
+ public float getDropThreshold()
+ {
+ return dropThreshold;
+ }
+
+ /**
+ * sets the minimum whitespace, as a multiple
+ * of the max height of the current characters
+ * beyond which the current line start is considered
+ * to be a paragraph start. The default value is 2.5.
+ *
+ * @param dropThreshold the character height multiple for
+ * max allowed whitespace between lines in
+ * the same paragraph.
+ */
+ public void setDropThreshold(float dropThreshold)
+ {
+ this.dropThreshold = dropThreshold;
+ }
+
+ /**
+ * Returns the string which will be used at the beginning of a paragraph.
+ * @return the paragraph start string
+ */
+ public String getParagraphStart()
+ {
+ return paragraphStart;
+ }
+
+ /**
+ * Sets the string which will be used at the beginning of a paragraph.
+ * @param s the paragraph start string
+ */
+ public void setParagraphStart(String s)
+ {
+ this.paragraphStart = s;
+ }
+
+ /**
+ * Returns the string which will be used at the end of a paragraph.
+ * @return the paragraph end string
+ */
+ public String getParagraphEnd()
+ {
+ return paragraphEnd;
+ }
+
+ /**
+ * Sets the string which will be used at the end of a paragraph.
+ * @param s the paragraph end string
+ */
+ public void setParagraphEnd(String s)
+ {
+ this.paragraphEnd = s;
+ }
+
+
+ /**
+ * Returns the string which will be used at the beginning of a page.
+ * @return the page start string
+ */
+ public String getPageStart()
+ {
+ return pageStart;
+ }
+
+ /**
+ * Sets the string which will be used at the beginning of a page.
+ * @param s the page start string
+ */
+ public void setPageStart(String pageStart)
+ {
+ this.pageStart = pageStart;
+ }
+
+ /**
+ * Returns the string which will be used at the end of a page.
+ * @return the page end string
+ */
+ public String getPageEnd()
+ {
+ return pageEnd;
+ }
+
+ /**
+ * Sets the string which will be used at the end of a page.
+ * @param s the page end string
+ */
+ public void setPageEnd(String pageEnd)
+ {
+ this.pageEnd = pageEnd;
+ }
+
+ /**
+ * Returns the string which will be used at the beginning of an article.
+ * @return the article start string
+ */
+ public String getArticleStart() {
+ return articleStart;
+ }
+
+ /**
+ * Sets the string which will be used at the beginning of an article.
+ * @param s the article start string
+ */
+ public void setArticleStart(String articleStart) {
+ this.articleStart = articleStart;
+ }
+
+ /**
+ * Returns the string which will be used at the end of an article.
+ * @return the article end string
+ */
+ public String getArticleEnd(){
+ return articleEnd;
+ }
+
+ /**
+ * Sets the string which will be used at the end of an article.
+ * @param s the article end string
+ */
+ public void setArticleEnd(String articleEnd){
+ this.articleEnd = articleEnd;
+ }
+
+
/**
* Reverse characters of a compound Arabic glyph.
- * When shouldSortByPosition() is true, inspect the sequence encoded
+ * When getSortByPosition() is true, inspect the sequence encoded
* by one glyph. If the glyph encodes two or more Arabic characters,
* reverse these characters from a logical order to a visual order.
* This ensures that the bidirectional algorithm that runs later will
@@ -1318,4 +1554,326 @@ public class PDFTextStripper extends PDF
return reversed.toString();
}
+ /**
+ * handles the line separator for a new line given
+ * the specified current and previous TextPositions.
+ * @param position the current text position
+ * @param lastPosition the previous text position
+ * @param lastLineStartPosition the last text position that followed a line
+ * separator.
+ * @throws IOException
+ */
+ protected PositionWrapper handleLineSeparation(PositionWrapper current,
+ PositionWrapper lastPosition, PositionWrapper lastLineStartPosition)
+ throws IOException {
+ current.setLineStart();
+ isParagraphSeparation(current, lastPosition, lastLineStartPosition);
+ lastLineStartPosition = current;
+ if (current.isParagraphStart()) {
+ if(lastPosition.isArticleStart()) {
+ writeParagraphStart();
+ } else {
+ writeLineSeparator();
+ writeParagraphSeparator();
+ }
+ } else {
+ writeLineSeparator();
+ }
+ return lastLineStartPosition;
+ }
+
+ /**
+ * tests the relationship between the last text position, the current text
+ * position and the last text position that followed a line separator to
+ * decide if the gap represents a paragraph separation. This should
+ * <i>only</i> be called for consecutive text positions that first pass the
+ * line separation test.
+ * <p>
+ * This base implementation tests to see if the lastLineStartPosition is
+ * null OR if the current vertical position has dropped below the last text
+ * vertical position by at least 2.5 times the current text height OR if the
+ * current horizontal position is indented by at least 2 times the current
+ * width of a space character.</p>
+ * <p>
+ * This also attempts to identify text that is indented under a hanging indent.</p>
+ * <p>
+ * This method sets the isParagraphStart and isHangingIndent flags on the current
+ * position object.</p>
+ *
+ * @param position the current text position. This may have its isParagraphStart
+ * or isHangingIndent flags set upon return.
+ * @param lastPosition the previous text position (should not be null).
+ * @param lastLineStartPosition the last text position that followed a line
+ * separator. May be null.
+ */
+ protected void isParagraphSeparation(PositionWrapper position,
+ PositionWrapper lastPosition, PositionWrapper lastLineStartPosition){
+ boolean result = false;
+ if(lastLineStartPosition == null) {
+ result = true;
+ }else{
+ float yGap = Math.abs(position.getTextPosition().getYDirAdj()-
+ lastPosition.getTextPosition().getYDirAdj());
+ float xGap = (position.getTextPosition().getXDirAdj()-
+ lastLineStartPosition.getTextPosition().getXDirAdj());//do we need to flip this for rtl?
+ if(yGap > (getDropThreshold()*position.getTextPosition().getHeightDir())){
+ result = true;
+ }else if(xGap > (getIndentThreshold()*position.getTextPosition().getWidthOfSpace())){
+ //text is indented, but try to screen for hanging indent
+ if(!lastLineStartPosition.isParagraphStart()){
+ result = true;
+ }else{
+ position.setHangingIndent();
+ }
+ }else if(xGap < -position.getTextPosition().getWidthOfSpace()){
+ //text is left of previous line. Was it a hanging indent?
+ if(!lastLineStartPosition.isParagraphStart()){
+ result = true;
+ }
+ }else if(Math.abs(xGap) < (0.25 * position.getTextPosition().getWidth())){
+ //current horizontal position is within 1/4 a char of the last
+ //linestart. We'll treat them as lined up.
+ if(lastLineStartPosition.isHangingIndent()){
+ position.setHangingIndent();
+ }else if(lastLineStartPosition.isParagraphStart()){
+ //check to see if the previous line looks like
+ //any of a number of standard list item formats
+ Pattern liPattern = matchListItemPattern(lastLineStartPosition);
+ if(liPattern!=null){
+ Pattern currentPattern = matchListItemPattern(position);
+ if(liPattern == currentPattern){
+ result = true;
+ }
+ }
+ }
+ }
+ }
+ if(result){
+ position.setParagraphStart();
+ }
+ }
+
+ /**
+ * writes the paragraph separator string to the output.
+ * @throws IOException
+ */
+ protected void writeParagraphSeparator()throws IOException{
+ writeParagraphEnd();
+ writeParagraphStart();
+ }
+
+ /**
+ * Write something (if defined) at the start of a paragraph.
+ * @throws IOException if something went wrong
+ */
+ protected void writeParagraphStart() throws IOException{
+ output.write(getParagraphStart());
+ }
+
+ /**
+ * Write something (if defined) at the end of a paragraph.
+ * @throws IOException if something went wrong
+ */
+ protected void writeParagraphEnd() throws IOException{
+ output.write(getParagraphEnd());
+ }
+
+ /**
+ * Write something (if defined) at the start of a page.
+ * @throws IOException if something went wrong
+ */
+ protected void writePageStart()throws IOException{
+ output.write(getPageStart());
+ }
+
+ /**
+ * Write something (if defined) at the end of a page.
+ * @throws IOException if something went wrong
+ */
+ protected void writePageEnd()throws IOException{
+ output.write(getPageEnd());
+ }
+
+ /**
+ * returns the list item Pattern object that matches
+ * the text at the specified PositionWrapper or null
+ * if the text does not match such a pattern. The list
+ * of Patterns tested against is given by the
+ * {@link #getListItemPatterns()} method. To add to
+ * the list, simply override that method (if sub-classing)
+ * or explicitly supply your own list using
+ * {@link #setListItemPatterns(List)}.
+ * @param pw
+ * @return
+ */
+ protected Pattern matchListItemPattern(PositionWrapper pw) {
+ TextPosition tp = pw.getTextPosition();
+ String txt = tp.getCharacter();
+ Pattern p = matchPattern(txt,getListItemPatterns());
+ return p;
+ }
+
+ /**
+ * a list of regular expressions that match commonly used
+ * list item formats, i.e. bullets, numbers, letters,
+ * Roman numerals, etc. Not meant to be
+ * comprehensive.
+ */
+ private static final String[] LIST_ITEM_EXPRESSIONS = {
+ "\\.",
+ "\\d+\\.",
+ "\\[\\d+\\]",
+ "\\d+\\)",
+ "[A-Z]\\.",
+ "[a-z]\\.",
+ "[A-Z]\\)",
+ "[a-z]\\)",
+ "[IVXL]+\\.",
+ "[ivxl]+\\.",
+
+ };
+
+ private List<Pattern> liPatterns = null;
+ /**
+ * use to supply a different set of regular expression
+ * patterns for matching list item starts.
+ *
+ * @param patterns
+ */
+ protected void setListItemPatterns(List<Pattern> patterns){
+ liPatterns = patterns;
+ }
+
+
+ /**
+ * returns a list of regular expression Patterns representing
+ * different common list item formats. For example
+ * numbered items of form:
+ * <ol>
+ * <li>some text</li>
+ * <li>more text</li>
+ * </ol>
+ * or
+ * <ul>
+ * <li>some text</li>
+ * <li>more text</li>
+ * </ul>
+ * etc., all begin with some character pattern. The pattern "\\d+\." (matches "1.", "2.", ...)
+ * or "\[\\d+\]" (matches "[1]", "[2]", ...).
+ * <p>
+ * This method returns a list of such regular expression Patterns.
+ * @return a list of Pattern objects.
+ */
+ protected List<Pattern> getListItemPatterns(){
+ if(liPatterns == null){
+ liPatterns = new ArrayList<Pattern>();
+ for(String expression : LIST_ITEM_EXPRESSIONS){
+ Pattern p = Pattern.compile(expression);
+ liPatterns.add(p);
+ }
+ }
+ return liPatterns;
+ }
+
+ /**
+ * iterates over the specified list of Patterns until
+ * it finds one that matches the specified string. Then
+ * returns the Pattern.
+ * <p>
+ * Order of the supplied list of patterns is important as
+ * most common patterns should come first. Patterns
+ * should be strict in general, and all will be
+ * used with case sensitivity on.
+ * </p>
+ * @param s
+ * @param patterns
+ * @return
+ */
+ protected static final Pattern matchPattern(String s, List<Pattern> patterns){
+ Pattern matchedPattern = null;
+ for(Pattern p : patterns){
+ if(p.matcher(s).matches()){
+ return p;
+ }
+ }
+ return matchedPattern;
+ }
+
+ /**
+ * Write a list of string containing a whole line of a document.
+ * @param line a list with the words of the given line
+ * @param isRtlDominant determines if rtl or ltl is dominant
+ * @throws IOException if something went wrong
+ */
+ private void writeLine(List<String> line, boolean isRtlDominant)throws IOException{
+ int numberOfStrings = line.size();
+ if (isRtlDominant) {
+ for(int i=numberOfStrings-1; i>=0; i--){
+ if (i > 1)
+ writeWordSeparator();
+ writeString(line.get(i));
+ }
+ }
+ else {
+ for(int i=0; i<numberOfStrings; i++){
+ writeString(line.get(i));
+ if (!isRtlDominant && i < numberOfStrings-1)
+ writeWordSeparator();
+ }
+ }
+ }
+
+ /**
+ * Normalize the given list of TextPositions.
+ * @param line list of TextPositions
+ * @param isRtlDominant determines if rtl or ltl is dominant
+ * @param hasRtl determines if lines contains rtl formatted text(parts)
+ * @return a list of strings, one string for every word
+ */
+ private List<String> normalize(List<TextPosition> line, boolean isRtlDominant, boolean hasRtl){
+ LinkedList<String> normalized = new LinkedList<String>();
+ StringBuilder lineBuilder = new StringBuilder();
+ for(TextPosition text : line){
+ if (text instanceof WordSeparator) {
+ String lineStr = lineBuilder.toString();
+ if (hasRtl) {
+ lineStr = normalize.makeLineLogicalOrder(lineStr,isRtlDominant);
+ }
+ lineStr = normalize.normalizePres(lineStr);
+ normalized.add(lineStr);
+ lineBuilder = new StringBuilder();
+ }
+ else {
+ lineBuilder.append(text.getCharacter());
+ }
+ }
+ if (lineBuilder.length() > 0) {
+ String lineStr = lineBuilder.toString();
+ if (hasRtl) {
+ lineStr = normalize.makeLineLogicalOrder(lineStr,isRtlDominant);
+ }
+ lineStr = normalize.normalizePres(lineStr);
+ normalized.add(lineStr);
+ }
+ return normalized;
+ }
+
+ /**
+ * internal marker class. Used as a place holder in
+ * a line of TextPositions.
+ * @author ME21969
+ *
+ */
+ private static final class WordSeparator extends TextPosition{
+ private static final WordSeparator separator = new WordSeparator();
+
+ private WordSeparator(){
+ }
+
+ public static final WordSeparator getSeparator(){
+ return separator;
+ }
+
+ }
+
}
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/FC60_Times.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/FC60_Times.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/FC60_Times.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/FC60_Times.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/Liste732004001452_001_0.pdf_0_.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/Liste732004001452_001_0.pdf_0_.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/Liste732004001452_001_0.pdf_0_.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/Liste732004001452_001_0.pdf_0_.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/allah2.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/allah2.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/allah2.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/allah2.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/data-000001.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/data-000001.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/data-000001.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/data-000001.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/hello3.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/hello3.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/hello3.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/hello3.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/openoffice-test-document.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/openoffice-test-document.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/openoffice-test-document.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/openoffice-test-document.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/rotation.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/rotation.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/rotation.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/rotation.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/sampleForSpec.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/sampleForSpec.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/sampleForSpec.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/sampleForSpec.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/simple-openoffice.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/simple-openoffice.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/simple-openoffice.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/simple-openoffice.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/yaddatest.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/yaddatest.pdf-sorted.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/yaddatest.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/yaddatest.pdf.txt?rev=979379&r1=979378&r2=979379&view=diff
==============================================================================
Binary files - no diff available.