You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2011/11/09 08:04:19 UTC
svn commit: r1199634 -
/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Author: lehmi
Date: Wed Nov 9 07:04:18 2011
New Revision: 1199634
URL: http://svn.apache.org/viewvc?rev=1199634&view=rev
Log:
PDFBOX-956: improved text extraction performance based on a patch by Kevin Jackson, plus some reformatting
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1199634&r1=1199633&r2=1199634&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Wed Nov 9 07:04:18 2011
@@ -27,6 +27,10 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+import java.util.TreeSet;
import java.util.Vector;
import java.util.regex.Pattern;
@@ -42,7 +46,6 @@ import org.apache.pdfbox.pdmodel.common.
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
-import org.apache.pdfbox.util.TextPosition;
/**
@@ -95,8 +98,9 @@ public class PDFTextStripper extends PDF
float f = Float.parseFloat(s);
DEFAULT_DROP_THRESHOLD = f;
}
- catch(NumberFormatException nfe){
- //ignore and use default
+ catch(NumberFormatException nfe)
+ {
+ //ignore and use default
}
}
}
@@ -155,7 +159,8 @@ public class PDFTextStripper extends PDF
*/
protected Vector<List<TextPosition>> charactersByArticle = new Vector<List<TextPosition>>();
- private Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
+ private Map<String, TreeMap<Float, TreeSet<Float>>> characterListMapping =
+ new HashMap<String, TreeMap<Float, TreeSet<Float>>>();
/**
* encoding that text will be written in (or null).
@@ -289,7 +294,8 @@ public class PDFTextStripper extends PDF
resetEngine();
document = doc;
output = outputStream;
- if (getAddMoreFormatting()) {
+ if (getAddMoreFormatting())
+ {
paragraphEnd = lineSeparator;
pageStart = lineSeparator;
articleStart = lineSeparator;
@@ -538,7 +544,8 @@ public class PDFTextStripper extends PDF
boolean startOfPage = true;//flag to indicate start of page
boolean startOfArticle = true;
- if(charactersByArticle.size() > 0) {
+ if(charactersByArticle.size() > 0)
+ {
writePageStart();
}
@@ -710,7 +717,8 @@ public class PDFTextStripper extends PDF
if( lastPosition != null )
{
- if(startOfArticle){
+ if(startOfArticle)
+ {
lastPosition.setArticleStart();
startOfArticle = false;
}
@@ -728,7 +736,8 @@ public class PDFTextStripper extends PDF
writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
line.clear();
- lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
+ lastLineStartPosition =
+ handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
@@ -738,7 +747,8 @@ public class PDFTextStripper extends PDF
}
//Test if our TextPosition starts after a new word would be expected to start.
- if (expectedStartOfNextWordX != EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE && expectedStartOfNextWordX < positionX &&
+ if (expectedStartOfNextWordX != EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE
+ && expectedStartOfNextWordX < positionX &&
//only bother adding a space if the last character was not a space
lastPosition.getTextPosition().getCharacter() != null &&
!lastPosition.getTextPosition().getCharacter().endsWith( " " ) )
@@ -759,7 +769,8 @@ public class PDFTextStripper extends PDF
// add it to the list
if (characterValue != null)
{
- if(startOfPage && lastPosition==null){
+ if(startOfPage && lastPosition==null)
+ {
writeParagraphStart();//not sure this is correct for RTL?
}
line.add(position);
@@ -767,7 +778,8 @@ public class PDFTextStripper extends PDF
maxHeightForLine = Math.max( maxHeightForLine, positionHeight );
minYTopForLine = Math.min(minYTopForLine,positionY - positionHeight);
lastPosition = current;
- if(startOfPage){
+ if(startOfPage)
+ {
lastPosition.setParagraphStart();
lastPosition.setLineStart();
lastLineStartPosition = lastPosition;
@@ -880,10 +892,10 @@ public class PDFTextStripper extends PDF
String textCharacter = text.getCharacter();
float textX = text.getX();
float textY = text.getY();
- List<TextPosition> sameTextCharacters = (List<TextPosition>)characterListMapping.get( textCharacter );
+ TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping.get( textCharacter );
if( sameTextCharacters == null )
{
- sameTextCharacters = new ArrayList<TextPosition>();
+ sameTextCharacters = new TreeMap<Float, TreeSet<Float>>();
characterListMapping.put( textCharacter, sameTextCharacters );
}
@@ -900,27 +912,29 @@ public class PDFTextStripper extends PDF
//
boolean suppressCharacter = false;
float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
- for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
- {
- TextPosition character = sameTextCharacters.get( i );
- String charCharacter = character.getCharacter();
- float charX = character.getX();
- float charY = character.getY();
- //only want to suppress
-
- if( charCharacter != null &&
- //charCharacter.equals( textCharacter ) &&
- within( charX, textX, tolerance ) &&
- within( charY,
- textY,
- tolerance ) )
+
+ SortedMap<Float, TreeSet<Float>> xMatches =
+ sameTextCharacters.subMap(textX - tolerance, textX + tolerance );
+ for (TreeSet<Float> xMatch : xMatches.values())
+ {
+ SortedSet<Float> yMatches =
+ xMatch.subSet(textY - tolerance , textY + tolerance );
+ if (!yMatches.isEmpty())
{
suppressCharacter = true;
+ break;
}
}
+
if( !suppressCharacter )
{
- sameTextCharacters.add( text );
+ TreeSet<Float> ySet = sameTextCharacters.get(textX);
+ if (ySet == null)
+ {
+ ySet = new TreeSet<Float>();
+ sameTextCharacters.put( textX, ySet );
+ }
+ ySet.add( textY );
showCharacter = true;
}
}
@@ -1382,12 +1396,12 @@ public class PDFTextStripper extends PDF
* beyond which the current line start is considered
* to be a paragraph start. The default value is 2.0.
*
- * @param indentThreshold the number of whitespace character widths to use
+ * @param indentThresholdValue the number of whitespace character widths to use
* when detecting paragraph indents.
*/
- public void setIndentThreshold(float indentThreshold)
+ public void setIndentThreshold(float indentThresholdValue)
{
- this.indentThreshold = indentThreshold;
+ indentThreshold = indentThresholdValue;
}
/**
@@ -1410,13 +1424,13 @@ public class PDFTextStripper extends PDF
* beyond which the current line start is considered
* to be a paragraph start. The default value is 2.5.
*
- * @param dropThreshold the character height multiple for
+ * @param dropThresholdValue the character height multiple for
* max allowed whitespace between lines in
* the same paragraph.
*/
- public void setDropThreshold(float dropThreshold)
+ public void setDropThreshold(float dropThresholdValue)
{
- this.dropThreshold = dropThreshold;
+ dropThreshold = dropThresholdValue;
}
/**
@@ -1467,11 +1481,11 @@ public class PDFTextStripper extends PDF
/**
* Sets the string which will be used at the beginning of a page.
- * @param s the page start string
+ * @param pageStartValue the page start string
*/
- public void setPageStart(String pageStart)
+ public void setPageStart(String pageStartValue)
{
- this.pageStart = pageStart;
+ pageStart = pageStartValue;
}
/**
@@ -1485,43 +1499,47 @@ public class PDFTextStripper extends PDF
/**
* Sets the string which will be used at the end of a page.
- * @param s the page end string
+ * @param pageEndValue the page end string
*/
- public void setPageEnd(String pageEnd)
+ public void setPageEnd(String pageEndValue)
{
- this.pageEnd = pageEnd;
+ pageEnd = pageEndValue;
}
/**
* Returns the string which will be used at the beginning of an article.
* @return the article start string
*/
- public String getArticleStart() {
+ public String getArticleStart()
+ {
return articleStart;
}
/**
* Sets the string which will be used at the beginning of an article.
- * @param s the article start string
+ * @param articleStartValue the article start string
*/
- public void setArticleStart(String articleStart) {
- this.articleStart = articleStart;
+ public void setArticleStart(String articleStartValue)
+ {
+ articleStart = articleStartValue;
}
/**
* Returns the string which will be used at the end of an article.
* @return the article end string
*/
- public String getArticleEnd(){
+ public String getArticleEnd()
+ {
return articleEnd;
}
/**
* Sets the string which will be used at the end of an article.
- * @param s the article end string
+ * @param articleEndValue the article end string
*/
- public void setArticleEnd(String articleEnd){
- this.articleEnd = articleEnd;
+ public void setArticleEnd(String articleEndValue)
+ {
+ articleEnd = articleEndValue;
}
@@ -1534,17 +1552,22 @@ public class PDFTextStripper extends PDF
* convert them back to a logical order.
*
* @param str a string obtained from font.encoding()
+ *
+ * @return the reversed string
*/
public String inspectFontEncoding(String str)
{
if (!sortByPosition || str == null || str.length() < 2)
+ {
return str;
-
+ }
for (int i = 0; i < str.length(); ++i)
{
if (Character.getDirectionality(str.charAt(i))
!= Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
+ {
return str;
+ }
}
StringBuilder reversed = new StringBuilder(str.length());
@@ -1558,27 +1581,35 @@ public class PDFTextStripper extends PDF
/**
* handles the line separator for a new line given
* the specified current and previous TextPositions.
- * @param position the current text position
+ * @param current the current text position
* @param lastPosition the previous text position
* @param lastLineStartPosition the last text position that followed a line
* separator.
* @param maxHeightForLine max height for positions since lastLineStartPosition
- * @throws IOException
+ * @return start position of the last line
+ * @throws IOException if something went wrong
*/
protected PositionWrapper handleLineSeparation(PositionWrapper current,
PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, float maxHeightForLine)
- throws IOException {
+ throws IOException
+ {
current.setLineStart();
isParagraphSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
lastLineStartPosition = current;
- if (current.isParagraphStart()) {
- if(lastPosition.isArticleStart()) {
+ if (current.isParagraphStart())
+ {
+ if(lastPosition.isArticleStart())
+ {
writeParagraphStart();
- } else {
+ }
+ else
+ {
writeLineSeparator();
writeParagraphSeparator();
}
- } else {
+ }
+ else
+ {
writeLineSeparator();
}
return lastLineStartPosition;
@@ -1610,57 +1641,79 @@ public class PDFTextStripper extends PDF
* @param maxHeightForLine max height for text positions since lasLineStartPosition.
*/
protected void isParagraphSeparation(PositionWrapper position,
- PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, float maxHeightForLine){
+ PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, float maxHeightForLine)
+ {
boolean result = false;
- if(lastLineStartPosition == null) {
+ if(lastLineStartPosition == null)
+ {
result = true;
- }else{
+ }
+ else
+ {
float yGap = Math.abs(position.getTextPosition().getYDirAdj()-
lastPosition.getTextPosition().getYDirAdj());
float xGap = (position.getTextPosition().getXDirAdj()-
lastLineStartPosition.getTextPosition().getXDirAdj());//do we need to flip this for rtl?
- if(yGap > (getDropThreshold()*maxHeightForLine)){
+ if(yGap > (getDropThreshold()*maxHeightForLine))
+ {
result = true;
- }else if(xGap > (getIndentThreshold()*position.getTextPosition().getWidthOfSpace())){
+ }
+ else if(xGap > (getIndentThreshold()*position.getTextPosition().getWidthOfSpace()))
+ {
//text is indented, but try to screen for hanging indent
- if(!lastLineStartPosition.isParagraphStart()){
+ if(!lastLineStartPosition.isParagraphStart())
+ {
result = true;
- }else{
+ }
+ else
+ {
position.setHangingIndent();
}
- }else if(xGap < -position.getTextPosition().getWidthOfSpace()){
+ }
+ else if(xGap < -position.getTextPosition().getWidthOfSpace())
+ {
//text is left of previous line. Was it a hanging indent?
- if(!lastLineStartPosition.isParagraphStart()){
+ if(!lastLineStartPosition.isParagraphStart())
+ {
result = true;
}
- }else if(Math.abs(xGap) < (0.25 * position.getTextPosition().getWidth())){
+ }
+ else if(Math.abs(xGap) < (0.25 * position.getTextPosition().getWidth()))
+ {
//current horizontal position is within 1/4 a char of the last
//linestart. We'll treat them as lined up.
- if(lastLineStartPosition.isHangingIndent()){
+ if(lastLineStartPosition.isHangingIndent())
+ {
position.setHangingIndent();
- }else if(lastLineStartPosition.isParagraphStart()){
+ }
+ else if(lastLineStartPosition.isParagraphStart())
+ {
//check to see if the previous line looks like
//any of a number of standard list item formats
Pattern liPattern = matchListItemPattern(lastLineStartPosition);
- if(liPattern!=null){
+ if(liPattern!=null)
+ {
Pattern currentPattern = matchListItemPattern(position);
- if(liPattern == currentPattern){
- result = true;
+ if(liPattern == currentPattern)
+ {
+ result = true;
}
}
}
}
}
- if(result){
+ if(result)
+ {
position.setParagraphStart();
}
}
/**
* writes the paragraph separator string to the output.
- * @throws IOException
+ * @throws IOException if something went wrong
*/
- protected void writeParagraphSeparator()throws IOException{
+ protected void writeParagraphSeparator()throws IOException
+ {
writeParagraphEnd();
writeParagraphStart();
}
@@ -1669,7 +1722,8 @@ public class PDFTextStripper extends PDF
* Write something (if defined) at the start of a paragraph.
* @throws IOException if something went wrong
*/
- protected void writeParagraphStart() throws IOException{
+ protected void writeParagraphStart() throws IOException
+ {
output.write(getParagraphStart());
}
@@ -1677,7 +1731,8 @@ public class PDFTextStripper extends PDF
* Write something (if defined) at the end of a paragraph.
* @throws IOException if something went wrong
*/
- protected void writeParagraphEnd() throws IOException{
+ protected void writeParagraphEnd() throws IOException
+ {
output.write(getParagraphEnd());
}
@@ -1685,7 +1740,8 @@ public class PDFTextStripper extends PDF
* Write something (if defined) at the start of a page.
* @throws IOException if something went wrong
*/
- protected void writePageStart()throws IOException{
+ protected void writePageStart()throws IOException
+ {
output.write(getPageStart());
}
@@ -1693,7 +1749,8 @@ public class PDFTextStripper extends PDF
* Write something (if defined) at the end of a page.
* @throws IOException if something went wrong
*/
- protected void writePageEnd()throws IOException{
+ protected void writePageEnd()throws IOException
+ {
output.write(getPageEnd());
}
@@ -1709,7 +1766,8 @@ public class PDFTextStripper extends PDF
* @param pw
* @return
*/
- protected Pattern matchListItemPattern(PositionWrapper pw) {
+ protected Pattern matchListItemPattern(PositionWrapper pw)
+ {
TextPosition tp = pw.getTextPosition();
String txt = tp.getCharacter();
Pattern p = matchPattern(txt,getListItemPatterns());
@@ -1743,7 +1801,8 @@ public class PDFTextStripper extends PDF
*
* @param patterns
*/
- protected void setListItemPatterns(List<Pattern> patterns){
+ protected void setListItemPatterns(List<Pattern> patterns)
+ {
liPatterns = patterns;
}
@@ -1767,10 +1826,13 @@ public class PDFTextStripper extends PDF
* This method returns a list of such regular expression Patterns.
* @return a list of Pattern objects.
*/
- protected List<Pattern> getListItemPatterns(){
- if(liPatterns == null){
+ protected List<Pattern> getListItemPatterns()
+ {
+ if(liPatterns == null)
+ {
liPatterns = new ArrayList<Pattern>();
- for(String expression : LIST_ITEM_EXPRESSIONS){
+ for(String expression : LIST_ITEM_EXPRESSIONS)
+ {
Pattern p = Pattern.compile(expression);
liPatterns.add(p);
}
@@ -1792,10 +1854,13 @@ public class PDFTextStripper extends PDF
* @param patterns
* @return
*/
- protected static final Pattern matchPattern(String s, List<Pattern> patterns){
+ protected static final Pattern matchPattern(String s, List<Pattern> patterns)
+ {
Pattern matchedPattern = null;
- for(Pattern p : patterns){
- if(p.matcher(s).matches()){
+ for(Pattern p : patterns)
+ {
+ if(p.matcher(s).matches())
+ {
return p;
}
}
@@ -1808,20 +1873,29 @@ public class PDFTextStripper extends PDF
* @param isRtlDominant determines if rtl or ltl is dominant
* @throws IOException if something went wrong
*/
- private void writeLine(List<String> line, boolean isRtlDominant)throws IOException{
+ private void writeLine(List<String> line, boolean isRtlDominant)throws IOException
+ {
int numberOfStrings = line.size();
- if (isRtlDominant) {
- for(int i=numberOfStrings-1; i>=0; i--){
+ if (isRtlDominant)
+ {
+ for(int i=numberOfStrings-1; i>=0; i--)
+ {
if (i < numberOfStrings-1)
+ {
writeWordSeparator();
+ }
writeString(line.get(i));
}
}
- else {
- for(int i=0; i<numberOfStrings; i++){
+ else
+ {
+ for(int i=0; i<numberOfStrings; i++)
+ {
writeString(line.get(i));
if (!isRtlDominant && i < numberOfStrings-1)
+ {
writeWordSeparator();
+ }
}
}
}
@@ -1833,26 +1907,33 @@ public class PDFTextStripper extends PDF
* @param hasRtl determines if lines contains rtl formatted text(parts)
* @return a list of strings, one string for every word
*/
- private List<String> normalize(List<TextPosition> line, boolean isRtlDominant, boolean hasRtl){
+ private List<String> normalize(List<TextPosition> line, boolean isRtlDominant, boolean hasRtl)
+ {
LinkedList<String> normalized = new LinkedList<String>();
StringBuilder lineBuilder = new StringBuilder();
- for(TextPosition text : line){
- if (text instanceof WordSeparator) {
+ for(TextPosition text : line)
+ {
+ if (text instanceof WordSeparator)
+ {
String lineStr = lineBuilder.toString();
- if (hasRtl) {
+ if (hasRtl)
+ {
lineStr = normalize.makeLineLogicalOrder(lineStr,isRtlDominant);
}
lineStr = normalize.normalizePres(lineStr);
normalized.add(lineStr);
lineBuilder = new StringBuilder();
}
- else {
+ else
+ {
lineBuilder.append(text.getCharacter());
}
}
- if (lineBuilder.length() > 0) {
+ if (lineBuilder.length() > 0)
+ {
String lineStr = lineBuilder.toString();
- if (hasRtl) {
+ if (hasRtl)
+ {
lineStr = normalize.makeLineLogicalOrder(lineStr,isRtlDominant);
}
lineStr = normalize.normalizePres(lineStr);
@@ -1867,13 +1948,16 @@ public class PDFTextStripper extends PDF
* @author ME21969
*
*/
- private static final class WordSeparator extends TextPosition{
+ private static final class WordSeparator extends TextPosition
+ {
private static final WordSeparator separator = new WordSeparator();
- private WordSeparator(){
+ private WordSeparator()
+ {
}
- public static final WordSeparator getSeparator(){
+ public static final WordSeparator getSeparator()
+ {
return separator;
}