You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ca...@apache.org on 2009/02/25 17:50:41 UTC
svn commit: r747858 - in
/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox: ExtractText.java
util/PDFText2HTML.java util/PDFTextStripper.java
Author: carrier
Date: Wed Feb 25 16:50:41 2009
New Revision: 747858
URL: http://svn.apache.org/viewvc?rev=747858&view=rev
Log:
Patch for PDFBOX-434 to add new HTML output features for text extraction. Patch by Justin LeFebvre <justinl at basistech dot com>
Modified:
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/ExtractText.java
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/ExtractText.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/ExtractText.java?rev=747858&r1=747857&r2=747858&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/ExtractText.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/ExtractText.java Wed Feb 25 16:50:41 2009
@@ -39,20 +39,6 @@
*/
public class ExtractText
{
- /**
- * This is the default encoding of the text to be output.
- */
- public static final String DEFAULT_ENCODING =
- null;
- //"ISO-8859-1";
- //"ISO-8859-6"; //arabic
- //"US-ASCII";
- //"UTF-8";
- //"UTF-16";
- //"UTF-16BE";
- //"UTF-16LE";
-
-
private static final String PASSWORD = "-password";
private static final String ENCODING = "-encoding";
private static final String CONSOLE = "-console";
@@ -82,9 +68,11 @@
boolean toHTML = false;
boolean sort = false;
String password = "";
- String encoding = DEFAULT_ENCODING;
+ String encoding = null;
String pdfFile = null;
- String textFile = null;
+ String outputFile = null;
+ // Defaults to text files
+ String ext = ".txt";
int startPage = 1;
int endPage = Integer.MAX_VALUE;
for( int i=0; i<args.length; i++ )
@@ -119,6 +107,7 @@
else if( args[i].equals( HTML ) )
{
toHTML = true;
+ ext = ".html";
}
else if( args[i].equals( SORT ) )
{
@@ -145,7 +134,7 @@
}
else
{
- textFile = args[i];
+ outputFile = args[i];
}
}
}
@@ -168,19 +157,17 @@
URL url = new URL( pdfFile );
document = PDDocument.load( url );
String fileName = url.getFile();
- if( textFile == null && fileName.length() >4 )
+ if( outputFile == null && fileName.length() >4 )
{
- File outputFile =
- new File( fileName.substring( 0, fileName.length() -4 ) + ".txt" );
- textFile = outputFile.getName();
+ outputFile = new File( fileName.substring( 0, fileName.length() -4 ) + ext ).getName();
}
}
catch( MalformedURLException e )
{
document = PDDocument.load( pdfFile );
- if( textFile == null && pdfFile.length() >4 )
+ if( outputFile == null && pdfFile.length() >4 )
{
- textFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ".txt";
+ outputFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ext;
}
}
@@ -196,6 +183,10 @@
throw new IOException( "You do not have permission to extract text" );
}
}
+
+ if ((encoding == null) && (toHTML))
+ encoding = "UTF-8";
+
if( toConsole )
{
output = new OutputStreamWriter( System.out );
@@ -205,24 +196,24 @@
if( encoding != null )
{
output = new OutputStreamWriter(
- new FileOutputStream( textFile ), encoding );
+ new FileOutputStream( outputFile ), encoding );
}
else
{
//use default encoding
output = new OutputStreamWriter(
- new FileOutputStream( textFile ) );
+ new FileOutputStream( outputFile ) );
}
}
PDFTextStripper stripper = null;
if(toHTML)
{
- stripper = new PDFText2HTML();
+ stripper = new PDFText2HTML(encoding);
}
else
{
- stripper = new PDFTextStripper();
+ stripper = new PDFTextStripper();
}
stripper.setSortByPosition( sort );
stripper.setStartPage( startPage );
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java?rev=747858&r1=747857&r2=747858&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java Wed Feb 25 16:50:41 2009
@@ -24,74 +24,55 @@
import org.apache.pdfbox.pdmodel.PDDocument;
/**
- * Wrap stripped text in simple HTML, trying to form HTML paragraphs.
- * Paragraphs broken by pages, columns, or figures are not mended.
- *
- *
+ * Wrap stripped text in simple HTML, trying to form HTML paragraphs. Paragraphs
+ * broken by pages, columns, or figures are not mended.
+ *
+ *
* @author jjb - http://www.johnjbarton.com
- * @version $Revision: 1.3 $
+ * @version $Revision: 1.3 $
*/
-public class PDFText2HTML extends PDFTextStripper
-{
+public class PDFText2HTML extends PDFTextStripper {
private static final int INITIAL_PDF_TO_HTML_BYTES = 8192;
- private TextPosition beginTitle;
- private TextPosition afterEndTitle;
- private String titleGuess;
- private boolean suppressParagraphs;
private boolean onFirstPage = true;
+ private String encoding;
/**
* Constructor.
- *
- * @throws IOException If there is an error during initialization.
+ *
+ * @throws IOException
+ * If there is an error during initialization.
*/
- public PDFText2HTML() throws IOException
- {
- titleGuess = "";
- beginTitle = null;
- afterEndTitle = null;
- suppressParagraphs = false;
+ public PDFText2HTML(String encoding) throws IOException {
+ this.encoding = encoding;
+ this.lineSeparator = "<br>" + System.getProperty("line.separator");
}
/**
- * Write the header to the output document.
- *
- * @throws IOException If there is a problem writing out the header to the document.
+ * Write the header to the output document. Now also writes the tag defining
+ * the character encoding.
+ *
+ * @throws IOException
+ * If there is a problem writing out the header to the document.
*/
- protected void writeHeader() throws IOException
- {
+ protected void writeHeader() throws IOException {
StringBuffer buf = new StringBuffer(INITIAL_PDF_TO_HTML_BYTES);
+ buf.append("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"" + "\n" + "\"http://www.w3.org/TR/html4/loose.dtd\">\n");
buf.append("<html><head>");
- buf.append("<title>");
- buf.append(getTitleGuess());
- buf.append("</title>");
- buf.append("</head>");
+ buf.append("<title>" + getTitle() + "</title>\n");
+ if(encoding != null){
+ buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + this.encoding + "\">\n");
+ }
+ buf.append("</head>\n");
buf.append("<body>\n");
super.writeString(buf.toString());
}
/**
- * The guess to the document title.
- *
- * @return A string that is the title of this document.
- */
- protected String getTitleGuess()
- {
- return titleGuess;
- }
-
-
- /**
* {@inheritDoc}
*/
- protected void writePage() throws IOException
- {
- Iterator textIter = getCharactersByArticle().iterator();
-
- if (onFirstPage)
- {
- guessTitle(textIter);
+ protected void writePage() throws IOException {
+ if (onFirstPage) {
writeHeader();
onFirstPage = false;
}
@@ -101,109 +82,91 @@
/**
* {@inheritDoc}
*/
- public void endDocument(PDDocument pdf) throws IOException
- {
+ public void endDocument(PDDocument pdf) throws IOException {
super.writeString("</body></html>");
}
/**
- * This method will attempt to guess the title of the document.
- *
- * @param textIter The characters on the first page.
- * @return The text position that is guessed to be the title.
- */
- protected TextPosition guessTitle(Iterator textIter)
- {
- float lastFontSize = -1.0f;
- int stringsInFont = 0;
- StringBuffer titleText = new StringBuffer();
- while (textIter.hasNext())
- {
- Iterator textByArticle = ((List)textIter.next()).iterator();
- while( textByArticle.hasNext() )
- {
- TextPosition position = (TextPosition) textByArticle.next();
- float currentFontSize = position.getFontSize();
- if (currentFontSize != lastFontSize)
- {
- if (beginTitle != null)
- { // font change in candidate title.
- if (stringsInFont == 0)
- {
- beginTitle = null; // false alarm
- titleText.setLength(0);
- }
- else
- {
- // had a significant font with some words: call it a title
- titleGuess = titleText.toString();
- afterEndTitle = position;
- return beginTitle;
+ * This method will attempt to guess the title of the document using
+ * either the document properties or the first lines of text.
+ *
+ * @return returns the title.
+ */
+ protected String getTitle() {
+ String titleGuess = document.getDocumentInformation().getTitle();
+ if(titleGuess != null && titleGuess.length() > 0){
+ return titleGuess;
+ }
+ else {
+ Iterator textIter = getCharactersByArticle().iterator();
+ float lastFontSize = -1.0f;
+
+ StringBuffer titleText = new StringBuffer();
+ while (textIter.hasNext()) {
+
+ Iterator textByArticle = ((List) textIter.next()).iterator();
+ while (textByArticle.hasNext()) {
+ TextPosition position = (TextPosition) textByArticle.next();
+
+ float currentFontSize = position.getFontSize();
+ //If we're past 64 chars we will assume that we're past the title
+ //64 is arbitrary
+ if (currentFontSize != lastFontSize || titleText.length() > 64) {
+ if (titleText.length() > 0) {
+ return titleText.toString();
}
+ lastFontSize = currentFontSize;
}
- else
- { // font change and begin == null
- if (currentFontSize > 13.0f)
- { // most body text is 12pt max I guess
- beginTitle = position;
- }
+ if (currentFontSize > 13.0f) { // most body text is 12pt
+ titleText.append(position.getCharacter());
}
-
- lastFontSize = currentFontSize;
- stringsInFont = 0;
- }
- stringsInFont++;
- if (beginTitle != null)
- {
- titleText.append(position.getCharacter()+" ");
}
}
}
- return beginTitle; // null
+ return "";
}
+
/**
- * Write out the paragraph separator.
- *
- * @throws IOException If there is an error writing to the stream.
+ * Write out the article separator (div tag) with proper text direction
+ * information.
+ *
+ * @param isltr true if direction of text is left to right
+ * @throws IOException
+ * If there is an error writing to the stream.
*/
- protected void startParagraph() throws IOException
- {
- if (! suppressParagraphs)
- {
- super.writeString("<p>");
+ protected void startArticle(boolean isltr) throws IOException {
+ if (isltr) {
+ super.writeString("<div>");
+ }
+ else {
+ super.writeString("<div dir=\"RTL\">");
}
}
+
/**
- * Write out the paragraph separator.
- *
- * @throws IOException If there is an error writing to the stream.
+ * Write out the article separator.
+ *
+ * @throws IOException
+ * If there is an error writing to the stream.
*/
- protected void endParagraph() throws IOException
- {
- if (! suppressParagraphs)
- {
- super.writeString("</p>");
- }
+ protected void endArticle() throws IOException {
+ super.writeString("</div>");
}
/**
* Write a string to the output stream and escape some HTML characters
*/
- protected void writeString(String chars) throws IOException
- {
- for (int i = 0; i < chars.length(); i++)
- {
+ protected void writeString(String chars) throws IOException {
+ for (int i = 0; i < chars.length(); i++) {
char c = chars.charAt(i);
- if ((c < 32) || (c > 126))
- {
+ // write control and non-ASCII characters as numeric character references
+ if ((c < 32) || (c > 126)) {
int charAsInt = c;
super.writeString("&#" + charAsInt + ";");
- }
- else
- {
- switch (c)
- {
+ }
+ else {
+ switch (c) {
case 34:
super.writeString("&quot;");
break;
@@ -222,39 +185,4 @@
}
}
}
-
- /**
- * {@inheritDoc}
- */
- protected void writeCharacters(TextPosition position ) throws IOException
- {
- if (position == beginTitle)
- {
- super.writeString("<H1>");
- suppressParagraphs = true;
- }
- if (position == afterEndTitle)
- {
- super.writeString("</H1>"); // end title and start first paragraph
- suppressParagraphs = false;
- }
-
- writeString(position.getCharacter());
- }
-
-
- /**
- * @return Returns the suppressParagraphs.
- */
- public boolean isSuppressParagraphs()
- {
- return suppressParagraphs;
- }
- /**
- * @param shouldSuppressParagraphs The suppressParagraphs to set.
- */
- public void setSuppressParagraphs(boolean shouldSuppressParagraphs)
- {
- this.suppressParagraphs = shouldSuppressParagraphs;
- }
}
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=747858&r1=747857&r2=747858&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Wed Feb 25 16:50:41 2009
@@ -63,7 +63,7 @@
private int startBookmarkPageNumber = -1;
private PDOutlineItem endBookmark = null;
private int endBookmarkPageNumber = -1;
- private PDDocument document;
+ protected PDDocument document;
private boolean suppressDuplicateOverlappingText = true;
private boolean shouldSeparateByBeads = true;
private boolean sortByPosition = false;
@@ -88,7 +88,7 @@
private Map characterListMapping = new HashMap();
- private String lineSeparator = System.getProperty("line.separator");
+ protected String lineSeparator = System.getProperty("line.separator");
private String pageSeparator = System.getProperty("line.separator");
private String wordSeparator = " ";
@@ -335,23 +335,40 @@
}
/**
- * Start a new paragraph. Default implementation is to do nothing. Subclasses
+ * Start a new article, which is typically defined as a column
+ * on a single page (also referred to as a bead). This assumes
+ * that the primary direction of text is left to right.
+ * Default implementation is to do nothing. Subclasses
* may provide additional information.
*
* @throws IOException If there is any error writing to the stream.
*/
- protected void startParagraph() throws IOException
+ protected void startArticle() throws IOException
+ {
+ startArticle(true);
+ }
+
+ /**
+ * Start a new article, which is typically defined as a column
+ * on a single page (also referred to as a bead).
+ * Default implementation is to do nothing. Subclasses
+ * may provide additional information.
+ *
+ * @param isltr true if primary direction of text is left to right
+ * @throws IOException If there is any error writing to the stream.
+ */
+ protected void startArticle(boolean isltr) throws IOException
{
//default is to do nothing.
}
/**
- * End a paragraph. Default implementation is to do nothing. Subclasses
+ * End an article. Default implementation is to do nothing. Subclasses
* may provide additional information.
*
* @throws IOException If there is any error writing to the stream.
*/
- protected void endParagraph() throws IOException
+ protected void endArticle() throws IOException
{
//default is to do nothing
}
@@ -395,13 +412,10 @@
{
float maxYForLine = -1;
float minYTopForLine = Float.MAX_VALUE;
- //float lastBaselineFontSize = -1;
float endOfLastTextX = -1;
- //float endOfLastTextY = -1;
float expectedStartOfNextWordX = -1;
float lastWordSpacing = -1;
float maxHeightForLine = -1;
- //float lastHeightForLine = -1;
TextPosition lastPosition = null;
if (normalize == null) {
@@ -410,7 +424,6 @@
for( int i = 0; i < charactersByArticle.size(); i++)
{
- startParagraph();
List textList = (List)charactersByArticle.get( i );
if( sortByPosition )
{
@@ -464,7 +477,9 @@
if (rtlCnt > ltrCnt) {
isRtlDominant = true;
}
-
+
+ startArticle(!isRtlDominant);
+
// we will later use this to skip reordering
boolean hasRtl = false;
if (rtlCnt > 0)
@@ -501,22 +516,14 @@
positionHeight = position.getHeight();
}
-
- float wordSpacing = 0;
- /* float wordSpacing = position.getWordSpacing(); BC: When I re-enabled this for a a test, lots of extra spaces were added
+ //try to get width of a space character
+ float wordSpacing = position.getWidthOfSpace();
+ //if still zero fall back to getting the width of the current
+ //character
if( wordSpacing == 0 )
{
- */
- //try to get width of a space character
- wordSpacing = position.getWidthOfSpace();
- //if still zero fall back to getting the width of the current
- //character
- if( wordSpacing == 0 )
- {
- wordSpacing = positionWidth;
- }
- //}
-
+ wordSpacing = positionWidth;
+ }
// RDD - We add a conservative approximation for space determination.
// basically if there is a blank area between two characters that is
@@ -531,27 +538,6 @@
expectedStartOfNextWordX = endOfLastTextX + (((wordSpacing+lastWordSpacing)/2f)* 0.50f);
}
- // RDD - We will suppress text that is very close to the current line
- // and which overwrites previously rendered text on this line.
- // This is done specifically to handle a reasonably common situation
- // where an application (MS Word, in the case of my examples) renders
- // text four times at small (1 point) offsets in order to accomplish
- // bold printing. You would not want to do this step if you were
- // going to render the TextPosition objects graphically.
- //
- /*if ((endOfLastTextX != -1 && position.getX() < endOfLastTextX) &&
- (currentY != -1 && Math.abs(position.getY() - currentY) < 1))
- {
- if (log.isDebugEnabled())
- {
- log.debug("Suppressing text overwrite" +
- " x: " + position.getX() +
- " endOfLastTextX: " + endOfLastTextX +
- " string: " + position.getCharacter());
- }
- continue;
- }*/
-
// RDD - Here we determine whether this text object is on the current
// line. We use the lastBaselineFontSize to handle the superscript
// case, and the size of the current font to handle the subscript case.
@@ -559,24 +545,11 @@
// a small amount in order to be considered as being on the same line.
//
- //int verticalScaling = 1;
- //if( lastBaselineFontSize < 0 || position.getFontSize() < 0 )
- //{
- // verticalScaling = -1;
- //}
-
- if( lastPosition != null )
- {
- //if (currentY != -1 &&
- // ((position.getY() < (currentY - (lastBaselineFontSize * 0.9f * verticalScaling))) ||
- // (position.getY() > (currentY + (position.getFontSize() * 0.9f * verticalScaling)))))
- //{
+ if( lastPosition != null ){
/* XXX BC: In theory, this check should really check if the next char is in full range
* seen in this line. This is what I tried to do with minYTopForLine, but this caused a lot
* of regression test failures. So, I'm leaving it be for now. */
- if( ( !overlap( positionY, positionHeight, maxYForLine, maxHeightForLine ) ))
- //maxYForLine - minYTopForLine)))
- {
+ if(!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)){
// If we have RTL text on the page, change the direction
if (hasRtl)
lineStr = normalize.makeLineLogicalOrder(lineStr, isRtlDominant);
@@ -597,9 +570,7 @@
expectedStartOfNextWordX = -1;
maxYForLine = -1;
maxHeightForLine = -1;
- //lastBaselineFontSize = -1;
minYTopForLine = Float.MAX_VALUE;
- //lastHeightForLine = -1;
}
@@ -610,35 +581,25 @@
{
lineStr += getWordSeparator();
}
- else
- {
- //System.out.println( "Not a word separator " + position.getCharacter() + " start=" + startOfNextWordX + " x=" + position.getX() );
- }
+
}
if (positionY >= maxYForLine) {
maxYForLine = positionY;
- //lastBaselineFontSize = position.getFontSize();
}
// RDD - endX is what PDF considers to be the x coordinate of the
// end position of the text. We use it in computing our metrics below.
endOfLastTextX = positionX + positionWidth;
- //endOfLastTextY = positionY;
// add it to the list
if (characterValue != null)
{
lineStr += characterValue;
}
- else
- {
- //Position.getString() is null so not writing anything
- }
maxHeightForLine = Math.max( maxHeightForLine, positionHeight );
minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
lastPosition = position;
- //lastHeightForLine = position.getHeight();
lastWordSpacing = wordSpacing;
}
@@ -653,7 +614,7 @@
writeString(lineStr);
}
- endParagraph();
+ endArticle();
}
writePageSeperator();;