You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2013/11/24 14:55:24 UTC
svn commit: r1544975 - in /pdfbox/branches/1.8: ./
pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Author: lehmi
Date: Sun Nov 24 13:55:24 2013
New Revision: 1544975
URL: http://svn.apache.org/r1544975
Log:
PDFBOX-1213: added some style information to the PDF2HTML converter as proposed by Axel Dörfler
Modified:
pdfbox/branches/1.8/ (props changed)
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Propchange: pdfbox/branches/1.8/
------------------------------------------------------------------------------
Merged /pdfbox/trunk:r1544818
Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java?rev=1544975&r1=1544974&r2=1544975&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java Sun Nov 24 13:55:24 2013
@@ -17,25 +17,28 @@
package org.apache.pdfbox.util;
import java.io.IOException;
-
+import java.util.ArrayList;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
+import java.util.Set;
import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
/**
* Wrap stripped text in simple HTML, trying to form HTML paragraphs. Paragraphs
* broken by pages, columns, or figures are not mended.
*
- *
* @author jjb - http://www.johnjbarton.com
- * @version $Revision: 1.3 $
+ *
*/
public class PDFText2HTML extends PDFTextStripper
{
private static final int INITIAL_PDF_TO_HTML_BYTES = 8192;
private boolean onFirstPage = true;
+ private FontState fontState = new FontState();
/**
* Constructor.
@@ -180,6 +183,19 @@ public class PDFText2HTML extends PDFTex
}
/**
+ * Write a string to the output stream, maintain font state, and escape some HTML characters.
+ * The font state is only preserved per word.
+ *
+ * @param text The text to write to the stream.
+ * @param textPositions the corresponding text positions
+ * @throws IOException If there is an error writing to the stream.
+ */
+ protected void writeString(String text, List<TextPosition> textPositions) throws IOException
+ {
+ super.writeString(fontState.push(text, textPositions));
+ }
+
+ /**
* Write a string to the output stream and escape some HTML characters.
*
* @param chars String to be written to the stream
@@ -192,44 +208,215 @@ public class PDFText2HTML extends PDFTex
}
/**
+ * Writes the paragraph end "</p>" to the output. Furthermore, it will also clear the font state.
+ *
+ * {@inheritDoc}
+ */
+ @Override
+ protected void writeParagraphEnd() throws IOException
+ {
+ writeString(fontState.clear());
+ super.writeParagraphEnd();
+ }
+
+ /**
* Escape some HTML characters.
*
* @param chars String to be escaped
* @return returns escaped String.
*/
- private String escape(String chars)
+ private static String escape(String chars)
{
StringBuilder builder = new StringBuilder(chars.length());
for (int i = 0; i < chars.length(); i++)
{
- char c = chars.charAt(i);
- // write non-ASCII as named entities
- if ((c < 32) || (c > 126))
+ appendEscaped(builder, chars.charAt(i));
+ }
+ return builder.toString();
+ }
+
+ private static void appendEscaped(StringBuilder builder, char character)
+ {
+ // write control and non-ASCII characters as numeric character references
+ if ((character < 32) || (character > 126))
+ {
+ int charAsInt = character;
+ builder.append("&#").append(charAsInt).append(";");
+ }
+ else
+ {
+ switch (character)
{
- int charAsInt = c;
- builder.append("&#").append(charAsInt).append(";");
+ case 34:
+ builder.append("&quot;");
+ break;
+ case 38:
+ builder.append("&amp;");
+ break;
+ case 60:
+ builder.append("&lt;");
+ break;
+ case 62:
+ builder.append("&gt;");
+ break;
+ default:
+ builder.append(String.valueOf(character));
}
- else
+ }
+ }
+
+ /**
+ * A helper class to maintain the current font state. Its public methods will emit opening and
+ * closing tags as needed, and in the correct order.
+ *
+ * @author Axel Dörfler
+ */
+ private static class FontState
+ {
+ protected List<String> stateList = new ArrayList<String>();
+ protected Set<String> stateSet = new HashSet<String>();
+
+ /**
+ * Pushes new {@link TextPosition TextPositions} into the font state. The state is only
+ * preserved correctly for each letter if the number of letters in <code>text</code> matches
+ * the number of {@link TextPosition} objects. Otherwise, it's done once for the complete
+ * array (just by looking at its first entry).
+ *
+ * @return A string that contains the text including tag changes caused by its font state.
+ */
+ public String push(String text, List<TextPosition> textPositions)
+ {
+ StringBuilder buffer = new StringBuilder();
+
+ if (text.length() == textPositions.size())
{
- switch (c)
+ // There is a 1:1 mapping, and we can use the TextPositions directly
+ for (int i = 0; i < text.length(); i++)
{
- case 34:
- builder.append("&quot;");
- break;
- case 38:
- builder.append("&amp;");
- break;
- case 60:
- builder.append("&lt;");
- break;
- case 62:
- builder.append("&gt;");
- break;
- default:
- builder.append(String.valueOf(c));
+ push(buffer, text.charAt(i), textPositions.get(i));
}
}
+ else if (!text.isEmpty())
+ {
+ // The normalized text does not match the number of TextPositions, so we'll just
+ // have a look at its first entry.
+ // TODO change PDFTextStripper.normalize() such that it maintains the 1:1 relation
+ if (textPositions.isEmpty())
+ {
+ return text;
+ }
+ push(buffer, text.charAt(0), textPositions.get(0));
+ buffer.append(escape(text.substring(1)));
+ }
+ return buffer.toString();
+ }
+
+ /**
+ * Closes all open states.
+ * @return A string that contains the closing tags of all currently open states.
+ */
+ public String clear()
+ {
+ StringBuilder buffer = new StringBuilder();
+ closeUntil(buffer, null);
+ stateList.clear();
+ stateSet.clear();
+ return buffer.toString();
+ }
+
+ protected String push(StringBuilder buffer, char character, TextPosition textPosition)
+ {
+ boolean bold = false;
+ boolean italics = false;
+
+ PDFontDescriptor descriptor = textPosition.getFont().getFontDescriptor();
+ if (descriptor != null)
+ {
+ bold = isBold(descriptor);
+ italics = isItalic(descriptor);
+ }
+
+ buffer.append(bold ? open("b") : close("b"));
+ buffer.append(italics ? open("i") : close("i"));
+ appendEscaped(buffer, character);
+
+ return buffer.toString();
+ }
+
+ private String open(String tag)
+ {
+ if (stateSet.contains(tag))
+ {
+ return "";
+ }
+ stateList.add(tag);
+ stateSet.add(tag);
+
+ return openTag(tag);
+ }
+
+ private String close(String tag)
+ {
+ if (!stateSet.contains(tag))
+ {
+ return "";
+ }
+ // Close all tags up to and including the one we should close
+ StringBuilder tagsBuilder = new StringBuilder();
+ int index = closeUntil(tagsBuilder, tag);
+
+ // Remove from state
+ stateList.remove(index);
+ stateSet.remove(tag);
+
+ // Now open the states that were closed but should remain open again
+ for (; index < stateList.size(); index++)
+ {
+ tagsBuilder.append(openTag(stateList.get(index)));
+ }
+ return tagsBuilder.toString();
+ }
+
+ private int closeUntil(StringBuilder tagsBuilder, String endTag)
+ {
+ for (int i = stateList.size(); i-- > 0;)
+ {
+ String tag = stateList.get(i);
+ tagsBuilder.append(closeTag(tag));
+ if (endTag != null && tag.equals(endTag))
+ {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ private String openTag(String tag)
+ {
+ return "<" + tag + ">";
+ }
+
+ private String closeTag(String tag)
+ {
+ return "</" + tag + ">";
+ }
+
+ private boolean isBold(PDFontDescriptor descriptor)
+ {
+ if (descriptor.isForceBold())
+ {
+ return true;
+ }
+ return descriptor.getFontName().contains("Bold");
+ }
+
+ private boolean isItalic(PDFontDescriptor descriptor)
+ {
+ if (descriptor.isItalic())
+ {
+ return true;
+ }
+ return descriptor.getFontName().contains("Italic");
}
- return builder.toString();
}
}
Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1544975&r1=1544974&r2=1544975&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Sun Nov 24 13:55:24 2013
@@ -59,7 +59,7 @@ import org.apache.pdfbox.pdmodel.interac
* Eventually, we fully process each page and then print it.
*
* @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
- * @version $Revision: 1.70 $
+ *
*/
public class PDFTextStripper extends PDFStreamEngine
{
@@ -120,7 +120,6 @@ public class PDFTextStripper extends PDF
private String articleStart = "";
private String articleEnd = "";
-
private int currentPageNo = 0;
private int startPage = 1;
private int endPage = Integer.MAX_VALUE;
@@ -188,7 +187,6 @@ public class PDFTextStripper extends PDF
*/
private boolean inParagraph;
-
/**
* Instantiate a new PDFTextStripper object. This object will load
* properties from PDFTextStripper.properties and will not do
@@ -205,7 +203,6 @@ public class PDFTextStripper extends PDF
normalize = new TextNormalize(this.outputEncoding);
}
-
/**
* Instantiate a new PDFTextStripper object. Loading all of the operator mappings
* from the properties object that is passed in. Does not convert the text
@@ -243,9 +240,7 @@ public class PDFTextStripper extends PDF
* NOTE: The document must not be encrypted when coming into this method.
*
* @param doc The document to get the text from.
- *
* @return The text of the PDF document.
- *
* @throws IOException if the doc state is invalid or it is encrypted.
*/
public String getText( PDDocument doc ) throws IOException
@@ -331,7 +326,6 @@ public class PDFTextStripper extends PDF
throw new WrappedIOException("Error: document is encrypted", e);
}
}
-
processPages( document.getDocumentCatalog().getAllPages() );
endDocument(document);
}
@@ -349,7 +343,6 @@ public class PDFTextStripper extends PDF
{
startBookmarkPageNumber = getPageNumber( startBookmark, pages );
}
-
if( endBookmark != null )
{
endBookmarkPageNumber = getPageNumber( endBookmark, pages );
@@ -365,8 +358,6 @@ public class PDFTextStripper extends PDF
startBookmarkPageNumber = 0;
endBookmarkPageNumber = 0;
}
-
-
Iterator<COSObjectable> pageIter = pages.iterator();
while( pageIter.hasNext() )
{
@@ -450,13 +441,11 @@ public class PDFTextStripper extends PDF
charactersByArticle.set( i, new ArrayList<TextPosition>() );
}
}
-
characterListMapping.clear();
processStream( page, page.findResources(), content );
writePage();
endPage( page );
}
-
}
/**
@@ -564,9 +553,7 @@ public class PDFTextStripper extends PDF
TextPositionComparator comparator = new TextPositionComparator();
Collections.sort( textList, comparator );
}
-
Iterator<TextPosition> textIter = textList.iterator();
-
/* Before we can display the text, we need to do some normalizing.
* Arabic and Hebrew text is right to left and is typically stored
* in its logical format, which means that the rightmost character is
@@ -607,7 +594,6 @@ public class PDFTextStripper extends PDF
}
}
}
-
// choose the dominant direction
boolean isRtlDominant = rtlCnt > ltrCnt;
@@ -742,17 +728,14 @@ public class PDFTextStripper extends PDF
{
writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
line.clear();
-
lastLineStartPosition =
handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
-
endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
maxYForLine = MAXYFORLINE_RESET_VALUE;
maxHeightForLine = MAXHEIGHTFORLINE_RESET_VALUE;
minYTopForLine = MINYTOPFORLINE_RESET_VALUE;
}
-
//Test if our TextPosition starts after a new word would be expected to start.
if (expectedStartOfNextWordX != EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE
&& expectedStartOfNextWordX < positionX &&
@@ -763,12 +746,10 @@ public class PDFTextStripper extends PDF
line.add(WordSeparator.getSeparator());
}
}
-
if (positionY >= maxYForLine)
{
maxYForLine = positionY;
}
-
// RDD - endX is what PDF considers to be the x coordinate of the
// end position of the text. We use it in computing our metrics below.
endOfLastTextX = positionX + positionWidth;
@@ -795,14 +776,12 @@ public class PDFTextStripper extends PDF
lastWordSpacing = wordSpacing;
previousAveCharWidth = averageCharWidth;
}
-
// print the final line
if (line.size() > 0)
{
writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
writeParagraphEnd();
}
-
endArticle();
}
writePageEnd();
@@ -823,7 +802,6 @@ public class PDFTextStripper extends PDF
{
// RDD - newline at end of flush - required for end of page (so that the top
// of the next page starts on its own line.
- //
output.write(getPageSeparator());
output.flush();
}
@@ -861,6 +839,19 @@ public class PDFTextStripper extends PDF
}
/**
+ * Write a Java string to the output stream. The default implementation will ignore the <code>textPositions</code>
+ * and just call {@link #writeString(String)}.
+ *
+ * @param text The text to write to the stream.
+ * @param textPositions The TextPositions belonging to the text.
+ * @throws IOException If there is an error when writing the text.
+ */
+ protected void writeString(String text, List<TextPosition> textPositions) throws IOException
+ {
+ writeString(text);
+ }
+
+ /**
* Write a Java string to the output stream.
*
* @param text The text to write to the stream.
@@ -905,7 +896,6 @@ public class PDFTextStripper extends PDF
sameTextCharacters = new TreeMap<Float, TreeSet<Float>>();
characterListMapping.put( textCharacter, sameTextCharacters );
}
-
// RDD - Here we compute the value that represents the end of the rendered
// text. This value is used to determine whether subsequent text rendered
// on the same line overwrites the current text.
@@ -932,7 +922,6 @@ public class PDFTextStripper extends PDF
break;
}
}
-
if( !suppressCharacter )
{
TreeSet<Float> ySet = sameTextCharacters.get(textX);
@@ -945,7 +934,6 @@ public class PDFTextStripper extends PDF
showCharacter = true;
}
}
-
if( showCharacter )
{
//if we are showing the character then we need to determine which
@@ -1220,7 +1208,7 @@ public class PDFTextStripper extends PDF
public void setSuppressDuplicateOverlappingText(
boolean suppressDuplicateOverlappingTextValue)
{
- this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
+ suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
}
/**
@@ -1240,7 +1228,7 @@ public class PDFTextStripper extends PDF
*/
public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
{
- this.shouldSeparateByBeads = aShouldSeparateByBeads;
+ shouldSeparateByBeads = aShouldSeparateByBeads;
}
/**
@@ -1353,7 +1341,7 @@ public class PDFTextStripper extends PDF
*/
public void setSpacingTolerance(float spacingToleranceValue)
{
- this.spacingTolerance = spacingToleranceValue;
+ spacingTolerance = spacingToleranceValue;
}
/**
@@ -1378,7 +1366,7 @@ public class PDFTextStripper extends PDF
*/
public void setAverageCharTolerance(float averageCharToleranceValue)
{
- this.averageCharTolerance = averageCharToleranceValue;
+ averageCharTolerance = averageCharToleranceValue;
}
@@ -1455,7 +1443,7 @@ public class PDFTextStripper extends PDF
*/
public void setParagraphStart(String s)
{
- this.paragraphStart = s;
+ paragraphStart = s;
}
/**
@@ -1473,7 +1461,7 @@ public class PDFTextStripper extends PDF
*/
public void setParagraphEnd(String s)
{
- this.paragraphEnd = s;
+ paragraphEnd = s;
}
@@ -1576,7 +1564,6 @@ public class PDFTextStripper extends PDF
return str;
}
}
-
StringBuilder reversed = new StringBuilder(str.length());
for (int i = str.length() - 1; i >= 0; --i)
{
@@ -1784,8 +1771,7 @@ public class PDFTextStripper extends PDF
{
TextPosition tp = pw.getTextPosition();
String txt = tp.getCharacter();
- Pattern p = matchPattern(txt,getListItemPatterns());
- return p;
+ return matchPattern(txt,getListItemPatterns());
}
/**
@@ -1820,7 +1806,6 @@ public class PDFTextStripper extends PDF
listOfPatterns = patterns;
}
-
/**
* returns a list of regular expression Patterns representing
* different common list item formats. For example
@@ -1887,12 +1872,13 @@ public class PDFTextStripper extends PDF
* @param isRtlDominant determines if rtl or ltr is dominant
* @throws IOException if something went wrong
*/
- private void writeLine(List<String> line, boolean isRtlDominant)throws IOException
+ private void writeLine(List<WordWithTextPositions> line, boolean isRtlDominant) throws IOException
{
int numberOfStrings = line.size();
for(int i=0; i<numberOfStrings; i++)
{
- writeString(line.get(i));
+ WordWithTextPositions word = line.get(i);
+ writeString(word.getText(), word.getTextPositions());
if (i < numberOfStrings-1)
{
writeWordSeparator();
@@ -1907,55 +1893,65 @@ public class PDFTextStripper extends PDF
* @param hasRtl determines if lines contains rtl formatted text(parts)
* @return a list of strings, one string for every word
*/
- private List<String> normalize(List<TextPosition> line, boolean isRtlDominant, boolean hasRtl)
+ private List<WordWithTextPositions> normalize(List<TextPosition> line, boolean isRtlDominant, boolean hasRtl)
{
- LinkedList<String> normalized = new LinkedList<String>();
+ LinkedList<WordWithTextPositions> normalized = new LinkedList<WordWithTextPositions>();
StringBuilder lineBuilder = new StringBuilder();
+ List<TextPosition> wordPositions = new ArrayList<TextPosition>();
// concatenate the pieces of text in opposite order if RTL is dominant
if (isRtlDominant)
{
int numberOfPositions = line.size();
for(int i = numberOfPositions-1;i>=0;i--)
{
- TextPosition text = line.get(i);
- if (text instanceof WordSeparator)
- {
- normalized.add(normalize.normalizePres(lineBuilder.toString()));
- lineBuilder = new StringBuilder();
- }
- else
- {
- lineBuilder.append(text.getCharacter());
- }
- }
- if (lineBuilder.length() > 0)
- {
- normalized.add(normalize.normalizePres(lineBuilder.toString()));
+ lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, line.get(i));
}
}
else
{
for(TextPosition text : line)
{
- if (text instanceof WordSeparator)
- {
- normalized.add(normalize.normalizePres(lineBuilder.toString()));
- lineBuilder = new StringBuilder();
- }
- else
- {
- lineBuilder.append(text.getCharacter());
- }
- }
- if (lineBuilder.length() > 0)
- {
- normalized.add(normalize.normalizePres(lineBuilder.toString()));
+ lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, text);
}
}
+ if (lineBuilder.length() > 0)
+ {
+ normalized.add(createWord(lineBuilder.toString(), wordPositions));
+ }
return normalized;
}
/**
+ * Used within {@link #normalize(List, boolean, boolean)} to create a single {@link WordWithTextPositions}
+ * entry.
+ */
+ private WordWithTextPositions createWord(String word, List<TextPosition> wordPositions)
+ {
+ return new WordWithTextPositions(normalize.normalizePres(word), wordPositions);
+ }
+
+ /**
+ * Used within {@link #normalize(List, boolean, boolean)} to handle a {@link TextPosition}.
+ * @return The StringBuilder that must be passed to the next call of this method.
+ */
+ private StringBuilder normalizeAdd(LinkedList<WordWithTextPositions> normalized,
+ StringBuilder lineBuilder, List<TextPosition> wordPositions, TextPosition text)
+ {
+ if (text instanceof WordSeparator)
+ {
+ normalized.add(createWord(lineBuilder.toString(), wordPositions));
+ lineBuilder = new StringBuilder();
+ wordPositions.clear();
+ }
+ else
+ {
+ lineBuilder.append(text.getCharacter());
+ wordPositions.add(text);
+ }
+ return lineBuilder;
+ }
+
+ /**
* internal marker class. Used as a place holder in
* a line of TextPositions.
* @author ME21969
@@ -1973,7 +1969,34 @@ public class PDFTextStripper extends PDF
{
return separator;
}
-
}
+ /**
+ * Internal class that maps strings to lists of {@link TextPosition} arrays.
+ * Note that the number of entries in that list may differ from the number of characters in the
+ * string due to normalization.
+ *
+ * @author Axel Dörfler
+ */
+ private static final class WordWithTextPositions
+ {
+ protected String text;
+ protected List<TextPosition> textPositions;
+
+ public WordWithTextPositions(String word, List<TextPosition> positions)
+ {
+ text = word;
+ textPositions = positions;
+ }
+
+ public String getText()
+ {
+ return text;
+ }
+
+ public List<TextPosition> getTextPositions()
+ {
+ return textPositions;
+ }
+ }
}