You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pdfbox.apache.org by "Tilman Hausherr (JIRA)" <ji...@apache.org> on 2017/07/27 06:37:01 UTC
[jira] [Commented] (PDFBOX-3879) Not able to get font styles, like
italic and Strikethrough
[ https://issues.apache.org/jira/browse/PDFBOX-3879?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16102788#comment-16102788 ]
Tilman Hausherr commented on PDFBOX-3879:
-----------------------------------------
Italic is either a different font (one that is italic) or a skew transformation. In your file it is a skew transformation (probably this: "1 0 0.3 -1 0 0 Tm", which is also a flip to undo a previous flip), so I'd say it is very difficult to detect. (Also, this isn't a bug; it is a "how to" question, see https://pdfbox.apache.org/support.html ). You could analyse `TextPosition.getTextMatrix()` to find out whether the text is skewed.
Strikethrough doesn't exist. In your file, it is a line that happens to go through the text.
> Not able to get font styles, like italic and Strikethrough
> ----------------------------------------------------------
>
> Key: PDFBOX-3879
> URL: https://issues.apache.org/jira/browse/PDFBOX-3879
> Project: PDFBox
> Issue Type: Bug
> Components: Text extraction
> Affects Versions: 2.0.7
> Reporter: sun pengrui
> Attachments: src.pdf
>
>
> I'm trying to extract text from a PDF file and save it to an XML file.
> The PDF file includes italic and strikethrough text, but I cannot detect these styles with the PDFont class.
> Below is the code and result.
> {code:java}
> public class TextExtractor extends PDFTextStripper {
> private static final Log LOG = LogFactory.getLog(PDFTextStripper.class);
> private final HashMap<TextPosition, String> colors = new HashMap<>();
> public TextExtractor() throws IOException {
> addOperator(new SetNonStrokingColorSpace());
> addOperator(new SetNonStrokingDeviceCMYKColor());
> addOperator(new SetNonStrokingDeviceRGBColor());
> addOperator(new SetNonStrokingDeviceGrayColor());
> addOperator(new SetNonStrokingColor());
> addOperator(new SetNonStrokingColorN());
> }
> @Override
> protected void startDocument(PDDocument document) throws IOException {
> super.startDocument(document);
> super.writeString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<document>\n");
> }
> @Override
> protected void endDocument(PDDocument document) throws IOException {
> super.endDocument(document);
> super.writeString("</document>\n");
> }
> @Override
> protected void startPage(PDPage page) throws IOException {
> super.startPage(page);
> super.writeString(String.format(" <page width=\"%f\" height=\"%f\">\n", page.getBBox().getWidth(), page.getBBox().getHeight()));
> }
> @Override
> protected void endPage(PDPage page) throws IOException {
> super.endPage(page);
> super.writeString(" </page>\n");
> }
> @Override
> protected void processTextPosition(TextPosition text) {
> super.processTextPosition(text);
> PDColor nonStrokingColor = getGraphicsState().getNonStrokingColor();
> try {
> String hex = Integer.toHexString(nonStrokingColor.toRGB() & 0xffffff);
> while (hex.length() < 6) {
> hex = "0" + hex;
> }
> colors.put(text, "#" + hex);
> } catch (IOException e) {
> e.printStackTrace();
> }
> }
> @Override
> protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
> StringBuilder builder = new StringBuilder(" <line>\n");
> String[] words = string.split(this.getWordSeparator());
> int startIndex = 0;
> for (String word : words) {
> if(Strings.isNullOrEmpty(word)){
> continue;
> }
> TextPosition startPosition = textPositions.get(startIndex);
> String color = colors.get(startPosition);
> String font = startPosition.getFont().getName();
> float fontSize = startPosition.getFontSize();
> float x = startPosition.getX();
> float y = startPosition.getY();
> TextPosition endPosition = textPositions.get(startIndex + word.length() - 1);
> float width = endPosition.getEndX() - startPosition.getX();
> float height = startPosition.getHeight();
> String template =" <word x=\"%f\" y=\"%f\" width=\"%f\" height=\"%f\" font=\"%s\" font-size=\"%.0f\" color=\"%s\">%s</word>\n";
> builder.append(String.format(template, x, y, width, height, font, fontSize, color, escape(word)));
> startIndex += word.length() + 1;
> }
> builder.append(" </line>");
> super.writeString(builder.toString());
> }
> /**
> * Escape some HTML characters.
> *
> * @param chars String to be escaped
> * @return returns escaped String.
> */
> private static String escape(String chars)
> {
> StringBuilder builder = new StringBuilder(chars.length());
> for (int i = 0; i < chars.length(); i++)
> {
> appendEscaped(builder, chars.charAt(i));
> }
> return builder.toString();
> }
> private static void appendEscaped(StringBuilder builder, char character)
> {
> // write control and non-ASCII characters as numeric character references (&#NNN;)
> if ((character < 32) || (character > 126))
> {
> int charAsInt = character;
> builder.append("&#").append(charAsInt).append(";");
> }
> else
> {
> switch (character)
> {
> case 34:
> builder.append("&quot;");
> break;
> case 38:
> builder.append("&amp;");
> break;
> case 60:
> builder.append("&lt;");
> break;
> case 62:
> builder.append("&gt;");
> break;
> default:
> builder.append(String.valueOf(character));
> }
> }
> }
> }
> {code}
> {code:xml}
> <document>
> <page width="595.000000" height="842.000000">
> <line>
> <word x="48.000000" y="89.000000" width="59.843376" height="20.234375" font="LucidaGrande" font-size="28" color="#000000">Title</word>
> </line>
> <line>
> <word x="48.000000" y="139.000000" width="32.190125" height="10.562654" font="LucidaGrande" font-size="14" color="#000000">Italic</word>
> </line>
> <line>
> <word x="48.000000" y="175.000000" width="31.480873" height="10.117188" font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
> <word x="84.171875" y="175.000000" width="26.590248" height="10.117188" font="LucidaGrande-Bold" font-size="14" color="#000000">and</word>
> <word x="115.453125" y="175.000000" width="39.458496" height="10.562654" font="LucidaGrande-Bold" font-size="14" color="#000000">Italic.</word>
> </line>
> <line>
> <word x="48.000000" y="211.000000" width="31.480873" height="10.117188" font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
> </line>
> <line>
> <word x="48.000000" y="247.000000" width="92.764618" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">Strikethrough</word>
> </line>
> <line>
> <word x="48.000000" y="283.000000" width="36.523254" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">some</word>
> <word x="89.000000" y="283.000000" width="26.803375" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">text</word>
> </line>
> <line>
> <word x="48.000000" y="319.000000" width="27.180374" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">new</word>
> <word x="79.687500" y="319.000000" width="24.523247" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">line</word>
> <word x="108.687500" y="319.000000" width="25.350250" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">test</word>
> </line>
> </page>
> </document>
> {code}
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@pdfbox.apache.org
For additional commands, e-mail: dev-help@pdfbox.apache.org