You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pdfbox.apache.org by "Tilman Hausherr (JIRA)" <ji...@apache.org> on 2017/07/27 06:37:01 UTC

[jira] [Commented] (PDFBOX-3879) Not able to get font styles, like italic and Strikethrough

    [ https://issues.apache.org/jira/browse/PDFBOX-3879?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16102788#comment-16102788 ] 

Tilman Hausherr commented on PDFBOX-3879:
-----------------------------------------

Italic is either a different font (one that is itself italic) or a skew transformation applied to a regular font. In your file it is a skew transformation (probably this: "1 0 0.3 -1 0 0 Tm", which also includes a flip to undo a previous flip), so I'd say detecting it is very difficult. (Also, this isn't a bug; it is a "how to" question, see https://pdfbox.apache.org/support.html ). You could analyse `TextPosition.getTextMatrix()` to find out whether the text is skewed.

Strikethrough doesn't exist as a font style. In your file, it is a line that happens to go through the text.

> Not able to get font styles, like italic and Strikethrough
> ----------------------------------------------------------
>
>                 Key: PDFBOX-3879
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-3879
>             Project: PDFBox
>          Issue Type: Bug
>          Components: Text extraction
>    Affects Versions: 2.0.7
>            Reporter: sun pengrui
>         Attachments: src.pdf
>
>
> I'm trying to extract text from a PDF file and save it to an XML file.
> The PDF file includes italic and strikethrough text, but I cannot get these styles from the PDFont class.
> Below are the code and the result.
> {code:java}
> public class TextExtractor extends PDFTextStripper {
>     private static final Log LOG = LogFactory.getLog(PDFTextStripper.class);
>     private final HashMap<TextPosition, String> colors = new HashMap<>();
>     public TextExtractor() throws IOException {
>         addOperator(new SetNonStrokingColorSpace());
>         addOperator(new SetNonStrokingDeviceCMYKColor());
>         addOperator(new SetNonStrokingDeviceRGBColor());
>         addOperator(new SetNonStrokingDeviceGrayColor());
>         addOperator(new SetNonStrokingColor());
>         addOperator(new SetNonStrokingColorN());
>     }
>     @Override
>     protected void startDocument(PDDocument document) throws IOException {
>         super.startDocument(document);
>         super.writeString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<document>\n");
>     }
>     @Override
>     protected void endDocument(PDDocument document) throws IOException {
>         super.endDocument(document);
>         super.writeString("</document>\n");
>     }
>     @Override
>     protected void startPage(PDPage page) throws IOException {
>         super.startPage(page);
>         super.writeString(String.format("  <page width=\"%f\" height=\"%f\">\n", page.getBBox().getWidth(), page.getBBox().getHeight()));
>     }
>     @Override
>     protected void endPage(PDPage page) throws IOException {
>         super.endPage(page);
>         super.writeString("  </page>\n");
>     }
>     @Override
>     protected void processTextPosition(TextPosition text) {
>         super.processTextPosition(text);
>         PDColor nonStrokingColor = getGraphicsState().getNonStrokingColor();
>         try {
>             String hex = Integer.toHexString(nonStrokingColor.toRGB() & 0xffffff);
>             while (hex.length() < 6) {
>                 hex = "0" + hex;
>             }
>             colors.put(text, "#" + hex);
>         } catch (IOException e) {
>             e.printStackTrace();
>         }
>     }
>     @Override
>     protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
>         StringBuilder builder = new StringBuilder("    <line>\n");
>         String[] words = string.split(this.getWordSeparator());
>         int startIndex = 0;
>         for (String word : words) {
>             if(Strings.isNullOrEmpty(word)){
>                 continue;
>             }
>             TextPosition startPosition = textPositions.get(startIndex);
>             String color = colors.get(startPosition);
>             String font = startPosition.getFont().getName();
>             float fontSize = startPosition.getFontSize();
>             float x = startPosition.getX();
>             float y = startPosition.getY();
>             TextPosition endPosition = textPositions.get(startIndex + word.length() - 1);
>             float width = endPosition.getEndX() - startPosition.getX();
>             float height = startPosition.getHeight();
>             String template ="      <word x=\"%f\" y=\"%f\" width=\"%f\" height=\"%f\" font=\"%s\" font-size=\"%.0f\" color=\"%s\">%s</word>\n";
>             builder.append(String.format(template, x, y, width, height, font, fontSize, color, escape(word)));
>             startIndex += word.length() + 1;
>         }
>         builder.append("    </line>");
>         super.writeString(builder.toString());
>     }
>     /**
>      * Escape some HTML characters.
>      *
>      * @param chars String to be escaped
>      * @return returns escaped String.
>      */
>     private static String escape(String chars)
>     {
>         StringBuilder builder = new StringBuilder(chars.length());
>         for (int i = 0; i < chars.length(); i++)
>         {
>             appendEscaped(builder, chars.charAt(i));
>         }
>         return builder.toString();
>     }
>     private static void appendEscaped(StringBuilder builder, char character)
>     {
>         // write non-ASCII as named entities
>         if ((character < 32) || (character > 126))
>         {
>             int charAsInt = character;
>             builder.append("&#").append(charAsInt).append(";");
>         }
>         else
>         {
>             switch (character)
>             {
>                 case 34:
>                     builder.append("&quot;");
>                     break;
>                 case 38:
>                     builder.append("&amp;");
>                     break;
>                 case 60:
>                     builder.append("&lt;");
>                     break;
>                 case 62:
>                     builder.append("&gt;");
>                     break;
>                 default:
>                     builder.append(String.valueOf(character));
>             }
>         }
>     }
> }
> {code}
> {code:xml}
> <document>
>   <page width="595.000000" height="842.000000">
>     <line>
>       <word x="48.000000" y="89.000000" width="59.843376" height="20.234375" font="LucidaGrande" font-size="28" color="#000000">Title</word>
>     </line>
>     <line>
>       <word x="48.000000" y="139.000000" width="32.190125" height="10.562654" font="LucidaGrande" font-size="14" color="#000000">Italic</word>
>     </line>
>     <line>
>       <word x="48.000000" y="175.000000" width="31.480873" height="10.117188" font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
>       <word x="84.171875" y="175.000000" width="26.590248" height="10.117188" font="LucidaGrande-Bold" font-size="14" color="#000000">and</word>
>       <word x="115.453125" y="175.000000" width="39.458496" height="10.562654" font="LucidaGrande-Bold" font-size="14" color="#000000">Italic.</word>
>     </line>
>     <line>
>       <word x="48.000000" y="211.000000" width="31.480873" height="10.117188" font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
>     </line>
>     <line>
>       <word x="48.000000" y="247.000000" width="92.764618" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">Strikethrough</word>
>     </line>
>     <line>
>       <word x="48.000000" y="283.000000" width="36.523254" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">some</word>
>       <word x="89.000000" y="283.000000" width="26.803375" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">text</word>
>     </line>
>     <line>
>       <word x="48.000000" y="319.000000" width="27.180374" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">new</word>
>       <word x="79.687500" y="319.000000" width="24.523247" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">line</word>
>       <word x="108.687500" y="319.000000" width="25.350250" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">test</word>
>     </line>
>   </page>
> </document>
> {code}



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@pdfbox.apache.org
For additional commands, e-mail: dev-help@pdfbox.apache.org