You are viewing a plain text version of this content. The canonical link for it is here.
Posted to users@pdfbox.apache.org by Nicholas Poon <ni...@yahoo.com.INVALID> on 2014/08/17 07:29:53 UTC

Highlighted text annotation -- extract color and show in English name

Hi,
I am trying to extract highlighted text with different colors from a 
pdf file. I can call "getColour()" on the PDAnnotation class, but the 
problem is that it returns a PDGamma object. How can I convert that 
PDGamma object into a human-readable English name, such as "Yellow", "Red", 
"Blue", etc.?

Below is the sample code I did for the text extraction with color returned. 
Any great hints or simple codes would be appreciated.

Cheers,
Nick
====================================================================
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.util.PDFTextStripperByArea;
public class ExtractHighlights {
public static void main(String args[]) {
    try {
        PDDocument pddDocument = PDDocument.load(new File("sample.pdf"));
        List allPages =
 pddDocument.getDocumentCatalog().getAllPages();
        for (int i = 0; i < allPages.size(); i++) {
            int pageNum = i + 1;
            PDPage page = (PDPage) allPages.get(i);
            List<PDAnnotation> la = page.getAnnotations();
            if (la.size() < 1) {
                continue;
            }
            PDAnnotation pdfAnnot = la.get(0);
       
     System.out.println("Color = " + pdfAnnot.getColour());
            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            stripper.setSortByPosition(true);

            PDRectangle rect = pdfAnnot.getRectangle();
            float x = rect.getLowerLeftX() - 1;
            float y = rect.getUpperRightY() - 1;
            float width = rect.getWidth() + 2;
            float height = rect.getHeight() + rect.getHeight() / 4;
            int rotation = page.findRotation();
            if (rotation == 0) {
                PDRectangle pageSize = page.findMediaBox();
                y = pageSize.getHeight() - y;
            }
            Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
            stripper.addRegion(Integer.toString(0), awtRect);
            stripper.extractRegions(page);
            System.out.println("Getting text from region = " + awtRect + "\n");
            System.out.println(stripper.getTextForRegion(Integer.toString(0)));
            System.out.println("Getting text from comment = " + pdfAnnot.getContents());
        }
        pddDocument.close();
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
}

Re: Highlighted text annotation -- extract color and show in English name

Posted by Tilman Hausherr <TH...@t-online.de>.
I looked at the source code: you can call getR(), getG(), and getB() to get 
the color components from PDGamma. Here are some hints about what yellow is, 
i.e. there is no such thing as a single yellow.
http://www.rapidtables.com/web/color/Yellow_Color.htm#code

Tilman

Am 17.08.2014 07:29, schrieb Nicholas Poon:
> Hi,
> I am trying to extract highlighted text with different colors inside the
> pdf file. I can use "getColour()" by calling PDAnnotation class but the
> problem is PDGamma objects are returned and how can I convert that
> PDGamma string into human readable English names, such as "Yellow", "Red",
> "Blue" ... etc.?
>
> Below is the sample code I did for the text extraction with color returned.
> Any great hints or simple codes would be appreciated.
>
> Cheers,
> Nick
> ====================================================================import java.awt.geom.Rectangle2D;
> import java.io.File;
> import java.util.List;
> import org.apache.pdfbox.pdmodel.PDDocument;
> import org.apache.pdfbox.pdmodel.PDPage;
> import org.apache.pdfbox.pdmodel.common.PDRectangle;
> import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
> import org.apache.pdfbox.util.PDFTextStripperByArea;
> public class ExtractHighlights {
> public static void main(String args[]) {
>      try {
>          PDDocument pddDocument = PDDocument.load(new File("sample.pdf"));
>          List allPages =
>   pddDocument.getDocumentCatalog().getAllPages();
>          for (int i = 0; i < allPages.size(); i++) {
>              int pageNum = i + 1;
>              PDPage page = (PDPage) allPages.get(i);
>              List<PDAnnotation> la = page.getAnnotations();
>              if (la.size() < 1) {
>                  continue;
>              }
>              PDAnnotation pdfAnnot = la.get(0);
>         
>       System.out.println("Color = " + pdfAnnot.getColour());
>              PDFTextStripperByArea stripper = new PDFTextStripperByArea();
>              stripper.setSortByPosition(true);
>
>              PDRectangle rect = pdfAnnot.getRectangle();
>              float x = rect.getLowerLeftX() - 1;
>              float y = rect.getUpperRightY() - 1;
>              float width = rect.getWidth() + 2;
>              float height = rect.getHeight() + rect.getHeight() / 4;
>              int rotation = page.findRotation();
>              if (rotation == 0) {
>                  PDRectangle pageSize = page.findMediaBox();
>                  y = pageSize.getHeight() - y;
>              }
>              Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
>              stripper.addRegion(Integer.toString(0), awtRect);
>              stripper.extractRegions(page);
>              System.out.println("Getting text from region = " + awtRect + "\n");
>              System.out.println(stripper.getTextForRegion(Integer.toString(0)));
>              System.out.println("Getting text from comment = " + pdfAnnot.getContents());
>          }
>          pddDocument.close();
>      } catch (Exception ex) {
>          ex.printStackTrace();
>      }
> }
> }