You are viewing a plain text version of this content. The canonical link for it is here.
Posted to users@pdfbox.apache.org by Nicholas Poon <ni...@yahoo.com.INVALID> on 2014/08/17 07:29:53 UTC
Highlighted text annotation -- extract color and show in English name
Hi,
I am trying to extract text that is highlighted in different colors from a
PDF file. I can call "getColour()" on the PDAnnotation class, but the
problem is that it returns a PDGamma object. How can I convert that
PDGamma value into a human-readable English name, such as "Yellow", "Red",
"Blue", etc.?
Below is the sample code I wrote for the text extraction; it also prints the returned color.
Any hints or simple code samples would be appreciated.
Cheers,
Nick
====================================================================
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.util.PDFTextStripperByArea;
/**
 * Sample program that walks the pages of {@code sample.pdf} and, for the
 * first annotation found on each page, prints the annotation colour, the
 * text located in a region derived from the annotation rectangle, and the
 * annotation's contents.
 */
public class ExtractHighlights {
    /**
     * Program entry point. Pages without annotations are skipped. Any
     * exception is printed to standard error; the document is closed in all
     * cases.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String args[]) {
        PDDocument pddDocument = null;
        try {
            pddDocument = PDDocument.load(new File("sample.pdf"));
            List allPages =
                    pddDocument.getDocumentCatalog().getAllPages();
            for (int i = 0; i < allPages.size(); i++) {
                PDPage page = (PDPage) allPages.get(i);
                List<PDAnnotation> la = page.getAnnotations();
                if (la.size() < 1) {
                    continue;
                }
                // Only the first annotation on the page is examined.
                PDAnnotation pdfAnnot = la.get(0);
                System.out.println("Color = " + pdfAnnot.getColour());
                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                stripper.setSortByPosition(true);
                // Build the extraction region from the annotation rectangle:
                // shifted one unit left, two units wider, and a quarter taller.
                PDRectangle rect = pdfAnnot.getRectangle();
                float x = rect.getLowerLeftX() - 1;
                float y = rect.getUpperRightY() - 1;
                float width = rect.getWidth() + 2;
                float height = rect.getHeight() + rect.getHeight() / 4;
                int rotation = page.findRotation();
                if (rotation == 0) {
                    // For unrotated pages, measure y from the opposite edge
                    // of the media box before building the AWT rectangle.
                    PDRectangle pageSize = page.findMediaBox();
                    y = pageSize.getHeight() - y;
                }
                Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
                stripper.addRegion(Integer.toString(0), awtRect);
                stripper.extractRegions(page);
                System.out.println("Getting text from region = " + awtRect + "\n");
                System.out.println(stripper.getTextForRegion(Integer.toString(0)));
                System.out.println("Getting text from comment = " + pdfAnnot.getContents());
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            // Close the document even when processing failed, so the
            // underlying resources are not leaked.
            if (pddDocument != null) {
                try {
                    pddDocument.close();
                } catch (java.io.IOException closeEx) {
                    closeEx.printStackTrace();
                }
            }
        }
    }
}
Re: Highlighted text annotation -- extract color and show in English
name
Posted by Tilman Hausherr <TH...@t-online.de>.
I looked at the source code: you can call getR(), getG() and getB() to get
the color components from PDGamma. Here are some hints about what yellow is,
i.e. there is no such thing as one single yellow.
http://www.rapidtables.com/web/color/Yellow_Color.htm#code
Tilman
Am 17.08.2014 07:29, schrieb Nicholas Poon:
> Hi,
> I am trying to extract highlighted text with different colors inside the
> pdf file. I can use "getColour()" by calling PDAnnotation class but the
> problem is PDGamma objects are returned and how can I convert that
> PDGamma string into human readable English names, such as "Yellow", "Red",
> "Blue" ... etc.?
>
> Below is the sample code I did for the text extraction with color returned.
> Any great hints or simple codes would be appreciated.
>
> Cheers,
> Nick
> ====================================================================
> import java.awt.geom.Rectangle2D;
> import java.io.File;
> import java.util.List;
> import org.apache.pdfbox.pdmodel.PDDocument;
> import org.apache.pdfbox.pdmodel.PDPage;
> import org.apache.pdfbox.pdmodel.common.PDRectangle;
> import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
> import org.apache.pdfbox.util.PDFTextStripperByArea;
> public class ExtractHighlights {
> public static void main(String args[]) {
> try {
> PDDocument pddDocument = PDDocument.load(new File("sample.pdf"));
> List allPages =
> pddDocument.getDocumentCatalog().getAllPages();
> for (int i = 0; i < allPages.size(); i++) {
> int pageNum = i + 1;
> PDPage page = (PDPage) allPages.get(i);
> List<PDAnnotation> la = page.getAnnotations();
> if (la.size() < 1) {
> continue;
> }
> PDAnnotation pdfAnnot = la.get(0);
>
> System.out.println("Color = " + pdfAnnot.getColour());
> PDFTextStripperByArea stripper = new PDFTextStripperByArea();
> stripper.setSortByPosition(true);
>
> PDRectangle rect = pdfAnnot.getRectangle();
> float x = rect.getLowerLeftX() - 1;
> float y = rect.getUpperRightY() - 1;
> float width = rect.getWidth() + 2;
> float height = rect.getHeight() + rect.getHeight() / 4;
> int rotation = page.findRotation();
> if (rotation == 0) {
> PDRectangle pageSize = page.findMediaBox();
> y = pageSize.getHeight() - y;
> }
> Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
> stripper.addRegion(Integer.toString(0), awtRect);
> stripper.extractRegions(page);
> System.out.println("Getting text from region = " + awtRect + "\n");
> System.out.println(stripper.getTextForRegion(Integer.toString(0)));
> System.out.println("Getting text from comment = " + pdfAnnot.getContents());
> }
> pddDocument.close();
> } catch (Exception ex) {
> ex.printStackTrace();
> }
> }
> }