You are viewing a plain text version of this content. The canonical link for it is here.
Posted to users@pdfbox.apache.org by jorgeeflorez <jo...@gmail.com> on 2018/11/06 21:41:55 UTC
Re: users Digest 6 Nov 2018 21:30:49 -0000 Issue 1773
Tilman, I am sorry, I did not read your last message before writing mine :(
I am currently in "Digest" mode and as soon as I sent my message, the
digest arrived (odd).
Thank you.
El mar., 6 nov. 2018 a las 16:30, <us...@pdfbox.apache.org>
escribió:
>
> ---------- Forwarded message ----------
> From: Tilman Hausherr <TH...@t-online.de>
> To: users@pdfbox.apache.org
> Cc:
> Bcc:
> Date: Tue, 6 Nov 2018 17:46:07 +0100
> Subject: Re: Extracting page "correctly"
> Here's some code for you to try. It should work with any fine rotation;
> I have tested it on 2 files, LOL. Please tell me if it works for you.
>
>
> /**
>  * Example program that extracts the text of a PDF page by page and, within
>  * each page, separately for every text rotation angle found on that page.
>  *
>  * For each page it first collects the angles with an AngleCollector. Text at
>  * angle 0 is extracted directly. For every other angle a counter-rotation is
>  * temporarily prepended to the page content so that a FilteredTextStripper,
>  * which only accepts glyphs at angle 0, picks up exactly that text.
>  *
>  * @author Tilman Hausherr
>  */
> public class ExtractAngledText
> {
>     // Not used by main(); kept so that existing callers of getAngles() keep working.
>     Set<Integer> angles = new HashSet<>();
>
>     /**
>      * Returns the angle set held by this instance.
>      *
>      * @return the angle set; main() never adds to it.
>      */
>     public Set<Integer> getAngles()
>     {
>         return angles;
>     }
>
>     /**
>      * This will print the text of the document, grouped by page and by text angle.
>      *
>      * @param args The command line arguments; exactly one is expected, the
>      * path of the input PDF.
>      *
>      * @throws IOException If there is an error parsing the document.
>      */
>     public static void main(String[] args) throws IOException
>     {
>         if (args.length != 1)
>         {
>             usage();
>             return;
>         }
>
>         try (PDDocument doc = PDDocument.load(new File(args[0])))
>         {
>             for (int p = 1; p <= doc.getNumberOfPages(); ++p)
>             {
>                 System.out.printf("Page: %3d\n", p);
>                 System.out.println("----------");
>
>                 // A fresh collector per page, so that angles found on earlier
>                 // pages do not show up in this page's result.
>                 AngleCollector angleCollector = new AngleCollector();
>                 angleCollector.setStartPage(p);
>                 angleCollector.setEndPage(p);
>                 angleCollector.getText(doc);
>                 System.out.println("Collected angles: " +
>                         angleCollector.getAngles());
>                 System.out.println();
>
>                 // Stripper page numbers are 1-based here, getPage() takes a
>                 // 0-based index: fetch the page that is actually being extracted.
>                 PDPage page = doc.getPage(p - 1);
>
>                 FilteredTextStripper filteredTextStripper =
>                         new FilteredTextStripper();
>                 filteredTextStripper.setStartPage(p);
>                 filteredTextStripper.setEndPage(p);
>
>                 for (int angle : angleCollector.getAngles())
>                 {
>                     System.out.printf("Angle: %3d\n", angle);
>                     System.out.println("----------");
>                     System.out.println(
>                             extractTextAtAngle(doc, page, filteredTextStripper, angle));
>                 }
>             }
>         }
>     }
>
>     /**
>      * Extracts the text that is written at the given angle on the given page.
>      *
>      * @param doc The document that contains the page.
>      * @param page The page the stripper is currently restricted to.
>      * @param stripper The stripper, already limited to that page.
>      * @param angle The text angle in degrees.
>      * @return the text the stripper returned for that angle.
>      *
>      * @throws IOException If the content stream cannot be written or the text
>      * cannot be extracted.
>      */
>     private static String extractTextAtAngle(PDDocument doc, PDPage page,
>             FilteredTextStripper stripper, int angle) throws IOException
>     {
>         if (angle == 0)
>         {
>             return stripper.getText(doc);
>         }
>
>         // prepend a transformation that rotates the page content by -angle
>         try (PDPageContentStream cs =
>                 new PDPageContentStream(doc, page, AppendMode.PREPEND, false))
>         {
>             cs.transform(Matrix.getRotateInstance(-Math.toRadians(angle), 0, 0));
>         }
>
>         String text = stripper.getText(doc);
>
>         // remove transformation: drop the first entry of the contents array,
>         // i.e. the stream that was prepended above
>         COSArray contents =
>                 (COSArray) page.getCOSObject().getItem(COSName.CONTENTS);
>         contents.remove(0);
>
>         return text;
>     }
>
>     /**
>      * This will print the usage for this program.
>      */
>     private static void usage()
>     {
>         System.err.println("Usage: java " +
>                 ExtractAngledText.class.getName() + " <input-pdf>");
>     }
> }
>
> /**
>  * Text stripper that records the angle of every TextPosition it is handed.
>  * The positions are not forwarded to the superclass implementation; only
>  * their angles, rounded to whole degrees, are kept.
>  */
> class AngleCollector extends PDFTextStripper
> {
>     Set<Integer> angles = new HashSet<>();
>
>     /**
>      * Creates a collector whose angle set starts out empty.
>      *
>      * @throws IOException If there is an error loading the properties.
>      */
>     public AngleCollector() throws IOException
>     {
>     }
>
>     /**
>      * Returns the angles recorded so far.
>      *
>      * @return the live set of angles, in whole degrees.
>      */
>     public Set<Integer> getAngles()
>     {
>         return angles;
>     }
>
>     /**
>      * Computes the angle of the given text position from the shear-Y and
>      * scale-Y components of its text matrix and adds it to the set.
>      *
>      * @param text The text position to inspect.
>      */
>     @Override
>     protected void processTextPosition(TextPosition text)
>     {
>         Matrix textMatrix = text.getTextMatrix();
>         double radians =
>                 Math.atan2(textMatrix.getShearY(), textMatrix.getScaleY());
>         long degrees = Math.round(Math.toDegrees(radians));
>         angles.add((int) degrees);
>     }
> }
>
> /**
>  * Text stripper that only passes on text positions whose angle, rounded to
>  * whole degrees, is 0. Everything else is dropped.
>  */
> class FilteredTextStripper extends PDFTextStripper
> {
>     /**
>      * Creates the stripper.
>      *
>      * @throws IOException Declared by the constructor; nothing in this class
>      * throws it directly.
>      */
>     public FilteredTextStripper() throws IOException
>     {
>     }
>
>     /**
>      * Forwards the text position to the superclass only when its angle,
>      * computed from the shear-Y and scale-Y components of the text matrix,
>      * rounds to 0 degrees.
>      *
>      * @param text The text position to check.
>      */
>     @Override
>     protected void processTextPosition(TextPosition text)
>     {
>         Matrix textMatrix = text.getTextMatrix();
>         double radians =
>                 Math.atan2(textMatrix.getShearY(), textMatrix.getScaleY());
>         int degrees = (int) Math.round(Math.toDegrees(radians));
>         if (degrees != 0)
>         {
>             // rotated text: skip it
>             return;
>         }
>         super.processTextPosition(text);
>     }
> }
>
>
>