You are viewing a plain text version of this content. The canonical link for it is here.
Posted to users@pdfbox.apache.org by jorgeeflorez <jo...@gmail.com> on 2018/11/06 21:41:55 UTC
Re: users Digest 6 Nov 2018 21:30:49 -0000 Issue 1773

Tilman, I am sorry, I did not read your last message before writing mine :(
I am currently in "Digest" mode and as soon as I sent my message, the
digest arrived (odd).

Thank you.

El mar., 6 nov. 2018 a las 16:30, <us...@pdfbox.apache.org>
escribió:

>
> ---------- Forwarded message ----------
> From: Tilman Hausherr <TH...@t-online.de>
> To: users@pdfbox.apache.org
> Cc:
> Bcc:
> Date: Tue, 6 Nov 2018 17:46:07 +0100
> Subject: Re: Extracting page "correctly"
> Here's some code for you to try. It should work with any fine rotation,
> I have tested it on 2 files LOL. Please tell if it works for you.
>
>
> /**
>   *
>   * @author Tilman Hausherr
>   */
> public class ExtractAngledText
> {
>      Set<Integer> angles = new HashSet<>();
>
>      public Set<Integer> getAngles()
>      {
>          return angles;
>      }
>
>      /**
>       * This will print the documents data.
>       *
>       * @param args The command line arguments.
>       *
>       * @throws IOException If there is an error parsing the document.
>       */
>      public static void main(String[] args) throws IOException
>      {
>          args = new String[]{"PDFBOX-4368-many-rotations.pdf"};
>          if (args.length != 1)
>          {
>              usage();
>          }
>          else
>          {
>              AngleCollector angleCollector = new AngleCollector();
>              try (PDDocument doc = PDDocument.load(new File(args[0])))
>              {
>                  for (int p = 1; p <= doc.getNumberOfPages(); ++p)
>                  {
>                      System.out.printf("Page: %3d\n", p);
>                      System.out.println("----------");
>
>                      angleCollector.setStartPage(p);
>                      angleCollector.setEndPage(p);
>                      angleCollector.getText(doc);
>                      System.out.println("Collected angles: " +
> angleCollector.getAngles());
>                      System.out.println();
>
>                      PDPage page = doc.getPage(0);
>                      FilteredTextStripper filteredTextStripper = new
> FilteredTextStripper();
>                      for (int angle : angleCollector.getAngles())
>                      {
>                          filteredTextStripper.setStartPage(p);
>                          filteredTextStripper.setEndPage(p);
>
>                          System.out.printf("Angle: %3d\n", angle);
>                          System.out.println("----------");
>                          String text;
>                          if (angle == 0)
>                          {
>                              text = filteredTextStripper.getText(doc);
>                          }
>                          else
>                          {
>                              // prepend a transformation
>                              try (PDPageContentStream cs = new
> PDPageContentStream(doc, page, AppendMode.PREPEND, false))
>                              {
> cs.transform(Matrix.getRotateInstance(-Math.toRadians(angle), 0, 0));
>                              }
>
>                              text = filteredTextStripper.getText(doc);
>
>                              // remove transformation
>                              COSArray contents = (COSArray)
> page.getCOSObject().getItem(COSName.CONTENTS);
>                              contents.remove(0);
>                          }
>                          System.out.println(text);
>                      }
>                  }
>              }
>          }
>      }
>
>      /**
>       * This will print the usage for this document.
>       */
>      private static void usage()
>      {
>          System.err.println("Usage: java " +
> AngleCollector.class.getName() + " <input-pdf>");
>      }
> }
>
> /**
>   * Text stripper that records the angle of every text position it is
>   * given instead of passing the position on to the superclass.
>   */
> class AngleCollector extends PDFTextStripper
> {
>      // angles in whole degrees, filled by processTextPosition()
>      Set<Integer> angles = new HashSet<>();
>
>      /**
>       * Creates a collector whose angle set is empty.
>       *
>       * @throws IOException If the superclass constructor fails.
>       */
>      public AngleCollector() throws IOException
>      {
>          super();
>      }
>
>      /**
>       * Returns the angles recorded so far.
>       *
>       * @return the internal set itself, not a copy.
>       */
>      public Set<Integer> getAngles()
>      {
>          return angles;
>      }
>
>      /**
>       * Derives the angle of the text position from its text matrix,
>       * rounds it to whole degrees and adds it to the angle set. The
>       * superclass implementation is not invoked.
>       *
>       * @param text The text position to inspect.
>       */
>      @Override
>      protected void processTextPosition(TextPosition text)
>      {
>          final Matrix textMatrix = text.getTextMatrix();
>          final double radians = Math.atan2(
>                  textMatrix.getShearY(), textMatrix.getScaleY());
>          final long degrees = Math.round(Math.toDegrees(radians));
>          angles.add((int) degrees);
>      }
> }
>
> /**
>   * Text stripper that only hands text positions to the superclass when
>   * their angle, rounded to whole degrees, is 0.
>   */
> class FilteredTextStripper extends PDFTextStripper
> {
>      /**
>       * Creates the stripper.
>       *
>       * @throws IOException If the superclass constructor fails.
>       */
>      public FilteredTextStripper() throws IOException
>      {
>          super();
>      }
>
>      /**
>       * Passes the text position on to the superclass if its text matrix
>       * yields an angle of 0 degrees after rounding; any other position
>       * is dropped.
>       *
>       * @param text The text position to check.
>       */
>      @Override
>      protected void processTextPosition(TextPosition text)
>      {
>          final Matrix textMatrix = text.getTextMatrix();
>          final double degrees = Math.toDegrees(Math.atan2(
>                  textMatrix.getShearY(), textMatrix.getScaleY()));
>          if ((int) Math.round(degrees) != 0)
>          {
>              // rotated text, not wanted in this pass
>              return;
>          }
>          super.processTextPosition(text);
>      }
> }
>
>
>