You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pdfbox.apache.org by "Tilman Hausherr (Jira)" <ji...@apache.org> on 2020/04/21 04:50:00 UTC
[jira] [Issue Comment Deleted] (PDFBOX-4818) Is it possible to render a pdf to multi pic with PdfRenderer multi threads?

     [ https://issues.apache.org/jira/browse/PDFBOX-4818?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Tilman Hausherr updated PDFBOX-4818:
------------------------------------
    Comment: was deleted

(was: I don't see where you are closing the "doc" PDDocument object in "render()".)

> Is it possible to render a pdf to multi pic with PdfRenderer multi threads?
> ---------------------------------------------------------------------------
>
>                 Key: PDFBOX-4818
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-4818
>             Project: PDFBox
>          Issue Type: Improvement
>          Components: Rendering
>    Affects Versions: 2.0.19
>            Reporter: jiangpeiheng
>            Priority: Major
>
> Hi, pdfbox developers.
> I'm using PDFBox to render my PDF file to multiple JPG images. I've read the FAQ and found that PDDocument is not thread-safe, which means that operating on the same PDDocument from multiple threads is risky.
> My current approach is to load multiple PDDocument instances, each rendering a different range of pages, to speed up the rendering process. However, this costs a lot of memory, and with a large PDF (400+ pages) it may cause an out-of-memory error (OOM).
> So I'm wondering, what is the best way to use pdfbox to render a pdf?
> Here is my code for rendering:
> {code:java}
> package com.bytedance.esign.utils.pdf;
> import com.bytedance.esign.constants.enums.ResponseCode;
> import com.bytedance.esign.exception.EsignException;
> import com.bytedance.esign.threadpool.ThreadPoolManager;
> import com.bytedance.esign.utils.ContractLoadingRecorder;
> import com.google.common.collect.Lists;
> import lombok.AllArgsConstructor;
> import lombok.Data;
> import lombok.extern.slf4j.Slf4j;
> import org.apache.commons.io.IOUtils;
> import org.apache.pdfbox.io.MemoryUsageSetting;
> import org.apache.pdfbox.pdmodel.PDDocument;
> import org.apache.pdfbox.rendering.PDFRenderer;
> import javax.imageio.ImageIO;
> import java.awt.image.BufferedImage;
> import java.io.ByteArrayInputStream;
> import java.io.ByteArrayOutputStream;
> import java.io.IOException;
> import java.io.InputStream;
> import java.util.ArrayList;
> import java.util.Collections;
> import java.util.List;
> import java.util.concurrent.CompletableFuture;
> import java.util.function.Supplier;
> /**
>  * pdfbox 工具
>  *
>  * @author jiangpeiheng create on 2020/1/15
>  */
> @Slf4j
> public class PdfBoxUtil {
>     private static final String LOG_PERFIX = "PDF_BOX_UTIL";
>     private static final int MAX_PAGE_FOR_SINGLE_RENDER_TASK = 15;
>     static {
>         System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true");
>     }
>     /**
>      * pdf转图片
>      *
>      * @param is
>      * @param contractId
>      * @return
>      */
>     public static List<byte[]> pdfToJpg(InputStream is, String contractId) {
>         long start = System.currentTimeMillis();
>         try {
>             byte[] docBytes = IOUtils.toByteArray(is);
>             PDDocument doc = load(docBytes);    // 只为获取页数
>             int pageCount = doc.getPages().getCount();
>             close(doc);
>             doc = null; // 方便gc
>             // 向redis上报总页数
>             ContractLoadingRecorder.setTotalPage(contractId, pageCount);
>             // 拆任务
>             List<CompletableFuture<List<byte[]>>> futures = splitTask(docBytes, pageCount, contractId);
>             List<byte[]> images = Lists.newArrayList();
>             futures.forEach(future -> images.addAll(future.join()));
>             // 判断最终切割的页数
>             if (pageCount != images.size()) {
>                 log.error("[{}]PDF渲染图片，最终获取到的图片页数与预期页数不符，expect:{}, actual:{}",
>                         LOG_PERFIX, pageCount, images.size());
>                 throw new EsignException(ResponseCode.SYSTEM_ERROR.getCode(), "PDF切图，最终图片页数与预计页数不符！");
>             }
>             log.info("[{}]PDF渲染图片整体流程成功", LOG_PERFIX);
>             return images;
>         } catch (Exception e) {
>             log.error("[{}]PDF渲染图片整体流程异常, contractId:{}, e:",
>                     LOG_PERFIX, contractId, e);
>             return Collections.emptyList();
>         } finally {
>             log.info("[{}]PDF渲染图片整体流程完成, contractId:{}, cost:{}",
>                     LOG_PERFIX, contractId, System.currentTimeMillis() - start);
>         }
>     }
>     private static List<CompletableFuture<List<byte[]>>> splitTask(byte[] pdfBytes, int pages, String contractId) {
>         List<CompletableFuture<List<byte[]>>> futures = Lists.newArrayList();
>         int start = 0;
>         int end = Math.min(start + MAX_PAGE_FOR_SINGLE_RENDER_TASK, pages);
>         while (start != end && end <= pages) {
>             RenderTask task = new RenderTask(pdfBytes, start, end, contractId);
>             futures.add(CompletableFuture.supplyAsync(task, ThreadPoolManager.PDF_RENDER_EXECUTOR));
>             start = end;
>             end = Math.min(start + MAX_PAGE_FOR_SINGLE_RENDER_TASK, pages);
>         }
>         return futures;
>     }
>     @Data
>     @AllArgsConstructor
>     private static class RenderTask implements Supplier<List<byte[]>> {
>         private byte[] pdfBytes;
>         private int start;
>         private int end;
>         private String contractId;
>         @Override
>         public List<byte[]> get() {
>             return render(pdfBytes, start, end, contractId);
>         }
>     }
>     private static List<byte[]> render(byte[] pdfBytes, int start, int end, String contractId) {
>         long startTime = System.currentTimeMillis();
>         try (
>                 PDDocument doc = load(pdfBytes)
>         ) {
>             log.info("[{}]载入并发线程的PDDocument耗时:{}",
>                     LOG_PERFIX, System.currentTimeMillis() - startTime);
>             PDFRenderer renderer = new PDFRenderer(doc);
>             List<byte[]> images = new ArrayList<>();
>             for (int i = start; i < end; i++) {
>                 BufferedImage bim = renderer.renderImageWithDPI(i, 200);
>                 images.add(transformImage(bim));
>                 long subStart = System.currentTimeMillis();
>                 ContractLoadingRecorder.incrRenderedPage(contractId);
>                 log.info("[{}]上报新增渲染页数耗时:{}",
>                         LOG_PERFIX, System.currentTimeMillis() - subStart);
>             }
>             log.info("[{}]单task渲染PDF成功, start:{}, end:{}, pages:{}",
>                     LOG_PERFIX, start, end, images.size());
>             return images;
>         } catch (Exception e) {
>             log.error("[{}]单task渲染PDF异常, start:{}, end:{}, e:",
>                     LOG_PERFIX, start, end, e);
>             return Collections.emptyList();
>         } finally {
>             log.info("[{}]单task渲染PDF完成, start:{}, end:{}, cost:{}",
>                     LOG_PERFIX, start, end, System.currentTimeMillis() - startTime);
>         }
>     }
>     private static byte[] transformImage(BufferedImage bim) throws IOException {
>         ByteArrayOutputStream os = new ByteArrayOutputStream();
>         ImageIO.write(bim, "jpg", os);
>         return os.toByteArray();
>     }
>     private static PDDocument load(byte[] docBytes) throws IOException {
>         return PDDocument.load(new ByteArrayInputStream(docBytes),
>                 MemoryUsageSetting.setupTempFileOnly());
>     }
>     private static void close(PDDocument doc) throws IOException {
>         if (doc == null) {
>             return;
>         }
>         doc.close();
>     }
> }
> {code}
> Thanks
> Jiangpeiheng



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@pdfbox.apache.org
For additional commands, e-mail: dev-help@pdfbox.apache.org