You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pdfbox.apache.org by "jiangpeiheng (Jira)" <ji...@apache.org> on 2020/04/21 03:56:00 UTC

[jira] [Created] (PDFBOX-4818) Is it possible to render a pdf to multi pic with PdfRenderer multi threads?

jiangpeiheng created PDFBOX-4818:
------------------------------------

             Summary: Is it possible to render a pdf to multi pic with PdfRenderer multi threads?
                 Key: PDFBOX-4818
                 URL: https://issues.apache.org/jira/browse/PDFBOX-4818
             Project: PDFBox
          Issue Type: Improvement
          Components: Rendering
    Affects Versions: 2.0.19
            Reporter: jiangpeiheng


Hi, pdfbox developers.

I'm using PDFBox to render my PDF file to multiple JPG images. I've read the FAQ and found that PDDocument is not thread-safe, which means that operating on the same PDDocument from multiple threads is risky.

My current approach is to create multiple PDDocument instances, each rendering a different part of the PDF, to speed up the rendering process. However, this costs a lot of memory, and with some big PDFs (400+ pages) it may cause an OOM.

So I'm wondering, what is the best way to use pdfbox to render a pdf?

Here is my code for rendering:
{code:java}
package com.bytedance.esign.utils.pdf;

import com.bytedance.esign.constants.enums.ResponseCode;
import com.bytedance.esign.exception.EsignException;
import com.bytedance.esign.threadpool.ThreadPoolManager;
import com.bytedance.esign.utils.ContractLoadingRecorder;
import com.google.common.collect.Lists;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.function.Supplier;

/**
 * PDFBox helper that renders a PDF document into one JPEG image per page.
 *
 * <p>The document bytes are split into page ranges of at most
 * {@link #MAX_PAGE_FOR_SINGLE_RENDER_TASK} pages. Each range is rendered by its own task,
 * and every task loads its own {@link PDDocument} from the shared byte array, so no
 * {@code PDDocument} instance is shared between tasks.
 *
 * @author jiangpeiheng create on 2020/1/15
 */
@Slf4j
public class PdfBoxUtil {

    private static final String LOG_PREFIX = "PDF_BOX_UTIL";
    /** Upper bound on the number of pages handled by a single render task. */
    private static final int MAX_PAGE_FOR_SINGLE_RENDER_TASK = 15;
    /** Resolution handed to the renderer for every page. */
    private static final int RENDER_DPI = 200;
    /** ImageIO format name used when encoding rendered pages. */
    private static final String IMAGE_FORMAT = "jpg";

    static {
        // Set once when the class is initialised, before any page is rendered by this class.
        System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true");
    }

    /**
     * Converts a PDF into JPEG images, one per page.
     *
     * <p>The stream is read fully into memory but is not closed by this method.
     *
     * @param is         stream supplying the PDF bytes
     * @param contractId identifier used when reporting the total and rendered page counts
     * @return the encoded pages in page order, or an empty list if anything fails (including
     *         the case where the number of produced images differs from the page count)
     */
    public static List<byte[]> pdfToJpg(InputStream is, String contractId) {
        long start = System.currentTimeMillis();
        try {
            byte[] docBytes = IOUtils.toByteArray(is);
            int pageCount = countPages(docBytes);
            // Report the total page count to Redis.
            ContractLoadingRecorder.setTotalPage(contractId, pageCount);
            // Split the work into per-range tasks.
            List<CompletableFuture<List<byte[]>>> futures = splitTask(docBytes, pageCount, contractId);
            List<byte[]> images = new ArrayList<>(Math.max(pageCount, 0));
            // Joining in submission order keeps the images in page order.
            for (CompletableFuture<List<byte[]>> future : futures) {
                images.addAll(future.join());
            }
            // A failed task contributes an empty list, so verify the final page count.
            if (pageCount != images.size()) {
                log.error("[{}]PDF渲染图片,最终获取到的图片页数与预期页数不符,expect:{}, actual:{}",
                        LOG_PREFIX, pageCount, images.size());
                throw new EsignException(ResponseCode.SYSTEM_ERROR.getCode(), "PDF切图,最终图片页数与预计页数不符!");
            }
            log.info("[{}]PDF渲染图片整体流程成功", LOG_PREFIX);
            return images;
        } catch (Exception e) {
            log.error("[{}]PDF渲染图片整体流程异常, contractId:{}, e:",
                    LOG_PREFIX, contractId, e);
            return Collections.emptyList();
        } finally {
            log.info("[{}]PDF渲染图片整体流程完成, contractId:{}, cost:{}",
                    LOG_PREFIX, contractId, System.currentTimeMillis() - start);
        }
    }

    /**
     * Loads the document only to read its page count and closes it again, even when
     * reading the count fails.
     *
     * @param docBytes the complete PDF
     * @return the number of pages reported by the document
     * @throws IOException if the document cannot be loaded or closed
     */
    private static int countPages(byte[] docBytes) throws IOException {
        try (PDDocument doc = load(docBytes)) {
            return doc.getPages().getCount();
        }
    }

    /**
     * Submits one asynchronous render task per page range.
     *
     * @param pdfBytes   the complete PDF, shared read-only by all tasks
     * @param pages      total number of pages; no task is created when it is not positive
     * @param contractId identifier passed through to each task for progress reporting
     * @return the futures in ascending page-range order
     */
    private static List<CompletableFuture<List<byte[]>>> splitTask(byte[] pdfBytes, int pages, String contractId) {
        List<CompletableFuture<List<byte[]>>> futures = new ArrayList<>();
        for (int from = 0; from < pages; from += MAX_PAGE_FOR_SINGLE_RENDER_TASK) {
            // The last range may be shorter than the maximum.
            int to = Math.min(from + MAX_PAGE_FOR_SINGLE_RENDER_TASK, pages);
            RenderTask task = new RenderTask(pdfBytes, from, to, contractId);
            futures.add(CompletableFuture.supplyAsync(task, ThreadPoolManager.PDF_RENDER_EXECUTOR));
        }
        return futures;
    }

    /** Render job for the half-open page range {@code [start, end)}. */
    @Data
    @AllArgsConstructor
    private static class RenderTask implements Supplier<List<byte[]>> {

        private final byte[] pdfBytes;
        private final int start;
        private final int end;
        private final String contractId;

        @Override
        public List<byte[]> get() {
            return render(pdfBytes, start, end, contractId);
        }
    }

    /**
     * Renders the pages {@code [start, end)} with a freshly loaded document.
     *
     * @param pdfBytes   the complete PDF
     * @param start      index of the first page to render (inclusive, zero-based)
     * @param end        index after the last page to render (exclusive)
     * @param contractId identifier used to report each rendered page
     * @return the encoded pages of the range, or an empty list if any page fails
     */
    private static List<byte[]> render(byte[] pdfBytes, int start, int end, String contractId) {
        long startTime = System.currentTimeMillis();
        try (PDDocument doc = load(pdfBytes)) {
            log.info("[{}]载入并发线程的PDDocument耗时:{}",
                    LOG_PREFIX, System.currentTimeMillis() - startTime);
            PDFRenderer renderer = new PDFRenderer(doc);
            List<byte[]> images = new ArrayList<>();
            for (int i = start; i < end; i++) {
                BufferedImage bim = renderer.renderImageWithDPI(i, RENDER_DPI);
                images.add(transformImage(bim));
                // Only the progress report is timed here, not the rendering itself.
                long subStart = System.currentTimeMillis();
                ContractLoadingRecorder.incrRenderedPage(contractId);
                log.info("[{}]上报新增渲染页数耗时:{}",
                        LOG_PREFIX, System.currentTimeMillis() - subStart);
            }
            log.info("[{}]单task渲染PDF成功, start:{}, end:{}, pages:{}",
                    LOG_PREFIX, start, end, images.size());
            return images;
        } catch (Exception e) {
            log.error("[{}]单task渲染PDF异常, start:{}, end:{}, e:",
                    LOG_PREFIX, start, end, e);
            return Collections.emptyList();
        } finally {
            log.info("[{}]单task渲染PDF完成, start:{}, end:{}, cost:{}",
                    LOG_PREFIX, start, end, System.currentTimeMillis() - startTime);
        }
    }

    /**
     * Encodes a rendered page as JPEG.
     *
     * @param bim the rendered page
     * @return the encoded image bytes
     * @throws IOException if encoding fails or no ImageIO writer accepts the image
     */
    private static byte[] transformImage(BufferedImage bim) throws IOException {
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        // ImageIO.write signals "no suitable writer" by returning false, not by throwing;
        // without this check an empty array would be counted as a rendered page.
        if (!ImageIO.write(bim, IMAGE_FORMAT, os)) {
            throw new IOException("No ImageIO writer found for format '" + IMAGE_FORMAT
                    + "' and image type " + bim.getType());
        }
        return os.toByteArray();
    }

    /**
     * Loads a document from the given bytes using the temp-file-only memory setting.
     *
     * @param docBytes the complete PDF
     * @return the loaded document; the caller is responsible for closing it
     * @throws IOException if the bytes cannot be parsed
     */
    private static PDDocument load(byte[] docBytes) throws IOException {
        return PDDocument.load(new ByteArrayInputStream(docBytes),
                MemoryUsageSetting.setupTempFileOnly());
    }
}
{code}
Thanks

Jiangpeiheng



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@pdfbox.apache.org
For additional commands, e-mail: dev-help@pdfbox.apache.org