You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pdfbox.apache.org by "Tilman Hausherr (Jira)" <ji...@apache.org> on 2020/06/03 17:46:00 UTC
[jira] [Commented] (PDFBOX-4857) Render the first page for PDF cost
long time
[ https://issues.apache.org/jira/browse/PDFBOX-4857?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17125168#comment-17125168 ]
Tilman Hausherr commented on PDFBOX-4857:
-----------------------------------------
The problem is that the PDF has a HUGE watermark image; that is what takes so long (and why the PDF is so big: the compressed length of that watermark alone is over 1MB). You can lessen this a bit by activating subsampling in PDFRenderer ({{renderer.setSubsamplingAllowed(true)}}), but it is still slow. (On my system the rendering time goes down by about 40%.)
Very small speed/space improvement: replace
{{return PDDocument.load(new ByteArrayInputStream(docBytes), MemoryUsageSetting.setupTempFileOnly());}}
with
{{return PDDocument.load(input, "", null, null, MemoryUsageSetting.setupTempFileOnly());}}.
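Here {{input}} is the {{docBytes}} byte array itself (no {{ByteArrayInputStream}} wrapper is needed), and you can get docBytes more easily by calling {{Files.readAllBytes(Paths.get(INPUT_PATH))}}.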
> Render the first page for PDF cost long time
> --------------------------------------------
>
> Key: PDFBOX-4857
> URL: https://issues.apache.org/jira/browse/PDFBOX-4857
> Project: PDFBox
> Issue Type: Bug
> Affects Versions: 2.0.19
> Reporter: jiangpeiheng
> Priority: Major
> Attachments: contract_input_jira.pdf
>
>
> Hi, dear PDFBox developers:
> I'm now using pdfbox 2.0.19 to render my PDF file to jpg pictures. As the time goes, I have found a problem. There is a PDF file which only got 2 pages, however it takes 10 or more seconeds to render all pages. The file is in attachment, and here is my rendering code:
> {code:java}
> package com.bytedance.esign.pdfrender.processor;
> import com.google.common.collect.Maps;
> import lombok.extern.slf4j.Slf4j;
> import org.apache.commons.lang3.ArrayUtils;
> import org.apache.pdfbox.io.MemoryUsageSetting;
> import org.apache.pdfbox.pdmodel.PDDocument;
> import org.apache.pdfbox.rendering.PDFRenderer;
> import org.springframework.stereotype.Service;
> import javax.imageio.ImageIO;
> import java.awt.image.BufferedImage;
> import java.io.ByteArrayInputStream;
> import java.io.ByteArrayOutputStream;
> import java.io.IOException;
> import java.util.List;
> import java.util.Map;
> /**
> * @author jiangpeiheng create on 2020/5/9
> */
> @Service
> @Slf4j
> public class RenderingProcessor {
> private static final String LOG_PERFIX = "RENDERING_PROCESSOR";
> // some settings for rendering
> private static final int IMAGE_DPI = 200;
> private static final String IMAGE_FORMAT = "jpg";
> /**
> * 渲染主入口
> *
> * @param originPdf 原始PDF
> * @param pages 需要渲染的页
> * @return
> */
> public static Map<Integer, byte[]> render(byte[] originPdf, List<Integer> pages) {
> long startTime = System.currentTimeMillis();
> Map<Integer, byte[]> result = Maps.newHashMap();
> try (
> PDDocument doc = load(originPdf)
> ) {
> log.info("[{}]载入PDDocument耗时:{}",
> LOG_PERFIX, System.currentTimeMillis() - startTime);
> long rendererInitStopWatch = System.currentTimeMillis();
> PDFRenderer renderer = new PDFRenderer(doc);
> log.info("[{}]建立PDFRenderer完成,cost:{}",
> LOG_PERFIX, System.currentTimeMillis() - rendererInitStopWatch);
> pages.forEach(pageIndex -> {
> long pageStopWatch = System.currentTimeMillis();
> byte[] res = renderSinglePage(renderer, pageIndex);
> log.info("[{}]单页渲染完成,page index:{}, cost:{}",
> LOG_PERFIX, pageIndex, System.currentTimeMillis() - pageStopWatch);
> pageStopWatch = System.currentTimeMillis();
> if (ArrayUtils.isNotEmpty(res)) {
> result.put(pageIndex, res);
> }
> log.info("[{}]单页put to map完成,page index:{}, cost:{}",
> LOG_PERFIX, pageIndex, System.currentTimeMillis() - pageStopWatch);
> });
> log.info("[{}]渲染PDF成功, successSize:{}",
> LOG_PERFIX, result.size());
> } catch (Exception e) {
> log.error("[{}]渲染PDF异常, successSize:{}, e:",
> LOG_PERFIX, result.size(), e);
> } finally {
> log.info("[{}]渲染PDF完成, successSize:{}, cost:{}",
> LOG_PERFIX, result.size(),
> System.currentTimeMillis() - startTime);
> }
> return result;
> }
> private static PDDocument load(byte[] docBytes) throws IOException {
> return PDDocument.load(new ByteArrayInputStream(docBytes),
> MemoryUsageSetting.setupTempFileOnly());
> }
> /**
> * 渲染单页
> *
> * @param renderer
> * @param pageIndex
> * @return
> */
> private static byte[] renderSinglePage(PDFRenderer renderer, int pageIndex) {
> try {
> // 渲染第一页,则这里传入的pageIndex需要减1
> return transformImage(renderer.renderImageWithDPI(pageIndex - 1, IMAGE_DPI));
> } catch (Exception e) {
> log.error("[{}]渲染单页异常, pageIndex:{}, e:",
> LOG_PERFIX, pageIndex, e);
> return null;
> }
> }
> /**
> * BufferedImage -> byte[]
> *
> * @param bim
> * @return
> * @throws IOException
> */
> private static byte[] transformImage(BufferedImage bim) throws IOException {
> ByteArrayOutputStream os = new ByteArrayOutputStream();
> ImageIO.write(bim, IMAGE_FORMAT, os);
> return os.toByteArray();
> }
> }
> {code}
> Here is my UT code as well:
> {code:java}
> package com.bytedance.esign.pdfrender.processor;
> import com.google.common.collect.Lists;
> import lombok.extern.slf4j.Slf4j;
> import org.apache.commons.io.IOUtils;
> import org.junit.Test;
> import java.io.FileInputStream;
> import java.util.List;
> import java.util.stream.Collectors;
> import java.util.stream.IntStream;
> import static org.junit.Assert.*;
> /**
> * @author jiangpeiheng create on 2020/6/3
> */
> @Slf4j
> public class RenderingProcessorTest {
> private static final String INPUT_PATH = "/Users/jiangpeiheng/myhome/work_stuff/esign/optimize/pdfrender/contract_input_jira.pdf";
> private static final List<Integer> PAGES;
> static {
> PAGES = IntStream.rangeClosed(1, 2).boxed().collect(Collectors.toList());
> }
> @Test
> public void render() {
> try (
> FileInputStream is = new FileInputStream(INPUT_PATH)
> ) {
> byte[] pdfBytes = IOUtils.toByteArray(is);
> // single
> RenderingProcessor.render(pdfBytes, PAGES);
> // loop
> // IntStream.rangeClosed(1, 10).forEach(i -> {
> // log.info("Loop render, index:{}", i);
> // RenderingProcessor.render(pdfBytes, PAGES);
> // });
> } catch (Exception e) {
> log.error("Exception, e:", e);
> }
> }
> }
> {code}
> Does anybody could find out why it cost so long time for rendering the first page?
> Thank you
> Jiang Peiheng
--
This message was sent by Atlassian Jira
(v8.3.4#803005)
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@pdfbox.apache.org
For additional commands, e-mail: dev-help@pdfbox.apache.org