You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2017/03/31 19:14:51 UTC

lucene-solr:jira/solr-10290: SOLR-10298: new pdfbox based tool to reduce PDF size (via FLATE_DECODE)

Repository: lucene-solr
Updated Branches:
  refs/heads/jira/solr-10290 ec2cbb3ee -> d14977553


SOLR-10298: new pdfbox based tool to reduce PDF size (via FLATE_DECODE)


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/d1497755
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/d1497755
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/d1497755

Branch: refs/heads/jira/solr-10290
Commit: d14977553f2597d84f907011a5c7275bfb79be77
Parents: ec2cbb3
Author: Chris Hostetter <ho...@apache.org>
Authored: Fri Mar 31 11:09:26 2017 -0700
Committer: Chris Hostetter <ho...@apache.org>
Committed: Fri Mar 31 12:14:26 2017 -0700

----------------------------------------------------------------------
 solr/solr-ref-guide/build.xml                | 25 +++++--
 solr/solr-ref-guide/ivy.xml                  |  3 +
 solr/solr-ref-guide/tools/ReducePDFSize.java | 90 +++++++++++++++++++++++
 3 files changed, 112 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d1497755/solr/solr-ref-guide/build.xml
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/build.xml b/solr/solr-ref-guide/build.xml
index 47e87e4..cc21bc4 100644
--- a/solr/solr-ref-guide/build.xml
+++ b/solr/solr-ref-guide/build.xml
@@ -126,16 +126,19 @@
   </target>
 
   <!-- ====== PDF Build ======= -->
-  <target name="build-pdf"
-          depends="build-nav-data-files,resolve"
-          description="Builds a PDF">
+  <target name="build-pdf" depends="-build-raw-pdf,-reduce-pdf-size" description="Builds a PDF">
+    <echo>Finished Building ${build.dir}/${pdf-filename}</echo>
+  </target>
+  <target name="-build-raw-pdf"
+          depends="build-nav-data-files,resolve">
+    <mkdir dir="${build.dir}/pdf-tmp"/>
     <taskdef uri="antlib:org.asciidoctor.ant" resource="org/asciidoctor/ant/antlib.xml"
              classpathref="tools-run-classpath"/>
     <asciidoctor:convert
                  sourceDirectory="${build.content.dir}/pdf"
                  sourceDocumentName="SolrRefGuide-all.adoc"
                  baseDir="${build.content.dir}"
-                 outputDirectory="${build.dir}"
+                 outputDirectory="${build.dir}/pdf-tmp"
                  backend="pdf"
                  extensions="adoc"
                  sourceHighlighter="coderay"
@@ -160,10 +163,20 @@
       <attribute key="build-date" value="${DSTAMP}" />
       <attribute key="build-year" value="${current.year}" />
     </asciidoctor:convert>
-    <move file="${build.dir}/SolrRefGuide-all.pdf" tofile="${build.dir}/${pdf-filename}" />
-    <echo>Finished Building ${build.dir}/${pdf-filename}</echo>
+    <move file="${build.dir}/pdf-tmp/SolrRefGuide-all.pdf" tofile="${build.dir}/pdf-tmp/RAW-${pdf-filename}" />
+  </target>
+  <target name="-reduce-pdf-size" depends="build-init,build-tools-jar">
+    <java classname="ReducePDFSize"
+          failonerror="true"
+          fork="true">
+      <classpath refid="tools-run-classpath"/>
+      <arg value="${build.dir}/pdf-tmp/RAW-${pdf-filename}"/>
+      <arg value="${build.dir}/${pdf-filename}"/>
+    </java>
   </target>
+  
 
+  
   <!-- ======= HTML Site Build =======
        Builds site with Jekyll.
        This (for now) assumes that Jekyll (http://jekyllrb.com) is installed locally. -->

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d1497755/solr/solr-ref-guide/ivy.xml
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/ivy.xml b/solr/solr-ref-guide/ivy.xml
index e59a845..adc182e 100644
--- a/solr/solr-ref-guide/ivy.xml
+++ b/solr/solr-ref-guide/ivy.xml
@@ -25,5 +25,8 @@
     <dependency org="org.asciidoctor" name="asciidoctor-ant" rev="${/org.asciidoctor/asciidoctor-ant}" conf="compile" />
     <dependency org="org.json" name="json" rev="${/org.json/json}" conf="compile" />
     <dependency org="org.jsoup" name="jsoup" rev="${/org.jsoup/jsoup}" conf="compile" />
+    <dependency org="org.apache.pdfbox" name="pdfbox" rev="${/org.apache.pdfbox/pdfbox}" conf="compile"/>
+    <dependency org="org.slf4j" name="jcl-over-slf4j" rev="${/org.slf4j/jcl-over-slf4j}" conf="compile"/>
+    <dependency org="org.slf4j" name="slf4j-api" rev="${/org.slf4j/slf4j-api}" conf="compile"/>
   </dependencies>
 </ivy-module>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d1497755/solr/solr-ref-guide/tools/ReducePDFSize.java
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/tools/ReducePDFSize.java b/solr/solr-ref-guide/tools/ReducePDFSize.java
new file mode 100644
index 0000000..4506cae
--- /dev/null
+++ b/solr/solr-ref-guide/tools/ReducePDFSize.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.common.PDStream;
+
+/**
+ * A simple command line utility for reducing the size of the ref-guide PDF.
+ * <p>
+ * Currently this tool focuses on using {@link COSName#FLATE_DECODE} to compress the (decoded) Objects
+ * in the source PDF, but other improvements may be possible in the future.
+ * </p>
+ * <p>
+ * This code is originally based on the <code>WriteDecodedDoc</code> example provided with <a href="https://pdfbox.apache.org/">Apache PDFBox</a>.
+ * </p>
+ * <p>
+ * <b>NOTE:</b> This class should <em>NOT</em> be considered a general purpose tool for reducing the size of
+ * <em>any</em> PDF.
+ * Decisions made in this code can and will be focused explicitly on serving the purpose of reducing the size of the
+ * Solr Reference Guide PDF, as originally produced by asciidoctor, and may not be generally useful for all PDFs
+ * "in the wild".
+ * </p>
+ */
+public class ReducePDFSize {
+
+  /**
+   * Re-encodes every stream object of the input PDF using FLATE_DECODE and saves the result.
+   *
+   * @param args exactly two values: the input PDF path followed by the output PDF path
+   * @throws IOException if loading, re-encoding, or saving the document fails
+   * @throws RuntimeException if the argument count is wrong or a stream can not be decoded
+   */
+  public static void main(String[] args) throws IOException {
+    if (2 != args.length) {
+      throw new RuntimeException("arg0 must be input file, arg1 must be output file");
+    }
+    // try-with-resources: the document is closed even if processing or saving fails
+    try (PDDocument doc = PDDocument.load(new File(args[0]))) {
+      doc.setAllSecurityToBeRemoved(true);
+      for (COSObject cosObject : doc.getDocument().getObjects()) {
+        COSBase base = cosObject.getObject();
+        // if it's a stream: decode it, then re-write it using FLATE_DECODE
+        if (base instanceof COSStream) {
+          COSStream stream = (COSStream) base;
+          byte[] bytes;
+          try {
+            bytes = new PDStream(stream).toByteArray();
+          } catch (IOException ex) {
+            // NOTE: original example code from PDFBox just logged & "continue;"d here, 'skipping' this stream.
+            // If this type of failure ever happens, we can (perhaps) consider (re)ignoring this type of failure?
+            //
+            // IIUC then that will leave the original (non-decoded / non-flated) stream in place?
+            throw new RuntimeException("can't serialize byte[] from: " +
+                                       cosObject.getObjectNumber() + " " +
+                                       cosObject.getGenerationNumber() + " obj: " +
+                                       ex.getMessage(), ex);
+          }
+          stream.removeItem(COSName.FILTER);
+          // try-with-resources: the re-encoded stream is closed even if the write fails
+          try (OutputStream streamOut = stream.createOutputStream(COSName.FLATE_DECODE)) {
+            streamOut.write(bytes);
+          }
+        }
+      }
+      doc.getDocumentCatalog();
+      doc.save(args[1]);
+    }
+  }
+}