You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/09/28 14:10:53 UTC

svn commit: r819503 - /lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java

Author: jukka
Date: Mon Sep 28 12:10:52 2009
New Revision: 819503

URL: http://svn.apache.org/viewvc?rev=819503&view=rev
Log:
TIKA-292: PDFBox is too verbose

Ignore the unused PDF primitives for now.

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=819503&r1=819502&r2=819503&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Mon Sep 28 12:10:52 2009
@@ -17,11 +17,14 @@
 package org.apache.tika.parser.pdf;
 
 import java.io.IOException;
+import java.util.List;
 
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.util.PDFOperator;
 import org.apache.pdfbox.util.PDFTextStripper;
 import org.apache.pdfbox.util.TextPosition;
+import org.apache.pdfbox.util.operator.OperatorProcessor;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.IOExceptionWithCause;
 import org.apache.tika.metadata.Metadata;
@@ -65,6 +68,65 @@
     private PDF2XHTML(ContentHandler handler, Metadata metadata)
             throws IOException {
         this.handler = new XHTMLContentHandler(handler, metadata);
+
+        // TIKA-292: Ignore unneeded PDF operators by registering one shared
+        // processor, whose process() method does nothing, for each of them.
+        // TODO: Remove this once PDFBox is no longer so verbose
+        OperatorProcessor ignore = new OperatorProcessor() {
+            @Override @SuppressWarnings("unchecked")
+            public void process(PDFOperator operator, List arguments) {
+            }
+        };
+        registerOperatorProcessor("b", ignore);
+        registerOperatorProcessor("B", ignore);
+        registerOperatorProcessor("b*", ignore);
+        registerOperatorProcessor("B*", ignore);
+        registerOperatorProcessor("BDC", ignore);
+        registerOperatorProcessor("BI", ignore);
+        registerOperatorProcessor("BMC", ignore);
+        registerOperatorProcessor("BX", ignore);
+        registerOperatorProcessor("c", ignore);
+        registerOperatorProcessor("CS", ignore);
+        registerOperatorProcessor("cs", ignore);
+        registerOperatorProcessor("d", ignore);
+        registerOperatorProcessor("d0", ignore);
+        registerOperatorProcessor("d1", ignore);
+        registerOperatorProcessor("DP", ignore);
+        registerOperatorProcessor("EI", ignore); // inline image end; pairs with BI/ID
+        registerOperatorProcessor("EMC", ignore);
+        registerOperatorProcessor("EX", ignore);
+        registerOperatorProcessor("f", ignore);
+        registerOperatorProcessor("F", ignore);
+        registerOperatorProcessor("f*", ignore);
+        registerOperatorProcessor("G", ignore);
+        registerOperatorProcessor("g", ignore);
+        registerOperatorProcessor("h", ignore);
+        registerOperatorProcessor("i", ignore);
+        registerOperatorProcessor("ID", ignore);
+        registerOperatorProcessor("j", ignore);
+        registerOperatorProcessor("J", ignore);
+        registerOperatorProcessor("K", ignore);
+        registerOperatorProcessor("k", ignore);
+        registerOperatorProcessor("l", ignore);
+        registerOperatorProcessor("m", ignore);
+        registerOperatorProcessor("M", ignore);
+        registerOperatorProcessor("MP", ignore);
+        registerOperatorProcessor("n", ignore);
+        registerOperatorProcessor("re", ignore);
+        registerOperatorProcessor("RG", ignore);
+        registerOperatorProcessor("rg", ignore);
+        registerOperatorProcessor("ri", ignore);
+        registerOperatorProcessor("s", ignore);
+        registerOperatorProcessor("S", ignore);
+        registerOperatorProcessor("SC", ignore);
+        registerOperatorProcessor("sc", ignore);
+        registerOperatorProcessor("SCN", ignore);
+        registerOperatorProcessor("scn", ignore);
+        registerOperatorProcessor("sh", ignore);
+        registerOperatorProcessor("v", ignore);
+        registerOperatorProcessor("W", ignore);
+        registerOperatorProcessor("W*", ignore);
+        registerOperatorProcessor("y", ignore);
     }
 
     @Override