You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/09/28 14:10:53 UTC
svn commit: r819503 -
/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
Author: jukka
Date: Mon Sep 28 12:10:52 2009
New Revision: 819503
URL: http://svn.apache.org/viewvc?rev=819503&view=rev
Log:
TIKA-292: PDFBox is too verbose
Ignore the unused PDF primitives for now.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=819503&r1=819502&r2=819503&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Mon Sep 28 12:10:52 2009
@@ -17,11 +17,14 @@
package org.apache.tika.parser.pdf;
import java.io.IOException;
+import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.util.PDFOperator;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
+import org.apache.pdfbox.util.operator.OperatorProcessor;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOExceptionWithCause;
import org.apache.tika.metadata.Metadata;
@@ -65,6 +68,65 @@
private PDF2XHTML(ContentHandler handler, Metadata metadata)
throws IOException {
this.handler = new XHTMLContentHandler(handler, metadata);
+
+ // TIKA-292: Ignore unneeded PDF operators by registering one shared no-op
+ // processor for each of them (operator names are case-sensitive).
+ // TODO: Remove this once PDFBox is no longer so verbose
+ OperatorProcessor ignore = new OperatorProcessor() {
+ @Override @SuppressWarnings("unchecked")
+ public void process(PDFOperator operator, List arguments) {
+ }
+ };
+ registerOperatorProcessor("b", ignore);
+ registerOperatorProcessor("B", ignore);
+ registerOperatorProcessor("b*", ignore);
+ registerOperatorProcessor("B*", ignore);
+ registerOperatorProcessor("BDC", ignore);
+ registerOperatorProcessor("BI", ignore);
+ registerOperatorProcessor("BMC", ignore);
+ registerOperatorProcessor("BX", ignore);
+ registerOperatorProcessor("c", ignore);
+ registerOperatorProcessor("CS", ignore);
+ registerOperatorProcessor("cs", ignore);
+ registerOperatorProcessor("d", ignore);
+ registerOperatorProcessor("d0", ignore);
+ registerOperatorProcessor("d1", ignore);
+ registerOperatorProcessor("DP", ignore);
+ registerOperatorProcessor("EI", ignore);
+ registerOperatorProcessor("EMC", ignore);
+ registerOperatorProcessor("EX", ignore);
+ registerOperatorProcessor("f", ignore);
+ registerOperatorProcessor("F", ignore);
+ registerOperatorProcessor("f*", ignore);
+ registerOperatorProcessor("G", ignore);
+ registerOperatorProcessor("g", ignore);
+ registerOperatorProcessor("h", ignore);
+ registerOperatorProcessor("i", ignore);
+ registerOperatorProcessor("ID", ignore);
+ registerOperatorProcessor("j", ignore);
+ registerOperatorProcessor("J", ignore);
+ registerOperatorProcessor("K", ignore);
+ registerOperatorProcessor("k", ignore);
+ registerOperatorProcessor("l", ignore);
+ registerOperatorProcessor("m", ignore);
+ registerOperatorProcessor("M", ignore);
+ registerOperatorProcessor("MP", ignore);
+ registerOperatorProcessor("n", ignore);
+ registerOperatorProcessor("re", ignore);
+ registerOperatorProcessor("RG", ignore);
+ registerOperatorProcessor("rg", ignore);
+ registerOperatorProcessor("ri", ignore);
+ registerOperatorProcessor("s", ignore);
+ registerOperatorProcessor("S", ignore);
+ registerOperatorProcessor("SC", ignore);
+ registerOperatorProcessor("sc", ignore);
+ registerOperatorProcessor("SCN", ignore);
+ registerOperatorProcessor("scn", ignore);
+ registerOperatorProcessor("sh", ignore);
+ registerOperatorProcessor("v", ignore);
+ registerOperatorProcessor("W", ignore);
+ registerOperatorProcessor("W*", ignore);
+ registerOperatorProcessor("y", ignore);
}
@Override