You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2010/12/08 20:19:15 UTC

svn commit: r1043618 - in /jackrabbit/trunk/jackrabbit-core: ./ src/main/java/org/apache/jackrabbit/core/query/pdf/ src/main/resources/org/apache/jackrabbit/core/query/lucene/

Author: jukka
Date: Wed Dec  8 19:19:14 2010
New Revision: 1043618

URL: http://svn.apache.org/viewvc?rev=1043618&view=rev
Log:
JCR-2838: Tika regressions in 0.8

Add temporary workarounds for TIKA-548 and TIKA-556.

Added:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDF2XHTML.java   (with props)
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDFParser.java   (with props)
Modified:
    jackrabbit/trunk/jackrabbit-core/pom.xml
    jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml

Modified: jackrabbit/trunk/jackrabbit-core/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/pom.xml?rev=1043618&r1=1043617&r2=1043618&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/pom.xml (original)
+++ jackrabbit/trunk/jackrabbit-core/pom.xml Wed Dec  8 19:19:14 2010
@@ -246,6 +246,11 @@ org.apache.jackrabbit.test.api.Shareable
       </exclusions>
     </dependency>
     <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>netcdf</artifactId>
+      <version>4.2-min</version>
+    </dependency>
+    <dependency>
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-api</artifactId>
     </dependency>

Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDF2XHTML.java?rev=1043618&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDF2XHTML.java (added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDF2XHTML.java Wed Dec  8 19:19:14 2010
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.pdf;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.util.PDFTextStripper;
+import org.apache.pdfbox.util.TextPosition;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * {@link PDFTextStripper} subclass that reports the extracted text as
+ * semi-structured XHTML SAX events instead of writing a plain text
+ * stream.
+ */
+class PDF2XHTML extends PDFTextStripper {
+
+    /** XHTML wrapper around the content handler given to the constructor. */
+    private final XHTMLContentHandler handler;
+
+    /**
+     * Wraps the given content handler and enables forced parsing and
+     * sorting by position on this text stripper.
+     *
+     * @param handler SAX content handler that receives the XHTML events
+     * @param metadata metadata passed on to the XHTML content handler
+     * @throws IOException if the text stripper can not be set up
+     */
+    private PDF2XHTML(ContentHandler handler, Metadata metadata)
+            throws IOException {
+        this.handler = new XHTMLContentHandler(handler, metadata);
+        setForceParsing(true);
+        setSortByPosition(true);
+    }
+
+    /**
+     * Converts the given PDF document (and related metadata) to a stream
+     * of XHTML SAX events sent to the given content handler.
+     *
+     * @param document PDF document
+     * @param handler SAX content handler
+     * @param metadata PDF metadata
+     * @throws SAXException if the content handler fails to process SAX events
+     * @throws TikaException if the PDF document can not be processed
+     */
+    public static void process(
+            PDDocument document, ContentHandler handler, Metadata metadata)
+            throws SAXException, TikaException {
+        try {
+            PDF2XHTML stripper = new PDF2XHTML(handler, metadata);
+            // The writer only satisfies the writeText() signature: the
+            // overridden methods below send all output to the handler.
+            stripper.writeText(document, new NullWriter());
+        } catch (IOException e) {
+            // The overridden methods wrap SAX failures in IOExceptions;
+            // unwrap them so that the caller sees the original exception.
+            Throwable cause = e.getCause();
+            if (cause instanceof SAXException) {
+                throw (SAXException) cause;
+            }
+            throw new TikaException("Unable to extract PDF content", e);
+        }
+    }
+
+    /** Writer that discards everything written to it. */
+    private static final class NullWriter extends Writer {
+        @Override
+        public void write(char[] cbuf, int off, int len) {
+        }
+        @Override
+        public void flush() {
+        }
+        @Override
+        public void close() {
+        }
+    }
+
+    /** Starts the XHTML document. */
+    @Override
+    protected void startDocument(PDDocument pdf) throws IOException {
+        try {
+            handler.startDocument();
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause("Unable to start a document", sax);
+        }
+    }
+
+    /** Ends the XHTML document. */
+    @Override
+    protected void endDocument(PDDocument pdf) throws IOException {
+        try {
+            handler.endDocument();
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause("Unable to end a document", sax);
+        }
+    }
+
+    /** Opens a <code>div class="page"</code> element and a paragraph in it. */
+    @Override
+    protected void startPage(PDPage page) throws IOException {
+        try {
+            handler.startElement("div", "class", "page");
+            handler.startElement("p");
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause("Unable to start a page", sax);
+        }
+    }
+
+    /** Closes the paragraph and the page element opened for the page. */
+    @Override
+    protected void endPage(PDPage page) throws IOException {
+        try {
+            handler.endElement("p");
+            handler.endElement("div");
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause("Unable to end a page", sax);
+        }
+    }
+
+    /** Sends the given text to the content handler as character data. */
+    @Override
+    protected void writeString(String text) throws IOException {
+        try {
+            handler.characters(text);
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a string: " + text, sax);
+        }
+    }
+
+    /** Sends the character of the given text position to the handler. */
+    @Override
+    protected void writeCharacters(TextPosition text) throws IOException {
+        try {
+            handler.characters(text.getCharacter());
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a character: " + text.getCharacter(), sax);
+        }
+    }
+
+    /** Sends a single space to the content handler. */
+    @Override
+    protected void writeWordSeparator() throws IOException {
+        try {
+            handler.characters(" ");
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a space character", sax);
+        }
+    }
+
+    /** Sends a single newline to the content handler. */
+    @Override
+    protected void writeLineSeparator() throws IOException {
+        try {
+            handler.characters("\n");
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a newline character", sax);
+        }
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDF2XHTML.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDFParser.java?rev=1043618&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDFParser.java (added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDFParser.java Wed Dec  8 19:19:14 2010
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.pdf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Calendar;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * PDF parser.
+ * <p>
+ * This parser can also process encrypted PDF documents if the required
+ * password is given as a part of the input metadata associated with a
+ * document. If no password is given, then this parser will try decrypting
+ * the document using the empty password that's often used with PDFs.
+ */
+public class PDFParser implements Parser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -752276948656079347L;
+
+    /**
+     * Metadata key for giving the document password to the parser.
+     *
+     * @since Apache Tika 0.5
+     */
+    public static final String PASSWORD = "org.apache.tika.parser.pdf.password";
+
+    /** Media types handled by this parser: only application/pdf. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(MediaType.application("pdf"));
+
+    /**
+     * Document information keys that are mapped explicitly when
+     * extracting metadata. All other keys are copied over as-is.
+     */
+    private static final List<String> HANDLED_METADATA = Arrays.asList(
+            "Author", "Creator", "CreationDate", "ModDate",
+            "Keywords", "Producer", "Subject", "Title", "Trapped");
+
+    /**
+     * Returns the media types supported by this parser.
+     *
+     * @param context parse context (not used)
+     * @return set containing only the PDF media type
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses the given PDF document. The content type and the document
+     * information are recorded in the given metadata, and the text
+     * content is sent to the given handler as XHTML SAX events.
+     * <p>
+     * If the document is encrypted and does not currently allow content
+     * extraction, it is decrypted with the {@link #PASSWORD} metadata
+     * value, or with the empty password if no such value is given.
+     * A failed decryption attempt is ignored.
+     *
+     * @param stream PDF document stream
+     * @param handler SAX content handler
+     * @param metadata document metadata, both input and output
+     * @param context parse context (not used)
+     * @throws IOException if the document can not be read
+     * @throws SAXException if the content handler fails to process SAX events
+     * @throws TikaException if the PDF document can not be processed
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        PDDocument pdfDocument = PDDocument.load(stream, true);
+        try {
+            if (pdfDocument.isEncrypted()
+                    && !pdfDocument.getCurrentAccessPermission().canExtractContent()) {
+                try {
+                    String password = metadata.get(PASSWORD);
+                    if (password == null) {
+                        password = "";
+                    }
+                    pdfDocument.decrypt(password);
+                } catch (Exception e) {
+                    // Decryption is best effort: on failure carry on with
+                    // the document as loaded and let extraction decide.
+                }
+            }
+            metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
+            extractMetadata(pdfDocument, metadata);
+            PDF2XHTML.process(pdfDocument, handler, metadata);
+        } finally {
+            pdfDocument.close();
+        }
+    }
+
+    /**
+     * @deprecated This method will be removed in Apache Tika 1.0.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        parse(stream, handler, metadata, new ParseContext());
+    }
+
+    /**
+     * Copies the document information of the given PDF document to the
+     * given metadata: the page count, the standard entries and finally
+     * any custom entries found in the information dictionary.
+     *
+     * @param document PDF document
+     * @param metadata metadata to populate
+     * @throws TikaException if the metadata can not be extracted
+     */
+    private void extractMetadata(PDDocument document, Metadata metadata)
+            throws TikaException {
+        PDDocumentInformation info = document.getDocumentInformation();
+        metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
+        addMetadata(metadata, Metadata.TITLE, info.getTitle());
+        addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
+        addMetadata(metadata, Metadata.CREATOR, info.getCreator());
+        addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
+        addMetadata(metadata, "producer", info.getProducer());
+        addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
+        addMetadata(metadata, "trapped", info.getTrapped());
+        try {
+            // Read the date once; it goes under two different keys
+            Calendar created = info.getCreationDate();
+            addMetadata(metadata, "created", created);
+            addMetadata(metadata, Metadata.CREATION_DATE, created);
+        } catch (IOException e) {
+            // Invalid date format, just ignore
+        }
+        try {
+            Calendar modified = info.getModificationDate();
+            addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
+        } catch (IOException e) {
+            // Invalid date format, just ignore
+        }
+
+        // All remaining metadata is custom
+        // Copy this over as-is
+        for (COSName key : info.getDictionary().keySet()) {
+            String name = key.getName();
+            if (!HANDLED_METADATA.contains(name)) {
+                addMetadata(metadata, name,
+                        info.getDictionary().getDictionaryObject(key));
+            }
+        }
+    }
+
+    /** Adds the given value to the named metadata entry, unless null. */
+    private void addMetadata(Metadata metadata, String name, String value) {
+        if (value != null) {
+            metadata.add(name, value);
+        }
+    }
+
+    /** Stores the given date as text in the named entry, unless null. */
+    private void addMetadata(Metadata metadata, String name, Calendar value) {
+        if (value != null) {
+            metadata.set(name, value.getTime().toString());
+        }
+    }
+
+    /** Sets the given metadata property to the given date, unless null. */
+    private void addMetadata(Metadata metadata, Property property, Calendar value) {
+        if (value != null) {
+            metadata.set(property, value.getTime());
+        }
+    }
+
+    /**
+     * Used when processing custom metadata entries, as PDFBox won't do
+     * the conversion for us in the way it does for the standard ones.
+     * Arrays are added element by element, strings by their text value
+     * and other objects by their string representation. A missing
+     * (<code>null</code>) value is skipped instead of causing a
+     * {@link NullPointerException}.
+     */
+    private void addMetadata(Metadata metadata, String name, COSBase value) {
+        if (value instanceof COSArray) {
+            for (COSBase v : ((COSArray) value).toList()) {
+                addMetadata(metadata, name, v);
+            }
+        } else if (value instanceof COSString) {
+            addMetadata(metadata, name, ((COSString) value).getString());
+        } else if (value != null) {
+            addMetadata(metadata, name, value.toString());
+        }
+    }
+}

Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDFParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml?rev=1043618&r1=1043617&r2=1043618&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml Wed Dec  8 19:19:14 2010
@@ -23,6 +23,11 @@
 
     <parser class="org.apache.tika.parser.DefaultParser"/>
 
+    <parser class="org.apache.jackrabbit.core.query.pdf.PDFParser">
+      <!-- JCR-2838: Override the faulty PDF parser in Tika 0.8 -->
+      <mime>application/pdf</mime>
+    </parser>
+
     <parser class="org.apache.tika.parser.EmptyParser">
       <!-- Disable package extraction as it's too resource-intensive -->
       <mime>application/x-archive</mime>