You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2010/12/08 20:19:15 UTC
svn commit: r1043618 - in /jackrabbit/trunk/jackrabbit-core: ./
src/main/java/org/apache/jackrabbit/core/query/pdf/
src/main/resources/org/apache/jackrabbit/core/query/lucene/
Author: jukka
Date: Wed Dec 8 19:19:14 2010
New Revision: 1043618
URL: http://svn.apache.org/viewvc?rev=1043618&view=rev
Log:
JCR-2838: Tika regressions in 0.8
Add temporary workarounds for TIKA-548 and TIKA-556.
Added:
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDF2XHTML.java (with props)
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDFParser.java (with props)
Modified:
jackrabbit/trunk/jackrabbit-core/pom.xml
jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml
Modified: jackrabbit/trunk/jackrabbit-core/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/pom.xml?rev=1043618&r1=1043617&r2=1043618&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/pom.xml (original)
+++ jackrabbit/trunk/jackrabbit-core/pom.xml Wed Dec 8 19:19:14 2010
@@ -246,6 +246,11 @@ org.apache.jackrabbit.test.api.Shareable
</exclusions>
</dependency>
<dependency>
+ <groupId>edu.ucar</groupId>
+ <artifactId>netcdf</artifactId>
+ <version>4.2-min</version>
+ </dependency>
+ <dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDF2XHTML.java?rev=1043618&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDF2XHTML.java (added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDF2XHTML.java Wed Dec 8 19:19:14 2010
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.pdf;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.util.PDFTextStripper;
+import org.apache.pdfbox.util.TextPosition;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * {@link PDFTextStripper} subclass that reports the extracted text as
+ * semi-structured XHTML SAX events instead of writing a plain text stream.
+ * <p>
+ * The stripper callbacks overridden here forward their output to an
+ * {@link XHTMLContentHandler}. SAX failures raised by that handler are
+ * tunneled through {@link IOException}s (as the cause) because the
+ * overridden callbacks can only throw {@link IOException}.
+ */
+class PDF2XHTML extends PDFTextStripper {
+
+    /** Target of all XHTML output produced by the overridden callbacks. */
+    private final XHTMLContentHandler handler;
+
+    /**
+     * Creates a stripper that writes to the given content handler.
+     *
+     * @param handler SAX content handler that receives the XHTML events
+     * @param metadata PDF metadata handed to the XHTML handler
+     * @throws IOException declared for the superclass constructor
+     */
+    private PDF2XHTML(ContentHandler handler, Metadata metadata)
+            throws IOException {
+        this.handler = new XHTMLContentHandler(handler, metadata);
+        setForceParsing(true);
+        setSortByPosition(true);
+    }
+
+    /**
+     * Converts the given PDF document (and related metadata) to a stream
+     * of XHTML SAX events sent to the given content handler.
+     *
+     * @param document PDF document
+     * @param handler SAX content handler
+     * @param metadata PDF metadata
+     * @throws SAXException if the content handler fails to process SAX events
+     * @throws TikaException if the PDF document can not be processed
+     */
+    public static void process(
+            PDDocument document, ContentHandler handler, Metadata metadata)
+            throws SAXException, TikaException {
+        // writeText() requires a Writer, but the key output methods are
+        // overridden to feed the content handler, so whatever reaches this
+        // writer is simply thrown away.
+        Writer discard = new Writer() {
+            @Override
+            public void write(char[] cbuf, int off, int len) {
+            }
+            @Override
+            public void flush() {
+            }
+            @Override
+            public void close() {
+            }
+        };
+        try {
+            PDF2XHTML stripper = new PDF2XHTML(handler, metadata);
+            stripper.writeText(document, discard);
+        } catch (IOException failure) {
+            Throwable cause = failure.getCause();
+            if (cause instanceof SAXException) {
+                // Unwrap a SAX failure that one of the callbacks below
+                // had to tunnel through an IOException.
+                throw (SAXException) cause;
+            }
+            throw new TikaException("Unable to extract PDF content", failure);
+        }
+    }
+
+    /**
+     * Starts the XHTML document on the content handler.
+     *
+     * @param pdf PDF document being processed (unused)
+     * @throws IOException wrapping the SAX failure, if any
+     */
+    @Override
+    protected void startDocument(PDDocument pdf) throws IOException {
+        try {
+            handler.startDocument();
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause("Unable to start a document", sax);
+        }
+    }
+
+    /**
+     * Ends the XHTML document on the content handler.
+     *
+     * @param pdf PDF document being processed (unused)
+     * @throws IOException wrapping the SAX failure, if any
+     */
+    @Override
+    protected void endDocument(PDDocument pdf) throws IOException {
+        try {
+            handler.endDocument();
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause("Unable to end a document", sax);
+        }
+    }
+
+    /**
+     * Opens a <code>div</code> element with <code>class="page"</code>
+     * followed by a <code>p</code> element for the text of the page.
+     *
+     * @param page page being started (unused)
+     * @throws IOException wrapping the SAX failure, if any
+     */
+    @Override
+    protected void startPage(PDPage page) throws IOException {
+        try {
+            handler.startElement("div", "class", "page");
+            handler.startElement("p");
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause("Unable to start a page", sax);
+        }
+    }
+
+    /**
+     * Closes the <code>p</code> and <code>div</code> elements opened by
+     * {@link #startPage(PDPage)}, innermost first.
+     *
+     * @param page page being ended (unused)
+     * @throws IOException wrapping the SAX failure, if any
+     */
+    @Override
+    protected void endPage(PDPage page) throws IOException {
+        try {
+            handler.endElement("p");
+            handler.endElement("div");
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause("Unable to end a page", sax);
+        }
+    }
+
+    /**
+     * Sends the given text to the content handler as character data.
+     *
+     * @param text text to output
+     * @throws IOException wrapping the SAX failure, if any
+     */
+    @Override
+    protected void writeString(String text) throws IOException {
+        try {
+            handler.characters(text);
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a string: " + text, sax);
+        }
+    }
+
+    /**
+     * Sends the character(s) of the given text position to the content
+     * handler as character data.
+     *
+     * @param text text position whose character(s) are output
+     * @throws IOException wrapping the SAX failure, if any
+     */
+    @Override
+    protected void writeCharacters(TextPosition text) throws IOException {
+        String character = text.getCharacter();
+        try {
+            handler.characters(character);
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a character: " + character, sax);
+        }
+    }
+
+    /**
+     * Outputs a single space as the separator between words.
+     *
+     * @throws IOException wrapping the SAX failure, if any
+     */
+    @Override
+    protected void writeWordSeparator() throws IOException {
+        try {
+            handler.characters(" ");
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a space character", sax);
+        }
+    }
+
+    /**
+     * Outputs a single newline as the separator between lines.
+     *
+     * @throws IOException wrapping the SAX failure, if any
+     */
+    @Override
+    protected void writeLineSeparator() throws IOException {
+        try {
+            handler.characters("\n");
+        } catch (SAXException sax) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a newline character", sax);
+        }
+    }
+
+}
Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDF2XHTML.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDFParser.java?rev=1043618&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDFParser.java (added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDFParser.java Wed Dec 8 19:19:14 2010
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.pdf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Calendar;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * PDF parser.
+ * <p>
+ * This parser can process also encrypted PDF documents if the required
+ * password is given as a part of the input metadata associated with a
+ * document. If no password is given, then this parser will try decrypting
+ * the document using the empty password that's often used with PDFs.
+ */
+public class PDFParser implements Parser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -752276948656079347L;
+
+ /**
+ * Metadata key for giving the document password to the parser.
+ *
+ * @since Apache Tika 0.5
+ */
+ public static final String PASSWORD = "org.apache.tika.parser.pdf.password";
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("pdf"));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ PDDocument pdfDocument = PDDocument.load(stream, true);
+ try {
+ if (pdfDocument.isEncrypted()
+ && !pdfDocument.getCurrentAccessPermission().canExtractContent()) {
+ try {
+ String password = metadata.get(PASSWORD);
+ if (password == null) {
+ password = "";
+ }
+ pdfDocument.decrypt(password);
+ } catch (Exception e) {
+ // Ignore
+ }
+ }
+ metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
+ extractMetadata(pdfDocument, metadata);
+ PDF2XHTML.process(pdfDocument, handler, metadata);
+ } finally {
+ pdfDocument.close();
+ }
+ }
+
+ /**
+ * @deprecated This method will be removed in Apache Tika 1.0.
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
+ parse(stream, handler, metadata, new ParseContext());
+ }
+
+ private void extractMetadata(PDDocument document, Metadata metadata)
+ throws TikaException {
+ PDDocumentInformation info = document.getDocumentInformation();
+ metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
+ addMetadata(metadata, Metadata.TITLE, info.getTitle());
+ addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
+ addMetadata(metadata, Metadata.CREATOR, info.getCreator());
+ addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
+ addMetadata(metadata, "producer", info.getProducer());
+ addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
+ addMetadata(metadata, "trapped", info.getTrapped());
+ try {
+ addMetadata(metadata, "created", info.getCreationDate());
+ addMetadata(metadata, Metadata.CREATION_DATE, info.getCreationDate());
+ } catch (IOException e) {
+ // Invalid date format, just ignore
+ }
+ try {
+ Calendar modified = info.getModificationDate();
+ addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
+ } catch (IOException e) {
+ // Invalid date format, just ignore
+ }
+
+ // All remaining metadata is custom
+ // Copy this over as-is
+ List<String> handledMetadata = Arrays.asList(new String[] {
+ "Author", "Creator", "CreationDate", "ModDate",
+ "Keywords", "Producer", "Subject", "Title", "Trapped"
+ });
+ for(COSName key : info.getDictionary().keySet()) {
+ String name = key.getName();
+ if(! handledMetadata.contains(name)) {
+ addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
+ }
+ }
+ }
+
+ private void addMetadata(Metadata metadata, String name, String value) {
+ if (value != null) {
+ metadata.add(name, value);
+ }
+ }
+
+ private void addMetadata(Metadata metadata, String name, Calendar value) {
+ if (value != null) {
+ metadata.set(name, value.getTime().toString());
+ }
+ }
+
+ private void addMetadata(Metadata metadata, Property property, Calendar value) {
+ if (value != null) {
+ metadata.set(property, value.getTime());
+ }
+ }
+
+ /**
+ * Used when processing custom metadata entries, as PDFBox won't do
+ * the conversion for us in the way it does for the standard ones
+ */
+ private void addMetadata(Metadata metadata, String name, COSBase value) {
+ if(value instanceof COSArray) {
+ for(COSBase v : ((COSArray)value).toList()) {
+ addMetadata(metadata, name, v);
+ }
+ } else if(value instanceof COSString) {
+ addMetadata(metadata, name, ((COSString)value).getString());
+ } else {
+ addMetadata(metadata, name, value.toString());
+ }
+ }
+}
Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/pdf/PDFParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml?rev=1043618&r1=1043617&r2=1043618&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml Wed Dec 8 19:19:14 2010
@@ -23,6 +23,11 @@
<parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.jackrabbit.core.query.pdf.PDFParser">
+ <!-- JCR-2838: Override the faulty PDF parser in Tika 0.8 -->
+ <mime>application/pdf</mime>
+ </parser>
+
<parser class="org.apache.tika.parser.EmptyParser">
<!-- Disable package extraction as it's too resource-intensive -->
<mime>application/x-archive</mime>