You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/08/28 16:30:01 UTC
[2/5] tika git commit: TIKA-2059 - Merge multimedia and pdf parser
modules and bundles
http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
deleted file mode 100644
index 775e590..0000000
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.pdf;
-
-import java.io.Serializable;
-
-import org.apache.tika.exception.AccessPermissionException;
-import org.apache.tika.metadata.AccessPermissions;
-import org.apache.tika.metadata.Metadata;
-
-/**
- * Decides, from access-permission metadata, whether a document's content may
- * be extracted: generally, or for accessibility purposes only.
- */
-public class AccessChecker implements Serializable {
-
- private static final long serialVersionUID = 6492570218190936986L;
-
- private final boolean needToCheck; // false: check() returns immediately without looking at the metadata
- private final boolean allowAccessibility; // true: the accessibility permission may override a general denial
-
- /**
- * This constructs an {@link AccessChecker} that
- * will not perform any checking and will always return without
- * throwing an exception.
- * <p>
- * This constructor is available to allow for Tika's legacy (&lt;= v1.7) behavior.
- */
- public AccessChecker() {
- needToCheck = false;
- allowAccessibility = true;
- }
-
- /**
- * This constructs an {@link AccessChecker} that will check
- * for whether or not content should be extracted from a document.
- *
- * @param allowExtractionForAccessibility if general extraction is not allowed, is extraction for accessibility allowed
- */
- public AccessChecker(boolean allowExtractionForAccessibility) {
- needToCheck = true;
- this.allowAccessibility = allowExtractionForAccessibility;
- }
-
- /**
- * Checks whether a document's content may be extracted, based on the metadata
- * values and the accessibility flag given to the constructor; returns normally if so.
- *
- * @param metadata metadata carrying the {@link AccessPermissions} values to test
- * @throws AccessPermissionException if access is not permitted
- */
- public void check(Metadata metadata) throws AccessPermissionException {
- if (!needToCheck) {
- return;
- }
- if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) { // only an explicit "false" denies; any other value passes
- if (allowAccessibility) {
- if ("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) { // explicit "true" required for the override
- return;
- }
- throw new AccessPermissionException("Content extraction for accessibility is not allowed.");
- }
- throw new AccessPermissionException("Content extraction is not allowed.");
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
deleted file mode 100644
index 3ad551d..0000000
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pdf;
-
-import java.io.IOException;
-import java.io.Writer;
-
-import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.text.PDFTextStripper;
-import org.apache.pdfbox.text.TextPosition;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-
-/**
- * Utility class that overrides the {@link PDFTextStripper} functionality
- * to integrate text extraction via OCR only.
- * The text-writing callbacks are no-ops; each page is handled by doOCROnCurrentPage().
- */
-class OCR2XHTML extends AbstractPDF2XHTML {
-
- private OCR2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
- PDFParserConfig config)
- throws IOException {
- super(document, handler, context, metadata, config);
- }
-
- /**
- * Converts the given PDF document (and related metadata) to XHTML SAX events sent to the handler.
- * @param document PDF document
- * @param handler SAX content handler
- * @param context parse context, handed to the superclass constructor
- * @param metadata PDF metadata
- * @param config PDF parser configuration, handed to the superclass constructor
- * @throws SAXException if the content handler fails to process SAX events
- * @throws TikaException on a non-SAX IOException, or if exceptions were collected (the first one is the cause)
- */
- public static void process(
- PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
- PDFParserConfig config)
- throws SAXException, TikaException {
- OCR2XHTML ocr2XHTML = null;
- try {
- ocr2XHTML = new OCR2XHTML(document, handler, context, metadata, config);
- ocr2XHTML.writeText(document, new Writer() { // Writer that discards everything written to it
- @Override
- public void write(char[] cbuf, int off, int len) {
- }
-
- @Override
- public void flush() {
- }
-
- @Override
- public void close() {
- }
- });
- } catch (IOException e) {
- if (e.getCause() instanceof SAXException) { // unwrap a SAXException that was tunnelled through an IOException
- throw (SAXException) e.getCause();
- } else {
- throw new TikaException("Unable to extract PDF content", e);
- }
- }
- if (ocr2XHTML.exceptions.size() > 0) {
- //throw the first
- throw new TikaException("Unable to extract all PDF content",
- ocr2XHTML.exceptions.get(0));
- }
- }
-
- @Override
- public void processPage(PDPage pdPage) throws IOException {
- try {
- startPage(pdPage);
- doOCROnCurrentPage();
- endPage(pdPage);
- } catch (TikaException |SAXException e) { // wrapped because this override may only throw IOException; process() inspects the cause
- throw new IOExceptionWithCause(e);
- } catch (IOException e) {
- handleCatchableIOE(e); // defined in the superclass (not shown here)
- }
- }
-
- @Override
- protected void writeString(String text) throws IOException {
- //no-op
- }
-
- @Override
- protected void writeCharacters(TextPosition text) throws IOException {
- //no-op
- }
-
- @Override
- protected void writeWordSeparator() throws IOException {
- //no-op
- }
-
- @Override
- protected void writeLineSeparator() throws IOException {
- //no-op
- }
-
-}
-
http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
deleted file mode 100644
index ac9823e..0000000
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ /dev/null
@@ -1,339 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pdf;
-
-import java.awt.image.BufferedImage;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Writer;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.pdfbox.cos.COSBase;
-import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.cos.COSStream;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.PDResources;
-import org.apache.pdfbox.pdmodel.graphics.PDXObject;
-import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
-import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
-import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
-import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
-import org.apache.pdfbox.text.PDFTextStripper;
-import org.apache.pdfbox.text.TextPosition;
-import org.apache.pdfbox.tools.imageio.ImageIOUtil;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.EmbeddedContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Utility class that overrides the {@link PDFTextStripper} functionality
- * to produce semi-structured XHTML SAX events instead of a plain text
- * stream.
- */
-class PDF2XHTML extends AbstractPDF2XHTML {
-
-
- private static final List<String> JPEG = Arrays.asList(
- COSName.DCT_DECODE.getName(),
- COSName.DCT_DECODE_ABBREVIATION.getName()); // passed to createInputStream() in writeToBuffer
-
- /**
- * Maps the {@link COSStream} of each inline image that has been
- * processed to the image number it was assigned.
- * If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly()}
- * is true, this will be checked before extracting an embedded image.
- * The integer is the value of inlineImageCounter assigned to that image.
- * This integer is used to identify images in the markup.
- * <p>
- * This is used across the document. To avoid infinite recursion
- * (TIKA-1742), each stream is visited at most once per page.
- */
- private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
- private int inlineImageCounter = 0;
- private PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
- PDFParserConfig config)
- throws IOException {
- super(document, handler, context, metadata, config);
- }
-
- /**
- * Converts the given PDF document (and related metadata) to XHTML SAX events sent to the handler.
- * @param document PDF document
- * @param handler SAX content handler
- * @param context parse context, handed to the superclass constructor
- * @param metadata PDF metadata
- * @param config PDF parser configuration; also applied to the converter via configure()
- * @throws SAXException if the content handler fails to process SAX events
- * @throws TikaException on a non-SAX IOException, or if exceptions were collected (the first one is the cause)
- */
- public static void process(
- PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
- PDFParserConfig config)
- throws SAXException, TikaException {
- PDF2XHTML pdf2XHTML = null;
- try {
- // Extract text using a dummy Writer as we override the
- // key methods to output to the given content
- // handler.
- pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config);
-
- config.configure(pdf2XHTML);
-
- pdf2XHTML.writeText(document, new Writer() {
- @Override
- public void write(char[] cbuf, int off, int len) {
- }
-
- @Override
- public void flush() {
- }
-
- @Override
- public void close() {
- }
- });
-
- } catch (IOException e) {
- if (e.getCause() instanceof SAXException) { // unwrap a SAXException that was tunnelled through an IOException
- throw (SAXException) e.getCause();
- } else {
- throw new TikaException("Unable to extract PDF content", e);
- }
- }
- if (pdf2XHTML.exceptions.size() > 0) {
- //throw the first
- throw new TikaException("Unable to extract all PDF content",
- pdf2XHTML.exceptions.get(0));
- }
- }
-
-
- @Override
- public void processPage(PDPage page) throws IOException {
- try {
- super.processPage(page);
- } catch (IOException e) {
- handleCatchableIOE(e); // defined in the superclass (not shown here)
- }
- }
-
- @Override
- protected void endPage(PDPage page) throws IOException {
- try {
- writeParagraphEnd();
- try {
- extractImages(page.getResources(), new HashSet<COSBase>()); // fresh "seen" set for every page
- } catch (IOException e) {
- handleCatchableIOE(e);
- }
- super.endPage(page);
- } catch (SAXException e) {
- throw new IOExceptionWithCause("Unable to end a page", e);
- } catch (IOException e) {
- exceptions.add(e); // recorded rather than rethrown; process() reports the first recorded exception
- }
- }
-
- private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException { // emits an img tag per image XObject and hands the bytes to the embedded extractor
- if (resources == null || config.getExtractInlineImages() == false) {
- return;
- }
-
- for (COSName name : resources.getXObjectNames()) {
-
- PDXObject object = resources.getXObject(name);
- if (object == null) {
- continue;
- }
- COSStream cosStream = object.getCOSObject();
- if (seenThisPage.contains(cosStream)) {
- //avoid infinite recursion TIKA-1742
- continue;
- }
- seenThisPage.add(cosStream);
-
- if (object instanceof PDFormXObject) {
- extractImages(((PDFormXObject) object).getResources(), seenThisPage); // recurse into the form's own resources
- } else if (object instanceof PDImageXObject) {
-
- PDImageXObject image = (PDImageXObject) object;
-
- Metadata metadata = new Metadata();
- String extension = image.getSuffix();
- if (extension == null) {
- metadata.set(Metadata.CONTENT_TYPE, "image/png");
- extension = "png";
- } else if (extension.equals("jpg")) {
- metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
- } else if (extension.equals("tiff")) {
- metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
- extension = "tif";
- } else {
- //TODO: determine if we need to add more image types
- //throw new RuntimeException("EXTEN:" + extension);
- }
-
- Integer imageNumber = processedInlineImages.get(cosStream); // reuse the number of an already-recorded stream
- if (imageNumber == null) {
- imageNumber = inlineImageCounter++;
- }
- String fileName = "image" + imageNumber + "."+extension;
- metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
-
- // Output the img tag
- AttributesImpl attr = new AttributesImpl();
- attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
- attr.addAttribute("", "alt", "alt", "CDATA", fileName);
- xhtml.startElement("img", attr);
- xhtml.endElement("img");
-
- //Do we only want to process unique COSObject ids?
- //If so, have we already processed this one?
- if (config.getExtractUniqueInlineImagesOnly() == true) {
- if (processedInlineImages.containsKey(cosStream)) {
- continue; // the img tag above was still written; only the extraction is skipped
- }
- processedInlineImages.put(cosStream, imageNumber);
- }
-
- metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
- TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
-
- EmbeddedDocumentExtractor extractor =
- getEmbeddedDocumentExtractor();
- if (extractor.shouldParseEmbedded(metadata)) {
- ByteArrayOutputStream buffer = new ByteArrayOutputStream();
- try {
- //TODO: handle image.getMetadata()?
- writeToBuffer(image, extension, buffer);
- extractor.parseEmbedded(
- new ByteArrayInputStream(buffer.toByteArray()),
- new EmbeddedContentHandler(xhtml),
- metadata, false);
- } catch (IOException e) {
- handleCatchableIOE(e);
- }
- }
- }
- }
- }
-
- //nearly directly copied from PDFBox ExtractImages
- private void writeToBuffer(PDImageXObject pdImage, String suffix, OutputStream out)
- throws IOException {
-
- BufferedImage image = pdImage.getImage();
- if (image != null) { // when getImage() returns null nothing is written; out is only flushed
- if ("jpg".equals(suffix)) {
- String colorSpaceName = pdImage.getColorSpace().getName();
- //TODO: figure out if we want directJPEG as a configuration
- //previously: if (directJPeg || PDDeviceGray....
- if (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) ||
- PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName)) {
- // RGB or Gray colorspace: get and write the unmodifiedJPEG stream
- InputStream data = pdImage.getStream().createInputStream(JPEG);
- org.apache.pdfbox.io.IOUtils.copy(data, out); // NOTE(review): data is not closed if copy() throws
- org.apache.pdfbox.io.IOUtils.closeQuietly(data);
- } else {
- // for CMYK and other "unusual" colorspaces, the JPEG will be converted
- ImageIOUtil.writeImage(image, suffix, out);
- }
- } else {
- ImageIOUtil.writeImage(image, suffix, out);
- }
- }
- out.flush();
- }
-
- @Override
- protected void writeParagraphStart() throws IOException {
- super.writeParagraphStart();
- try {
- xhtml.startElement("p");
- } catch (SAXException e) {
- throw new IOExceptionWithCause("Unable to start a paragraph", e); // process() unwraps the SAXException cause
- }
- }
-
- @Override
- protected void writeParagraphEnd() throws IOException {
- super.writeParagraphEnd();
- try {
- xhtml.endElement("p");
- } catch (SAXException e) {
- throw new IOExceptionWithCause("Unable to end a paragraph", e);
- }
- }
-
- @Override
- protected void writeString(String text) throws IOException {
- try {
- xhtml.characters(text);
- } catch (SAXException e) {
- throw new IOExceptionWithCause(
- "Unable to write a string: " + text, e);
- }
- }
-
- @Override
- protected void writeCharacters(TextPosition text) throws IOException {
- try {
- xhtml.characters(text.getUnicode());
- } catch (SAXException e) {
- throw new IOExceptionWithCause(
- "Unable to write a character: " + text.getUnicode(), e);
- }
- }
-
- @Override
- protected void writeWordSeparator() throws IOException {
- try {
- xhtml.characters(getWordSeparator());
- } catch (SAXException e) {
- throw new IOExceptionWithCause(
- "Unable to write a space character", e);
- }
- }
-
- @Override
- protected void writeLineSeparator() throws IOException {
- try {
- xhtml.newline();
- } catch (SAXException e) {
- throw new IOExceptionWithCause(
- "Unable to write a newline character", e);
- }
- }
-
-}
-
http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java
deleted file mode 100644
index 057f833..0000000
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.pdf;
-
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.pdfbox.cos.COSString;
-import org.apache.pdfbox.io.RandomAccessBuffer;
-import org.apache.pdfbox.io.RandomAccessRead;
-import org.apache.pdfbox.pdfparser.COSParser;
-
-/**
- * In fairly rare cases, a PDF's XMP will contain a string that
- * has incorrectly been encoded with PDFEncoding: an octal for non-ascii and
- * ascii for ascii, e.g. "\376\377\000M\000i\000c\000r\000o\000s\000o\000f\000t\000"
- * <p>
- * This class can be used to decode those strings.
- * <p>
- * See TIKA-1678. Many thanks to Andrew Jackson for raising this issue
- * and Tilman Hausherr for the solution.
- * <p>
- * As of this writing, we are only handling strings that start with
- * an encoded BOM. Andrew Jackson found a handful of other examples (e.g.
- * this ISO-8859-7 string:
- * "Microsoft Word - \\323\\365\\354\\354\\345\\364\\357\\367\\336
- * \\364\\347\\362 PRAKSIS \\363\\364\\357")
- * that we aren't currently handling.
- */
-class PDFEncodedStringDecoder {
-
- private static final String[] PDF_ENCODING_BOMS = { // each entry is a literal backslash followed by octal digits
- "\\376\\377", //UTF-16BE
- "\\377\\376", //UTF-16LE
- "\\357\\273\\277"//UTF-8
- };
-
- /**
- * Does this string start with an octal-encoded UTF BOM?
- * Call this statically to determine if you should bother creating a new parser to parse it.
- * @param s the string to test; may be null
- * @return true if s has at least 8 characters and starts with one of the encoded BOMs
- */
- static boolean shouldDecode(String s) {
- if (s == null || s.length() < 8) { // 8 is the length of the shortest encoded BOM
- return false;
- }
- for (String BOM : PDF_ENCODING_BOMS) {
- if (s.startsWith(BOM)) {
- return true;
- }
- }
- return false;
- }
-
- /**
- * This assumes that {@link #shouldDecode(String)} has been called
- * and has returned true. If you run this on a non-octal encoded string,
- * disaster will happen!
- *
- * @param value the octal-escaped string to decode
- * @return the decoded string, or {@code value} unchanged if parsing fails
- */
- String decode(String value) {
- try {
- byte[] bytes = new String("(" + value + ")").getBytes(ISO_8859_1); // wrapped in parentheses before parsing; NOTE(review): new String(...) is redundant
- InputStream is = new ByteArrayInputStream(bytes);
- COSStringParser p = new COSStringParser(new RandomAccessBuffer(is));
- String parsed = p.myParseCOSString();
- if (parsed != null) {
- return parsed;
- }
- } catch (IOException e) {
- //oh well, we tried.
- }
- //just return value if something went wrong
- return value;
- }
-
- class COSStringParser extends COSParser { // exposes parseCOSString() through a wrapper that returns null on failure
-
- COSStringParser(RandomAccessRead buffer) throws IOException {
- super(buffer);
- }
-
- /**
- * Parses a COS string from the buffer given to the constructor.
- * @return parsed string or null if something went wrong.
- */
- String myParseCOSString() {
- try {
- COSString cosString = parseCOSString();
- if (cosString != null) {
- return cosString.getString();
- }
- } catch (IOException e) { // swallowed on purpose: decode() treats null as failure
- }
- return null;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
deleted file mode 100644
index f735f25..0000000
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ /dev/null
@@ -1,626 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pdf;
-
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.stream.XMLStreamException;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Calendar;
-import java.util.Collections;
-import java.util.List;
-import java.util.Locale;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.jempbox.xmp.XMPMetadata;
-import org.apache.jempbox.xmp.XMPSchema;
-import org.apache.jempbox.xmp.XMPSchemaDublinCore;
-import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId;
-import org.apache.pdfbox.cos.COSArray;
-import org.apache.pdfbox.cos.COSBase;
-import org.apache.pdfbox.cos.COSDictionary;
-import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.cos.COSString;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDDocumentInformation;
-import org.apache.pdfbox.pdmodel.common.PDMetadata;
-import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
-import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
-import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.AccessPermissions;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.PagedText;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.PasswordProvider;
-import org.apache.tika.parser.ocr.TesseractOCRParser;
-import org.apache.tika.parser.xmp.JempboxExtractor;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.w3c.dom.Document;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.ErrorHandler;
-import org.xml.sax.SAXException;
-
-/**
- * PDF parser.
- * <p>
- * This parser can also process encrypted PDF documents if the required
- * password is given as a part of the input metadata associated with a
- * document. If no password is given, then this parser will try decrypting
- * the document using the empty password that's often used with PDFs. If
- * the PDF contains any embedded documents (for example as part of a PDF
- * package) then this parser will use the {@link EmbeddedDocumentExtractor}
- * to handle them.
- * <p>
- * As of Tika 1.6, it is possible to extract inline images with
- * the {@link EmbeddedDocumentExtractor} as if they were regular
- * attachments. By default, this feature is turned off because of
- * the potentially enormous number and size of inline images. To
- * turn this feature on, see
- * {@link PDFParserConfig#setExtractInlineImages(boolean)}.
- */
-public class PDFParser extends AbstractParser {
-
-
- /**
- * Metadata key for giving the document password to the parser.
- *
- * @since Apache Tika 0.5
- * @deprecated Supply a {@link PasswordProvider} on the {@link ParseContext} instead
- */
- public static final String PASSWORD = "org.apache.tika.parser.pdf.password";
- private static final MediaType MEDIA_TYPE = MediaType.application("pdf");
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = -752276948656079347L;
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MEDIA_TYPE);
- private PDFParserConfig defaultConfig = new PDFParserConfig(); // used by parse() when the ParseContext supplies no PDFParserConfig
-
-
-
- public Set<MediaType> getSupportedTypes(ParseContext context) { // always returns SUPPORTED_TYPES; context is not consulted
- return SUPPORTED_TYPES;
- }
-
- public void parse( // loads the PDF, records metadata, checks access, then streams content to the handler
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
-
- PDDocument pdfDocument = null;
- TemporaryResources tmp = new TemporaryResources(); // NOTE(review): not referenced again in this method
- //config from context, or default if not set via context
- PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
- String password = "";
- try {
- // PDFBox can process entirely in memory, or can use a temp file
- // for unpacked / processed resources
- // Decide which to do based on if we're reading from a file or not already
- //TODO: make this configurable via MemoryUsageSetting
- TikaInputStream tstream = TikaInputStream.cast(stream);
- password = getPassword(metadata, context);
- if (tstream != null && tstream.hasFile()) {
- // File based -- send file directly to PDFBox
- pdfDocument = PDDocument.load(tstream.getPath().toFile(), password);
- } else {
- pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password); // presumably shielded so PDFBox cannot close the caller's stream
- }
- metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted()));
-
- metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
- extractMetadata(pdfDocument, metadata, context);
-
- AccessChecker checker = localConfig.getAccessChecker();
- checker.check(metadata); // runs after extractMetadata() has written the access permissions into metadata
- if (handler != null) { // with a null handler only the metadata is populated
- if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
- handleXFAOnly(pdfDocument, handler, metadata, context);
- } else if (localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
- metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
- OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
- } else {
- if (localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
- metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
- }
- PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
- }
-
- }
- } catch (InvalidPasswordException e) {
- metadata.set("pdf:encrypted", "true"); // the load failed, so the flag set inside the try block was never reached
- throw new EncryptedDocumentException(e);
- } finally {
- if (pdfDocument != null) {
- pdfDocument.close();
- }
- }
- }
-
- private String getPassword(Metadata metadata, ParseContext context) { // never returns null: provider value, else PASSWORD metadata, else ""
- String password = null;
-
- // Did they supply a new style Password Provider?
- PasswordProvider passwordProvider = context.get(PasswordProvider.class);
- if (passwordProvider != null) {
- password = passwordProvider.getPassword(metadata);
- }
-
- // Fall back on the old style metadata if set
- if (password == null && metadata.get(PASSWORD) != null) { // also reached when a provider exists but returned null
- password = metadata.get(PASSWORD);
- }
-
- // If no password is given, use an empty string as the default
- if (password == null) {
- password = "";
- }
- return password;
- }
-
-
- private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context)
- throws TikaException {
-
- //first extract AccessPermissions
- AccessPermission ap = document.getCurrentAccessPermission();
- metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
- Boolean.toString(ap.canExtractForAccessibility()));
- metadata.set(AccessPermissions.EXTRACT_CONTENT,
- Boolean.toString(ap.canExtractContent()));
- metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT,
- Boolean.toString(ap.canAssembleDocument()));
- metadata.set(AccessPermissions.FILL_IN_FORM,
- Boolean.toString(ap.canFillInForm()));
- metadata.set(AccessPermissions.CAN_MODIFY,
- Boolean.toString(ap.canModify()));
- metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS,
- Boolean.toString(ap.canModifyAnnotations()));
- metadata.set(AccessPermissions.CAN_PRINT,
- Boolean.toString(ap.canPrint()));
- metadata.set(AccessPermissions.CAN_PRINT_DEGRADED,
- Boolean.toString(ap.canPrintDegraded()));
-
-
- //now go for the XMP
- Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), context);
-
- XMPMetadata xmp = null;
- if (dom != null) {
- xmp = new XMPMetadata(dom);
- }
- XMPSchemaDublinCore dcSchema = null;
- try {
- if (document.getDocumentCatalog().getMetadata() != null) {
- InputStream xmpIs = document.getDocumentCatalog().getMetadata().exportXMPMetadata();
- xmp = XMPMetadata.load(xmpIs);
- }
- } catch (IOException e) {}
-
- if (xmp != null) {
- try {
- dcSchema = xmp.getDublinCoreSchema();
- } catch (IOException e) {}
-
- JempboxExtractor.extractXMPMM(xmp, metadata);
- }
-
- PDDocumentInformation info = document.getDocumentInformation();
- metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
- extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
- extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
- extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
- addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
- addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
- addMetadata(metadata, "producer", info.getProducer());
- extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
-
- // TODO: Move to description in Tika 2.0
- addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
- addMetadata(metadata, "trapped", info.getTrapped());
- // TODO Remove these in Tika 2.0
- addMetadata(metadata, "created", info.getCreationDate());
- addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
- Calendar modified = info.getModificationDate();
- addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
- addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
-
- // All remaining metadata is custom
- // Copy this over as-is
- List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate",
- "Keywords", "Producer", "Subject", "Title", "Trapped");
- for (COSName key : info.getCOSObject().keySet()) {
- String name = key.getName();
- if (!handledMetadata.contains(name)) {
- addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
- }
- }
-
- //try to get the various versions
- //Caveats:
- // there is currently a fair amount of redundancy
- // TikaCoreProperties.FORMAT can be multivalued
- // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
- metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
- metadata.add(TikaCoreProperties.FORMAT.getName(),
- MEDIA_TYPE.toString() + "; version=" +
- Float.toString(document.getDocument().getVersion()));
-
- try {
- if (xmp != null) {
- xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
- XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
- if (pdfaxmp != null) {
- if (pdfaxmp.getPart() != null) {
- metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
- }
- if (pdfaxmp.getConformance() != null) {
- metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
- String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
- metadata.set("pdfa:PDFVersion", version);
- metadata.add(TikaCoreProperties.FORMAT.getName(),
- MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
- }
- }
- // TODO WARN if this XMP version is inconsistent with document header version?
- }
- } catch (IOException e) {
- metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
- }
- //TODO: Let's try to move this into PDFBox.
- //Attempt to determine Adobe extension level, if present:
- COSDictionary root = document.getDocumentCatalog().getCOSObject();
- COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
- if (extensions != null) {
- for (COSName extName : extensions.keySet()) {
- // If it's an Adobe one, interpret it to determine the extension level:
- if (extName.equals(COSName.getPDFName("ADBE"))) {
- COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
- if (adobeExt != null) {
- String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
- int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
- //-1 is sentinel value that something went wrong in getInt
- if (el != -1) {
- metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el);
- metadata.add(TikaCoreProperties.FORMAT.getName(),
- MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\"");
- }
- }
- } else {
- // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
- metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
- }
- }
- }
- }
-
-    /**
-     * Adds the language alternatives of {@code property} found in the XMP
-     * schema, followed by the PDFBox baseline value.
-     * <p>
-     * The lookup uses {@code property.getName()} as the XMP name. A language
-     * value equal to the baseline is skipped inside the loop because the
-     * baseline is added afterwards. For a property that does not permit
-     * multiple values, processing stops after the first value added
-     * (see TIKA-1295).
-     *
-     * @param metadata       metadata object to add values to
-     * @param property       property to populate
-     * @param pdfBoxBaseline value reported by PDFBox; may be {@code null} or empty
-     * @param schema         XMP schema to read from; may be {@code null}
-     */
-    private void extractMultilingualItems(Metadata metadata, Property property,
-                                          String pdfBoxBaseline, XMPSchema schema) {
-        boolean hasBaseline = pdfBoxBaseline != null && pdfBoxBaseline.length() > 0;
-
-        // Without a schema only the baseline can contribute
-        if (schema == null) {
-            if (hasBaseline) {
-                addMetadata(metadata, property, pdfBoxBaseline);
-            }
-            return;
-        }
-
-        String xmpName = property.getName();
-        for (String lang : schema.getLanguagePropertyLanguages(xmpName)) {
-            String value = schema.getLanguageProperty(xmpName, lang);
-            if (value == null || value.length() == 0) {
-                continue;
-            }
-            // The baseline itself is added after the loop; don't add it twice
-            if (value.equals(pdfBoxBaseline)) {
-                continue;
-            }
-            addMetadata(metadata, property, value);
-            if (!property.isMultiValuePermitted()) {
-                return;
-            }
-        }
-
-        if (!hasBaseline) {
-            return;
-        }
-        // A single-valued property that already holds a value is left alone
-        if (!property.isMultiValuePermitted() && metadata.get(property) != null) {
-            return;
-        }
-        addMetadata(metadata, property, pdfBoxBaseline);
-    }
-
-
-    /**
-     * Reads the bag/sequence list stored under {@code property.getName()} in
-     * the Dublin Core schema and adds its items, followed by the PDFBox
-     * baseline value (for example the result of
-     * {@code PDDocumentInformation.getAuthor()}).
-     * <p>
-     * Items equal to the baseline are skipped so that the baseline is not
-     * duplicated. When no baseline is supplied, every XMP item is added;
-     * previously the items were silently dropped in that case, so a call
-     * with a {@code null} baseline could never produce a value.
-     * <p>
-     * Until PDFBOX-1803/TIKA-1233 are fixed, do not call this on dates!
-     *
-     * @param metadata       metadata object to add values to
-     * @param property       property to populate; its name must be Dublin Core compliant
-     * @param pdfBoxBaseline value reported by PDFBox; may be {@code null} or empty
-     * @param dc             Dublin Core schema; may be {@code null}
-     */
-    private void extractDublinCoreListItems(Metadata metadata, Property property,
-                                            String pdfBoxBaseline, XMPSchemaDublinCore dc) {
-        boolean hasBaseline = pdfBoxBaseline != null && pdfBoxBaseline.length() > 0;
-
-        List<String> items = (dc == null) ? null : getXMPBagOrSeqList(dc, property.getName());
-        if (items != null) {
-            for (String item : items) {
-                // Skip only the item that duplicates the baseline added below
-                if (pdfBoxBaseline == null || !pdfBoxBaseline.equals(item)) {
-                    addMetadata(metadata, property, item);
-                }
-            }
-        }
-
-        //finally, add the baseline
-        if (hasBaseline) {
-            addMetadata(metadata, property, pdfBoxBaseline);
-        }
-    }
-
-    /**
-     * Looks up {@code name} first as a bag list and, if that yields
-     * {@code null}, as a sequence list.
-     * <p>
-     * XMP data can store some attributes as either a bag or a sequence,
-     * while JempBox expects one specific form per attribute, so both are tried.
-     *
-     * @param schema schema to query
-     * @param name   attribute name
-     * @return list of values, or {@code null} if neither lookup returns one
-     */
-    private List<String> getXMPBagOrSeqList(XMPSchema schema, String name) {
-        List<String> bag = schema.getBagList(name);
-        return bag != null ? bag : schema.getSequenceList(name);
-    }
-
-    /**
-     * Adds a decoded string value for {@code property}.
-     * <p>
-     * {@code null} values are ignored. If the property does not permit
-     * multiple values and already has one, the new value is silently skipped.
-     */
-    private void addMetadata(Metadata metadata, Property property, String value) {
-        if (value == null) {
-            return;
-        }
-        String decoded = decode(value);
-        // Single-valued property that is already set: keep the existing value
-        if (!property.isMultiValuePermitted() && metadata.get(property) != null) {
-            return;
-        }
-        metadata.add(property, decoded);
-    }
-
-    /**
-     * Adds a decoded string value under the given name; {@code null} values are ignored.
-     */
-    private void addMetadata(Metadata metadata, String name, String value) {
-        if (value == null) {
-            return;
-        }
-        metadata.add(name, decode(value));
-    }
-
-    /**
-     * Runs {@code value} through a {@link PDFEncodedStringDecoder} when
-     * {@link PDFEncodedStringDecoder#shouldDecode(String)} says so;
-     * otherwise returns it unchanged.
-     */
-    private String decode(String value) {
-        if (!PDFEncodedStringDecoder.shouldDecode(value)) {
-            return value;
-        }
-        return new PDFEncodedStringDecoder().decode(value);
-    }
-
-    /**
-     * Sets {@code name} to the {@code toString()} form of the calendar's
-     * {@link java.util.Date}; {@code null} values are ignored.
-     */
-    private void addMetadata(Metadata metadata, String name, Calendar value) {
-        if (value == null) {
-            return;
-        }
-        metadata.set(name, value.getTime().toString());
-    }
-
-    /**
-     * Sets {@code property} to the calendar's {@link java.util.Date};
-     * {@code null} values are ignored.
-     */
-    private void addMetadata(Metadata metadata, Property property, Calendar value) {
-        if (value == null) {
-            return;
-        }
-        metadata.set(property, value.getTime());
-    }
-
-    /**
-     * Adds a custom metadata entry from a raw COS value. PDFBox does not
-     * convert custom entries the way it does the standard ones.
-     * <p>
-     * Arrays are expanded element by element, strings are added via
-     * {@code getString()}, dictionaries and {@code null} are ignored, and
-     * any other value is added using its {@code toString()} form.
-     */
-    private void addMetadata(Metadata metadata, String name, COSBase value) {
-        if (value == null) {
-            return;
-        }
-        if (value instanceof COSArray) {
-            for (Object element : ((COSArray) value).toList()) {
-                addMetadata(metadata, name, ((COSBase) element));
-            }
-            return;
-        }
-        if (value instanceof COSString) {
-            addMetadata(metadata, name, ((COSString) value).getString());
-            return;
-        }
-        // Avoid calling COSDictionary#toString, since it can lead to infinite
-        // recursion. See TIKA-1038 and PDFBOX-1835.
-        if (!(value instanceof COSDictionary)) {
-            addMetadata(metadata, name, value.toString());
-        }
-    }
-
-
-    /**
-     * Tells whether only the XFA content should be processed: the config
-     * requests it and the document has a catalog, an AcroForm and an XFA entry.
-     */
-    private boolean shouldHandleXFAOnly(PDDocument pdDocument, PDFParserConfig config) {
-        return config.getIfXFAExtractOnlyXFA()
-                && pdDocument.getDocumentCatalog() != null
-                && pdDocument.getDocumentCatalog().getAcroForm() != null
-                && pdDocument.getDocumentCatalog().getAcroForm().getXFA() != null;
-    }
-
-    /**
-     * Emits an XHTML document whose content comes from the XFA bytes of the
-     * document's AcroForm, extracted with an {@link XFAExtractor}.
-     *
-     * @throws TikaException if the extractor reports an {@link XMLStreamException}
-     */
-    private void handleXFAOnly(PDDocument pdDocument, ContentHandler handler,
-                               Metadata metadata, ParseContext context)
-            throws SAXException, IOException, TikaException {
-        XFAExtractor extractor = new XFAExtractor();
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-        try (InputStream xfaStream = new ByteArrayInputStream(
-                pdDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes())) {
-            extractor.extract(xfaStream, xhtml, metadata, context);
-        } catch (XMLStreamException e) {
-            throw new TikaException("XML error in XFA", e);
-        }
-        xhtml.endDocument();
-    }
-
-    /**
-     * @return the default configuration held by this parser
-     */
-    public PDFParserConfig getPDFParserConfig() {
-        return this.defaultConfig;
-    }
-
-    /**
-     * Replaces the default configuration held by this parser.
-     *
-     * @param config the new default configuration
-     */
-    public void setPDFParserConfig(PDFParserConfig config) {
-        defaultConfig = config;
-    }
-
-    /**
-     * Delegates to the default config's {@code getEnableAutoSpace()}.
-     *
-     * @return the value reported by the default config
-     * @see #setEnableAutoSpace(boolean)
-     * @deprecated use {@link #getPDFParserConfig()}
-     */
-    @Deprecated
-    public boolean getEnableAutoSpace() {
-        return defaultConfig.getEnableAutoSpace();
-    }
-
-    /**
-     * If true (the default), the parser should estimate
-     * where spaces should be inserted between words. For
-     * many PDFs this is necessary as they do not include
-     * explicit whitespace characters.
-     * <p>
-     * Delegates to the default config.
-     *
-     * @param v the new setting
-     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
-     */
-    @Deprecated
-    public void setEnableAutoSpace(boolean v) {
-        defaultConfig.setEnableAutoSpace(v);
-    }
-
-    /**
-     * If true, text in annotations will be extracted.
-     * Delegates to the default config.
-     *
-     * @return the value reported by the default config
-     * @deprecated use {@link #getPDFParserConfig()}
-     */
-    @Deprecated
-    public boolean getExtractAnnotationText() {
-        return defaultConfig.getExtractAnnotationText();
-    }
-
-    /**
-     * If true (the default), text in annotations will be
-     * extracted. Delegates to the default config.
-     *
-     * @param v the new setting
-     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
-     */
-    @Deprecated
-    public void setExtractAnnotationText(boolean v) {
-        defaultConfig.setExtractAnnotationText(v);
-    }
-
-    /**
-     * Delegates to the default config's {@code getSuppressDuplicateOverlappingText()}.
-     *
-     * @return the value reported by the default config
-     * @see #setSuppressDuplicateOverlappingText(boolean)
-     * @deprecated use {@link #getPDFParserConfig()}
-     */
-    @Deprecated
-    public boolean getSuppressDuplicateOverlappingText() {
-        return defaultConfig.getSuppressDuplicateOverlappingText();
-    }
-
-    /**
-     * If true, the parser should try to remove duplicated
-     * text over the same region. This is needed for some
-     * PDFs that achieve bolding by re-writing the same
-     * text in the same area. Note that this can
-     * slow down extraction substantially (PDFBOX-956) and
-     * sometimes remove characters that were not in fact
-     * duplicated (PDFBOX-1155). By default this is disabled.
-     * <p>
-     * Delegates to the default config.
-     *
-     * @param v the new setting
-     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
-     */
-    @Deprecated
-    public void setSuppressDuplicateOverlappingText(boolean v) {
-        defaultConfig.setSuppressDuplicateOverlappingText(v);
-    }
-
-    /**
-     * Delegates to the default config's {@code getSortByPosition()}.
-     *
-     * @return the value reported by the default config
-     * @see #setSortByPosition(boolean)
-     * @deprecated use {@link #getPDFParserConfig()}
-     */
-    @Deprecated
-    public boolean getSortByPosition() {
-        return defaultConfig.getSortByPosition();
-    }
-
-    /**
-     * If true, sort text tokens by their x/y position
-     * before extracting text. This may be necessary for
-     * some PDFs (if the text tokens are not rendered "in
-     * order"), while for other PDFs it can produce the
-     * wrong result (for example if there are 2 columns,
-     * the text will be interleaved). Default is false.
-     * <p>
-     * Delegates to the default config.
-     *
-     * @param v the new setting
-     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
-     */
-    @Deprecated
-    public void setSortByPosition(boolean v) {
-        defaultConfig.setSortByPosition(v);
-    }
-
-
-    /**
-     * Parses the exported XMP packet into a DOM using the document builder
-     * obtained from the parse context.
-     *
-     * @return the parsed document, or {@code null} when {@code pdMetadata} is
-     *         {@code null} or an IOException, SAXException or TikaException occurs
-     */
-    private Document loadDOM(PDMetadata pdMetadata, ParseContext context) {
-        if (pdMetadata == null) {
-            return null;
-        }
-        try (InputStream xmpStream = pdMetadata.exportXMPMetadata()) {
-            DocumentBuilder builder = context.getDocumentBuilder();
-            builder.setErrorHandler((ErrorHandler) null);
-            return builder.parse(xmpStream);
-        } catch (IOException | SAXException | TikaException e) {
-            // deliberately ignored: the caller treats null as "no XMP available"
-        }
-        return null;
-    }
-
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
deleted file mode 100644
index 296b191..0000000
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ /dev/null
@@ -1,614 +0,0 @@
-package org.apache.tika.parser.pdf;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Serializable;
-import java.util.Locale;
-import java.util.Properties;
-
-import org.apache.pdfbox.rendering.ImageType;
-import org.apache.pdfbox.text.PDFTextStripper;
-
-/**
- * Config for PDFParser.
- * <p/>
- * This allows parameters to be set programmatically:
- * <ol>
- * <li>Calls to PDFParser, i.e. parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li>
- * <li>Constructor of PDFParser</li>
- * <li>Passing to PDFParser through a ParseContext: context.set(PDFParserConfig.class, config);</li>
- * </ol>
- * <p/>
- * Parameters can also be set by modifying the PDFParser.properties file,
- * which lives in the expected places, in trunk:
- * tika-parsers/src/main/resources/org/apache/tika/parser/pdf
- * <p/>
- * Or, in tika-app-x.x.jar or tika-parsers-x.x.jar:
- * org/apache/tika/parser/pdf
- */
-public class PDFParserConfig implements Serializable {
-
- public enum OCR_STRATEGY {
- NO_OCR,
- OCR_ONLY,
- OCR_AND_TEXT_EXTRACTION;
-
- private static OCR_STRATEGY parse(String s) {
- if (s == null) {
- return NO_OCR;
- } else if ("no_ocr".equals(s.toLowerCase(Locale.ROOT))) {
- return NO_OCR;
- } else if ("ocr_only".equals(s.toLowerCase(Locale.ROOT))) {
- return OCR_ONLY;
- } else if (s.toLowerCase(Locale.ROOT).contains("ocr_and_text")) {
- return OCR_AND_TEXT_EXTRACTION;
- }
- //default -- no ocr
- return NO_OCR;
- }
- }
-
- private static final long serialVersionUID = 6492570218190936986L;
-
- // True if we let PDFBox "guess" where spaces should go:
- private boolean enableAutoSpace = true;
-
- // True if we let PDFBox remove duplicate overlapping text:
- private boolean suppressDuplicateOverlappingText;
-
- // True if we extract annotation text ourselves
- // (workaround for PDFBOX-1143):
- private boolean extractAnnotationText = true;
-
- // True if we should sort text tokens by position
- // (necessary for some PDFs, but messes up other PDFs):
- private boolean sortByPosition = false;
-
- //True if acroform content should be extracted
- private boolean extractAcroFormContent = true;
-
- //True if inline PDXImage objects should be extracted
- private boolean extractInlineImages = false;
-
- //True if inline images (as identified by their object id within
- //a pdf file) should only be extracted once.
- private boolean extractUniqueInlineImagesOnly = true;
-
- //The character width-based tolerance value used to estimate where spaces in text should be added
- private Float averageCharTolerance;
-
- //The space width-based tolerance value used to estimate where spaces in text should be added
- private Float spacingTolerance;
-
- //If the PDF has an XFA element, process only that and skip extracting
- //content from elsewhere in the document.
- private boolean ifXFAExtractOnlyXFA = false;
-
- private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.NO_OCR;
-
- private int ocrDPI = 200;
- private ImageType ocrImageType = ImageType.GRAY;
- private String ocrImageFormatName = "png";
-
- private AccessChecker accessChecker;
-
- //The PDFParser can throw IOExceptions if there is a problem
- //with a stream. If this is set to true, Tika's
- //parser catches these exceptions, reports them in the metadata
- //and then throws the first stored exception after the parse has completed.
- private boolean isCatchIntermediateIOExceptions = true;
-
-    /**
-     * Creates a config initialised from the {@code PDFParser.properties}
-     * resource resolved relative to this object's runtime class.
-     */
-    public PDFParserConfig() {
-        init(getClass().getResourceAsStream("PDFParser.properties"));
-    }
-
- /**
- * Creates a config from the properties read from the given stream and then
- * tries to close the stream.
- * If loading fails with an IOException, the exception is silently swallowed
- * and settings that could not be read are taken from the defaults.
- *
- * @param is stream of properties; passed straight to {@code init}
- */
- public PDFParserConfig(InputStream is) {
- init(is);
- }
-
-    /**
-     * Applies the settings found in the given properties stream and closes
-     * the stream.
-     * <p>
-     * Settings that are missing from the stream keep their current values.
-     * An {@link IOException} while loading is swallowed on purpose: whatever
-     * was read is applied and the rest stays at its default. The access
-     * checker is always initialised, even when {@code is} is {@code null}.
-     *
-     * @param is properties stream; may be {@code null}
-     * @throws NumberFormatException if {@code ocrDPI} is present but not an integer
-     */
-    private void init(InputStream is) {
-
-        if (is == null) {
-            // No properties available: keep the field defaults, but make sure
-            // callers of getAccessChecker() never receive null.
-            accessChecker = new AccessChecker();
-            return;
-        }
-        Properties props = new Properties();
-        try (InputStream in = is) {
-            props.load(in);
-        } catch (IOException e) {
-            // best effort: continue with whatever was loaded
-        }
-        setEnableAutoSpace(
-                getBooleanProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace()));
-        setSuppressDuplicateOverlappingText(
-                getBooleanProp(props.getProperty("suppressDuplicateOverlappingText"),
-                        getSuppressDuplicateOverlappingText()));
-        setExtractAnnotationText(
-                getBooleanProp(props.getProperty("extractAnnotationText"),
-                        getExtractAnnotationText()));
-        setSortByPosition(
-                getBooleanProp(props.getProperty("sortByPosition"),
-                        getSortByPosition()));
-        setExtractAcroFormContent(
-                getBooleanProp(props.getProperty("extractAcroFormContent"),
-                        getExtractAcroFormContent()));
-        setExtractInlineImages(
-                getBooleanProp(props.getProperty("extractInlineImages"),
-                        getExtractInlineImages()));
-        setExtractUniqueInlineImagesOnly(
-                getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"),
-                        getExtractUniqueInlineImagesOnly()));
-
-        setIfXFAExtractOnlyXFA(
-                getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"),
-                        getIfXFAExtractOnlyXFA()));
-
-        setCatchIntermediateIOExceptions(
-                getBooleanProp(props.getProperty("catchIntermediateIOExceptions"),
-                        isCatchIntermediateIOExceptions()));
-
-        setOCRStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy")));
-
-        setOCRDPI(getIntProp(props.getProperty("ocrDPI"), getOCRDPI()));
-
-        // Keep the default format name when the property is absent instead
-        // of overwriting it with null.
-        String formatName = props.getProperty("ocrImageFormatName");
-        if (formatName != null) {
-            setOCRImageFormatName(formatName);
-        }
-
-        // Keep the default image type when the property is absent or does not
-        // name a known type; passing null on used to raise a NullPointerException.
-        String imageTypeName = props.getProperty("ocrImageType");
-        if (imageTypeName != null) {
-            ImageType parsedType = parseImageType(imageTypeName);
-            if (parsedType != null) {
-                setOCRImageType(parsedType);
-            }
-        }
-
-        boolean checkExtractAccessPermission =
-                getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
-        boolean allowExtractionForAccessibility =
-                getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
-
-        if (!checkExtractAccessPermission) {
-            //silently ignore the crazy configuration of checkExtractAccessPermission = false,
-            //but allowExtractionForAccessibility=false
-            accessChecker = new AccessChecker();
-        } else {
-            accessChecker = new AccessChecker(allowExtractionForAccessibility);
-        }
-    }
-
- /**
- * Configures the given pdf2XHTML.
- *
- * @param pdf2XHTML
- */
- public void configure(PDF2XHTML pdf2XHTML) {
- pdf2XHTML.setSortByPosition(getSortByPosition());
- if (getEnableAutoSpace()) {
- pdf2XHTML.setWordSeparator(" ");
- } else {
- pdf2XHTML.setWordSeparator("");
- }
- if (getAverageCharTolerance() != null) {
- pdf2XHTML.setAverageCharTolerance(getAverageCharTolerance());
- }
- if (getSpacingTolerance() != null) {
- pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
- }
- pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
- }
-
-    /**
-     * @return whether AcroForm content should be extracted
-     * @see #setExtractAcroFormContent(boolean)
-     */
-    public boolean getExtractAcroFormContent() {
-        return this.extractAcroFormContent;
-    }
-
- /**
- * If true (the default), extract content from AcroForms
- * at the end of the document. If an XFA is found,
- * try to process that, otherwise, process the AcroForm.
- *
- * @param extractAcroFormContent whether AcroForm content should be extracted
- */
- public void setExtractAcroFormContent(boolean extractAcroFormContent) {
- this.extractAcroFormContent = extractAcroFormContent;
-
- }
-
-    /**
-     * @return whether only the XFA content is processed when a document has XFA data
-     * @see #setIfXFAExtractOnlyXFA(boolean)
-     */
-    public boolean getIfXFAExtractOnlyXFA() {
-        return this.ifXFAExtractOnlyXFA;
-    }
-
- /**
- * If false (the default), extract content from the full PDF
- * as well as the XFA form. This will likely lead to some duplicative
- * content.
- *
- * @param ifXFAExtractOnlyXFA whether to process only the XFA content when present
- */
- public void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) {
- this.ifXFAExtractOnlyXFA = ifXFAExtractOnlyXFA;
- }
-
-
-    /**
-     * @return whether inline images should be extracted
-     * @see #setExtractInlineImages(boolean)
-     */
-    public boolean getExtractInlineImages() {
-        return this.extractInlineImages;
-    }
-
- /**
- * If true, extract inline embedded images (PDXImage objects).
- * <b>Beware:</b> some PDF documents of modest size (~4MB) can contain
- * thousands of embedded images totaling &gt; 2.5 GB. Also, at least as of PDFBox 1.8.5,
- * there can be surprisingly large memory consumption and/or out of memory errors.
- * Set to <code>true</code> with caution.
- * <p/>
- * The default is <code>false</code>.
- * <p/>
- * See also: {@link #setExtractUniqueInlineImagesOnly(boolean)}.
- *
- * @param extractInlineImages whether inline images should be extracted
- */
- public void setExtractInlineImages(boolean extractInlineImages) {
- this.extractInlineImages = extractInlineImages;
- }
-
-    /**
-     * @return whether each inline image should be extracted only once per document
-     * @see #setExtractUniqueInlineImagesOnly(boolean)
-     */
-    public boolean getExtractUniqueInlineImagesOnly() {
-        return this.extractUniqueInlineImagesOnly;
-    }
-
- /**
- * Multiple pages within a PDF file might refer to the same underlying image.
- * If {@link #extractUniqueInlineImagesOnly} is set to <code>false</code>, the
- * parser will call the EmbeddedExtractor each time the image appears on a page.
- * This might be desired for some use cases. However, to avoid duplication of
- * extracted images, set this to <code>true</code>. The default is <code>true</code>.
- * <p/>
- * Note that uniqueness is determined only by the underlying PDF COSObject id, not by
- * file hash or similar equality metric.
- * If the PDF actually contains multiple copies of the same image
- * -- all with different object ids -- then all images will be extracted.
- * <p/>
- * For this parameter to have any effect, {@link #extractInlineImages} must be
- * set to <code>true</code>.
- * <p>
- * Because of TIKA-1742 -- to avoid infinite recursion -- no matter the setting
- * of this parameter, the extractor will only pull out one copy of each image per
- * page. This parameter tries to capture uniqueness across the entire document.
- *
- * @param extractUniqueInlineImagesOnly whether to extract each image only once per document
- */
- public void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) {
- this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly;
-
- }
-
-    /**
-     * @return whether the parser should estimate where spaces belong between words
-     * @see #setEnableAutoSpace(boolean)
-     */
-    public boolean getEnableAutoSpace() {
-        return this.enableAutoSpace;
-    }
-
- /**
- * If true (the default), the parser should estimate
- * where spaces should be inserted between words. For
- * many PDFs this is necessary as they do not include
- * explicit whitespace characters.
- *
- * @param enableAutoSpace whether to estimate word spacing
- */
- public void setEnableAutoSpace(boolean enableAutoSpace) {
- this.enableAutoSpace = enableAutoSpace;
- }
-
-    /**
-     * @return whether duplicated overlapping text should be removed
-     * @see #setSuppressDuplicateOverlappingText(boolean)
-     */
-    public boolean getSuppressDuplicateOverlappingText() {
-        return this.suppressDuplicateOverlappingText;
-    }
-
- /**
- * If true, the parser should try to remove duplicated
- * text over the same region. This is needed for some
- * PDFs that achieve bolding by re-writing the same
- * text in the same area. Note that this can
- * slow down extraction substantially (PDFBOX-956) and
- * sometimes remove characters that were not in fact
- * duplicated (PDFBOX-1155). By default this is disabled.
- *
- * @param suppressDuplicateOverlappingText whether to remove duplicated overlapping text
- */
- public void setSuppressDuplicateOverlappingText(
- boolean suppressDuplicateOverlappingText) {
- this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText;
- }
-
-    /**
-     * @return whether text in annotations should be extracted
-     * @see #setExtractAnnotationText(boolean)
-     */
-    public boolean getExtractAnnotationText() {
-        return this.extractAnnotationText;
-    }
-
- /**
- * If true (the default), text in annotations will be
- * extracted.
- *
- * @param extractAnnotationText whether to extract annotation text
- */
- public void setExtractAnnotationText(boolean extractAnnotationText) {
- this.extractAnnotationText = extractAnnotationText;
- }
-
-    /**
-     * @return whether text tokens are sorted by position before extraction
-     * @see #setSortByPosition(boolean)
-     */
-    public boolean getSortByPosition() {
-        return this.sortByPosition;
-    }
-
- /**
- * If true, sort text tokens by their x/y position
- * before extracting text. This may be necessary for
- * some PDFs (if the text tokens are not rendered "in
- * order"), while for other PDFs it can produce the
- * wrong result (for example if there are 2 columns,
- * the text will be interleaved). Default is false.
- *
- * @param sortByPosition whether to sort text tokens by position
- */
- public void setSortByPosition(boolean sortByPosition) {
- this.sortByPosition = sortByPosition;
- }
-
-    /**
-     * @return the character-width based tolerance, or {@code null} if none was set
-     * @see #setAverageCharTolerance(Float)
-     */
-    public Float getAverageCharTolerance() {
-        return this.averageCharTolerance;
-    }
-
- /**
- * Sets the character-width based tolerance used to estimate where spaces belong.
- * See {@link PDFTextStripper#setAverageCharTolerance(float)}
- *
- * @param averageCharTolerance the tolerance to store; stored as given
- */
- public void setAverageCharTolerance(Float averageCharTolerance) {
- this.averageCharTolerance = averageCharTolerance;
- }
-
-    /**
-     * @return the space-width based tolerance, or {@code null} if none was set
-     * @see #setSpacingTolerance(Float)
-     */
-    public Float getSpacingTolerance() {
-        return this.spacingTolerance;
-    }
-
- /**
- * Sets the space-width based tolerance used to estimate where spaces belong.
- * See {@link PDFTextStripper#setSpacingTolerance(float)}
- *
- * @param spacingTolerance the tolerance to store; stored as given
- */
- public void setSpacingTolerance(Float spacingTolerance) {
- this.spacingTolerance = spacingTolerance;
- }
-
-    /**
-     * @return the access checker held by this config
-     */
-    public AccessChecker getAccessChecker() {
-        return this.accessChecker;
-    }
-
- /**
- * Replaces the access checker held by this config.
- *
- * @param accessChecker the access checker to store
- */
- public void setAccessChecker(AccessChecker accessChecker) {
- this.accessChecker = accessChecker;
- }
-
-    /**
-     * @return whether intermediate IOExceptions are caught during parsing
-     * @see #setCatchIntermediateIOExceptions(boolean)
-     */
-    public boolean isCatchIntermediateIOExceptions() {
-        return this.isCatchIntermediateIOExceptions;
-    }
-
-    /**
-     * The PDFBox parser will throw an IOException if there is a problem
-     * with a stream. When this is <code>true</code>, Tika's PDFParser
-     * catches these exceptions and tries to parse the rest of the
-     * document; once the parse has completed it throws the first
-     * exception it caught.
-     *
-     * @param catchIntermediateIOExceptions whether to catch intermediate IOExceptions
-     */
-    public void setCatchIntermediateIOExceptions(boolean catchIntermediateIOExceptions) {
-        this.isCatchIntermediateIOExceptions = catchIntermediateIOExceptions;
-    }
-
- /**
- * Sets which strategy to use for OCR.
- *
- * @param ocrStrategy the strategy to store
- */
- public void setOCRStrategy(OCR_STRATEGY ocrStrategy) {
- this.ocrStrategy = ocrStrategy;
- }
-
-    /**
-     * @return the strategy to use for OCR
-     * @see #setOCRStrategy(OCR_STRATEGY)
-     */
-    public OCR_STRATEGY getOCRStrategy() {
-        return this.ocrStrategy;
-    }
-
-    /**
-     * Interprets a property value as a boolean, ignoring case.
-     *
-     * @param p              raw property value; may be {@code null}
-     * @param defaultMissing value returned when {@code p} is {@code null}
-     *                       or is neither "true" nor "false"
-     * @return the parsed value or {@code defaultMissing}
-     */
-    private boolean getBooleanProp(String p, boolean defaultMissing) {
-        if (p == null) {
-            return defaultMissing;
-        }
-        String normalized = p.toLowerCase(Locale.ROOT);
-        if (normalized.equals("true")) {
-            return true;
-        }
-        if (normalized.equals("false")) {
-            return false;
-        }
-        return defaultMissing;
-    }
-    /**
-     * Interprets a property value as an int.
-     *
-     * @param p              raw property value; may be {@code null}
-     * @param defaultMissing value returned when {@code p} is {@code null}
-     * @return the parsed value or {@code defaultMissing}
-     * @throws NumberFormatException if {@code p} is non-null and not parseable
-     */
-    private int getIntProp(String p, int defaultMissing) {
-        return p == null ? defaultMissing : Integer.parseInt(p);
-    }
-
-    /**
-     * String representation of the image format used to render
-     * the page image for OCR (examples: png, tiff, jpeg).
-     *
-     * @return the image format name
-     */
-    public String getOCRImageFormatName() {
-        return this.ocrImageFormatName;
-    }
-
- /**
- * Sets the name of the image format used to render the page image for OCR.
- *
- * @param ocrImageFormatName name of image format used to render
- * page image
- * @see #getOCRImageFormatName()
- */
- public void setOCRImageFormatName(String ocrImageFormatName) {
- this.ocrImageFormatName = ocrImageFormatName;
- }
-
-    /**
-     * Image type used to render the page image for OCR.
-     *
-     * @return the image type
-     * @see #setOCRImageType(ImageType)
-     */
-    public ImageType getOCRImageType() {
-        return this.ocrImageType;
-    }
-
- /**
- * Sets the image type used to render the page image for OCR.
- *
- * @param ocrImageType the image type to store
- */
- public void setOCRImageType(ImageType ocrImageType) {
- this.ocrImageType = ocrImageType;
- }
-
-    /**
-     * Dots per inch used to render the page image for OCR.
-     *
-     * @return dots per inch
-     */
-    public int getOCRDPI() {
-        return this.ocrDPI;
-    }
-
- /**
- * Sets the dots per inch used to render the page image for OCR.
- *
- * @param ocrDPI dots per inch
- */
- public void setOCRDPI(int ocrDPI) {
- this.ocrDPI = ocrDPI;
- }
-
-    /**
-     * Finds the {@link ImageType} whose {@code toString()} matches the given
-     * name, ignoring case.
-     *
-     * @param ocrImageType name to look up; may be {@code null}
-     * @return the matching type, or {@code null} when the name is {@code null}
-     *         or matches no type
-     */
-    private ImageType parseImageType(String ocrImageType) {
-        // A missing property arrives here as null; previously that raised a
-        // NullPointerException on the first comparison.
-        if (ocrImageType == null) {
-            return null;
-        }
-        for (ImageType t : ImageType.values()) {
-            if (ocrImageType.equalsIgnoreCase(t.toString())) {
-                return t;
-            }
-        }
-        return null;
-    }
-
-    /**
-     * Compares every configuration setting exposed through the getters.
-     * <p>
-     * Object-valued settings are compared null-safely: both tolerances are
-     * {@code null} unless explicitly set, so calling {@code equals} on them
-     * directly raised a NullPointerException for freshly created configs.
-     *
-     * @param o the object to compare with
-     * @return {@code true} if {@code o} is a {@code PDFParserConfig} with the same settings
-     */
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) {
-            return true;
-        }
-        if (!(o instanceof PDFParserConfig)) {
-            return false;
-        }
-
-        PDFParserConfig config = (PDFParserConfig) o;
-
-        return getEnableAutoSpace() == config.getEnableAutoSpace()
-                && getSuppressDuplicateOverlappingText() == config.getSuppressDuplicateOverlappingText()
-                && getExtractAnnotationText() == config.getExtractAnnotationText()
-                && getSortByPosition() == config.getSortByPosition()
-                && getExtractAcroFormContent() == config.getExtractAcroFormContent()
-                && getExtractInlineImages() == config.getExtractInlineImages()
-                && getExtractUniqueInlineImagesOnly() == config.getExtractUniqueInlineImagesOnly()
-                && getIfXFAExtractOnlyXFA() == config.getIfXFAExtractOnlyXFA()
-                && getOCRDPI() == config.getOCRDPI()
-                && isCatchIntermediateIOExceptions() == config.isCatchIntermediateIOExceptions()
-                && java.util.Objects.equals(getAverageCharTolerance(), config.getAverageCharTolerance())
-                && java.util.Objects.equals(getSpacingTolerance(), config.getSpacingTolerance())
-                && getOCRStrategy() == config.getOCRStrategy()
-                && getOCRImageType() == config.getOCRImageType()
-                && java.util.Objects.equals(getOCRImageFormatName(), config.getOCRImageFormatName())
-                && java.util.Objects.equals(getAccessChecker(), config.getAccessChecker());
-    }
-
- /**
-  * Hash code built from the same settings that {@link #equals(Object)}
-  * compares. Object-typed settings contribute 0 when null, so a
-  * configuration with a null image type (for example) can still be
-  * hashed; for non-null settings the result is computed as before.
-  *
-  * @return hash code of this configuration
-  */
- @Override
- public int hashCode() {
-     int result = (getEnableAutoSpace() ? 1 : 0);
-     result = 31 * result + (getSuppressDuplicateOverlappingText() ? 1 : 0);
-     result = 31 * result + (getExtractAnnotationText() ? 1 : 0);
-     result = 31 * result + (getSortByPosition() ? 1 : 0);
-     result = 31 * result + (getExtractAcroFormContent() ? 1 : 0);
-     result = 31 * result + (getExtractInlineImages() ? 1 : 0);
-     result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
-     result = 31 * result + java.util.Objects.hashCode(getAverageCharTolerance());
-     result = 31 * result + java.util.Objects.hashCode(getSpacingTolerance());
-     result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
-     result = 31 * result + java.util.Objects.hashCode(ocrStrategy);
-     result = 31 * result + getOCRDPI();
-     // parseImageType returns null for an unrecognized name, so the image
-     // type in particular must be hashed null-safely.
-     result = 31 * result + java.util.Objects.hashCode(getOCRImageType());
-     result = 31 * result + java.util.Objects.hashCode(getOCRImageFormatName());
-     result = 31 * result + java.util.Objects.hashCode(getAccessChecker());
-     result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0);
-     return result;
- }
-
- @Override
- public String toString() {
- return "PDFParserConfig{" +
- "enableAutoSpace=" + enableAutoSpace +
- ", suppressDuplicateOverlappingText=" + suppressDuplicateOverlappingText +
- ", extractAnnotationText=" + extractAnnotationText +
- ", sortByPosition=" + sortByPosition +
- ", extractAcroFormContent=" + extractAcroFormContent +
- ", extractInlineImages=" + extractInlineImages +
- ", extractUniqueInlineImagesOnly=" + extractUniqueInlineImagesOnly +
- ", averageCharTolerance=" + averageCharTolerance +
- ", spacingTolerance=" + spacingTolerance +
- ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA +
- ", ocrStrategy=" + ocrStrategy +
- ", ocrDPI=" + ocrDPI +
- ", ocrImageType=" + ocrImageType +
- ", ocrImageFormatName='" + ocrImageFormatName + '\'' +
- ", accessChecker=" + accessChecker +
- ", isCatchIntermediateIOExceptions=" + isCatchIntermediateIOExceptions +
- '}';
- }
-}