You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/03 19:55:36 UTC
[1/4] tika git commit: TIKA-1994 -- integrate OCR with PDFParser
Repository: tika
Updated Branches:
refs/heads/TIKA-1508 1202f459e -> 18ab8f91f
TIKA-1994 -- integrate OCR with PDFParser
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7aeb95d6
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7aeb95d6
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7aeb95d6
Branch: refs/heads/TIKA-1508
Commit: 7aeb95d6c7a6ac3611f2dd975baa73f566631061
Parents: a20c46c
Author: tballison <ta...@mitre.org>
Authored: Thu Jun 2 12:04:30 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Thu Jun 2 12:04:30 2016 -0400
----------------------------------------------------------------------
.../tika/parser/ocr/TesseractOCRParser.java | 87 ++-
.../tika/parser/pdf/AbstractPDF2XHTML.java | 576 +++++++++++++++++++
.../org/apache/tika/parser/pdf/OCR2XHTML.java | 127 ++++
.../org/apache/tika/parser/pdf/PDF2XHTML.java | 492 +---------------
.../org/apache/tika/parser/pdf/PDFParser.java | 7 +
.../apache/tika/parser/pdf/PDFParserConfig.java | 274 ++++++---
.../apache/tika/parser/pdf/PDFParser.properties | 10 +-
.../apache/tika/parser/pdf/PDFParserTest.java | 38 ++
8 files changed, 1029 insertions(+), 582 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 7db29c8..a238a7c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -56,6 +56,7 @@ import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.image.ImageParser;
import org.apache.tika.parser.image.TiffParser;
import org.apache.tika.parser.jpeg.JpegParser;
+import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -110,7 +111,7 @@ public class TesseractOCRParser extends AbstractParser {
}
}
- private boolean hasTesseract(TesseractOCRConfig config) {
+ public boolean hasTesseract(TesseractOCRConfig config) {
// Fetch where the config says to find Tesseract
String tesseract = config.getTesseractPath() + getTesseractProg();
@@ -157,47 +158,90 @@ public class TesseractOCRParser extends AbstractParser {
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
-
// If Tesseract is not on the path with the current config, do not try to run OCR
// getSupportedTypes shouldn't have listed us as handling it, so this should only
// occur if someone directly calls this parser, not via DefaultParser or similar
if (! hasTesseract(config))
return;
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ File tmpImgFile = tmp.createTemporaryFile();
+ parse(tikaStream, tmpImgFile, xhtml, config);
+ // Temporary workaround for TIKA-1445 - until we can specify
+ // composite parsers with strategies (eg Composite, Try In Turn),
+ // always send the image onwards to the regular parser to have
+ // the metadata for them extracted as well
+ _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, context);
+ xhtml.endDocument();
+ } finally {
+ tmp.dispose();
+ }
+ }
+
+ /**
+ * Use this to parse content without starting a new document.
+ * This appends SAX events to xhtml without re-adding the metadata, body start, etc.
+ * @param stream inputstream
+ * @param xhtml handler
+ * @param config TesseractOCRConfig to use for this parse
+ * @throws IOException
+ * @throws SAXException
+ * @throws TikaException
+ */
+ public void parseInline(InputStream stream, XHTMLContentHandler xhtml, TesseractOCRConfig config)
+ throws IOException, SAXException, TikaException {
+ // If Tesseract is not on the path with the current config, do not try to run OCR
+ // getSupportedTypes shouldn't have listed us as handling it, so this should only
+ // occur if someone directly calls this parser, not via DefaultParser or similar
+ if (! hasTesseract(config))
+ return;
TemporaryResources tmp = new TemporaryResources();
- File output = null;
try {
TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
- File input = tikaStream.getFile();
- long size = tikaStream.getLength();
+ File tmpImgFile = tmp.createTemporaryFile();
+ parse(tikaStream, tmpImgFile, xhtml, config);
+ } finally {
+ tmp.dispose();
+ }
+
+ }
+
+ private void parse(TikaInputStream tikaInputStream, File tmpImgFile, XHTMLContentHandler xhtml, TesseractOCRConfig config)
+ throws IOException, SAXException, TikaException {
+ File tmpTxtOutput = null;
+
+ try {
+ File input = tikaInputStream.getFile();
+ long size = tikaInputStream.getLength();
if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
- output = tmp.createTemporaryFile();
- doOCR(input, output, config);
+ doOCR(input, tmpImgFile, config);
// Tesseract appends .txt to output file name
- output = new File(output.getAbsolutePath() + ".txt");
+ tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + ".txt");
- if (output.exists())
- extractOutput(new FileInputStream(output), xhtml);
+ if (tmpTxtOutput.exists()) {
+ try (InputStream is = new FileInputStream(tmpTxtOutput)) {
+ extractOutput(is, xhtml);
+ }
+ }
}
- // Temporary workaround for TIKA-1445 - until we can specify
- // composite parsers with strategies (eg Composite, Try In Turn),
- // always send the image onwards to the regular parser to have
- // the metadata for them extracted as well
- _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
} finally {
- tmp.dispose();
- if (output != null) {
- output.delete();
+ if (tmpTxtOutput != null) {
+ tmpTxtOutput.delete();
}
}
}
+
// TIKA-1445 workaround parser
private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();
private static class CompositeImageParser extends CompositeParser {
@@ -283,8 +327,7 @@ public class TesseractOCRParser extends AbstractParser {
*/
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
- xhtml.startDocument();
- xhtml.startElement("div");
+ xhtml.startElement("div", "class", "ocr");
try (Reader reader = new InputStreamReader(stream, UTF_8)) {
char[] buffer = new char[1024];
for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
@@ -293,7 +336,7 @@ public class TesseractOCRParser extends AbstractParser {
}
}
xhtml.endElement("div");
- xhtml.endDocument();
+
}
/**
http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
new file mode 100644
index 0000000..d8a46a2
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -0,0 +1,576 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import javax.xml.stream.XMLStreamException;
+import java.awt.image.BufferedImage;
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TreeMap;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.commons.io.IOUtils;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
+import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
+import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
+import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
+import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
+import org.apache.pdfbox.pdmodel.interactive.form.PDField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
+
+class AbstractPDF2XHTML extends PDFTextStripper {
+
+ /**
+ * Maximum recursive depth during AcroForm processing.
+ * Prevents theoretical AcroForm recursion bomb.
+ */
+ private final static int MAX_ACROFORM_RECURSIONS = 10;
+
+ private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig();
+
+ /**
+ * Format used for signature dates
+ * TODO Make this thread-safe
+ */
+ private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
+
+
+ final List<IOException> exceptions = new ArrayList<>();
+ final PDDocument pdDocument;
+ final XHTMLContentHandler xhtml;
+ private final ParseContext context;
+ private final Metadata metadata;
+ final PDFParserConfig config;
+
+ private int pageIndex = 0;
+
+    /**
+     * Binds this stripper to one PDF document and wraps the caller's handler
+     * in an {@link XHTMLContentHandler} that all output of this class goes through.
+     *
+     * @param pdDocument document that will be processed
+     * @param handler    SAX handler that receives the XHTML events
+     * @param context    parse context used to look up collaborators
+     * @param metadata   metadata object passed to the XHTML handler and used for warnings
+     * @param config     PDF parser configuration
+     * @throws IOException declared because the superclass constructor may throw it
+     */
+    AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context,
+                      Metadata metadata, PDFParserConfig config) throws IOException {
+        this.config = config;
+        this.context = context;
+        this.metadata = metadata;
+        this.pdDocument = pdDocument;
+        this.xhtml = new XHTMLContentHandler(handler, metadata);
+    }
+
+    /**
+     * Opens the {@code <div class="page">} container for a page and then
+     * calls {@code writeParagraphStart()}.
+     *
+     * @param page the page being started (not inspected here)
+     * @throws IOException if the handler rejects the element; the SAXException is kept as the cause
+     */
+    @Override
+    protected void startPage(PDPage page) throws IOException {
+        try {
+            xhtml.startElement("div", "class", "page");
+        } catch (SAXException saxFailure) {
+            throw new IOExceptionWithCause("Unable to start a page", saxFailure);
+        }
+        writeParagraphStart();
+    }
+
+    /**
+     * Returns the extractor registered in the parse context, or a new
+     * {@link ParsingEmbeddedDocumentExtractor} built on that context when none is registered.
+     *
+     * @return the extractor to use for embedded documents, never {@code null}
+     */
+    EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
+        EmbeddedDocumentExtractor registered = context.get(EmbeddedDocumentExtractor.class);
+        return (registered != null) ? registered : new ParsingEmbeddedDocumentExtractor(context);
+    }
+
+    /**
+     * Extracts the files listed in the document's embedded-files name tree.
+     * The names on the root node are used when present; otherwise only the
+     * direct kids of the root are consulted (no fully recursive search).
+     * This follows pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java.
+     *
+     * @param document the PDF whose catalog is inspected
+     */
+    private void extractEmbeddedDocuments(PDDocument document)
+            throws IOException, SAXException, TikaException {
+        PDDocumentNameDictionary nameDictionary =
+                new PDDocumentNameDictionary(document.getDocumentCatalog());
+        PDEmbeddedFilesNameTreeNode root = nameDictionary.getEmbeddedFiles();
+        if (root == null) {
+            return;
+        }
+
+        Map<String, PDComplexFileSpecification> rootNames = root.getNames();
+        if (rootNames != null) {
+            processEmbeddedDocNames(rootNames);
+            return;
+        }
+
+        // No names on the root: fall back to the names held by its immediate children.
+        List<PDNameTreeNode<PDComplexFileSpecification>> children = root.getKids();
+        if (children == null) {
+            return;
+        }
+        for (PDNameTreeNode<PDComplexFileSpecification> child : children) {
+            Map<String, PDComplexFileSpecification> childNames = child.getNames();
+            if (childNames != null) {
+                processEmbeddedDocNames(childNames);
+            }
+        }
+    }
+
+ private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames)
+ throws IOException, SAXException, TikaException {
+ if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
+ return;
+ }
+
+ EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
+ for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
+ PDComplexFileSpecification spec = ent.getValue();
+ extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor);
+ }
+ }
+
+    /**
+     * Extracts every variant of an embedded file carried by one file specification:
+     * the generic one plus the Mac, DOS and Unix variants. All four are attempted,
+     * not just the first that is present.
+     *
+     * @param defaultName name to use when a variant has no file name of its own
+     * @param spec        file specification; {@code null} is ignored
+     * @param extractor   extractor that parses each embedded file
+     */
+    private void extractMultiOSPDEmbeddedFiles(String defaultName,
+                                               PDComplexFileSpecification spec,
+                                               EmbeddedDocumentExtractor extractor)
+            throws IOException, SAXException, TikaException {
+        if (spec != null) {
+            extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor);
+            extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
+            extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
+            extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+        }
+    }
+
+ private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
+ EmbeddedDocumentExtractor extractor)
+ throws SAXException, IOException, TikaException {
+
+ if (file == null) {
+ //skip silently
+ return;
+ }
+
+ fileName = (fileName == null) ? defaultName : fileName;
+
+ // TODO: other metadata?
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+ metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+ metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+ metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
+
+ if (extractor.shouldParseEmbedded(metadata)) {
+ TikaInputStream stream = null;
+ try {
+ stream = TikaInputStream.get(file.createInputStream());
+ extractor.parseEmbedded(
+ stream,
+ new EmbeddedContentHandler(xhtml),
+ metadata, false);
+
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", fileName);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+ }
+ }
+
+    /**
+     * Applies the "catch intermediate IOExceptions" policy to an exception.
+     * When the configuration asks for catching, the message is added to the metadata
+     * as a warning and the exception is stored in {@code exceptions}; otherwise the
+     * exception is rethrown unchanged.
+     *
+     * @param e the exception raised while processing part of the document
+     * @throws IOException {@code e} itself, when intermediate exceptions are not caught
+     */
+    void handleCatchableIOE(IOException e) throws IOException {
+        if (!config.isCatchIntermediateIOExceptions()) {
+            throw e;
+        }
+        String rawMessage = e.getMessage();
+        String warning = (rawMessage != null) ? rawMessage : "IOException, no message";
+        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, warning);
+        exceptions.add(e);
+    }
+
+ /**
+ * Renders the page at {@code pageIndex} to an image, writes it to a temporary
+ * file and feeds that file to Tesseract, whose output is appended to {@code xhtml}.
+ * Does nothing when the configured OCR strategy is NO_OCR.
+ *
+ * @throws TikaException if Tesseract is reported as unavailable for the active config
+ * @throws IOException if an I/O failure is rethrown by {@code handleCatchableIOE}, or
+ * wrapping a SAXException raised while writing the OCR output
+ * @throws SAXException declared, although SAX failures inside the try block are
+ * wrapped in an IOException rather than thrown directly
+ */
+ void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
+ if (config.getOCRStrategy().equals(NO_OCR)) {
+ return;
+ }
+ // Prefer a TesseractOCRConfig supplied through the parse context; fall back to the default.
+ TesseractOCRConfig tesseractConfig =
+ context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
+
+ TesseractOCRParser tesseractOCRParser = new TesseractOCRParser();
+ if (! tesseractOCRParser.hasTesseract(tesseractConfig)) {
+ throw new TikaException("Tesseract is not available. "+
+ "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
+ }
+
+ PDFRenderer renderer = new PDFRenderer(pdDocument);
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ // NOTE(review): 2.0f is presumably the render scale factor -- confirm against PDFRenderer.
+ BufferedImage image = renderer.renderImage(pageIndex, 2.0f, config.getOCRImageType());
+ Path tmpFile = tmp.createTempFile();
+ try (OutputStream os = Files.newOutputStream(tmpFile)) {
+ //TODO: get output format from TesseractConfig
+ ImageIOUtil.writeImage(image, config.getOCRImageFormatName(),
+ os, config.getOCRDPI());
+ }
+ // The image is fully written and closed before Tesseract reads it back.
+ try (InputStream is = TikaInputStream.get(tmpFile)) {
+ tesseractOCRParser.parseInline(is, xhtml, tesseractConfig);
+ }
+ } catch (IOException e) {
+ // Recorded or rethrown depending on the catch-intermediate-exceptions setting.
+ handleCatchableIOE(e);
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause("error writing OCR content from PDF", e);
+ } finally {
+ // Always release the temporary resources created for this page.
+ tmp.dispose();
+ }
+ }
+
+    /**
+     * Finishes a page. In order, this:
+     * <ol>
+     *   <li>extracts files attached through file-attachment annotations,</li>
+     *   <li>writes link and markup annotation text when the configuration asks for it,</li>
+     *   <li>runs OCR when the strategy is OCR_AND_TEXT_EXTRACTION,</li>
+     *   <li>closes the page {@code div}.</li>
+     * </ol>
+     * The page index is advanced in all cases, including failures.
+     *
+     * @param page the page that has just been processed
+     * @throws IOException wrapping a SAXException or TikaException, or an I/O failure
+     *                     that {@code handleCatchableIOE} decided to rethrow
+     */
+    @Override
+    protected void endPage(PDPage page) throws IOException {
+
+        try {
+            EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
+            for (PDAnnotation annotation : page.getAnnotations()) {
+
+                if (annotation instanceof PDAnnotationFileAttachment) {
+                    PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
+                    // Only complex file specifications can carry embedded files; checking the
+                    // type avoids a ClassCastException on any other kind of specification.
+                    Object fileSpec = fann.getFile();
+                    if (fileSpec instanceof PDComplexFileSpecification) {
+                        try {
+                            extractMultiOSPDEmbeddedFiles("",
+                                    (PDComplexFileSpecification) fileSpec, extractor);
+                        } catch (SAXException e) {
+                            throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
+                        } catch (TikaException e) {
+                            throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
+                        } catch (IOException e) {
+                            handleCatchableIOE(e);
+                        }
+                    }
+                }
+                // TODO: remove once PDFBOX-1143 is fixed:
+                if (config.getExtractAnnotationText()) {
+                    if (annotation instanceof PDAnnotationLink) {
+                        PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
+                        if (annotationlink.getAction() != null) {
+                            PDAction action = annotationlink.getAction();
+                            if (action instanceof PDActionURI) {
+                                PDActionURI uri = (PDActionURI) action;
+                                String link = uri.getURI();
+                                if (link != null) {
+                                    xhtml.startElement("div", "class", "annotation");
+                                    xhtml.startElement("a", "href", link);
+                                    xhtml.endElement("a");
+                                    xhtml.endElement("div");
+                                }
+                            }
+                        }
+                    }
+
+                    if (annotation instanceof PDAnnotationMarkup) {
+                        PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
+                        String title = annotationMarkup.getTitlePopup();
+                        String subject = annotationMarkup.getSubject();
+                        String contents = annotationMarkup.getContents();
+                        // TODO: maybe also annotationMarkup.getRichContents()?
+                        if (title != null || subject != null || contents != null) {
+                            xhtml.startElement("div", "class", "annotation");
+
+                            if (title != null) {
+                                xhtml.startElement("div", "class", "annotationTitle");
+                                xhtml.characters(title);
+                                xhtml.endElement("div");
+                            }
+
+                            if (subject != null) {
+                                xhtml.startElement("div", "class", "annotationSubject");
+                                xhtml.characters(subject);
+                                xhtml.endElement("div");
+                            }
+
+                            if (contents != null) {
+                                xhtml.startElement("div", "class", "annotationContents");
+                                xhtml.characters(contents);
+                                xhtml.endElement("div");
+                            }
+
+                            xhtml.endElement("div");
+                        }
+                    }
+                }
+            }
+            if (config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
+                doOCROnCurrentPage();
+            }
+            xhtml.endElement("div");
+        } catch (SAXException|TikaException e) {
+            throw new IOExceptionWithCause("Unable to end a page", e);
+        } catch (IOException e) {
+            // Route through the shared policy so the catch-intermediate-exceptions setting is
+            // honoured and a metadata warning is recorded, instead of always deferring silently.
+            handleCatchableIOE(e);
+        } finally {
+            pageIndex++;
+        }
+    }
+
+    /**
+     * Starts the XHTML output document.
+     *
+     * @param pdf the document being processed (not inspected here)
+     * @throws IOException if the handler fails; the SAXException is kept as the cause
+     */
+    @Override
+    protected void startDocument(PDDocument pdf) throws IOException {
+        try {
+            xhtml.startDocument();
+        } catch (SAXException saxFailure) {
+            throw new IOExceptionWithCause("Unable to start a document", saxFailure);
+        }
+    }
+
+    /**
+     * Writes the document-level extras and closes the XHTML document: bookmark text,
+     * embedded documents and, when enabled in the configuration, AcroForm content.
+     * I/O failures in the embedded-document and AcroForm steps go through
+     * {@code handleCatchableIOE}.
+     *
+     * @param pdf the document whose processing is ending
+     * @throws IOException wrapping a TikaException or SAXException, or rethrown by
+     *                     {@code handleCatchableIOE}
+     */
+    @Override
+    protected void endDocument(PDDocument pdf) throws IOException {
+        try {
+            // Extract text for any bookmarks:
+            extractBookmarkText();
+
+            try {
+                extractEmbeddedDocuments(pdf);
+            } catch (IOException embeddedFailure) {
+                handleCatchableIOE(embeddedFailure);
+            }
+
+            //extract acroform data at end of doc
+            if (config.getExtractAcroFormContent()) {
+                try {
+                    extractAcroForm(pdf);
+                } catch (IOException acroFormFailure) {
+                    handleCatchableIOE(acroFormFailure);
+                }
+            }
+
+            xhtml.endDocument();
+        } catch (TikaException | SAXException failure) {
+            throw new IOExceptionWithCause("Unable to end a document", failure);
+        }
+    }
+
+ void extractBookmarkText() throws SAXException {
+ PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
+ if (outline != null) {
+ extractBookmarkText(outline);
+ }
+ }
+
+    /**
+     * Writes the children of an outline node as a {@code <ul>} of {@code <li>} titles,
+     * recursing into each child right after its own item. Nothing is written for a
+     * node without children.
+     *
+     * @param bookmark outline node whose children are written
+     * @throws SAXException if the handler fails
+     */
+    void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
+        PDOutlineItem firstChild = bookmark.getFirstChild();
+        if (firstChild == null) {
+            return;
+        }
+        xhtml.startElement("ul");
+        for (PDOutlineItem item = firstChild; item != null; item = item.getNextSibling()) {
+            xhtml.startElement("li");
+            xhtml.characters(item.getTitle());
+            xhtml.endElement("li");
+            // Recurse:
+            extractBookmarkText(item);
+        }
+        xhtml.endElement("ul");
+    }
+
+    /**
+     * Writes the document's form content. When the form has an XFA resource, XFA
+     * extraction is tried first and, if it succeeds, nothing else is written. When there
+     * is no XFA resource or XFA extraction fails, the traditional AcroForm fields are
+     * written as an ordered list inside {@code <div class="acroform">}.
+     * <p>
+     * Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields;
+     * this code derives from Ben's code.
+     *
+     * @param pdf the document whose form is extracted
+     */
+    void extractAcroForm(PDDocument pdf) throws IOException,
+            SAXException {
+        PDDocumentCatalog catalog = pdf.getDocumentCatalog();
+        PDAcroForm form = (catalog == null) ? null : catalog.getAcroForm();
+        if (form == null) {
+            return;
+        }
+
+        //if it has xfa, try that.
+        //if it doesn't exist or there's an exception,
+        //go with traditional AcroForm
+        PDXFAResource xfaResource = form.getXFA();
+        if (xfaResource != null) {
+            XFAExtractor xfaExtractor = new XFAExtractor();
+            try (InputStream xfaStream = new BufferedInputStream(
+                    new ByteArrayInputStream(xfaResource.getBytes()))) {
+                xfaExtractor.extract(xfaStream, xhtml, metadata, context);
+                //if successful, return
+                return;
+            } catch (XMLStreamException | IOException ignored) {
+                //if there was an xml parse exception in xfa, try the AcroForm
+            }
+        }
+
+        @SuppressWarnings("rawtypes")
+        List fields = form.getFields();
+        if (fields == null) {
+            return;
+        }
+
+        xhtml.startElement("div", "class", "acroform");
+        xhtml.startElement("ol");
+        for (Object candidate : fields) {
+            // instanceof is false for null, so null entries are skipped as well
+            if (candidate instanceof PDField) {
+                processAcroField((PDField) candidate, 0);
+            }
+        }
+        xhtml.endElement("ol");
+        xhtml.endElement("div");
+    }
+
+ private void processAcroField(PDField field, final int currentRecursiveDepth)
+ throws SAXException, IOException {
+
+ if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
+ return;
+ }
+ addFieldString(field);
+ if (field instanceof PDNonTerminalField) {
+ int r = currentRecursiveDepth + 1;
+ xhtml.startElement("ol");
+ for (PDField child : ((PDNonTerminalField)field).getChildren()) {
+ processAcroField(child, r);
+ }
+ xhtml.endElement("ol");
+ }
+ }
+
+ private void addFieldString(PDField field) throws SAXException {
+ //Pick partial name to present in content and altName for attribute
+ //Ignoring FullyQualifiedName for now
+ String partName = field.getPartialName();
+ String altName = field.getAlternateFieldName();
+
+ StringBuilder sb = new StringBuilder();
+ AttributesImpl attrs = new AttributesImpl();
+
+ if (partName != null) {
+ sb.append(partName).append(": ");
+ }
+ if (altName != null) {
+ attrs.addAttribute("", "altName", "altName", "CDATA", altName);
+ }
+ //return early if PDSignature field
+ if (field instanceof PDSignatureField) {
+ handleSignature(attrs, (PDSignatureField) field);
+ return;
+ }
+ String value = field.getValueAsString();
+ if (value != null && !value.equals("null")) {
+ sb.append(value);
+ }
+
+ if (attrs.getLength() > 0 || sb.length() > 0) {
+ xhtml.startElement("li", attrs);
+ xhtml.characters(sb.toString());
+ xhtml.endElement("li");
+ }
+ }
+
+    /**
+     * Writes the data of a signature field as an {@code <li>} that contains an
+     * {@code <ol type="signaturedata">} with one {@code <li signdata="...">} per
+     * non-empty value (name, contact info, location, reason, signing date).
+     * Nothing is written when the field has no signature or when every value is
+     * missing or empty.
+     *
+     * @param parentAttributes attributes to put on the enclosing {@code <li>}
+     * @param sigField         signature field to describe
+     * @throws SAXException if the handler fails
+     */
+    private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField)
+            throws SAXException {
+
+        PDSignature sig = sigField.getSignature();
+        if (sig == null) {
+            return;
+        }
+        // TreeMap keeps the emitted entries in a stable, alphabetical key order.
+        Map<String, String> vals = new TreeMap<>();
+        vals.put("name", sig.getName());
+        vals.put("contactInfo", sig.getContactInfo());
+        vals.put("location", sig.getLocation());
+        vals.put("reason", sig.getReason());
+
+        Calendar cal = sig.getSignDate();
+        if (cal != null) {
+            // Format the date in the time zone the signature itself carries.
+            dateFormat.setTimeZone(cal.getTimeZone());
+            vals.put("date", dateFormat.format(cal.getTime()));
+        }
+
+        //see if there is any data; the VALUES must be inspected here, because the
+        //keys are fixed literals and are therefore never null or empty
+        boolean hasData = false;
+        for (String val : vals.values()) {
+            if (val != null && !val.equals("")) {
+                hasData = true;
+                break;
+            }
+        }
+        if (!hasData) {
+            return;
+        }
+
+        //if there is, process it
+        xhtml.startElement("li", parentAttributes);
+
+        AttributesImpl attrs = new AttributesImpl();
+        attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
+
+        xhtml.startElement("ol", attrs);
+        for (Map.Entry<String, String> e : vals.entrySet()) {
+            if (e.getValue() == null || e.getValue().equals("")) {
+                continue;
+            }
+            attrs = new AttributesImpl();
+            attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
+            xhtml.startElement("li", attrs);
+            xhtml.characters(e.getValue());
+            xhtml.endElement("li");
+        }
+        xhtml.endElement("ol");
+        xhtml.endElement("li");
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
new file mode 100644
index 0000000..539cd50
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+
+
+/**
+ * Utility class that overrides the {@link PDFTextStripper} functionality
+ * to integrate text extraction via OCR only.
+ *
+ */
+class OCR2XHTML extends AbstractPDF2XHTML {
+
+    /**
+     * Private constructor; instances are only created by {@code process}.
+     * All arguments are passed straight to the superclass.
+     */
+    private OCR2XHTML(PDDocument document,
+                      ContentHandler handler,
+                      ParseContext context,
+                      Metadata metadata,
+                      PDFParserConfig config) throws IOException {
+        super(document, handler, context, metadata, config);
+    }
+
+    /**
+     * Converts the given PDF document (and related metadata) to a stream
+     * of XHTML SAX events sent to the given content handler, using OCR only.
+     *
+     * @param document PDF document
+     * @param handler  SAX content handler
+     * @param context  parse context handed to the stripper
+     * @param metadata PDF metadata
+     * @param config   PDF parser configuration
+     * @throws SAXException  if the content handler fails to process SAX events
+     * @throws TikaException if there was an exception outside of per page processing,
+     *                       or if any per page exception was collected (the first one
+     *                       is attached as the cause)
+     */
+    public static void process(
+            PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
+            PDFParserConfig config)
+            throws SAXException, TikaException {
+        // The stripper's text output is not wanted; everything it writes is discarded.
+        Writer discardingWriter = new Writer() {
+            @Override
+            public void write(char[] cbuf, int off, int len) {
+            }
+
+            @Override
+            public void flush() {
+            }
+
+            @Override
+            public void close() {
+            }
+        };
+
+        OCR2XHTML ocr2XHTML;
+        try {
+            ocr2XHTML = new OCR2XHTML(document, handler, context, metadata, config);
+            ocr2XHTML.writeText(document, discardingWriter);
+        } catch (IOException e) {
+            // SAX failures travel wrapped in IOExceptions; unwrap them for the caller.
+            Throwable cause = e.getCause();
+            if (cause instanceof SAXException) {
+                throw (SAXException) cause;
+            }
+            throw new TikaException("Unable to extract PDF content", e);
+        }
+
+        if (!ocr2XHTML.exceptions.isEmpty()) {
+            //throw the first
+            throw new TikaException("Unable to extract all PDF content",
+                    ocr2XHTML.exceptions.get(0));
+        }
+    }
+
+    /**
+     * Processes a page by OCR alone: opens the page, runs OCR on it and closes it.
+     *
+     * @param pdPage page to process
+     * @throws IOException wrapping a TikaException or SAXException, or rethrown by
+     *                     {@code handleCatchableIOE}
+     */
+    @Override
+    public void processPage(PDPage pdPage) throws IOException {
+        try {
+            startPage(pdPage);
+            doOCROnCurrentPage();
+            endPage(pdPage);
+        } catch (TikaException | SAXException wrapped) {
+            throw new IOExceptionWithCause(wrapped);
+        } catch (IOException ioFailure) {
+            handleCatchableIOE(ioFailure);
+        }
+    }
+
+ /**
+ * Intentionally empty: this class produces text through OCR only.
+ */
+ @Override
+ protected void writeString(String text) throws IOException {
+ //no-op
+ }
+
+ /**
+ * Intentionally empty: this class produces text through OCR only.
+ */
+ @Override
+ protected void writeCharacters(TextPosition text) throws IOException {
+ //no-op
+ }
+
+ /**
+ * Intentionally empty: this class produces text through OCR only.
+ */
+ @Override
+ protected void writeWordSeparator() throws IOException {
+ //no-op
+ }
+
+ /**
+ * Intentionally empty: this class produces text through OCR only.
+ */
+ @Override
+ protected void writeLineSeparator() throws IOException {
+ //no-op
+ }
+
+}
+
http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 1a8bfb4..ac9823e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -16,74 +16,41 @@
*/
package org.apache.tika.parser.pdf;
-import javax.xml.stream.XMLStreamException;
import java.awt.image.BufferedImage;
-import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Writer;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Calendar;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
-import java.util.ListIterator;
-import java.util.Locale;
import java.util.Map;
import java.util.Set;
-import java.util.TreeMap;
import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
-import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
-import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
-import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
-import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
-import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
-import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
-import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
-import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
-import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
-import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
-import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
-import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
-import org.apache.pdfbox.pdmodel.interactive.form.PDField;
-import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
-import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
-import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
@@ -93,28 +60,12 @@ import org.xml.sax.helpers.AttributesImpl;
* to produce a semi-structured XHTML SAX events instead of a plain text
* stream.
*/
-class PDF2XHTML extends PDFTextStripper {
+class PDF2XHTML extends AbstractPDF2XHTML {
- /**
- * Maximum recursive depth during AcroForm processing.
- * Prevents theoretical AcroForm recursion bomb.
- */
- private final static int MAX_ACROFORM_RECURSIONS = 10;
private static final List<String> JPEG = Arrays.asList(
COSName.DCT_DECODE.getName(),
COSName.DCT_DECODE_ABBREVIATION.getName());
- /**
- * Format used for signature dates
- * TODO Make this thread-safe
- */
- private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
- private final ContentHandler originalHandler;
- private final ParseContext context;
- private final XHTMLContentHandler handler;
- private final PDFParserConfig config;
- private final Metadata metadata;
- private final List<IOException> exceptions = new ArrayList<>();
/**
* This keeps track of the pdf object ids for inline
@@ -129,16 +80,10 @@ class PDF2XHTML extends PDFTextStripper {
*/
private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
private int inlineImageCounter = 0;
- private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata,
+ private PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
PDFParserConfig config)
throws IOException {
- //source of config (derives from context or PDFParser?) is
- //already determined in PDFParser. No need to check context here.
- this.config = config;
- this.originalHandler = handler;
- this.context = context;
- this.handler = new XHTMLContentHandler(handler, metadata);
- this.metadata = metadata;
+ super(document, handler, context, metadata, config);
}
/**
@@ -160,7 +105,7 @@ class PDF2XHTML extends PDFTextStripper {
// Extract text using a dummy Writer as we override the
// key methods to output to the given content
// handler.
- pdf2XHTML = new PDF2XHTML(handler, context, metadata, config);
+ pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config);
config.configure(pdf2XHTML);
@@ -192,28 +137,6 @@ class PDF2XHTML extends PDFTextStripper {
}
}
- void extractBookmarkText() throws SAXException {
- PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
- if (outline != null) {
- extractBookmarkText(outline);
- }
- }
-
- void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
- PDOutlineItem current = bookmark.getFirstChild();
- if (current != null) {
- handler.startElement("ul");
- while (current != null) {
- handler.startElement("li");
- handler.characters(current.getTitle());
- handler.endElement("li");
- // Recurse:
- extractBookmarkText(current);
- current = current.getNextSibling();
- }
- handler.endElement("ul");
- }
- }
@Override
public void processPage(PDPage page) throws IOException {
@@ -225,52 +148,6 @@ class PDF2XHTML extends PDFTextStripper {
}
@Override
- protected void startDocument(PDDocument pdf) throws IOException {
- try {
- handler.startDocument();
- } catch (SAXException e) {
- throw new IOExceptionWithCause("Unable to start a document", e);
- }
- }
-
- @Override
- protected void endDocument(PDDocument pdf) throws IOException {
- try {
- // Extract text for any bookmarks:
- extractBookmarkText();
- try {
- extractEmbeddedDocuments(pdf, originalHandler);
- } catch (IOException e) {
- handleCatchableIOE(e);
- }
-
- //extract acroform data at end of doc
- if (config.getExtractAcroFormContent() == true) {
- try {
- extractAcroForm(pdf, handler);
- } catch (IOException e) {
- handleCatchableIOE(e);
- }
- }
- handler.endDocument();
- } catch (TikaException e) {
- throw new IOExceptionWithCause("Unable to end a document", e);
- } catch (SAXException e) {
- throw new IOExceptionWithCause("Unable to end a document", e);
- }
- }
-
- @Override
- protected void startPage(PDPage page) throws IOException {
- try {
- handler.startElement("div", "class", "page");
- } catch (SAXException e) {
- throw new IOExceptionWithCause("Unable to start a page", e);
- }
- writeParagraphStart();
- }
-
- @Override
protected void endPage(PDPage page) throws IOException {
try {
writeParagraphEnd();
@@ -279,76 +156,7 @@ class PDF2XHTML extends PDFTextStripper {
} catch (IOException e) {
handleCatchableIOE(e);
}
-
- EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
- for (PDAnnotation annotation : page.getAnnotations()) {
-
- if (annotation instanceof PDAnnotationFileAttachment) {
- PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
- PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
- try {
- extractMultiOSPDEmbeddedFiles("", fileSpec, extractor);
- } catch (SAXException e) {
- throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
- } catch (TikaException e) {
- throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
- } catch (IOException e) {
- handleCatchableIOE(e);
- }
- }
- // TODO: remove once PDFBOX-1143 is fixed:
- if (config.getExtractAnnotationText()) {
- if (annotation instanceof PDAnnotationLink) {
- PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
- if (annotationlink.getAction() != null) {
- PDAction action = annotationlink.getAction();
- if (action instanceof PDActionURI) {
- PDActionURI uri = (PDActionURI) action;
- String link = uri.getURI();
- if (link != null) {
- handler.startElement("div", "class", "annotation");
- handler.startElement("a", "href", link);
- handler.endElement("a");
- handler.endElement("div");
- }
- }
- }
- }
-
- if (annotation instanceof PDAnnotationMarkup) {
- PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
- String title = annotationMarkup.getTitlePopup();
- String subject = annotationMarkup.getSubject();
- String contents = annotationMarkup.getContents();
- // TODO: maybe also annotationMarkup.getRichContents()?
- if (title != null || subject != null || contents != null) {
- handler.startElement("div", "class", "annotation");
-
- if (title != null) {
- handler.startElement("div", "class", "annotationTitle");
- handler.characters(title);
- handler.endElement("div");
- }
-
- if (subject != null) {
- handler.startElement("div", "class", "annotationSubject");
- handler.characters(subject);
- handler.endElement("div");
- }
-
- if (contents != null) {
- handler.startElement("div", "class", "annotationContents");
- handler.characters(contents);
- handler.endElement("div");
- }
-
- handler.endElement("div");
- }
- }
- }
- }
-
- handler.endElement("div");
+ super.endPage(page);
} catch (SAXException e) {
throw new IOExceptionWithCause("Unable to end a page", e);
} catch (IOException e) {
@@ -406,8 +214,8 @@ class PDF2XHTML extends PDFTextStripper {
AttributesImpl attr = new AttributesImpl();
attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
attr.addAttribute("", "alt", "alt", "CDATA", fileName);
- handler.startElement("img", attr);
- handler.endElement("img");
+ xhtml.startElement("img", attr);
+ xhtml.endElement("img");
//Do we only want to process unique COSObject ids?
//If so, have we already processed this one?
@@ -430,7 +238,7 @@ class PDF2XHTML extends PDFTextStripper {
writeToBuffer(image, extension, buffer);
extractor.parseEmbedded(
new ByteArrayInputStream(buffer.toByteArray()),
- new EmbeddedContentHandler(handler),
+ new EmbeddedContentHandler(xhtml),
metadata, false);
} catch (IOException e) {
handleCatchableIOE(e);
@@ -467,20 +275,11 @@ class PDF2XHTML extends PDFTextStripper {
out.flush();
}
- protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
- EmbeddedDocumentExtractor extractor =
- context.get(EmbeddedDocumentExtractor.class);
- if (extractor == null) {
- extractor = new ParsingEmbeddedDocumentExtractor(context);
- }
- return extractor;
- }
-
@Override
protected void writeParagraphStart() throws IOException {
super.writeParagraphStart();
try {
- handler.startElement("p");
+ xhtml.startElement("p");
} catch (SAXException e) {
throw new IOExceptionWithCause("Unable to start a paragraph", e);
}
@@ -490,7 +289,7 @@ class PDF2XHTML extends PDFTextStripper {
protected void writeParagraphEnd() throws IOException {
super.writeParagraphEnd();
try {
- handler.endElement("p");
+ xhtml.endElement("p");
} catch (SAXException e) {
throw new IOExceptionWithCause("Unable to end a paragraph", e);
}
@@ -499,7 +298,7 @@ class PDF2XHTML extends PDFTextStripper {
@Override
protected void writeString(String text) throws IOException {
try {
- handler.characters(text);
+ xhtml.characters(text);
} catch (SAXException e) {
throw new IOExceptionWithCause(
"Unable to write a string: " + text, e);
@@ -509,7 +308,7 @@ class PDF2XHTML extends PDFTextStripper {
@Override
protected void writeCharacters(TextPosition text) throws IOException {
try {
- handler.characters(text.getUnicode());
+ xhtml.characters(text.getUnicode());
} catch (SAXException e) {
throw new IOExceptionWithCause(
"Unable to write a character: " + text.getUnicode(), e);
@@ -519,7 +318,7 @@ class PDF2XHTML extends PDFTextStripper {
@Override
protected void writeWordSeparator() throws IOException {
try {
- handler.characters(getWordSeparator());
+ xhtml.characters(getWordSeparator());
} catch (SAXException e) {
throw new IOExceptionWithCause(
"Unable to write a space character", e);
@@ -529,275 +328,12 @@ class PDF2XHTML extends PDFTextStripper {
@Override
protected void writeLineSeparator() throws IOException {
try {
- handler.newline();
+ xhtml.newline();
} catch (SAXException e) {
throw new IOExceptionWithCause(
"Unable to write a newline character", e);
}
}
- private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler)
- throws IOException, SAXException, TikaException {
- PDDocumentNameDictionary namesDictionary =
- new PDDocumentNameDictionary( document.getDocumentCatalog() );
- PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
- if (efTree == null) {
- return;
- }
-
- Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames();
- //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
- //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
- //If there is a need we could add a fully recursive search to find a non-null
- //Map<String, COSObjectable> that contains the doc info.
- if (embeddedFileNames != null) {
- processEmbeddedDocNames(embeddedFileNames);
- } else {
- List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
- if (kids == null) {
- return;
- }
- for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
- embeddedFileNames = node.getNames();
- if (embeddedFileNames != null) {
- processEmbeddedDocNames(embeddedFileNames);
- }
- }
- }
- }
-
- private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames)
- throws IOException, SAXException, TikaException {
- if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
- return;
- }
-
- EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
- for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
- PDComplexFileSpecification spec = ent.getValue();
- extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor);
- }
- }
-
- private void extractMultiOSPDEmbeddedFiles(String defaultName,
- PDComplexFileSpecification spec,
- EmbeddedDocumentExtractor extractor) throws IOException,
- SAXException, TikaException {
-
- if (spec == null) {
- return;
- }
- //current strategy is to pull all, not just first non-null
- extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor);
- extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
- extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
- extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
- }
-
- private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
- EmbeddedDocumentExtractor extractor)
- throws SAXException, IOException, TikaException {
-
- if (file == null) {
- //skip silently
- return;
- }
-
- fileName = (fileName == null) ? defaultName : fileName;
-
- // TODO: other metadata?
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
- metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
- metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
- metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
- TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
-
- if (extractor.shouldParseEmbedded(metadata)) {
- TikaInputStream stream = null;
- try {
- stream = TikaInputStream.get(file.createInputStream());
- extractor.parseEmbedded(
- stream,
- new EmbeddedContentHandler(handler),
- metadata, false);
-
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", fileName);
- handler.startElement("div", attributes);
- handler.endElement("div");
- } finally {
- IOUtils.closeQuietly(stream);
- }
- }
- }
-
- private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException,
- SAXException {
- //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
- //this code derives from Ben's code
- PDDocumentCatalog catalog = pdf.getDocumentCatalog();
-
- if (catalog == null)
- return;
-
- PDAcroForm form = catalog.getAcroForm();
- if (form == null)
- return;
-
- //if it has xfa, try that.
- //if it doesn't exist or there's an exception,
- //go with traditional AcroForm
- PDXFAResource pdxfa = form.getXFA();
-
- if (pdxfa != null) {
- //if successful, return
- XFAExtractor xfaExtractor = new XFAExtractor();
- try (InputStream is = new BufferedInputStream(
- new ByteArrayInputStream(pdxfa.getBytes()))) {
- xfaExtractor.extract(is, handler, metadata, context);
- return;
- } catch (XMLStreamException |IOException e) {
- //if there was an xml parse exception in xfa, try the AcroForm
- }
- }
-
- @SuppressWarnings("rawtypes")
- List fields = form.getFields();
-
- if (fields == null)
- return;
-
- @SuppressWarnings("rawtypes")
- ListIterator itr = fields.listIterator();
-
- if (itr == null)
- return;
-
- handler.startElement("div", "class", "acroform");
- handler.startElement("ol");
-
- while (itr.hasNext()) {
- Object obj = itr.next();
- if (obj != null && obj instanceof PDField) {
- processAcroField((PDField) obj, handler, 0);
- }
- }
- handler.endElement("ol");
- handler.endElement("div");
- }
-
- private void processAcroField(PDField field,
- XHTMLContentHandler handler, final int currentRecursiveDepth)
- throws SAXException, IOException {
-
- if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
- return;
- }
- addFieldString(field, handler);
- if (field instanceof PDNonTerminalField) {
- int r = currentRecursiveDepth + 1;
- handler.startElement("ol");
- for (PDField child : ((PDNonTerminalField)field).getChildren()) {
- processAcroField(child, handler, r);
- }
- handler.endElement("ol");
- }
- }
-
- private void addFieldString(PDField field, XHTMLContentHandler handler) throws SAXException {
- //Pick partial name to present in content and altName for attribute
- //Ignoring FullyQualifiedName for now
- String partName = field.getPartialName();
- String altName = field.getAlternateFieldName();
-
- StringBuilder sb = new StringBuilder();
- AttributesImpl attrs = new AttributesImpl();
-
- if (partName != null) {
- sb.append(partName).append(": ");
- }
- if (altName != null) {
- attrs.addAttribute("", "altName", "altName", "CDATA", altName);
- }
- //return early if PDSignature field
- if (field instanceof PDSignatureField) {
- handleSignature(attrs, (PDSignatureField) field, handler);
- return;
- }
- String value = field.getValueAsString();
- if (value != null && !value.equals("null")) {
- sb.append(value);
- }
-
- if (attrs.getLength() > 0 || sb.length() > 0) {
- handler.startElement("li", attrs);
- handler.characters(sb.toString());
- handler.endElement("li");
- }
- }
-
- private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField,
- XHTMLContentHandler handler) throws SAXException {
-
-
- PDSignature sig = sigField.getSignature();
- if (sig == null) {
- return;
- }
- Map<String, String> vals = new TreeMap<>();
- vals.put("name", sig.getName());
- vals.put("contactInfo", sig.getContactInfo());
- vals.put("location", sig.getLocation());
- vals.put("reason", sig.getReason());
-
- Calendar cal = sig.getSignDate();
- if (cal != null) {
- dateFormat.setTimeZone(cal.getTimeZone());
- vals.put("date", dateFormat.format(cal.getTime()));
- }
- //see if there is any data
- int nonNull = 0;
- for (String val : vals.keySet()) {
- if (val != null && !val.equals("")) {
- nonNull++;
- }
- }
- //if there is, process it
- if (nonNull > 0) {
- handler.startElement("li", parentAttributes);
-
- AttributesImpl attrs = new AttributesImpl();
- attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
-
- handler.startElement("ol", attrs);
- for (Map.Entry<String, String> e : vals.entrySet()) {
- if (e.getValue() == null || e.getValue().equals("")) {
- continue;
- }
- attrs = new AttributesImpl();
- attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
- handler.startElement("li", attrs);
- handler.characters(e.getValue());
- handler.endElement("li");
- }
- handler.endElement("ol");
- handler.endElement("li");
- }
- }
-
- private void handleCatchableIOE(IOException e) throws IOException {
- if (config.isCatchIntermediateIOExceptions()) {
- String msg = e.getMessage();
- if (msg == null) {
- msg = "IOException, no message";
- }
- metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg);
- exceptions.add(e);
- } else {
- throw e;
- }
- }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index b677d84..3e33962 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -58,6 +58,7 @@ import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.image.xmp.JempboxExtractor;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.w3c.dom.Document;
import org.xml.sax.ContentHandler;
@@ -140,7 +141,13 @@ public class PDFParser extends AbstractParser {
if (handler != null) {
if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
handleXFAOnly(pdfDocument, handler, metadata, context);
+ } else if (localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
+ metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
+ OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
} else {
+ if (localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
+ metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
+ }
PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 9baeb37..296b191 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -23,6 +23,7 @@ import java.io.Serializable;
import java.util.Locale;
import java.util.Properties;
+import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.text.PDFTextStripper;
/**
@@ -44,6 +45,26 @@ import org.apache.pdfbox.text.PDFTextStripper;
*/
public class PDFParserConfig implements Serializable {
+ /**
+ * Selects whether OCR is applied when parsing a PDF: not at all, instead of
+ * regular text extraction, or alongside it.
+ */
+ public enum OCR_STRATEGY {
+ NO_OCR,
+ OCR_ONLY,
+ OCR_AND_TEXT_EXTRACTION;
+
+ /**
+ * Maps a configuration string to a strategy, ignoring case.
+ * "ocr_only" selects {@link #OCR_ONLY}; any value containing
+ * "ocr_and_text" selects {@link #OCR_AND_TEXT_EXTRACTION}; everything
+ * else, including null and "no_ocr", selects {@link #NO_OCR}.
+ */
+ private static OCR_STRATEGY parse(String s) {
+ if (s == null) {
+ return NO_OCR;
+ }
+ String normalized = s.toLowerCase(Locale.ROOT);
+ if ("ocr_only".equals(normalized)) {
+ return OCR_ONLY;
+ }
+ if (normalized.contains("ocr_and_text")) {
+ return OCR_AND_TEXT_EXTRACTION;
+ }
+ //"no_ocr" and anything unrecognized -- no ocr
+ return NO_OCR;
+ }
+ }
+
private static final long serialVersionUID = 6492570218190936986L;
// True if we let PDFBox "guess" where spaces should go:
@@ -80,6 +101,12 @@ public class PDFParserConfig implements Serializable {
//content from elsewhere in the document.
private boolean ifXFAExtractOnlyXFA = false;
+ private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.NO_OCR;
+
+ private int ocrDPI = 200;
+ private ImageType ocrImageType = ImageType.GRAY;
+ private String ocrImageFormatName = "png";
+
private AccessChecker accessChecker;
//The PDFParser can throw IOExceptions if there is a problem
@@ -123,36 +150,45 @@ public class PDFParserConfig implements Serializable {
}
}
setEnableAutoSpace(
- getProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace()));
+ getBooleanProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace()));
setSuppressDuplicateOverlappingText(
- getProp(props.getProperty("suppressDuplicateOverlappingText"),
+ getBooleanProp(props.getProperty("suppressDuplicateOverlappingText"),
getSuppressDuplicateOverlappingText()));
setExtractAnnotationText(
- getProp(props.getProperty("extractAnnotationText"),
+ getBooleanProp(props.getProperty("extractAnnotationText"),
getExtractAnnotationText()));
setSortByPosition(
- getProp(props.getProperty("sortByPosition"),
+ getBooleanProp(props.getProperty("sortByPosition"),
getSortByPosition()));
setExtractAcroFormContent(
- getProp(props.getProperty("extractAcroFormContent"),
+ getBooleanProp(props.getProperty("extractAcroFormContent"),
getExtractAcroFormContent()));
setExtractInlineImages(
- getProp(props.getProperty("extractInlineImages"),
+ getBooleanProp(props.getProperty("extractInlineImages"),
getExtractInlineImages()));
setExtractUniqueInlineImagesOnly(
- getProp(props.getProperty("extractUniqueInlineImagesOnly"),
+ getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"),
getExtractUniqueInlineImagesOnly()));
setIfXFAExtractOnlyXFA(
- getProp(props.getProperty("ifXFAExtractOnlyXFA"),
+ getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"),
getIfXFAExtractOnlyXFA()));
setCatchIntermediateIOExceptions(
- getProp(props.getProperty("catchIntermediateIOExceptions"),
+ getBooleanProp(props.getProperty("catchIntermediateIOExceptions"),
isCatchIntermediateIOExceptions()));
- boolean checkExtractAccessPermission = getProp(props.getProperty("checkExtractAccessPermission"), false);
- boolean allowExtractionForAccessibility = getProp(props.getProperty("allowExtractionForAccessibility"), true);
+ setOCRStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy")));
+
+ setOCRDPI(getIntProp(props.getProperty("ocrDPI"), getOCRDPI()));
+
+ setOCRImageFormatName(props.getProperty("ocrImageFormatName"));
+
+ setOCRImageType(parseImageType(props.getProperty("ocrImageType")));
+
+
+ boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
+ boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
if (checkExtractAccessPermission == false) {
//silently ignore the crazy configuration of checkExtractAccessPermission = false,
@@ -408,7 +444,23 @@ public class PDFParserConfig implements Serializable {
isCatchIntermediateIOExceptions = catchIntermediateIOExceptions;
}
- private boolean getProp(String p, boolean defaultMissing) {
+ /**
+ * Sets which strategy to use for OCR.
+ *
+ * @param ocrStrategy strategy to use; stored as given, without a null check
+ */
+ public void setOCRStrategy(OCR_STRATEGY ocrStrategy) {
+ this.ocrStrategy = ocrStrategy;
+ }
+
+ /**
+ * Returns the strategy to use for OCR.
+ *
+ * @return the currently configured OCR strategy
+ */
+ public OCR_STRATEGY getOCRStrategy() {
+ return ocrStrategy;
+ }
+
+ private boolean getBooleanProp(String p, boolean defaultMissing) {
if (p == null) {
return defaultMissing;
}
@@ -420,83 +472,143 @@ public class PDFParserConfig implements Serializable {
return defaultMissing;
}
}
+ //throws NumberFormatException if there's a non-null unparseable
+ //string passed in
+ private int getIntProp(String p, int defaultMissing) {
+ if (p == null) {
+ return defaultMissing;
+ }
- @Override
- public int hashCode() {
- final int prime = 31;
- int result = 1;
- result = prime
- * result
- + ((averageCharTolerance == null) ? 0 : averageCharTolerance
- .hashCode());
- result = prime * result + (enableAutoSpace ? 1231 : 1237);
- result = prime * result + (extractAcroFormContent ? 1231 : 1237);
- result = prime * result + (extractAnnotationText ? 1231 : 1237);
- result = prime * result + (extractInlineImages ? 1231 : 1237);
- result = prime * result + (extractUniqueInlineImagesOnly ? 1231 : 1237);
- result = prime * result + (sortByPosition ? 1231 : 1237);
- result = prime
- * result
- + ((spacingTolerance == null) ? 0 : spacingTolerance.hashCode());
- result = prime * result
- + (suppressDuplicateOverlappingText ? 1231 : 1237);
- result = prime * result + (ifXFAExtractOnlyXFA ? 1231 : 1237);
- return result;
+ return Integer.parseInt(p);
}
- @Override
- public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (obj == null)
- return false;
- if (getClass() != obj.getClass())
- return false;
- PDFParserConfig other = (PDFParserConfig) obj;
- if (averageCharTolerance == null) {
- if (other.averageCharTolerance != null)
- return false;
- } else if (!averageCharTolerance.equals(other.averageCharTolerance))
- return false;
- if (enableAutoSpace != other.enableAutoSpace)
- return false;
- if (extractAcroFormContent != other.extractAcroFormContent)
- return false;
- if (extractAnnotationText != other.extractAnnotationText)
- return false;
- if (extractInlineImages != other.extractInlineImages)
- return false;
- if (extractUniqueInlineImagesOnly != other.extractUniqueInlineImagesOnly)
- return false;
- if (sortByPosition != other.sortByPosition)
- return false;
- if (spacingTolerance == null) {
- if (other.spacingTolerance != null)
- return false;
- } else if (!spacingTolerance.equals(other.spacingTolerance))
- return false;
- if (suppressDuplicateOverlappingText != other.suppressDuplicateOverlappingText)
- return false;
- if (ifXFAExtractOnlyXFA != other.ifXFAExtractOnlyXFA)
- return false;
+ /**
+ * String representation of the image format used to render
+ * the page image for OCR (examples: png, tiff, jpeg)
+ *
+ * @return name of the image format used to render the page image
+ */
+ public String getOCRImageFormatName() {
+ return ocrImageFormatName;
+ }
- return true;
+ /**
+ * Sets the name of the image format used to render the page image for OCR.
+ *
+ * @see #getOCRImageFormatName()
+ *
+ * @param ocrImageFormatName name of image format used to render
+ * page image; stored as given, without validation
+ */
+ public void setOCRImageFormatName(String ocrImageFormatName) {
+ this.ocrImageFormatName = ocrImageFormatName;
+ }
+
+ /**
+ * Image type used to render the page image for OCR.
+ *
+ * @see #setOCRImageType(ImageType)
+ * @return the currently configured image type
+ */
+ public ImageType getOCRImageType() {
+ return ocrImageType;
+ }
+
+ /**
+ * Sets the image type used to render the page image for OCR.
+ *
+ * @param ocrImageType image type to use; stored as given, without a null check
+ */
+ public void setOCRImageType(ImageType ocrImageType) {
+ this.ocrImageType = ocrImageType;
+ }
+
+ /**
+ * Dots per inch used to render the page image for OCR.
+ *
+ * @return the currently configured dots per inch
+ */
+ public int getOCRDPI() {
+ return ocrDPI;
+ }
+
+ /**
+ * Sets the dots per inch used to render the page image for OCR.
+ *
+ * @param ocrDPI dots per inch; stored as given, without range validation
+ */
+ public void setOCRDPI(int ocrDPI) {
+ this.ocrDPI = ocrDPI;
+ }
+
+ /**
+ * Looks up the {@link ImageType} whose name matches the given string,
+ * ignoring case.
+ *
+ * @param ocrImageType name of the image type; may be null, e.g. when the
+ * property is absent from the properties file
+ * @return the matching image type, or the currently configured image type
+ * if the string is null or matches no constant
+ */
+ private ImageType parseImageType(String ocrImageType) {
+ for (ImageType t : ImageType.values()) {
+ //compare from the constant's side so that a null argument is
+ //simply a non-match instead of a NullPointerException
+ if (t.toString().equalsIgnoreCase(ocrImageType)) {
+ return t;
+ }
+ }
+ //keep the existing setting rather than handing null to the setter
+ return getOCRImageType();
}
@Override
- public String toString() {
- return "PDFParserConfig [enableAutoSpace=" + enableAutoSpace
- + ", suppressDuplicateOverlappingText="
- + suppressDuplicateOverlappingText + ", extractAnnotationText="
- + extractAnnotationText + ", sortByPosition=" + sortByPosition
- + ", extractAcroFormContent=" + extractAcroFormContent
- + ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA
- + ", extractInlineImages=" + extractInlineImages
- + ", extractUniqueInlineImagesOnly="
- + extractUniqueInlineImagesOnly + ", averageCharTolerance="
- + averageCharTolerance + ", spacingTolerance="
- + spacingTolerance + "]";
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (!(o instanceof PDFParserConfig)) return false;
+
+ PDFParserConfig config = (PDFParserConfig) o;
+
+ if (getEnableAutoSpace() != config.getEnableAutoSpace()) return false;
+ if (getSuppressDuplicateOverlappingText() != config.getSuppressDuplicateOverlappingText()) return false;
+ if (getExtractAnnotationText() != config.getExtractAnnotationText()) return false;
+ if (getSortByPosition() != config.getSortByPosition()) return false;
+ if (getExtractAcroFormContent() != config.getExtractAcroFormContent()) return false;
+ if (getExtractInlineImages() != config.getExtractInlineImages()) return false;
+ if (getExtractUniqueInlineImagesOnly() != config.getExtractUniqueInlineImagesOnly()) return false;
+ if (getIfXFAExtractOnlyXFA() != config.getIfXFAExtractOnlyXFA()) return false;
+ if (getOCRDPI() != config.getOCRDPI()) return false;
+ if (isCatchIntermediateIOExceptions() != config.isCatchIntermediateIOExceptions()) return false;
+ if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) return false;
+ if (!getSpacingTolerance().equals(config.getSpacingTolerance())) return false;
+ if (!getOCRStrategy().equals(config.getOCRStrategy())) return false;
+ if (getOCRImageType() != config.getOCRImageType()) return false;
+ if (!getOCRImageFormatName().equals(config.getOCRImageFormatName())) return false;
+ return getAccessChecker().equals(config.getAccessChecker());
+
}
+ @Override
+ public int hashCode() {
+ int result = (getEnableAutoSpace() ? 1 : 0);
+ result = 31 * result + (getSuppressDuplicateOverlappingText() ? 1 : 0);
+ result = 31 * result + (getExtractAnnotationText() ? 1 : 0);
+ result = 31 * result + (getSortByPosition() ? 1 : 0);
+ result = 31 * result + (getExtractAcroFormContent() ? 1 : 0);
+ result = 31 * result + (getExtractInlineImages() ? 1 : 0);
+ result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
+ result = 31 * result + getAverageCharTolerance().hashCode();
+ result = 31 * result + getSpacingTolerance().hashCode();
+ result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
+ result = 31 * result + ocrStrategy.hashCode();
+ result = 31 * result + getOCRDPI();
+ result = 31 * result + getOCRImageType().hashCode();
+ result = 31 * result + getOCRImageFormatName().hashCode();
+ result = 31 * result + getAccessChecker().hashCode();
+ result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0);
+ return result;
+ }
+ @Override
+ public String toString() {
+ return "PDFParserConfig{" +
+ "enableAutoSpace=" + enableAutoSpace +
+ ", suppressDuplicateOverlappingText=" + suppressDuplicateOverlappingText +
+ ", extractAnnotationText=" + extractAnnotationText +
+ ", sortByPosition=" + sortByPosition +
+ ", extractAcroFormContent=" + extractAcroFormContent +
+ ", extractInlineImages=" + extractInlineImages +
+ ", extractUniqueInlineImagesOnly=" + extractUniqueInlineImagesOnly +
+ ", averageCharTolerance=" + averageCharTolerance +
+ ", spacingTolerance=" + spacingTolerance +
+ ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA +
+ ", ocrStrategy=" + ocrStrategy +
+ ", ocrDPI=" + ocrDPI +
+ ", ocrImageType=" + ocrImageType +
+ ", ocrImageFormatName='" + ocrImageFormatName + '\'' +
+ ", accessChecker=" + accessChecker +
+ ", isCatchIntermediateIOExceptions=" + isCatchIntermediateIOExceptions +
+ '}';
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
index 9b404a3..319e693 100644
--- a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
@@ -23,4 +23,12 @@ extractUniqueInlineImagesOnly true
checkExtractAccessPermission false
allowExtractionForAccessibility true
ifXFAExtractOnlyXFA false
-catchIntermediateIOExceptions true
\ No newline at end of file
+catchIntermediateIOExceptions true
+#options: no_ocr, ocr_only, ocr_and_text_extraction
+ocrStrategy no_ocr
+#dots per inch for the ocr rendering of the page image
+ocrDPI 200
+#if you request tif, make sure you have imageio jars on your classpath!
+ocrImageFormatName png
+#options: argb, binary, gray, rgb
+ocrImageType gray
http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 6d07c59..df2e27c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -50,6 +50,8 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
@@ -70,6 +72,16 @@ public class PDFParserTest extends TikaTest {
public static final MediaType TYPE_DOC = MediaType.application("msword");
public static Level PDFBOX_LOG_LEVEL = Level.INFO;
+ private static Boolean hasTesseract = null;
+
+ public static boolean canRunOCR() {
+ if (hasTesseract != null) {
+ return hasTesseract;
+ }
+ hasTesseract = new TesseractOCRParser().hasTesseract(new TesseractOCRConfig());
+ return hasTesseract;
+ }
+
@BeforeClass
public static void setup() {
//remember default logging level, but turn off for PDFParserTest
@@ -1175,6 +1187,32 @@ public class PDFParserTest extends TikaTest {
assertEquals("1425", jpegMetadata.get(Metadata.IMAGE_LENGTH));
}
+ @Test
+ public void testEmbeddedDocsWithOCROnly() throws Exception {
+ if (! canRunOCR()) { return; }
+
+ for (PDFParserConfig.OCR_STRATEGY strategy : PDFParserConfig.OCR_STRATEGY.values()) {
+ PDFParserConfig config = new PDFParserConfig();
+ config.setOCRStrategy(strategy);
+ ParseContext context = new ParseContext();
+ context.set(PDFParserConfig.class, config);
+ context.set(Parser.class, new AutoDetectParser());
+ //make sure everything works with regular xml _and_ with recursive
+ XMLResult xmlResult = getXML("testPDFEmbeddingAndEmbedded.docx", context);
+ assertContains("pdf_haystack", xmlResult.xml);
+ assertContains("Haystack", xmlResult.xml);
+ assertContains("Needle", xmlResult.xml);
+ if (! strategy.equals(PDFParserConfig.OCR_STRATEGY.NO_OCR)) {
+ assertContains("<div class=\"ocr\">pdf_haystack", xmlResult.xml);
+ } else {
+ assertNotContained("<div class=\"ocr\">pdf_haystack", xmlResult.xml);
+ }
+ assertEquals(4, getRecursiveJson("testPDFEmbeddingAndEmbedded.docx", context).size());
+ }
+
+ }
+
+
private void assertException(String path, Parser parser, ParseContext context, Class expected) {
boolean noEx = false;
InputStream is = getResourceAsStream(path);
[2/4] tika git commit: TIKA-1994 -- integrate OCR with PDFParser,
update CHANGES.txt
Posted by ta...@apache.org.
TIKA-1994 -- integrate OCR with PDFParser, update CHANGES.txt
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/1af1078a
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/1af1078a
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/1af1078a
Branch: refs/heads/TIKA-1508
Commit: 1af1078adcb746fced8c71e4afe5b4d008a3f6b8
Parents: 7aeb95d
Author: tballison <ta...@mitre.org>
Authored: Fri Jun 3 11:48:42 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jun 3 11:48:42 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 2 ++
1 file changed, 2 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/1af1078a/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 98e0ce3..08cd8ff 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 1.14 - ???
+ * Integrate TesseractOCR with full page image rendering for PDFs (TIKA-1994).
+
* Add mime detection via Nick C and parser for DBF files (TIKA-1513).
* Add mime detection and parsers for MSOffice 2003 XML Word
[3/4] tika git commit: Merge remote-tracking branch
'origin/TIKA-1508' into TIKA-1508
Posted by ta...@apache.org.
Merge remote-tracking branch 'origin/TIKA-1508' into TIKA-1508
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/49ddf6e3
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/49ddf6e3
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/49ddf6e3
Branch: refs/heads/TIKA-1508
Commit: 49ddf6e39615e8284a7c9e45e76a659725c0ad7a
Parents: 1af1078 1202f45
Author: tballison <ta...@mitre.org>
Authored: Fri Jun 3 15:09:33 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jun 3 15:09:33 2016 -0400
----------------------------------------------------------------------
.../java/org/apache/tika/base/Configurable.java | 44 +++++
.../main/java/org/apache/tika/config/Field.java | 43 +++++
.../main/java/org/apache/tika/config/Param.java | 191 +++++++++++++++++++
.../java/org/apache/tika/config/ParamField.java | 158 +++++++++++++++
.../java/org/apache/tika/config/TikaConfig.java | 51 ++++-
.../tika/exception/TikaConfigException.java | 39 ++++
.../org/apache/tika/parser/AbstractParser.java | 34 +++-
.../apache/tika/parser/ConfigurableParser.java | 32 ++++
.../org/apache/tika/parser/ParseContext.java | 40 ++++
.../java/org/apache/tika/parser/Parser.java | 1 +
.../org/apache/tika/utils/AnnotationUtils.java | 131 +++++++++++++
.../java/org/apache/tika/config/ParamTest.java | 71 +++++++
.../tika/parser/ConfigurableParserTest.java | 76 ++++++++
.../tika/parser/DummyConfigurableParser.java | 67 +++++++
.../tika/parser/DummyParametrizedParser.java | 97 ++++++++++
.../tika/parser/ParametrizedParserTest.java | 67 +++++++
.../apache/tika/utils/AnnotationUtilsTest.java | 190 ++++++++++++++++++
.../tika/config/TIKA-1508-configurable.xml | 37 ++++
.../tika/config/TIKA-1986-parametrized.xml | 37 ++++
19 files changed, 1404 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
[4/4] tika git commit: TIKA-1508 add some unit tests for
ParameterizedParserTest
Posted by ta...@apache.org.
TIKA-1508 add some unit tests for ParameterizedParserTest
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/18ab8f91
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/18ab8f91
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/18ab8f91
Branch: refs/heads/TIKA-1508
Commit: 18ab8f91f19b105270a107174afd64fe81d20f72
Parents: 49ddf6e
Author: tballison <ta...@mitre.org>
Authored: Fri Jun 3 15:55:24 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jun 3 15:55:24 2016 -0400
----------------------------------------------------------------------
.../tika/parser/ConfigurableParserTest.java | 1 -
.../tika/parser/DummyParameterizedParser.java | 113 +++++++++++++++++
.../tika/parser/DummyParametrizedParser.java | 97 --------------
.../tika/parser/ParameterizedParserTest.java | 127 +++++++++++++++++++
.../tika/parser/ParametrizedParserTest.java | 67 ----------
.../tika/config/TIKA-1986-bad-parameters.xml | 26 ++++
.../apache/tika/config/TIKA-1986-bad-types.xml | 26 ++++
.../apache/tika/config/TIKA-1986-bad-values.xml | 26 ++++
.../tika/config/TIKA-1986-parameterized.xml | 38 ++++++
.../tika/config/TIKA-1986-parametrized.xml | 37 ------
.../tika/config/TIKA-1986-some-parameters.xml | 28 ++++
11 files changed, 384 insertions(+), 202 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/18ab8f91/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
index c059626..dcf188d 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
@@ -27,7 +27,6 @@ import java.math.BigInteger;
import java.net.URI;
import java.net.URL;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.Map;
public class ConfigurableParserTest {
http://git-wip-us.apache.org/repos/asf/tika/blob/18ab8f91/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
new file mode 100644
index 0000000..848b774
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.math.BigInteger;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.osgi.util.measurement.Unit.s;
+
+/**
+ * A test parser to test {@link Field}
+ * @since Apache Tika 1.14
+ */
+public class DummyParameterizedParser extends AbstractParser
+ implements ConfigurableParser {
+
+ private static Set<MediaType> MIMES = new HashSet<>();
+ static {
+ MIMES.add(MediaType.TEXT_PLAIN);
+ MIMES.add(MediaType.TEXT_HTML);
+ MIMES.add(MediaType.APPLICATION_XML);
+ MIMES.add(MediaType.OCTET_STREAM);
+ }
+
+ @Field(name = "testparam") private String testParam = "init_string";
+ @Field private short xshort = -2;
+ @Field private int xint = -3;
+ @Field private long xlong = -4;
+ @Field(name = "xbigint") private BigInteger xbigInt;
+ @Field private float xfloat = -5.0f;
+ @Field private double xdouble = -6.0d;
+ @Field private boolean xbool = true;
+ @Field private URL xurl;
+ @Field private URI xuri;
+
+ @Field private String missing = "default";
+
+ private String inner = "inner";
+ private File xfile;
+
+ public DummyParameterizedParser() {
+ try {
+ xurl = new URL("http://tika.apache.org/url");
+ } catch (MalformedURLException e) {
+ throw new IllegalArgumentException(e);
+ }
+ try {
+ xuri = new URI("http://tika.apache.org/uri");
+ } catch (URISyntaxException e) {
+ throw new IllegalArgumentException(e);
+ }
+ }
+ @Field
+ public void setXfile(File xfile){
+ this.xfile = xfile;
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+
+ return MIMES;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ metadata.add("testparam", testParam);
+ metadata.add("xshort", xshort + "");
+ metadata.add("xint", xint + "");
+ metadata.add("xlong", xlong + "");
+ metadata.add("xbigint", xbigInt + "");
+ metadata.add("xfloat", xfloat + "");
+ metadata.add("xdouble", xdouble + "");
+ metadata.add("xbool", xbool + "");
+ metadata.add("xuri", xuri + "");
+ metadata.add("xurl", xurl + "");
+ metadata.add("xfile", xfile + "");
+
+ metadata.add("inner", inner + "");
+ metadata.add("missing", missing + "");
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/18ab8f91/tika-core/src/test/java/org/apache/tika/parser/DummyParametrizedParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyParametrizedParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyParametrizedParser.java
deleted file mode 100644
index 383a80b..0000000
--- a/tika-core/src/test/java/org/apache/tika/parser/DummyParametrizedParser.java
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import org.apache.tika.config.Field;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.math.BigInteger;
-import java.net.URI;
-import java.net.URL;
-import java.util.HashSet;
-import java.util.Set;
-
-/**
- * A test Parsers to test {@link Field}
- * @since Apache Tika 1.14
- */
-public class DummyParametrizedParser extends AbstractParser
- implements ConfigurableParser {
-
- private static Set<MediaType> MIMES = new HashSet<>();
- static {
- MIMES.add(MediaType.TEXT_PLAIN);
- MIMES.add(MediaType.TEXT_HTML);
- MIMES.add(MediaType.APPLICATION_XML);
- MIMES.add(MediaType.OCTET_STREAM);
- }
-
- @Field(name = "testparam") private String testParam;
- @Field private short xshort;
- @Field private int xint;
- @Field private long xlong;
- @Field(name = "xbigint") private BigInteger xbigInt;
- @Field private float xfloat;
- @Field private double xdouble;
- @Field private boolean xbool;
- @Field private URL xurl;
- @Field private URI xuri;
-
- @Field private String missing = "default";
-
- private String inner = "inner";
- private File xfile;
-
- @Field
- public void setXfile(File xfile){
- this.xfile = xfile;
- }
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
-
- return MIMES;
- }
-
- @Override
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
-
- metadata.add("testparam", testParam);
- metadata.add("xshort", xshort + "");
- metadata.add("xint", xint + "");
- metadata.add("xlong", xlong + "");
- metadata.add("xbigint", xbigInt + "");
- metadata.add("xfloat", xfloat + "");
- metadata.add("xdouble", xdouble + "");
- metadata.add("xbool", xbool + "");
- metadata.add("xuri", xuri + "");
- metadata.add("xurl", xurl + "");
- metadata.add("xfile", xfile + "");
-
- metadata.add("inner", inner + "");
- metadata.add("missing", missing + "");
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/18ab8f91/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
new file mode 100644
index 0000000..e0c3b53
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+public class ParameterizedParserTest {
+
+ private static final Map<String, String> expcted = new HashMap<String, String>() {
+ {
+ put("testparam", "testparamval");
+ put("xshort", "1000");
+ put("xint", "999999999");
+ put("xlong", "9999999999999");
+ put("xbigint", "99999999999999999999999999999999999999999999999");
+ put("xfloat", "10.2");
+ put("xbool", "true");
+ put("xdouble", "4.6");
+ put("xurl", "http://apache.org");
+ put("xfile", "somefile");
+ put("xuri", "tika://customuri?param=value");
+
+ put("inner", "inner");
+ put("missing", "default");
+ }
+ };
+
+
+ @Test
+ public void testConfigurableParserTypes() throws Exception {
+ Metadata md = getMetadata("TIKA-1986-parameterized.xml");
+ for (Map.Entry<String, String> entry : expcted.entrySet()) {
+ assertEquals("mismatch for " + entry.getKey(), entry.getValue(), md.get(entry.getKey()));
+ }
+ }
+
+ @Test
+ public void testSomeParams() throws Exception {
+ //test that a parameterized parser can read a config file
+ //with only some changes to the initial values
+ Metadata md = getMetadata("TIKA-1986-some-parameters.xml");
+ assertEquals("-6.0", md.get("xdouble"));
+ assertEquals("testparamval", md.get("testparam"));
+ assertEquals("true", md.get("xbool"));
+ }
+
+ @Test
+ @Ignore("can we get this to work, somehow?")
+ public void testBadParam() throws Exception {
+ try {
+ Metadata m = getMetadata("TIKA-1986-bad-parameters.xml");
+ fail("should have thrown exception");
+ } catch (TikaException e) {
+
+ }
+ }
+
+ @Test
+ public void testBadValue() throws Exception {
+ boolean ex = false;
+ try {
+ Metadata m = getMetadata("TIKA-1986-bad-values.xml");
+ fail("should have thrown exception");
+ } catch (TikaConfigException e) {
+ ex = true;
+ }
+ assertTrue("No TikaConfigException", ex);
+ }
+
+ @Test
+ public void testBadType() throws Exception {
+ //TODO: should this be a TikaConfigException instead of Runtime?
+ boolean ex = false;
+ try {
+ Metadata m = getMetadata("TIKA-1986-bad-types.xml");
+ fail("should have thrown exception");
+ } catch (RuntimeException e) {
+ ex = true;
+ }
+ assertTrue("No RuntimeException", ex);
+ }
+
+ //TODO later -- add a test for a parser that isn't configurable
+ //but that has params in the config file
+
+ private Metadata getMetadata(String name) throws TikaException, IOException, SAXException {
+ URL url = this.getClass().getResource("/org/apache/tika/config/"+name);
+ assertNotNull("couldn't find: "+name, url);
+ TikaConfig tikaConfig = new TikaConfig(url);
+ Tika tika = new Tika(tikaConfig);
+ Metadata metadata = new Metadata();
+ tika.parse(url.openStream(), metadata);
+ return metadata;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/18ab8f91/tika-core/src/test/java/org/apache/tika/parser/ParametrizedParserTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParametrizedParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParametrizedParserTest.java
deleted file mode 100644
index 290f819..0000000
--- a/tika-core/src/test/java/org/apache/tika/parser/ParametrizedParserTest.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-import org.junit.Assert;
-import org.junit.Test;
-
-import java.net.URL;
-import java.util.HashMap;
-import java.util.Map;
-
-public class ParametrizedParserTest {
-
- private static final String TIKA_CFG_FILE = "org/apache/tika/config/TIKA-1986-parametrized.xml";
- private static final Map<String, String> expcted = new HashMap<String, String>() {
- {
- put("testparam", "testparamval");
- put("xshort", "1000");
- put("xint", "999999999");
- put("xlong", "9999999999999");
- put("xbigint", "99999999999999999999999999999999999999999999999");
- put("xfloat", "10.2");
- put("xbool", "true");
- put("xdouble", "4.6");
- put("xurl", "http://apache.org");
- put("xfile", "/");
- put("xuri", "tika://customuri?param=value");
-
- put("inner", "inner");
- put("missing", "default");
- }
- };
-
-
- @Test
- public void testConfigurableParserTypes() throws Exception {
- URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE);
- assert configFileUrl != null;
- TikaConfig config = new TikaConfig(configFileUrl);
- Tika tika = new Tika(config);
- Metadata md = new Metadata();
- tika.parse(configFileUrl.openStream(), md);
-
- for (Map.Entry<String, String> entry : expcted.entrySet()) {
- Assert.assertEquals("mismatch for " + entry.getKey(), entry.getValue(), md.get(entry.getKey()));
- }
- }
-
-
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/18ab8f91/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-bad-parameters.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-bad-parameters.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-bad-parameters.xml
new file mode 100644
index 0000000..bc83cb5
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-bad-parameters.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DummyParameterizedParser">
+ <params>
+ <param name="zippidity_doo_dah" type="string">testparamval</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
http://git-wip-us.apache.org/repos/asf/tika/blob/18ab8f91/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-bad-types.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-bad-types.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-bad-types.xml
new file mode 100644
index 0000000..690c902
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-bad-types.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DummyParameterizedParser">
+ <params>
+ <param name="xint" type="zippity">When in the course of human events</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
http://git-wip-us.apache.org/repos/asf/tika/blob/18ab8f91/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-bad-values.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-bad-values.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-bad-values.xml
new file mode 100644
index 0000000..5219142
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-bad-values.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DummyParameterizedParser">
+ <params>
+ <param name="xint" type="int">When in the course of human events</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
http://git-wip-us.apache.org/repos/asf/tika/blob/18ab8f91/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-parameterized.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-parameterized.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-parameterized.xml
new file mode 100644
index 0000000..1f3d46c
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-parameterized.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DummyParameterizedParser">
+ <params>
+ <param name="testparam" type="string">testparamval</param>
+ <param name="xshort" type="short">1000</param>
+ <param name="xint" type="int">999999999</param>
+ <param name="xlong" type="long">9999999999999</param>
+ <param name="xbigint" type="bigint">99999999999999999999999999999999999999999999999</param>
+ <param name="xfloat" type="float">10.2</param>
+ <param name="xbool" type="bool">true</param>
+ <param name="xdouble" type="double">4.6</param>
+ <param name="xurl" type="url">http://apache.org</param>
+ <!-- For cross-platform testing, the xfile value must not contain a '/' path separator. -->
+ <param name="xfile" type="file">somefile</param>
+ <param name="xuri" type="uri">tika://customuri?param=value</param>
+ </params>
+ </parser>
+
+ </parsers>
+</properties>
http://git-wip-us.apache.org/repos/asf/tika/blob/18ab8f91/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-parametrized.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-parametrized.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-parametrized.xml
deleted file mode 100644
index 6689a19..0000000
--- a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-parametrized.xml
+++ /dev/null
@@ -1,37 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DummyParametrizedParser">
- <params>
- <param name="testparam" type="string">testparamval</param>
- <param name="xshort" type="short">1000</param>
- <param name="xint" type="int">999999999</param>
- <param name="xlong" type="long">9999999999999</param>
- <param name="xbigint" type="bigint">99999999999999999999999999999999999999999999999</param>
- <param name="xfloat" type="float">10.2</param>
- <param name="xbool" type="bool">true</param>
- <param name="xdouble" type="double">4.6</param>
- <param name="xurl" type="url">http://apache.org</param>
- <param name="xfile" type="file">/</param>
- <param name="xuri" type="uri">tika://customuri?param=value</param>
- </params>
- </parser>
-
- </parsers>
-</properties>
http://git-wip-us.apache.org/repos/asf/tika/blob/18ab8f91/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-some-parameters.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-some-parameters.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-some-parameters.xml
new file mode 100644
index 0000000..250d439
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-some-parameters.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DummyParameterizedParser">
+ <params>
+ <param name="testparam" type="string">testparamval</param>
+ <param name="testbool" type="bool">false</param>
+ </params>
+ </parser>
+
+ </parsers>
+</properties>