You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/01 00:38:57 UTC
[3/4] tika git commit: Merge branch 'pdf_javascript'
Merge branch 'pdf_javascript'
# Conflicts:
# tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
# tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
# tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0e0f30d9
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0e0f30d9
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0e0f30d9
Branch: refs/heads/master
Commit: 0e0f30d9bb7352bf08c884f8646619ca215a6907
Parents: 09931fe 4dd6fd1
Author: tballison <ta...@mitre.org>
Authored: Wed Nov 30 19:33:31 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Nov 30 19:33:31 2016 -0500
----------------------------------------------------------------------
.../main/java/org/apache/tika/metadata/PDF.java | 6 +
.../tika/parser/pdf/AbstractPDF2XHTML.java | 244 ++++++++++++++++---
.../org/apache/tika/parser/pdf/PDFParser.java | 1 -
.../apache/tika/parser/pdf/PDFParserConfig.java | 26 ++
.../apache/tika/parser/pdf/PDFParserTest.java | 4 +-
5 files changed, 238 insertions(+), 43 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/0e0f30d9/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --cc tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 50afc11,cb7c673..cd22895
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@@ -127,14 -166,23 +168,14 @@@ class AbstractPDF2XHTML extends PDFText
writeParagraphStart();
}
- EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
- EmbeddedDocumentExtractor extractor =
- context.get(EmbeddedDocumentExtractor.class);
- if (extractor == null) {
- extractor = new ParsingEmbeddedDocumentExtractor(context);
- }
- return extractor;
- }
-
private void extractEmbeddedDocuments(PDDocument document)
throws IOException, SAXException, TikaException {
- PDDocumentNameDictionary namesDictionary =
- new PDDocumentNameDictionary(document.getDocumentCatalog());
- PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
- if (efTree == null) {
- return;
- }
+ PDDocumentNameDictionary namesDictionary =
+ new PDDocumentNameDictionary(document.getDocumentCatalog());
+ PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
+ if (efTree == null) {
+ return;
+ }
Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames();
//For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
@@@ -192,35 -259,26 +252,26 @@@
return;
}
- fileName = (fileName == null) ? displayName : fileName;
+ fileName = (fileName == null || "".equals(fileName.trim())) ? unicodeFileName : fileName;
+ fileName = (fileName == null || "".equals(fileName.trim())) ? displayName : fileName;
// TODO: other metadata?
- Metadata embeddedMetadata = new Metadata();
- embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
- embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
- embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
- embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+ metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+ metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+ metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
- embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
-
- if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+ metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
+ if (extractor.shouldParseEmbedded(metadata)) {
TikaInputStream stream = null;
try {
-
- InputStream rawStream = null;
- try {
- rawStream = file.createInputStream();
- } catch (IOException e) {
- EmbeddedDocumentUtil.recordException(e, metadata);
- return;
- }
- stream = TikaInputStream.get(rawStream);
- embeddedDocumentExtractor.parseEmbedded(
+ stream = TikaInputStream.get(file.createInputStream());
+ extractor.parseEmbedded(
stream,
new EmbeddedContentHandler(xhtml),
- metadata, false);
+ embeddedMetadata, false);
- AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
attributes.addAttribute("", "id", "id", "CDATA", fileName);
xhtml.startElement("div", attributes);
@@@ -290,6 -348,6 +341,7 @@@
protected void endPage(PDPage page) throws IOException {
try {
++ EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
for (PDAnnotation annotation : page.getAnnotations()) {
if (annotation instanceof PDAnnotationFileAttachment) {
http://git-wip-us.apache.org/repos/asf/tika/blob/0e0f30d9/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/0e0f30d9/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
----------------------------------------------------------------------
diff --cc tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index e9eb6e5,567a4fc..ce98e0d
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@@ -125,8 -126,10 +125,10 @@@ public class PDFParserConfig implement
//with a streams. If this is set to true, Tika's
//parser catches these exceptions, reports them in the metadata
//and then throws the first stored exception after the parse has completed.
- private boolean isCatchIntermediateIOExceptions = true;
+ private boolean catchIntermediateIOExceptions = true;
+ private boolean extractActions = false;
+
public PDFParserConfig() {
init(this.getClass().getResourceAsStream("PDFParser.properties"));
}
@@@ -654,7 -672,8 +679,8 @@@
", ocrImageType=" + ocrImageType +
", ocrImageFormatName='" + ocrImageFormatName + '\'' +
", accessChecker=" + accessChecker +
- ", isCatchIntermediateIOExceptions=" + isCatchIntermediateIOExceptions +
+ ", extractActions=" + extractActions +
+ ", catchIntermediateIOExceptions=" + catchIntermediateIOExceptions +
'}';
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/0e0f30d9/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------