You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/01 00:38:57 UTC

[3/4] tika git commit: Merge branch 'pdf_javascript'

Merge branch 'pdf_javascript'

# Conflicts:
#	tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
#	tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
#	tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0e0f30d9
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0e0f30d9
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0e0f30d9

Branch: refs/heads/master
Commit: 0e0f30d9bb7352bf08c884f8646619ca215a6907
Parents: 09931fe 4dd6fd1
Author: tballison <ta...@mitre.org>
Authored: Wed Nov 30 19:33:31 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Nov 30 19:33:31 2016 -0500

----------------------------------------------------------------------
 .../main/java/org/apache/tika/metadata/PDF.java |   6 +
 .../tika/parser/pdf/AbstractPDF2XHTML.java      | 244 ++++++++++++++++---
 .../org/apache/tika/parser/pdf/PDFParser.java   |   1 -
 .../apache/tika/parser/pdf/PDFParserConfig.java |  26 ++
 .../apache/tika/parser/pdf/PDFParserTest.java   |   4 +-
 5 files changed, 238 insertions(+), 43 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/0e0f30d9/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --cc tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 50afc11,cb7c673..cd22895
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@@ -127,14 -166,23 +168,14 @@@ class AbstractPDF2XHTML extends PDFText
          writeParagraphStart();
      }
  
 -    EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
 -        EmbeddedDocumentExtractor extractor =
 -                context.get(EmbeddedDocumentExtractor.class);
 -        if (extractor == null) {
 -            extractor = new ParsingEmbeddedDocumentExtractor(context);
 -        }
 -        return extractor;
 -    }
 -
      private void extractEmbeddedDocuments(PDDocument document)
              throws IOException, SAXException, TikaException {
-         PDDocumentNameDictionary namesDictionary =
-                 new PDDocumentNameDictionary(document.getDocumentCatalog());
-         PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
-         if (efTree == null) {
-             return;
-         }
+             PDDocumentNameDictionary namesDictionary =
+                     new PDDocumentNameDictionary(document.getDocumentCatalog());
+             PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
+             if (efTree == null) {
+                 return;
+             }
  
          Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames();
          //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
@@@ -192,35 -259,26 +252,26 @@@
              return;
          }
          
-         fileName = (fileName == null) ? displayName : fileName;
+         fileName = (fileName == null || "".equals(fileName.trim())) ? unicodeFileName : fileName;
+         fileName = (fileName == null || "".equals(fileName.trim())) ? displayName : fileName;
  
          // TODO: other metadata?
-         Metadata embeddedMetadata = new Metadata();
-         embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
-         embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
-         embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
-         embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+         Metadata metadata = new Metadata();
+         metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+         metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+         metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+         metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                  TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
-         embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
- 
-         if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+         metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
+         if (extractor.shouldParseEmbedded(metadata)) {
              TikaInputStream stream = null;
              try {
- 
-                 InputStream rawStream = null;
-                 try {
-                     rawStream = file.createInputStream();
-                 } catch (IOException e) {
-                     EmbeddedDocumentUtil.recordException(e, metadata);
-                     return;
-                 }
-                 stream = TikaInputStream.get(rawStream);
-                 embeddedDocumentExtractor.parseEmbedded(
+                 stream = TikaInputStream.get(file.createInputStream());
+                 extractor.parseEmbedded(
                          stream,
                          new EmbeddedContentHandler(xhtml),
 -                        metadata, false);
 +                        embeddedMetadata, false);
  
-                 AttributesImpl attributes = new AttributesImpl();
                  attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                  attributes.addAttribute("", "id", "id", "CDATA", fileName);
                  xhtml.startElement("div", attributes);
@@@ -290,6 -348,6 +341,7 @@@
      protected void endPage(PDPage page) throws IOException {
  
          try {
++            EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
              for (PDAnnotation annotation : page.getAnnotations()) {
  
                  if (annotation instanceof PDAnnotationFileAttachment) {

http://git-wip-us.apache.org/repos/asf/tika/blob/0e0f30d9/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/tika/blob/0e0f30d9/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
----------------------------------------------------------------------
diff --cc tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index e9eb6e5,567a4fc..ce98e0d
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@@ -125,8 -126,10 +125,10 @@@ public class PDFParserConfig implement
      //with a streams.  If this is set to true, Tika's
      //parser catches these exceptions, reports them in the metadata
      //and then throws the first stored exception after the parse has completed.
 -    private boolean isCatchIntermediateIOExceptions = true;
 +    private boolean catchIntermediateIOExceptions = true;
  
+     private boolean extractActions = false;
+ 
      public PDFParserConfig() {
          init(this.getClass().getResourceAsStream("PDFParser.properties"));
      }
@@@ -654,7 -672,8 +679,8 @@@
                  ", ocrImageType=" + ocrImageType +
                  ", ocrImageFormatName='" + ocrImageFormatName + '\'' +
                  ", accessChecker=" + accessChecker +
 -                ", isCatchIntermediateIOExceptions=" + isCatchIntermediateIOExceptions +
+                 ", extractActions=" + extractActions +
 +                ", catchIntermediateIOExceptions=" + catchIntermediateIOExceptions +
                  '}';
      }
  }

http://git-wip-us.apache.org/repos/asf/tika/blob/0e0f30d9/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------