You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/08/18 14:37:55 UTC
[tika] branch branch_2x updated: Added OCR and PDF modules
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_2x by this push:
new 4b25b3f Added OCR and PDF modules
4b25b3f is described below
commit 4b25b3f6c123e45b132cad3f1bc793e9600474bd
Author: tallison <ta...@apache.org>
AuthorDate: Tue Aug 18 10:37:27 2020 -0400
Added OCR and PDF modules
---
CHANGES.txt | 2 +
pom.xml | 1 -
tika-core/pom.xml | 5 +-
tika-parser-integration-tests/pom.xml | 12 +
.../parser/tests}/ocr/TesseractOCRParserTest.java | 22 +-
.../tika/parser/tests/pdf/PDFParserTest.java | 227 ++++++++++++++++++
tika-parser-modules/pom.xml | 7 +
.../tika-parser-audiovideo-module/pom.xml | 23 +-
.../tika-parser-html-module/pom.xml | 23 +-
.../tika-parser-image-module/pom.xml | 31 +++
.../org/apache/tika/parser/image/BPGParser.java | 177 ++++++++++++++
.../org/apache/tika/parser/image/ICNSType.java | 170 ++++++++++++++
.../org/apache/tika/parser/image/ImageParser.java | 219 +++++++++++++++++
.../org/apache/tika/parser/image/PSDParser.java | 259 +++++++++++++++++++++
.../test/resources/test-documents/testHEIF.heic | Bin 0 -> 13706 bytes
.../test/resources/test-documents/testJBIG2.jb2 | Bin 0 -> 346 bytes
.../test-documents/testJPEG_commented.jpg | Bin 13325 -> 0 bytes
.../src/test/resources/test-documents/testPNG.png | Bin 0 -> 17041 bytes
.../tika-parser-microsoft-module/pom.xml | 23 +-
.../pom.xml | 29 +--
.../apache/tika/parser/ocr/TesseractOCRConfig.java | 0
.../apache/tika/parser/ocr/TesseractOCRParser.java | 8 +-
.../services/org.apache.tika.parser.Parser | 23 +-
.../tika/parser/ocr/TesseractOCRConfig.properties | 0
.../org/apache/tika/parser/ocr/rotation.py | 0
.../test-properties/StringsConfig-full.properties | 0
.../StringsConfig-partial.properties | 0
.../TesseractOCRConfig-full.properties | 0
.../TesseractOCRConfig-partial.properties | 0
.../tika/parser/ocr/TesseractOCRConfigTest.java | 0
tika-parser-modules/tika-parser-pdf-module/pom.xml | 131 +++++++++++
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 5 +-
.../org/apache/tika/parser/pdf/AccessChecker.java | 0
.../tika/parser/pdf/ImageGraphicsEngine.java | 0
.../java/org/apache/tika/parser/pdf/OCR2XHTML.java | 2 +-
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 0
.../tika/parser/pdf/PDFEncodedStringDecoder.java | 0
.../tika/parser/pdf/PDFMarkedContent2XHTML.java | 5 +-
.../java/org/apache/tika/parser/pdf/PDFParser.java | 0
.../apache/tika/parser/pdf/PDFParserConfig.java | 0
.../apache/tika/parser/pdf/PDFPreflightParser.java | 4 +-
.../tika/parser/pdf/PDMetadataExtractor.java | 7 +-
.../org/apache/tika/parser/pdf/XFAExtractor.java | 0
.../services/org.apache.tika.parser.Parser | 23 +-
.../apache/tika/parser/pdf/PDFParser.properties | 0
.../apache/tika/parser/pdf/AccessCheckerTest.java | 0
.../parser/pdf/PDFMarkedContent2XHTMLTest.java | 0
.../org/apache/tika/parser/pdf/PDFParserTest.java | 187 ---------------
.../tika/parser/pdf/PDFPreflightParserTest.java | 0
.../tika/parser/pdf/tika-config-non-primitives.xml | 0
.../org/apache/tika/parser/pdf/tika-config.xml | 0
.../apache/tika/parser/pdf/tika-inline-config.xml | 0
.../org/apache/tika/parser/pdf/tika-ocr-config.xml | 0
.../tika/parser/pdf/tika-preflight-config.xml | 0
.../tika/parser/pdf/tika-xml-profiler-config.xml | 0
.../resources/test-documents/testAnnotations.pdf | Bin
.../resources/test-documents/testExtraSpaces.pdf | Bin
.../resources/test-documents/testJournalParser.pdf | Bin
.../src/test/resources/test-documents/testOCR.pdf | Bin
.../test-documents/testOptionalHyphen.pdf | Bin
.../test-documents/testOverlappingText.pdf | Bin
.../test-documents/testPDF-custommetadata.pdf | Bin
.../src/test/resources/test-documents/testPDF.pdf | Bin
.../testPDFEmbeddingAndEmbedded.docx | Bin
.../test-documents/testPDFFileEmbInAnnotation.pdf | Bin
.../testPDFFileEmbInAnnotation_noContents.pdf | 0
.../resources/test-documents/testPDFPackage.pdf | Bin
.../test-documents/testPDFTripleLangTitle.pdf | Bin
.../test-documents/testPDFTwoTextBoxes.pdf | Bin
.../resources/test-documents/testPDFVarious.pdf | Bin
.../resources/test-documents/testPDF_JBIG2.pdf | Bin
.../testPDF_PDFEncodedStringInXMP.pdf | Bin
.../test-documents/testPDF_Version.10.x.pdf | Bin
.../testPDF_Version.11.x.PDFA-1b.pdf | Bin
.../test-documents/testPDF_Version.4.x.pdf | Bin
.../test-documents/testPDF_Version.5.x.pdf | Bin
.../test-documents/testPDF_Version.6.x.pdf | Bin
.../test-documents/testPDF_Version.7.x.pdf | Bin
.../test-documents/testPDF_Version.8.x.pdf | Bin
.../test-documents/testPDF_Version.9.x.pdf | Bin
.../test-documents/testPDF_XFA_govdocs1_258578.pdf | Bin
.../test-documents/testPDF_XMPBasicSchema.pdf | Bin
.../resources/test-documents/testPDF_acroform3.pdf | Bin
.../resources/test-documents/testPDF_angles.pdf | Bin
.../test-documents/testPDF_bad_page_303226.pdf | Bin
.../test/resources/test-documents/testPDF_bom.pdf | Bin
.../resources/test-documents/testPDF_bookmarks.pdf | Bin
.../test-documents/testPDF_childAttachments.pdf | Bin
.../test-documents/testPDF_diffTitles.pdf | 0
.../test-documents/testPDF_multiFormatEmbFiles.pdf | Bin
...PDF_no_extract_no_accessibility_owner_empty.pdf | 0
...tPDF_no_extract_no_accessibility_owner_user.pdf | 24 +-
...DF_no_extract_yes_accessibility_owner_empty.pdf | 24 +-
...PDF_no_extract_yes_accessibility_owner_user.pdf | 24 +-
.../resources/test-documents/testPDF_protected.pdf | Bin
.../test-documents/testPDF_twoAuthors.pdf | Bin
.../resources/test-documents/testPageNumber.pdf | Bin
.../test-documents/testPopupAnnotation.pdf | Bin
.../test-documents/testStandardsExtractor.pdf | Bin
tika-parser-modules/tika-parser-pkg-module/pom.xml | 22 ++
tika-parser-modules/tika-parser-xml-module/pom.xml | 23 +-
.../tika-parser-xmp-commons/pom.xml | 22 ++
.../testJPEG_commented_pspcs2mac.jpg | Bin 0 -> 26173 bytes
.../testJPEG_commented_xnviewmp026.jpg | Bin
.../tika-parser-zip-commons/pom.xml | 23 +-
tika-parsers/pom.xml | 51 ----
106 files changed, 1466 insertions(+), 372 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 7643b6d..9172f04 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,6 +1,8 @@
Release 2.0.0 - ???
BREAKING CHANGES in 2.0.0
+ * RTFParser was moved to org.apache.tika.parser.microsoft.rtf
+
* Remove deprecated Metadata keys/properties (TIKA-1974).
* Removed dangerous calls to read an inputstream or convert to bytes without specifying a charset
diff --git a/pom.xml b/pom.xml
index eec4d3e..bf64e5a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -37,7 +37,6 @@
<modules>
<module>tika-parent</module>
<module>tika-core</module>
- <module>tika-parsers</module>
<module>tika-parser-modules</module>
<module>tika-parser-integration-tests</module>
<module>tika-bundle</module>
diff --git a/tika-core/pom.xml b/tika-core/pom.xml
index a49f3b9..ad474b1 100644
--- a/tika-core/pom.xml
+++ b/tika-core/pom.xml
@@ -129,6 +129,7 @@
</execution>
</executions>
</plugin>
+ <!-- no harm, no foul for 2.0.0 :D TODO: turn this back on after the 2.0.0 release
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>clirr-maven-plugin</artifactId>
@@ -159,14 +160,14 @@
<comparisonArtifact>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
- <version>1.0</version>
+ <version>2.0.0-SNAPSHOT</version>
<type>jar</type>
</comparisonArtifact>
</comparisonArtifacts>
</configuration>
</execution>
</executions>
- </plugin>
+ </plugin> -->
<plugin>
<artifactId>maven-failsafe-plugin</artifactId>
<version>2.22.1</version>
diff --git a/tika-parser-integration-tests/pom.xml b/tika-parser-integration-tests/pom.xml
index fde1038..04cebc6 100644
--- a/tika-parser-integration-tests/pom.xml
+++ b/tika-parser-integration-tests/pom.xml
@@ -52,6 +52,18 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-ocr-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-pdf-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
<artifactId>tika-parser-pkg-module</artifactId>
<version>${project.version}</version>
<scope>test</scope>
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/ocr/TesseractOCRParserTest.java
similarity index 94%
rename from tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
rename to tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/ocr/TesseractOCRParserTest.java
index 93fee35..27900bf 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/ocr/TesseractOCRParserTest.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.ocr;
+package org.apache.tika.parser.tests.ocr;
import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg;
import static org.junit.Assert.assertEquals;
@@ -32,19 +32,19 @@ import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.image.ImageParser;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.junit.Assert;
import org.junit.Assume;
import org.junit.Test;
-import org.xml.sax.helpers.DefaultHandler;
public class TesseractOCRParserTest extends TikaTest {
@@ -55,7 +55,7 @@ public class TesseractOCRParserTest extends TikaTest {
}
private boolean canRun(TesseractOCRConfig config) {
- String[] checkCmd = {config.getTesseractPath() + getTesseractProg()};
+ String[] checkCmd = {config.getTesseractPath() + TesseractOCRParser.getTesseractProg()};
// If Tesseract is not on the path, do not run the test.
return ExternalParser.check(checkCmd);
}
@@ -78,7 +78,7 @@ public class TesseractOCRParserTest extends TikaTest {
parseContext.set(TesseractOCRConfig.class, invalidConfig);
// No types offered
- assertEquals(0, parser.getSupportedTypes(parseContext).size());
+ Assert.assertEquals(0, parser.getSupportedTypes(parseContext).size());
// And DefaultParser won't use us
assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
@@ -98,7 +98,7 @@ public class TesseractOCRParserTest extends TikaTest {
// Assuming that Tesseract is on the path, we should find 5 Parsers that support PNG.
assumeTrue("can run OCR", canRun());
- assertEquals(8, parser.getSupportedTypes(parseContext).size());
+ Assert.assertEquals(8, parser.getSupportedTypes(parseContext).size());
assertTrue(parser.getSupportedTypes(parseContext).contains(png));
// DefaultParser will now select the TesseractOCRParser.
@@ -313,10 +313,10 @@ public class TesseractOCRParserTest extends TikaTest {
assertNotNull(tesseractOCRParser);
TesseractOCRConfig tesseractOCRConfig = ((TesseractOCRParser)tesseractOCRParser).getDefaultConfig();
- assertEquals(241, tesseractOCRConfig.getTimeout());
- assertEquals(TesseractOCRConfig.OUTPUT_TYPE.HOCR, tesseractOCRConfig.getOutputType());
- assertEquals("ceb", tesseractOCRConfig.getLanguage());
- assertEquals(false, tesseractOCRConfig.getApplyRotation());
+ Assert.assertEquals(241, tesseractOCRConfig.getTimeout());
+ Assert.assertEquals(TesseractOCRConfig.OUTPUT_TYPE.HOCR, tesseractOCRConfig.getOutputType());
+ Assert.assertEquals("ceb", tesseractOCRConfig.getLanguage());
+ Assert.assertEquals(false, tesseractOCRConfig.getApplyRotation());
assertContains("myspecial", tesseractOCRConfig.getTesseractPath());
}
diff --git a/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pdf/PDFParserTest.java b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pdf/PDFParserTest.java
new file mode 100644
index 0000000..83a8201
--- /dev/null
+++ b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pdf/PDFParserTest.java
@@ -0,0 +1,227 @@
+package org.apache.tika.parser.tests.pdf;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.parser.xml.XMLProfiler;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.junit.Test;
+
+import java.io.InputStream;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+import static org.junit.Assume.assumeTrue;
+
+/**
+ * Integration tests for PDF parsing that need parsers from more than one
+ * parser module on the classpath (embedded Office documents, images, the
+ * XML profiler and, where available, OCR).
+ */
+public class PDFParserTest extends TikaTest {
+
+    public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN;
+    public static final MediaType TYPE_EMF = MediaType.image("emf");
+    public static final MediaType TYPE_PDF = MediaType.application("pdf");
+    public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
+    public static final MediaType TYPE_DOC = MediaType.application("msword");
+
+
+    /**
+     * The XML profiler must stay silent by default and must report on both
+     * the XMP packet and the XFA stream once it is enabled through the config.
+     */
+    @Test
+    public void testXMLProfiler() throws Exception {
+        //test that the xml profiler is not triggered by default
+        List<Metadata> metadataList = getRecursiveMetadata("testPDF_XFA_govdocs1_258578.pdf");
+        assertEquals(1, metadataList.size());
+
+        //test that it is triggered when added to the default parser
+        //via the config, tesseract should skip this file because it is too large
+        TikaConfig tikaConfig;
+        // close the config stream once the config has been built from it
+        try (InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml")) {
+            assertNotNull(is);
+            tikaConfig = new TikaConfig(is);
+        }
+        Parser p = new AutoDetectParser(tikaConfig);
+
+        metadataList = getRecursiveMetadata("testPDF_XFA_govdocs1_258578.pdf", p);
+        assertEquals(3, metadataList.size());
+
+        int xmlProfilers = 0;
+        String profilerName = XMLProfiler.class.getCanonicalName();
+        for (Metadata metadata : metadataList) {
+            for (String parsedBy : metadata.getValues("X-Parsed-By")) {
+                if (parsedBy.equals(profilerName)) {
+                    xmlProfilers++;
+                }
+            }
+        }
+
+        assertEquals(2, xmlProfilers);
+
+        //check xmp first
+        String[] uris = metadataList.get(1).getValues(XMLProfiler.ENTITY_URIS);
+        String[] localNames = metadataList.get(1).getValues(XMLProfiler.ENTITY_LOCAL_NAMES);
+        assertEquals(8, uris.length);
+        assertEquals(uris.length, localNames.length);
+        assertEquals("adobe:ns:meta/", uris[0]);
+        assertEquals("CreateDate CreatorTool MetadataDate ModifyDate Thumbnails", localNames[2]);
+        assertEquals("x:xmpmeta", metadataList.get(1).get(XMLProfiler.ROOT_ENTITY));
+
+        //check xfa
+        uris = metadataList.get(2).getValues(XMLProfiler.ENTITY_URIS);
+        localNames = metadataList.get(2).getValues(XMLProfiler.ENTITY_LOCAL_NAMES);
+        assertEquals(8, uris.length);
+        assertEquals(uris.length, localNames.length);
+        assertEquals("http://ns.adobe.com/xdp/", uris[1]);
+        assertEquals("field form instanceManager subform value", localNames[5]);
+        assertEquals("xdp:xdp", metadataList.get(2).get(XMLProfiler.ROOT_ENTITY));
+    }
+
+    @Test //TIKA-1374
+    public void testOSSpecificEmbeddedFileExtraction() throws Exception {
+        List<Metadata> metadatas = getRecursiveMetadata("testPDF_multiFormatEmbFiles.pdf");
+        assertEquals("metadata size", 5, metadatas.size());
+
+        assertEquals("file name", "Test.txt", metadatas.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertContains("os specific", metadatas.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+        assertEquals("file name", "TestMac.txt", metadatas.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertContains("mac embedded", metadatas.get(2).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+        assertEquals("file name", "TestDos.txt", metadatas.get(3).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertContains("dos embedded", metadatas.get(3).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+        assertEquals("file name", "TestUnix.txt", metadatas.get(4).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertContains("unix embedded", metadatas.get(4).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+
+    }
+
+    //TIKA-1124
+    @Test
+    public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
+        /* format of test doc:
+         docx/
+            pdf/
+               docx
+        */
+
+        String content = getXML("testPDFEmbeddingAndEmbedded.docx").xml;
+        int outerHaystack = content.indexOf("Outer_haystack");
+        int pdfHaystack = content.indexOf("pdf_haystack");
+        int needle = content.indexOf("Needle");
+        assertTrue(outerHaystack > -1);
+        assertTrue(pdfHaystack > -1);
+        assertTrue(needle > -1);
+        // the text must come out in nesting order: outer docx, then pdf, then inner docx
+        assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack);
+
+        TrackingHandler tracker = new TrackingHandler();
+
+        ContainerExtractor ex = new ParserContainerExtractor();
+        try (TikaInputStream tis =
+                     TikaInputStream.get(getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"))) {
+            ex.extract(tis, ex, tracker);
+        }
+
+        assertEquals(3, tracker.filenames.size());
+        assertEquals(3, tracker.mediaTypes.size());
+        assertEquals("image1.emf", tracker.filenames.get(0));
+        assertNull(tracker.filenames.get(1));
+        assertEquals("Test.docx", tracker.filenames.get(2));
+        assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
+        assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));
+        assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
+    }
+
+    @Test // TIKA-1228, TIKA-1268
+    public void testEmbeddedFilesInChildren() throws Exception {
+        String xml = getXML("testPDF_childAttachments.pdf").xml;
+        //"regressiveness" exists only in Unit10.doc not in the container pdf document
+        assertTrue(xml.contains("regressiveness"));
+
+        RecursiveParserWrapper p = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractInlineImages(true);
+        config.setExtractUniqueInlineImagesOnly(false);
+        context.set(PDFParserConfig.class, config);
+        context.set(Parser.class, p);
+
+        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
+        try (TikaInputStream tis = TikaInputStream.get(
+                getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"))) {
+            p.parse(tis, handler, new Metadata(), context);
+        }
+
+        List<Metadata> metadatas = handler.getMetadataList();
+
+        assertEquals(5, metadatas.size());
+        assertNull(metadatas.get(0).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertEquals("image0.jpg", metadatas.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertEquals("Press Quality(1).joboptions", metadatas.get(3).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertEquals("Unit10.doc", metadatas.get(4).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertEquals(MediaType.image("jpeg").toString(), metadatas.get(1).get(Metadata.CONTENT_TYPE));
+        assertEquals(MediaType.image("tiff").toString(), metadatas.get(2).get(Metadata.CONTENT_TYPE));
+        assertEquals("text/plain; charset=ISO-8859-1", metadatas.get(3).get(Metadata.CONTENT_TYPE));
+        assertEquals(TYPE_DOC.toString(), metadatas.get(4).get(Metadata.CONTENT_TYPE));
+    }
+
+    /**
+     * Runs the embedded-document extraction once per OCR strategy. Skipped
+     * unless {@code canRunOCR()} reports that OCR is available.
+     */
+    @Test
+    public void testEmbeddedDocsWithOCROnly() throws Exception {
+        assumeTrue("can run OCR", canRunOCR());
+
+        for (PDFParserConfig.OCR_STRATEGY strategy : PDFParserConfig.OCR_STRATEGY.values()) {
+            PDFParserConfig config = new PDFParserConfig();
+            config.setOcrStrategy(strategy);
+            ParseContext context = new ParseContext();
+            context.set(PDFParserConfig.class, config);
+            //make sure everything works with regular xml _and_ with recursive
+            XMLResult xmlResult = getXML("testPDFEmbeddingAndEmbedded.docx", context);
+            String xml = xmlResult.xml;
+            //can get dehaystack depending on version of tesseract and/or preprocessing
+            boolean foundHaystack = xml.contains("pdf_haystack") || xml.contains("dehaystack");
+            if (!foundHaystack) {
+                fail("couldn't find pdf_haystack or its variants");
+            }
+            assertContains("Haystack", xml);
+            assertContains("Needle", xml);
+            if (strategy != PDFParserConfig.OCR_STRATEGY.NO_OCR) {
+                // Tesseract may see the t in haystack as a ! some times...
+                //or it might see dehayslack...
+                //TODO: figure out how to make this test less hacky
+                String div = "<div class=\"ocr\">";
+                boolean foundOcrVariant = xml.contains(div + "pdf_hays!ack")
+                        || xml.contains(div + "pdf_haystack")
+                        || xml.contains(div + "dehayslack");
+                if (!foundOcrVariant) {
+                    fail("couldn't find acceptable variants of haystack");
+                }
+            } else {
+                assertNotContained("<div class=\"ocr\">pdf_haystack", xml);
+            }
+            assertEquals(4, getRecursiveMetadata("testPDFEmbeddingAndEmbedded.docx", context).size());
+        }
+
+    }
+
+
+    @Test
+    public void testFileInAnnotationExtractedIfNoContents() throws Exception {
+        //TIKA-2845
+        List<Metadata> contents = getRecursiveMetadata("testPDFFileEmbInAnnotation_noContents.pdf");
+        assertEquals(2, contents.size());
+        assertContains("This is a Excel", contents.get(1).get(RecursiveParserWrapperHandler.TIKA_CONTENT));
+    }
+
+    @Test
+    public void testEmbeddedFilesInAnnotations() throws Exception {
+        // no leading slash: every other lookup in this class passes the bare file name
+        String xml = getXML("testPDFFileEmbInAnnotation.pdf").xml;
+
+        assertTrue(xml.contains("This is a Excel"));
+    }
+}
diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml
index 42880cb..47e2a94 100644
--- a/tika-parser-modules/pom.xml
+++ b/tika-parser-modules/pom.xml
@@ -39,10 +39,15 @@
<jempbox.version>1.8.16</jempbox.version>
<mime4j.version>0.8.3</mime4j.version>
<pdfbox.version>2.0.20</pdfbox.version>
+ <jempbox.version>1.8.16</jempbox.version>
+
+ <commons.exec.version>1.3</commons.exec.version>
<commons.logging.version>1.2</commons.logging.version>
<!-- used by POI, PDFBox and Jackcess ...try to sync -->
<bouncycastle.version>1.65</bouncycastle.version>
+ <log4j.version>1.2.17</log4j.version>
+
</properties>
<dependencies>
@@ -77,6 +82,8 @@
<module>tika-parser-pkg-module</module>
<module>tika-parser-mail-commons</module>
<module>tika-parser-xml-module</module>
+ <module>tika-parser-ocr-module</module>
+ <module>tika-parser-pdf-module</module>
</modules>
</project>
\ No newline at end of file
diff --git a/tika-parser-modules/tika-parser-audiovideo-module/pom.xml b/tika-parser-modules/tika-parser-audiovideo-module/pom.xml
index f7a9c8a..192d929 100644
--- a/tika-parser-modules/tika-parser-audiovideo-module/pom.xml
+++ b/tika-parser-modules/tika-parser-audiovideo-module/pom.xml
@@ -23,5 +23,26 @@
</dependency>
</dependencies>
-
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.parser.audiovideo</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
</project>
\ No newline at end of file
diff --git a/tika-parser-modules/tika-parser-html-module/pom.xml b/tika-parser-modules/tika-parser-html-module/pom.xml
index d741c06..73d06b6 100644
--- a/tika-parser-modules/tika-parser-html-module/pom.xml
+++ b/tika-parser-modules/tika-parser-html-module/pom.xml
@@ -42,5 +42,26 @@
<scope>test</scope>
</dependency>
</dependencies>
-
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.parser.html</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
</project>
\ No newline at end of file
diff --git a/tika-parser-modules/tika-parser-image-module/pom.xml b/tika-parser-modules/tika-parser-image-module/pom.xml
index 3ff55d4..c39b614 100644
--- a/tika-parser-modules/tika-parser-image-module/pom.xml
+++ b/tika-parser-modules/tika-parser-image-module/pom.xml
@@ -39,6 +39,14 @@
<artifactId>jbig2-imageio</artifactId>
<version>${jbig2.version}</version>
</dependency>
+
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-xmp-commons</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
@@ -46,4 +54,27 @@
<scope>test</scope>
</dependency>
</dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.parser.image</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
</project>
\ No newline at end of file
diff --git a/tika-parser-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/BPGParser.java b/tika-parser-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/BPGParser.java
new file mode 100644
index 0000000..802d69f
--- /dev/null
+++ b/tika-parser-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/BPGParser.java
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Photoshop;
+import org.apache.tika.metadata.TIFF;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for the Better Portable Graphics )BPG) File Format.
+ * <p/>
+ * Documentation on the file format is available from
+ * http://bellard.org/bpg/bpg_spec.txt
+ */
+public class BPGParser extends AbstractParser {
+ protected static final int EXTENSION_TAG_EXIF = 1;
+ protected static final int EXTENSION_TAG_ICC_PROFILE = 2;
+ protected static final int EXTENSION_TAG_XMP = 3;
+ protected static final int EXTENSION_TAG_THUMBNAIL = 4;
+ private static final long serialVersionUID = -161736541253892772L;
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.image("x-bpg"), MediaType.image("bpg"))));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // Check for the magic header signature
+ byte[] signature = new byte[4];
+ IOUtils.readFully(stream, signature);
+ if (signature[0] == (byte) 'B' && signature[1] == (byte) 'P' &&
+ signature[2] == (byte) 'G' && signature[3] == (byte) 0xfb) {
+ // Good, signature found
+ } else {
+ throw new TikaException("BPG magic signature invalid");
+ }
+
+ // Grab and decode the first byte
+ int pdf = stream.read();
+
+ // Pixel format: Greyscale / 4:2:0 / 4:2:2 / 4:4:4
+ int pixelFormat = pdf & 0x7;
+ // TODO Identify a suitable metadata key for this
+
+ // Is there an alpha plane as well as a colour plane?
+ boolean hasAlphaPlane1 = (pdf & 0x8) == 0x8;
+ // TODO Identify a suitable metadata key for this+hasAlphaPlane2
+
+ // Bit depth minus 8
+ int bitDepth = (pdf >> 4) + 8;
+ metadata.set(TIFF.BITS_PER_SAMPLE, Integer.toString(bitDepth));
+
+ // Grab and decode the second byte
+ int cer = stream.read();
+
+ // Colour Space: YCbCr / RGB / YCgCo / YCbCrK / CMYK
+ int colourSpace = cer & 0x15;
+ switch (colourSpace) {
+ case 0:
+ metadata.set(Photoshop.COLOR_MODE, "YCbCr Colour");
+ break;
+ case 1:
+ metadata.set(Photoshop.COLOR_MODE, "RGB Colour");
+ break;
+ case 2:
+ metadata.set(Photoshop.COLOR_MODE, "YCgCo Colour");
+ break;
+ case 3:
+ metadata.set(Photoshop.COLOR_MODE, "YCbCrK Colour");
+ break;
+ case 4:
+ metadata.set(Photoshop.COLOR_MODE, "CMYK Colour");
+ break;
+ }
+
+ // Are there extensions or not?
+ boolean hasExtensions = (cer & 16) == 16;
+
+ // Is the Alpha Plane 2 flag set?
+ boolean hasAlphaPlane2 = (cer & 32) == 32;
+
+ // cer then holds 2 more booleans - limited range, reserved
+
+ // Width and height next
+ int width = (int) EndianUtils.readUE7(stream);
+ int height = (int) EndianUtils.readUE7(stream);
+ metadata.set(TIFF.IMAGE_LENGTH, height);
+ metadata.set(TIFF.IMAGE_WIDTH, width);
+
+ // Picture Data length
+ EndianUtils.readUE7(stream);
+
+ // Extension Data Length, if extensions present
+ long extensionDataLength = 0;
+ if (hasExtensions)
+ extensionDataLength = EndianUtils.readUE7(stream);
+
+ // Alpha Data Length, if alpha used
+ long alphaDataLength = 0;
+ if (hasAlphaPlane1 || hasAlphaPlane2)
+ alphaDataLength = EndianUtils.readUE7(stream);
+
+ // Extension Data
+ if (hasExtensions) {
+ long extensionsDataSeen = 0;
+ ImageMetadataExtractor metadataExtractor =
+ new ImageMetadataExtractor(metadata);
+
+ while (extensionsDataSeen < extensionDataLength) {
+ int extensionType = (int) EndianUtils.readUE7(stream);
+ int extensionLength = (int) EndianUtils.readUE7(stream);
+ switch (extensionType) {
+ case EXTENSION_TAG_EXIF:
+ metadataExtractor.parseRawExif(stream, extensionLength, true);
+ break;
+ case EXTENSION_TAG_XMP:
+ handleXMP(stream, extensionLength, metadataExtractor);
+ break;
+ default:
+ stream.skip(extensionLength);
+ }
+ extensionsDataSeen += extensionLength;
+ }
+ }
+
+ // HEVC Header + Data
+ // Alpha HEVC Header + Data
+ // We can't do anything with these parts
+
+ // We don't have any helpful text, sorry...
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+
+ protected void handleXMP(InputStream stream, int xmpLength,
+ ImageMetadataExtractor extractor) throws IOException, TikaException, SAXException {
+ byte[] xmp = new byte[xmpLength];
+ IOUtils.readFully(stream, xmp);
+ extractor.parseRawXMP(xmp);
+ }
+}
diff --git a/tika-parser-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/ICNSType.java b/tika-parser-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/ICNSType.java
new file mode 100644
index 0000000..bc02c7a
--- /dev/null
+++ b/tika-parser-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/ICNSType.java
@@ -0,0 +1,170 @@
+/*
+ * Copyright 2016 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.UnsupportedEncodingException;
+
+/**
+ * Holds details on Apple ICNS icons
+ */
+class ICNSType {
+    private final int type;
+    private final int width;
+    private final int height;
+    private final int bitsPerPixel;
+    private final boolean hasMask;
+    private final boolean hasRetinaDisplay;
+
+    /**
+     * @return the four-character type code, packed big-endian into an int
+     */
+    public int getType() {
+        return type;
+    }
+
+    /**
+     * @return the icon width recorded for this type
+     */
+    public int getWidth() {
+        return width;
+    }
+
+    /**
+     * @return the icon height recorded for this type
+     */
+    public int getHeight() {
+        return height;
+    }
+
+    /**
+     * @return the bits per pixel recorded for this type; the JPEG/PNG
+     *         entries below are registered with 0
+     */
+    public int getBitsPerPixel() {
+        return bitsPerPixel;
+    }
+
+    /**
+     * @return the mask flag recorded for this type
+     */
+    public boolean hasMask() {
+        return hasMask;
+    }
+
+    /**
+     * @return the retina ("2X") flag recorded for this type
+     */
+    public boolean hasRetinaDisplay() {
+        return hasRetinaDisplay;
+    }
+
+    /**
+     * Packs exactly four bytes into an int, most significant byte first.
+     *
+     * @param bytes the four bytes to pack
+     * @return the packed value
+     * @throws IllegalArgumentException if {@code bytes} is null or is not
+     *                                  exactly four bytes long
+     */
+    public static int converttoInt(byte[] bytes) {
+        // Treat null like any other malformed input rather than failing with
+        // a NullPointerException on the length check.
+        if (bytes == null || bytes.length != 4) {
+            throw new IllegalArgumentException("Cannot convert to integer");
+        }
+        return ((0xff & bytes[0]) << 24)
+                | ((0xff & bytes[1]) << 16)
+                | ((0xff & bytes[2]) << 8)
+                | (0xff & bytes[3]);
+    }
+
+    private ICNSType(String type, int width, int height, int bitsPerPixel, boolean hasMask, boolean hasRetinaDisplay) {
+        // The Charset overload of getBytes declares no checked exception, so
+        // there is no "cannot happen" catch block that could leave the byte
+        // array null before it is packed.
+        this.type = converttoInt(type.getBytes(java.nio.charset.StandardCharsets.US_ASCII));
+        this.width = width;
+        this.height = height;
+        this.bitsPerPixel = bitsPerPixel;
+        this.hasMask = hasMask;
+        this.hasRetinaDisplay = hasRetinaDisplay;
+    }
+
+    public static final ICNSType ICNS_32x32_1BIT_IMAGE
+            = new ICNSType("ICON", 32, 32, 1, false, false);
+    public static final ICNSType ICNS_16x12_1BIT_IMAGE_AND_MASK
+            = new ICNSType("icm#", 16, 12, 1, true, false);
+    public static final ICNSType ICNS_16x12_4BIT_IMAGE
+            = new ICNSType("icm4", 16, 12, 4, false, false);
+    public static final ICNSType ICNS_16x12_8BIT_IMAGE
+            = new ICNSType("icm8", 16, 12, 8, false, false);
+
+    public static final ICNSType ICNS_16x16_8BIT_MASK
+            = new ICNSType("s8mk", 16, 16, 8, true, false);
+    public static final ICNSType ICNS_16x16_1BIT_IMAGE_AND_MASK
+            = new ICNSType("ics#", 16, 16, 1, true, false);
+    public static final ICNSType ICNS_16x16_4BIT_IMAGE
+            = new ICNSType("ics4", 16, 16, 4, false, false);
+    public static final ICNSType ICNS_16x16_8BIT_IMAGE
+            = new ICNSType("ics8", 16, 16, 8, false, false);
+    public static final ICNSType ICNS_16x16_24BIT_IMAGE
+            = new ICNSType("is32", 16, 16, 24, false, false);
+
+    public static final ICNSType ICNS_32x32_8BIT_MASK
+            = new ICNSType("l8mk", 32, 32, 8, true, false);
+    public static final ICNSType ICNS_32x32_1BIT_IMAGE_AND_MASK
+            = new ICNSType("ICN#", 32, 32, 1, true, false);
+    public static final ICNSType ICNS_32x32_4BIT_IMAGE
+            = new ICNSType("icl4", 32, 32, 4, false, false);
+    public static final ICNSType ICNS_32x32_8BIT_IMAGE
+            = new ICNSType("icl8", 32, 32, 8, false, false);
+    public static final ICNSType ICNS_32x32_24BIT_IMAGE
+            = new ICNSType("il32", 32, 32, 24, false, false);
+
+    public static final ICNSType ICNS_48x48_8BIT_MASK
+            = new ICNSType("h8mk", 48, 48, 8, true, false);
+    public static final ICNSType ICNS_48x48_1BIT_IMAGE_AND_MASK
+            = new ICNSType("ich#", 48, 48, 1, true, false);
+    public static final ICNSType ICNS_48x48_4BIT_IMAGE
+            = new ICNSType("ich4", 48, 48, 4, false, false);
+    public static final ICNSType ICNS_48x48_8BIT_IMAGE
+            = new ICNSType("ich8", 48, 48, 8, false, false);
+    public static final ICNSType ICNS_48x48_24BIT_IMAGE
+            = new ICNSType("ih32", 48, 48, 24, false, false);
+    public static final ICNSType ICNS_128x128_8BIT_MASK
+            = new ICNSType("t8mk", 128, 128, 8, true, false);
+    public static final ICNSType ICNS_128x128_24BIT_IMAGE
+            = new ICNSType("it32", 128, 128, 24, false, false);
+
+    public static final ICNSType ICNS_16x16_JPEG_PNG_IMAGE
+            = new ICNSType("icp4", 16, 16, 0, false, false);
+    public static final ICNSType ICNS_32x32_JPEG_PNG_IMAGE
+            = new ICNSType("icp5", 32, 32, 0, false, false);
+    public static final ICNSType ICNS_64x64_JPEG_PNG_IMAGE
+            = new ICNSType("icp6", 64, 64, 0, false, false);
+    public static final ICNSType ICNS_128x128_JPEG_PNG_IMAGE
+            = new ICNSType("icp7", 128, 128, 0, false, false);
+    public static final ICNSType ICNS_256x256_JPEG_PNG_IMAGE
+            = new ICNSType("ic08", 256, 256, 0, false, false);
+    public static final ICNSType ICNS_512x512_JPEG_PNG_IMAGE
+            = new ICNSType("ic09", 512, 512, 0, false, false);
+    public static final ICNSType ICNS_1024x1024_2X_JPEG_PNG_IMAGE
+            = new ICNSType("ic10", 1024, 1024, 0, false, true);
+    public static final ICNSType ICNS_16x16_2X_JPEG_PNG_IMAGE
+            = new ICNSType("ic11", 16, 16, 0, false, true);
+    public static final ICNSType ICNS_32x32_2X_JPEG_PNG_IMAGE
+            = new ICNSType("ic12", 32, 32, 0, false, true);
+    public static final ICNSType ICNS_128x128_2X_JPEG_PNG_IMAGE
+            = new ICNSType("ic13", 128, 128, 0, false, true);
+    public static final ICNSType ICNS_256x256_2X_JPEG_PNG_IMAGE
+            = new ICNSType("ic14", 256, 256, 0, false, true);
+
+    // Lookup table for findIconType. It must stay below the constants above:
+    // static initialisers run in textual order, so declaring it earlier would
+    // fill the array with nulls.
+    private static final ICNSType[] allImageTypes
+            = {
+            ICNS_32x32_1BIT_IMAGE, ICNS_16x12_1BIT_IMAGE_AND_MASK, ICNS_16x12_4BIT_IMAGE, ICNS_16x12_8BIT_IMAGE,
+            ICNS_16x16_1BIT_IMAGE_AND_MASK, ICNS_16x16_4BIT_IMAGE, ICNS_16x16_8BIT_IMAGE, ICNS_16x16_24BIT_IMAGE,
+            ICNS_32x32_1BIT_IMAGE_AND_MASK, ICNS_32x32_4BIT_IMAGE, ICNS_32x32_8BIT_IMAGE, ICNS_32x32_24BIT_IMAGE,
+            ICNS_48x48_1BIT_IMAGE_AND_MASK, ICNS_48x48_4BIT_IMAGE, ICNS_48x48_8BIT_IMAGE, ICNS_48x48_24BIT_IMAGE,
+            ICNS_128x128_24BIT_IMAGE, ICNS_16x16_8BIT_MASK,
+            ICNS_32x32_8BIT_MASK, ICNS_48x48_8BIT_MASK, ICNS_128x128_8BIT_MASK,
+            ICNS_16x16_JPEG_PNG_IMAGE, ICNS_32x32_JPEG_PNG_IMAGE, ICNS_64x64_JPEG_PNG_IMAGE, ICNS_128x128_JPEG_PNG_IMAGE, ICNS_256x256_JPEG_PNG_IMAGE,
+            ICNS_512x512_JPEG_PNG_IMAGE, ICNS_1024x1024_2X_JPEG_PNG_IMAGE, ICNS_16x16_2X_JPEG_PNG_IMAGE, ICNS_32x32_2X_JPEG_PNG_IMAGE,
+            ICNS_128x128_2X_JPEG_PNG_IMAGE, ICNS_256x256_2X_JPEG_PNG_IMAGE
+    };
+
+    /**
+     * Finds the known icon type whose type code matches the given four bytes.
+     *
+     * @param bytes the four-byte type code as read from the file
+     * @return the matching type, or {@code null} if the code is not one of
+     *         the types listed in this class
+     * @throws IllegalArgumentException if {@code bytes} is null or is not
+     *                                  exactly four bytes long
+     */
+    public static ICNSType findIconType(byte[] bytes) {
+        int type = converttoInt(bytes);
+        for (ICNSType candidate : allImageTypes) {
+            if (candidate.getType() == type) {
+                return candidate;
+            }
+        }
+        return null;
+    }
+}
diff --git a/tika-parser-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/ImageParser.java b/tika-parser-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
new file mode 100644
index 0000000..e93ab5f
--- /dev/null
+++ b/tika-parser-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import javax.imageio.IIOException;
+import javax.imageio.ImageIO;
+import javax.imageio.ImageReader;
+import javax.imageio.metadata.IIOMetadata;
+import javax.imageio.stream.ImageInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser that asks the javax.imageio readers registered for the document's
+ * content type for the image dimensions and reader-provided metadata, copies
+ * them into the Tika {@link Metadata}, and emits an empty XHTML document.
+ */
+public class ImageParser extends AbstractParser {
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = 7852529269245520335L;
+
+    private static final Logger LOG = LoggerFactory.getLogger(ImageParser.class);
+
+    private static final MediaType MAIN_BMP_TYPE = MediaType.image("bmp");
+    private static final MediaType OLD_BMP_TYPE = MediaType.image("x-ms-bmp");
+
+    private static final Set<MediaType> TMP_SUPPORTED;
+
+    static {
+        Set<MediaType> types = new HashSet<>();
+        types.add(MAIN_BMP_TYPE);
+        types.add(OLD_BMP_TYPE);
+        types.add(MediaType.image("gif"));
+        types.add(MediaType.image("png"));
+        types.add(MediaType.image("vnd.wap.wbmp"));
+        types.add(MediaType.image("x-icon"));
+        types.add(MediaType.image("x-xcf"));
+        types.add(MediaType.image("x-jbig2"));
+        //add try/catch class.forName() for image types relying on
+        //provided dependencies
+        TMP_SUPPORTED = types;
+    }
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(TMP_SUPPORTED);
+
+    /**
+     * Copies the value stored under an ImageIO-derived key to a Tika key,
+     * when such a value exists.
+     */
+    private static void setIfPresent(Metadata metadata, String imageIOkey, String tikaKey) {
+        String value = metadata.get(imageIOkey);
+        if (value == null) {
+            return;
+        }
+        metadata.set(tikaKey, value);
+    }
+
+    /**
+     * Copies the value stored under an ImageIO-derived key to a Tika
+     * property, when such a value exists. A single trailing space is removed
+     * from the value before it is stored.
+     */
+    private static void setIfPresent(Metadata metadata, String imageIOkey, Property tikaProp) {
+        String value = metadata.get(imageIOkey);
+        if (value == null) {
+            return;
+        }
+        if (value.endsWith(" ")) {
+            value = value.substring(0, value.length() - 1);
+        }
+        metadata.set(tikaProp, value);
+    }
+
+    /**
+     * Walks every metadata tree the reader exposes and records its nodes.
+     * Does nothing when the reader supplied no metadata or no format names.
+     */
+    private static void loadMetadata(IIOMetadata imageMetadata, Metadata metadata) {
+        String[] formatNames = (imageMetadata == null) ? null : imageMetadata.getMetadataFormatNames();
+        if (formatNames == null) {
+            return;
+        }
+        for (String formatName : formatNames) {
+            loadNode(metadata, imageMetadata.getAsTree(formatName), "", false);
+        }
+    }
+
+    /**
+     * Records the attributes of one DOM node under a key built from the
+     * space-separated node names leading to it, then recurses into its
+     * children.
+     *
+     * @param parents         key accumulated from the ancestors of this node
+     * @param addThisNodeName whether this node's own name is appended to the key
+     */
+    private static void loadNode(
+            Metadata metadata, Node node, String parents,
+            boolean addThisNodeName) {
+        String key = parents;
+        if (addThisNodeName) {
+            key = (key.length() > 0 ? key + " " : key) + node.getNodeName();
+        }
+
+        NamedNodeMap attributes = node.getAttributes();
+        int attributeCount = (attributes == null) ? 0 : attributes.getLength();
+        if (attributeCount == 1) {
+            // A lone attribute is stored as its bare value
+            metadata.add(key, normalize(attributes.item(0).getNodeValue()));
+        } else if (attributeCount > 1) {
+            // Several attributes are stored as one "name=value, name=value" string
+            String[] pairs = new String[attributeCount];
+            for (int i = 0; i < attributeCount; i++) {
+                Node attribute = attributes.item(i);
+                pairs[i] = attribute.getNodeName() + "=" + normalize(attribute.getNodeValue());
+            }
+            metadata.add(key, String.join(", ", pairs));
+        }
+
+        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
+            loadNode(metadata, child, key, true);
+        }
+    }
+
+    /**
+     * Trims the value (null becomes the empty string) and lower-cases it
+     * when it is a case-insensitive spelling of "true" or "false".
+     */
+    private static String normalize(String value) {
+        String text = (value == null) ? "" : value.trim();
+        if (text.equalsIgnoreCase("true")) {
+            return "true";
+        }
+        if (text.equalsIgnoreCase("false")) {
+            return "false";
+        }
+        return text;
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts image metadata through ImageIO when the metadata carries a
+     * content type, then emits an empty XHTML document.
+     *
+     * @throws TikaException wrapping any IIOException other than the GIF
+     *                       "Unexpected block type 0!" case noted below
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        String mimeType = metadata.get(Metadata.CONTENT_TYPE);
+        if (mimeType != null) {
+            // If the old (pre-RFC7903) BMP mime type is given,
+            // fix it up to the new one, so Java is happy
+            if (OLD_BMP_TYPE.toString().equals(mimeType)) {
+                mimeType = MAIN_BMP_TYPE.toString();
+            }
+
+            try {
+                readWithImageIO(stream, metadata, mimeType);
+
+                // Translate certain Metadata tags from the ImageIO
+                // specific namespace into the general Tika one
+                setIfPresent(metadata, "CommentExtensions CommentExtension", TikaCoreProperties.COMMENTS);
+                setIfPresent(metadata, "markerSequence com", TikaCoreProperties.COMMENTS);
+                setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE);
+            } catch (IIOException e) {
+                // TIKA-619: There is a known bug in the Sun API when dealing with GIF images
+                // which Tika will just ignore.
+                boolean knownGifBug = "Unexpected block type 0!".equals(e.getMessage())
+                        && mimeType.equals("image/gif");
+                if (!knownGifBug) {
+                    throw new TikaException(mimeType + " parse error", e);
+                }
+            }
+        }
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+    /**
+     * Uses the first ImageIO reader registered for the MIME type, if there is
+     * one, to record the dimensions and metadata of the first image. The
+     * caller's stream is shielded from being closed, and the reader is always
+     * disposed.
+     */
+    private static void readWithImageIO(InputStream stream, Metadata metadata, String mimeType)
+            throws IOException {
+        Iterator<ImageReader> readers = ImageIO.getImageReadersByMIMEType(mimeType);
+        if (!readers.hasNext()) {
+            return;
+        }
+        ImageReader reader = readers.next();
+        try (ImageInputStream imageStream = ImageIO.createImageInputStream(
+                new CloseShieldInputStream(stream))) {
+            reader.setInput(imageStream);
+
+            String width = Integer.toString(reader.getWidth(0));
+            metadata.set(Metadata.IMAGE_WIDTH, width);
+            String height = Integer.toString(reader.getHeight(0));
+            metadata.set(Metadata.IMAGE_LENGTH, height);
+            metadata.set("height", height);
+            metadata.set("width", width);
+
+            loadMetadata(reader.getImageMetadata(0), metadata);
+        } finally {
+            reader.dispose();
+        }
+    }
+
+}
diff --git a/tika-parser-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java b/tika-parser-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java
new file mode 100644
index 0000000..790af65
--- /dev/null
+++ b/tika-parser-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.ByteArrayInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Photoshop;
+import org.apache.tika.metadata.TIFF;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.xmp.JempboxExtractor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+
+/**
+ * Parser for the Adobe Photoshop PSD File Format.
+ * <p/>
+ * Documentation on the file format is available from
+ * http://www.adobe.com/devnet-apps/photoshop/fileformatashtml/PhotoshopFileFormats.htm
+ *
+ * An MIT-licensed python parser with test files is:
+ * https://github.com/psd-tools/psd-tools
+ */
+public class PSDParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = 883387734607994914L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.image("vnd.adobe.photoshop"))));
+
+    /** Largest data segment that will be buffered in memory for one resource block. */
+    private static final int MAX_DATA_LENGTH_BYTES = 1000000;
+    /** Upper bound on resource blocks read, so a corrupt section size cannot loop forever. */
+    private static final int MAX_BLOCKS = 10000;
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the PSD/PSB header and the Image Resources section, records
+     * dimensions, bit depth, colour mode, caption and XMP metadata, and emits
+     * an empty XHTML document.
+     *
+     * @throws IOException   propagated from reading the stream
+     * @throws SAXException  propagated from the content handler
+     * @throws TikaException if the magic signature or version is not
+     *                       recognised, or a resource block is malformed
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Check for the magic header signature
+        byte[] signature = new byte[4];
+        IOUtils.readFully(stream, signature);
+        if (signature[0] != (byte) '8' || signature[1] != (byte) 'B' ||
+                signature[2] != (byte) 'P' || signature[3] != (byte) 'S') {
+            throw new TikaException("PSD/PSB magic signature invalid");
+        }
+
+        // Check the version, only 1 and 2 are supported
+        int version = EndianUtils.readUShortBE(stream);
+        if (version != 1 && version != 2) {
+            throw new TikaException("Invalid PSD/PSB version " + version);
+        }
+
+        // Skip the reserved block
+        IOUtils.readFully(stream, new byte[6]);
+
+        // Number of channels in the image; read only to advance the stream
+        // TODO Identify a suitable metadata key for this
+        EndianUtils.readUShortBE(stream);
+
+        // Width and Height
+        int height = EndianUtils.readIntBE(stream);
+        int width = EndianUtils.readIntBE(stream);
+        metadata.set(TIFF.IMAGE_LENGTH, height);
+        metadata.set(TIFF.IMAGE_WIDTH, width);
+
+        // Depth (bits per channel)
+        int depth = EndianUtils.readUShortBE(stream);
+        metadata.set(TIFF.BITS_PER_SAMPLE, Integer.toString(depth));
+
+        // Colour mode, eg Bitmap or RGB
+        int colorMode = EndianUtils.readUShortBE(stream);
+        if (colorMode < Photoshop._COLOR_MODE_CHOICES_INDEXED.length) {
+            metadata.set(Photoshop.COLOR_MODE, Photoshop._COLOR_MODE_CHOICES_INDEXED[colorMode]);
+        }
+
+        // Next is the Color Mode section
+        // We don't care about this bit
+        long colorModeSectionSize = EndianUtils.readIntBE(stream);
+        IOUtils.skipFully(stream, colorModeSectionSize);
+
+        // Next is the Image Resources section
+        // Check for certain interesting keys here
+        long imageResourcesSectionSize = EndianUtils.readIntBE(stream);
+        long read = 0;
+        //if something is corrupt about this number, prevent an
+        //infinite loop by only reading 10000 blocks
+        int blocks = 0;
+        while (read < imageResourcesSectionSize && blocks < MAX_BLOCKS) {
+            ResourceBlock rb = new ResourceBlock(stream);
+            read += rb.totalLength;
+
+            // Is it one we can do something useful with?
+            if (rb.id == ResourceBlock.ID_CAPTION) {
+                metadata.add(TikaCoreProperties.DESCRIPTION, rb.getDataAsString());
+            } else if (rb.id == ResourceBlock.ID_XMP) {
+                //if there are multiple xmps in a file, this will
+                //overwrite the data from the earlier xmp
+                JempboxExtractor ex = new JempboxExtractor(metadata);
+                ex.parse(new ByteArrayInputStream(rb.data));
+            }
+            // TODO Parse ID_EXIF_1 / ID_EXIF_3 blocks via ImageMetadataExtractor
+            blocks++;
+        }
+
+        // Next is the Layer and Mask Info
+        // Finally we have Image Data
+        // We can't do anything with these parts
+
+        // We don't have any helpful text, sorry...
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+    /**
+     * One block of the Image Resources section: signature, id, name, and a
+     * data segment that is buffered only for the ids this parser uses.
+     */
+    private static class ResourceBlock {
+        private static final long SIGNATURE = 0x3842494d; // 8BIM
+        private static final int ID_CAPTION = 0x03F0;
+        private static final int ID_EXIF_1 = 0x0422;
+        private static final int ID_EXIF_3 = 0x0423;
+        private static final int ID_XMP = 0x0424;
+        //TODO
+        private static final int ID_URL = 0x040B;
+        private static final int ID_AUTO_SAVE_FILE_PATH = 0x043E;
+        private static final int ID_THUMBNAIL_RESOURCE = 0x040C;
+
+        private final int id;
+        private final String name;
+        private final byte[] data;
+        private final int totalLength;
+
+        /**
+         * Reads one resource block from the stream.
+         *
+         * @throws EOFException  if the stream ends inside the block name
+         * @throws IOException   propagated from reading the stream
+         * @throws TikaException if the signature is wrong, or the data length
+         *                       is negative, would overflow the block length,
+         *                       or exceeds the in-memory limit for a block
+         *                       whose data is captured
+         */
+        private ResourceBlock(InputStream stream) throws IOException, TikaException {
+            // Verify the signature
+            long sig = EndianUtils.readIntBE(stream);
+            if (sig != SIGNATURE) {
+                throw new TikaException("Invalid Image Resource Block Signature Found, got " +
+                        sig + " 0x" + Long.toHexString(sig) + " but the spec defines " + SIGNATURE);
+            }
+
+            // Read the block
+            id = EndianUtils.readUShortBE(stream);
+
+            // NOTE(review): the name is read up to the first zero byte; the
+            // Adobe spec describes it as a Pascal string - confirm that
+            // non-empty names are handled as intended.
+            StringBuilder nameB = new StringBuilder();
+            int nameLen = 0;
+            while (true) {
+                int v = stream.read();
+                if (v < 0) {
+                    throw new EOFException();
+                }
+                nameLen++;
+
+                if (v == 0) {
+                    // The name length is padded to be even
+                    if (nameLen % 2 == 1) {
+                        stream.read();
+                        nameLen++;
+                    }
+                    break;
+                }
+                nameB.append((char) v);
+            }
+            // Assigned once after the loop so an empty name is "" and not null
+            name = nameB.toString();
+
+            int dataLen = EndianUtils.readIntBE(stream);
+            if (dataLen < 0) {
+                throw new TikaException("data length must be >= 0: "+dataLen);
+            }
+            if (dataLen % 2 == 1) {
+                // Data Length is even padded
+                dataLen = dataLen + 1;
+            }
+            //protect against overflow
+            if (Integer.MAX_VALUE-dataLen < nameLen+10) {
+                throw new TikaException("data length is too long:"+dataLen);
+            }
+            totalLength = 4 + 2 + nameLen + 4 + dataLen;
+            // Do we have use for the data segment?
+            if (captureData(id)) {
+                if (dataLen > MAX_DATA_LENGTH_BYTES) {
+                    throw new TikaException("data length must be < "+MAX_DATA_LENGTH_BYTES+
+                            ": "+dataLen);
+                }
+                data = new byte[dataLen];
+                IOUtils.readFully(stream, data);
+            } else {
+                data = new byte[0];
+                IOUtils.skipFully(stream, dataLen);
+            }
+        }
+
+        /**
+         * To save memory, only capture the data
+         * section of resource blocks we process
+         */
+        private static boolean captureData(int id) {
+            switch (id) {
+                case ID_CAPTION:
+                case ID_EXIF_1:
+                case ID_EXIF_3:
+                case ID_XMP:
+                    return true;
+            }
+            return false;
+        }
+
+        /**
+         * Decodes the data segment as US-ASCII, dropping its final byte.
+         *
+         * @return the decoded text, or the empty string when the block
+         *         carried no data
+         */
+        private String getDataAsString() {
+            // A zero-length block would otherwise request a length of -1 and
+            // fail with StringIndexOutOfBoundsException
+            if (data.length == 0) {
+                return "";
+            }
+            // Will be null padded
+            return new String(data, 0, data.length - 1, US_ASCII);
+        }
+    }
+}
diff --git a/tika-parser-modules/tika-parser-image-module/src/test/resources/test-documents/testHEIF.heic b/tika-parser-modules/tika-parser-image-module/src/test/resources/test-documents/testHEIF.heic
new file mode 100644
index 0000000..3d1893a
Binary files /dev/null and b/tika-parser-modules/tika-parser-image-module/src/test/resources/test-documents/testHEIF.heic differ
diff --git a/tika-parser-modules/tika-parser-image-module/src/test/resources/test-documents/testJBIG2.jb2 b/tika-parser-modules/tika-parser-image-module/src/test/resources/test-documents/testJBIG2.jb2
new file mode 100644
index 0000000..8a6756f
Binary files /dev/null and b/tika-parser-modules/tika-parser-image-module/src/test/resources/test-documents/testJBIG2.jb2 differ
diff --git a/tika-parser-modules/tika-parser-image-module/src/test/resources/test-documents/testJPEG_commented.jpg b/tika-parser-modules/tika-parser-image-module/src/test/resources/test-documents/testJPEG_commented.jpg
deleted file mode 100644
index a67e304..0000000
Binary files a/tika-parser-modules/tika-parser-image-module/src/test/resources/test-documents/testJPEG_commented.jpg and /dev/null differ
diff --git a/tika-parser-modules/tika-parser-image-module/src/test/resources/test-documents/testPNG.png b/tika-parser-modules/tika-parser-image-module/src/test/resources/test-documents/testPNG.png
new file mode 100644
index 0000000..afbcb5f
Binary files /dev/null and b/tika-parser-modules/tika-parser-image-module/src/test/resources/test-documents/testPNG.png differ
diff --git a/tika-parser-modules/tika-parser-microsoft-module/pom.xml b/tika-parser-modules/tika-parser-microsoft-module/pom.xml
index 3e4c1f4..efe5a45 100644
--- a/tika-parser-modules/tika-parser-microsoft-module/pom.xml
+++ b/tika-parser-modules/tika-parser-microsoft-module/pom.xml
@@ -163,5 +163,26 @@
<scope>test</scope>
</dependency>
</dependencies>
-
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.parser.microsoft</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
</project>
\ No newline at end of file
diff --git a/tika-parser-modules/tika-parser-html-module/pom.xml b/tika-parser-modules/tika-parser-ocr-module/pom.xml
similarity index 56%
copy from tika-parser-modules/tika-parser-html-module/pom.xml
copy to tika-parser-modules/tika-parser-ocr-module/pom.xml
index d741c06..ef4a2b0 100644
--- a/tika-parser-modules/tika-parser-html-module/pom.xml
+++ b/tika-parser-modules/tika-parser-ocr-module/pom.xml
@@ -9,38 +9,29 @@
</parent>
<modelVersion>4.0.0</modelVersion>
- <artifactId>tika-parser-html-module</artifactId>
+ <artifactId>tika-parser-ocr-module</artifactId>
<dependencies>
<dependency>
- <groupId>org.ccil.cowan.tagsoup</groupId>
- <artifactId>tagsoup</artifactId>
- <version>1.2.1</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-image-module</artifactId>
+ <version>${project.version}</version>
</dependency>
<dependency>
- <groupId>de.l3s.boilerpipe</groupId>
- <artifactId>boilerpipe</artifactId>
- <version>1.1.0</version>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-lang3</artifactId>
+ <version>${commons.lang3.version}</version>
</dependency>
<dependency>
- <groupId>commons-codec</groupId>
- <artifactId>commons-codec</artifactId>
- <version>${codec.version}</version>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-exec</artifactId>
+ <version>${commons.exec.version}</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>${commons.io.version}</version>
</dependency>
-
- <!-- test scope -->
- <!-- this is required for basic encoding detection -->
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
</dependencies>
</project>
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parser-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
similarity index 100%
rename from tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
rename to tika-parser-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parser-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
similarity index 99%
rename from tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
rename to tika-parser-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index b2c4496..6d6a357 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parser-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -40,7 +40,7 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.image.ImageParser;
import org.apache.tika.parser.image.TiffParser;
-import org.apache.tika.parser.jpeg.JpegParser;
+import org.apache.tika.parser.image.JpegParser;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
@@ -207,7 +207,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
return config.getImageMagickPath() + getImageMagickProg();
}
- static boolean hasPython() {
+ public static boolean hasPython() {
// check if python is installed and it has the required dependencies for the rotation program to run
boolean hasPython = false;
TemporaryResources tmp = null;
@@ -627,11 +627,11 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
}.start();
}
- static String getTesseractProg() {
+ public static String getTesseractProg() {
return System.getProperty("os.name").startsWith("Windows") ? "tesseract.exe" : "tesseract";
}
- static String getImageMagickProg() {
+ public static String getImageMagickProg() {
return System.getProperty("os.name").startsWith("Windows") ? "convert.exe" : "convert";
}
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-parser-modules/tika-parser-ocr-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
similarity index 66%
copy from tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
copy to tika-parser-modules/tika-parser-ocr-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 73c9083..7970668 100644
--- a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
+++ b/tika-parser-modules/tika-parser-ocr-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -12,25 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
-# Tesseract properties
-tesseractPath=
-language=eng
-pageSegMode=1
-maxFileSizeToOcr=2147483647
-minFileSizeToOcr=0
-timeout=120
-#txt or hocr
-outputType=txt
-preserveInterwordSpacing=false
-
-# properties for image processing
-# to enable processing, set enableImageProcessing to 1
-enableImageProcessing=0
-ImageMagickPath=
-density=300
-depth=4
-colorspace=gray
-filter=triangle
-resize=900
-applyRotation=false
\ No newline at end of file
+org.apache.tika.parser.ocr.TesseractOCRParser
\ No newline at end of file
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-parser-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
similarity index 100%
copy from tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
copy to tika-parser-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py b/tika-parser-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/rotation.py
similarity index 100%
rename from tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py
rename to tika-parser-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/rotation.py
diff --git a/tika-parsers/src/test/resources/test-properties/StringsConfig-full.properties b/tika-parser-modules/tika-parser-ocr-module/src/main/resources/test-properties/StringsConfig-full.properties
similarity index 100%
rename from tika-parsers/src/test/resources/test-properties/StringsConfig-full.properties
rename to tika-parser-modules/tika-parser-ocr-module/src/main/resources/test-properties/StringsConfig-full.properties
diff --git a/tika-parsers/src/test/resources/test-properties/StringsConfig-partial.properties b/tika-parser-modules/tika-parser-ocr-module/src/main/resources/test-properties/StringsConfig-partial.properties
similarity index 100%
rename from tika-parsers/src/test/resources/test-properties/StringsConfig-partial.properties
rename to tika-parser-modules/tika-parser-ocr-module/src/main/resources/test-properties/StringsConfig-partial.properties
diff --git a/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties b/tika-parser-modules/tika-parser-ocr-module/src/main/resources/test-properties/TesseractOCRConfig-full.properties
similarity index 100%
rename from tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties
rename to tika-parser-modules/tika-parser-ocr-module/src/main/resources/test-properties/TesseractOCRConfig-full.properties
diff --git a/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-partial.properties b/tika-parser-modules/tika-parser-ocr-module/src/main/resources/test-properties/TesseractOCRConfig-partial.properties
similarity index 100%
rename from tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-partial.properties
rename to tika-parser-modules/tika-parser-ocr-module/src/main/resources/test-properties/TesseractOCRConfig-partial.properties
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java b/tika-parser-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
similarity index 100%
rename from tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
rename to tika-parser-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
diff --git a/tika-parser-modules/tika-parser-pdf-module/pom.xml b/tika-parser-modules/tika-parser-pdf-module/pom.xml
new file mode 100644
index 0000000..a117f39
--- /dev/null
+++ b/tika-parser-modules/tika-parser-pdf-module/pom.xml
@@ -0,0 +1,131 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <parent>
+ <artifactId>tika-parser-modules</artifactId>
+ <groupId>org.apache.tika</groupId>
+ <version>2.0.0-SNAPSHOT</version>
+ </parent>
+ <modelVersion>4.0.0</modelVersion>
+
+ <artifactId>tika-parser-pdf-module</artifactId>
+
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-image-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-xmp-commons</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-ocr-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>pdfbox</artifactId>
+ <version>${pdfbox.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>pdfbox-tools</artifactId>
+ <version>${pdfbox.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>pdfbox-debugger</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>preflight</artifactId>
+ <version>${pdfbox.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>jempbox</artifactId>
+ <version>${jempbox.version}</version>
+ </dependency>
+ <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
+ as optional, but we prefer to have them always to avoid
+ problems with encrypted PDFs. -->
+ <dependency>
+ <groupId>org.bouncycastle</groupId>
+ <artifactId>bcmail-jdk15on</artifactId>
+ <version>${bouncycastle.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.bouncycastle</groupId>
+ <artifactId>bcprov-jdk15on</artifactId>
+ <version>${bouncycastle.version}</version>
+ </dependency>
+ <!-- For Java 10.
+ See TIKA-2778 for why we need to do this now.
+ May the gods of API design fix this in the future.
+ Only required for jackcess-encrypt.
+ -->
+ <dependency>
+ <groupId>org.glassfish.jaxb</groupId>
+ <artifactId>jaxb-runtime</artifactId>
+ <version>${jaxb.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>jakarta.activation</groupId>
+ <artifactId>jakarta.activation-api</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>com.sun.activation</groupId>
+ <artifactId>jakarta.activation</artifactId>
+ <version>1.2.1</version>
+ </dependency>
+ <dependency>
+ <groupId>log4j</groupId>
+ <artifactId>log4j</artifactId>
+ <version>${log4j.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.parser.pdf</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+</project>
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
similarity index 99%
rename from tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
rename to tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 2ae3b7f..67e4cd2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -42,8 +42,6 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
-import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
@@ -89,6 +87,8 @@ import org.apache.pdfbox.util.Vector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Font;
@@ -100,7 +100,6 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
-import org.apache.tika.parser.sas.SAS7BDATParser;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
similarity index 100%
rename from tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
rename to tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
similarity index 100%
rename from tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
rename to tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
similarity index 98%
rename from tika-parsers/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
rename to tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
index 539cd50..dce7181 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
@@ -19,12 +19,12 @@ package org.apache.tika.parser.pdf;
import java.io.IOException;
import java.io.Writer;
-import org.apache.commons.io.IOExceptionWithCause;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOExceptionWithCause;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.ContentHandler;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
similarity index 100%
rename from tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
rename to tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java
similarity index 100%
rename from tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java
rename to tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
similarity index 99%
rename from tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
rename to tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
index 9f764f9..cc14ae5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -16,7 +16,6 @@
*/
package org.apache.tika.parser.pdf;
-import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
@@ -265,7 +264,7 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
ignoreTag = true;
}
if (!ignoreTag) {
- if (!StringUtils.isAllBlank(tag.clazz)) {
+ if (tag.clazz != null && tag.clazz.trim().length() > 0) {
xhtml.startElement(tag.tag, "class", tag.clazz);
} else {
xhtml.startElement(tag.tag);
@@ -323,7 +322,7 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
//This is only for URIs, obviously.
//If we want to catch within-doc references (GOTO), we need to cache those in state.
//See testPDF_childAttachments.pdf for examples
- if (! StringUtils.isAllBlank(state.uri)) {
+ if (state.uri != null && state.uri.trim().length() > 0) {
xhtml.startElement("a", "href", state.uri);
xhtml.characters(state.hrefAnchorBuilder.toString());
xhtml.endElement("a");
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
similarity index 100%
rename from tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
rename to tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
similarity index 100%
rename from tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
rename to tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFPreflightParser.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFPreflightParser.java
similarity index 98%
rename from tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFPreflightParser.java
rename to tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFPreflightParser.java
index 3676bf6..9f58391 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFPreflightParser.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFPreflightParser.java
@@ -27,6 +27,7 @@ import org.apache.pdfbox.pdfparser.XrefTrailerResolver;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.preflight.Format;
import org.apache.pdfbox.preflight.PreflightConfiguration;
+import org.apache.pdfbox.preflight.PreflightConstants;
import org.apache.pdfbox.preflight.PreflightContext;
import org.apache.pdfbox.preflight.PreflightDocument;
import org.apache.pdfbox.preflight.ValidationResult;
@@ -144,7 +145,8 @@ public class PDFPreflightParser extends PDFParser {
for (Object object : lObj) {
COSBase curObj = ((COSObject) object).getObject();
if (curObj instanceof COSDictionary
- && ((COSDictionary) curObj).keySet().contains(COSName.getPDFName(DICTIONARY_KEY_LINEARIZED))) {
+ && ((COSDictionary) curObj).keySet().contains(COSName.getPDFName(
+ PreflightConstants.DICTIONARY_KEY_LINEARIZED))) {
return (COSDictionary) curObj;
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
similarity index 98%
rename from tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
rename to tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
index 0d3f59d..b288d5f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
@@ -22,7 +22,6 @@ import java.util.Calendar;
import java.util.List;
import java.util.Locale;
-import org.apache.commons.lang3.StringUtils;
import org.apache.jempbox.xmp.XMPMetadata;
import org.apache.jempbox.xmp.XMPSchema;
import org.apache.jempbox.xmp.XMPSchemaBasic;
@@ -33,9 +32,9 @@ import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
-import org.apache.poi.util.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PDF;
@@ -44,7 +43,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMP;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.image.xmp.JempboxExtractor;
+import org.apache.tika.parser.xmp.JempboxExtractor;
import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
@@ -155,7 +154,7 @@ class PDMetadataExtractor {
}
private static void setNotNull(Property property, String value, Metadata metadata) {
- if (metadata.get(property) == null && ! StringUtils.isEmpty(value)) {
+ if (metadata.get(property) == null && value != null && value.trim().length() > 0) {
metadata.set(property, value);
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
similarity index 100%
rename from tika-parsers/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
rename to tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-parser-modules/tika-parser-pdf-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
similarity index 66%
rename from tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
rename to tika-parser-modules/tika-parser-pdf-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 73c9083..25b9c9f 100644
--- a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -12,25 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
-# Tesseract properties
-tesseractPath=
-language=eng
-pageSegMode=1
-maxFileSizeToOcr=2147483647
-minFileSizeToOcr=0
-timeout=120
-#txt or hocr
-outputType=txt
-preserveInterwordSpacing=false
-
-# properties for image processing
-# to enable processing, set enableImageProcessing to 1
-enableImageProcessing=0
-ImageMagickPath=
-density=300
-depth=4
-colorspace=gray
-filter=triangle
-resize=900
-applyRotation=false
\ No newline at end of file
+org.apache.tika.parser.pdf.PDFParser
\ No newline at end of file
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties b/tika-parser-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
similarity index 100%
rename from tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
rename to tika-parser-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
similarity index 100%
rename from tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
rename to tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTMLTest.java b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTMLTest.java
similarity index 100%
rename from tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTMLTest.java
rename to tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTMLTest.java
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
similarity index 86%
rename from tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
rename to tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 868f100..c5f5c39 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -68,7 +68,6 @@ import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
-import org.apache.tika.parser.xml.XMLProfiler;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
@@ -85,11 +84,6 @@ import org.xml.sax.ContentHandler;
*/
public class PDFParserTest extends TikaTest {
- public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN;
- public static final MediaType TYPE_EMF = MediaType.image("emf");
- public static final MediaType TYPE_PDF = MediaType.application("pdf");
- public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
- public static final MediaType TYPE_DOC = MediaType.application("msword");
public static Level PDFBOX_LOG_LEVEL = Level.INFO;
private static Boolean hasTesseract = null;
@@ -533,41 +527,6 @@ public class PDFParserTest extends TikaTest {
}
- //TIKA-1124
- @Test
- public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
- /* format of test doc:
- docx/
- pdf/
- docx
- */
-
- String content = getXML("testPDFEmbeddingAndEmbedded.docx").xml;
- int outerHaystack = content.indexOf("Outer_haystack");
- int pdfHaystack = content.indexOf("pdf_haystack");
- int needle = content.indexOf("Needle");
- assertTrue(outerHaystack > -1);
- assertTrue(pdfHaystack > -1);
- assertTrue(needle > -1);
- assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack);
-
- TrackingHandler tracker = new TrackingHandler();
-
- ContainerExtractor ex = new ParserContainerExtractor();
- try (TikaInputStream tis =
- TikaInputStream.get(getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"))) {
- ex.extract(tis, ex, tracker);
- }
-
- assertEquals(3, tracker.filenames.size());
- assertEquals(3, tracker.mediaTypes.size());
- assertEquals("image1.emf", tracker.filenames.get(0));
- assertNull(tracker.filenames.get(1));
- assertEquals("Test.docx", tracker.filenames.get(2));
- assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
- assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));
- assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
- }
// TIKA-973
//commented out until test documents that are unambiguously
@@ -632,40 +591,6 @@ public class PDFParserTest extends TikaTest {
assertTrue("found", (result.xml.contains("<li>aTextField: TIKA-1226</li>")));
}
- @Test // TIKA-1228, TIKA-1268
- public void testEmbeddedFilesInChildren() throws Exception {
- String xml = getXML("testPDF_childAttachments.pdf").xml;
- //"regressiveness" exists only in Unit10.doc not in the container pdf document
- assertTrue(xml.contains("regressiveness"));
-
- RecursiveParserWrapper p = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
- ParseContext context = new ParseContext();
- PDFParserConfig config = new PDFParserConfig();
- config.setExtractInlineImages(true);
- config.setExtractUniqueInlineImagesOnly(false);
- context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);
- context.set(org.apache.tika.parser.Parser.class, p);
-
- RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE,-1));
- try (TikaInputStream tis = TikaInputStream.get(
- getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"))) {
- p.parse(tis, handler, new Metadata(), context);
- }
-
- List<Metadata> metadatas = handler.getMetadataList();
-
- assertEquals(5, metadatas.size());
- assertNull(metadatas.get(0).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertEquals("image0.jpg", metadatas.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertEquals("Press Quality(1).joboptions", metadatas.get(3).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertEquals("Unit10.doc", metadatas.get(4).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertEquals(MediaType.image("jpeg").toString(), metadatas.get(1).get(Metadata.CONTENT_TYPE));
- assertEquals(MediaType.image("tiff").toString(), metadatas.get(2).get(Metadata.CONTENT_TYPE));
- assertEquals("text/plain; charset=ISO-8859-1", metadatas.get(3).get(Metadata.CONTENT_TYPE));
- assertEquals(TYPE_DOC.toString(), metadatas.get(4).get(Metadata.CONTENT_TYPE));
- }
-
@Test // TIKA-2232
public void testEmbeddedJBIG2Image() throws Exception {
@@ -695,12 +620,7 @@ public class PDFParserTest extends TikaTest {
metadatas.get(1).get(Metadata.CONTENT_TYPE));
}
- @Test
- public void testEmbeddedFilesInAnnotations() throws Exception {
- String xml = getXML("/testPDFFileEmbInAnnotation.pdf").xml;
- assertTrue(xml.contains("This is a Excel"));
- }
@Test
public void testSingleCloseDoc() throws Exception {
@@ -903,22 +823,6 @@ public class PDFParserTest extends TikaTest {
assertEquals("attachment file name", "Test.txt", firstAttachment.get(TikaCoreProperties.RESOURCE_NAME_KEY));
}
- @Test //TIKA-1374
- public void testOSSpecificEmbeddedFileExtraction() throws Exception {
- List<Metadata> metadatas = getRecursiveMetadata("testPDF_multiFormatEmbFiles.pdf");
- assertEquals("metadata size", 5, metadatas.size());
-
- assertEquals("file name", "Test.txt", metadatas.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertContains("os specific", metadatas.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
- assertEquals("file name", "TestMac.txt", metadatas.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertContains("mac embedded", metadatas.get(2).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
- assertEquals("file name", "TestDos.txt", metadatas.get(3).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertContains("dos embedded", metadatas.get(3).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
- assertEquals("file name", "TestUnix.txt", metadatas.get(4).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertContains("unix embedded", metadatas.get(4).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
-
- }
-
@Test //TIKA-1427
public void testEmbeddedFileMarkup() throws Exception {
ParseContext context = new ParseContext();
@@ -1131,53 +1035,6 @@ public class PDFParserTest extends TikaTest {
}
@Test
- public void testXMLProfiler() throws Exception {
- //test that the xml profiler is not triggered by default
- List<Metadata> metadataList = getRecursiveMetadata("testPDF_XFA_govdocs1_258578.pdf");
- assertEquals(1, metadataList.size());
-
- //test that it is triggered when added to the default parser
- //via the config, tesseract should skip this file because it is too large
- InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml");
- assertNotNull(is);
- TikaConfig tikaConfig = new TikaConfig(is);
- Parser p = new AutoDetectParser(tikaConfig);
-
- metadataList = getRecursiveMetadata("testPDF_XFA_govdocs1_258578.pdf", p);
- assertEquals(3, metadataList.size());
-
- int xmlProfilers = 0;
- for (Metadata metadata : metadataList) {
- String[] parsedBy = metadata.getValues("X-Parsed-By");
- for (int i = 0; i < parsedBy.length; i++) {
- if (parsedBy[i].equals(XMLProfiler.class.getCanonicalName())) {
- xmlProfilers++;
- }
- }
- }
-
- assertEquals(2, xmlProfilers);
-
- //check xmp first
- String[] uris = metadataList.get(1).getValues(XMLProfiler.ENTITY_URIS);
- String[] localNames = metadataList.get(1).getValues(XMLProfiler.ENTITY_LOCAL_NAMES);
- assertEquals(8, uris.length);
- assertEquals(uris.length, localNames.length);
- assertEquals("adobe:ns:meta/", uris[0]);
- assertEquals("CreateDate CreatorTool MetadataDate ModifyDate Thumbnails", localNames[2]);
- assertEquals("x:xmpmeta", metadataList.get(1).get(XMLProfiler.ROOT_ENTITY));
-
- //check xfa
- uris = metadataList.get(2).getValues(XMLProfiler.ENTITY_URIS);
- localNames = metadataList.get(2).getValues(XMLProfiler.ENTITY_LOCAL_NAMES);
- assertEquals(8, uris.length);
- assertEquals(uris.length, localNames.length);
- assertEquals("http://ns.adobe.com/xdp/", uris[1]);
- assertEquals("field form instanceManager subform value", localNames[5]);
- assertEquals("xdp:xdp", metadataList.get(2).get(XMLProfiler.ROOT_ENTITY));
- }
-
- @Test
public void testXMPMM() throws Exception {
Metadata m = getXML("testPDF_twoAuthors.pdf").metadata;
@@ -1287,43 +1144,6 @@ public class PDFParserTest extends TikaTest {
assertEquals("1425", jpegMetadata.get(Metadata.IMAGE_LENGTH));
}
- @Test
- public void testEmbeddedDocsWithOCROnly() throws Exception {
- assumeTrue("can run OCR", canRunOCR());
-
- for (PDFParserConfig.OCR_STRATEGY strategy : PDFParserConfig.OCR_STRATEGY.values()) {
- PDFParserConfig config = new PDFParserConfig();
- config.setOcrStrategy(strategy);
- ParseContext context = new ParseContext();
- context.set(PDFParserConfig.class, config);
- //make sure everything works with regular xml _and_ with recursive
- XMLResult xmlResult = getXML("testPDFEmbeddingAndEmbedded.docx", context);
- //can get dehaystack depending on version of tesseract and/or preprocessing
- if (xmlResult.xml.contains("pdf_haystack") || xmlResult.xml.contains("dehaystack")) {
- //great
- } else {
- fail("couldn't find pdf_haystack or its variants");
- }
- assertContains("Haystack", xmlResult.xml);
- assertContains("Needle", xmlResult.xml);
- if (! strategy.equals(PDFParserConfig.OCR_STRATEGY.NO_OCR)) {
- // Tesseract may see the t in haystack as a ! some times...
- //or it might see dehayslack...
- //TODO: figure out how to make this test less hacky
- String div = "<div class=\"ocr\">";
- if (xmlResult.xml.contains(div+"pdf_hays!ack")) {
- } else if (xmlResult.xml.contains(div+"pdf_haystack")) {
- } else if (xmlResult.xml.contains(div+"dehayslack")) {
- } else {
- fail("couldn't find acceptable variants of haystack");
- }
- } else {
- assertNotContained("<div class=\"ocr\">pdf_haystack", xmlResult.xml);
- }
- assertEquals(4, getRecursiveMetadata("testPDFEmbeddingAndEmbedded.docx", context).size());
- }
-
- }
@Test
public void testJBIG2OCROnly() throws Exception {
@@ -1483,13 +1303,6 @@ public class PDFParserTest extends TikaTest {
assertContains("transport mined materials", xml);
}
- @Test
- public void testFileInAnnotationExtractedIfNoContents() throws Exception {
- //TIKA-2845
- List<Metadata> contents = getRecursiveMetadata("testPDFFileEmbInAnnotation_noContents.pdf");
- assertEquals(2, contents.size());
- assertContains("This is a Excel", contents.get(1).get(RecursiveParserWrapperHandler.TIKA_CONTENT));
- }
@Test
public void testUnmappedUnicodeStats() throws Exception {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFPreflightParserTest.java b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFPreflightParserTest.java
similarity index 100%
rename from tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFPreflightParserTest.java
rename to tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFPreflightParserTest.java
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml
similarity index 100%
rename from tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml
similarity index 100%
rename from tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
similarity index 100%
rename from tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-ocr-config.xml b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-ocr-config.xml
similarity index 100%
rename from tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-ocr-config.xml
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-ocr-config.xml
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-preflight-config.xml b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-preflight-config.xml
similarity index 100%
rename from tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-preflight-config.xml
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-preflight-config.xml
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml
similarity index 100%
rename from tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml
diff --git a/tika-parsers/src/test/resources/test-documents/testAnnotations.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testAnnotations.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testAnnotations.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testAnnotations.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testExtraSpaces.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testExtraSpaces.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testExtraSpaces.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testExtraSpaces.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testJournalParser.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testJournalParser.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testJournalParser.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testJournalParser.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testOCR.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testOCR.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testOCR.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testOCR.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testOptionalHyphen.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testOptionalHyphen.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testOptionalHyphen.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testOptionalHyphen.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testOverlappingText.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testOverlappingText.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testOverlappingText.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testOverlappingText.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF-custommetadata.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF-custommetadata.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF-custommetadata.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF-custommetadata.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDFEmbeddingAndEmbedded.docx b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDFEmbeddingAndEmbedded.docx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDFEmbeddingAndEmbedded.docx
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDFEmbeddingAndEmbedded.docx
diff --git a/tika-parsers/src/test/resources/test-documents/testPDFFileEmbInAnnotation.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDFFileEmbInAnnotation.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDFFileEmbInAnnotation.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDFFileEmbInAnnotation.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDFFileEmbInAnnotation_noContents.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDFFileEmbInAnnotation_noContents.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDFFileEmbInAnnotation_noContents.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDFFileEmbInAnnotation_noContents.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDFPackage.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDFPackage.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDFPackage.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDFPackage.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDFTripleLangTitle.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDFTripleLangTitle.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDFTripleLangTitle.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDFTripleLangTitle.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDFTwoTextBoxes.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDFTwoTextBoxes.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDFTwoTextBoxes.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDFTwoTextBoxes.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDFVarious.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDFVarious.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDFVarious.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDFVarious.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_JBIG2.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_JBIG2.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_JBIG2.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_JBIG2.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_PDFEncodedStringInXMP.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_PDFEncodedStringInXMP.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_PDFEncodedStringInXMP.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_PDFEncodedStringInXMP.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_Version.10.x.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.10.x.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_Version.10.x.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.10.x.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_Version.11.x.PDFA-1b.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.11.x.PDFA-1b.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_Version.11.x.PDFA-1b.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.11.x.PDFA-1b.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_Version.4.x.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.4.x.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_Version.4.x.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.4.x.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_Version.5.x.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.5.x.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_Version.5.x.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.5.x.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_Version.6.x.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.6.x.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_Version.6.x.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.6.x.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_Version.7.x.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.7.x.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_Version.7.x.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.7.x.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_Version.8.x.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.8.x.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_Version.8.x.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.8.x.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_Version.9.x.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.9.x.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_Version.9.x.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_Version.9.x.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_XFA_govdocs1_258578.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_XFA_govdocs1_258578.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_XFA_govdocs1_258578.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_XFA_govdocs1_258578.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_XMPBasicSchema.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_XMPBasicSchema.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_XMPBasicSchema.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_XMPBasicSchema.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_acroform3.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_acroform3.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_acroform3.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_acroform3.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_angles.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_angles.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_angles.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_angles.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_bad_page_303226.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_bad_page_303226.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_bad_page_303226.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_bad_page_303226.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_bom.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_bom.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_bookmarks.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_bookmarks.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_bookmarks.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_bookmarks.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_childAttachments.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_childAttachments.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_childAttachments.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_childAttachments.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_diffTitles.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_diffTitles.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_diffTitles.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_diffTitles.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_multiFormatEmbFiles.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_multiFormatEmbFiles.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_multiFormatEmbFiles.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_multiFormatEmbFiles.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf
similarity index 98%
rename from tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf
index 77b185e..01e40ea 100644
--- a/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf
+++ b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf
@@ -28,8 +28,8 @@ endobj
/Filter [/FlateDecode]
/Length 6 0 R
>>
-stream
-4�?$7v�/�=�th�;U0�TdRLG����Zφ�6a�F���Ó^��D
+stream
+4�?$7v�/�=�th�;U0�TdRLG����Zφ�6a�F���Ó^��D
endstream
endobj
5 0 obj
@@ -65,16 +65,16 @@ endobj
endobj
xref
0 10
-0000000000 65535 f
-0000000015 00000 n
-0000000078 00000 n
-0000000135 00000 n
-0000000247 00000 n
-0000000375 00000 n
-0000000408 00000 n
-0000000426 00000 n
-0000000457 00000 n
-0000000547 00000 n
+0000000000 65535 f
+0000000015 00000 n
+0000000078 00000 n
+0000000135 00000 n
+0000000247 00000 n
+0000000375 00000 n
+0000000408 00000 n
+0000000426 00000 n
+0000000457 00000 n
+0000000547 00000 n
trailer
<<
/Root 1 0 R
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf
similarity index 98%
rename from tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf
index 0ac1b74..519b162 100644
--- a/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf
+++ b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf
@@ -28,8 +28,8 @@ endobj
/Filter [/FlateDecode]
/Length 6 0 R
>>
-stream
-�B�0�6�Ym�y��mpne�ʈ�ډ��j�W��_WA��D�Y���Vs
+stream
+�B�0�6�Ym�y��mpne�ʈ�ډ��j�W��_WA��D�Y���Vs
endstream
endobj
5 0 obj
@@ -65,16 +65,16 @@ endobj
endobj
xref
0 10
-0000000000 65535 f
-0000000015 00000 n
-0000000078 00000 n
-0000000135 00000 n
-0000000247 00000 n
-0000000375 00000 n
-0000000408 00000 n
-0000000426 00000 n
-0000000457 00000 n
-0000000547 00000 n
+0000000000 65535 f
+0000000015 00000 n
+0000000078 00000 n
+0000000135 00000 n
+0000000247 00000 n
+0000000375 00000 n
+0000000408 00000 n
+0000000426 00000 n
+0000000457 00000 n
+0000000547 00000 n
trailer
<<
/Root 1 0 R
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf
similarity index 98%
rename from tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf
index e4b063a..aaa6c8f 100644
--- a/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf
+++ b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf
@@ -28,8 +28,8 @@ endobj
/Filter [/FlateDecode]
/Length 6 0 R
>>
-stream
-Ě�3�ְ6�f���6��Ě)FD�x�u��K^,��^̄�8Q��Q�$J
+stream
+Ě�3�ְ6�f���6��Ě)FD�x�u��K^,��^̄�8Q��Q�$J
endstream
endobj
5 0 obj
@@ -65,16 +65,16 @@ endobj
endobj
xref
0 10
-0000000000 65535 f
-0000000015 00000 n
-0000000078 00000 n
-0000000135 00000 n
-0000000247 00000 n
-0000000375 00000 n
-0000000408 00000 n
-0000000426 00000 n
-0000000457 00000 n
-0000000547 00000 n
+0000000000 65535 f
+0000000015 00000 n
+0000000078 00000 n
+0000000135 00000 n
+0000000247 00000 n
+0000000375 00000 n
+0000000408 00000 n
+0000000426 00000 n
+0000000457 00000 n
+0000000547 00000 n
trailer
<<
/Root 1 0 R
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_protected.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_protected.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_protected.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_protected.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_twoAuthors.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_twoAuthors.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPDF_twoAuthors.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_twoAuthors.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPageNumber.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPageNumber.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPageNumber.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPageNumber.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testPopupAnnotation.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPopupAnnotation.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPopupAnnotation.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPopupAnnotation.pdf
diff --git a/tika-parsers/src/test/resources/test-documents/testStandardsExtractor.pdf b/tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testStandardsExtractor.pdf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testStandardsExtractor.pdf
rename to tika-parser-modules/tika-parser-pdf-module/src/test/resources/test-documents/testStandardsExtractor.pdf
diff --git a/tika-parser-modules/tika-parser-pkg-module/pom.xml b/tika-parser-modules/tika-parser-pkg-module/pom.xml
index 40ada9b..6bf3b3f 100644
--- a/tika-parser-modules/tika-parser-pkg-module/pom.xml
+++ b/tika-parser-modules/tika-parser-pkg-module/pom.xml
@@ -60,4 +60,26 @@
<scope>test</scope>
</dependency>
</dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.parser.pkg</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
</project>
\ No newline at end of file
diff --git a/tika-parser-modules/tika-parser-xml-module/pom.xml b/tika-parser-modules/tika-parser-xml-module/pom.xml
index e62c47d..30a67ce 100644
--- a/tika-parser-modules/tika-parser-xml-module/pom.xml
+++ b/tika-parser-modules/tika-parser-xml-module/pom.xml
@@ -18,5 +18,26 @@
<version>${codec.version}</version>
</dependency>
</dependencies>
-
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.parser.xml</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
</project>
\ No newline at end of file
diff --git a/tika-parser-modules/tika-parser-xmp-commons/pom.xml b/tika-parser-modules/tika-parser-xmp-commons/pom.xml
index 77b5570..9021874 100644
--- a/tika-parser-modules/tika-parser-xmp-commons/pom.xml
+++ b/tika-parser-modules/tika-parser-xmp-commons/pom.xml
@@ -19,4 +19,26 @@
<version>${jempbox.version}</version>
</dependency>
</dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.parser.xmp</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
</project>
\ No newline at end of file
diff --git a/tika-parser-modules/tika-parser-xmp-commons/src/test/resources/test-documents/testJPEG_commented_pspcs2mac.jpg b/tika-parser-modules/tika-parser-xmp-commons/src/test/resources/test-documents/testJPEG_commented_pspcs2mac.jpg
new file mode 100644
index 0000000..801a44d
Binary files /dev/null and b/tika-parser-modules/tika-parser-xmp-commons/src/test/resources/test-documents/testJPEG_commented_pspcs2mac.jpg differ
diff --git a/tika-parser-modules/tika-parser-image-module/src/test/resources/test-documents/testJPEG_commented_xnviewmp026.jpg b/tika-parser-modules/tika-parser-xmp-commons/src/test/resources/test-documents/testJPEG_commented_xnviewmp026.jpg
similarity index 100%
rename from tika-parser-modules/tika-parser-image-module/src/test/resources/test-documents/testJPEG_commented_xnviewmp026.jpg
rename to tika-parser-modules/tika-parser-xmp-commons/src/test/resources/test-documents/testJPEG_commented_xnviewmp026.jpg
diff --git a/tika-parser-modules/tika-parser-zip-commons/pom.xml b/tika-parser-modules/tika-parser-zip-commons/pom.xml
index 638e91d..edd861c 100644
--- a/tika-parser-modules/tika-parser-zip-commons/pom.xml
+++ b/tika-parser-modules/tika-parser-zip-commons/pom.xml
@@ -18,5 +18,26 @@
<version>${commons.compress.version}</version>
</dependency>
</dependencies>
-
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.parser.zip</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
</project>
\ No newline at end of file
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index dd6e25b..5dbfb27 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -43,8 +43,6 @@
<codec.version>1.13</codec.version>
<vorbis.version>0.8</vorbis.version>
- <pdfbox.version>2.0.20</pdfbox.version>
- <jempbox.version>1.8.16</jempbox.version>
<netcdf-java.version>4.5.5</netcdf-java.version>
<sis.version>1.0</sis.version>
<parso.version>2.0.11</parso.version>
@@ -164,55 +162,6 @@
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>pdfbox</artifactId>
- <version>${pdfbox.version}</version>
- <exclusions>
- <exclusion>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>pdfbox-tools</artifactId>
- <version>${pdfbox.version}</version>
- <exclusions>
- <exclusion>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>pdfbox-debugger</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>preflight</artifactId>
- <version>${pdfbox.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>jempbox</artifactId>
- <version>${jempbox.version}</version>
- </dependency>
- <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
- as optional, but we prefer to have them always to avoid
- problems with encrypted PDFs. -->
- <dependency>
- <groupId>org.bouncycastle</groupId>
- <artifactId>bcmail-jdk15on</artifactId>
- <version>${bouncycastle.version}</version>
- </dependency>
- <dependency>
- <groupId>org.bouncycastle</groupId>
- <artifactId>bcprov-jdk15on</artifactId>
- <version>${bouncycastle.version}</version>
- </dependency>
<!-- WARNING: when you upgrade asm make sure that you update the
OpCode in the initializer in org.apache.tika.parser.asm.XHTMLClassVisitor
See TIKA-2992.