You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/06/02 13:52:43 UTC

[tika] 01/12: TIKA-3094 add ignored unit test that runs the bundle against all of the test files.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit f6b07702895af9c12a9c5f91a20db50d506a8bbd
Author: tallison <ta...@apache.org>
AuthorDate: Mon May 4 21:21:44 2020 -0400

    TIKA-3094 add ignored unit test that runs the bundle against all of the test files.
---
 tika-bundle/pom.xml                                |  3 +-
 .../test/java/org/apache/tika/bundle/BundleIT.java | 57 ++++++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index 3628cfa..dfe8a36 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -178,7 +178,6 @@
               xmlbeans|
               jackcess|
               jackcess-encrypt|
-              commons-lang|
               commons-lang3|
               tagsoup|
               asm|
@@ -192,6 +191,7 @@
               boilerpipe|
               rome|
               rome-utils|
+              jdom2|
               sentiment-analysis-parser|
               opennlp-tools|
               geoapi|
@@ -372,6 +372,7 @@
               org.jaxen.dom4j;resolution:=optional,
               org.jaxen.pattern;resolution:=optional,
               org.jaxen.saxpath;resolution:=optional,
+              org.jaxen.util;resolution:=optional,
               org.jdom;resolution:=optional,
               org.jdom.input;resolution:=optional,
               org.jdom.output;resolution:=optional,
diff --git a/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java b/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
index 4cefffb..2cab1d5 100644
--- a/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
+++ b/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
@@ -45,6 +45,8 @@ import javax.inject.Inject;
 import org.apache.tika.Tika;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.fork.ForkParser;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -56,6 +58,7 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.internal.Activator;
 import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.ops4j.pax.exam.Configuration;
@@ -67,6 +70,7 @@ import org.osgi.framework.Bundle;
 import org.osgi.framework.BundleContext;
 import org.osgi.framework.ServiceReference;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 @RunWith(PaxExam.class)
 @ExamReactorStrategy(PerMethod.class)
@@ -301,4 +305,57 @@ public class BundleIT {
         String content = handler.toString();
         assertTrue(content.contains("Attachment Test"));
     }
+
+    @Test
+    @Ignore // disabled by default; remove to run the sweep over the whole test directory
+    public void testAll() throws Exception { // no assertions: only checks that parsing does not throw
+        Tika tika = new Tika();
+
+        // One handler (and one Metadata, below) is created here and reused for every file in the loop
+        ContentHandler handler = new BodyContentHandler();
+
+        Parser parser = tika.getParser();
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, parser);
+        Metadata metadata = new Metadata();
+        Set<String> needToFix = new HashSet<>(); // file names skipped by the loop below
+        needToFix.add("testAccess2_encrypted.accdb");
+
+        Set<String> unknownProblem = new HashSet<>(); // currently empty: the additions below are commented out
+        // These files all trigger org.apache.tika.metadata.PropertyTypeException,
+        // which for some reason cannot be caught here (?!).
+        // The same files show no problems in tika-parsers?!
+/*        unknownProblem.add("testPPT_embedded_two_slides.pptx");
+        unknownProblem.add("testWORD_multi_authors.docx");
+        unknownProblem.add("testEXCEL_embeded.xlsx");
+        unknownProblem.add("testVORBIS.ogg");
+        unknownProblem.add("testWORD_2006ml.docx");
+        unknownProblem.add("testRTFEmbeddedLink.rtf");*/
+        System.out.println(getTestDir());
+        for (File f : getTestDir().listFiles()) { // NOTE(review): listFiles() returns null if the directory is missing -> NPE
+            if (f.isDirectory()) {
+                continue;
+            }
+            if (needToFix.contains(f.getName()) || unknownProblem.contains(f.getName())) {
+                continue;
+            }
+            System.out.println("about to parse "+f);
+            try (InputStream is = TikaInputStream.get(f)) {
+                parser.parse(is, handler, metadata, context);
+            } catch (EncryptedDocumentException e) {
+                // deliberately ignored; presumably raised by password-protected test documents
+            } catch (SAXException e) {
+                // deliberately ignored. NOTE(review): not even logged - confirm this is intended
+            } catch (TikaException e) {
+                System.err.println("tika Exception "+f.getName());
+                e.printStackTrace();
+            }
+        } // any other exception type propagates and fails the test (method throws Exception)
+    }
+
+    private File getTestDir() {
+        return new File("../tika-parsers/src/test/resources/test-documents"); // relative path: resolved against the JVM working directory
+    }
+
+
 }