You are viewing a plain text version of this content. The canonical link to the original message was not preserved in this plain-text copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/05/05 01:22:47 UTC

[tika] branch master updated (db048d5 -> 5a1ee00)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from db048d5  Add support for forbiddenapis 3.0: This hides all warnings caused by commons-io not used in all modules (#318)
     new e962365  TIKA-3094: Add SparseBitSet and xmpcore-shaded to tika-bundle.
     new c118a8d  TIKA-3095: Adding jcip-annotations explicitly for tika-bundle on windows
     new 18dc79b  add -Xmx to fuzzing cli
     new 5a1ee00  TIKA-3094 add ignored unit test that runs the bundle against all of the test files.

The 4 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 tika-bundle/pom.xml                                |  10 +--
 .../test/java/org/apache/tika/bundle/BundleIT.java |  76 +++++++++++++++++++++
 .../src/test/resources}/testPPT.pptx               | Bin
 .../org/apache/tika/fuzzing/cli/FuzzingCLI.java    |   6 +-
 .../apache/tika/fuzzing/cli/FuzzingCLIConfig.java  |  14 ++++
 tika-parsers/pom.xml                               |   6 ++
 6 files changed, 107 insertions(+), 5 deletions(-)
 copy {tika-parsers/src/test/resources/test-documents => tika-bundle/src/test/resources}/testPPT.pptx (100%)


[tika] 03/04: add -Xmx to fuzzing cli

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 18dc79b68997fc0abf71f9f97eedb9aa381c0280
Author: tallison <ta...@apache.org>
AuthorDate: Mon May 4 15:00:10 2020 -0400

    add -Xmx to fuzzing cli
---
 .../main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java  |  6 +++++-
 .../java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java | 14 ++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java
index 3857a9a..a6a5f93 100644
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java
@@ -122,6 +122,8 @@ public class FuzzingCLI {
 
             String[] args = new String[] {
                     "java",
+                    "-XX:-OmitStackTraceInFastThrow",
+                    "-Xmx"+config.xmx,
                     "-ea",
                     "-cp",
                     ProcessUtils.escapeCommandLine(cp),
@@ -210,7 +212,9 @@ public class FuzzingCLI {
                     LOG.info("hit maxfiles; file crawler is stopping early");
                     return FileVisitResult.TERMINATE;
                 }
-
+                if (!file.getFileName().toString().contains("sas7bdat")) {
+                    return FileVisitResult.CONTINUE;
+                }
                 try {
                     boolean offered = queue.offer(file, 10, TimeUnit.MINUTES);
                     if (offered) {
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java
index 324b934..206510b 100644
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java
@@ -37,6 +37,8 @@ public class FuzzingCLIConfig {
 
     private static final int DEFAULT_RETRIES = 2;
 
+    private static final String DEFAULT_XMX = "512m";
+
     static Options OPTIONS;
     static {
         //By the time this commandline is parsed, there should be both an extracts and an inputDir
@@ -81,6 +83,12 @@ public class FuzzingCLIConfig {
                         .hasArg(true)
                         .required(false)
                         .build())
+                .addOption(Option.builder("x")
+                        .longOpt("xmx")
+                        .desc("e.g. 1G, max heap appended to -Xmx in the child process")
+                        .hasArg(true)
+                        .required(false)
+                        .build())
                 .addOption(Option.builder("r")
                         .longOpt("retries")
                         .desc("number of times to retry a seed file if there's a catastrophic failure")
@@ -111,6 +119,9 @@ public class FuzzingCLIConfig {
         config.retries = (commandLine.hasOption("r")) ?
                 Integer.parseInt(commandLine.getOptionValue("r")) :
                 DEFAULT_RETRIES;
+        config.xmx = (commandLine.hasOption("x")) ?
+                commandLine.getOptionValue("x") :
+                DEFAULT_XMX;
         return config;
     }
 
@@ -126,6 +137,9 @@ public class FuzzingCLIConfig {
 
     //times to retry a seed file after a catastrophic failure
     int retries;
+
+    //xmx for child process, e.g. 512m or 1G
+    String xmx;
     Path inputDir;
     Path outputDir;
 


[tika] 01/04: TIKA-3094: Add SparseBitSet and xmpcore-shaded to tika-bundle.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e9623650a37039286604e8ed3a17fdcc0ab04fc1
Author: Bob Paulin <bo...@bobpaulin.com>
AuthorDate: Wed Apr 29 17:13:10 2020 -0500

    TIKA-3094: Add SparseBitSet and xmpcore-shaded to tika-bundle.
---
 tika-bundle/pom.xml                                  |   7 ++++---
 .../test/java/org/apache/tika/bundle/BundleIT.java   |  19 +++++++++++++++++++
 tika-bundle/src/test/resources/testPPT.pptx          | Bin 0 -> 36518 bytes
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index 9fae42d..07ef3bd 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -187,7 +187,7 @@
               vorbis-java-tika|
               isoparser|
               metadata-extractor|
-              xmpcore|
+              xmpcore-shaded|
               json-simple|
               boilerpipe|
               rome|
@@ -212,7 +212,9 @@
               jcip-annotations|
               jmatio|
               guava|
-              age-predictor-api</Embed-Dependency>
+              age-predictor-api|
+              SparseBitSet
+            </Embed-Dependency>
             <Embed-Transitive>true</Embed-Transitive>
             <Bundle-DocURL>${project.url}</Bundle-DocURL>
             <Export-Package>
@@ -258,7 +260,6 @@
               com.sun.msv.datatype;resolution:=optional,
               com.sun.msv.datatype.xsd;resolution:=optional,
               com.sun.tools.javadoc;resolution:=optional,
-              com.zaxxer.sparsebits;resolution:=optional,
               edu.mit.ll.mitie;resolution:=optional,
               edu.stanford.nlp.*;resolution:=optional,
               edu.wisc.ssec.mcidas;resolution:=optional,
diff --git a/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java b/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
index 75deb34..54f10ae 100644
--- a/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
+++ b/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
@@ -283,4 +283,23 @@ public class BundleIT {
         assertTrue(content.contains("testXML.xml"));
         assertTrue(content.contains("Rida Benjelloun"));
     }
+    
+    @Test
+    public void testPoiTikaBundle() throws Exception {
+        Tika tika = new Tika();
+
+        // Package extraction
+        ContentHandler handler = new BodyContentHandler();
+
+        Parser parser = tika.getParser();
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, parser);
+
+        try (InputStream stream = TikaInputStream.get(Paths.get("src/test/resources/testPPT.pptx"))) {
+            parser.parse(stream, handler, new Metadata(), context);
+        }
+
+        String content = handler.toString();
+        assertTrue(content.contains("Attachment Test"));
+    }
 }
diff --git a/tika-bundle/src/test/resources/testPPT.pptx b/tika-bundle/src/test/resources/testPPT.pptx
new file mode 100644
index 0000000..38bc6a2
Binary files /dev/null and b/tika-bundle/src/test/resources/testPPT.pptx differ


[tika] 02/04: TIKA-3095: Adding jcip-annotations explicitly for tika-bundle on windows

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit c118a8d666ae9142f3727b7ac94069baf2b34661
Author: Bob Paulin <bo...@bobpaulin.com>
AuthorDate: Wed Apr 29 17:20:10 2020 -0500

    TIKA-3095: Adding jcip-annotations explicitly for tika-bundle on windows
---
 tika-parsers/pom.xml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index b989340..fa65e78 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -556,6 +556,12 @@
         </exclusion>
       </exclusions>
     </dependency>
+    <!-- TIKA-3095: Required for grib -->
+    <dependency>
+	  <groupId>net.jcip</groupId>
+	  <artifactId>jcip-annotations</artifactId>
+	  <version>1.0</version>
+	</dependency>
     <!-- TIKA-3052 -->
     <dependency>
       <groupId>com.beust</groupId>


[tika] 04/04: TIKA-3094 add ignored unit test that runs the bundle against all of the test files.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5a1ee00e64ec812574ba7be8e48f637e01fa018c
Author: tallison <ta...@apache.org>
AuthorDate: Mon May 4 21:21:44 2020 -0400

    TIKA-3094 add ignored unit test that runs the bundle against all of the test files.
---
 tika-bundle/pom.xml                                |  3 +-
 .../test/java/org/apache/tika/bundle/BundleIT.java | 57 ++++++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index 07ef3bd..5a35e32 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -178,7 +178,6 @@
               xmlbeans|
               jackcess|
               jackcess-encrypt|
-              commons-lang|
               commons-lang3|
               tagsoup|
               asm|
@@ -192,6 +191,7 @@
               boilerpipe|
               rome|
               rome-utils|
+              jdom2|
               sentiment-analysis-parser|
               opennlp-tools|
               geoapi|
@@ -372,6 +372,7 @@
               org.jaxen.dom4j;resolution:=optional,
               org.jaxen.pattern;resolution:=optional,
               org.jaxen.saxpath;resolution:=optional,
+              org.jaxen.util;resolution:=optional,
               org.jdom;resolution:=optional,
               org.jdom.input;resolution:=optional,
               org.jdom.output;resolution:=optional,
diff --git a/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java b/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
index 54f10ae..12804ca 100644
--- a/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
+++ b/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
@@ -45,6 +45,8 @@ import javax.inject.Inject;
 import org.apache.tika.Tika;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.fork.ForkParser;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -57,6 +59,7 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.internal.Activator;
 import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.ops4j.pax.exam.Configuration;
@@ -68,6 +71,7 @@ import org.osgi.framework.Bundle;
 import org.osgi.framework.BundleContext;
 import org.osgi.framework.ServiceReference;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 @RunWith(PaxExam.class)
 @ExamReactorStrategy(PerMethod.class)
@@ -302,4 +306,57 @@ public class BundleIT {
         String content = handler.toString();
         assertTrue(content.contains("Attachment Test"));
     }
+
+    @Test
+    @Ignore
+    public void testAll() throws Exception {
+        Tika tika = new Tika();
+
+        // Package extraction
+        ContentHandler handler = new BodyContentHandler();
+
+        Parser parser = tika.getParser();
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, parser);
+        Metadata metadata = new Metadata();
+        Set<String> needToFix = new HashSet<>();
+        needToFix.add("testAccess2_encrypted.accdb");
+
+        Set<String> unknownProblem = new HashSet<>();
+        //these all trigger org.apache.tika.metadata.PropertyTypeException
+        //which for some reason we can't catch (?!)
+        //We don't see problems with these files in tika-parsers?!
+/*        unknownProblem.add("testPPT_embedded_two_slides.pptx");
+        unknownProblem.add("testWORD_multi_authors.docx");
+        unknownProblem.add("testEXCEL_embeded.xlsx");
+        unknownProblem.add("testVORBIS.ogg");
+        unknownProblem.add("testWORD_2006ml.docx");
+        unknownProblem.add("testRTFEmbeddedLink.rtf");*/
+        System.out.println(getTestDir());
+        for (File f : getTestDir().listFiles()) {
+            if (f.isDirectory()) {
+                continue;
+            }
+            if (needToFix.contains(f.getName()) || unknownProblem.contains(f.getName())) {
+                continue;
+            }
+            System.out.println("about to parse "+f);
+            try (InputStream is = TikaInputStream.get(f)) {
+                parser.parse(is, handler, metadata, context);
+            } catch (EncryptedDocumentException e) {
+                //swallow
+            } catch (SAXException e) {
+                //
+            } catch (TikaException e) {
+                System.err.println("tika Exception "+f.getName());
+                e.printStackTrace();
+            }
+        }
+    }
+
+    private File getTestDir() {
+        return new File("../tika-parsers/src/test/resources/test-documents");
+    }
+
+
 }