You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/19 18:11:27 UTC

[tika] branch TIKA-4135 created (now d6d13b3d3)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4135
in repository https://gitbox.apache.org/repos/asf/tika.git


      at d6d13b3d3 TIKA-4135 -- remove xerces2

This branch includes the following new commits:

     new d6d13b3d3 TIKA-4135 -- remove xerces2

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-4135 -- remove xerces2

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4135
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d6d13b3d3abf01fa047c3ad8105efadf0dd70cac
Author: tallison <ta...@apache.org>
AuthorDate: Tue Sep 19 14:11:12 2023 -0400

    TIKA-4135 -- remove xerces2
---
 tika-bundles/tika-bundle-standard/pom.xml          |  9 -----
 tika-core/pom.xml                                  | 10 +-----
 .../java/org/apache/tika/utils/XMLReaderUtils.java | 10 ++++++
 .../apache/tika/sax/CustomErrorHandlerTest.java    |  2 ++
 .../tika/sax/ErrorResistantSAXParserFactory.java   | 39 ----------------------
 tika-parent/pom.xml                                | 13 --------
 .../tika-parsers-ml/tika-parser-nlp-module/pom.xml |  4 +++
 .../tika-parser-microsoft-module/pom.xml           |  2 --
 .../tika-parser-xml-module/pom.xml                 |  4 ---
 .../java/org/apache/tika/parser/XMLTestBase.java   |  3 +-
 10 files changed, 18 insertions(+), 78 deletions(-)

diff --git a/tika-bundles/tika-bundle-standard/pom.xml b/tika-bundles/tika-bundle-standard/pom.xml
index db605c044..db6304bae 100644
--- a/tika-bundles/tika-bundle-standard/pom.xml
+++ b/tika-bundles/tika-bundle-standard/pom.xml
@@ -151,8 +151,6 @@
             </Bundle-Activator>
             <Embed-Dependency>*;scope=compile;artifactId=tika-parsers-standard-package|
               javax.activation|
-              xerces|
-              xercesImpl|
               commons-compress|
               xz|
               commons-codec|
@@ -317,10 +315,6 @@
               org.apache.tools.ant;resolution:=optional,
               org.apache.tools.ant.taskdefs;resolution:=optional,
               org.apache.tools.ant.types;resolution:=optional,
-              org.apache.xerces.parsers;resolution:=optional,
-              org.apache.xerces.util;resolution:=optional,
-              org.apache.xerces.xni;resolution:=optional,
-              org.apache.xerces.xni.parser;resolution:=optional,
               org.apache.xml.resolver;resolution:=optional,
               org.apache.xml.resolver.readers;resolution:=optional,
               org.apache.xml.resolver.tools;resolution:=optional,
@@ -411,9 +405,6 @@
               INFO
             </org.ops4j.pax.logging.DefaultServiceLog.level>
           </systemPropertyVariables>
-          <classpathDependencyExcludes>
-            <classpathDependencyExclude>xerces:xercesImpl</classpathDependencyExclude>
-          </classpathDependencyExcludes>
         </configuration>
       </plugin>
       <plugin>
diff --git a/tika-core/pom.xml b/tika-core/pom.xml
index 3680cb8de..58f9fd298 100644
--- a/tika-core/pom.xml
+++ b/tika-core/pom.xml
@@ -104,12 +104,6 @@
       <version>${junit5.version}</version>
       <scope>test</scope>
     </dependency>
-    <dependency>
-    <groupId>xerces</groupId>
-    <artifactId>xercesImpl</artifactId>
-    <version>2.12.2</version>
-    <scope>test</scope>
-</dependency>
   </dependencies>
 
   <build>
@@ -156,9 +150,7 @@
               org.apache.tika.config.TikaActivator
             </Bundle-Activator>
             <Bundle-ActivationPolicy>lazy</Bundle-ActivationPolicy>
-            <Import-Package>!sun.misc,org.apache.xerces.util;resolution:=optional,
-              org.apache.commons.io.*;version="[2,3)",
-              *</Import-Package>
+            <Import-Package>!org.apache.commons.io.*;version="[2,3)",*</Import-Package>
             <Export-Package>
               org.apache.tika.*
             </Export-Package>
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 53ebc154e..262ebfef9 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -215,6 +215,9 @@ public class XMLReaderUtils implements Serializable {
      */
     public static SAXParserFactory getSAXParserFactory() {
         SAXParserFactory factory = SAXParserFactory.newInstance();
+        if (LOG.isDebugEnabled()) {
+            LOG.debug("SAXParserFactory class {}", factory.getClass());
+        }
         factory.setNamespaceAware(true);
         factory.setValidating(false);
         trySetSAXFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true);
@@ -241,6 +244,10 @@ public class XMLReaderUtils implements Serializable {
     public static DocumentBuilderFactory getDocumentBuilderFactory() {
         //borrowed from Apache POI
         DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+        if (LOG.isDebugEnabled()) {
+            LOG.debug("DocumentBuilderFactory class {}", factory.getClass());
+        }
+
         factory.setExpandEntityReferences(false);
         factory.setNamespaceAware(true);
         factory.setValidating(false);
@@ -290,6 +297,9 @@ public class XMLReaderUtils implements Serializable {
      */
     public static XMLInputFactory getXMLInputFactory() {
         XMLInputFactory factory = XMLInputFactory.newFactory();
+        if (LOG.isDebugEnabled()) {
+            LOG.debug("XMLInputFactory class {}", factory.getClass());
+        }
 
         tryToSetStaxProperty(factory, XMLInputFactory.IS_NAMESPACE_AWARE, true);
         tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false);
diff --git a/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java
index 0cee6d21b..88643147f 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java
@@ -27,6 +27,7 @@ import javax.xml.parsers.ParserConfigurationException;
 import org.apache.commons.io.output.ByteArrayOutputStream;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 import org.xml.sax.SAXException;
 
@@ -40,6 +41,7 @@ import org.apache.tika.utils.XMLReaderUtils;
  *
  * @see <a href="https://issues.apache.org/jira/browse/TIKA-4062">TIKA-4062</a>
  */
+@Disabled("TODO -- rework without xerces")
 public class CustomErrorHandlerTest extends TikaTest {
 
     private static String DEFAULT_SAX_PARSER_FACTORY;
diff --git a/tika-core/src/test/java/org/apache/tika/sax/ErrorResistantSAXParserFactory.java b/tika-core/src/test/java/org/apache/tika/sax/ErrorResistantSAXParserFactory.java
deleted file mode 100644
index 9916b9df5..000000000
--- a/tika-core/src/test/java/org/apache/tika/sax/ErrorResistantSAXParserFactory.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.sax;
-
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.parsers.SAXParserFactory;
-
-import org.apache.xerces.jaxp.SAXParserFactoryImpl;
-import org.xml.sax.SAXNotRecognizedException;
-import org.xml.sax.SAXNotSupportedException;
-
-/**
- * 
- * A custom {@link SAXParserFactory} to force the parser to continue even after fatal error
- *
- */
-public class ErrorResistantSAXParserFactory extends SAXParserFactoryImpl {
-    
-    public ErrorResistantSAXParserFactory() throws SAXNotRecognizedException, SAXNotSupportedException,
-            ParserConfigurationException {
-        super();
-        this.setFeature("http://apache.org/xml/features/continue-after-fatal-error", true);
-    }   
-
-}
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 662cb67e1..3b147da00 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -394,7 +394,6 @@
     <uima.version>3.4.1</uima.version>
     <uimafit.version>3.4.0</uimafit.version>
     <vorbis.version>0.8</vorbis.version>
-    <xerces.version>2.12.2</xerces.version>
     <xmpcore.version>6.1.11</xmpcore.version>
     <zstd.version>1.5.5-5</zstd.version>
     <kafka.version>3.5.1</kafka.version>
@@ -929,11 +928,6 @@
         <artifactId>snappy-java</artifactId>
         <version>1.1.10.3</version>
       </dependency>
-      <dependency>
-        <groupId>xerces</groupId>
-        <artifactId>xercesImpl</artifactId>
-        <version>${xerces.version}</version>
-      </dependency>
       <dependency>
           <groupId>commons-fileupload</groupId>
           <artifactId>commons-fileupload</artifactId>
@@ -1015,13 +1009,6 @@
               <artifactId>netty-handler</artifactId>
               <version>${netty.version}</version>
             </exclude>
-            <exclude>
-              <!-- the most recent cve in sonatype for this artifact is 2.11.0,
-                  not at all the version we're using...smh-->
-              <groupId>xerces</groupId>
-              <artifactId>xercesImpl</artifactId>
-              <version>${xerces.version}</version>
-            </exclude>
             <!-- these are used by the nlp-module -->
             <exclude>
               <groupId>org.apache.lucene</groupId>
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml
index 34327e839..5144584f7 100644
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml
+++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml
@@ -58,6 +58,10 @@
                     <groupId>log4j</groupId>
                     <artifactId>log4j</artifactId>
                 </exclusion>
+                <exclusion>
+                    <groupId>xerces</groupId>
+                    <artifactId>xercesImpl</artifactId>
+                </exclusion>
             </exclusions>
         </dependency>
         <dependency>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
index cd4b37326..c3edb9ffa 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
@@ -42,8 +42,6 @@
       <artifactId>tika-parser-text-module</artifactId>
       <version>${project.version}</version>
     </dependency>
-    <!-- needed for only for AbstractXML2003Parser, but it ensures that xerces
-        is used -->
     <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-parser-xml-module</artifactId>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/pom.xml
index f5056ea97..8add50ab0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/pom.xml
@@ -34,10 +34,6 @@
       <groupId>commons-codec</groupId>
       <artifactId>commons-codec</artifactId>
     </dependency>
-    <dependency>
-      <groupId>xerces</groupId>
-      <artifactId>xercesImpl</artifactId>
-    </dependency>
   </dependencies>
   <build>
     <plugins>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/XMLTestBase.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/XMLTestBase.java
index f69a7e948..f2ddf0e6b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/XMLTestBase.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/XMLTestBase.java
@@ -142,8 +142,7 @@ public class XMLTestBase extends TikaTest {
             TaggedContentHandler tagged = new TaggedContentHandler(handler);
             try {
                 SAXParserFactory saxParserFactory = SAXParserFactory
-                        .newInstance("org.apache.xerces.jaxp.SAXParserFactoryImpl",
-                                this.getClass().getClassLoader());
+                        .newInstance();
                 SAXParser parser = saxParserFactory.newSAXParser();
                 parser.parse(stream, new TextContentHandler(handler, true));
             } catch (ParserConfigurationException e) {