You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/19 18:11:28 UTC

[tika] 01/01: TIKA-4135 -- remove xerces2

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4135
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d6d13b3d3abf01fa047c3ad8105efadf0dd70cac
Author: tallison <ta...@apache.org>
AuthorDate: Tue Sep 19 14:11:12 2023 -0400

    TIKA-4135 -- remove xerces2
---
 tika-bundles/tika-bundle-standard/pom.xml          |  9 -----
 tika-core/pom.xml                                  | 10 +-----
 .../java/org/apache/tika/utils/XMLReaderUtils.java | 10 ++++++
 .../apache/tika/sax/CustomErrorHandlerTest.java    |  2 ++
 .../tika/sax/ErrorResistantSAXParserFactory.java   | 39 ----------------------
 tika-parent/pom.xml                                | 13 --------
 .../tika-parsers-ml/tika-parser-nlp-module/pom.xml |  4 +++
 .../tika-parser-microsoft-module/pom.xml           |  2 --
 .../tika-parser-xml-module/pom.xml                 |  4 ---
 .../java/org/apache/tika/parser/XMLTestBase.java   |  3 +-
 10 files changed, 18 insertions(+), 78 deletions(-)

diff --git a/tika-bundles/tika-bundle-standard/pom.xml b/tika-bundles/tika-bundle-standard/pom.xml
index db605c044..db6304bae 100644
--- a/tika-bundles/tika-bundle-standard/pom.xml
+++ b/tika-bundles/tika-bundle-standard/pom.xml
@@ -151,8 +151,6 @@
             </Bundle-Activator>
             <Embed-Dependency>*;scope=compile;artifactId=tika-parsers-standard-package|
               javax.activation|
-              xerces|
-              xercesImpl|
               commons-compress|
               xz|
               commons-codec|
@@ -317,10 +315,6 @@
               org.apache.tools.ant;resolution:=optional,
               org.apache.tools.ant.taskdefs;resolution:=optional,
               org.apache.tools.ant.types;resolution:=optional,
-              org.apache.xerces.parsers;resolution:=optional,
-              org.apache.xerces.util;resolution:=optional,
-              org.apache.xerces.xni;resolution:=optional,
-              org.apache.xerces.xni.parser;resolution:=optional,
               org.apache.xml.resolver;resolution:=optional,
               org.apache.xml.resolver.readers;resolution:=optional,
               org.apache.xml.resolver.tools;resolution:=optional,
@@ -411,9 +405,6 @@
               INFO
             </org.ops4j.pax.logging.DefaultServiceLog.level>
           </systemPropertyVariables>
-          <classpathDependencyExcludes>
-            <classpathDependencyExclude>xerces:xercesImpl</classpathDependencyExclude>
-          </classpathDependencyExcludes>
         </configuration>
       </plugin>
       <plugin>
diff --git a/tika-core/pom.xml b/tika-core/pom.xml
index 3680cb8de..58f9fd298 100644
--- a/tika-core/pom.xml
+++ b/tika-core/pom.xml
@@ -104,12 +104,6 @@
       <version>${junit5.version}</version>
       <scope>test</scope>
     </dependency>
-    <dependency>
-    <groupId>xerces</groupId>
-    <artifactId>xercesImpl</artifactId>
-    <version>2.12.2</version>
-    <scope>test</scope>
-</dependency>
   </dependencies>
 
   <build>
@@ -156,9 +150,7 @@
               org.apache.tika.config.TikaActivator
             </Bundle-Activator>
             <Bundle-ActivationPolicy>lazy</Bundle-ActivationPolicy>
-            <Import-Package>!sun.misc,org.apache.xerces.util;resolution:=optional,
-              org.apache.commons.io.*;version="[2,3)",
-              *</Import-Package>
+            <Import-Package>!sun.misc,org.apache.commons.io.*;version="[2,3)",*</Import-Package>
             <Export-Package>
               org.apache.tika.*
             </Export-Package>
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 53ebc154e..262ebfef9 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -215,6 +215,9 @@ public class XMLReaderUtils implements Serializable {
      */
     public static SAXParserFactory getSAXParserFactory() {
         SAXParserFactory factory = SAXParserFactory.newInstance();
+        if (LOG.isDebugEnabled()) {
+            LOG.debug("SAXParserFactory class {}", factory.getClass());
+        }
         factory.setNamespaceAware(true);
         factory.setValidating(false);
         trySetSAXFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true);
@@ -241,6 +244,10 @@ public class XMLReaderUtils implements Serializable {
     public static DocumentBuilderFactory getDocumentBuilderFactory() {
         //borrowed from Apache POI
         DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+        if (LOG.isDebugEnabled()) {
+            LOG.debug("DocumentBuilderFactory class {}", factory.getClass());
+        }
+
         factory.setExpandEntityReferences(false);
         factory.setNamespaceAware(true);
         factory.setValidating(false);
@@ -290,6 +297,9 @@ public class XMLReaderUtils implements Serializable {
      */
     public static XMLInputFactory getXMLInputFactory() {
         XMLInputFactory factory = XMLInputFactory.newFactory();
+        if (LOG.isDebugEnabled()) {
+            LOG.debug("XMLInputFactory class {}", factory.getClass());
+        }
 
         tryToSetStaxProperty(factory, XMLInputFactory.IS_NAMESPACE_AWARE, true);
         tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false);
diff --git a/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java
index 0cee6d21b..88643147f 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java
@@ -27,6 +27,7 @@ import javax.xml.parsers.ParserConfigurationException;
 import org.apache.commons.io.output.ByteArrayOutputStream;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 import org.xml.sax.SAXException;
 
@@ -40,6 +41,7 @@ import org.apache.tika.utils.XMLReaderUtils;
  *
  * @see <a href="https://issues.apache.org/jira/browse/TIKA-4062">TIKA-4062</a>
  */
+@Disabled("TODO -- rework without xerces")
 public class CustomErrorHandlerTest extends TikaTest {
 
     private static String DEFAULT_SAX_PARSER_FACTORY;
diff --git a/tika-core/src/test/java/org/apache/tika/sax/ErrorResistantSAXParserFactory.java b/tika-core/src/test/java/org/apache/tika/sax/ErrorResistantSAXParserFactory.java
deleted file mode 100644
index 9916b9df5..000000000
--- a/tika-core/src/test/java/org/apache/tika/sax/ErrorResistantSAXParserFactory.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.sax;
-
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.parsers.SAXParserFactory;
-
-import org.apache.xerces.jaxp.SAXParserFactoryImpl;
-import org.xml.sax.SAXNotRecognizedException;
-import org.xml.sax.SAXNotSupportedException;
-
-/**
- * 
- * A custom {@link SAXParserFactory} to force the parser to continue even after fatal error
- *
- */
-public class ErrorResistantSAXParserFactory extends SAXParserFactoryImpl {
-    
-    public ErrorResistantSAXParserFactory() throws SAXNotRecognizedException, SAXNotSupportedException,
-            ParserConfigurationException {
-        super();
-        this.setFeature("http://apache.org/xml/features/continue-after-fatal-error", true);
-    }   
-
-}
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 662cb67e1..3b147da00 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -394,7 +394,6 @@
     <uima.version>3.4.1</uima.version>
     <uimafit.version>3.4.0</uimafit.version>
     <vorbis.version>0.8</vorbis.version>
-    <xerces.version>2.12.2</xerces.version>
     <xmpcore.version>6.1.11</xmpcore.version>
     <zstd.version>1.5.5-5</zstd.version>
     <kafka.version>3.5.1</kafka.version>
@@ -929,11 +928,6 @@
         <artifactId>snappy-java</artifactId>
         <version>1.1.10.3</version>
       </dependency>
-      <dependency>
-        <groupId>xerces</groupId>
-        <artifactId>xercesImpl</artifactId>
-        <version>${xerces.version}</version>
-      </dependency>
       <dependency>
           <groupId>commons-fileupload</groupId>
           <artifactId>commons-fileupload</artifactId>
@@ -1015,13 +1009,6 @@
               <artifactId>netty-handler</artifactId>
               <version>${netty.version}</version>
             </exclude>
-            <exclude>
-              <!-- the most recent cve in sonatype for this artifact is 2.11.0,
-                  not at all the version we're using...smh-->
-              <groupId>xerces</groupId>
-              <artifactId>xercesImpl</artifactId>
-              <version>${xerces.version}</version>
-            </exclude>
             <!-- these are used by the nlp-module -->
             <exclude>
               <groupId>org.apache.lucene</groupId>
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml
index 34327e839..5144584f7 100644
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml
+++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml
@@ -58,6 +58,10 @@
                     <groupId>log4j</groupId>
                     <artifactId>log4j</artifactId>
                 </exclusion>
+                <exclusion>
+                    <groupId>xerces</groupId>
+                    <artifactId>xercesImpl</artifactId>
+                </exclusion>
             </exclusions>
         </dependency>
         <dependency>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
index cd4b37326..c3edb9ffa 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
@@ -42,8 +42,6 @@
       <artifactId>tika-parser-text-module</artifactId>
       <version>${project.version}</version>
     </dependency>
-    <!-- needed for only for AbstractXML2003Parser, but it ensures that xerces
-        is used -->
     <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-parser-xml-module</artifactId>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/pom.xml
index f5056ea97..8add50ab0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/pom.xml
@@ -34,10 +34,6 @@
       <groupId>commons-codec</groupId>
       <artifactId>commons-codec</artifactId>
     </dependency>
-    <dependency>
-      <groupId>xerces</groupId>
-      <artifactId>xercesImpl</artifactId>
-    </dependency>
   </dependencies>
   <build>
     <plugins>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/XMLTestBase.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/XMLTestBase.java
index f69a7e948..f2ddf0e6b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/XMLTestBase.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/XMLTestBase.java
@@ -142,8 +142,7 @@ public class XMLTestBase extends TikaTest {
             TaggedContentHandler tagged = new TaggedContentHandler(handler);
             try {
                 SAXParserFactory saxParserFactory = SAXParserFactory
-                        .newInstance("org.apache.xerces.jaxp.SAXParserFactoryImpl",
-                                this.getClass().getClassLoader());
+                        .newInstance();
                 SAXParser parser = saxParserFactory.newSAXParser();
                 parser.parse(stream, new TextContentHandler(handler, true));
             } catch (ParserConfigurationException e) {