You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by le...@apache.org on 2016/06/30 19:25:51 UTC

[1/2] tika git commit: TIKA-1978 Avoid invoking java.net.URL.equals(Object), which blocks on domain name resolution, in org.apache.tika.parser.geo.topic.GeoParser.initialize(URL) (2.x branch)

Repository: tika
Updated Branches:
  refs/heads/2.x 2a7e52ec4 -> 573527bbc


TIKA-1978 Avoid invoking java.net.URL.equals(Object), which blocks on domain name resolution, in org.apache.tika.parser.geo.topic.GeoParser.initialize(URL) (2.x branch)


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/bd3ecfcd
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/bd3ecfcd
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/bd3ecfcd

Branch: refs/heads/2.x
Commit: bd3ecfcddeaf13262e477ba29c5256ebd44e32db
Parents: 43e3000
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Thu May 26 11:15:02 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Thu May 26 11:15:02 2016 -0700

----------------------------------------------------------------------
 .../apache/tika/parser/geo/topic/GeoParser.java | 43 +++++++++++---------
 .../tika/parser/geo/topic/GeoParserConfig.java  |  4 +-
 .../apache/tika/parser/geo/topic/GeoTag.java    | 33 +++++++--------
 .../parser/geo/topic/NameEntityExtractor.java   | 11 ++---
 4 files changed, 48 insertions(+), 43 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/bd3ecfcd/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
index eaef6ad..303f878 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
@@ -20,19 +20,21 @@ package org.apache.tika.parser.geo.topic;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.URISyntaxException;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 import java.util.Set;
+import java.util.logging.Level;
 import java.util.logging.Logger;
 
 import org.apache.commons.exec.CommandLine;
 import org.apache.commons.exec.DefaultExecutor;
-import org.apache.commons.exec.ExecuteException;
 import org.apache.commons.exec.ExecuteWatchdog;
 import org.apache.commons.exec.PumpStreamHandler;
-import org.apache.commons.exec.environment.EnvironmentUtils;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -57,7 +59,7 @@ public class GeoParser extends AbstractParser {
 
     private boolean initialized;
     private URL modelUrl;
-    private NameEntityExtractor extractor;
+    private transient NameEntityExtractor extractor;
     private boolean available;
 
     @Override
@@ -70,9 +72,12 @@ public class GeoParser extends AbstractParser {
      * @param modelUrl the URL to NER model
      */
     public void initialize(URL modelUrl) {
-        if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) {
-            // Previously initialized for the same URL, no initialization needed
-            return;
+        try {
+            if (this.modelUrl != null && this.modelUrl.toURI().equals(modelUrl.toURI())) {
+                return;
+            }
+        } catch (URISyntaxException e1) {
+            LOG.log(Level.SEVERE, e1.getMessage(), e1);
         }
         
         this.modelUrl = modelUrl;
@@ -112,7 +117,7 @@ public class GeoParser extends AbstractParser {
         String bestner = extractor.bestNameEntity;
 
         /*------------------------resolve geonames for each ner, store results in a hashmap---------------------*/
-        HashMap<String, ArrayList<String>> resolvedGeonames = searchGeoNames(locationNameEntities);
+        HashMap<String, ArrayList<String>> resolvedGeonames = (HashMap<String, ArrayList<String>>) searchGeoNames(locationNameEntities);
 
         /*----------------store locationNameEntities and their geonames in a geotag, each input has one geotag---------------------*/
         GeoTag geotag = new GeoTag();
@@ -120,22 +125,21 @@ public class GeoParser extends AbstractParser {
 
         /* add resolved entities in metadata */
 
-        metadata.add("Geographic_NAME", geotag.Geographic_NAME);
-        metadata.add("Geographic_LONGITUDE", geotag.Geographic_LONGTITUDE);
-        metadata.add("Geographic_LATITUDE", geotag.Geographic_LATITUDE);
+        metadata.add("Geographic_NAME", geotag.geoNAME);
+        metadata.add("Geographic_LONGITUDE", geotag.geoLONGTITUDE);
+        metadata.add("Geographic_LATITUDE", geotag.geoLATITUDE);
         for (int i = 0; i < geotag.alternatives.size(); ++i) {
-            GeoTag alter = (GeoTag) geotag.alternatives.get(i);
-            metadata.add("Optional_NAME" + (i + 1), alter.Geographic_NAME);
+            GeoTag alter = geotag.alternatives.get(i);
+            metadata.add("Optional_NAME" + (i + 1), alter.geoNAME);
             metadata.add("Optional_LONGITUDE" + (i + 1),
-                         alter.Geographic_LONGTITUDE);
+                         alter.geoLONGTITUDE);
             metadata.add("Optional_LATITUDE" + (i + 1),
-                         alter.Geographic_LATITUDE);
+                         alter.geoLATITUDE);
         }
     }
 
-    public HashMap<String, ArrayList<String>> searchGeoNames(
-            ArrayList<String> locationNameEntities) throws ExecuteException,
-            IOException {
+    public Map<String, ArrayList<String>> searchGeoNames(
+            List<String> locationNameEntities) throws IOException {
         CommandLine cmdLine = new CommandLine("lucene-geo-gazetteer");
         ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
         cmdLine.addArgument("-s");
@@ -150,17 +154,16 @@ public class GeoParser extends AbstractParser {
         exec.setWatchdog(watchdog);
         PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream);
         exec.setStreamHandler(streamHandler);
-        int exitValue = exec.execute(cmdLine, EnvironmentUtils.getProcEnvironment());
         String outputJson = outputStream.toString("UTF-8");
         JSONArray json = (JSONArray) JSONValue.parse(outputJson);
 
-        HashMap<String, ArrayList<String>> returnHash = new HashMap<String, ArrayList<String>>();
+        HashMap<String, ArrayList<String>> returnHash = new HashMap<>();
         for (int i = 0; i < json.size(); i++) {
             JSONObject obj = (JSONObject) json.get(i);
             for (Object key : obj.keySet()) {
                 String theKey = (String) key;
                 JSONArray vals = (JSONArray) obj.get(theKey);
-                ArrayList<String> stringVals = new ArrayList<String>(
+                ArrayList<String> stringVals = new ArrayList<>(
                         vals.size());
                 for (int j = 0; j < vals.size(); j++) {
                     String val = (String) vals.get(j);

http://git-wip-us.apache.org/repos/asf/tika/blob/bd3ecfcd/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
index 305e663..56272e1 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
@@ -30,7 +30,7 @@ public class GeoParserConfig implements Serializable {
         this.nerModelUrl = GeoParserConfig.class.getResource("en-ner-location.bin");
     }
 
-    public void setNERModelPath(String path) {
+    public void setNERModelPath(String path) throws MalformedURLException {
         if (path == null)
             return;
         File file = new File(path);
@@ -40,7 +40,7 @@ public class GeoParserConfig implements Serializable {
         try {
             this.nerModelUrl = file.toURI().toURL();
         } catch (MalformedURLException e) {
-            throw new RuntimeException(e);
+            throw new MalformedURLException(e.getMessage());
         }
     }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/bd3ecfcd/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
index bccaef1..fe4b9c6 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
@@ -18,18 +18,19 @@
 package org.apache.tika.parser.geo.topic;
 
 import java.util.ArrayList;
-import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
 
 public class GeoTag {
-	String Geographic_NAME;
-	String Geographic_LONGTITUDE;
-	String Geographic_LATITUDE;
-	ArrayList<GeoTag> alternatives = new ArrayList<GeoTag>();
+	String geoNAME;
+	String geoLONGTITUDE;
+	String geoLATITUDE;
+	ArrayList<GeoTag> alternatives = new ArrayList<>();
 
 	public void setMain(String name, String longitude, String latitude) {
-		Geographic_NAME = name;
-		Geographic_LONGTITUDE = longitude;
-		Geographic_LATITUDE = latitude;
+		geoNAME = name;
+		geoLONGTITUDE = longitude;
+		geoLATITUDE = latitude;
 	}
 
 	public void addAlternative(GeoTag geotag) {
@@ -44,20 +45,20 @@ public class GeoTag {
 	 * @param bestNER best name entity among all the extracted entities for the
 	 * input stream
 	 */
-	public void toGeoTag(HashMap<String, ArrayList<String>> resolvedGeonames,
+	public void toGeoTag(Map<String, ArrayList<String>> resolvedGeonames,
 			String bestNER) {
 
-		for (String key : resolvedGeonames.keySet()) {
+		for (Entry<String, ArrayList<String>> key : resolvedGeonames.entrySet()) {
 			ArrayList<String> cur = resolvedGeonames.get(key);
 			if (key.equals(bestNER)) {
-				this.Geographic_NAME = cur.get(0);
-				this.Geographic_LONGTITUDE = cur.get(1);
-				this.Geographic_LATITUDE = cur.get(2);
+				this.geoNAME = cur.get(0);
+				this.geoLONGTITUDE = cur.get(1);
+				this.geoLATITUDE = cur.get(2);
 			} else {
 				GeoTag alter = new GeoTag();
-				alter.Geographic_NAME = cur.get(0);
-				alter.Geographic_LONGTITUDE = cur.get(1);
-				alter.Geographic_LATITUDE = cur.get(2);
+				alter.geoNAME = cur.get(0);
+				alter.geoLONGTITUDE = cur.get(1);
+				alter.geoLATITUDE = cur.get(2);
 				this.addAlternative(alter);
 			}
 		}

http://git-wip-us.apache.org/repos/asf/tika/blob/bd3ecfcd/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
index 3c6f0e8..822d343 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
@@ -43,11 +43,11 @@ public class NameEntityExtractor {
     private final NameFinderME nameFinder;
 
     public NameEntityExtractor(URL modelUrl) throws IOException {
-        this.locationNameEntities = new ArrayList<String>();
+        this.locationNameEntities = new ArrayList<>();
         this.bestNameEntity = null;
         TokenNameFinderModel model = new TokenNameFinderModel(modelUrl);
         this.nameFinder = new NameFinderME(model);
-        this.tf = new HashMap<String, Integer>();
+        this.tf = new HashMap<>();
     }
 
     /*
@@ -59,7 +59,7 @@ public class NameEntityExtractor {
      */
     public void getAllNameEntitiesfromInput(InputStream stream) throws IOException {
         String[] in = IOUtils.toString(stream, UTF_8).split(" ");
-        Span nameE[];
+        Span[] nameE;
         
         //name finder is not thread safe https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
         synchronized (nameFinder) {
@@ -89,7 +89,7 @@ public class NameEntityExtractor {
      * ArrayList
      */
     public void getBestNameEntity() {
-        if (this.locationNameEntities.size() == 0)
+        if (this.locationNameEntities.isEmpty())
             return;
 
         for (int i = 0; i < this.locationNameEntities.size(); ++i) {
@@ -100,10 +100,11 @@ public class NameEntityExtractor {
                 tf.put(this.locationNameEntities.get(i), 1);
         }
         int max = 0;
-        List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(
+        List<Map.Entry<String, Integer>> list = new ArrayList<>(
                 tf.entrySet());
         Collections.shuffle(list);
         Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
+            @Override
             public int compare(Map.Entry<String, Integer> o1,
                     Map.Entry<String, Integer> o2) {
                 // Descending Order


[2/2] tika git commit: Merge branch '2.x' of https://git-wip-us.apache.org/repos/asf/tika into 2.x

Posted by le...@apache.org.
Merge branch '2.x' of https://git-wip-us.apache.org/repos/asf/tika into 2.x


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/573527bb
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/573527bb
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/573527bb

Branch: refs/heads/2.x
Commit: 573527bbc608d495c40f26c02c7286197c3c723b
Parents: bd3ecfc 2a7e52e
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Thu Jun 30 12:32:01 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Thu Jun 30 12:32:01 2016 -0700

----------------------------------------------------------------------
 CHANGES.txt                                     |   36 +-
 tika-app/pom.xml                                |    8 +
 .../org/apache/tika/mime/TestMimeTypes.java     |   33 +-
 .../tika/parser/AutoDetectParserTest.java       |   24 +-
 .../ConfigurableThreadPoolExecutor.java         |   64 +-
 .../concurrent/SimpleThreadPoolExecutor.java    |   80 +-
 .../apache/tika/detect/AbstractDetector.java    |   86 +-
 .../org/apache/tika/detect/DetectorProxy.java   |  134 +-
 .../tika/detect/EncodingDetectorProxy.java      |   82 +-
 .../java/org/apache/tika/io/EndianUtils.java    |  830 ++---
 .../java/org/apache/tika/io/StringUtil.java     |  242 +-
 .../tika/metadata/TikaCoreProperties.java       |    7 +
 .../tika/osgi/TikaAbstractBundleActivator.java  |  142 +-
 .../java/org/apache/tika/osgi/TikaService.java  |   50 +-
 .../tika/osgi/internal/TikaServiceImpl.java     |  162 +-
 .../org/apache/tika/parser/AbstractParser.java  |   24 -
 .../org/apache/tika/parser/ParserProxy.java     |  148 +-
 .../org/apache/tika/utils/ConcurrentUtils.java  |  114 +-
 .../org/apache/tika/mime/tika-mimetypes.xml     |   54 +-
 .../java/org/apache/tika/TikaDetectionTest.java |    2 +-
 .../src/test/java/org/apache/tika/TikaTest.java |    6 +-
 .../org/apache/tika/config/DummyExecutor.java   |   60 +-
 .../apache/tika/detect/DetectorProxyTest.java   |  112 +-
 .../apache/tika/detect/DummyProxyDetector.java  |   62 +-
 .../org/apache/tika/io/EndianUtilsTest.java     |   35 +
 .../apache/tika/parser/DummyProxyParser.java    |   88 +-
 .../org/apache/tika/parser/ParserProxyTest.java |  130 +-
 .../apache/tika/utils/ConcurrentUtilsTest.java  |  126 +-
 .../services/org.apache.tika.parser.Parser      |   34 +-
 .../apache/tika/config/TIKA-1762-executors.xml  |   56 +-
 tika-parent/pom.xml                             |    9 +
 tika-parser-bundles/pom.xml                     |  350 +--
 .../tika-parser-advanced-bundle/pom.xml         |  162 +-
 .../tika-parser-cad-bundle/pom.xml              |  144 +-
 .../tika-parser-code-bundle/pom.xml             |  148 +-
 .../tika-parser-crypto-bundle/pom.xml           |  156 +-
 .../tika-parser-database-bundle/pom.xml         |  134 +-
 .../tika-parser-ebook-bundle/pom.xml            |  142 +-
 .../tika-parser-journal-bundle/pom.xml          |  158 +-
 .../apache/tika/module/journal/BundleIT.java    |    2 +-
 .../tika-parser-multimedia-bundle/pom.xml       |  168 +-
 .../tika-parser-office-bundle/pom.xml           |  280 +-
 .../org/apache/tika/module/office/BundleIT.java |   24 +-
 .../tika-parser-package-bundle/pom.xml          |  158 +-
 .../tika-parser-pdf-bundle/pom.xml              |  197 +-
 .../org/apache/tika/module/pdf/BundleIT.java    |    2 +-
 .../tika-parser-scientific-bundle/pom.xml       |  402 +--
 .../tika-parser-text-bundle/pom.xml             |  156 +-
 .../tika-parser-web-bundle/pom.xml              |  184 +-
 tika-parser-modules/pom.xml                     |  410 +--
 .../tika-parser-advanced-module/pom.xml         |  136 +-
 .../module/advanced/internal/Activator.java     |   72 +-
 .../tika-parser-cad-module/pom.xml              |  110 +-
 .../tika/module/cad/internal/Activator.java     |   72 +-
 .../org/apache/tika/parser/dwg/DWGParser.java   |  712 ++---
 .../org/apache/tika/parser/prt/PRTParser.java   |  555 ++--
 .../apache/tika/parser/dwg/DWGParserTest.java   |  372 ++-
 .../apache/tika/parser/prt/PRTParserTest.java   |  214 +-
 .../tika-parser-code-module/pom.xml             |  136 +-
 .../tika/module/code/internal/Activator.java    |   72 +-
 .../org/apache/tika/parser/asm/ClassParser.java |  108 +-
 .../tika/parser/asm/XHTMLClassVisitor.java      |  646 ++--
 .../tika/parser/code/SourceCodeParser.java      |  284 +-
 .../apache/tika/parser/asm/ClassParserTest.java |  118 +-
 .../tika/parser/code/SourceCodeParserTest.java  |  202 +-
 .../tika-parser-crypto-module/pom.xml           |  104 +-
 .../tika/module/crypto/internal/Activator.java  |   72 +-
 .../tika/parser/crypto/Pkcs7ParserTest.java     |   94 +-
 .../tika-parser-database-module/pom.xml         |  132 +-
 .../module/database/internal/Activator.java     |   72 +-
 .../tika-parser-ebook-module/pom.xml            |   94 +-
 .../tika/module/ebook/internal/Activator.java   |   72 +-
 .../tika/parser/epub/EpubContentParser.java     |  118 +-
 .../org/apache/tika/parser/epub/EpubParser.java |  238 +-
 .../apache/tika/parser/epub/EpubParserTest.java |  116 +-
 .../tika-parser-journal-module/pom.xml          |  134 +-
 .../tika/module/journal/internal/Activator.java |   72 +-
 .../tika-parser-multimedia-module/pom.xml       |  206 +-
 .../module/multimedia/internal/Activator.java   |   72 +-
 .../apache/tika/parser/audio/AudioParser.java   |  278 +-
 .../apache/tika/parser/audio/MidiParser.java    |  242 +-
 .../apache/tika/parser/font/TrueTypeParser.java |  222 +-
 .../parser/image/ImageMetadataExtractor.java    | 1124 +++----
 .../apache/tika/parser/image/ImageParser.java   |  406 +--
 .../tika/parser/image/MetadataFields.java       |  168 +-
 .../apache/tika/parser/image/TiffParser.java    |  136 +-
 .../org/apache/tika/parser/jpeg/JpegParser.java |  138 +-
 .../org/apache/tika/parser/mp3/AudioFrame.java  |  504 ++--
 .../tika/parser/mp3/CompositeTagHandler.java    |  284 +-
 .../org/apache/tika/parser/mp3/ID3Tags.java     |  508 ++--
 .../apache/tika/parser/mp3/ID3v1Handler.java    |  366 +--
 .../apache/tika/parser/mp3/ID3v22Handler.java   |  318 +-
 .../apache/tika/parser/mp3/ID3v23Handler.java   |  276 +-
 .../apache/tika/parser/mp3/ID3v24Handler.java   |  286 +-
 .../org/apache/tika/parser/mp3/ID3v2Frame.java  |  848 +++---
 .../apache/tika/parser/mp3/LyricsHandler.java   |  312 +-
 .../org/apache/tika/parser/mp3/MP3Frame.java    |   50 +-
 .../org/apache/tika/parser/mp3/Mp3Parser.java   |  492 +--
 .../tika/parser/ocr/TesseractOCRParser.java     |   93 +-
 .../org/apache/tika/parser/video/FLVParser.java |  536 ++--
 .../parser/ocr/TesseractOCRConfig.properties    |   40 +-
 .../tika/parser/audio/AudioParserTest.java      |  150 +-
 .../tika/parser/audio/MidiParserTest.java       |   84 +-
 .../image/ImageMetadataExtractorTest.java       |  278 +-
 .../tika/parser/image/ImageParserTest.java      |  324 +-
 .../tika/parser/image/MetadataFieldsTest.java   |   72 +-
 .../tika/parser/image/TiffParserTest.java       |  132 +-
 .../apache/tika/parser/jpeg/JpegParserTest.java |  568 ++--
 .../apache/tika/parser/mp3/Mp3ParserTest.java   |  828 ++---
 .../tika/parser/ocr/TesseractOCRConfigTest.java |  184 +-
 .../tika/parser/ocr/TesseractOCRParserTest.java |  527 ++--
 .../apache/tika/parser/video/FLVParserTest.java |   88 +-
 .../tika-parser-office-module/pom.xml           |  250 +-
 .../tika/module/office/internal/Activator.java  |   72 +-
 .../parser/apple/AppleSingleFileParser.java     |  204 ++
 .../org/apache/tika/parser/chm/ChmParser.java   |  224 +-
 .../tika/parser/chm/accessor/ChmAccessor.java   |   78 +-
 .../chm/accessor/ChmDirectoryListingSet.java    |  796 ++---
 .../tika/parser/chm/accessor/ChmItsfHeader.java |  984 +++---
 .../tika/parser/chm/accessor/ChmItspHeader.java | 1096 +++----
 .../parser/chm/accessor/ChmLzxcControlData.java |  638 ++--
 .../parser/chm/accessor/ChmLzxcResetTable.java  |  682 ++---
 .../tika/parser/chm/accessor/ChmPmgiHeader.java |  352 +--
 .../tika/parser/chm/accessor/ChmPmglHeader.java |  412 +--
 .../chm/accessor/DirectoryListingEntry.java     |  302 +-
 .../tika/parser/chm/assertion/ChmAssert.java    |  338 +--
 .../apache/tika/parser/chm/core/ChmCommons.java |  722 ++---
 .../tika/parser/chm/core/ChmConstants.java      |  204 +-
 .../tika/parser/chm/core/ChmExtractor.java      |  784 ++---
 .../apache/tika/parser/chm/core/ChmWrapper.java |  294 +-
 .../chm/exception/ChmParsingException.java      |   54 +-
 .../tika/parser/chm/lzx/ChmBlockInfo.java       |  470 +--
 .../apache/tika/parser/chm/lzx/ChmLzxBlock.java | 1826 +++++------
 .../apache/tika/parser/chm/lzx/ChmLzxState.java |  654 ++--
 .../apache/tika/parser/chm/lzx/ChmSection.java  |  444 +--
 .../org/apache/tika/parser/mbox/MboxParser.java |  418 +--
 .../tika/parser/mbox/OutlookPSTParser.java      |  406 +--
 .../microsoft/AbstractPOIFSExtractor.java       |   32 +-
 .../tika/parser/microsoft/HSLFExtractor.java    |   18 +-
 .../parser/microsoft/JackcessExtractor.java     |    4 +-
 .../parser/microsoft/MSOwnerFileParser.java     |   80 +
 .../tika/parser/microsoft/OfficeParser.java     |    2 +-
 .../tika/parser/microsoft/WordExtractor.java    |   22 +-
 .../microsoft/ooxml/AbstractOOXMLExtractor.java |   12 +-
 .../microsoft/xml/AbstractXML2003Parser.java    |   93 +-
 .../microsoft/xml/SpreadsheetMLParser.java      |   42 +-
 .../tika/parser/microsoft/xml/WordMLParser.java |  121 +-
 .../parser/odf/NSNormalizerContentHandler.java  |  198 +-
 .../parser/odf/OpenDocumentContentParser.java   |  992 +++---
 .../tika/parser/odf/OpenDocumentMetaParser.java |  398 +--
 .../tika/parser/odf/OpenDocumentParser.java     |  450 +--
 .../org/apache/tika/parser/opc/OPCDetector.java |  310 +-
 .../parser/opendocument/OpenOfficeParser.java   |   56 +-
 .../org/apache/tika/parser/rtf/GroupState.java  |  134 +-
 .../apache/tika/parser/rtf/ListDescriptor.java  |   70 +-
 .../tika/parser/rtf/RTFEmbObjHandler.java       |    7 +-
 .../tika/parser/rtf/RTFObjDataParser.java       |   43 +-
 .../org/apache/tika/parser/rtf/RTFParser.java   |  186 +-
 .../apache/tika/parser/rtf/TextExtractor.java   | 2853 +++++++++---------
 .../services/org.apache.tika.parser.Parser      |    3 +-
 .../parser/apple/AppleSingleFileParserTest.java |   43 +
 .../tika/parser/chm/TestChmBlockInfo.java       |  250 +-
 .../tika/parser/chm/TestChmExtraction.java      |  424 +--
 .../tika/parser/chm/TestChmExtractor.java       |  126 +-
 .../tika/parser/chm/TestChmItsfHeader.java      |  244 +-
 .../tika/parser/chm/TestChmItspHeader.java      |  320 +-
 .../apache/tika/parser/chm/TestChmLzxState.java |  202 +-
 .../tika/parser/chm/TestChmLzxcControlData.java |  288 +-
 .../tika/parser/chm/TestChmLzxcResetTable.java  |  312 +-
 .../parser/chm/TestDirectoryListingEntry.java   |  170 +-
 .../apache/tika/parser/chm/TestParameters.java  |  208 +-
 .../apache/tika/parser/chm/TestPmgiHeader.java  |   90 +-
 .../apache/tika/parser/chm/TestPmglHeader.java  |  152 +-
 .../apache/tika/parser/dbf/DBFParserTest.java   |    2 +
 .../apache/tika/parser/mbox/MboxParserTest.java |  312 +-
 .../tika/parser/mbox/OutlookPSTParserTest.java  |  220 +-
 .../AbstractPOIContainerExtractionTest.java     |  150 +-
 .../tika/parser/microsoft/ExcelParserTest.java  |  817 ++---
 .../parser/microsoft/MSOwnerFileParserTest.java |   31 +
 .../tika/parser/microsoft/OfficeParserTest.java |   92 +-
 .../parser/microsoft/OutlookParserTest.java     |  478 +--
 .../microsoft/POIContainerExtractionTest.java   |  764 ++---
 .../parser/microsoft/PowerPointParserTest.java  |  492 +--
 .../parser/microsoft/PublisherParserTest.java   |  106 +-
 .../tika/parser/microsoft/TNEFParserTest.java   |  196 +-
 .../tika/parser/microsoft/VisioParserTest.java  |  102 +-
 .../tika/parser/microsoft/WordParserTest.java   | 1011 ++++---
 .../ooxml/OOXMLContainerExtractionTest.java     |    2 +-
 .../parser/microsoft/ooxml/OOXMLParserTest.java |   27 +
 .../parser/microsoft/xml/XML2003ParserTest.java |   40 +-
 .../apache/tika/parser/odf/ODFParserTest.java   |  680 ++---
 .../apache/tika/parser/rtf/RTFParserTest.java   | 1050 ++++---
 .../tika-parser-package-module/pom.xml          |  150 +-
 .../tika/module/pkg/internal/Activator.java     |   72 +-
 .../tika/parser/iwork/AutoPageNumberUtils.java  |  224 +-
 .../tika/parser/iwork/IWorkPackageParser.java   |  438 +--
 .../parser/iwork/KeynoteContentHandler.java     |  348 +--
 .../parser/iwork/NumbersContentHandler.java     |  462 +--
 .../tika/parser/iwork/PagesContentHandler.java  |  896 +++---
 .../apache/tika/parser/pkg/PackageParser.java   |  574 ++--
 .../tika/parser/pkg/ZipContainerDetector.java   |  648 ++--
 .../parser/iwork/AutoPageNumberUtilsTest.java   |  156 +-
 .../tika/parser/iwork/IWorkParserTest.java      |  932 +++---
 .../apache/tika/parser/pkg/AbstractPkgTest.java |  186 +-
 .../apache/tika/parser/pkg/Bzip2ParserTest.java |  178 +-
 .../apache/tika/parser/pkg/GzipParserTest.java  |  204 +-
 .../apache/tika/parser/pkg/TarParserTest.java   |  210 +-
 .../apache/tika/parser/pkg/ZipParserTest.java   |  384 +--
 .../tika-parser-pdf-module/pom.xml              |  246 +-
 .../tika/module/pdf/internal/Activator.java     |   72 +-
 .../tika/parser/pdf/AbstractPDF2XHTML.java      |  579 ++++
 .../org/apache/tika/parser/pdf/OCR2XHTML.java   |  125 +
 .../org/apache/tika/parser/pdf/PDF2XHTML.java   |  518 +---
 .../org/apache/tika/parser/pdf/PDFParser.java   |    8 +
 .../apache/tika/parser/pdf/PDFParserConfig.java |  274 +-
 .../apache/tika/parser/pdf/PDFParser.properties |   10 +-
 .../apache/tika/parser/pdf/PDFParserTest.java   |   49 +-
 .../tika-parser-scientific-module/pom.xml       |  270 +-
 .../module/scientific/internal/Activator.java   |   72 +-
 .../org/apache/tika/parser/hdf/HDFParser.java   |  244 +-
 .../apache/tika/parser/hdf/HDFParserTest.java   |  144 +-
 .../tika/parser/netcdf/NetCDFParserTest.java    |  122 +-
 .../tika-parser-text-module/pom.xml             |  132 +-
 .../tika/module/text/internal/Activator.java    |   40 +-
 .../apache/tika/parser/txt/CharsetDetector.java | 1088 +++----
 .../apache/tika/parser/txt/CharsetMatch.java    |  572 ++--
 .../tika/parser/txt/CharsetRecog_2022.java      |  326 +-
 .../tika/parser/txt/CharsetRecog_UTF8.java      |  198 +-
 .../tika/parser/txt/CharsetRecog_Unicode.java   |  278 +-
 .../tika/parser/txt/CharsetRecog_mbcs.java      | 1064 +++----
 .../tika/parser/txt/CharsetRecog_sbcs.java      | 2706 ++++++++---------
 .../tika/parser/txt/CharsetRecognizer.java      |  108 +-
 .../org/apache/tika/parser/txt/TXTParser.java   |  196 +-
 .../parser/xml/AbstractMetadataHandler.java     |  186 +-
 .../xml/AttributeDependantMetadataHandler.java  |  164 +-
 .../parser/xml/AttributeMetadataHandler.java    |  122 +-
 .../org/apache/tika/parser/xml/DcXMLParser.java |  120 +-
 .../tika/parser/xml/ElementMetadataHandler.java |  510 ++--
 .../tika/parser/xml/FictionBookParser.java      |  234 +-
 .../apache/tika/parser/xml/MetadataHandler.java |  170 +-
 .../org/apache/tika/parser/xml/XMLParser.java   |  178 +-
 .../apache/tika/parser/txt/TXTParserTest.java   |  548 ++--
 .../apache/tika/parser/xml/DcXMLParserTest.java |  174 +-
 .../EmptyAndDuplicateElementsXMLParserTest.java |  232 +-
 .../tika/parser/xml/FictionBookParserTest.java  |  108 +-
 .../tika-parser-web-module/pom.xml              |  178 +-
 .../tika/module/web/internal/Activator.java     |   72 +-
 .../org/apache/tika/parser/feed/FeedParser.java |  254 +-
 .../parser/html/BoilerpipeContentHandler.java   |  694 ++---
 .../tika/parser/html/DefaultHtmlMapper.java     |  274 +-
 .../apache/tika/parser/html/HtmlHandler.java    |  618 ++--
 .../org/apache/tika/parser/html/HtmlMapper.java |  138 +-
 .../org/apache/tika/parser/html/HtmlParser.java |  388 +--
 .../tika/parser/html/IdentityHtmlMapper.java    |   86 +-
 .../tika/parser/html/XHTMLDowngradeHandler.java |  156 +-
 .../tika/parser/mail/MailContentHandler.java    |  752 ++---
 .../apache/tika/parser/mail/RFC822Parser.java   |  190 +-
 .../apache/tika/parser/feed/FeedParserTest.java |  150 +-
 .../apache/tika/parser/html/HtmlParserTest.java | 2262 +++++++-------
 .../tika/parser/mail/RFC822ParserTest.java      |  970 +++---
 .../tika/parser/xmp/JempboxExtractor.java       |   30 +
 .../tika/parser/xmp/JempboxExtractorTest.java   |   29 +-
 .../test-documents/testAppleSingleFile.pdf      |  Bin 0 -> 1893 bytes
 .../test/resources/test-documents/testDJVU.djvu |  Bin 0 -> 89 bytes
 .../test-documents/testEXCEL_embeddedPDF.xls    |  Bin 0 -> 38400 bytes
 .../test-documents/testEXCEL_embeddedPDF.xlsx   |  Bin 0 -> 25602 bytes
 .../test-documents/testEndNoteImportFile.enw    |   10 +
 .../resources/test-documents/testICalendar.ics  |   15 +
 .../resources/test-documents/testMSOwnerFile    |  Bin 0 -> 162 bytes
 .../test-documents/testPPT_embeddedPDF.ppt      |  Bin 0 -> 187392 bytes
 .../test-documents/testPPT_embeddedPDF.pptx     |  Bin 0 -> 108637 bytes
 .../resources/test-documents/testVCalendar.vcs  |   10 +
 .../test-documents/testWindowsMediaMeta.asx     |    6 +
 .../test/resources/test-documents/testXMP.xmp   |  178 ++
 .../test-documents/test_recursive_embedded.doc  |  Bin 0 -> 31744 bytes
 275 files changed, 39074 insertions(+), 37550 deletions(-)
----------------------------------------------------------------------