You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by le...@apache.org on 2016/06/30 19:25:51 UTC
[1/2] tika git commit: TIKA-1978 Invocation of
java.net.URL.equals(Object), which blocks to do domain name resolution,
in org.apache.tika.parser.geo.topic.GeoParser.initialize(URL) 2.x branch
Repository: tika
Updated Branches:
refs/heads/2.x 2a7e52ec4 -> 573527bbc
TIKA-1978 Invocation of java.net.URL.equals(Object), which blocks to do domain name resolution, in org.apache.tika.parser.geo.topic.GeoParser.initialize(URL) 2.x branch
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/bd3ecfcd
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/bd3ecfcd
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/bd3ecfcd
Branch: refs/heads/2.x
Commit: bd3ecfcddeaf13262e477ba29c5256ebd44e32db
Parents: 43e3000
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Thu May 26 11:15:02 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Thu May 26 11:15:02 2016 -0700
----------------------------------------------------------------------
.../apache/tika/parser/geo/topic/GeoParser.java | 43 +++++++++++---------
.../tika/parser/geo/topic/GeoParserConfig.java | 4 +-
.../apache/tika/parser/geo/topic/GeoTag.java | 33 +++++++--------
.../parser/geo/topic/NameEntityExtractor.java | 11 ++---
4 files changed, 48 insertions(+), 43 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/bd3ecfcd/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
index eaef6ad..303f878 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
@@ -20,19 +20,21 @@ package org.apache.tika.parser.geo.topic;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
import java.util.Set;
+import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.exec.CommandLine;
import org.apache.commons.exec.DefaultExecutor;
-import org.apache.commons.exec.ExecuteException;
import org.apache.commons.exec.ExecuteWatchdog;
import org.apache.commons.exec.PumpStreamHandler;
-import org.apache.commons.exec.environment.EnvironmentUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -57,7 +59,7 @@ public class GeoParser extends AbstractParser {
private boolean initialized;
private URL modelUrl;
- private NameEntityExtractor extractor;
+ private transient NameEntityExtractor extractor;
private boolean available;
@Override
@@ -70,9 +72,12 @@ public class GeoParser extends AbstractParser {
* @param modelUrl the URL to NER model
*/
public void initialize(URL modelUrl) {
- if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) {
- // Previously initialized for the same URL, no initialization needed
- return;
+ try {
+ if (this.modelUrl != null && this.modelUrl.toURI().equals(modelUrl.toURI())) {
+ return;
+ }
+ } catch (URISyntaxException e1) {
+ LOG.log(Level.SEVERE, e1.getMessage(), e1);
}
this.modelUrl = modelUrl;
@@ -112,7 +117,7 @@ public class GeoParser extends AbstractParser {
String bestner = extractor.bestNameEntity;
/*------------------------resolve geonames for each ner, store results in a hashmap---------------------*/
- HashMap<String, ArrayList<String>> resolvedGeonames = searchGeoNames(locationNameEntities);
+ HashMap<String, ArrayList<String>> resolvedGeonames = (HashMap<String, ArrayList<String>>) searchGeoNames(locationNameEntities);
/*----------------store locationNameEntities and their geonames in a geotag, each input has one geotag---------------------*/
GeoTag geotag = new GeoTag();
@@ -120,22 +125,21 @@ public class GeoParser extends AbstractParser {
/* add resolved entities in metadata */
- metadata.add("Geographic_NAME", geotag.Geographic_NAME);
- metadata.add("Geographic_LONGITUDE", geotag.Geographic_LONGTITUDE);
- metadata.add("Geographic_LATITUDE", geotag.Geographic_LATITUDE);
+ metadata.add("Geographic_NAME", geotag.geoNAME);
+ metadata.add("Geographic_LONGITUDE", geotag.geoLONGTITUDE);
+ metadata.add("Geographic_LATITUDE", geotag.geoLATITUDE);
for (int i = 0; i < geotag.alternatives.size(); ++i) {
- GeoTag alter = (GeoTag) geotag.alternatives.get(i);
- metadata.add("Optional_NAME" + (i + 1), alter.Geographic_NAME);
+ GeoTag alter = geotag.alternatives.get(i);
+ metadata.add("Optional_NAME" + (i + 1), alter.geoNAME);
metadata.add("Optional_LONGITUDE" + (i + 1),
- alter.Geographic_LONGTITUDE);
+ alter.geoLONGTITUDE);
metadata.add("Optional_LATITUDE" + (i + 1),
- alter.Geographic_LATITUDE);
+ alter.geoLATITUDE);
}
}
- public HashMap<String, ArrayList<String>> searchGeoNames(
- ArrayList<String> locationNameEntities) throws ExecuteException,
- IOException {
+ public Map<String, ArrayList<String>> searchGeoNames(
+ List<String> locationNameEntities) throws IOException {
CommandLine cmdLine = new CommandLine("lucene-geo-gazetteer");
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
cmdLine.addArgument("-s");
@@ -150,17 +154,16 @@ public class GeoParser extends AbstractParser {
exec.setWatchdog(watchdog);
PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream);
exec.setStreamHandler(streamHandler);
- int exitValue = exec.execute(cmdLine, EnvironmentUtils.getProcEnvironment());
String outputJson = outputStream.toString("UTF-8");
JSONArray json = (JSONArray) JSONValue.parse(outputJson);
- HashMap<String, ArrayList<String>> returnHash = new HashMap<String, ArrayList<String>>();
+ HashMap<String, ArrayList<String>> returnHash = new HashMap<>();
for (int i = 0; i < json.size(); i++) {
JSONObject obj = (JSONObject) json.get(i);
for (Object key : obj.keySet()) {
String theKey = (String) key;
JSONArray vals = (JSONArray) obj.get(theKey);
- ArrayList<String> stringVals = new ArrayList<String>(
+ ArrayList<String> stringVals = new ArrayList<>(
vals.size());
for (int j = 0; j < vals.size(); j++) {
String val = (String) vals.get(j);
http://git-wip-us.apache.org/repos/asf/tika/blob/bd3ecfcd/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
index 305e663..56272e1 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
@@ -30,7 +30,7 @@ public class GeoParserConfig implements Serializable {
this.nerModelUrl = GeoParserConfig.class.getResource("en-ner-location.bin");
}
- public void setNERModelPath(String path) {
+ public void setNERModelPath(String path) throws MalformedURLException {
if (path == null)
return;
File file = new File(path);
@@ -40,7 +40,7 @@ public class GeoParserConfig implements Serializable {
try {
this.nerModelUrl = file.toURI().toURL();
} catch (MalformedURLException e) {
- throw new RuntimeException(e);
+ throw new MalformedURLException(e.getMessage());
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/bd3ecfcd/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
index bccaef1..fe4b9c6 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
@@ -18,18 +18,19 @@
package org.apache.tika.parser.geo.topic;
import java.util.ArrayList;
-import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
public class GeoTag {
- String Geographic_NAME;
- String Geographic_LONGTITUDE;
- String Geographic_LATITUDE;
- ArrayList<GeoTag> alternatives = new ArrayList<GeoTag>();
+ String geoNAME;
+ String geoLONGTITUDE;
+ String geoLATITUDE;
+ ArrayList<GeoTag> alternatives = new ArrayList<>();
public void setMain(String name, String longitude, String latitude) {
- Geographic_NAME = name;
- Geographic_LONGTITUDE = longitude;
- Geographic_LATITUDE = latitude;
+ geoNAME = name;
+ geoLONGTITUDE = longitude;
+ geoLATITUDE = latitude;
}
public void addAlternative(GeoTag geotag) {
@@ -44,20 +45,20 @@ public class GeoTag {
* @param bestNER best name entity among all the extracted entities for the
* input stream
*/
- public void toGeoTag(HashMap<String, ArrayList<String>> resolvedGeonames,
+ public void toGeoTag(Map<String, ArrayList<String>> resolvedGeonames,
String bestNER) {
- for (String key : resolvedGeonames.keySet()) {
+ for (Entry<String, ArrayList<String>> key : resolvedGeonames.entrySet()) {
ArrayList<String> cur = resolvedGeonames.get(key);
if (key.equals(bestNER)) {
- this.Geographic_NAME = cur.get(0);
- this.Geographic_LONGTITUDE = cur.get(1);
- this.Geographic_LATITUDE = cur.get(2);
+ this.geoNAME = cur.get(0);
+ this.geoLONGTITUDE = cur.get(1);
+ this.geoLATITUDE = cur.get(2);
} else {
GeoTag alter = new GeoTag();
- alter.Geographic_NAME = cur.get(0);
- alter.Geographic_LONGTITUDE = cur.get(1);
- alter.Geographic_LATITUDE = cur.get(2);
+ alter.geoNAME = cur.get(0);
+ alter.geoLONGTITUDE = cur.get(1);
+ alter.geoLATITUDE = cur.get(2);
this.addAlternative(alter);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/bd3ecfcd/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
index 3c6f0e8..822d343 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
@@ -43,11 +43,11 @@ public class NameEntityExtractor {
private final NameFinderME nameFinder;
public NameEntityExtractor(URL modelUrl) throws IOException {
- this.locationNameEntities = new ArrayList<String>();
+ this.locationNameEntities = new ArrayList<>();
this.bestNameEntity = null;
TokenNameFinderModel model = new TokenNameFinderModel(modelUrl);
this.nameFinder = new NameFinderME(model);
- this.tf = new HashMap<String, Integer>();
+ this.tf = new HashMap<>();
}
/*
@@ -59,7 +59,7 @@ public class NameEntityExtractor {
*/
public void getAllNameEntitiesfromInput(InputStream stream) throws IOException {
String[] in = IOUtils.toString(stream, UTF_8).split(" ");
- Span nameE[];
+ Span[] nameE;
//name finder is not thread safe https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
synchronized (nameFinder) {
@@ -89,7 +89,7 @@ public class NameEntityExtractor {
* ArrayList
*/
public void getBestNameEntity() {
- if (this.locationNameEntities.size() == 0)
+ if (this.locationNameEntities.isEmpty())
return;
for (int i = 0; i < this.locationNameEntities.size(); ++i) {
@@ -100,10 +100,11 @@ public class NameEntityExtractor {
tf.put(this.locationNameEntities.get(i), 1);
}
int max = 0;
- List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(
+ List<Map.Entry<String, Integer>> list = new ArrayList<>(
tf.entrySet());
Collections.shuffle(list);
Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
+ @Override
public int compare(Map.Entry<String, Integer> o1,
Map.Entry<String, Integer> o2) {
// Descending Order
[2/2] tika git commit: Merge branch '2.x' of
https://git-wip-us.apache.org/repos/asf/tika into 2.x
Posted by le...@apache.org.
Merge branch '2.x' of https://git-wip-us.apache.org/repos/asf/tika into 2.x
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/573527bb
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/573527bb
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/573527bb
Branch: refs/heads/2.x
Commit: 573527bbc608d495c40f26c02c7286197c3c723b
Parents: bd3ecfc 2a7e52e
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Thu Jun 30 12:32:01 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Thu Jun 30 12:32:01 2016 -0700
----------------------------------------------------------------------
CHANGES.txt | 36 +-
tika-app/pom.xml | 8 +
.../org/apache/tika/mime/TestMimeTypes.java | 33 +-
.../tika/parser/AutoDetectParserTest.java | 24 +-
.../ConfigurableThreadPoolExecutor.java | 64 +-
.../concurrent/SimpleThreadPoolExecutor.java | 80 +-
.../apache/tika/detect/AbstractDetector.java | 86 +-
.../org/apache/tika/detect/DetectorProxy.java | 134 +-
.../tika/detect/EncodingDetectorProxy.java | 82 +-
.../java/org/apache/tika/io/EndianUtils.java | 830 ++---
.../java/org/apache/tika/io/StringUtil.java | 242 +-
.../tika/metadata/TikaCoreProperties.java | 7 +
.../tika/osgi/TikaAbstractBundleActivator.java | 142 +-
.../java/org/apache/tika/osgi/TikaService.java | 50 +-
.../tika/osgi/internal/TikaServiceImpl.java | 162 +-
.../org/apache/tika/parser/AbstractParser.java | 24 -
.../org/apache/tika/parser/ParserProxy.java | 148 +-
.../org/apache/tika/utils/ConcurrentUtils.java | 114 +-
.../org/apache/tika/mime/tika-mimetypes.xml | 54 +-
.../java/org/apache/tika/TikaDetectionTest.java | 2 +-
.../src/test/java/org/apache/tika/TikaTest.java | 6 +-
.../org/apache/tika/config/DummyExecutor.java | 60 +-
.../apache/tika/detect/DetectorProxyTest.java | 112 +-
.../apache/tika/detect/DummyProxyDetector.java | 62 +-
.../org/apache/tika/io/EndianUtilsTest.java | 35 +
.../apache/tika/parser/DummyProxyParser.java | 88 +-
.../org/apache/tika/parser/ParserProxyTest.java | 130 +-
.../apache/tika/utils/ConcurrentUtilsTest.java | 126 +-
.../services/org.apache.tika.parser.Parser | 34 +-
.../apache/tika/config/TIKA-1762-executors.xml | 56 +-
tika-parent/pom.xml | 9 +
tika-parser-bundles/pom.xml | 350 +--
.../tika-parser-advanced-bundle/pom.xml | 162 +-
.../tika-parser-cad-bundle/pom.xml | 144 +-
.../tika-parser-code-bundle/pom.xml | 148 +-
.../tika-parser-crypto-bundle/pom.xml | 156 +-
.../tika-parser-database-bundle/pom.xml | 134 +-
.../tika-parser-ebook-bundle/pom.xml | 142 +-
.../tika-parser-journal-bundle/pom.xml | 158 +-
.../apache/tika/module/journal/BundleIT.java | 2 +-
.../tika-parser-multimedia-bundle/pom.xml | 168 +-
.../tika-parser-office-bundle/pom.xml | 280 +-
.../org/apache/tika/module/office/BundleIT.java | 24 +-
.../tika-parser-package-bundle/pom.xml | 158 +-
.../tika-parser-pdf-bundle/pom.xml | 197 +-
.../org/apache/tika/module/pdf/BundleIT.java | 2 +-
.../tika-parser-scientific-bundle/pom.xml | 402 +--
.../tika-parser-text-bundle/pom.xml | 156 +-
.../tika-parser-web-bundle/pom.xml | 184 +-
tika-parser-modules/pom.xml | 410 +--
.../tika-parser-advanced-module/pom.xml | 136 +-
.../module/advanced/internal/Activator.java | 72 +-
.../tika-parser-cad-module/pom.xml | 110 +-
.../tika/module/cad/internal/Activator.java | 72 +-
.../org/apache/tika/parser/dwg/DWGParser.java | 712 ++---
.../org/apache/tika/parser/prt/PRTParser.java | 555 ++--
.../apache/tika/parser/dwg/DWGParserTest.java | 372 ++-
.../apache/tika/parser/prt/PRTParserTest.java | 214 +-
.../tika-parser-code-module/pom.xml | 136 +-
.../tika/module/code/internal/Activator.java | 72 +-
.../org/apache/tika/parser/asm/ClassParser.java | 108 +-
.../tika/parser/asm/XHTMLClassVisitor.java | 646 ++--
.../tika/parser/code/SourceCodeParser.java | 284 +-
.../apache/tika/parser/asm/ClassParserTest.java | 118 +-
.../tika/parser/code/SourceCodeParserTest.java | 202 +-
.../tika-parser-crypto-module/pom.xml | 104 +-
.../tika/module/crypto/internal/Activator.java | 72 +-
.../tika/parser/crypto/Pkcs7ParserTest.java | 94 +-
.../tika-parser-database-module/pom.xml | 132 +-
.../module/database/internal/Activator.java | 72 +-
.../tika-parser-ebook-module/pom.xml | 94 +-
.../tika/module/ebook/internal/Activator.java | 72 +-
.../tika/parser/epub/EpubContentParser.java | 118 +-
.../org/apache/tika/parser/epub/EpubParser.java | 238 +-
.../apache/tika/parser/epub/EpubParserTest.java | 116 +-
.../tika-parser-journal-module/pom.xml | 134 +-
.../tika/module/journal/internal/Activator.java | 72 +-
.../tika-parser-multimedia-module/pom.xml | 206 +-
.../module/multimedia/internal/Activator.java | 72 +-
.../apache/tika/parser/audio/AudioParser.java | 278 +-
.../apache/tika/parser/audio/MidiParser.java | 242 +-
.../apache/tika/parser/font/TrueTypeParser.java | 222 +-
.../parser/image/ImageMetadataExtractor.java | 1124 +++----
.../apache/tika/parser/image/ImageParser.java | 406 +--
.../tika/parser/image/MetadataFields.java | 168 +-
.../apache/tika/parser/image/TiffParser.java | 136 +-
.../org/apache/tika/parser/jpeg/JpegParser.java | 138 +-
.../org/apache/tika/parser/mp3/AudioFrame.java | 504 ++--
.../tika/parser/mp3/CompositeTagHandler.java | 284 +-
.../org/apache/tika/parser/mp3/ID3Tags.java | 508 ++--
.../apache/tika/parser/mp3/ID3v1Handler.java | 366 +--
.../apache/tika/parser/mp3/ID3v22Handler.java | 318 +-
.../apache/tika/parser/mp3/ID3v23Handler.java | 276 +-
.../apache/tika/parser/mp3/ID3v24Handler.java | 286 +-
.../org/apache/tika/parser/mp3/ID3v2Frame.java | 848 +++---
.../apache/tika/parser/mp3/LyricsHandler.java | 312 +-
.../org/apache/tika/parser/mp3/MP3Frame.java | 50 +-
.../org/apache/tika/parser/mp3/Mp3Parser.java | 492 +--
.../tika/parser/ocr/TesseractOCRParser.java | 93 +-
.../org/apache/tika/parser/video/FLVParser.java | 536 ++--
.../parser/ocr/TesseractOCRConfig.properties | 40 +-
.../tika/parser/audio/AudioParserTest.java | 150 +-
.../tika/parser/audio/MidiParserTest.java | 84 +-
.../image/ImageMetadataExtractorTest.java | 278 +-
.../tika/parser/image/ImageParserTest.java | 324 +-
.../tika/parser/image/MetadataFieldsTest.java | 72 +-
.../tika/parser/image/TiffParserTest.java | 132 +-
.../apache/tika/parser/jpeg/JpegParserTest.java | 568 ++--
.../apache/tika/parser/mp3/Mp3ParserTest.java | 828 ++---
.../tika/parser/ocr/TesseractOCRConfigTest.java | 184 +-
.../tika/parser/ocr/TesseractOCRParserTest.java | 527 ++--
.../apache/tika/parser/video/FLVParserTest.java | 88 +-
.../tika-parser-office-module/pom.xml | 250 +-
.../tika/module/office/internal/Activator.java | 72 +-
.../parser/apple/AppleSingleFileParser.java | 204 ++
.../org/apache/tika/parser/chm/ChmParser.java | 224 +-
.../tika/parser/chm/accessor/ChmAccessor.java | 78 +-
.../chm/accessor/ChmDirectoryListingSet.java | 796 ++---
.../tika/parser/chm/accessor/ChmItsfHeader.java | 984 +++---
.../tika/parser/chm/accessor/ChmItspHeader.java | 1096 +++----
.../parser/chm/accessor/ChmLzxcControlData.java | 638 ++--
.../parser/chm/accessor/ChmLzxcResetTable.java | 682 ++---
.../tika/parser/chm/accessor/ChmPmgiHeader.java | 352 +--
.../tika/parser/chm/accessor/ChmPmglHeader.java | 412 +--
.../chm/accessor/DirectoryListingEntry.java | 302 +-
.../tika/parser/chm/assertion/ChmAssert.java | 338 +--
.../apache/tika/parser/chm/core/ChmCommons.java | 722 ++---
.../tika/parser/chm/core/ChmConstants.java | 204 +-
.../tika/parser/chm/core/ChmExtractor.java | 784 ++---
.../apache/tika/parser/chm/core/ChmWrapper.java | 294 +-
.../chm/exception/ChmParsingException.java | 54 +-
.../tika/parser/chm/lzx/ChmBlockInfo.java | 470 +--
.../apache/tika/parser/chm/lzx/ChmLzxBlock.java | 1826 +++++------
.../apache/tika/parser/chm/lzx/ChmLzxState.java | 654 ++--
.../apache/tika/parser/chm/lzx/ChmSection.java | 444 +--
.../org/apache/tika/parser/mbox/MboxParser.java | 418 +--
.../tika/parser/mbox/OutlookPSTParser.java | 406 +--
.../microsoft/AbstractPOIFSExtractor.java | 32 +-
.../tika/parser/microsoft/HSLFExtractor.java | 18 +-
.../parser/microsoft/JackcessExtractor.java | 4 +-
.../parser/microsoft/MSOwnerFileParser.java | 80 +
.../tika/parser/microsoft/OfficeParser.java | 2 +-
.../tika/parser/microsoft/WordExtractor.java | 22 +-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 12 +-
.../microsoft/xml/AbstractXML2003Parser.java | 93 +-
.../microsoft/xml/SpreadsheetMLParser.java | 42 +-
.../tika/parser/microsoft/xml/WordMLParser.java | 121 +-
.../parser/odf/NSNormalizerContentHandler.java | 198 +-
.../parser/odf/OpenDocumentContentParser.java | 992 +++---
.../tika/parser/odf/OpenDocumentMetaParser.java | 398 +--
.../tika/parser/odf/OpenDocumentParser.java | 450 +--
.../org/apache/tika/parser/opc/OPCDetector.java | 310 +-
.../parser/opendocument/OpenOfficeParser.java | 56 +-
.../org/apache/tika/parser/rtf/GroupState.java | 134 +-
.../apache/tika/parser/rtf/ListDescriptor.java | 70 +-
.../tika/parser/rtf/RTFEmbObjHandler.java | 7 +-
.../tika/parser/rtf/RTFObjDataParser.java | 43 +-
.../org/apache/tika/parser/rtf/RTFParser.java | 186 +-
.../apache/tika/parser/rtf/TextExtractor.java | 2853 +++++++++---------
.../services/org.apache.tika.parser.Parser | 3 +-
.../parser/apple/AppleSingleFileParserTest.java | 43 +
.../tika/parser/chm/TestChmBlockInfo.java | 250 +-
.../tika/parser/chm/TestChmExtraction.java | 424 +--
.../tika/parser/chm/TestChmExtractor.java | 126 +-
.../tika/parser/chm/TestChmItsfHeader.java | 244 +-
.../tika/parser/chm/TestChmItspHeader.java | 320 +-
.../apache/tika/parser/chm/TestChmLzxState.java | 202 +-
.../tika/parser/chm/TestChmLzxcControlData.java | 288 +-
.../tika/parser/chm/TestChmLzxcResetTable.java | 312 +-
.../parser/chm/TestDirectoryListingEntry.java | 170 +-
.../apache/tika/parser/chm/TestParameters.java | 208 +-
.../apache/tika/parser/chm/TestPmgiHeader.java | 90 +-
.../apache/tika/parser/chm/TestPmglHeader.java | 152 +-
.../apache/tika/parser/dbf/DBFParserTest.java | 2 +
.../apache/tika/parser/mbox/MboxParserTest.java | 312 +-
.../tika/parser/mbox/OutlookPSTParserTest.java | 220 +-
.../AbstractPOIContainerExtractionTest.java | 150 +-
.../tika/parser/microsoft/ExcelParserTest.java | 817 ++---
.../parser/microsoft/MSOwnerFileParserTest.java | 31 +
.../tika/parser/microsoft/OfficeParserTest.java | 92 +-
.../parser/microsoft/OutlookParserTest.java | 478 +--
.../microsoft/POIContainerExtractionTest.java | 764 ++---
.../parser/microsoft/PowerPointParserTest.java | 492 +--
.../parser/microsoft/PublisherParserTest.java | 106 +-
.../tika/parser/microsoft/TNEFParserTest.java | 196 +-
.../tika/parser/microsoft/VisioParserTest.java | 102 +-
.../tika/parser/microsoft/WordParserTest.java | 1011 ++++---
.../ooxml/OOXMLContainerExtractionTest.java | 2 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 27 +
.../parser/microsoft/xml/XML2003ParserTest.java | 40 +-
.../apache/tika/parser/odf/ODFParserTest.java | 680 ++---
.../apache/tika/parser/rtf/RTFParserTest.java | 1050 ++++---
.../tika-parser-package-module/pom.xml | 150 +-
.../tika/module/pkg/internal/Activator.java | 72 +-
.../tika/parser/iwork/AutoPageNumberUtils.java | 224 +-
.../tika/parser/iwork/IWorkPackageParser.java | 438 +--
.../parser/iwork/KeynoteContentHandler.java | 348 +--
.../parser/iwork/NumbersContentHandler.java | 462 +--
.../tika/parser/iwork/PagesContentHandler.java | 896 +++---
.../apache/tika/parser/pkg/PackageParser.java | 574 ++--
.../tika/parser/pkg/ZipContainerDetector.java | 648 ++--
.../parser/iwork/AutoPageNumberUtilsTest.java | 156 +-
.../tika/parser/iwork/IWorkParserTest.java | 932 +++---
.../apache/tika/parser/pkg/AbstractPkgTest.java | 186 +-
.../apache/tika/parser/pkg/Bzip2ParserTest.java | 178 +-
.../apache/tika/parser/pkg/GzipParserTest.java | 204 +-
.../apache/tika/parser/pkg/TarParserTest.java | 210 +-
.../apache/tika/parser/pkg/ZipParserTest.java | 384 +--
.../tika-parser-pdf-module/pom.xml | 246 +-
.../tika/module/pdf/internal/Activator.java | 72 +-
.../tika/parser/pdf/AbstractPDF2XHTML.java | 579 ++++
.../org/apache/tika/parser/pdf/OCR2XHTML.java | 125 +
.../org/apache/tika/parser/pdf/PDF2XHTML.java | 518 +---
.../org/apache/tika/parser/pdf/PDFParser.java | 8 +
.../apache/tika/parser/pdf/PDFParserConfig.java | 274 +-
.../apache/tika/parser/pdf/PDFParser.properties | 10 +-
.../apache/tika/parser/pdf/PDFParserTest.java | 49 +-
.../tika-parser-scientific-module/pom.xml | 270 +-
.../module/scientific/internal/Activator.java | 72 +-
.../org/apache/tika/parser/hdf/HDFParser.java | 244 +-
.../apache/tika/parser/hdf/HDFParserTest.java | 144 +-
.../tika/parser/netcdf/NetCDFParserTest.java | 122 +-
.../tika-parser-text-module/pom.xml | 132 +-
.../tika/module/text/internal/Activator.java | 40 +-
.../apache/tika/parser/txt/CharsetDetector.java | 1088 +++----
.../apache/tika/parser/txt/CharsetMatch.java | 572 ++--
.../tika/parser/txt/CharsetRecog_2022.java | 326 +-
.../tika/parser/txt/CharsetRecog_UTF8.java | 198 +-
.../tika/parser/txt/CharsetRecog_Unicode.java | 278 +-
.../tika/parser/txt/CharsetRecog_mbcs.java | 1064 +++----
.../tika/parser/txt/CharsetRecog_sbcs.java | 2706 ++++++++---------
.../tika/parser/txt/CharsetRecognizer.java | 108 +-
.../org/apache/tika/parser/txt/TXTParser.java | 196 +-
.../parser/xml/AbstractMetadataHandler.java | 186 +-
.../xml/AttributeDependantMetadataHandler.java | 164 +-
.../parser/xml/AttributeMetadataHandler.java | 122 +-
.../org/apache/tika/parser/xml/DcXMLParser.java | 120 +-
.../tika/parser/xml/ElementMetadataHandler.java | 510 ++--
.../tika/parser/xml/FictionBookParser.java | 234 +-
.../apache/tika/parser/xml/MetadataHandler.java | 170 +-
.../org/apache/tika/parser/xml/XMLParser.java | 178 +-
.../apache/tika/parser/txt/TXTParserTest.java | 548 ++--
.../apache/tika/parser/xml/DcXMLParserTest.java | 174 +-
.../EmptyAndDuplicateElementsXMLParserTest.java | 232 +-
.../tika/parser/xml/FictionBookParserTest.java | 108 +-
.../tika-parser-web-module/pom.xml | 178 +-
.../tika/module/web/internal/Activator.java | 72 +-
.../org/apache/tika/parser/feed/FeedParser.java | 254 +-
.../parser/html/BoilerpipeContentHandler.java | 694 ++---
.../tika/parser/html/DefaultHtmlMapper.java | 274 +-
.../apache/tika/parser/html/HtmlHandler.java | 618 ++--
.../org/apache/tika/parser/html/HtmlMapper.java | 138 +-
.../org/apache/tika/parser/html/HtmlParser.java | 388 +--
.../tika/parser/html/IdentityHtmlMapper.java | 86 +-
.../tika/parser/html/XHTMLDowngradeHandler.java | 156 +-
.../tika/parser/mail/MailContentHandler.java | 752 ++---
.../apache/tika/parser/mail/RFC822Parser.java | 190 +-
.../apache/tika/parser/feed/FeedParserTest.java | 150 +-
.../apache/tika/parser/html/HtmlParserTest.java | 2262 +++++++-------
.../tika/parser/mail/RFC822ParserTest.java | 970 +++---
.../tika/parser/xmp/JempboxExtractor.java | 30 +
.../tika/parser/xmp/JempboxExtractorTest.java | 29 +-
.../test-documents/testAppleSingleFile.pdf | Bin 0 -> 1893 bytes
.../test/resources/test-documents/testDJVU.djvu | Bin 0 -> 89 bytes
.../test-documents/testEXCEL_embeddedPDF.xls | Bin 0 -> 38400 bytes
.../test-documents/testEXCEL_embeddedPDF.xlsx | Bin 0 -> 25602 bytes
.../test-documents/testEndNoteImportFile.enw | 10 +
.../resources/test-documents/testICalendar.ics | 15 +
.../resources/test-documents/testMSOwnerFile | Bin 0 -> 162 bytes
.../test-documents/testPPT_embeddedPDF.ppt | Bin 0 -> 187392 bytes
.../test-documents/testPPT_embeddedPDF.pptx | Bin 0 -> 108637 bytes
.../resources/test-documents/testVCalendar.vcs | 10 +
.../test-documents/testWindowsMediaMeta.asx | 6 +
.../test/resources/test-documents/testXMP.xmp | 178 ++
.../test-documents/test_recursive_embedded.doc | Bin 0 -> 31744 bytes
275 files changed, 39074 insertions(+), 37550 deletions(-)
----------------------------------------------------------------------