You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/24 18:26:58 UTC
[tika] branch TIKA-3304 updated (26ff633 -> 5dbffcd)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch TIKA-3304
in repository https://gitbox.apache.org/repos/asf/tika.git.
from 26ff633 WIP -- do not merge...still a bunch to do
add e47f625 TIKA-3309 Add convenience constructors to RereadableInputStream (#408)
add 7f684d7 fix setter in ByteFlipper
add c11cab6 upgrade jackcess
add 8028a00 improve robustness of image processing in PDFs
add 3096f3f fix unit test to handle counts w and w/out tesseract
add cba0372 TIKA-3316 -- improve XPS parser to include open XPS and allow for streaming zips with data descriptors
add 31da853 TIKA-3318 MP3 parser should output the xmpDM:duration metadata as seconds not milliseconds
add 356cf44 TIKA-3318 Document the units of xmpDM:duration as seconds by default
add d80dc36 TIKA-3310 Check if MP4 file's compatible brands match any of the expected values
add 187fd47 TIKA-3310 Check major brand before checking compatible brands
add 4551f7d Separate search for major brand and compatible brands
add 4bd931d Merge pull request #410 from peterkronenberg/main
add 06769d3 Added case-insensitivity to tika server ocr header names (#414)
add 1bdbc56 Update CHANGES.txt
add fa5612a TIKA-3324 -- add checkstyle plugin -- fail on build for tika-core only as a start
add 87f05de TIKA-3313 Improve performance and usability of RereadableInputStream (#413)
add 01dca21 Minor cleaning and added missing javadoc on TikaServer (#415)
add 9905db9 Merge remote-tracking branch 'origin/main' into main
add 0e64563 TIKA-3324 -- add checkstyle plugin -- fix merge
add 42b719b TIKA-3324 -- add checkstyle plugin -- fix merge, again... :(
add 4428958 TIKA-3323 -- allow flexibility for 'file' command output on different operating systems.
add ba9bcb2 TIKA-3316 -- fix for slightly different behavior of RereadableInputStream
add d93ba62 TIKA-3324 -- code cleanup for checkstyle in tika-parsers-classic
add f58a27c TIKA-3325 -- writeLimit is now calculated on the full file (container and embedded documents), no longer on each.
add 20eae4f TIKA-3331 -- throw a more informative exception for an encrypted odt file
add 1766166 TIKA-3322 -- upgrade PDFBox to 2.0.23
add 33a4f42 clean up dependencies
add 29ef4b5 TIKA-3332 -- recursively search embedded file tree for attachments
add 5da9984 TIKA-3332 -- checkstyle fix
add 0beb61a TIKA-3324 -- add checkstyle enforcement to the tika-server module
add ea359c9 TIKA-3324 -- add checkstyle enforcement to tika-parsers and submodules.
add 769938f [TIKA-3311] Add github workflows to Tika
add 41a99cc Merge pull request #407 from lewismc/TIKA-3311
add 667a310 TIKA-3334 -- fix thread safety in OpenDocumentParser
add de6cf73 TIKA-3336 -- don't doubly advance...prevent new zip bomb warnings in regression tests for 1.26 release
add ac05932 TIKA-3335 -- invalid xml during encryption check shouldn't cause the parse to fail
add d87ac65 checkstyle fixes
new 9416f59 Merge remote-tracking branch 'origin/main' into TIKA-3304
new 5dbffcd merge from main and required updates/conflict resolution
The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference (the TIKA-3304 branch).
Summary of changes:
.github/pull_request_template.md | 12 +
.../workflows/main-build.yml | 32 +-
.gitignore | 1 +
CHANGES.txt | 25 +-
tika-core/pom.xml | 30 +
tika-core/src/main/java/org/apache/tika/Tika.java | 143 +-
.../concurrent/ConfigurableThreadPoolExecutor.java | 64 +-
.../tika/concurrent/SimpleThreadPoolExecutor.java | 82 +-
.../main/java/org/apache/tika/config/Field.java | 4 +-
.../java/org/apache/tika/config/Initializable.java | 11 +-
.../tika/config/InitializableProblemHandler.java | 19 +-
.../org/apache/tika/config/LoadErrorHandler.java | 29 +-
.../main/java/org/apache/tika/config/Param.java | 261 +-
.../java/org/apache/tika/config/ParamField.java | 56 +-
.../java/org/apache/tika/config/ServiceLoader.java | 212 +-
.../java/org/apache/tika/config/TikaActivator.java | 5 +-
.../java/org/apache/tika/config/TikaConfig.java | 701 +++---
.../apache/tika/config/TikaConfigSerializer.java | 119 +-
.../org/apache/tika/detect/AutoDetectReader.java | 100 +-
.../org/apache/tika/detect/CompositeDetector.java | 34 +-
.../tika/detect/CompositeEncodingDetector.java | 22 +-
.../org/apache/tika/detect/DefaultDetector.java | 83 +-
.../tika/detect/DefaultEncodingDetector.java | 13 +-
.../apache/tika/detect/DefaultProbDetector.java | 32 +-
.../main/java/org/apache/tika/detect/Detector.java | 2 +-
.../java/org/apache/tika/detect/EmptyDetector.java | 3 +-
.../org/apache/tika/detect/EncodingDetector.java | 2 +-
.../apache/tika/detect/FileCommandDetector.java | 58 +-
.../java/org/apache/tika/detect/MagicDetector.java | 357 ++-
.../apache/tika/detect/NNExampleModelDetector.java | 19 +-
.../org/apache/tika/detect/NNTrainedModel.java | 147 +-
.../apache/tika/detect/NNTrainedModelBuilder.java | 77 +-
.../java/org/apache/tika/detect/NameDetector.java | 6 +-
.../tika/detect/NonDetectingEncodingDetector.java | 8 +-
.../org/apache/tika/detect/OverrideDetector.java | 9 +-
.../java/org/apache/tika/detect/TextDetector.java | 15 +-
.../org/apache/tika/detect/TextStatistics.java | 27 +-
.../java/org/apache/tika/detect/TrainedModel.java | 7 +-
.../apache/tika/detect/TrainedModelDetector.java | 22 +-
.../java/org/apache/tika/detect/TypeDetector.java | 2 +-
.../org/apache/tika/detect/XmlRootExtractor.java | 16 +-
.../apache/tika/detect/ZeroSizeFileDetector.java | 13 +-
.../java/org/apache/tika/embedder/Embedder.java | 37 +-
.../org/apache/tika/embedder/ExternalEmbedder.java | 243 +-
.../tika/exception/EncryptedDocumentException.java | 2 +-
.../apache/tika/exception/TikaConfigException.java | 1 +
.../org/apache/tika/exception/TikaException.java | 2 +-
.../tika/exception/TikaMemoryLimitException.java | 2 +-
.../{package-info.java => WriteLimitReached.java} | 10 +-
.../tika/exception/ZeroByteFileException.java | 15 +-
.../apache/tika/extractor/ContainerExtractor.java | 23 +-
.../extractor/DefaultEmbeddedStreamTranslator.java | 8 +-
.../apache/tika/extractor/DocumentSelector.java | 2 +-
.../tika/extractor/EmbeddedDocumentExtractor.java | 7 +-
.../tika/extractor/EmbeddedDocumentUtil.java | 24 +-
.../tika/extractor/EmbeddedStreamTranslator.java | 4 +-
.../tika/extractor/ParserContainerExtractor.java | 8 +-
.../ParsingEmbeddedDocumentExtractor.java | 28 +-
.../org/apache/tika/fork/ClassLoaderProxy.java | 10 +-
.../org/apache/tika/fork/ClassLoaderResource.java | 7 +-
.../org/apache/tika/fork/ContentHandlerProxy.java | 54 +-
.../apache/tika/fork/ContentHandlerResource.java | 14 +-
.../main/java/org/apache/tika/fork/ForkClient.java | 184 +-
.../apache/tika/fork/ForkObjectInputStream.java | 43 +-
.../main/java/org/apache/tika/fork/ForkParser.java | 162 +-
.../java/org/apache/tika/fork/ForkResource.java | 4 +-
.../main/java/org/apache/tika/fork/ForkServer.java | 126 +-
.../org/apache/tika/fork/InputStreamProxy.java | 4 +-
.../org/apache/tika/fork/InputStreamResource.java | 3 +-
.../org/apache/tika/fork/MemoryURLConnection.java | 2 +-
.../apache/tika/fork/MemoryURLStreamHandler.java | 4 +-
.../tika/fork/MemoryURLStreamHandlerFactory.java | 2 +-
.../apache/tika/fork/MemoryURLStreamRecord.java | 2 +-
.../apache/tika/fork/MetadataContentHandler.java | 6 +-
.../org/apache/tika/fork/ParserFactoryFactory.java | 13 +-
.../fork/RecursiveMetadataContentHandlerProxy.java | 39 +-
.../RecursiveMetadataContentHandlerResource.java | 40 +-
.../org/apache/tika/io/BoundedInputStream.java | 4 +-
.../main/java/org/apache/tika/io/EndianUtils.java | 42 +-
.../java/org/apache/tika/io/FilenameUtils.java | 30 +-
.../src/main/java/org/apache/tika/io/IOUtils.java | 18 +-
.../org/apache/tika/io/InputStreamFactory.java | 17 +-
.../org/apache/tika/io/LookaheadInputStream.java | 10 +-
.../org/apache/tika/io/MappedBufferCleaner.java | 96 +-
.../main/java/org/apache/tika/io/TailStream.java | 141 +-
.../org/apache/tika/io/TemporaryResources.java | 8 +-
.../java/org/apache/tika/io/TikaInputStream.java | 314 ++-
.../apache/tika/language/LanguageIdentifier.java | 165 +-
.../org/apache/tika/language/LanguageProfile.java | 106 +-
.../tika/language/LanguageProfilerBuilder.java | 496 ++--
.../org/apache/tika/language/ProfilingHandler.java | 3 +-
.../org/apache/tika/language/ProfilingWriter.java | 2 +-
.../tika/language/detect/LanguageConfidence.java | 5 +-
.../tika/language/detect/LanguageDetector.java | 342 +--
.../tika/language/detect/LanguageHandler.java | 10 +-
.../apache/tika/language/detect/LanguageNames.java | 111 +-
.../tika/language/detect/LanguageResult.java | 158 +-
.../tika/language/detect/LanguageWriter.java | 10 +-
.../tika/language/translate/DefaultTranslator.java | 14 +-
.../tika/language/translate/EmptyTranslator.java | 2 +-
.../apache/tika/language/translate/Translator.java | 22 +-
.../apache/tika/metadata/AccessPermissions.java | 24 +-
.../org/apache/tika/metadata/ClimateForcast.java | 30 +-
.../org/apache/tika/metadata/CreativeCommons.java | 2 +-
.../java/org/apache/tika/metadata/Database.java | 14 +-
.../java/org/apache/tika/metadata/DublinCore.java | 76 +-
.../main/java/org/apache/tika/metadata/Font.java | 4 +-
.../java/org/apache/tika/metadata/Geographic.java | 15 +-
.../main/java/org/apache/tika/metadata/HTML.java | 6 +-
.../java/org/apache/tika/metadata/HttpHeaders.java | 4 +-
.../main/java/org/apache/tika/metadata/IPTC.java | 2527 ++++++++++----------
.../org/apache/tika/metadata/MachineMetadata.java | 143 +-
.../java/org/apache/tika/metadata/Message.java | 38 +-
.../java/org/apache/tika/metadata/Metadata.java | 241 +-
.../main/java/org/apache/tika/metadata/Office.java | 211 +-
.../apache/tika/metadata/OfficeOpenXMLCore.java | 52 +-
.../tika/metadata/OfficeOpenXMLExtended.java | 65 +-
.../main/java/org/apache/tika/metadata/PDF.java | 76 +-
.../java/org/apache/tika/metadata/PagedText.java | 4 +-
.../java/org/apache/tika/metadata/Photoshop.java | 31 +-
.../java/org/apache/tika/metadata/Property.java | 265 +-
.../tika/metadata/PropertyTypeException.java | 8 +-
.../java/org/apache/tika/metadata/QuattroPro.java | 66 +-
.../java/org/apache/tika/metadata/RTFMetadata.java | 45 +-
.../main/java/org/apache/tika/metadata/TIFF.java | 100 +-
.../apache/tika/metadata/TikaCoreProperties.java | 337 ++-
.../java/org/apache/tika/metadata/WordPerfect.java | 100 +-
.../main/java/org/apache/tika/metadata/XMP.java | 4 +-
.../main/java/org/apache/tika/metadata/XMPDM.java | 290 +--
.../main/java/org/apache/tika/metadata/XMPIdq.java | 4 +-
.../main/java/org/apache/tika/metadata/XMPMM.java | 42 +-
.../java/org/apache/tika/metadata/XMPRights.java | 20 +-
.../metadata/filter/ClearByMimeMetadataFilter.java | 13 +-
.../metadata/filter/CompositeMetadataFilter.java | 4 +-
.../metadata/filter/DefaultMetadataFilter.java | 20 +-
.../filter/ExcludeFieldMetadataFilter.java | 10 +-
.../metadata/filter/FieldNameMappingFilter.java | 25 +-
.../filter/IncludeFieldMetadataFilter.java | 11 +-
.../tika/metadata/filter/MetadataFilter.java | 4 +-
.../main/java/org/apache/tika/mime/HexCoDec.java | 49 +-
.../src/main/java/org/apache/tika/mime/Magic.java | 2 -
.../main/java/org/apache/tika/mime/MagicMatch.java | 12 +-
.../main/java/org/apache/tika/mime/MediaType.java | 273 +--
.../org/apache/tika/mime/MediaTypeRegistry.java | 67 +-
.../main/java/org/apache/tika/mime/MimeType.java | 283 ++-
.../org/apache/tika/mime/MimeTypeException.java | 6 +-
.../main/java/org/apache/tika/mime/MimeTypes.java | 247 +-
.../org/apache/tika/mime/MimeTypesFactory.java | 108 +-
.../java/org/apache/tika/mime/MimeTypesReader.java | 346 +--
.../apache/tika/mime/MimeTypesReaderMetKeys.java | 2 +-
.../org/apache/tika/mime/MinShouldMatchClause.java | 11 +-
.../main/java/org/apache/tika/mime/Patterns.java | 99 +-
.../mime/ProbabilisticMimeDetectionSelector.java | 93 +-
.../parser/AbstractEncodingDetectorParser.java | 1 +
.../org/apache/tika/parser/AbstractParser.java | 11 +-
.../org/apache/tika/parser/AutoDetectParser.java | 33 +-
.../tika/parser/AutoDetectParserFactory.java | 9 +-
.../org/apache/tika/parser/CompositeParser.java | 93 +-
.../java/org/apache/tika/parser/CryptoParser.java | 26 +-
.../java/org/apache/tika/parser/DefaultParser.java | 88 +-
.../org/apache/tika/parser/DelegatingParser.java | 13 +-
.../org/apache/tika/parser/DigestingParser.java | 63 +-
.../java/org/apache/tika/parser/EmptyParser.java | 20 +-
.../java/org/apache/tika/parser/ErrorParser.java | 12 +-
.../java/org/apache/tika/parser/NetworkParser.java | 66 +-
.../java/org/apache/tika/parser/ParseContext.java | 67 +-
.../main/java/org/apache/tika/parser/Parser.java | 23 +-
.../org/apache/tika/parser/ParserDecorator.java | 84 +-
.../java/org/apache/tika/parser/ParserFactory.java | 7 +-
.../apache/tika/parser/ParserPostProcessor.java | 11 +-
.../java/org/apache/tika/parser/ParsingReader.java | 130 +-
.../org/apache/tika/parser/PasswordProvider.java | 11 +-
.../apache/tika/parser/RecursiveParserWrapper.java | 177 +-
.../org/apache/tika/parser/StatefulParser.java | 2 +-
.../tika/parser/digest/CompositeDigester.java | 2 +-
.../tika/parser/digest/InputStreamDigester.java | 64 +-
.../parser/external/CompositeExternalParser.java | 25 +-
.../tika/parser/external/ExternalParser.java | 293 ++-
.../external/ExternalParsersConfigReader.java | 335 ++-
.../ExternalParsersConfigReaderMetKeys.java | 14 +-
.../parser/external/ExternalParsersFactory.java | 93 +-
.../parser/multiple/AbstractMultipleParser.java | 376 +--
.../tika/parser/multiple/FallbackParser.java | 36 +-
.../tika/parser/multiple/SupplementingParser.java | 50 +-
.../apache/tika/pipes/emitter/AbstractEmitter.java | 38 +-
.../org/apache/tika/pipes/emitter/EmitData.java | 25 +-
.../org/apache/tika/pipes/emitter/EmitKey.java | 18 +-
.../org/apache/tika/pipes/emitter/Emitter.java | 4 +-
.../apache/tika/pipes/emitter/EmitterManager.java | 16 +-
.../apache/tika/pipes/emitter/EmptyEmitter.java | 7 +-
.../apache/tika/pipes/emitter/StreamEmitter.java | 4 +-
.../apache/tika/pipes/fetcher/EmptyFetcher.java | 6 +-
.../org/apache/tika/pipes/fetcher/FetchKey.java | 22 +-
.../org/apache/tika/pipes/fetcher/Fetcher.java | 10 +-
.../apache/tika/pipes/fetcher/FetcherManager.java | 18 +-
.../tika/pipes/fetcher/FileSystemFetcher.java | 50 +-
.../pipes/fetchiterator/EmptyFetchIterator.java | 6 -
.../tika/pipes/fetchiterator/FetchEmitTuple.java | 38 +-
.../tika/pipes/fetchiterator/FetchIterator.java | 62 +-
.../fetchiterator/FileSystemFetchIterator.java | 41 +-
.../sax/AbstractRecursiveParserWrapperHandler.java | 53 +-
.../tika/sax/BasicContentHandlerFactory.java | 98 +-
.../org/apache/tika/sax/BodyContentHandler.java | 12 +-
.../java/org/apache/tika/sax/CleanPhoneText.java | 345 +--
.../apache/tika/sax/ContentHandlerDecorator.java | 18 +-
.../org/apache/tika/sax/ContentHandlerFactory.java | 15 +-
.../org/apache/tika/sax/DIFContentHandler.java | 242 +-
.../tika/sax/ElementMappingContentHandler.java | 81 +-
.../sax/EndDocumentShieldingContentHandler.java | 16 +-
.../tika/sax/ExpandedTitleContentHandler.java | 22 +-
.../src/main/java/org/apache/tika/sax/Link.java | 4 +-
.../main/java/org/apache/tika/sax/LinkBuilder.java | 12 +-
.../org/apache/tika/sax/LinkContentHandler.java | 30 +-
.../tika/sax/PhoneExtractingContentHandler.java | 20 +-
.../tika/sax/RecursiveParserWrapperHandler.java | 54 +-
.../apache/tika/sax/RichTextContentHandler.java | 3 +-
.../org/apache/tika/sax/SafeContentHandler.java | 155 +-
.../org/apache/tika/sax/SecureContentHandler.java | 86 +-
.../org/apache/tika/sax/StandardOrganizations.java | 305 +--
.../org/apache/tika/sax/StandardReference.java | 201 +-
.../sax/StandardsExtractingContentHandler.java | 155 +-
.../java/org/apache/tika/sax/StandardsText.java | 277 +--
.../org/apache/tika/sax/TaggedContentHandler.java | 8 +-
.../org/apache/tika/sax/TaggedSAXException.java | 6 +-
.../org/apache/tika/sax/TeeContentHandler.java | 18 +-
.../tika/sax/TextAndAttributeContentHandler.java | 12 +-
.../org/apache/tika/sax/TextContentHandler.java | 12 +-
.../org/apache/tika/sax/ToHTMLContentHandler.java | 10 +-
.../org/apache/tika/sax/ToTextContentHandler.java | 30 +-
.../org/apache/tika/sax/ToXMLContentHandler.java | 120 +-
.../apache/tika/sax/WriteOutContentHandler.java | 52 +-
.../org/apache/tika/sax/XHTMLContentHandler.java | 103 +-
.../org/apache/tika/sax/XMPContentHandler.java | 27 +-
.../apache/tika/sax/xpath/CompositeMatcher.java | 3 +-
.../java/org/apache/tika/sax/xpath/Matcher.java | 4 +-
.../tika/sax/xpath/MatchingContentHandler.java | 21 +-
.../org/apache/tika/sax/xpath/XPathParser.java | 10 +-
.../org/apache/tika/utils/AnnotationUtils.java | 61 +-
.../java/org/apache/tika/utils/CharsetUtils.java | 97 +-
.../java/org/apache/tika/utils/CompareUtils.java | 10 +-
.../org/apache/tika/utils/ConcurrentUtils.java | 112 +-
.../main/java/org/apache/tika/utils/DateUtils.java | 90 +-
.../java/org/apache/tika/utils/ExceptionUtils.java | 5 +-
.../java/org/apache/tika/utils/ParserUtils.java | 60 +-
.../java/org/apache/tika/utils/ProcessUtils.java | 2 +-
.../java/org/apache/tika/utils/RegexUtils.java | 20 +-
.../apache/tika/utils/RereadableInputStream.java | 261 +-
.../org/apache/tika/utils/ServiceLoaderUtils.java | 12 +-
.../java/org/apache/tika/utils/StringUtils.java | 26 +-
.../java/org/apache/tika/utils/SystemUtils.java | 15 +-
.../java/org/apache/tika/utils/XMLReaderUtils.java | 545 +++--
.../org/apache/custom/detect/MyCustomDetector.java | 6 +-
.../org/apache/tika/MultiThreadedTikaTest.java | 332 +--
.../apache/tika/ResourceLoggingClassLoader.java | 24 +-
.../org/apache/tika/TestRereadableInputStream.java | 144 +-
.../java/org/apache/tika/TikaDetectionTest.java | 51 +-
.../src/test/java/org/apache/tika/TikaIT.java | 5 +-
.../src/test/java/org/apache/tika/TikaTest.java | 444 ++--
.../org/apache/tika/TypeDetectionBenchmark.java | 18 +-
.../apache/tika/config/AbstractTikaConfigTest.java | 14 +-
.../java/org/apache/tika/config/DummyExecutor.java | 59 +-
.../java/org/apache/tika/config/DummyParser.java | 8 +-
.../java/org/apache/tika/config/ParamTest.java | 37 +-
.../tika/config/TikaConfigSerializerTest.java | 24 +-
.../org/apache/tika/config/TikaConfigTest.java | 163 +-
.../tika/detect/FileCommandDetectorTest.java | 40 +-
.../org/apache/tika/detect/MagicDetectorTest.java | 143 +-
.../tika/detect/MimeDetectionWithNNTest.java | 213 +-
.../org/apache/tika/detect/NameDetectorTest.java | 25 +-
.../org/apache/tika/detect/TextDetectorTest.java | 31 +-
.../org/apache/tika/detect/TypeDetectorTest.java | 31 +-
.../tika/detect/ZeroSizeFileDetectorTest.java | 5 +-
.../java/org/apache/tika/fork/ForkParserTest.java | 123 +-
.../apache/tika/fork/ForkParserTikaBinTest.java | 135 +-
.../java/org/apache/tika/fork/ForkTestParser.java | 21 +-
.../tika/fork/UpperCasingContentHandler.java | 7 +-
.../java/org/apache/tika/io/EndianUtilsTest.java | 38 +-
.../java/org/apache/tika/io/FilenameUtilsTest.java | 39 +-
.../apache/tika/io/LookaheadInputStreamTest.java | 20 +-
.../java/org/apache/tika/io/TailStreamTest.java | 87 +-
.../org/apache/tika/io/TemporaryResourcesTest.java | 6 +-
.../org/apache/tika/io/TikaInputStreamTest.java | 56 +-
.../tika/language/LanguageIdentifierTest.java | 44 +-
.../apache/tika/language/LanguageProfileTest.java | 2 +-
.../tika/language/LanguageProfilerBuilderTest.java | 32 +-
.../tika/language/detect/LanguageNamesTest.java | 26 +-
.../org/apache/tika/metadata/TestMetadata.java | 212 +-
.../tika/metadata/filter/MockUpperCaseFilter.java | 4 +-
.../tika/metadata/filter/TestMetadataFilter.java | 31 +-
.../org/apache/tika/mime/CustomReaderTest.java | 120 +-
.../java/org/apache/tika/mime/MediaTypeTest.java | 86 +-
.../org/apache/tika/mime/MimeDetectionTest.java | 135 +-
.../org/apache/tika/mime/MimeTypesReaderTest.java | 297 ++-
.../java/org/apache/tika/mime/PatternsTest.java | 18 +-
.../tika/mime/ProbabilisticMimeDetectionTest.java | 114 +-
.../ProbabilisticMimeDetectionTestWithTika.java | 100 +-
.../apache/tika/parser/CompositeParserTest.java | 158 +-
.../tika/parser/DummyInitializableParser.java | 29 +-
.../tika/parser/DummyParameterizedParser.java | 71 +-
.../java/org/apache/tika/parser/DummyParser.java | 57 +-
.../tika/parser/InitializableParserTest.java | 14 +-
.../tika/parser/ParameterizedParserTest.java | 35 +-
.../apache/tika/parser/ParserDecoratorTest.java | 57 +-
.../org/apache/tika/parser/mock/MockParser.java | 72 +-
.../apache/tika/parser/mock/MockParserFactory.java | 8 +-
.../org/apache/tika/parser/mock/VowelParser.java | 11 +-
.../tika/parser/multiple/MultipleParserTest.java | 137 +-
.../org/apache/tika/pipes/emitter/MockEmitter.java | 8 +-
.../tika/pipes/fetcher/FileSystemFetcherTest.java | 10 +-
.../fetchiterator/FileSystemFetchIteratorTest.java | 25 +-
.../tika/sax/BasicContentHandlerFactoryTest.java | 98 +-
.../apache/tika/sax/BodyContentHandlerTest.java | 9 +-
.../apache/tika/sax/LinkContentHandlerTest.java | 29 +-
.../apache/tika/sax/OfflineContentHandlerTest.java | 9 +-
.../tika/sax/RichTextContentHandlerTest.java | 15 +-
.../apache/tika/sax/SecureContentHandlerTest.java | 7 +-
.../java/org/apache/tika/sax/SerializerTest.java | 55 +-
.../apache/tika/sax/XHTMLContentHandlerTest.java | 77 +-
.../org/apache/tika/utils/AnnotationUtilsTest.java | 47 +-
.../org/apache/tika/utils/CharsetUtilsTest.java | 14 +-
.../org/apache/tika/utils/ConcurrentUtilsTest.java | 126 +-
.../java/org/apache/tika/utils/RegexUtilsTest.java | 31 +-
.../apache/tika/utils/ServiceLoaderUtilsTest.java | 28 +-
tika-core/src/test/resources/log4j.properties | 1 +
.../org/apache/tika/config/FileCommandDetector.xml | 2 +-
.../org/apache/tika/config/TIKA-1762-executors.xml | 62 +-
.../apache/tika/fuzzing/general/ByteFlipper.java | 2 +-
tika-parent/checkstyle.xml | 139 ++
tika-parent/pom.xml | 42 +-
tika-parsers/pom.xml | 38 +-
.../tika/parser/recognition/AgeRecogniser.java | 182 +-
.../parser/recognition/AgeRecogniserConfig.java | 59 +-
.../tika/parser/recognition/AgeRecogniserTest.java | 53 +-
.../tika/dl/imagerec/DL4JInceptionV3Net.java | 138 +-
.../org/apache/tika/dl/imagerec/DL4JVGG16Net.java | 73 +-
.../tika/dl/imagerec/DL4JInceptionV3NetTest.java | 14 +-
.../apache/tika/dl/imagerec/DL4JVGG16NetTest.java | 15 +-
.../tika/parser/captioning/CaptionObject.java | 6 +-
.../captioning/tf/TensorflowRESTCaptioner.java | 44 +-
.../tika/parser/pot/PooledTimeSeriesParser.java | 85 +-
.../tika/parser/recognition/ObjectRecogniser.java | 36 +-
.../recognition/ObjectRecognitionParser.java | 50 +-
.../tika/parser/recognition/RecognisedObject.java | 7 +-
.../recognition/tf/TensorflowImageRecParser.java | 97 +-
.../recognition/tf/TensorflowRESTRecogniser.java | 54 +-
.../tf/TensorflowRESTVideoRecogniser.java | 30 +-
.../tika/parser/captioning/tf/model_info.xml | 3 +-
.../recognition/tika-config-tflow-video-rest.xml | 3 +-
.../recognition/ObjectRecognitionParserTest.java | 96 +-
.../tf/TensorflowImageRecParserTest.java | 28 +-
.../tf/TensorflowVideoRecParserTest.java | 30 +-
.../parser/ctakes/CTAKESAnnotationProperty.java | 16 +-
.../apache/tika/parser/ctakes/CTAKESConfig.java | 249 +-
.../tika/parser/ctakes/CTAKESContentHandler.java | 178 +-
.../apache/tika/parser/ctakes/CTAKESParser.java | 42 +-
.../tika/parser/ctakes/CTAKESSerializer.java | 5 +-
.../org/apache/tika/parser/ctakes/CTAKESUtils.java | 423 ++--
.../java/org/apache/tika/parser/geo/GeoParser.java | 85 +-
.../apache/tika/parser/geo/GeoParserConfig.java | 33 +-
.../java/org/apache/tika/parser/geo/GeoTag.java | 73 +-
.../tika/parser/geo/NameEntityExtractor.java | 23 +-
.../parser/geo/gazetteer/GeoGazetteerClient.java | 157 +-
.../apache/tika/parser/geo/gazetteer/Location.java | 107 +-
.../tika/parser/journal/GrobidRESTParser.java | 62 +-
.../apache/tika/parser/journal/JournalParser.java | 44 +-
.../apache/tika/parser/journal/TEIDOMParser.java | 159 +-
.../org/apache/tika/parser/ner/NERecogniser.java | 8 +-
.../apache/tika/parser/ner/NamedEntityParser.java | 76 +-
.../parser/ner/corenlp/CoreNLPNERecogniser.java | 93 +-
.../tika/parser/ner/grobid/GrobidNERecogniser.java | 208 +-
.../tika/parser/ner/mitie/MITIENERecogniser.java | 115 +-
.../tika/parser/ner/nltk/NLTKNERecogniser.java | 69 +-
.../parser/ner/opennlp/OpenNLPNERecogniser.java | 42 +-
.../tika/parser/ner/opennlp/OpenNLPNameFinder.java | 37 +-
.../tika/parser/ner/regex/RegexNERecogniser.java | 31 +-
.../parser/sentiment/SentimentAnalysisParser.java | 33 +-
.../tika/parser/ctakes/CTAKESConfig.properties | 2 +-
.../tika/parser/geo/GeoTopicConfig.properties | 2 +-
.../tika/parser/journal/GrobidExtractor.properties | 2 +-
.../tika/parser/ner/grobid/GrobidServer.properties | 4 +-
.../tika/parser/ner/nltk/NLTKServer.properties | 2 +-
.../org/apache/tika/parser/geo/GeoParserTest.java | 147 +-
.../tika/parser/journal/JournalParserTest.java | 5 +-
.../org/apache/tika/parser/journal/TEITest.java | 32 +-
.../tika/parser/ner/NamedEntityParserTest.java | 29 +-
.../tika/parser/ner/nltk/NLTKNERecogniserTest.java | 20 +-
.../parser/ner/regex/RegexNERecogniserTest.java | 18 +-
.../sentiment/SentimentAnalysisParserTest.java | 49 +-
.../tika/config/TIKA-3078-geo.topic.GeoParser.xml | 22 +-
tika-parsers/tika-parsers-classic/pom.xml | 89 +-
.../apache/tika/detect/apple/BPListDetector.java | 66 +-
.../apache/tika/detect/apple/IWorkDetector.java | 18 +-
.../tika/parser/apple/AppleSingleFileParser.java | 60 +-
.../org/apache/tika/parser/apple/PListParser.java | 92 +-
.../tika/parser/iwork/AutoPageNumberUtils.java | 146 +-
.../tika/parser/iwork/IWorkPackageParser.java | 271 ++-
.../tika/parser/iwork/KeynoteContentHandler.java | 36 +-
.../tika/parser/iwork/NumbersContentHandler.java | 16 +-
.../tika/parser/iwork/PagesContentHandler.java | 436 ++--
.../parser/iwork/iwana/IWork13PackageParser.java | 198 +-
.../parser/iwork/iwana/IWork18PackageParser.java | 180 +-
.../apache/tika/parser/apple/PListParserTest.java | 11 +-
.../tika/parser/iwork/AutoPageNumberUtilsTest.java | 85 +-
.../apache/tika/parser/iwork/IWorkParserTest.java | 141 +-
.../tika/parser/iwork/iwana/IWork13ParserTest.java | 23 +-
.../org/apache/tika/parser/audio/AudioParser.java | 32 +-
.../org/apache/tika/parser/audio/MidiParser.java | 41 +-
.../org/apache/tika/parser/mp3/AudioFrame.java | 239 +-
.../java/org/apache/tika/parser/mp3/ID3Tags.java | 294 +--
.../org/apache/tika/parser/mp3/ID3v1Handler.java | 103 +-
.../org/apache/tika/parser/mp3/ID3v22Handler.java | 71 +-
.../org/apache/tika/parser/mp3/ID3v23Handler.java | 31 +-
.../org/apache/tika/parser/mp3/ID3v24Handler.java | 35 +-
.../org/apache/tika/parser/mp3/ID3v2Frame.java | 418 ++--
.../org/apache/tika/parser/mp3/LyricsHandler.java | 82 +-
.../java/org/apache/tika/parser/mp3/MP3Frame.java | 2 +-
.../java/org/apache/tika/parser/mp3/Mp3Parser.java | 210 +-
.../org/apache/tika/parser/mp3/MpegStream.java | 445 ++--
.../apache/tika/parser/mp4/ISO6709Extractor.java | 26 +-
.../java/org/apache/tika/parser/mp4/MP4Parser.java | 172 +-
.../org/apache/tika/parser/video/FLVParser.java | 81 +-
.../apache/tika/parser/audio/AudioParserTest.java | 15 +-
.../apache/tika/parser/audio/MidiParserTest.java | 9 +-
.../org/apache/tika/parser/mp3/Mp3ParserTest.java | 151 +-
.../org/apache/tika/parser/mp3/MpegStreamTest.java | 93 +-
.../org/apache/tika/parser/mp4/MP4ParserTest.java | 30 +-
.../apache/tika/parser/video/FLVParserTest.java | 7 +-
.../java/org/apache/tika/parser/dwg/DWGParser.java | 336 ++-
.../java/org/apache/tika/parser/prt/PRTParser.java | 413 ++--
.../org/apache/tika/parser/dwg/DWGParserTest.java | 99 +-
.../org/apache/tika/parser/prt/PRTParserTest.java | 135 +-
.../org/apache/tika/parser/asm/ClassParser.java | 17 +-
.../apache/tika/parser/asm/XHTMLClassVisitor.java | 45 +-
.../apache/tika/parser/code/SourceCodeParser.java | 36 +-
.../tika/parser/executable/ExecutableParser.java | 656 ++---
.../java/org/apache/tika/parser/mat/MatParser.java | 44 +-
.../org/apache/tika/parser/sas/SAS7BDATParser.java | 60 +-
.../apache/tika/parser/asm/ClassParserTest.java | 28 +-
.../tika/parser/code/SourceCodeParserTest.java | 56 +-
.../parser/executable/ExecutableParserTest.java | 32 +-
.../org/apache/tika/parser/mat/MatParserTest.java | 3 +-
.../apache/tika/parser/sas/SAS7BDATParserTest.java | 37 +-
.../org/apache/tika/parser/crypto/Pkcs7Parser.java | 41 +-
.../org/apache/tika/parser/crypto/TSDParser.java | 257 +-
.../apache/tika/parser/crypto/Pkcs7ParserTest.java | 3 +-
.../apache/tika/parser/crypto/TSDParserTest.java | 16 +-
.../parser/digestutils/BouncyCastleDigester.java | 13 +-
.../tika/parser/digestutils/CommonsDigester.java | 69 +-
.../tika/parser/font/AdobeFontMetricParser.java | 167 +-
.../apache/tika/parser/font/TrueTypeParser.java | 27 +-
.../apache/tika/parser/font/FontParsersTest.java | 28 +-
.../sax/boilerpipe/BoilerpipeContentHandler.java | 46 +-
.../org/apache/tika/parser/html/DataURIScheme.java | 13 +-
.../parser/html/DataURISchemeParseException.java | 4 +-
.../apache/tika/parser/html/DataURISchemeUtil.java | 14 +-
.../apache/tika/parser/html/DefaultHtmlMapper.java | 122 +-
.../tika/parser/html/HtmlEncodingDetector.java | 66 +-
.../org/apache/tika/parser/html/HtmlHandler.java | 104 +-
.../org/apache/tika/parser/html/HtmlParser.java | 83 +-
.../tika/parser/html/XHTMLDowngradeHandler.java | 20 +-
.../html/charsetdetector/CharsetAliases.java | 55 +-
.../charsetdetector/CharsetDetectionResult.java | 12 +-
.../parser/html/charsetdetector/MetaProcessor.java | 18 +-
.../parser/html/charsetdetector/PreScanner.java | 83 +-
.../StandardHtmlEncodingDetector.java | 28 +-
.../charsets/XUserDefinedCharset.java | 8 +-
.../tika/parser/html/DataURISchemeParserTest.java | 19 +-
.../tika/parser/html/HtmlEncodingDetectorTest.java | 60 +-
.../apache/tika/parser/html/HtmlParserTest.java | 705 +++---
.../html/StandardHtmlEncodingDetectorTest.java | 139 +-
.../tika/parser/image/AbstractImageParser.java | 46 +-
.../org/apache/tika/parser/image/BPGParser.java | 30 +-
.../org/apache/tika/parser/image/HeifParser.java | 33 +-
.../org/apache/tika/parser/image/ICNSParser.java | 55 +-
.../org/apache/tika/parser/image/ICNSType.java | 241 +-
.../tika/parser/image/ImageMetadataExtractor.java | 159 +-
.../org/apache/tika/parser/image/ImageParser.java | 63 +-
.../org/apache/tika/parser/image/JpegParser.java | 12 +-
.../apache/tika/parser/image/MetadataFields.java | 5 +-
.../org/apache/tika/parser/image/PSDParser.java | 43 +-
.../org/apache/tika/parser/image/TiffParser.java | 11 +-
.../org/apache/tika/parser/image/WebPParser.java | 11 +-
.../apache/tika/parser/image/HeifParserTest.java | 16 +-
.../apache/tika/parser/image/ICNSParserTest.java | 33 +-
.../parser/image/ImageMetadataExtractorTest.java | 17 +-
.../apache/tika/parser/image/ImageParserTest.java | 110 +-
.../apache/tika/parser/image/JpegParserTest.java | 76 +-
.../apache/tika/parser/image/PSDParserTest.java | 13 +-
.../apache/tika/parser/image/WebPParserTest.java | 3 +-
.../apache/tika/parser/jdbc/AbstractDBParser.java | 33 +-
.../apache/tika/parser/jdbc/JDBCTableReader.java | 79 +-
.../apache/tika/parser/mailcommons/MailUtil.java | 9 +-
.../tika/parser/mailcommons/MailUtilTest.java | 11 +-
.../tika/parser/mail/MailContentHandler.java | 207 +-
.../org/apache/tika/parser/mail/RFC822Parser.java | 30 +-
.../org/apache/tika/parser/mbox/MboxParser.java | 36 +-
.../apache/tika/parser/mail/RFC822ParserTest.java | 175 +-
.../apache/tika/parser/mbox/MboxParserTest.java | 25 +-
.../detect/microsoft/POIFSContainerDetector.java | 87 +-
.../detect/microsoft/ooxml/OPCPackageDetector.java | 224 +-
.../microsoft/MSEmbeddedStreamTranslator.java | 24 +-
.../tika/parser/microsoft/AbstractListManager.java | 23 +-
.../parser/microsoft/AbstractOfficeParser.java | 61 +-
.../parser/microsoft/AbstractPOIFSExtractor.java | 57 +-
.../org/apache/tika/parser/microsoft/Cell.java | 3 +-
.../tika/parser/microsoft/CellDecorator.java | 3 +-
.../apache/tika/parser/microsoft/EMFParser.java | 78 +-
.../tika/parser/microsoft/ExcelExtractor.java | 142 +-
.../tika/parser/microsoft/FormattingUtils.java | 23 +-
.../tika/parser/microsoft/HSLFExtractor.java | 124 +-
.../tika/parser/microsoft/JackcessExtractor.java | 81 +-
.../tika/parser/microsoft/JackcessParser.java | 33 +-
.../apache/tika/parser/microsoft/LinkedCell.java | 3 +-
.../apache/tika/parser/microsoft/ListManager.java | 33 +-
.../tika/parser/microsoft/MSOwnerFileParser.java | 37 +-
.../apache/tika/parser/microsoft/NumberCell.java | 3 +-
.../apache/tika/parser/microsoft/OfficeParser.java | 146 +-
.../tika/parser/microsoft/OfficeParserConfig.java | 69 +-
.../tika/parser/microsoft/OldExcelParser.java | 23 +-
.../tika/parser/microsoft/OutlookExtractor.java | 361 ++-
.../tika/parser/microsoft/SummaryExtractor.java | 68 +-
.../apache/tika/parser/microsoft/TNEFParser.java | 44 +-
.../org/apache/tika/parser/microsoft/TextCell.java | 3 +-
.../parser/microsoft/TikaExcelDataFormatter.java | 11 +-
.../parser/microsoft/TikaExcelGeneralFormat.java | 2 +-
.../apache/tika/parser/microsoft/WMFParser.java | 24 +-
.../tika/parser/microsoft/WordExtractor.java | 98 +-
.../tika/parser/microsoft/chm/ChmAccessor.java | 10 +-
.../tika/parser/microsoft/chm/ChmAssert.java | 139 +-
.../tika/parser/microsoft/chm/ChmBlockInfo.java | 103 +-
.../tika/parser/microsoft/chm/ChmCommons.java | 293 +--
.../tika/parser/microsoft/chm/ChmConstants.java | 54 +-
.../microsoft/chm/ChmDirectoryListingSet.java | 234 +-
.../tika/parser/microsoft/chm/ChmExtractor.java | 284 +--
.../tika/parser/microsoft/chm/ChmItsfHeader.java | 192 +-
.../tika/parser/microsoft/chm/ChmItspHeader.java | 271 +--
.../tika/parser/microsoft/chm/ChmLzxBlock.java | 455 ++--
.../tika/parser/microsoft/chm/ChmLzxState.java | 262 +-
.../parser/microsoft/chm/ChmLzxcControlData.java | 147 +-
.../parser/microsoft/chm/ChmLzxcResetTable.java | 129 +-
.../tika/parser/microsoft/chm/ChmParser.java | 39 +-
.../tika/parser/microsoft/chm/ChmPmgiHeader.java | 51 +-
.../tika/parser/microsoft/chm/ChmPmglHeader.java | 62 +-
.../tika/parser/microsoft/chm/ChmSection.java | 61 +-
.../tika/parser/microsoft/chm/ChmWrapper.java | 12 +-
.../microsoft/chm/DirectoryListingEntry.java | 41 +-
.../tika/parser/microsoft/onenote/CompactID.java | 9 +-
.../tika/parser/microsoft/onenote/Error.java | 11 +-
.../parser/microsoft/onenote/ExtendedGUID.java | 11 +-
.../microsoft/onenote/FileChunkReference.java | 20 +-
.../tika/parser/microsoft/onenote/FileNode.java | 102 +-
.../microsoft/onenote/FileNodeListHeader.java | 32 +-
.../tika/parser/microsoft/onenote/FileNodePtr.java | 3 +-
.../parser/microsoft/onenote/FileNodeUnion.java | 33 +-
.../microsoft/onenote/FndStructureConstants.java | 44 +-
.../apache/tika/parser/microsoft/onenote/GUID.java | 65 +-
.../apache/tika/parser/microsoft/onenote/JCID.java | 38 +-
.../microsoft/onenote/JCIDPropertySetTypeEnum.java | 76 +-
.../onenote/ObjectDeclarationWithRefCount.java | 27 +-
.../onenote/ObjectDeclarationWithRefCountBody.java | 3 +-
.../onenote/ObjectSpaceObjectPropSet.java | 12 +-
...ctSpaceObjectStreamOfOIDsOSIDsOrContextIDs.java | 6 +-
.../onenote/OneNoteDirectFileResource.java | 9 +-
.../parser/microsoft/onenote/OneNoteDocument.java | 13 +-
.../parser/microsoft/onenote/OneNoteHeader.java | 18 +-
.../onenote/OneNoteLegacyDumpStrings.java | 47 +-
.../parser/microsoft/onenote/OneNoteParser.java | 149 +-
.../microsoft/onenote/OneNotePropertyEnum.java | 208 +-
.../microsoft/onenote/OneNotePropertyId.java | 7 +-
.../tika/parser/microsoft/onenote/OneNotePtr.java | 515 ++--
.../microsoft/onenote/OneNoteTreeWalker.java | 215 +-
.../onenote/OneNoteTreeWalkerOptions.java | 14 +-
.../parser/microsoft/onenote/PropertyIDType.java | 7 +-
.../tika/parser/microsoft/onenote/PropertySet.java | 37 +-
.../parser/microsoft/onenote/PropertyValue.java | 20 +-
.../tika/parser/microsoft/onenote/Revision.java | 23 +-
.../microsoft/onenote/RootObjectReference.java | 3 +-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 168 +-
.../parser/microsoft/ooxml/MetadataExtractor.java | 79 +-
.../parser/microsoft/ooxml/OOXMLExtractor.java | 10 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 97 +-
.../tika/parser/microsoft/ooxml/OOXMLParser.java | 60 +-
.../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 72 +-
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 110 +-
.../ooxml/POIXMLTextExtractorDecorator.java | 8 +-
.../microsoft/ooxml/ParagraphProperties.java | 18 +-
.../tika/parser/microsoft/ooxml/RunProperties.java | 9 +-
.../ooxml/SXSLFPowerPointExtractorDecorator.java | 258 +-
.../ooxml/SXWPFWordExtractorDecorator.java | 93 +-
.../ooxml/XSLFPowerPointExtractorDecorator.java | 95 +-
.../ooxml/XSSFBExcelExtractorDecorator.java | 48 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 181 +-
.../parser/microsoft/ooxml/XWPFListManager.java | 21 +-
.../ooxml/XWPFWordExtractorDecorator.java | 124 +-
.../microsoft/ooxml/xps/XPSExtractorDecorator.java | 122 +-
.../microsoft/ooxml/xps/XPSPageContentHandler.java | 98 +-
.../microsoft/ooxml/xps/XPSTextExtractor.java | 7 +-
.../xslf/XSLFEventBasedPowerPointExtractor.java | 16 +-
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 63 +-
.../microsoft/ooxml/xwpf/XWPFStylesShim.java | 28 +-
.../ooxml/xwpf/ml2006/AbstractPartHandler.java | 11 +-
.../ooxml/xwpf/ml2006/BinaryDataHandler.java | 12 +-
.../ooxml/xwpf/ml2006/CorePropertiesHandler.java | 17 +-
.../xwpf/ml2006/ExtendedPropertiesHandler.java | 3 +-
.../microsoft/ooxml/xwpf/ml2006/PartHandler.java | 7 +-
.../ooxml/xwpf/ml2006/RelationshipsHandler.java | 5 +-
.../ooxml/xwpf/ml2006/RelationshipsManager.java | 3 +-
.../ooxml/xwpf/ml2006/Word2006MLDocHandler.java | 40 +-
.../ooxml/xwpf/ml2006/Word2006MLParser.java | 21 +-
.../ml2006/WordAndPowerPointTextPartHandler.java | 17 +-
.../parser/microsoft/pst/OutlookPSTParser.java | 62 +-
.../parser/microsoft/rtf/RTFEmbObjHandler.java | 40 +-
.../parser/microsoft/rtf/RTFObjDataParser.java | 62 +-
.../tika/parser/microsoft/rtf/RTFParser.java | 53 +-
.../tika/parser/microsoft/rtf/TextExtractor.java | 93 +-
.../microsoft/xml/AbstractXML2003Parser.java | 41 +-
.../parser/microsoft/xml/HyperlinkHandler.java | 23 +-
.../parser/microsoft/xml/SpreadsheetMLParser.java | 50 +-
.../tika/parser/microsoft/xml/WordMLParser.java | 75 +-
.../AbstractPOIContainerExtractionTest.java | 16 +-
.../tika/parser/microsoft/EMFParserTest.java | 13 +-
.../tika/parser/microsoft/ExcelParserTest.java | 69 +-
.../tika/parser/microsoft/JackcessParserTest.java | 40 +-
.../parser/microsoft/MSOwnerFileParserTest.java | 7 +-
.../tika/parser/microsoft/OfficeParserTest.java | 4 +-
.../tika/parser/microsoft/OldExcelParserTest.java | 13 +-
.../tika/parser/microsoft/OutlookParserTest.java | 83 +-
.../microsoft/POIContainerExtractionTest.java | 19 +-
.../parser/microsoft/PowerPointParserTest.java | 47 +-
.../tika/parser/microsoft/ProjectParserTest.java | 27 +-
.../tika/parser/microsoft/PublisherParserTest.java | 13 +-
.../parser/microsoft/SolidworksParserTest.java | 46 +-
.../tika/parser/microsoft/TNEFParserTest.java | 9 +-
.../tika/parser/microsoft/VisioParserTest.java | 13 +-
.../tika/parser/microsoft/WMFParserTest.java | 9 +-
.../tika/parser/microsoft/WordParserTest.java | 123 +-
.../parser/microsoft/WriteProtectedParserTest.java | 9 +-
.../parser/microsoft/chm/TestChmBlockInfo.java | 50 +-
.../parser/microsoft/chm/TestChmExtraction.java | 161 +-
.../parser/microsoft/chm/TestChmExtractor.java | 16 +-
.../parser/microsoft/chm/TestChmItsfHeader.java | 40 +-
.../parser/microsoft/chm/TestChmItspHeader.java | 60 +-
.../tika/parser/microsoft/chm/TestChmLzxState.java | 37 +-
.../microsoft/chm/TestChmLzxcControlData.java | 54 +-
.../microsoft/chm/TestChmLzxcResetTable.java | 59 +-
.../microsoft/chm/TestDirectoryListingEntry.java | 9 +-
.../tika/parser/microsoft/chm/TestParameters.java | 34 +-
.../tika/parser/microsoft/chm/TestPmglHeader.java | 24 +-
.../microsoft/onenote/OneNoteParserTest.java | 83 +-
.../ooxml/OOXMLContainerExtractionTest.java | 24 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 274 +--
.../parser/microsoft/ooxml/SXSLFExtractorTest.java | 122 +-
.../parser/microsoft/ooxml/SXWPFExtractorTest.java | 97 +-
.../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 41 +-
.../parser/microsoft/ooxml/xps/XPSParserTest.java | 68 +-
.../ooxml/xwpf/ml2006/Word2006MLParserTest.java | 29 +-
.../parser/microsoft/pst/OutlookPSTParserTest.java | 84 +-
.../tika/parser/microsoft/rtf/RTFParserTest.java | 91 +-
.../parser/microsoft/xml/XML2003ParserTest.java | 35 +-
.../test-documents/testXPSWithDataDescriptor.xps | Bin 0 -> 44523 bytes
.../test-documents/testXPSWithDataDescriptor2.xps | Bin 0 -> 51175 bytes
.../apache/tika/detect/ole/MiscOLEDetector.java | 69 +-
.../java/org/apache/tika/parser/dbf/DBFCell.java | 30 +-
.../apache/tika/parser/dbf/DBFColumnHeader.java | 68 +-
.../org/apache/tika/parser/dbf/DBFFileHeader.java | 46 +-
.../java/org/apache/tika/parser/dbf/DBFParser.java | 34 +-
.../java/org/apache/tika/parser/dbf/DBFReader.java | 167 +-
.../java/org/apache/tika/parser/dbf/DBFRow.java | 16 +-
.../apache/tika/parser/dif/DIFContentHandler.java | 241 +-
.../java/org/apache/tika/parser/dif/DIFParser.java | 85 +-
.../apache/tika/parser/epub/EpubContentParser.java | 29 +-
.../org/apache/tika/parser/epub/EpubParser.java | 124 +-
.../apache/tika/parser/hwp/HwpStreamReader.java | 2 +-
.../apache/tika/parser/hwp/HwpTextExtractorV5.java | 111 +-
.../org/apache/tika/parser/hwp/HwpV5Parser.java | 16 +-
.../apache/tika/parser/mif/MIFContentHandler.java | 17 +-
.../org/apache/tika/parser/mif/MIFExtractor.java | 34 +-
.../java/org/apache/tika/parser/mif/MIFParser.java | 42 +-
.../parser/odf/FlatOpenDocumentMacroHandler.java | 43 +-
.../tika/parser/odf/FlatOpenDocumentParser.java | 101 +-
.../parser/odf/NSNormalizerContentHandler.java | 29 +-
.../tika/parser/odf/OpenDocumentBodyHandler.java | 311 +--
.../tika/parser/odf/OpenDocumentContentParser.java | 40 +-
.../tika/parser/odf/OpenDocumentMacroHandler.java | 16 +-
.../parser/odf/OpenDocumentManifestHandler.java | 35 +-
.../tika/parser/odf/OpenDocumentMetaParser.java | 101 +-
.../apache/tika/parser/odf/OpenDocumentParser.java | 156 +-
.../tika/parser/wordperfect/QPWTextExtractor.java | 251 +-
.../tika/parser/wordperfect/QuattroProParser.java | 34 +-
.../tika/parser/wordperfect/WP5Charsets.java | 289 ++-
.../wordperfect/WP5DocumentAreaExtractor.java | 66 +-
.../tika/parser/wordperfect/WP6Charsets.java | 750 +++---
.../wordperfect/WP6DocumentAreaExtractor.java | 58 +-
.../wordperfect/WPDocumentAreaExtractor.java | 23 +-
.../tika/parser/wordperfect/WPInputStream.java | 25 +-
.../tika/parser/wordperfect/WPPrefixArea.java | 37 +-
.../parser/wordperfect/WPPrefixAreaExtractor.java | 10 +-
.../tika/parser/wordperfect/WordPerfectParser.java | 78 +-
.../org/apache/tika/parser/dbf/DBFParserTest.java | 36 +-
.../org/apache/tika/parser/dif/DIFParserTest.java | 25 +-
.../apache/tika/parser/epub/EpubParserTest.java | 30 +-
.../apache/tika/parser/hwp/HwpV5ParserTest.java | 17 +-
.../tika/parser/ibooks/iBooksParserTest.java | 18 +-
.../org/apache/tika/parser/mif/MIFParserTest.java | 9 +-
.../org/apache/tika/parser/odf/ODFParserTest.java | 265 +-
.../tika/parser/wordperfect/QuattroProTest.java | 12 +-
.../tika/parser/wordperfect/WPInputStreamTest.java | 14 +-
.../tika/parser/wordperfect/WordPerfectTest.java | 20 +-
.../resources/test-documents/testODTEncrypted.odt | Bin 0 -> 12714 bytes
.../org/apache/tika/parser/feed/FeedParser.java | 76 +-
.../apache/tika/parser/iptc/IptcAnpaParser.java | 1404 +++++------
.../apache/tika/parser/feed/FeedParserTest.java | 23 +-
.../apache/tika/parser/ocr/ImagePreprocessor.java | 67 +-
.../apache/tika/parser/ocr/TesseractOCRConfig.java | 197 +-
.../apache/tika/parser/ocr/TesseractOCRParser.java | 364 ++-
.../apache/tika/parser/ocr/tess4j/ImageDeskew.java | 10 +-
.../apache/tika/parser/ocr/tess4j/ImageUtil.java | 17 +-
.../tika/parser/ocr/TesseractOCRConfigTest.java | 149 +-
.../tika/parser/ocr/TesseractOCRParserTest.java | 101 +-
.../resources/test-configs/TIKA-2705-tesseract.xml | 26 +-
.../tika-config-tesseract-arbitrary.xml | 22 +-
.../test-configs/tika-config-tesseract-full.xml | 38 +-
.../tika-config-tesseract-load-langs.xml | 20 +-
.../test-configs/tika-config-tesseract-partial.xml | 32 +-
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 484 ++--
.../org/apache/tika/parser/pdf/AccessChecker.java | 18 +-
.../tika/parser/pdf/ImageGraphicsEngine.java | 290 ++-
.../java/org/apache/tika/parser/pdf/OCR2XHTML.java | 20 +-
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 97 +-
.../tika/parser/pdf/PDFEncodedStringDecoder.java | 6 +-
.../tika/parser/pdf/PDFMarkedContent2XHTML.java | 207 +-
.../java/org/apache/tika/parser/pdf/PDFParser.java | 133 +-
.../apache/tika/parser/pdf/PDFParserConfig.java | 360 +--
.../apache/tika/parser/pdf/PDFPreflightParser.java | 82 +-
.../tika/parser/pdf/PDMetadataExtractor.java | 54 +-
.../org/apache/tika/parser/pdf/XFAExtractor.java | 75 +-
.../apache/tika/parser/pdf/AccessCheckerTest.java | 6 +-
.../parser/pdf/PDFMarkedContent2XHTMLTest.java | 21 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 324 +--
.../tika/parser/pdf/PDFPreflightParserTest.java | 18 +-
.../testPDF_deeplyEmbeddedAttachments.pdf | Bin 0 -> 122221 bytes
.../apache/tika/parser/pkg/CompressorParser.java | 54 +-
.../org/apache/tika/parser/pkg/PackageParser.java | 223 +-
.../java/org/apache/tika/parser/pkg/RarParser.java | 30 +-
.../apache/tika/parser/pkg/AbstractPkgTest.java | 94 +-
.../org/apache/tika/parser/pkg/ArParserTest.java | 11 +-
.../apache/tika/parser/pkg/Bzip2ParserTest.java | 37 +-
.../apache/tika/parser/pkg/CompressParserTest.java | 39 +-
.../tika/parser/pkg/CompressorParserTest.java | 21 +-
.../org/apache/tika/parser/pkg/GzipParserTest.java | 29 +-
.../apache/tika/parser/pkg/PackageParserTest.java | 19 +-
.../org/apache/tika/parser/pkg/RarParserTest.java | 99 +-
.../apache/tika/parser/pkg/Seven7ParserTest.java | 69 +-
.../org/apache/tika/parser/pkg/TarParserTest.java | 67 +-
.../org/apache/tika/parser/pkg/ZipParserTest.java | 98 +-
.../org/apache/tika/parser/pkg/ZlibParserTest.java | 33 +-
.../src/test/resources/test-documents/testSVG.svg | 8 +-
.../java/org/apache/tika/parser/csv/CSVParams.java | 4 +-
.../java/org/apache/tika/parser/csv/CSVResult.java | 17 +-
.../org/apache/tika/parser/csv/CSVSniffer.java | 84 +-
.../apache/tika/parser/csv/TextAndCSVParser.java | 169 +-
.../tika/parser/strings/Latin1StringsParser.java | 145 +-
.../apache/tika/parser/strings/StringsConfig.java | 163 +-
.../tika/parser/strings/StringsEncoding.java | 62 +-
.../apache/tika/parser/strings/StringsParser.java | 495 ++--
.../apache/tika/parser/txt/CharsetDetector.java | 46 +-
.../org/apache/tika/parser/txt/CharsetMatch.java | 10 +-
.../apache/tika/parser/txt/CharsetRecog_2022.java | 20 +-
.../apache/tika/parser/txt/CharsetRecog_UTF8.java | 8 +-
.../tika/parser/txt/CharsetRecog_Unicode.java | 2 +-
.../apache/tika/parser/txt/CharsetRecog_mbcs.java | 113 +-
.../apache/tika/parser/txt/CharsetRecog_sbcs.java | 1801 ++++++++------
.../apache/tika/parser/txt/CharsetRecognizer.java | 2 +-
.../tika/parser/txt/Icu4jEncodingDetector.java | 17 +-
.../java/org/apache/tika/parser/txt/TXTParser.java | 26 +-
.../tika/parser/txt/UniversalEncodingDetector.java | 16 +-
.../tika/parser/txt/UniversalEncodingListener.java | 21 +-
.../org/apache/tika/parser/csv/CSVSnifferTest.java | 51 +-
.../tika/parser/csv/TextAndCSVParserTest.java | 112 +-
.../parser/strings/Latin1StringsParserTest.java | 28 +-
.../tika/parser/strings/StringsConfigTest.java | 111 +-
.../tika/parser/strings/StringsParserTest.java | 89 +-
.../tika/parser/txt/CharsetDetectorTest.java | 21 +-
.../org/apache/tika/parser/txt/TXTParserTest.java | 126 +-
.../test-configs/tika-config-strings-full.xml | 18 +-
.../test-configs/tika-config-strings-partial.xml | 16 +-
.../src/test/resources/test-documents/resume.html | 140 +-
.../tika/parser/xliff/XLIFF12ContentHandler.java | 15 +-
.../apache/tika/parser/xliff/XLIFF12Parser.java | 27 +-
.../org/apache/tika/parser/xliff/XLZParser.java | 43 +-
.../tika/parser/xml/AbstractMetadataHandler.java | 46 +-
.../xml/AttributeDependantMetadataHandler.java | 34 +-
.../tika/parser/xml/AttributeMetadataHandler.java | 28 +-
.../org/apache/tika/parser/xml/DcXMLParser.java | 22 +-
.../tika/parser/xml/ElementMetadataHandler.java | 69 +-
.../apache/tika/parser/xml/FictionBookParser.java | 33 +-
.../apache/tika/parser/xml/MetadataHandler.java | 33 +-
.../tika/parser/xml/TextAndAttributeXMLParser.java | 6 +-
.../java/org/apache/tika/parser/xml/XMLParser.java | 39 +-
.../org/apache/tika/parser/xml/XMLProfiler.java | 99 +-
.../tika/parser/xliff/XLIFF12ParserTest.java | 5 +-
.../apache/tika/parser/xliff/XLZParserTest.java | 18 +-
.../apache/tika/parser/xml/DcXMLParserTest.java | 27 +-
.../EmptyAndDuplicateElementsXMLParserTest.java | 56 +-
.../tika/parser/xml/FictionBookParserTest.java | 10 +-
.../parser/xml/TextAndAttributeXMLParserTest.java | 21 +-
.../src/test/resources/test-documents/testXML.xml | 30 +-
.../src/test/resources/test-documents/testXML2.xml | 10 +-
.../src/test/resources/test-documents/testXML3.xml | 38 +-
.../apache/tika/parser/xmp/JempboxExtractor.java | 91 +-
.../apache/tika/parser/xmp/XMPPacketScanner.java | 4 +-
.../tika/parser/xmp/JempboxExtractorTest.java | 31 +-
.../src/test/resources/test-documents/testXMP.xmp | 342 ++-
.../tika/detect/zip/CompressorConstants.java | 3 +-
.../detect/zip/DefaultZipContainerDetector.java | 165 +-
.../DeprecatedStreamingZipContainerDetector.java | 37 +-
.../detect/zip/DeprecatedZipContainerDetector.java | 3 -
.../org/apache/tika/detect/zip/IPADetector.java | 21 +-
.../org/apache/tika/detect/zip/JarDetector.java | 14 +-
.../org/apache/tika/detect/zip/KMZDetector.java | 22 +-
.../tika/detect/zip/OpenDocumentDetector.java | 20 +-
.../apache/tika/detect/zip/PackageConstants.java | 1 +
.../apache/tika/detect/zip/StarOfficeDetector.java | 78 +-
.../tika/detect/zip/StreamingDetectContext.java | 16 +-
.../detect/zip/StreamingZipContainerDetector.java | 13 +-
.../tika/detect/zip/ZipContainerDetector.java | 16 +-
.../tika/detect/zip/ZipContainerDetectorBase.java | 47 +-
.../org/apache/tika/zip/utils/ZipSalvager.java | 104 +-
.../org/apache/tika/detect/zip/ZipParserTest.java | 14 +-
.../org/apache/tika/parser/internal/Activator.java | 22 +-
.../apache/tika/config/TikaDetectorConfigTest.java | 87 +-
.../tika/config/TikaEncodingDetectorTest.java | 82 +-
.../apache/tika/config/TikaParserConfigTest.java | 69 +-
.../tika/config/TikaTranslatorConfigTest.java | 21 +-
.../tika/detect/TestContainerAwareDetector.java | 278 ++-
.../apache/tika/detect/TestDetectorLoading.java | 15 +-
.../tika/detect/TestFileCommandDetector.java | 12 +-
.../tika/extractor/EmbeddedDocumentUtilTest.java | 3 +-
.../java/org/apache/tika/mime/MimeTypeTest.java | 12 +-
.../java/org/apache/tika/mime/MimeTypesTest.java | 4 +-
.../java/org/apache/tika/mime/TestMimeTypes.java | 733 +++---
.../apache/tika/parser/AutoDetectParserTest.java | 357 ++-
.../tika/parser/AutoDetectReaderParserTest.java | 24 +-
.../parser/BouncyCastleDigestingParserTest.java | 125 +-
.../apache/tika/parser/DigestingParserTest.java | 120 +-
.../org/apache/tika/parser/ParsingReaderTest.java | 13 +-
.../tika/parser/RecursiveParserWrapperTest.java | 61 +-
.../org/apache/tika/parser/TabularFormatsTest.java | 252 +-
.../java/org/apache/tika/parser/TestParsers.java | 49 +-
.../apache/tika/parser/TestXMLEntityExpansion.java | 90 +-
.../java/org/apache/tika/parser/TestXXEInXML.java | 115 +-
.../java/org/apache/tika/parser/XMLTestBase.java | 80 +-
.../parser/apple/AppleSingleFileParserTest.java | 8 +-
.../apache/tika/parser/apple/PListParserTest.java | 11 +-
.../apache/tika/parser/crypto/TSDParserTest.java | 19 +-
.../parser/fork/ForkParserIntegrationTest.java | 285 +--
.../apache/tika/parser/html/HtmlParserTest.java | 20 +-
.../apache/tika/parser/mail/MboxParserTest.java | 16 +-
.../apache/tika/parser/mail/RFC822ParserTest.java | 42 +-
.../tika/parser/microsoft/EMFParserTest.java | 17 +-
.../tika/parser/microsoft/ExcelParserTest.java | 7 +-
.../microsoft/POIContainerExtractionTest.java | 9 +-
.../parser/microsoft/PowerPointParserTest.java | 14 +-
.../tika/parser/microsoft/XML2003ParserTest.java | 25 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 12 +-
.../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 17 +-
.../tika/parser/microsoft/rtf/RTFParserTest.java | 65 +-
.../apache/tika/parser/mock/MockParserTest.java | 70 +-
.../tika/parser/ocr/TesseractOCRParserTest.java | 54 +-
.../org/apache/tika/parser/odf/ODFParserTest.java | 54 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 100 +-
.../org/apache/tika/parser/pkg/ArParserTest.java | 17 +-
.../apache/tika/parser/pkg/Bzip2ParserTest.java | 11 +-
.../pkg/CompositeZipContainerDetectorTest.java | 141 +-
.../apache/tika/parser/pkg/CompressParserTest.java | 25 +-
.../tika/parser/pkg/CompressorParserTest.java | 17 +-
.../org/apache/tika/parser/pkg/GzipParserTest.java | 13 +-
.../org/apache/tika/parser/pkg/RarParserTest.java | 16 +-
.../apache/tika/parser/pkg/Seven7ParserTest.java | 64 +-
.../org/apache/tika/parser/pkg/TarParserTest.java | 11 +-
.../org/apache/tika/parser/pkg/ZipParserTest.java | 65 +-
.../org/apache/tika/parser/pkg/ZlibParserTest.java | 11 +-
.../tika/parser/xml/FictionBookParserTest.java | 14 +-
.../sax/PhoneExtractingContentHandlerTest.java | 20 +-
.../sax/StandardsExtractingContentHandlerTest.java | 47 +-
.../apache/tika/utils/ServiceLoaderUtilsTest.java | 6 +-
.../src/test/resources/log4j.properties | 1 +
.../test-documents/testJAVAPROPS.properties | 1 +
.../apache/tika/parser/envi/EnviHeaderParser.java | 112 +-
.../org/apache/tika/parser/gdal/GDALParser.java | 249 +-
.../geoinfo/GeographicInformationParser.java | 451 ++--
.../org/apache/tika/parser/grib/GribParser.java | 41 +-
.../java/org/apache/tika/parser/hdf/HDFParser.java | 32 +-
.../org/apache/tika/parser/isatab/ISATabUtils.java | 311 +--
.../apache/tika/parser/isatab/ISArchiveParser.java | 222 +-
.../apache/tika/parser/netcdf/NetCDFParser.java | 28 +-
.../tika/parser/envi/EnviHeaderParserTest.java | 59 +-
.../apache/tika/parser/gdal/TestGDALParser.java | 112 +-
.../geoinfo/GeographicInformationParserTest.java | 5 +-
.../apache/tika/parser/grib/GribParserTest.java | 17 +-
.../org/apache/tika/parser/hdf/HDFParserTest.java | 48 +-
.../tika/parser/isatab/ISArchiveParserTest.java | 68 +-
.../tika/parser/netcdf/NetCDFParserTest.java | 23 +-
.../ground-truth/EnviHeaderGroundTruth.txt | 1 +
.../tika/parser/sqlite3/SQLite3DBParser.java | 19 +-
.../apache/tika/parser/sqlite3/SQLite3Parser.java | 17 +-
.../tika/parser/sqlite3/SQLite3TableReader.java | 19 +-
.../tika/parser/sqlite3/SQLite3ParserTest.java | 37 +-
.../apache/tika/mime/TestMimeTypesExtended.java | 23 +-
.../tika/parser/sqlite3/SQLite3ParserTest.java | 68 +-
tika-server/pom.xml | 38 +-
.../server/classic/config/PDFServerConfig.java | 42 +-
.../classic/config/TesseractServerConfig.java | 41 +-
.../classic/resource/XMPMetadataResource.java | 34 +-
.../classic/writer/XMPMessageBodyWriter.java | 26 +-
.../src/main/resources/log4j.properties | 4 +-
.../tika/server/classic/DetectorResourceTest.java | 55 +-
.../apache/tika/server/classic/FetcherTest.java | 35 +-
.../tika/server/classic/MetadataResourceTest.java | 96 +-
.../classic/RecursiveMetadataFilterTest.java | 42 +-
.../classic/RecursiveMetadataResourceTest.java | 215 +-
.../tika/server/classic/TikaDetectorsTest.java | 41 +-
.../tika/server/classic/TikaMimeTypesTest.java | 39 +-
.../tika/server/classic/TikaParsersTest.java | 46 +-
.../tika/server/classic/TikaResourceTest.java | 418 ++--
.../tika/server/classic/UnpackerResourceTest.java | 91 +-
.../test/resources/config/TIKA-3137-include.xml | 38 +-
.../src/test/resources/log4j.properties | 6 +-
.../test/resources/test-documents/testHTML.html | 20 +-
.../org/apache/tika/server/client/TikaClient.java | 33 +-
.../apache/tika/server/client/TikaClientCLI.java | 70 +-
.../tika/server/client/TikaEmitterResult.java | 24 +-
.../apache/tika/server/client/TikaHttpClient.java | 79 +-
.../src/main/resources/log4j.properties | 6 +-
.../org/apache/tika/server/client/TestBasic.java | 15 +-
.../src/test/resources/log4j.properties | 6 +-
.../server/core/CompositeParseContextConfig.java | 10 +-
.../server/core/DefaultInputStreamFactory.java | 9 +-
.../tika/server/core/FetcherStreamFactory.java | 17 +-
.../org/apache/tika/server/core/HTMLHelper.java | 7 +-
.../tika/server/core/InputStreamFactory.java | 9 +-
.../org/apache/tika/server/core/MetadataList.java | 7 +-
.../tika/server/core/ParseContextConfig.java | 16 +-
.../org/apache/tika/server/core/ServerStatus.java | 122 +-
.../tika/server/core/ServerStatusWatcher.java | 57 +-
.../apache/tika/server/core/TikaLoggingFilter.java | 7 +-
.../org/apache/tika/server/core/TikaServerCli.java | 39 +-
.../apache/tika/server/core/TikaServerConfig.java | 543 ++---
.../tika/server/core/TikaServerParseException.java | 3 +-
.../core/TikaServerParseExceptionMapper.java | 12 +-
.../apache/tika/server/core/TikaServerProcess.java | 194 +-
.../tika/server/core/TikaServerWatchDog.java | 222 +-
.../apache/tika/server/core/WatchDogResult.java | 7 +-
.../server/core/config/DocumentSelectorConfig.java | 10 +-
.../server/core/config/PasswordProviderConfig.java | 27 +-
.../tika/server/core/resource/AsyncEmitter.java | 30 +-
.../tika/server/core/resource/AsyncParser.java | 42 +-
.../tika/server/core/resource/AsyncRequest.java | 4 +-
.../tika/server/core/resource/AsyncResource.java | 56 +-
.../server/core/resource/DetectorResource.java | 21 +-
.../tika/server/core/resource/EmitterResource.java | 133 +-
.../server/core/resource/LanguageResource.java | 55 +-
.../server/core/resource/MetadataResource.java | 62 +-
.../core/resource/RecursiveMetadataResource.java | 131 +-
.../tika/server/core/resource/TikaDetectors.java | 15 +-
.../tika/server/core/resource/TikaMimeTypes.java | 37 +-
.../tika/server/core/resource/TikaParsers.java | 43 +-
.../tika/server/core/resource/TikaResource.java | 243 +-
.../server/core/resource/TikaServerStatus.java | 8 +-
.../tika/server/core/resource/TikaWelcome.java | 44 +-
.../server/core/resource/TranslateResource.java | 135 +-
.../server/core/resource/UnpackerResource.java | 84 +-
.../server/core/writer/CSVMessageBodyWriter.java | 29 +-
.../server/core/writer/JSONMessageBodyWriter.java | 29 +-
.../tika/server/core/writer/JSONObjWriter.java | 30 +-
.../core/writer/MetadataListMessageBodyWriter.java | 29 +-
.../apache/tika/server/core/writer/TarWriter.java | 25 +-
.../server/core/writer/TextMessageBodyWriter.java | 28 +-
.../apache/tika/server/core/writer/ZipWriter.java | 27 +-
.../src/main/resources/tikaserver-template.html | 18 +-
.../main/resources/tikaserver-version.properties | 15 +
.../org/apache/tika/server/core/CXFTestBase.java | 84 +-
.../tika/server/core/IntegrationTestBase.java | 92 +-
.../tika/server/core/LanguageResourceTest.java | 139 +-
.../tika/server/core/NullWebClientLogger.java | 5 +-
.../apache/tika/server/core/ServerStatusTest.java | 16 +-
.../apache/tika/server/core/StackTraceOffTest.java | 54 +-
.../apache/tika/server/core/StackTraceTest.java | 62 +-
.../apache/tika/server/core/TikaEmitterTest.java | 162 +-
.../apache/tika/server/core/TikaMimeTypesTest.java | 34 +-
.../apache/tika/server/core/TikaResourceTest.java | 60 +-
.../core/TikaServerAsyncIntegrationTest.java | 123 +-
.../tika/server/core/TikaServerConfigTest.java | 32 +-
.../core/TikaServerEmitterIntegrationTest.java | 147 +-
.../server/core/TikaServerIntegrationTest.java | 217 +-
.../tika/server/core/TikaServerStatusTest.java | 29 +-
.../apache/tika/server/core/TikaVersionTest.java | 20 +-
.../apache/tika/server/core/TikaWelcomeTest.java | 54 +-
.../tika/server/core/TranslateResourceTest.java | 90 +-
.../src/test/resources/log4j.properties | 6 +-
.../test-documents/mock/heavy_hang_100.xml | 2 +-
.../test-documents/mock/heavy_hang_30000.xml | 2 +-
.../resources/test-documents/mock/system_exit.xml | 2 +-
.../test-documents/mock/testStaticStdOutErr.xml | 45 +-
.../test-documents/mock/testStdOutErr.xml | 45 +-
.../test-documents/mock/thread_interrupt.xml | 2 +-
1006 files changed, 38548 insertions(+), 37878 deletions(-)
create mode 100644 .github/pull_request_template.md
copy tika-batch/src/test/resources/log4j.properties => .github/workflows/main-build.yml (59%)
copy tika-core/src/main/java/org/apache/tika/exception/{package-info.java => WriteLimitReached.java} (90%)
create mode 100644 tika-parent/checkstyle.xml
create mode 100644 tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor.xps
create mode 100644 tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps
copy tika-core/src/main/java/org/apache/tika/fork/MetadataContentHandler.java => tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java (53%)
create mode 100644 tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTEncrypted.odt
create mode 100644 tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf
create mode 100644 tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/resources/ground-truth/EnviHeaderGroundTruth.txt
[tika] 02/02: merge from main and required updates/conflict resolution
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3304
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 5dbffcd8eaae9135b509cf05a13d17c5542e2fc9
Author: tballison <ta...@apache.org>
AuthorDate: Wed Mar 24 14:26:41 2021 -0400
merge from main and required updates/conflict resolution
---
.gitignore | 1 +
.../tika/pipes/fetchiterator/FetchIterator.java | 46 ++++++++++++----------
.../org/apache/tika/server/client/TikaClient.java | 2 +-
.../apache/tika/server/client/TikaClientCLI.java | 35 ++++++++++++++--
.../tika/server/client/TikaEmitterResult.java | 7 ++--
.../apache/tika/server/client/TikaHttpClient.java | 6 +--
.../tika/server/core/resource/AsyncEmitter.java | 2 +-
.../tika/server/core/resource/EmitterResource.java | 23 ++---------
8 files changed, 72 insertions(+), 50 deletions(-)
diff --git a/.gitignore b/.gitignore
index dda6180..c608dc2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
.svn
target
dependency-reduced-pom.xml
+.editorconfig
.idea
.classpath
.project
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetchiterator/FetchIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/fetchiterator/FetchIterator.java
index dde8222..2b9273d 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/fetchiterator/FetchIterator.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/fetchiterator/FetchIterator.java
@@ -17,18 +17,24 @@
package org.apache.tika.pipes.fetchiterator;
import java.io.IOException;
+import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
import org.apache.tika.config.Param;
import org.apache.tika.exception.TikaConfigException;
-
+import org.apache.tika.exception.TikaTimeoutException;
/**
* Abstract class that handles the testing for timeouts/thread safety
* issues. Concrete classes implement the blocking {@link #enqueue()}.
@@ -36,14 +42,13 @@ import org.apache.tika.exception.TikaConfigException;
* a RuntimeException. It will throw an IllegalStateException if
* next() is called after hasNext() has returned false.
*/
-public abstract class FetchIterator implements Callable<Integer>,
- Iterable<FetchEmitTuple>, Initializable {
+public abstract class FetchIterator
+ implements Callable<Integer>, Iterable<FetchEmitTuple>, Initializable {
public static final long DEFAULT_MAX_WAIT_MS = 300_000;
public static final int DEFAULT_QUEUE_SIZE = 1000;
- public static final FetchEmitTuple COMPLETED_SEMAPHORE =
- new FetchEmitTuple(null, null, null);
+ public static final FetchEmitTuple COMPLETED_SEMAPHORE = new FetchEmitTuple(null, null, null);
private static final Logger LOGGER = LoggerFactory.getLogger(FetchIterator.class);
@@ -53,16 +58,21 @@ public abstract class FetchIterator implements Callable<Integer>,
private String fetcherName;
private String emitterName;
private int added = 0;
- private FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT;
+ private FetchEmitTuple.ON_PARSE_EXCEPTION onParseException =
+ FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT;
private FutureTask<Integer> futureTask;
+ public String getFetcherName() {
+ return fetcherName;
+ }
+
@Field
public void setFetcherName(String fetcherName) {
this.fetcherName = fetcherName;
}
- public String getFetcherName() {
- return fetcherName;
+ public String getEmitterName() {
+ return emitterName;
}
@Field
@@ -70,10 +80,6 @@ public abstract class FetchIterator implements Callable<Integer>,
this.emitterName = emitterName;
}
- public String getEmitterName() {
- return emitterName;
- }
-
@Field
public void setMaxWaitMs(long maxWaitMs) {
this.maxWaitMs = maxWaitMs;
@@ -84,6 +90,10 @@ public abstract class FetchIterator implements Callable<Integer>,
this.queueSize = queueSize;
}
+ public FetchEmitTuple.ON_PARSE_EXCEPTION getOnParseException() {
+ return onParseException;
+ }
+
@Field
public void setOnParseException(String onParseException) throws TikaConfigException {
if ("skip".equalsIgnoreCase(onParseException)) {
@@ -99,10 +109,6 @@ public abstract class FetchIterator implements Callable<Integer>,
this.onParseException = onParseException;
}
- public FetchEmitTuple.ON_PARSE_EXCEPTION getOnParseException() {
- return onParseException;
- }
-
public Integer call() throws Exception {
enqueue();
tryToAdd(COMPLETED_SEMAPHORE);
@@ -168,19 +174,19 @@ public abstract class FetchIterator implements Callable<Integer>,
FetchEmitTuple t = null;
long start = System.currentTimeMillis();
try {
- long elapsed = System.currentTimeMillis()-start;
+ long elapsed = System.currentTimeMillis() - start;
while (t == null && elapsed < maxWaitMs) {
checkThreadOk();
t = queue.poll(100, TimeUnit.MILLISECONDS);
- elapsed = System.currentTimeMillis()-start;
+ elapsed = System.currentTimeMillis() - start;
}
} catch (InterruptedException e) {
LOGGER.warn("interrupted");
return COMPLETED_SEMAPHORE;
}
if (t == null) {
- throw new TikaTimeoutException("waited longer than "+
- maxWaitMs+"ms for the next tuple");
+ throw new TikaTimeoutException(
+ "waited longer than " + maxWaitMs + "ms for the next tuple");
}
return t;
}
diff --git a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClient.java b/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClient.java
index 3b1fb25..a9f7319 100644
--- a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClient.java
+++ b/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClient.java
@@ -31,7 +31,7 @@ import org.apache.tika.pipes.fetchiterator.FetchEmitTuple;
public class TikaClient {
private final Random random = new Random();
- private List<TikaHttpClient> clients;
+ private final List<TikaHttpClient> clients;
private TikaClient(List<TikaHttpClient> clients) {
diff --git a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientCLI.java b/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientCLI.java
index b959839..5e583db 100644
--- a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientCLI.java
+++ b/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientCLI.java
@@ -45,8 +45,9 @@ import org.apache.tika.pipes.fetchiterator.FetchIterator;
public class TikaClientCLI {
private static final Logger LOGGER = LoggerFactory.getLogger(TikaClientCLI.class);
+ private static final int QUEUE_SIZE = 10000;
- private long maxWaitMs = 300000;
+ private final long maxWaitMs = 300000;
public static void main(String[] args) throws Exception {
//TODO -- add an actual commandline,
@@ -65,9 +66,10 @@ public class TikaClientCLI {
ExecutorCompletionService<Integer> completionService =
new ExecutorCompletionService<>(executorService);
final FetchIterator fetchIterator = config.getFetchIterator();
- final ArrayBlockingQueue<FetchEmitTuple> queue = fetchIterator.init(numThreads);
+ final ArrayBlockingQueue<FetchEmitTuple> queue =
+ new ArrayBlockingQueue<>(QUEUE_SIZE);
- completionService.submit(fetchIterator);
+ completionService.submit(new FetchIteratorWrapper(fetchIterator, queue, numThreads));
if (tikaServerUrls.size() == numThreads) {
logDiffSizes(tikaServerUrls.size(), numThreads);
for (int i = 0; i < numThreads; i++) {
@@ -179,4 +181,31 @@ public class TikaClientCLI {
}
}
}
+
+ private class FetchIteratorWrapper implements Callable<Integer> {
+ private final FetchIterator fetchIterator;
+ private final ArrayBlockingQueue<FetchEmitTuple> queue;
+ private final int numThreads;
+
+ public FetchIteratorWrapper(FetchIterator fetchIterator,
+ ArrayBlockingQueue<FetchEmitTuple> queue,
+ int numThreads) {
+ this.fetchIterator = fetchIterator;
+ this.queue = queue;
+ this.numThreads = numThreads;
+
+ }
+
+ @Override
+ public Integer call() throws Exception {
+ for (FetchEmitTuple t : fetchIterator) {
+ //potentially blocks forever
+ queue.put(t);
+ }
+ for (int i = 0; i < numThreads; i ++) {
+ queue.put(FetchIterator.COMPLETED_SEMAPHORE);
+ }
+ return 1;
+ }
+ }
}
diff --git a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaEmitterResult.java b/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaEmitterResult.java
index 1438885..4cb134c 100644
--- a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaEmitterResult.java
+++ b/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaEmitterResult.java
@@ -19,9 +19,10 @@ package org.apache.tika.server.client;
public class TikaEmitterResult {
- private STATUS status;
- private String msg;//used for exceptions. will be null for status ok
- private long timeElapsed;
+ private final STATUS status;
+ private final String msg;//used for exceptions. will be null for status ok
+ private final long timeElapsed;
+
public TikaEmitterResult(STATUS status, long timeElapsed, String msg) {
this.status = status;
this.timeElapsed = timeElapsed;
diff --git a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaHttpClient.java b/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaHttpClient.java
index 6d6517f..9a5ff7f 100644
--- a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaHttpClient.java
+++ b/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaHttpClient.java
@@ -48,11 +48,11 @@ class TikaHttpClient {
private final String emitEndPointUrl;
private final String asyncEndPointUrl;
private final String tikaUrl;
- private int maxRetries = 3;
+ private final int maxRetries = 3;
//if can't make contact with Tika server, max wait time in ms
- private long maxWaitForTikaMs = 120000;
+ private final long maxWaitForTikaMs = 120000;
//how often to ping /tika (in ms) to see if the server is up and running
- private long pulseWaitForTikaMs = 1000;
+ private final long pulseWaitForTikaMs = 1000;
/**
* @param baseUrl url to base endpoint
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncEmitter.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncEmitter.java
index 1ebe787..054a1d2 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncEmitter.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncEmitter.java
@@ -97,7 +97,7 @@ public class AsyncEmitter implements Callable<Integer> {
void add(EmitData data) {
size++;
long sz = AbstractEmitter
- .estimateSizeInBytes(data.getEmitKey().getKey(), data.getMetadataList());
+ .estimateSizeInBytes(data.getEmitKey().getEmitKey(), data.getMetadataList());
if (estimatedSize + sz > maxBytes) {
LOG.debug("estimated size ({}) > maxBytes({}), going to emitAll",
(estimatedSize + sz), maxBytes);
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
index 56a71db..306732a 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
@@ -64,18 +64,13 @@ public class EmitterResource {
private static final String FETCH_KEY_ABBREV = "fk";
private static final String EMIT_KEY_ABBREV = "ek";
- /**
- * key that is safe to pass through http header.
- * The user _must_ specify this for the fsemitter if calling 'put'
- */
- public static final String EMIT_KEY_FOR_HTTP_HEADER = "emit-key";
private static final Logger LOG = LoggerFactory.getLogger(EmitterResource.class);
static EmitKey calcEmitKey(FetchEmitTuple t) {
//use fetch key if emitter key is not specified
//TODO: clean this up?
EmitKey emitKey = t.getEmitKey();
- if (StringUtils.isBlank(emitKey.getKey())) {
+ if (StringUtils.isBlank(emitKey.getEmitKey())) {
emitKey = new EmitKey(emitKey.getEmitterName(), t.getFetchKey().getKey());
}
return emitKey;
@@ -197,21 +192,11 @@ public class EmitterResource {
return emit(calcEmitKey(t), metadataList);
}
- static EmitKey calcEmitKey(FetchEmitTuple t) {
- //use fetch key if emitter key is not specified
- //TODO: clean this up?
- EmitKey emitKey = t.getEmitKey();
- if (StringUtils.isBlank(emitKey.getKey())) {
- emitKey = new EmitKey(emitKey.getEmitterName(), t.getFetchKey().getKey());
- }
- return emitKey;
- }
-
private Map<String, String> skip(FetchEmitTuple t, List<Metadata> metadataList) {
Map<String, String> statusMap = new HashMap<>();
statusMap.put("status", "ok");
statusMap.put("emitter", t.getEmitKey().getEmitterName());
- statusMap.put("emitKey", t.getEmitKey().getKey());
+ statusMap.put("emitKey", t.getEmitKey().getEmitKey());
String msg = metadataList.get(0).get(TikaCoreProperties.CONTAINER_EXCEPTION);
statusMap.put("parse_exception", msg);
return statusMap;
@@ -269,9 +254,9 @@ public class EmitterResource {
String status = "ok";
String exceptionMsg = "";
try {
- emitter.emit(emitKey.getKey(), metadataList);
+ emitter.emit(emitKey.getEmitKey(), metadataList);
} catch (IOException | TikaEmitterException e) {
- LOG.warn("problem emitting (" + emitKey.getKey() + ")", e);
+ LOG.warn("problem emitting (" + emitKey.getEmitterName() + ")", e);
status = "emitter_exception";
exceptionMsg = ExceptionUtils.getStackTrace(e);
}
[tika] 01/02: Merge remote-tracking branch 'origin/main' into TIKA-3304
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3304
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9416f5926af5ab7637b8afa6a787be37e1b360e6
Merge: 26ff633 d87ac65
Author: tballison <ta...@apache.org>
AuthorDate: Wed Mar 24 13:30:44 2021 -0400
Merge remote-tracking branch 'origin/main' into TIKA-3304
# Conflicts:
# tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
# tika-core/src/main/java/org/apache/tika/pipes/fetchiterator/FetchIterator.java
# tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
# tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncEmitter.java
# tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
# tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerEmitterIntegrationTest.java
.github/pull_request_template.md | 12 +
.../workflows/main-build.yml | 33 +-
CHANGES.txt | 25 +-
tika-core/pom.xml | 30 +
tika-core/src/main/java/org/apache/tika/Tika.java | 143 +-
.../concurrent/ConfigurableThreadPoolExecutor.java | 64 +-
.../tika/concurrent/SimpleThreadPoolExecutor.java | 82 +-
.../main/java/org/apache/tika/config/Field.java | 4 +-
.../java/org/apache/tika/config/Initializable.java | 11 +-
.../tika/config/InitializableProblemHandler.java | 19 +-
.../org/apache/tika/config/LoadErrorHandler.java | 29 +-
.../main/java/org/apache/tika/config/Param.java | 261 +-
.../java/org/apache/tika/config/ParamField.java | 56 +-
.../java/org/apache/tika/config/ServiceLoader.java | 212 +-
.../java/org/apache/tika/config/TikaActivator.java | 5 +-
.../java/org/apache/tika/config/TikaConfig.java | 701 +++---
.../apache/tika/config/TikaConfigSerializer.java | 119 +-
.../org/apache/tika/detect/AutoDetectReader.java | 100 +-
.../org/apache/tika/detect/CompositeDetector.java | 34 +-
.../tika/detect/CompositeEncodingDetector.java | 22 +-
.../org/apache/tika/detect/DefaultDetector.java | 83 +-
.../tika/detect/DefaultEncodingDetector.java | 13 +-
.../apache/tika/detect/DefaultProbDetector.java | 32 +-
.../main/java/org/apache/tika/detect/Detector.java | 2 +-
.../java/org/apache/tika/detect/EmptyDetector.java | 3 +-
.../org/apache/tika/detect/EncodingDetector.java | 2 +-
.../apache/tika/detect/FileCommandDetector.java | 58 +-
.../java/org/apache/tika/detect/MagicDetector.java | 357 ++-
.../apache/tika/detect/NNExampleModelDetector.java | 19 +-
.../org/apache/tika/detect/NNTrainedModel.java | 147 +-
.../apache/tika/detect/NNTrainedModelBuilder.java | 77 +-
.../java/org/apache/tika/detect/NameDetector.java | 6 +-
.../tika/detect/NonDetectingEncodingDetector.java | 8 +-
.../org/apache/tika/detect/OverrideDetector.java | 9 +-
.../java/org/apache/tika/detect/TextDetector.java | 15 +-
.../org/apache/tika/detect/TextStatistics.java | 27 +-
.../java/org/apache/tika/detect/TrainedModel.java | 7 +-
.../apache/tika/detect/TrainedModelDetector.java | 22 +-
.../java/org/apache/tika/detect/TypeDetector.java | 2 +-
.../org/apache/tika/detect/XmlRootExtractor.java | 16 +-
.../apache/tika/detect/ZeroSizeFileDetector.java | 13 +-
.../java/org/apache/tika/embedder/Embedder.java | 37 +-
.../org/apache/tika/embedder/ExternalEmbedder.java | 243 +-
.../tika/exception/EncryptedDocumentException.java | 2 +-
.../apache/tika/exception/TikaConfigException.java | 1 +
.../org/apache/tika/exception/TikaException.java | 2 +-
.../tika/exception/TikaMemoryLimitException.java | 2 +-
.../apache/tika/exception/WriteLimitReached.java | 9 +-
.../tika/exception/ZeroByteFileException.java | 15 +-
.../apache/tika/extractor/ContainerExtractor.java | 23 +-
.../extractor/DefaultEmbeddedStreamTranslator.java | 8 +-
.../apache/tika/extractor/DocumentSelector.java | 2 +-
.../tika/extractor/EmbeddedDocumentExtractor.java | 7 +-
.../tika/extractor/EmbeddedDocumentUtil.java | 24 +-
.../tika/extractor/EmbeddedStreamTranslator.java | 4 +-
.../tika/extractor/ParserContainerExtractor.java | 8 +-
.../ParsingEmbeddedDocumentExtractor.java | 28 +-
.../org/apache/tika/fork/ClassLoaderProxy.java | 10 +-
.../org/apache/tika/fork/ClassLoaderResource.java | 7 +-
.../org/apache/tika/fork/ContentHandlerProxy.java | 54 +-
.../apache/tika/fork/ContentHandlerResource.java | 14 +-
.../main/java/org/apache/tika/fork/ForkClient.java | 184 +-
.../apache/tika/fork/ForkObjectInputStream.java | 43 +-
.../main/java/org/apache/tika/fork/ForkParser.java | 162 +-
.../java/org/apache/tika/fork/ForkResource.java | 4 +-
.../main/java/org/apache/tika/fork/ForkServer.java | 126 +-
.../org/apache/tika/fork/InputStreamProxy.java | 4 +-
.../org/apache/tika/fork/InputStreamResource.java | 3 +-
.../org/apache/tika/fork/MemoryURLConnection.java | 2 +-
.../apache/tika/fork/MemoryURLStreamHandler.java | 4 +-
.../tika/fork/MemoryURLStreamHandlerFactory.java | 2 +-
.../apache/tika/fork/MemoryURLStreamRecord.java | 2 +-
.../apache/tika/fork/MetadataContentHandler.java | 6 +-
.../org/apache/tika/fork/ParserFactoryFactory.java | 13 +-
.../fork/RecursiveMetadataContentHandlerProxy.java | 39 +-
.../RecursiveMetadataContentHandlerResource.java | 40 +-
.../org/apache/tika/io/BoundedInputStream.java | 4 +-
.../main/java/org/apache/tika/io/EndianUtils.java | 42 +-
.../java/org/apache/tika/io/FilenameUtils.java | 30 +-
.../src/main/java/org/apache/tika/io/IOUtils.java | 18 +-
.../org/apache/tika/io/InputStreamFactory.java | 17 +-
.../org/apache/tika/io/LookaheadInputStream.java | 10 +-
.../org/apache/tika/io/MappedBufferCleaner.java | 96 +-
.../main/java/org/apache/tika/io/TailStream.java | 141 +-
.../org/apache/tika/io/TemporaryResources.java | 8 +-
.../java/org/apache/tika/io/TikaInputStream.java | 314 ++-
.../apache/tika/language/LanguageIdentifier.java | 165 +-
.../org/apache/tika/language/LanguageProfile.java | 106 +-
.../tika/language/LanguageProfilerBuilder.java | 496 ++--
.../org/apache/tika/language/ProfilingHandler.java | 3 +-
.../org/apache/tika/language/ProfilingWriter.java | 2 +-
.../tika/language/detect/LanguageConfidence.java | 5 +-
.../tika/language/detect/LanguageDetector.java | 342 +--
.../tika/language/detect/LanguageHandler.java | 10 +-
.../apache/tika/language/detect/LanguageNames.java | 111 +-
.../tika/language/detect/LanguageResult.java | 158 +-
.../tika/language/detect/LanguageWriter.java | 10 +-
.../tika/language/translate/DefaultTranslator.java | 14 +-
.../tika/language/translate/EmptyTranslator.java | 2 +-
.../apache/tika/language/translate/Translator.java | 22 +-
.../apache/tika/metadata/AccessPermissions.java | 24 +-
.../org/apache/tika/metadata/ClimateForcast.java | 30 +-
.../org/apache/tika/metadata/CreativeCommons.java | 2 +-
.../java/org/apache/tika/metadata/Database.java | 14 +-
.../java/org/apache/tika/metadata/DublinCore.java | 76 +-
.../main/java/org/apache/tika/metadata/Font.java | 4 +-
.../java/org/apache/tika/metadata/Geographic.java | 15 +-
.../main/java/org/apache/tika/metadata/HTML.java | 6 +-
.../java/org/apache/tika/metadata/HttpHeaders.java | 4 +-
.../main/java/org/apache/tika/metadata/IPTC.java | 2527 ++++++++++----------
.../org/apache/tika/metadata/MachineMetadata.java | 143 +-
.../java/org/apache/tika/metadata/Message.java | 38 +-
.../java/org/apache/tika/metadata/Metadata.java | 241 +-
.../main/java/org/apache/tika/metadata/Office.java | 211 +-
.../apache/tika/metadata/OfficeOpenXMLCore.java | 52 +-
.../tika/metadata/OfficeOpenXMLExtended.java | 65 +-
.../main/java/org/apache/tika/metadata/PDF.java | 76 +-
.../java/org/apache/tika/metadata/PagedText.java | 4 +-
.../java/org/apache/tika/metadata/Photoshop.java | 31 +-
.../java/org/apache/tika/metadata/Property.java | 265 +-
.../tika/metadata/PropertyTypeException.java | 8 +-
.../java/org/apache/tika/metadata/QuattroPro.java | 66 +-
.../java/org/apache/tika/metadata/RTFMetadata.java | 45 +-
.../main/java/org/apache/tika/metadata/TIFF.java | 100 +-
.../apache/tika/metadata/TikaCoreProperties.java | 337 ++-
.../java/org/apache/tika/metadata/WordPerfect.java | 100 +-
.../main/java/org/apache/tika/metadata/XMP.java | 4 +-
.../main/java/org/apache/tika/metadata/XMPDM.java | 290 +--
.../main/java/org/apache/tika/metadata/XMPIdq.java | 4 +-
.../main/java/org/apache/tika/metadata/XMPMM.java | 42 +-
.../java/org/apache/tika/metadata/XMPRights.java | 20 +-
.../metadata/filter/ClearByMimeMetadataFilter.java | 13 +-
.../metadata/filter/CompositeMetadataFilter.java | 4 +-
.../metadata/filter/DefaultMetadataFilter.java | 20 +-
.../filter/ExcludeFieldMetadataFilter.java | 10 +-
.../metadata/filter/FieldNameMappingFilter.java | 25 +-
.../filter/IncludeFieldMetadataFilter.java | 11 +-
.../tika/metadata/filter/MetadataFilter.java | 4 +-
.../main/java/org/apache/tika/mime/HexCoDec.java | 49 +-
.../src/main/java/org/apache/tika/mime/Magic.java | 2 -
.../main/java/org/apache/tika/mime/MagicMatch.java | 12 +-
.../main/java/org/apache/tika/mime/MediaType.java | 273 +--
.../org/apache/tika/mime/MediaTypeRegistry.java | 67 +-
.../main/java/org/apache/tika/mime/MimeType.java | 283 ++-
.../org/apache/tika/mime/MimeTypeException.java | 6 +-
.../main/java/org/apache/tika/mime/MimeTypes.java | 247 +-
.../org/apache/tika/mime/MimeTypesFactory.java | 108 +-
.../java/org/apache/tika/mime/MimeTypesReader.java | 346 +--
.../apache/tika/mime/MimeTypesReaderMetKeys.java | 2 +-
.../org/apache/tika/mime/MinShouldMatchClause.java | 11 +-
.../main/java/org/apache/tika/mime/Patterns.java | 99 +-
.../mime/ProbabilisticMimeDetectionSelector.java | 93 +-
.../parser/AbstractEncodingDetectorParser.java | 1 +
.../org/apache/tika/parser/AbstractParser.java | 11 +-
.../org/apache/tika/parser/AutoDetectParser.java | 33 +-
.../tika/parser/AutoDetectParserFactory.java | 9 +-
.../org/apache/tika/parser/CompositeParser.java | 93 +-
.../java/org/apache/tika/parser/CryptoParser.java | 26 +-
.../java/org/apache/tika/parser/DefaultParser.java | 88 +-
.../org/apache/tika/parser/DelegatingParser.java | 13 +-
.../org/apache/tika/parser/DigestingParser.java | 63 +-
.../java/org/apache/tika/parser/EmptyParser.java | 20 +-
.../java/org/apache/tika/parser/ErrorParser.java | 12 +-
.../java/org/apache/tika/parser/NetworkParser.java | 66 +-
.../java/org/apache/tika/parser/ParseContext.java | 67 +-
.../main/java/org/apache/tika/parser/Parser.java | 23 +-
.../org/apache/tika/parser/ParserDecorator.java | 84 +-
.../java/org/apache/tika/parser/ParserFactory.java | 7 +-
.../apache/tika/parser/ParserPostProcessor.java | 11 +-
.../java/org/apache/tika/parser/ParsingReader.java | 130 +-
.../org/apache/tika/parser/PasswordProvider.java | 11 +-
.../apache/tika/parser/RecursiveParserWrapper.java | 177 +-
.../org/apache/tika/parser/StatefulParser.java | 2 +-
.../tika/parser/digest/CompositeDigester.java | 2 +-
.../tika/parser/digest/InputStreamDigester.java | 64 +-
.../parser/external/CompositeExternalParser.java | 25 +-
.../tika/parser/external/ExternalParser.java | 293 ++-
.../external/ExternalParsersConfigReader.java | 335 ++-
.../ExternalParsersConfigReaderMetKeys.java | 14 +-
.../parser/external/ExternalParsersFactory.java | 93 +-
.../parser/multiple/AbstractMultipleParser.java | 376 +--
.../tika/parser/multiple/FallbackParser.java | 36 +-
.../tika/parser/multiple/SupplementingParser.java | 50 +-
.../apache/tika/pipes/emitter/AbstractEmitter.java | 38 +-
.../org/apache/tika/pipes/emitter/EmitData.java | 25 +-
.../org/apache/tika/pipes/emitter/EmitKey.java | 18 +-
.../org/apache/tika/pipes/emitter/Emitter.java | 4 +-
.../apache/tika/pipes/emitter/EmitterManager.java | 16 +-
.../apache/tika/pipes/emitter/EmptyEmitter.java | 7 +-
.../apache/tika/pipes/emitter/StreamEmitter.java | 4 +-
.../apache/tika/pipes/fetcher/EmptyFetcher.java | 6 +-
.../org/apache/tika/pipes/fetcher/FetchKey.java | 22 +-
.../org/apache/tika/pipes/fetcher/Fetcher.java | 10 +-
.../apache/tika/pipes/fetcher/FetcherManager.java | 18 +-
.../tika/pipes/fetcher/FileSystemFetcher.java | 50 +-
.../pipes/fetchiterator/EmptyFetchIterator.java | 6 -
.../tika/pipes/fetchiterator/FetchEmitTuple.java | 38 +-
.../tika/pipes/fetchiterator/FetchIterator.java | 24 +-
.../fetchiterator/FileSystemFetchIterator.java | 41 +-
.../sax/AbstractRecursiveParserWrapperHandler.java | 53 +-
.../tika/sax/BasicContentHandlerFactory.java | 98 +-
.../org/apache/tika/sax/BodyContentHandler.java | 12 +-
.../java/org/apache/tika/sax/CleanPhoneText.java | 345 +--
.../apache/tika/sax/ContentHandlerDecorator.java | 18 +-
.../org/apache/tika/sax/ContentHandlerFactory.java | 15 +-
.../org/apache/tika/sax/DIFContentHandler.java | 242 +-
.../tika/sax/ElementMappingContentHandler.java | 81 +-
.../sax/EndDocumentShieldingContentHandler.java | 16 +-
.../tika/sax/ExpandedTitleContentHandler.java | 22 +-
.../src/main/java/org/apache/tika/sax/Link.java | 4 +-
.../main/java/org/apache/tika/sax/LinkBuilder.java | 12 +-
.../org/apache/tika/sax/LinkContentHandler.java | 30 +-
.../tika/sax/PhoneExtractingContentHandler.java | 20 +-
.../tika/sax/RecursiveParserWrapperHandler.java | 54 +-
.../apache/tika/sax/RichTextContentHandler.java | 3 +-
.../org/apache/tika/sax/SafeContentHandler.java | 155 +-
.../org/apache/tika/sax/SecureContentHandler.java | 86 +-
.../org/apache/tika/sax/StandardOrganizations.java | 305 +--
.../org/apache/tika/sax/StandardReference.java | 201 +-
.../sax/StandardsExtractingContentHandler.java | 155 +-
.../java/org/apache/tika/sax/StandardsText.java | 277 +--
.../org/apache/tika/sax/TaggedContentHandler.java | 8 +-
.../org/apache/tika/sax/TaggedSAXException.java | 6 +-
.../org/apache/tika/sax/TeeContentHandler.java | 18 +-
.../tika/sax/TextAndAttributeContentHandler.java | 12 +-
.../org/apache/tika/sax/TextContentHandler.java | 12 +-
.../org/apache/tika/sax/ToHTMLContentHandler.java | 10 +-
.../org/apache/tika/sax/ToTextContentHandler.java | 30 +-
.../org/apache/tika/sax/ToXMLContentHandler.java | 120 +-
.../apache/tika/sax/WriteOutContentHandler.java | 52 +-
.../org/apache/tika/sax/XHTMLContentHandler.java | 103 +-
.../org/apache/tika/sax/XMPContentHandler.java | 27 +-
.../apache/tika/sax/xpath/CompositeMatcher.java | 3 +-
.../java/org/apache/tika/sax/xpath/Matcher.java | 4 +-
.../tika/sax/xpath/MatchingContentHandler.java | 21 +-
.../org/apache/tika/sax/xpath/XPathParser.java | 10 +-
.../org/apache/tika/utils/AnnotationUtils.java | 61 +-
.../java/org/apache/tika/utils/CharsetUtils.java | 97 +-
.../java/org/apache/tika/utils/CompareUtils.java | 10 +-
.../org/apache/tika/utils/ConcurrentUtils.java | 112 +-
.../main/java/org/apache/tika/utils/DateUtils.java | 90 +-
.../java/org/apache/tika/utils/ExceptionUtils.java | 5 +-
.../java/org/apache/tika/utils/ParserUtils.java | 60 +-
.../java/org/apache/tika/utils/ProcessUtils.java | 2 +-
.../java/org/apache/tika/utils/RegexUtils.java | 20 +-
.../apache/tika/utils/RereadableInputStream.java | 261 +-
.../org/apache/tika/utils/ServiceLoaderUtils.java | 12 +-
.../java/org/apache/tika/utils/StringUtils.java | 26 +-
.../java/org/apache/tika/utils/SystemUtils.java | 15 +-
.../java/org/apache/tika/utils/XMLReaderUtils.java | 545 +++--
.../org/apache/custom/detect/MyCustomDetector.java | 6 +-
.../org/apache/tika/MultiThreadedTikaTest.java | 332 +--
.../apache/tika/ResourceLoggingClassLoader.java | 24 +-
.../org/apache/tika/TestRereadableInputStream.java | 144 +-
.../java/org/apache/tika/TikaDetectionTest.java | 51 +-
.../src/test/java/org/apache/tika/TikaIT.java | 5 +-
.../src/test/java/org/apache/tika/TikaTest.java | 444 ++--
.../org/apache/tika/TypeDetectionBenchmark.java | 18 +-
.../apache/tika/config/AbstractTikaConfigTest.java | 14 +-
.../java/org/apache/tika/config/DummyExecutor.java | 59 +-
.../java/org/apache/tika/config/DummyParser.java | 8 +-
.../java/org/apache/tika/config/ParamTest.java | 37 +-
.../tika/config/TikaConfigSerializerTest.java | 24 +-
.../org/apache/tika/config/TikaConfigTest.java | 163 +-
.../tika/detect/FileCommandDetectorTest.java | 40 +-
.../org/apache/tika/detect/MagicDetectorTest.java | 143 +-
.../tika/detect/MimeDetectionWithNNTest.java | 213 +-
.../org/apache/tika/detect/NameDetectorTest.java | 25 +-
.../org/apache/tika/detect/TextDetectorTest.java | 31 +-
.../org/apache/tika/detect/TypeDetectorTest.java | 31 +-
.../tika/detect/ZeroSizeFileDetectorTest.java | 5 +-
.../java/org/apache/tika/fork/ForkParserTest.java | 123 +-
.../apache/tika/fork/ForkParserTikaBinTest.java | 135 +-
.../java/org/apache/tika/fork/ForkTestParser.java | 21 +-
.../tika/fork/UpperCasingContentHandler.java | 7 +-
.../java/org/apache/tika/io/EndianUtilsTest.java | 38 +-
.../java/org/apache/tika/io/FilenameUtilsTest.java | 39 +-
.../apache/tika/io/LookaheadInputStreamTest.java | 20 +-
.../java/org/apache/tika/io/TailStreamTest.java | 87 +-
.../org/apache/tika/io/TemporaryResourcesTest.java | 6 +-
.../org/apache/tika/io/TikaInputStreamTest.java | 56 +-
.../tika/language/LanguageIdentifierTest.java | 44 +-
.../apache/tika/language/LanguageProfileTest.java | 2 +-
.../tika/language/LanguageProfilerBuilderTest.java | 32 +-
.../tika/language/detect/LanguageNamesTest.java | 26 +-
.../org/apache/tika/metadata/TestMetadata.java | 212 +-
.../tika/metadata/filter/MockUpperCaseFilter.java | 4 +-
.../tika/metadata/filter/TestMetadataFilter.java | 31 +-
.../org/apache/tika/mime/CustomReaderTest.java | 120 +-
.../java/org/apache/tika/mime/MediaTypeTest.java | 86 +-
.../org/apache/tika/mime/MimeDetectionTest.java | 135 +-
.../org/apache/tika/mime/MimeTypesReaderTest.java | 297 ++-
.../java/org/apache/tika/mime/PatternsTest.java | 18 +-
.../tika/mime/ProbabilisticMimeDetectionTest.java | 114 +-
.../ProbabilisticMimeDetectionTestWithTika.java | 100 +-
.../apache/tika/parser/CompositeParserTest.java | 158 +-
.../tika/parser/DummyInitializableParser.java | 29 +-
.../tika/parser/DummyParameterizedParser.java | 71 +-
.../java/org/apache/tika/parser/DummyParser.java | 57 +-
.../tika/parser/InitializableParserTest.java | 14 +-
.../tika/parser/ParameterizedParserTest.java | 35 +-
.../apache/tika/parser/ParserDecoratorTest.java | 57 +-
.../org/apache/tika/parser/mock/MockParser.java | 72 +-
.../apache/tika/parser/mock/MockParserFactory.java | 8 +-
.../org/apache/tika/parser/mock/VowelParser.java | 11 +-
.../tika/parser/multiple/MultipleParserTest.java | 137 +-
.../org/apache/tika/pipes/emitter/MockEmitter.java | 8 +-
.../tika/pipes/fetcher/FileSystemFetcherTest.java | 10 +-
.../fetchiterator/FileSystemFetchIteratorTest.java | 25 +-
.../tika/sax/BasicContentHandlerFactoryTest.java | 98 +-
.../apache/tika/sax/BodyContentHandlerTest.java | 9 +-
.../apache/tika/sax/LinkContentHandlerTest.java | 29 +-
.../apache/tika/sax/OfflineContentHandlerTest.java | 9 +-
.../tika/sax/RichTextContentHandlerTest.java | 15 +-
.../apache/tika/sax/SecureContentHandlerTest.java | 7 +-
.../java/org/apache/tika/sax/SerializerTest.java | 55 +-
.../apache/tika/sax/XHTMLContentHandlerTest.java | 77 +-
.../org/apache/tika/utils/AnnotationUtilsTest.java | 47 +-
.../org/apache/tika/utils/CharsetUtilsTest.java | 14 +-
.../org/apache/tika/utils/ConcurrentUtilsTest.java | 126 +-
.../java/org/apache/tika/utils/RegexUtilsTest.java | 31 +-
.../apache/tika/utils/ServiceLoaderUtilsTest.java | 28 +-
tika-core/src/test/resources/log4j.properties | 1 +
.../org/apache/tika/config/FileCommandDetector.xml | 2 +-
.../org/apache/tika/config/TIKA-1762-executors.xml | 62 +-
.../apache/tika/fuzzing/general/ByteFlipper.java | 2 +-
tika-parent/checkstyle.xml | 139 ++
tika-parent/pom.xml | 42 +-
tika-parsers/pom.xml | 38 +-
.../tika/parser/recognition/AgeRecogniser.java | 182 +-
.../parser/recognition/AgeRecogniserConfig.java | 59 +-
.../tika/parser/recognition/AgeRecogniserTest.java | 53 +-
.../tika/dl/imagerec/DL4JInceptionV3Net.java | 138 +-
.../org/apache/tika/dl/imagerec/DL4JVGG16Net.java | 73 +-
.../tika/dl/imagerec/DL4JInceptionV3NetTest.java | 14 +-
.../apache/tika/dl/imagerec/DL4JVGG16NetTest.java | 15 +-
.../tika/parser/captioning/CaptionObject.java | 6 +-
.../captioning/tf/TensorflowRESTCaptioner.java | 44 +-
.../tika/parser/pot/PooledTimeSeriesParser.java | 85 +-
.../tika/parser/recognition/ObjectRecogniser.java | 36 +-
.../recognition/ObjectRecognitionParser.java | 50 +-
.../tika/parser/recognition/RecognisedObject.java | 7 +-
.../recognition/tf/TensorflowImageRecParser.java | 97 +-
.../recognition/tf/TensorflowRESTRecogniser.java | 54 +-
.../tf/TensorflowRESTVideoRecogniser.java | 30 +-
.../tika/parser/captioning/tf/model_info.xml | 3 +-
.../recognition/tika-config-tflow-video-rest.xml | 3 +-
.../recognition/ObjectRecognitionParserTest.java | 96 +-
.../tf/TensorflowImageRecParserTest.java | 28 +-
.../tf/TensorflowVideoRecParserTest.java | 30 +-
.../parser/ctakes/CTAKESAnnotationProperty.java | 16 +-
.../apache/tika/parser/ctakes/CTAKESConfig.java | 249 +-
.../tika/parser/ctakes/CTAKESContentHandler.java | 178 +-
.../apache/tika/parser/ctakes/CTAKESParser.java | 42 +-
.../tika/parser/ctakes/CTAKESSerializer.java | 5 +-
.../org/apache/tika/parser/ctakes/CTAKESUtils.java | 423 ++--
.../java/org/apache/tika/parser/geo/GeoParser.java | 85 +-
.../apache/tika/parser/geo/GeoParserConfig.java | 33 +-
.../java/org/apache/tika/parser/geo/GeoTag.java | 73 +-
.../tika/parser/geo/NameEntityExtractor.java | 23 +-
.../parser/geo/gazetteer/GeoGazetteerClient.java | 157 +-
.../apache/tika/parser/geo/gazetteer/Location.java | 107 +-
.../tika/parser/journal/GrobidRESTParser.java | 62 +-
.../apache/tika/parser/journal/JournalParser.java | 44 +-
.../apache/tika/parser/journal/TEIDOMParser.java | 159 +-
.../org/apache/tika/parser/ner/NERecogniser.java | 8 +-
.../apache/tika/parser/ner/NamedEntityParser.java | 76 +-
.../parser/ner/corenlp/CoreNLPNERecogniser.java | 93 +-
.../tika/parser/ner/grobid/GrobidNERecogniser.java | 208 +-
.../tika/parser/ner/mitie/MITIENERecogniser.java | 115 +-
.../tika/parser/ner/nltk/NLTKNERecogniser.java | 69 +-
.../parser/ner/opennlp/OpenNLPNERecogniser.java | 42 +-
.../tika/parser/ner/opennlp/OpenNLPNameFinder.java | 37 +-
.../tika/parser/ner/regex/RegexNERecogniser.java | 31 +-
.../parser/sentiment/SentimentAnalysisParser.java | 33 +-
.../tika/parser/ctakes/CTAKESConfig.properties | 2 +-
.../tika/parser/geo/GeoTopicConfig.properties | 2 +-
.../tika/parser/journal/GrobidExtractor.properties | 2 +-
.../tika/parser/ner/grobid/GrobidServer.properties | 4 +-
.../tika/parser/ner/nltk/NLTKServer.properties | 2 +-
.../org/apache/tika/parser/geo/GeoParserTest.java | 147 +-
.../tika/parser/journal/JournalParserTest.java | 5 +-
.../org/apache/tika/parser/journal/TEITest.java | 32 +-
.../tika/parser/ner/NamedEntityParserTest.java | 29 +-
.../tika/parser/ner/nltk/NLTKNERecogniserTest.java | 20 +-
.../parser/ner/regex/RegexNERecogniserTest.java | 18 +-
.../sentiment/SentimentAnalysisParserTest.java | 49 +-
.../tika/config/TIKA-3078-geo.topic.GeoParser.xml | 22 +-
tika-parsers/tika-parsers-classic/pom.xml | 89 +-
.../apache/tika/detect/apple/BPListDetector.java | 66 +-
.../apache/tika/detect/apple/IWorkDetector.java | 18 +-
.../tika/parser/apple/AppleSingleFileParser.java | 60 +-
.../org/apache/tika/parser/apple/PListParser.java | 92 +-
.../tika/parser/iwork/AutoPageNumberUtils.java | 146 +-
.../tika/parser/iwork/IWorkPackageParser.java | 271 ++-
.../tika/parser/iwork/KeynoteContentHandler.java | 36 +-
.../tika/parser/iwork/NumbersContentHandler.java | 16 +-
.../tika/parser/iwork/PagesContentHandler.java | 436 ++--
.../parser/iwork/iwana/IWork13PackageParser.java | 198 +-
.../parser/iwork/iwana/IWork18PackageParser.java | 180 +-
.../apache/tika/parser/apple/PListParserTest.java | 11 +-
.../tika/parser/iwork/AutoPageNumberUtilsTest.java | 85 +-
.../apache/tika/parser/iwork/IWorkParserTest.java | 141 +-
.../tika/parser/iwork/iwana/IWork13ParserTest.java | 23 +-
.../org/apache/tika/parser/audio/AudioParser.java | 32 +-
.../org/apache/tika/parser/audio/MidiParser.java | 41 +-
.../org/apache/tika/parser/mp3/AudioFrame.java | 239 +-
.../java/org/apache/tika/parser/mp3/ID3Tags.java | 294 +--
.../org/apache/tika/parser/mp3/ID3v1Handler.java | 103 +-
.../org/apache/tika/parser/mp3/ID3v22Handler.java | 71 +-
.../org/apache/tika/parser/mp3/ID3v23Handler.java | 31 +-
.../org/apache/tika/parser/mp3/ID3v24Handler.java | 35 +-
.../org/apache/tika/parser/mp3/ID3v2Frame.java | 418 ++--
.../org/apache/tika/parser/mp3/LyricsHandler.java | 82 +-
.../java/org/apache/tika/parser/mp3/MP3Frame.java | 2 +-
.../java/org/apache/tika/parser/mp3/Mp3Parser.java | 210 +-
.../org/apache/tika/parser/mp3/MpegStream.java | 445 ++--
.../apache/tika/parser/mp4/ISO6709Extractor.java | 26 +-
.../java/org/apache/tika/parser/mp4/MP4Parser.java | 172 +-
.../org/apache/tika/parser/video/FLVParser.java | 81 +-
.../apache/tika/parser/audio/AudioParserTest.java | 15 +-
.../apache/tika/parser/audio/MidiParserTest.java | 9 +-
.../org/apache/tika/parser/mp3/Mp3ParserTest.java | 151 +-
.../org/apache/tika/parser/mp3/MpegStreamTest.java | 93 +-
.../org/apache/tika/parser/mp4/MP4ParserTest.java | 30 +-
.../apache/tika/parser/video/FLVParserTest.java | 7 +-
.../java/org/apache/tika/parser/dwg/DWGParser.java | 336 ++-
.../java/org/apache/tika/parser/prt/PRTParser.java | 413 ++--
.../org/apache/tika/parser/dwg/DWGParserTest.java | 99 +-
.../org/apache/tika/parser/prt/PRTParserTest.java | 135 +-
.../org/apache/tika/parser/asm/ClassParser.java | 17 +-
.../apache/tika/parser/asm/XHTMLClassVisitor.java | 45 +-
.../apache/tika/parser/code/SourceCodeParser.java | 36 +-
.../tika/parser/executable/ExecutableParser.java | 656 ++---
.../java/org/apache/tika/parser/mat/MatParser.java | 44 +-
.../org/apache/tika/parser/sas/SAS7BDATParser.java | 60 +-
.../apache/tika/parser/asm/ClassParserTest.java | 28 +-
.../tika/parser/code/SourceCodeParserTest.java | 56 +-
.../parser/executable/ExecutableParserTest.java | 32 +-
.../org/apache/tika/parser/mat/MatParserTest.java | 3 +-
.../apache/tika/parser/sas/SAS7BDATParserTest.java | 37 +-
.../org/apache/tika/parser/crypto/Pkcs7Parser.java | 41 +-
.../org/apache/tika/parser/crypto/TSDParser.java | 257 +-
.../apache/tika/parser/crypto/Pkcs7ParserTest.java | 3 +-
.../apache/tika/parser/crypto/TSDParserTest.java | 16 +-
.../parser/digestutils/BouncyCastleDigester.java | 13 +-
.../tika/parser/digestutils/CommonsDigester.java | 69 +-
.../tika/parser/font/AdobeFontMetricParser.java | 167 +-
.../apache/tika/parser/font/TrueTypeParser.java | 27 +-
.../apache/tika/parser/font/FontParsersTest.java | 28 +-
.../sax/boilerpipe/BoilerpipeContentHandler.java | 46 +-
.../org/apache/tika/parser/html/DataURIScheme.java | 13 +-
.../parser/html/DataURISchemeParseException.java | 4 +-
.../apache/tika/parser/html/DataURISchemeUtil.java | 14 +-
.../apache/tika/parser/html/DefaultHtmlMapper.java | 122 +-
.../tika/parser/html/HtmlEncodingDetector.java | 66 +-
.../org/apache/tika/parser/html/HtmlHandler.java | 104 +-
.../org/apache/tika/parser/html/HtmlParser.java | 83 +-
.../tika/parser/html/XHTMLDowngradeHandler.java | 20 +-
.../html/charsetdetector/CharsetAliases.java | 55 +-
.../charsetdetector/CharsetDetectionResult.java | 12 +-
.../parser/html/charsetdetector/MetaProcessor.java | 18 +-
.../parser/html/charsetdetector/PreScanner.java | 83 +-
.../StandardHtmlEncodingDetector.java | 28 +-
.../charsets/XUserDefinedCharset.java | 8 +-
.../tika/parser/html/DataURISchemeParserTest.java | 19 +-
.../tika/parser/html/HtmlEncodingDetectorTest.java | 60 +-
.../apache/tika/parser/html/HtmlParserTest.java | 705 +++---
.../html/StandardHtmlEncodingDetectorTest.java | 139 +-
.../tika/parser/image/AbstractImageParser.java | 46 +-
.../org/apache/tika/parser/image/BPGParser.java | 30 +-
.../org/apache/tika/parser/image/HeifParser.java | 33 +-
.../org/apache/tika/parser/image/ICNSParser.java | 55 +-
.../org/apache/tika/parser/image/ICNSType.java | 241 +-
.../tika/parser/image/ImageMetadataExtractor.java | 159 +-
.../org/apache/tika/parser/image/ImageParser.java | 63 +-
.../org/apache/tika/parser/image/JpegParser.java | 12 +-
.../apache/tika/parser/image/MetadataFields.java | 5 +-
.../org/apache/tika/parser/image/PSDParser.java | 43 +-
.../org/apache/tika/parser/image/TiffParser.java | 11 +-
.../org/apache/tika/parser/image/WebPParser.java | 11 +-
.../apache/tika/parser/image/HeifParserTest.java | 16 +-
.../apache/tika/parser/image/ICNSParserTest.java | 33 +-
.../parser/image/ImageMetadataExtractorTest.java | 17 +-
.../apache/tika/parser/image/ImageParserTest.java | 110 +-
.../apache/tika/parser/image/JpegParserTest.java | 76 +-
.../apache/tika/parser/image/PSDParserTest.java | 13 +-
.../apache/tika/parser/image/WebPParserTest.java | 3 +-
.../apache/tika/parser/jdbc/AbstractDBParser.java | 33 +-
.../apache/tika/parser/jdbc/JDBCTableReader.java | 79 +-
.../apache/tika/parser/mailcommons/MailUtil.java | 9 +-
.../tika/parser/mailcommons/MailUtilTest.java | 11 +-
.../tika/parser/mail/MailContentHandler.java | 207 +-
.../org/apache/tika/parser/mail/RFC822Parser.java | 30 +-
.../org/apache/tika/parser/mbox/MboxParser.java | 36 +-
.../apache/tika/parser/mail/RFC822ParserTest.java | 175 +-
.../apache/tika/parser/mbox/MboxParserTest.java | 25 +-
.../detect/microsoft/POIFSContainerDetector.java | 87 +-
.../detect/microsoft/ooxml/OPCPackageDetector.java | 224 +-
.../microsoft/MSEmbeddedStreamTranslator.java | 24 +-
.../tika/parser/microsoft/AbstractListManager.java | 23 +-
.../parser/microsoft/AbstractOfficeParser.java | 61 +-
.../parser/microsoft/AbstractPOIFSExtractor.java | 57 +-
.../org/apache/tika/parser/microsoft/Cell.java | 3 +-
.../tika/parser/microsoft/CellDecorator.java | 3 +-
.../apache/tika/parser/microsoft/EMFParser.java | 78 +-
.../tika/parser/microsoft/ExcelExtractor.java | 142 +-
.../tika/parser/microsoft/FormattingUtils.java | 23 +-
.../tika/parser/microsoft/HSLFExtractor.java | 124 +-
.../tika/parser/microsoft/JackcessExtractor.java | 81 +-
.../tika/parser/microsoft/JackcessParser.java | 33 +-
.../apache/tika/parser/microsoft/LinkedCell.java | 3 +-
.../apache/tika/parser/microsoft/ListManager.java | 33 +-
.../tika/parser/microsoft/MSOwnerFileParser.java | 37 +-
.../apache/tika/parser/microsoft/NumberCell.java | 3 +-
.../apache/tika/parser/microsoft/OfficeParser.java | 146 +-
.../tika/parser/microsoft/OfficeParserConfig.java | 69 +-
.../tika/parser/microsoft/OldExcelParser.java | 23 +-
.../tika/parser/microsoft/OutlookExtractor.java | 361 ++-
.../tika/parser/microsoft/SummaryExtractor.java | 68 +-
.../apache/tika/parser/microsoft/TNEFParser.java | 44 +-
.../org/apache/tika/parser/microsoft/TextCell.java | 3 +-
.../parser/microsoft/TikaExcelDataFormatter.java | 11 +-
.../parser/microsoft/TikaExcelGeneralFormat.java | 2 +-
.../apache/tika/parser/microsoft/WMFParser.java | 24 +-
.../tika/parser/microsoft/WordExtractor.java | 98 +-
.../tika/parser/microsoft/chm/ChmAccessor.java | 10 +-
.../tika/parser/microsoft/chm/ChmAssert.java | 139 +-
.../tika/parser/microsoft/chm/ChmBlockInfo.java | 103 +-
.../tika/parser/microsoft/chm/ChmCommons.java | 293 +--
.../tika/parser/microsoft/chm/ChmConstants.java | 54 +-
.../microsoft/chm/ChmDirectoryListingSet.java | 234 +-
.../tika/parser/microsoft/chm/ChmExtractor.java | 284 +--
.../tika/parser/microsoft/chm/ChmItsfHeader.java | 192 +-
.../tika/parser/microsoft/chm/ChmItspHeader.java | 271 +--
.../tika/parser/microsoft/chm/ChmLzxBlock.java | 455 ++--
.../tika/parser/microsoft/chm/ChmLzxState.java | 262 +-
.../parser/microsoft/chm/ChmLzxcControlData.java | 147 +-
.../parser/microsoft/chm/ChmLzxcResetTable.java | 129 +-
.../tika/parser/microsoft/chm/ChmParser.java | 39 +-
.../tika/parser/microsoft/chm/ChmPmgiHeader.java | 51 +-
.../tika/parser/microsoft/chm/ChmPmglHeader.java | 62 +-
.../tika/parser/microsoft/chm/ChmSection.java | 61 +-
.../tika/parser/microsoft/chm/ChmWrapper.java | 12 +-
.../microsoft/chm/DirectoryListingEntry.java | 41 +-
.../tika/parser/microsoft/onenote/CompactID.java | 9 +-
.../tika/parser/microsoft/onenote/Error.java | 11 +-
.../parser/microsoft/onenote/ExtendedGUID.java | 11 +-
.../microsoft/onenote/FileChunkReference.java | 20 +-
.../tika/parser/microsoft/onenote/FileNode.java | 102 +-
.../microsoft/onenote/FileNodeListHeader.java | 32 +-
.../tika/parser/microsoft/onenote/FileNodePtr.java | 3 +-
.../parser/microsoft/onenote/FileNodeUnion.java | 33 +-
.../microsoft/onenote/FndStructureConstants.java | 44 +-
.../apache/tika/parser/microsoft/onenote/GUID.java | 65 +-
.../apache/tika/parser/microsoft/onenote/JCID.java | 38 +-
.../microsoft/onenote/JCIDPropertySetTypeEnum.java | 76 +-
.../onenote/ObjectDeclarationWithRefCount.java | 27 +-
.../onenote/ObjectDeclarationWithRefCountBody.java | 3 +-
.../onenote/ObjectSpaceObjectPropSet.java | 12 +-
...ctSpaceObjectStreamOfOIDsOSIDsOrContextIDs.java | 6 +-
.../onenote/OneNoteDirectFileResource.java | 9 +-
.../parser/microsoft/onenote/OneNoteDocument.java | 13 +-
.../parser/microsoft/onenote/OneNoteHeader.java | 18 +-
.../onenote/OneNoteLegacyDumpStrings.java | 47 +-
.../parser/microsoft/onenote/OneNoteParser.java | 149 +-
.../microsoft/onenote/OneNotePropertyEnum.java | 208 +-
.../microsoft/onenote/OneNotePropertyId.java | 7 +-
.../tika/parser/microsoft/onenote/OneNotePtr.java | 515 ++--
.../microsoft/onenote/OneNoteTreeWalker.java | 215 +-
.../onenote/OneNoteTreeWalkerOptions.java | 14 +-
.../parser/microsoft/onenote/PropertyIDType.java | 7 +-
.../tika/parser/microsoft/onenote/PropertySet.java | 37 +-
.../parser/microsoft/onenote/PropertyValue.java | 20 +-
.../tika/parser/microsoft/onenote/Revision.java | 23 +-
.../microsoft/onenote/RootObjectReference.java | 3 +-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 168 +-
.../parser/microsoft/ooxml/MetadataExtractor.java | 79 +-
.../parser/microsoft/ooxml/OOXMLExtractor.java | 10 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 97 +-
.../tika/parser/microsoft/ooxml/OOXMLParser.java | 60 +-
.../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 72 +-
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 110 +-
.../ooxml/POIXMLTextExtractorDecorator.java | 8 +-
.../microsoft/ooxml/ParagraphProperties.java | 18 +-
.../tika/parser/microsoft/ooxml/RunProperties.java | 9 +-
.../ooxml/SXSLFPowerPointExtractorDecorator.java | 258 +-
.../ooxml/SXWPFWordExtractorDecorator.java | 93 +-
.../ooxml/XSLFPowerPointExtractorDecorator.java | 95 +-
.../ooxml/XSSFBExcelExtractorDecorator.java | 48 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 181 +-
.../parser/microsoft/ooxml/XWPFListManager.java | 21 +-
.../ooxml/XWPFWordExtractorDecorator.java | 124 +-
.../microsoft/ooxml/xps/XPSExtractorDecorator.java | 122 +-
.../microsoft/ooxml/xps/XPSPageContentHandler.java | 98 +-
.../microsoft/ooxml/xps/XPSTextExtractor.java | 7 +-
.../xslf/XSLFEventBasedPowerPointExtractor.java | 16 +-
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 63 +-
.../microsoft/ooxml/xwpf/XWPFStylesShim.java | 28 +-
.../ooxml/xwpf/ml2006/AbstractPartHandler.java | 11 +-
.../ooxml/xwpf/ml2006/BinaryDataHandler.java | 12 +-
.../ooxml/xwpf/ml2006/CorePropertiesHandler.java | 17 +-
.../xwpf/ml2006/ExtendedPropertiesHandler.java | 3 +-
.../microsoft/ooxml/xwpf/ml2006/PartHandler.java | 7 +-
.../ooxml/xwpf/ml2006/RelationshipsHandler.java | 5 +-
.../ooxml/xwpf/ml2006/RelationshipsManager.java | 3 +-
.../ooxml/xwpf/ml2006/Word2006MLDocHandler.java | 40 +-
.../ooxml/xwpf/ml2006/Word2006MLParser.java | 21 +-
.../ml2006/WordAndPowerPointTextPartHandler.java | 17 +-
.../parser/microsoft/pst/OutlookPSTParser.java | 62 +-
.../parser/microsoft/rtf/RTFEmbObjHandler.java | 40 +-
.../parser/microsoft/rtf/RTFObjDataParser.java | 62 +-
.../tika/parser/microsoft/rtf/RTFParser.java | 53 +-
.../tika/parser/microsoft/rtf/TextExtractor.java | 93 +-
.../microsoft/xml/AbstractXML2003Parser.java | 41 +-
.../parser/microsoft/xml/HyperlinkHandler.java | 23 +-
.../parser/microsoft/xml/SpreadsheetMLParser.java | 50 +-
.../tika/parser/microsoft/xml/WordMLParser.java | 75 +-
.../AbstractPOIContainerExtractionTest.java | 16 +-
.../tika/parser/microsoft/EMFParserTest.java | 13 +-
.../tika/parser/microsoft/ExcelParserTest.java | 69 +-
.../tika/parser/microsoft/JackcessParserTest.java | 40 +-
.../parser/microsoft/MSOwnerFileParserTest.java | 7 +-
.../tika/parser/microsoft/OfficeParserTest.java | 4 +-
.../tika/parser/microsoft/OldExcelParserTest.java | 13 +-
.../tika/parser/microsoft/OutlookParserTest.java | 83 +-
.../microsoft/POIContainerExtractionTest.java | 19 +-
.../parser/microsoft/PowerPointParserTest.java | 47 +-
.../tika/parser/microsoft/ProjectParserTest.java | 27 +-
.../tika/parser/microsoft/PublisherParserTest.java | 13 +-
.../parser/microsoft/SolidworksParserTest.java | 46 +-
.../tika/parser/microsoft/TNEFParserTest.java | 9 +-
.../tika/parser/microsoft/VisioParserTest.java | 13 +-
.../tika/parser/microsoft/WMFParserTest.java | 9 +-
.../tika/parser/microsoft/WordParserTest.java | 123 +-
.../parser/microsoft/WriteProtectedParserTest.java | 9 +-
.../parser/microsoft/chm/TestChmBlockInfo.java | 50 +-
.../parser/microsoft/chm/TestChmExtraction.java | 161 +-
.../parser/microsoft/chm/TestChmExtractor.java | 16 +-
.../parser/microsoft/chm/TestChmItsfHeader.java | 40 +-
.../parser/microsoft/chm/TestChmItspHeader.java | 60 +-
.../tika/parser/microsoft/chm/TestChmLzxState.java | 37 +-
.../microsoft/chm/TestChmLzxcControlData.java | 54 +-
.../microsoft/chm/TestChmLzxcResetTable.java | 59 +-
.../microsoft/chm/TestDirectoryListingEntry.java | 9 +-
.../tika/parser/microsoft/chm/TestParameters.java | 34 +-
.../tika/parser/microsoft/chm/TestPmglHeader.java | 24 +-
.../microsoft/onenote/OneNoteParserTest.java | 83 +-
.../ooxml/OOXMLContainerExtractionTest.java | 24 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 274 +--
.../parser/microsoft/ooxml/SXSLFExtractorTest.java | 122 +-
.../parser/microsoft/ooxml/SXWPFExtractorTest.java | 97 +-
.../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 41 +-
.../parser/microsoft/ooxml/xps/XPSParserTest.java | 68 +-
.../ooxml/xwpf/ml2006/Word2006MLParserTest.java | 29 +-
.../parser/microsoft/pst/OutlookPSTParserTest.java | 84 +-
.../tika/parser/microsoft/rtf/RTFParserTest.java | 91 +-
.../parser/microsoft/xml/XML2003ParserTest.java | 35 +-
.../test-documents/testXPSWithDataDescriptor.xps | Bin 0 -> 44523 bytes
.../test-documents/testXPSWithDataDescriptor2.xps | Bin 0 -> 51175 bytes
.../apache/tika/detect/ole/MiscOLEDetector.java | 69 +-
.../java/org/apache/tika/parser/dbf/DBFCell.java | 30 +-
.../apache/tika/parser/dbf/DBFColumnHeader.java | 68 +-
.../org/apache/tika/parser/dbf/DBFFileHeader.java | 46 +-
.../java/org/apache/tika/parser/dbf/DBFParser.java | 34 +-
.../java/org/apache/tika/parser/dbf/DBFReader.java | 167 +-
.../java/org/apache/tika/parser/dbf/DBFRow.java | 16 +-
.../apache/tika/parser/dif/DIFContentHandler.java | 241 +-
.../java/org/apache/tika/parser/dif/DIFParser.java | 85 +-
.../apache/tika/parser/epub/EpubContentParser.java | 29 +-
.../org/apache/tika/parser/epub/EpubParser.java | 124 +-
.../apache/tika/parser/hwp/HwpStreamReader.java | 2 +-
.../apache/tika/parser/hwp/HwpTextExtractorV5.java | 111 +-
.../org/apache/tika/parser/hwp/HwpV5Parser.java | 16 +-
.../apache/tika/parser/mif/MIFContentHandler.java | 17 +-
.../org/apache/tika/parser/mif/MIFExtractor.java | 34 +-
.../java/org/apache/tika/parser/mif/MIFParser.java | 42 +-
.../parser/odf/FlatOpenDocumentMacroHandler.java | 43 +-
.../tika/parser/odf/FlatOpenDocumentParser.java | 101 +-
.../parser/odf/NSNormalizerContentHandler.java | 29 +-
.../tika/parser/odf/OpenDocumentBodyHandler.java | 311 +--
.../tika/parser/odf/OpenDocumentContentParser.java | 40 +-
.../tika/parser/odf/OpenDocumentMacroHandler.java | 16 +-
.../parser/odf/OpenDocumentManifestHandler.java | 35 +-
.../tika/parser/odf/OpenDocumentMetaParser.java | 101 +-
.../apache/tika/parser/odf/OpenDocumentParser.java | 156 +-
.../tika/parser/wordperfect/QPWTextExtractor.java | 251 +-
.../tika/parser/wordperfect/QuattroProParser.java | 34 +-
.../tika/parser/wordperfect/WP5Charsets.java | 289 ++-
.../wordperfect/WP5DocumentAreaExtractor.java | 66 +-
.../tika/parser/wordperfect/WP6Charsets.java | 750 +++---
.../wordperfect/WP6DocumentAreaExtractor.java | 58 +-
.../wordperfect/WPDocumentAreaExtractor.java | 23 +-
.../tika/parser/wordperfect/WPInputStream.java | 25 +-
.../tika/parser/wordperfect/WPPrefixArea.java | 37 +-
.../parser/wordperfect/WPPrefixAreaExtractor.java | 10 +-
.../tika/parser/wordperfect/WordPerfectParser.java | 78 +-
.../org/apache/tika/parser/dbf/DBFParserTest.java | 36 +-
.../org/apache/tika/parser/dif/DIFParserTest.java | 25 +-
.../apache/tika/parser/epub/EpubParserTest.java | 30 +-
.../apache/tika/parser/hwp/HwpV5ParserTest.java | 17 +-
.../tika/parser/ibooks/iBooksParserTest.java | 18 +-
.../org/apache/tika/parser/mif/MIFParserTest.java | 9 +-
.../org/apache/tika/parser/odf/ODFParserTest.java | 265 +-
.../tika/parser/wordperfect/QuattroProTest.java | 12 +-
.../tika/parser/wordperfect/WPInputStreamTest.java | 14 +-
.../tika/parser/wordperfect/WordPerfectTest.java | 20 +-
.../resources/test-documents/testODTEncrypted.odt | Bin 0 -> 12714 bytes
.../org/apache/tika/parser/feed/FeedParser.java | 76 +-
.../apache/tika/parser/iptc/IptcAnpaParser.java | 1404 +++++------
.../apache/tika/parser/feed/FeedParserTest.java | 23 +-
.../apache/tika/parser/ocr/ImagePreprocessor.java | 67 +-
.../apache/tika/parser/ocr/TesseractOCRConfig.java | 197 +-
.../apache/tika/parser/ocr/TesseractOCRParser.java | 364 ++-
.../apache/tika/parser/ocr/tess4j/ImageDeskew.java | 10 +-
.../apache/tika/parser/ocr/tess4j/ImageUtil.java | 17 +-
.../tika/parser/ocr/TesseractOCRConfigTest.java | 149 +-
.../tika/parser/ocr/TesseractOCRParserTest.java | 101 +-
.../resources/test-configs/TIKA-2705-tesseract.xml | 26 +-
.../tika-config-tesseract-arbitrary.xml | 22 +-
.../test-configs/tika-config-tesseract-full.xml | 38 +-
.../tika-config-tesseract-load-langs.xml | 20 +-
.../test-configs/tika-config-tesseract-partial.xml | 32 +-
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 484 ++--
.../org/apache/tika/parser/pdf/AccessChecker.java | 18 +-
.../tika/parser/pdf/ImageGraphicsEngine.java | 290 ++-
.../java/org/apache/tika/parser/pdf/OCR2XHTML.java | 20 +-
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 97 +-
.../tika/parser/pdf/PDFEncodedStringDecoder.java | 6 +-
.../tika/parser/pdf/PDFMarkedContent2XHTML.java | 207 +-
.../java/org/apache/tika/parser/pdf/PDFParser.java | 133 +-
.../apache/tika/parser/pdf/PDFParserConfig.java | 360 +--
.../apache/tika/parser/pdf/PDFPreflightParser.java | 82 +-
.../tika/parser/pdf/PDMetadataExtractor.java | 54 +-
.../org/apache/tika/parser/pdf/XFAExtractor.java | 75 +-
.../apache/tika/parser/pdf/AccessCheckerTest.java | 6 +-
.../parser/pdf/PDFMarkedContent2XHTMLTest.java | 21 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 324 +--
.../tika/parser/pdf/PDFPreflightParserTest.java | 18 +-
.../testPDF_deeplyEmbeddedAttachments.pdf | Bin 0 -> 122221 bytes
.../apache/tika/parser/pkg/CompressorParser.java | 54 +-
.../org/apache/tika/parser/pkg/PackageParser.java | 223 +-
.../java/org/apache/tika/parser/pkg/RarParser.java | 30 +-
.../apache/tika/parser/pkg/AbstractPkgTest.java | 94 +-
.../org/apache/tika/parser/pkg/ArParserTest.java | 11 +-
.../apache/tika/parser/pkg/Bzip2ParserTest.java | 37 +-
.../apache/tika/parser/pkg/CompressParserTest.java | 39 +-
.../tika/parser/pkg/CompressorParserTest.java | 21 +-
.../org/apache/tika/parser/pkg/GzipParserTest.java | 29 +-
.../apache/tika/parser/pkg/PackageParserTest.java | 19 +-
.../org/apache/tika/parser/pkg/RarParserTest.java | 99 +-
.../apache/tika/parser/pkg/Seven7ParserTest.java | 69 +-
.../org/apache/tika/parser/pkg/TarParserTest.java | 67 +-
.../org/apache/tika/parser/pkg/ZipParserTest.java | 98 +-
.../org/apache/tika/parser/pkg/ZlibParserTest.java | 33 +-
.../src/test/resources/test-documents/testSVG.svg | 8 +-
.../java/org/apache/tika/parser/csv/CSVParams.java | 4 +-
.../java/org/apache/tika/parser/csv/CSVResult.java | 17 +-
.../org/apache/tika/parser/csv/CSVSniffer.java | 84 +-
.../apache/tika/parser/csv/TextAndCSVParser.java | 169 +-
.../tika/parser/strings/Latin1StringsParser.java | 145 +-
.../apache/tika/parser/strings/StringsConfig.java | 163 +-
.../tika/parser/strings/StringsEncoding.java | 62 +-
.../apache/tika/parser/strings/StringsParser.java | 495 ++--
.../apache/tika/parser/txt/CharsetDetector.java | 46 +-
.../org/apache/tika/parser/txt/CharsetMatch.java | 10 +-
.../apache/tika/parser/txt/CharsetRecog_2022.java | 20 +-
.../apache/tika/parser/txt/CharsetRecog_UTF8.java | 8 +-
.../tika/parser/txt/CharsetRecog_Unicode.java | 2 +-
.../apache/tika/parser/txt/CharsetRecog_mbcs.java | 113 +-
.../apache/tika/parser/txt/CharsetRecog_sbcs.java | 1801 ++++++++------
.../apache/tika/parser/txt/CharsetRecognizer.java | 2 +-
.../tika/parser/txt/Icu4jEncodingDetector.java | 17 +-
.../java/org/apache/tika/parser/txt/TXTParser.java | 26 +-
.../tika/parser/txt/UniversalEncodingDetector.java | 16 +-
.../tika/parser/txt/UniversalEncodingListener.java | 21 +-
.../org/apache/tika/parser/csv/CSVSnifferTest.java | 51 +-
.../tika/parser/csv/TextAndCSVParserTest.java | 112 +-
.../parser/strings/Latin1StringsParserTest.java | 28 +-
.../tika/parser/strings/StringsConfigTest.java | 111 +-
.../tika/parser/strings/StringsParserTest.java | 89 +-
.../tika/parser/txt/CharsetDetectorTest.java | 21 +-
.../org/apache/tika/parser/txt/TXTParserTest.java | 126 +-
.../test-configs/tika-config-strings-full.xml | 18 +-
.../test-configs/tika-config-strings-partial.xml | 16 +-
.../src/test/resources/test-documents/resume.html | 140 +-
.../tika/parser/xliff/XLIFF12ContentHandler.java | 15 +-
.../apache/tika/parser/xliff/XLIFF12Parser.java | 27 +-
.../org/apache/tika/parser/xliff/XLZParser.java | 43 +-
.../tika/parser/xml/AbstractMetadataHandler.java | 46 +-
.../xml/AttributeDependantMetadataHandler.java | 34 +-
.../tika/parser/xml/AttributeMetadataHandler.java | 28 +-
.../org/apache/tika/parser/xml/DcXMLParser.java | 22 +-
.../tika/parser/xml/ElementMetadataHandler.java | 69 +-
.../apache/tika/parser/xml/FictionBookParser.java | 33 +-
.../apache/tika/parser/xml/MetadataHandler.java | 33 +-
.../tika/parser/xml/TextAndAttributeXMLParser.java | 6 +-
.../java/org/apache/tika/parser/xml/XMLParser.java | 39 +-
.../org/apache/tika/parser/xml/XMLProfiler.java | 99 +-
.../tika/parser/xliff/XLIFF12ParserTest.java | 5 +-
.../apache/tika/parser/xliff/XLZParserTest.java | 18 +-
.../apache/tika/parser/xml/DcXMLParserTest.java | 27 +-
.../EmptyAndDuplicateElementsXMLParserTest.java | 56 +-
.../tika/parser/xml/FictionBookParserTest.java | 10 +-
.../parser/xml/TextAndAttributeXMLParserTest.java | 21 +-
.../src/test/resources/test-documents/testXML.xml | 30 +-
.../src/test/resources/test-documents/testXML2.xml | 10 +-
.../src/test/resources/test-documents/testXML3.xml | 38 +-
.../apache/tika/parser/xmp/JempboxExtractor.java | 91 +-
.../apache/tika/parser/xmp/XMPPacketScanner.java | 4 +-
.../tika/parser/xmp/JempboxExtractorTest.java | 31 +-
.../src/test/resources/test-documents/testXMP.xmp | 342 ++-
.../tika/detect/zip/CompressorConstants.java | 3 +-
.../detect/zip/DefaultZipContainerDetector.java | 165 +-
.../DeprecatedStreamingZipContainerDetector.java | 37 +-
.../detect/zip/DeprecatedZipContainerDetector.java | 3 -
.../org/apache/tika/detect/zip/IPADetector.java | 21 +-
.../org/apache/tika/detect/zip/JarDetector.java | 14 +-
.../org/apache/tika/detect/zip/KMZDetector.java | 22 +-
.../tika/detect/zip/OpenDocumentDetector.java | 20 +-
.../apache/tika/detect/zip/PackageConstants.java | 1 +
.../apache/tika/detect/zip/StarOfficeDetector.java | 78 +-
.../tika/detect/zip/StreamingDetectContext.java | 16 +-
.../detect/zip/StreamingZipContainerDetector.java | 13 +-
.../tika/detect/zip/ZipContainerDetector.java | 16 +-
.../tika/detect/zip/ZipContainerDetectorBase.java | 47 +-
.../org/apache/tika/zip/utils/ZipSalvager.java | 104 +-
.../org/apache/tika/detect/zip/ZipParserTest.java | 14 +-
.../org/apache/tika/parser/internal/Activator.java | 22 +-
.../apache/tika/config/TikaDetectorConfigTest.java | 87 +-
.../tika/config/TikaEncodingDetectorTest.java | 82 +-
.../apache/tika/config/TikaParserConfigTest.java | 69 +-
.../tika/config/TikaTranslatorConfigTest.java | 21 +-
.../tika/detect/TestContainerAwareDetector.java | 278 ++-
.../apache/tika/detect/TestDetectorLoading.java | 15 +-
.../tika/detect/TestFileCommandDetector.java | 12 +-
.../tika/extractor/EmbeddedDocumentUtilTest.java | 3 +-
.../java/org/apache/tika/mime/MimeTypeTest.java | 12 +-
.../java/org/apache/tika/mime/MimeTypesTest.java | 4 +-
.../java/org/apache/tika/mime/TestMimeTypes.java | 733 +++---
.../apache/tika/parser/AutoDetectParserTest.java | 357 ++-
.../tika/parser/AutoDetectReaderParserTest.java | 24 +-
.../parser/BouncyCastleDigestingParserTest.java | 125 +-
.../apache/tika/parser/DigestingParserTest.java | 120 +-
.../org/apache/tika/parser/ParsingReaderTest.java | 13 +-
.../tika/parser/RecursiveParserWrapperTest.java | 61 +-
.../org/apache/tika/parser/TabularFormatsTest.java | 252 +-
.../java/org/apache/tika/parser/TestParsers.java | 49 +-
.../apache/tika/parser/TestXMLEntityExpansion.java | 90 +-
.../java/org/apache/tika/parser/TestXXEInXML.java | 115 +-
.../java/org/apache/tika/parser/XMLTestBase.java | 80 +-
.../parser/apple/AppleSingleFileParserTest.java | 8 +-
.../apache/tika/parser/apple/PListParserTest.java | 11 +-
.../apache/tika/parser/crypto/TSDParserTest.java | 19 +-
.../parser/fork/ForkParserIntegrationTest.java | 285 +--
.../apache/tika/parser/html/HtmlParserTest.java | 20 +-
.../apache/tika/parser/mail/MboxParserTest.java | 16 +-
.../apache/tika/parser/mail/RFC822ParserTest.java | 42 +-
.../tika/parser/microsoft/EMFParserTest.java | 17 +-
.../tika/parser/microsoft/ExcelParserTest.java | 7 +-
.../microsoft/POIContainerExtractionTest.java | 9 +-
.../parser/microsoft/PowerPointParserTest.java | 14 +-
.../tika/parser/microsoft/XML2003ParserTest.java | 25 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 12 +-
.../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 17 +-
.../tika/parser/microsoft/rtf/RTFParserTest.java | 65 +-
.../apache/tika/parser/mock/MockParserTest.java | 70 +-
.../tika/parser/ocr/TesseractOCRParserTest.java | 54 +-
.../org/apache/tika/parser/odf/ODFParserTest.java | 54 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 100 +-
.../org/apache/tika/parser/pkg/ArParserTest.java | 17 +-
.../apache/tika/parser/pkg/Bzip2ParserTest.java | 11 +-
.../pkg/CompositeZipContainerDetectorTest.java | 141 +-
.../apache/tika/parser/pkg/CompressParserTest.java | 25 +-
.../tika/parser/pkg/CompressorParserTest.java | 17 +-
.../org/apache/tika/parser/pkg/GzipParserTest.java | 13 +-
.../org/apache/tika/parser/pkg/RarParserTest.java | 16 +-
.../apache/tika/parser/pkg/Seven7ParserTest.java | 64 +-
.../org/apache/tika/parser/pkg/TarParserTest.java | 11 +-
.../org/apache/tika/parser/pkg/ZipParserTest.java | 65 +-
.../org/apache/tika/parser/pkg/ZlibParserTest.java | 11 +-
.../tika/parser/xml/FictionBookParserTest.java | 14 +-
.../sax/PhoneExtractingContentHandlerTest.java | 20 +-
.../sax/StandardsExtractingContentHandlerTest.java | 47 +-
.../apache/tika/utils/ServiceLoaderUtilsTest.java | 6 +-
.../src/test/resources/log4j.properties | 1 +
.../test-documents/testJAVAPROPS.properties | 1 +
.../apache/tika/parser/envi/EnviHeaderParser.java | 112 +-
.../org/apache/tika/parser/gdal/GDALParser.java | 249 +-
.../geoinfo/GeographicInformationParser.java | 451 ++--
.../org/apache/tika/parser/grib/GribParser.java | 41 +-
.../java/org/apache/tika/parser/hdf/HDFParser.java | 32 +-
.../org/apache/tika/parser/isatab/ISATabUtils.java | 311 +--
.../apache/tika/parser/isatab/ISArchiveParser.java | 222 +-
.../apache/tika/parser/netcdf/NetCDFParser.java | 28 +-
.../tika/parser/envi/EnviHeaderParserTest.java | 59 +-
.../apache/tika/parser/gdal/TestGDALParser.java | 112 +-
.../geoinfo/GeographicInformationParserTest.java | 5 +-
.../apache/tika/parser/grib/GribParserTest.java | 17 +-
.../org/apache/tika/parser/hdf/HDFParserTest.java | 48 +-
.../tika/parser/isatab/ISArchiveParserTest.java | 68 +-
.../tika/parser/netcdf/NetCDFParserTest.java | 23 +-
.../ground-truth/EnviHeaderGroundTruth.txt | 1 +
.../tika/parser/sqlite3/SQLite3DBParser.java | 19 +-
.../apache/tika/parser/sqlite3/SQLite3Parser.java | 17 +-
.../tika/parser/sqlite3/SQLite3TableReader.java | 19 +-
.../tika/parser/sqlite3/SQLite3ParserTest.java | 37 +-
.../apache/tika/mime/TestMimeTypesExtended.java | 23 +-
.../tika/parser/sqlite3/SQLite3ParserTest.java | 68 +-
tika-server/pom.xml | 38 +-
.../server/classic/config/PDFServerConfig.java | 42 +-
.../classic/config/TesseractServerConfig.java | 41 +-
.../classic/resource/XMPMetadataResource.java | 34 +-
.../classic/writer/XMPMessageBodyWriter.java | 26 +-
.../src/main/resources/log4j.properties | 4 +-
.../tika/server/classic/DetectorResourceTest.java | 55 +-
.../apache/tika/server/classic/FetcherTest.java | 35 +-
.../tika/server/classic/MetadataResourceTest.java | 96 +-
.../classic/RecursiveMetadataFilterTest.java | 42 +-
.../classic/RecursiveMetadataResourceTest.java | 215 +-
.../tika/server/classic/TikaDetectorsTest.java | 41 +-
.../tika/server/classic/TikaMimeTypesTest.java | 39 +-
.../tika/server/classic/TikaParsersTest.java | 46 +-
.../tika/server/classic/TikaResourceTest.java | 418 ++--
.../tika/server/classic/UnpackerResourceTest.java | 91 +-
.../test/resources/config/TIKA-3137-include.xml | 38 +-
.../src/test/resources/log4j.properties | 6 +-
.../test/resources/test-documents/testHTML.html | 20 +-
.../org/apache/tika/server/client/TikaClient.java | 31 +-
.../apache/tika/server/client/TikaClientCLI.java | 35 +-
.../tika/server/client/TikaEmitterResult.java | 19 +-
.../apache/tika/server/client/TikaHttpClient.java | 73 +-
.../src/main/resources/log4j.properties | 6 +-
.../org/apache/tika/server/client/TestBasic.java | 15 +-
.../src/test/resources/log4j.properties | 6 +-
.../server/core/CompositeParseContextConfig.java | 10 +-
.../server/core/DefaultInputStreamFactory.java | 9 +-
.../tika/server/core/FetcherStreamFactory.java | 17 +-
.../org/apache/tika/server/core/HTMLHelper.java | 7 +-
.../tika/server/core/InputStreamFactory.java | 9 +-
.../org/apache/tika/server/core/MetadataList.java | 7 +-
.../tika/server/core/ParseContextConfig.java | 16 +-
.../org/apache/tika/server/core/ServerStatus.java | 122 +-
.../tika/server/core/ServerStatusWatcher.java | 57 +-
.../apache/tika/server/core/TikaLoggingFilter.java | 7 +-
.../org/apache/tika/server/core/TikaServerCli.java | 39 +-
.../apache/tika/server/core/TikaServerConfig.java | 543 ++---
.../tika/server/core/TikaServerParseException.java | 3 +-
.../core/TikaServerParseExceptionMapper.java | 12 +-
.../apache/tika/server/core/TikaServerProcess.java | 194 +-
.../tika/server/core/TikaServerWatchDog.java | 222 +-
.../apache/tika/server/core/WatchDogResult.java | 7 +-
.../server/core/config/DocumentSelectorConfig.java | 10 +-
.../server/core/config/PasswordProviderConfig.java | 27 +-
.../tika/server/core/resource/AsyncEmitter.java | 30 +-
.../tika/server/core/resource/AsyncParser.java | 42 +-
.../tika/server/core/resource/AsyncRequest.java | 4 +-
.../tika/server/core/resource/AsyncResource.java | 56 +-
.../server/core/resource/DetectorResource.java | 21 +-
.../tika/server/core/resource/EmitterResource.java | 122 +-
.../server/core/resource/LanguageResource.java | 55 +-
.../server/core/resource/MetadataResource.java | 62 +-
.../core/resource/RecursiveMetadataResource.java | 131 +-
.../tika/server/core/resource/TikaDetectors.java | 15 +-
.../tika/server/core/resource/TikaMimeTypes.java | 37 +-
.../tika/server/core/resource/TikaParsers.java | 43 +-
.../tika/server/core/resource/TikaResource.java | 243 +-
.../server/core/resource/TikaServerStatus.java | 8 +-
.../tika/server/core/resource/TikaWelcome.java | 44 +-
.../server/core/resource/TranslateResource.java | 135 +-
.../server/core/resource/UnpackerResource.java | 84 +-
.../server/core/writer/CSVMessageBodyWriter.java | 29 +-
.../server/core/writer/JSONMessageBodyWriter.java | 29 +-
.../tika/server/core/writer/JSONObjWriter.java | 30 +-
.../core/writer/MetadataListMessageBodyWriter.java | 29 +-
.../apache/tika/server/core/writer/TarWriter.java | 25 +-
.../server/core/writer/TextMessageBodyWriter.java | 28 +-
.../apache/tika/server/core/writer/ZipWriter.java | 27 +-
.../src/main/resources/tikaserver-template.html | 18 +-
.../main/resources/tikaserver-version.properties | 15 +
.../org/apache/tika/server/core/CXFTestBase.java | 84 +-
.../tika/server/core/IntegrationTestBase.java | 92 +-
.../tika/server/core/LanguageResourceTest.java | 139 +-
.../tika/server/core/NullWebClientLogger.java | 5 +-
.../apache/tika/server/core/ServerStatusTest.java | 16 +-
.../apache/tika/server/core/StackTraceOffTest.java | 54 +-
.../apache/tika/server/core/StackTraceTest.java | 62 +-
.../apache/tika/server/core/TikaEmitterTest.java | 162 +-
.../apache/tika/server/core/TikaMimeTypesTest.java | 34 +-
.../apache/tika/server/core/TikaResourceTest.java | 60 +-
.../core/TikaServerAsyncIntegrationTest.java | 123 +-
.../tika/server/core/TikaServerConfigTest.java | 32 +-
.../core/TikaServerEmitterIntegrationTest.java | 147 +-
.../server/core/TikaServerIntegrationTest.java | 217 +-
.../tika/server/core/TikaServerStatusTest.java | 29 +-
.../apache/tika/server/core/TikaVersionTest.java | 20 +-
.../apache/tika/server/core/TikaWelcomeTest.java | 54 +-
.../tika/server/core/TranslateResourceTest.java | 90 +-
.../src/test/resources/log4j.properties | 6 +-
.../test-documents/mock/heavy_hang_100.xml | 2 +-
.../test-documents/mock/heavy_hang_30000.xml | 2 +-
.../resources/test-documents/mock/system_exit.xml | 2 +-
.../test-documents/mock/testStaticStdOutErr.xml | 45 +-
.../test-documents/mock/testStdOutErr.xml | 45 +-
.../test-documents/mock/thread_interrupt.xml | 2 +-
1005 files changed, 38487 insertions(+), 37841 deletions(-)
diff --cc tika-core/src/main/java/org/apache/tika/pipes/emitter/AbstractEmitter.java
index 39643b2,db6dd4c..dda0054
--- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/AbstractEmitter.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/AbstractEmitter.java
@@@ -46,22 -59,9 +59,9 @@@ public abstract class AbstractEmitter i
* @throws TikaEmitterException
*/
@Override
- public void emit(List<EmitData> emitData) throws IOException, TikaEmitterException {
+ public void emit(List<? extends EmitData> emitData) throws IOException, TikaEmitterException {
for (EmitData d : emitData) {
- emit(d.getEmitKey().getKey(), d.getMetadataList());
+ emit(d.getEmitKey().getEmitKey(), d.getMetadataList());
}
}
-
- public static long estimateSizeInBytes(String id, List<Metadata> metadataList) {
- long sz = 36 + id.length() * 2;
- for (Metadata m : metadataList) {
- for (String n : m.names()) {
- sz += 36 + n.length() * 2;
- for (String v : m.getValues(n)) {
- sz += 36 + v.length() * 2;
- }
- }
- }
- return sz;
- }
}
diff --cc tika-core/src/main/java/org/apache/tika/pipes/fetchiterator/FetchEmitTuple.java
index f6e7c74,1f7d5b9..974aa3b
--- a/tika-core/src/main/java/org/apache/tika/pipes/fetchiterator/FetchEmitTuple.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/fetchiterator/FetchEmitTuple.java
@@@ -22,16 -22,11 +22,11 @@@ import org.apache.tika.pipes.fetcher.Fe
public class FetchEmitTuple {
- public enum ON_PARSE_EXCEPTION {
- SKIP,
- EMIT
- }
public static final ON_PARSE_EXCEPTION DEFAULT_ON_PARSE_EXCEPTION = ON_PARSE_EXCEPTION.EMIT;
private final FetchKey fetchKey;
- private final EmitKey emitKey;
+ private EmitKey emitKey;
private final Metadata metadata;
private final ON_PARSE_EXCEPTION onParseException;
-
public FetchEmitTuple(FetchKey fetchKey, EmitKey emitKey, Metadata metadata) {
this(fetchKey, emitKey, metadata, DEFAULT_ON_PARSE_EXCEPTION);
}
@@@ -59,17 -55,10 +55,13 @@@
return onParseException;
}
+ public void setEmitKey(EmitKey emitKey) {
+ this.emitKey = emitKey;
+ }
@Override
public String toString() {
- return "FetchEmitTuple{" +
- "fetchKey=" + fetchKey +
- ", emitKey=" + emitKey +
- ", metadata=" + metadata +
- ", onParseException=" + onParseException +
- '}';
+ return "FetchEmitTuple{" + "fetchKey=" + fetchKey + ", emitKey=" + emitKey + ", metadata=" +
+ metadata + ", onParseException=" + onParseException + '}';
}
@Override
diff --cc tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index a44f0dc,c75b430..187cc3e
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@@ -185,11 -182,9 +181,11 @@@ public class TesseractOCRParserTest ext
assertContainsCount("<body", xml, 1);
assertContainsCount("</body", xml, 1);
assertContainsCount("</html", xml, 1);
+
+ assertNotContained("<meta name=\"Content-Type\" content=\"image/ocr-jpeg\" />", xml);
}
-
+
@Test
public void getNormalMetadataToo() throws Exception {
//this should be successful whether or not TesseractOCR is installed/active
diff --cc tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
index 047fa13,96c2e30..56a71db
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
@@@ -57,14 -63,17 +63,23 @@@ public class EmitterResource
private static final String FETCHER_NAME_ABBREV = "fn";
private static final String FETCH_KEY_ABBREV = "fk";
private static final String EMIT_KEY_ABBREV = "ek";
+
+ /**
+ * key that is safe to pass through http header.
+ * The user _must_ specify this for the fsemitter if calling 'put'
+ */
+ public static final String EMIT_KEY_FOR_HTTP_HEADER = "emit-key";
private static final Logger LOG = LoggerFactory.getLogger(EmitterResource.class);
+ static EmitKey calcEmitKey(FetchEmitTuple t) {
+ //use fetch key if emitter key is not specified
+ //TODO: clean this up?
+ EmitKey emitKey = t.getEmitKey();
+ if (StringUtils.isBlank(emitKey.getKey())) {
+ emitKey = new EmitKey(emitKey.getEmitterName(), t.getFetchKey().getKey());
+ }
+ return emitKey;
+ }
/**
* @param is input stream is ignored in 'get'
@@@ -188,24 -188,14 +197,24 @@@
return emit(calcEmitKey(t), metadataList);
}
+ static EmitKey calcEmitKey(FetchEmitTuple t) {
+ //use fetch key if emitter key is not specified
+ //TODO: clean this up?
+ EmitKey emitKey = t.getEmitKey();
- if (StringUtils.isBlank(emitKey.getEmitKey())) {
++ if (StringUtils.isBlank(emitKey.getKey())) {
+ emitKey = new EmitKey(emitKey.getEmitterName(), t.getFetchKey().getKey());
+ }
+ return emitKey;
+ }
+
private Map<String, String> skip(FetchEmitTuple t, List<Metadata> metadataList) {
- Map<String, String> statusMap = new HashMap<>();
- statusMap.put("status", "ok");
- statusMap.put("emitter", t.getEmitKey().getEmitterName());
- statusMap.put("emitKey", t.getEmitKey().getEmitKey());
- String msg = metadataList.get(0).get(TikaCoreProperties.CONTAINER_EXCEPTION);
- statusMap.put("parse_exception", msg);
- return statusMap;
+ Map<String, String> statusMap = new HashMap<>();
+ statusMap.put("status", "ok");
+ statusMap.put("emitter", t.getEmitKey().getEmitterName());
+ statusMap.put("emitKey", t.getEmitKey().getKey());
+ String msg = metadataList.get(0).get(TikaCoreProperties.CONTAINER_EXCEPTION);
+ statusMap.put("parse_exception", msg);
+ return statusMap;
}
private boolean checkParseException(FetchEmitTuple t, List<Metadata> metadataList) {
diff --cc tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerEmitterIntegrationTest.java
index 9594de4,fda802f..a99cea1
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerEmitterIntegrationTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerEmitterIntegrationTest.java
@@@ -261,13 -238,11 +238,14 @@@ public class TikaServerEmitterIntegrati
return testOne(fileName, shouldFileExist, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
}
- private JsonNode testOne(String fileName, boolean shouldFileExist, FetchEmitTuple.ON_PARSE_EXCEPTION onParseException) throws Exception {
+ private JsonNode testOne(String fileName, boolean shouldFileExist,
+ FetchEmitTuple.ON_PARSE_EXCEPTION onParseException) throws Exception {
awaitServerStartup();
- Response response = WebClient.create(endPoint + "/emit").accept("application/json")
+ System.out.println(getJsonString(fileName, onParseException));
+ Response response = WebClient
+ .create(endPoint + "/emit")
+ .accept("application/json")
.post(getJsonString(fileName, onParseException));
if (response.getStatus() == 200) {
Path targFile = TMP_OUTPUT_DIR.resolve(fileName + ".json");