You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by le...@apache.org on 2016/10/26 02:37:10 UTC
[5/7] tika git commit: Merge branch 'master' into TIKA-1343
Merge branch 'master' into TIKA-1343
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d50a6936
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d50a6936
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d50a6936
Branch: refs/heads/master
Commit: d50a69361bd0196fb2595313cb47222f61701ba4
Parents: a1250ff 07aea36
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Wed Sep 21 08:06:47 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Wed Sep 21 08:06:47 2016 -0700
----------------------------------------------------------------------
CHANGES.txt | 30 +
tika-bundle/pom.xml | 2 +-
.../main/java/org/apache/tika/config/Field.java | 45 +
.../org/apache/tika/config/Initializable.java | 33 +
.../main/java/org/apache/tika/config/Param.java | 191 +++++
.../java/org/apache/tika/config/ParamField.java | 158 ++++
.../java/org/apache/tika/config/TikaConfig.java | 47 +-
.../tika/exception/TikaConfigException.java | 39 +
.../org/apache/tika/parser/AbstractParser.java | 10 +
.../java/org/apache/tika/parser/Parser.java | 1 +
.../tika/parser/external/ExternalParser.java | 85 +-
.../apache/tika/sax/XHTMLContentHandler.java | 5 +-
.../org/apache/tika/utils/AnnotationUtils.java | 138 +++
.../apache/tika/utils/ServiceLoaderUtils.java | 30 +
.../org/apache/tika/mime/tika-mimetypes.xml | 67 +-
.../java/org/apache/tika/config/ParamTest.java | 71 ++
.../tika/parser/DummyInitializableParser.java | 68 ++
.../tika/parser/DummyParameterizedParser.java | 113 +++
.../tika/parser/InitializableParserTest.java | 45 +
.../tika/parser/ParameterizedParserTest.java | 125 +++
.../apache/tika/utils/AnnotationUtilsTest.java | 190 +++++
.../tika/config/TIKA-1508-configurable.xml | 37 +
.../tika/config/TIKA-1986-bad-parameters.xml | 26 +
.../apache/tika/config/TIKA-1986-bad-types.xml | 26 +
.../apache/tika/config/TIKA-1986-bad-values.xml | 26 +
.../tika/config/TIKA-1986-initializable.xml | 28 +
.../TIKA-1986-parameterized-decorated.xml | 39 +
.../tika/config/TIKA-1986-parameterized.xml | 38 +
.../tika/config/TIKA-1986-some-parameters.xml | 28 +
tika-parent/pom.xml | 12 +-
tika-parsers/pom.xml | 26 +-
.../chm/accessor/ChmDirectoryListingSet.java | 11 +-
.../apache/tika/parser/chm/core/ChmCommons.java | 5 +-
.../tika/parser/chm/core/ChmExtractor.java | 4 +-
.../apache/tika/parser/chm/lzx/ChmLzxBlock.java | 4 +-
.../tika/parser/mail/MailContentHandler.java | 13 +-
.../org/apache/tika/parser/mat/MatParser.java | 5 +
.../tika/parser/microsoft/ExcelExtractor.java | 34 +-
.../microsoft/TikaExcelDataFormatter.java | 41 +
.../microsoft/TikaExcelGeneralFormat.java | 90 ++
.../tika/parser/microsoft/WordExtractor.java | 20 +
.../microsoft/ooxml/MetadataExtractor.java | 15 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 20 +-
.../ooxml/XWPFWordExtractorDecorator.java | 52 +-
.../microsoft/xml/AbstractXML2003Parser.java | 4 +
.../tika/parser/microsoft/xml/WordMLParser.java | 3 +
.../tika/parser/ocr/TesseractOCRConfig.java | 181 +++-
.../tika/parser/ocr/TesseractOCRParser.java | 113 ++-
.../parser/odf/OpenDocumentContentParser.java | 3 +
.../tika/parser/pdf/AbstractPDF2XHTML.java | 16 +-
.../org/apache/tika/parser/pdf/PDF2XHTML.java | 1 -
.../org/apache/tika/parser/pdf/PDFParser.java | 29 +-
.../apache/tika/parser/pdf/PDFParserConfig.java | 86 +-
.../parser/recognition/ObjectRecogniser.java | 75 ++
.../recognition/ObjectRecognitionParser.java | 171 ++++
.../parser/recognition/RecognisedObject.java | 91 ++
.../tf/TensorflowImageRecParser.java | 152 ++++
.../tf/TensorflowRESTRecogniser.java | 142 ++++
.../apache/tika/parser/txt/CharsetDetector.java | 416 +++++----
.../apache/tika/parser/txt/CharsetMatch.java | 139 ++-
.../tika/parser/txt/CharsetRecog_2022.java | 28 +-
.../tika/parser/txt/CharsetRecog_UTF8.java | 24 +-
.../tika/parser/txt/CharsetRecog_Unicode.java | 99 ++-
.../tika/parser/txt/CharsetRecog_mbcs.java | 44 +-
.../tika/parser/txt/CharsetRecog_sbcs.java | 835 ++++++++++---------
.../tika/parser/txt/CharsetRecognizer.java | 31 +-
.../parser/ocr/TesseractOCRConfig.properties | 13 +-
.../org/apache/tika/parser/ocr/rotation.py | 72 ++
.../recognition/tf/InceptionRestDockerfile | 41 +
.../parser/recognition/tf/classify_image.py | 212 +++++
.../tika/parser/recognition/tf/inceptionapi.py | 319 +++++++
.../org/apache/tika/mime/TestMimeTypes.java | 13 +
.../tika/parser/chm/TestChmExtractor.java | 21 +-
.../apache/tika/parser/html/HtmlParserTest.java | 140 +++-
.../tika/parser/mail/RFC822ParserTest.java | 68 +-
.../apache/tika/parser/mbox/MboxParserTest.java | 1 -
.../tika/parser/microsoft/ExcelParserTest.java | 10 +
.../tika/parser/microsoft/WordParserTest.java | 11 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 28 +-
.../parser/microsoft/xml/XML2003ParserTest.java | 1 +
.../tika/parser/ocr/TesseractOCRConfigTest.java | 61 +-
.../tika/parser/ocr/TesseractOCRParserTest.java | 18 +-
.../apache/tika/parser/pdf/PDFParserTest.java | 44 +-
.../ObjectRecognitionParserTest.java | 89 ++
.../tf/TensorflowImageRecParserTest.java | 58 ++
.../parser/pdf/tika-config-non-primitives.xml | 29 +
.../org/apache/tika/parser/pdf/tika-config.xml | 26 +
.../recognition/tika-config-tflow-rest.xml | 30 +
.../parser/recognition/tika-config-tflow.xml | 29 +
.../resources/test-documents/testChm_oom.chm | Bin 0 -> 4315 bytes
.../test-documents/testEXCEL_big_numbers.xls | Bin 0 -> 26112 bytes
.../test-documents/testEXCEL_big_numbers.xlsx | Bin 0 -> 8396 bytes
.../test-documents/testEmailWithPNGAtt.eml | 354 ++++++++
.../resources/test-documents/testHTML_head.html | 32 +
.../test-documents/testOpenOffice2.odt | Bin 26448 -> 26460 bytes
.../resources/test-documents/testStataDTA.dta | Bin 0 -> 1207 bytes
.../resources/test-documents/testStataDTA.txt | 15 +
.../resources/test-documents/testWORD2003.xml | 2 +-
.../test-documents/testWORD_boldHyperlink.doc | Bin 0 -> 27136 bytes
.../test-documents/testWORD_boldHyperlink.docx | Bin 0 -> 12382 bytes
.../testWORD_totalTimeOutOfRange.docx | Bin 0 -> 11047 bytes
.../TesseractOCRConfig-full.properties | 6 +
.../TesseractOCRConfig-partial.properties | 8 +-
tika-translate/pom.xml | 2 +-
104 files changed, 5612 insertions(+), 917 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/d50a6936/tika-parsers/pom.xml
----------------------------------------------------------------------