You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by le...@apache.org on 2016/10/26 02:37:10 UTC

[5/7] tika git commit: Merge branch 'master' into TIKA-1343

Merge branch 'master' into TIKA-1343


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d50a6936
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d50a6936
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d50a6936

Branch: refs/heads/master
Commit: d50a69361bd0196fb2595313cb47222f61701ba4
Parents: a1250ff 07aea36
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Wed Sep 21 08:06:47 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Wed Sep 21 08:06:47 2016 -0700

----------------------------------------------------------------------
 CHANGES.txt                                     |  30 +
 tika-bundle/pom.xml                             |   2 +-
 .../main/java/org/apache/tika/config/Field.java |  45 +
 .../org/apache/tika/config/Initializable.java   |  33 +
 .../main/java/org/apache/tika/config/Param.java | 191 +++++
 .../java/org/apache/tika/config/ParamField.java | 158 ++++
 .../java/org/apache/tika/config/TikaConfig.java |  47 +-
 .../tika/exception/TikaConfigException.java     |  39 +
 .../org/apache/tika/parser/AbstractParser.java  |  10 +
 .../java/org/apache/tika/parser/Parser.java     |   1 +
 .../tika/parser/external/ExternalParser.java    |  85 +-
 .../apache/tika/sax/XHTMLContentHandler.java    |   5 +-
 .../org/apache/tika/utils/AnnotationUtils.java  | 138 +++
 .../apache/tika/utils/ServiceLoaderUtils.java   |  30 +
 .../org/apache/tika/mime/tika-mimetypes.xml     |  67 +-
 .../java/org/apache/tika/config/ParamTest.java  |  71 ++
 .../tika/parser/DummyInitializableParser.java   |  68 ++
 .../tika/parser/DummyParameterizedParser.java   | 113 +++
 .../tika/parser/InitializableParserTest.java    |  45 +
 .../tika/parser/ParameterizedParserTest.java    | 125 +++
 .../apache/tika/utils/AnnotationUtilsTest.java  | 190 +++++
 .../tika/config/TIKA-1508-configurable.xml      |  37 +
 .../tika/config/TIKA-1986-bad-parameters.xml    |  26 +
 .../apache/tika/config/TIKA-1986-bad-types.xml  |  26 +
 .../apache/tika/config/TIKA-1986-bad-values.xml |  26 +
 .../tika/config/TIKA-1986-initializable.xml     |  28 +
 .../TIKA-1986-parameterized-decorated.xml       |  39 +
 .../tika/config/TIKA-1986-parameterized.xml     |  38 +
 .../tika/config/TIKA-1986-some-parameters.xml   |  28 +
 tika-parent/pom.xml                             |  12 +-
 tika-parsers/pom.xml                            |  26 +-
 .../chm/accessor/ChmDirectoryListingSet.java    |  11 +-
 .../apache/tika/parser/chm/core/ChmCommons.java |   5 +-
 .../tika/parser/chm/core/ChmExtractor.java      |   4 +-
 .../apache/tika/parser/chm/lzx/ChmLzxBlock.java |   4 +-
 .../tika/parser/mail/MailContentHandler.java    |  13 +-
 .../org/apache/tika/parser/mat/MatParser.java   |   5 +
 .../tika/parser/microsoft/ExcelExtractor.java   |  34 +-
 .../microsoft/TikaExcelDataFormatter.java       |  41 +
 .../microsoft/TikaExcelGeneralFormat.java       |  90 ++
 .../tika/parser/microsoft/WordExtractor.java    |  20 +
 .../microsoft/ooxml/MetadataExtractor.java      |  15 +-
 .../ooxml/XSSFExcelExtractorDecorator.java      |  20 +-
 .../ooxml/XWPFWordExtractorDecorator.java       |  52 +-
 .../microsoft/xml/AbstractXML2003Parser.java    |   4 +
 .../tika/parser/microsoft/xml/WordMLParser.java |   3 +
 .../tika/parser/ocr/TesseractOCRConfig.java     | 181 +++-
 .../tika/parser/ocr/TesseractOCRParser.java     | 113 ++-
 .../parser/odf/OpenDocumentContentParser.java   |   3 +
 .../tika/parser/pdf/AbstractPDF2XHTML.java      |  16 +-
 .../org/apache/tika/parser/pdf/PDF2XHTML.java   |   1 -
 .../org/apache/tika/parser/pdf/PDFParser.java   |  29 +-
 .../apache/tika/parser/pdf/PDFParserConfig.java |  86 +-
 .../parser/recognition/ObjectRecogniser.java    |  75 ++
 .../recognition/ObjectRecognitionParser.java    | 171 ++++
 .../parser/recognition/RecognisedObject.java    |  91 ++
 .../tf/TensorflowImageRecParser.java            | 152 ++++
 .../tf/TensorflowRESTRecogniser.java            | 142 ++++
 .../apache/tika/parser/txt/CharsetDetector.java | 416 +++++----
 .../apache/tika/parser/txt/CharsetMatch.java    | 139 ++-
 .../tika/parser/txt/CharsetRecog_2022.java      |  28 +-
 .../tika/parser/txt/CharsetRecog_UTF8.java      |  24 +-
 .../tika/parser/txt/CharsetRecog_Unicode.java   |  99 ++-
 .../tika/parser/txt/CharsetRecog_mbcs.java      |  44 +-
 .../tika/parser/txt/CharsetRecog_sbcs.java      | 835 ++++++++++---------
 .../tika/parser/txt/CharsetRecognizer.java      |  31 +-
 .../parser/ocr/TesseractOCRConfig.properties    |  13 +-
 .../org/apache/tika/parser/ocr/rotation.py      |  72 ++
 .../recognition/tf/InceptionRestDockerfile      |  41 +
 .../parser/recognition/tf/classify_image.py     | 212 +++++
 .../tika/parser/recognition/tf/inceptionapi.py  | 319 +++++++
 .../org/apache/tika/mime/TestMimeTypes.java     |  13 +
 .../tika/parser/chm/TestChmExtractor.java       |  21 +-
 .../apache/tika/parser/html/HtmlParserTest.java | 140 +++-
 .../tika/parser/mail/RFC822ParserTest.java      |  68 +-
 .../apache/tika/parser/mbox/MboxParserTest.java |   1 -
 .../tika/parser/microsoft/ExcelParserTest.java  |  10 +
 .../tika/parser/microsoft/WordParserTest.java   |  11 +-
 .../parser/microsoft/ooxml/OOXMLParserTest.java |  28 +-
 .../parser/microsoft/xml/XML2003ParserTest.java |   1 +
 .../tika/parser/ocr/TesseractOCRConfigTest.java |  61 +-
 .../tika/parser/ocr/TesseractOCRParserTest.java |  18 +-
 .../apache/tika/parser/pdf/PDFParserTest.java   |  44 +-
 .../ObjectRecognitionParserTest.java            |  89 ++
 .../tf/TensorflowImageRecParserTest.java        |  58 ++
 .../parser/pdf/tika-config-non-primitives.xml   |  29 +
 .../org/apache/tika/parser/pdf/tika-config.xml  |  26 +
 .../recognition/tika-config-tflow-rest.xml      |  30 +
 .../parser/recognition/tika-config-tflow.xml    |  29 +
 .../resources/test-documents/testChm_oom.chm    | Bin 0 -> 4315 bytes
 .../test-documents/testEXCEL_big_numbers.xls    | Bin 0 -> 26112 bytes
 .../test-documents/testEXCEL_big_numbers.xlsx   | Bin 0 -> 8396 bytes
 .../test-documents/testEmailWithPNGAtt.eml      | 354 ++++++++
 .../resources/test-documents/testHTML_head.html |  32 +
 .../test-documents/testOpenOffice2.odt          | Bin 26448 -> 26460 bytes
 .../resources/test-documents/testStataDTA.dta   | Bin 0 -> 1207 bytes
 .../resources/test-documents/testStataDTA.txt   |  15 +
 .../resources/test-documents/testWORD2003.xml   |   2 +-
 .../test-documents/testWORD_boldHyperlink.doc   | Bin 0 -> 27136 bytes
 .../test-documents/testWORD_boldHyperlink.docx  | Bin 0 -> 12382 bytes
 .../testWORD_totalTimeOutOfRange.docx           | Bin 0 -> 11047 bytes
 .../TesseractOCRConfig-full.properties          |   6 +
 .../TesseractOCRConfig-partial.properties       |   8 +-
 tika-translate/pom.xml                          |   2 +-
 104 files changed, 5612 insertions(+), 917 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/d50a6936/tika-parsers/pom.xml
----------------------------------------------------------------------