You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2018/07/09 17:25:00 UTC
[tika] branch TIKA-2672 updated (bfacd29 -> 60d0f6d)
This is an automated email from the ASF dual-hosted git repository.
mattmann pushed a change to branch TIKA-2672
in repository https://gitbox.apache.org/repos/asf/tika.git.
from bfacd29 TIKA-2672 -- create dev branch for upgrading dl4j to 1.0.0-beta
add 7914318 added version-specific mime-type definitions for Quattro for DOS and Windows
add b5c07d5 added missing glob patterns to old Quattro Pro definition
add 1fd441e deleted existing entry for application/x-123 b/c of collisions with Quattro Pro magic
add e6d332d added version-specific mime-type definitions for Lotus 1-2-3
add 85d25cc removed glob pattern for .wks to avoid conflict with vnd.ms-works
add c8deb97 modified quattro pro version fields so they are consistent with wb3 and qpw output
add 73d37e6 added test documents for wk1, wk3, wq1 and wq2 formats
add 83832cb added reference to PRONOM / TNA and the Open Government License to NOTICE.txt
add 817f848 added mimetype definition for Lotus 1-2-3 97/9.x
add b2a0b78 added test documents for Lotus 1-2-3 v4 and 97/9.x formats
add 6c60f5c added suffix to Lotus samples from Lotus FTP site
add c5f0de1 removed old entry for .123 extension
add 920f682 fixed indentation
add 5e6b2c6 created entry for application/vnd.lotus-1-2-3
add be9d440 commented out glob pattern for wks because it creates a conflict
add 6844d90 added samples for .wb1, .wb2 and .wks (all created using Quattro Pro 6.0 running under Windows 3.11 VM)
add 6744383 added mimetype definition for WordPerfect 4.2, based on Philip Storry's trID magic pattern
add 8c7c760 added WordPerfect 4.2 sample file (made with WordPerfect 6.1 for Windows running on VM with Windows 3.11)
add 43b84c2 Merge branch 'TIKA-2468' of https://github.com/bitsgalore/tika
add 7bd88f6 TIKA-2677 -- fix multithreaded updating/access to MediaTypeRegistry, via Yuriy Koval
add b688afa TIKA-2673 -- unit tests for stricter adherence to spec via Gerard Bouchar
add 5eec28a TIKA-2673 -- unit tests for stricter adherence to spec via Gerard Bouchar -- fix illegal getBytes()...mea culpa...
add 469d28a Bumped PDFBox to 2.0.11
add dc97a0c TIKA-2682 -- update jempbox to 1.8.15
add 2c75ea1 TIKA-2669 -- pdf and tesseract config set in a tika-config.xml file on server start up are always overwritten to DefaultConfig in tika-server
add c9a81a4 TIKA-2675 -- OpenDocumentParser should fail on invalid zip via Sebastian Nagel and PR-240.
add 66417f6 improve htmlparser
add 790c124 TIKA-2673 -- add StrictHtmlEncodingDetector, contributed by Gerard Bouchar
add e7b481d Merge branch 'master' of github.com:/apache/tika
add af3c37e update to dl4j-1.0.0-SNAPSHOT + refactoring
add f0bb499 Change tika-dockers release to 0.2
add 8c0280c Merge branch 'TIKA-2672' of https://github.com/ThejanW/tika into TIKA-2672
add 1e0d454 remove javacpp + fix jna dependency conflict
add 60d0f6d Merge branch 'TIKA-2672' of https://github.com/ThejanW/tika into TIKA-2672
No new revisions were added by this update.
Summary of changes:
NOTICE.txt | 4 +
.../java/org/apache/tika/io/TikaInputStream.java | 11 +-
.../org/apache/tika/mime/MediaTypeRegistry.java | 5 +-
.../main/java/org/apache/tika/mime/MimeTypes.java | 5 +-
.../org/apache/tika/mime/tika-mimetypes.xml | 128 +++++-
.../org/apache/tika/mime/MimeTypesReaderTest.java | 22 +
tika-dl/pom.xml | 73 +--
.../tika/dl/imagerec/DL4JInceptionV3Net.java | 85 ++--
.../org/apache/tika/dl/imagerec/DL4JVGG16Net.java | 31 +-
.../imagerec/imagenet_incpetionv3_class_index.json | 1 -
.../apache/tika/dl/imagerec/inceptionv3-model.json | 1 -
.../tika/dl/imagerec/DL4JInceptionV3NetTest.java | 3 -
.../apache/tika/dl/imagerec/DL4JVGG16NetTest.java | 1 -
.../tika/dl/imagerec/dl4j-inception3-config.xml | 3 +-
tika-parsers/pom.xml | 18 +-
.../tika/parser/html/HtmlEncodingDetector.java | 8 +-
.../org/apache/tika/parser/html/HtmlParser.java | 25 ++
.../parser/html/StrictHtmlEncodingDetector.java | 491 +++++++++++++++++++++
.../apache/tika/parser/odf/OpenDocumentParser.java | 7 +-
.../tika/parser/html/whatwg-encoding-labels.tsv | 234 ++++++++++
.../tika/parser/html/HtmlEncodingDetectorTest.java | 142 ++++++
.../html/StrictHtmlEncodingDetectorTest.java | 300 +++++++++++++
.../org/apache/tika/parser/odf/ODFParserTest.java | 23 +
.../test-documents/testLotus123-lotusftp.123 | Bin 0 -> 18768 bytes
.../test-documents/testLotus123-lotusftp.wk4 | Bin 0 -> 6168 bytes
.../test/resources/test-documents/testLotus123.wk1 | Bin 0 -> 24291 bytes
.../test/resources/test-documents/testLotus123.wk3 | Bin 0 -> 18635 bytes
.../test/resources/test-documents/testLotus123.wks | Bin 0 -> 852 bytes
.../test-documents/testODTnotaZipFile.odt | 1 +
.../test/resources/test-documents/testQuattro.wb1 | Bin 0 -> 4813 bytes
.../test/resources/test-documents/testQuattro.wb2 | Bin 0 -> 4804 bytes
.../test/resources/test-documents/testQuattro.wq1 | Bin 0 -> 18687 bytes
.../test/resources/test-documents/testQuattro.wq2 | Bin 0 -> 7938 bytes
.../test-documents/testWordPerfect_42.doc | Bin 0 -> 725 bytes
.../apache/tika/server/resource/TikaResource.java | 17 +-
.../java/org/apache/tika/server/CXFTestBase.java | 4 +-
.../org/apache/tika/server/TikaParsersTest.java | 69 +--
.../org/apache/tika/server/TikaResourceTest.java | 38 ++
.../tika/server/tika-config-for-server-tests.xml | 5 +-
.../src/test/resources}/testPDFTwoTextBoxes.pdf | Bin
40 files changed, 1559 insertions(+), 196 deletions(-)
delete mode 100644 tika-dl/src/main/resources/org/apache/tika/dl/imagerec/imagenet_incpetionv3_class_index.json
delete mode 100644 tika-dl/src/main/resources/org/apache/tika/dl/imagerec/inceptionv3-model.json
create mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
create mode 100644 tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv
create mode 100644 tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
create mode 100644 tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java
create mode 100644 tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.123
create mode 100644 tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.wk4
create mode 100644 tika-parsers/src/test/resources/test-documents/testLotus123.wk1
create mode 100644 tika-parsers/src/test/resources/test-documents/testLotus123.wk3
create mode 100644 tika-parsers/src/test/resources/test-documents/testLotus123.wks
create mode 100644 tika-parsers/src/test/resources/test-documents/testODTnotaZipFile.odt
create mode 100644 tika-parsers/src/test/resources/test-documents/testQuattro.wb1
create mode 100644 tika-parsers/src/test/resources/test-documents/testQuattro.wb2
create mode 100644 tika-parsers/src/test/resources/test-documents/testQuattro.wq1
create mode 100644 tika-parsers/src/test/resources/test-documents/testQuattro.wq2
create mode 100644 tika-parsers/src/test/resources/test-documents/testWordPerfect_42.doc
copy tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml => tika-server/src/test/resources/org/apache/tika/server/tika-config-for-server-tests.xml (86%)
copy {tika-parsers/src/test/resources/test-documents => tika-server/src/test/resources}/testPDFTwoTextBoxes.pdf (100%)