You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2018/07/09 17:25:00 UTC

[tika] branch TIKA-2672 updated (bfacd29 -> 60d0f6d)

This is an automated email from the ASF dual-hosted git repository.

mattmann pushed a change to branch TIKA-2672
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from bfacd29  TIKA-2672 -- create dev branch for upgrading dl4j to 1.0.0-beta
     add 7914318  added version-specific mime-type definitions for Quattro for DOS and Windows
     add b5c07d5  added missing glob patterns to old Quattro Pro definition
     add 1fd441e  deleted existing entry for application/x-123 b/c of collisions with Quattro Pro magic
     add e6d332d  added version-specific mime-type definitions for Lotus 1-2-3
     add 85d25cc  removed glob pattern for .wks to avoid conflict with vnd.ms-works
     add c8deb97  modified quattro pro version fields so they are consistent with wb3 and qpw output
     add 73d37e6  added test documents for wk1, wk3, wq1 and wq2 formats
     add 83832cb  added reference to PRONOM / TNA and the Open Government License to NOTICE.txt
     add 817f848  added mimetype definition for Lotus 1-2-3 97/9.x
     add b2a0b78  added test documents for Lotus 1-2-3 v4 and 97/9.x formats
     add 6c60f5c  added suffix to Lotus samples from Lotus FTP site
     add c5f0de1  removed old entry for .123 extension
     add 920f682  fixed indentation
     add 5e6b2c6  created entry for application/vnd.lotus-1-2-3
     add be9d440  commented out glob pattern for wks because it creates a conflict
     add 6844d90  added samples for .wb1, .wb2 and .wks (all created using Quattro Pro 6.0 running under Windows 3.11 VM)
     add 6744383  added mimetype definition for WordPerfect 4.2, based on Philip Storry's trID magic pattern
     add 8c7c760  added WordPerfect 4.2 sample file (made with WordPerfect 6.1 for Windows running on VM with Windows 3.11)
     add 43b84c2  Merge branch 'TIKA-2468' of https://github.com/bitsgalore/tika
     add 7bd88f6  TIKA-2677 -- fix multithreaded updating/access to MediaTypeRegistry, via Yuriy Koval
     add b688afa  TIKA-2673 -- unit tests for stricter adherence to spec via Gerard Bouchar
     add 5eec28a  TIKA-2673 -- unit tests for stricter adherence to spec via Gerard Bouchar -- fix illegal getBytes()...mea culpa...
     add 469d28a  Bumped PDFBox to 2.0.11
     add dc97a0c  TIKA-2682 -- update jempbox to 1.8.15
     add 2c75ea1  TIKA-2669 -- pdf and tesseract config set in a tika-config.xml file on server start up are always overwritten to DefaultConfig in tika-server
     add c9a81a4  TIKA-2675 -- OpenDocumentParser should fail on invalid zip via Sebastian Nagel and PR-240.
     add 66417f6  improve htmlparser
     add 790c124  TIKA-2673 -- add StrictHtmlEncodingDetector, contributed by Gerard Bouchar
     add e7b481d  Merge branch 'master' of github.com:/apache/tika
     add af3c37e  update to dl4j-1.0.0-SNAPSHOT + refactoring
     add f0bb499  Change tika-dockers release to 0.2
     add 8c0280c  Merge branch 'TIKA-2672' of https://github.com/ThejanW/tika into TIKA-2672
     add 1e0d454  remove javacpp + fix jna dependency conflict
     add 60d0f6d  Merge branch 'TIKA-2672' of https://github.com/ThejanW/tika into TIKA-2672

No new revisions were added by this update.

Summary of changes:
 NOTICE.txt                                         |   4 +
 .../java/org/apache/tika/io/TikaInputStream.java   |  11 +-
 .../org/apache/tika/mime/MediaTypeRegistry.java    |   5 +-
 .../main/java/org/apache/tika/mime/MimeTypes.java  |   5 +-
 .../org/apache/tika/mime/tika-mimetypes.xml        | 128 +++++-
 .../org/apache/tika/mime/MimeTypesReaderTest.java  |  22 +
 tika-dl/pom.xml                                    |  73 +--
 .../tika/dl/imagerec/DL4JInceptionV3Net.java       |  85 ++--
 .../org/apache/tika/dl/imagerec/DL4JVGG16Net.java  |  31 +-
 .../imagerec/imagenet_incpetionv3_class_index.json |   1 -
 .../apache/tika/dl/imagerec/inceptionv3-model.json |   1 -
 .../tika/dl/imagerec/DL4JInceptionV3NetTest.java   |   3 -
 .../apache/tika/dl/imagerec/DL4JVGG16NetTest.java  |   1 -
 .../tika/dl/imagerec/dl4j-inception3-config.xml    |   3 +-
 tika-parsers/pom.xml                               |  18 +-
 .../tika/parser/html/HtmlEncodingDetector.java     |   8 +-
 .../org/apache/tika/parser/html/HtmlParser.java    |  25 ++
 .../parser/html/StrictHtmlEncodingDetector.java    | 491 +++++++++++++++++++++
 .../apache/tika/parser/odf/OpenDocumentParser.java |   7 +-
 .../tika/parser/html/whatwg-encoding-labels.tsv    | 234 ++++++++++
 .../tika/parser/html/HtmlEncodingDetectorTest.java | 142 ++++++
 .../html/StrictHtmlEncodingDetectorTest.java       | 300 +++++++++++++
 .../org/apache/tika/parser/odf/ODFParserTest.java  |  23 +
 .../test-documents/testLotus123-lotusftp.123       | Bin 0 -> 18768 bytes
 .../test-documents/testLotus123-lotusftp.wk4       | Bin 0 -> 6168 bytes
 .../test/resources/test-documents/testLotus123.wk1 | Bin 0 -> 24291 bytes
 .../test/resources/test-documents/testLotus123.wk3 | Bin 0 -> 18635 bytes
 .../test/resources/test-documents/testLotus123.wks | Bin 0 -> 852 bytes
 .../test-documents/testODTnotaZipFile.odt          |   1 +
 .../test/resources/test-documents/testQuattro.wb1  | Bin 0 -> 4813 bytes
 .../test/resources/test-documents/testQuattro.wb2  | Bin 0 -> 4804 bytes
 .../test/resources/test-documents/testQuattro.wq1  | Bin 0 -> 18687 bytes
 .../test/resources/test-documents/testQuattro.wq2  | Bin 0 -> 7938 bytes
 .../test-documents/testWordPerfect_42.doc          | Bin 0 -> 725 bytes
 .../apache/tika/server/resource/TikaResource.java  |  17 +-
 .../java/org/apache/tika/server/CXFTestBase.java   |   4 +-
 .../org/apache/tika/server/TikaParsersTest.java    |  69 +--
 .../org/apache/tika/server/TikaResourceTest.java   |  38 ++
 .../tika/server/tika-config-for-server-tests.xml   |   5 +-
 .../src/test/resources}/testPDFTwoTextBoxes.pdf    | Bin
 40 files changed, 1559 insertions(+), 196 deletions(-)
 delete mode 100644 tika-dl/src/main/resources/org/apache/tika/dl/imagerec/imagenet_incpetionv3_class_index.json
 delete mode 100644 tika-dl/src/main/resources/org/apache/tika/dl/imagerec/inceptionv3-model.json
 create mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
 create mode 100644 tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv
 create mode 100644 tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
 create mode 100644 tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java
 create mode 100644 tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.123
 create mode 100644 tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.wk4
 create mode 100644 tika-parsers/src/test/resources/test-documents/testLotus123.wk1
 create mode 100644 tika-parsers/src/test/resources/test-documents/testLotus123.wk3
 create mode 100644 tika-parsers/src/test/resources/test-documents/testLotus123.wks
 create mode 100644 tika-parsers/src/test/resources/test-documents/testODTnotaZipFile.odt
 create mode 100644 tika-parsers/src/test/resources/test-documents/testQuattro.wb1
 create mode 100644 tika-parsers/src/test/resources/test-documents/testQuattro.wb2
 create mode 100644 tika-parsers/src/test/resources/test-documents/testQuattro.wq1
 create mode 100644 tika-parsers/src/test/resources/test-documents/testQuattro.wq2
 create mode 100644 tika-parsers/src/test/resources/test-documents/testWordPerfect_42.doc
 copy tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml => tika-server/src/test/resources/org/apache/tika/server/tika-config-for-server-tests.xml (86%)
 copy {tika-parsers/src/test/resources/test-documents => tika-server/src/test/resources}/testPDFTwoTextBoxes.pdf (100%)