You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by nd...@apache.org on 2024/03/29 07:30:51 UTC

(tika) branch TIKA-4181-grpc updated (322452021 -> 100ef9c30)

This is an automated email from the ASF dual-hosted git repository.

ndipiazza pushed a change to branch TIKA-4181-grpc
in repository https://gitbox.apache.org/repos/asf/tika.git


    from 322452021 code formatting
     add 4750b9644 Bump aws.version from 1.12.653 to 1.12.654
     add 3cc670965 Merge pull request #1579 from apache/dependabot/maven/aws.version-1.12.654
     add 3bd172b47 Bump com.google.cloud:google-cloud-storage from 2.32.1 to 2.33.0
     add 769ff7198 Merge pull request #1580 from apache/dependabot/maven/com.google.cloud-google-cloud-storage-2.33.0
     add 214ffcbc4 Bump it.unimi.dsi:fastutil from 8.5.12 to 8.5.13
     add 02c4ef297 Merge pull request #1581 from apache/dependabot/maven/it.unimi.dsi-fastutil-8.5.13
     add 255cc7936 TIKA-4166: update jackrabbit
     add 16e1bc9c8 TIKA-4193 -- add num common tokens to TikaEvalMetadataFilter (#1582)
     add 2e92b79dc Bump org.testcontainers:testcontainers-bom from 1.19.4 to 1.19.5
     add 7925830b8 Merge pull request #1586 from apache/dependabot/maven/org.testcontainers-testcontainers-bom-1.19.5
     add bdc8fa1e2 Bump commons-codec:commons-codec from 1.16.0 to 1.16.1
     add 51605eeb4 Merge pull request #1585 from apache/dependabot/maven/commons-codec-commons-codec-1.16.1
     add cb572342b Bump org.apache.solr:solr-solrj from 8.11.2 to 8.11.3
     add d461ffc61 Merge pull request #1584 from apache/dependabot/maven/org.apache.solr-solr-solrj-8.11.3
     add 1cd7a7b3c Bump aws.version from 1.12.654 to 1.12.655
     add 87d3178b5 Merge pull request #1583 from apache/dependabot/maven/aws.version-1.12.655
     add 7abd05d99 TIKA-4166: update jwarc
     add 7d48d00ac TIKA-4188 (#1587)
     add eeee9ce49 Bump aws.version from 1.12.655 to 1.12.656
     add 914740907 Merge pull request #1588 from apache/dependabot/maven/aws.version-1.12.656
     add 7c758c31e TIKA-4196 -- add a bom EncodingDetector (#1590)
     add c2acd713b [TIKA-4194] Fix for unrecognized pkcs12 keystores (#1589)
     add 455409bf8 TIKA-4195 -- jsoup parser shouldn't conceal backoff to default encoding (#1591)
     add fb6ba1a33 TIKA-4191 -- reduce tika-core's scope to "provided" where possible (#1575)
     add 71f821aec Bump aws.version from 1.12.656 to 1.12.657
     add 3612085cd Merge pull request #1592 from apache/dependabot/maven/aws.version-1.12.657
     add 47bba6baa Bump aws.version from 1.12.657 to 1.12.658
     add 28d66009b Merge pull request #1595 from apache/dependabot/maven/aws.version-1.12.658
     add cdcea4bd0 Bump io.netty:netty-bom from 4.1.106.Final to 4.1.107.Final
     add 4ac0d7b5b Merge pull request #1593 from apache/dependabot/maven/io.netty-netty-bom-4.1.107.Final
     add 65114ae25 Bump org.netpreserve:jwarc from 0.28.6 to 0.29.0
     add 1de440ecb Merge pull request #1594 from apache/dependabot/maven/org.netpreserve-jwarc-0.29.0
     add fca564f50 Bump aws.version from 1.12.658 to 1.12.659
     add 6f96ba85a Merge pull request #1597 from apache/dependabot/maven/aws.version-1.12.659
     add 5d5d472ed Bump com.google.cloud:google-cloud-storage from 2.33.0 to 2.34.0
     add 5d94627a3 Merge pull request #1596 from apache/dependabot/maven/com.google.cloud-google-cloud-storage-2.34.0
     add cf618576b Bump aws.version from 1.12.659 to 1.12.660
     add 74b358968 Merge pull request #1599 from apache/dependabot/maven/aws.version-1.12.660
     add ebbc7e564 Bump org.springframework:spring-context from 5.3.31 to 5.3.32
     add 94c3cb4d4 Merge pull request #1598 from apache/dependabot/maven/org.springframework-spring-context-5.3.32
     add 8dab69e2c Bump com.google.protobuf:protobuf-java from 3.25.2 to 3.25.3
     add d35ec8fe5 Merge pull request #1601 from apache/dependabot/maven/com.google.protobuf-protobuf-java-3.25.3
     add 70edaa5df TIKA-4166: add comments
     add ee624917c Bump aws.version from 1.12.660 to 1.12.661
     add 9de08df05 Merge pull request #1602 from apache/dependabot/maven/aws.version-1.12.661
     add a3d535030 Bump logback.version from 1.4.14 to 1.5.0
     add 1fe8ffe8f Merge pull request #1603 from apache/dependabot/maven/logback.version-1.5.0
     add 0c5b3f9ee Bump aws.version from 1.12.661 to 1.12.662
     add b37a5dca2 Merge pull request #1606 from apache/dependabot/maven/aws.version-1.12.662
     add e4b23d811 Bump org.apache.commons:commons-compress from 1.25.0 to 1.26.0
     add f4f65eca3 replace deprecated
     add 8623c9261 restore license URL
     add 363a20316 try larger buffer size
     add 3648dc3c3 try larger buffer
     add 123344819 download archive element to avoid trouble with commons-compress 1.26.0
     add 8cf526b56 adjust failing test
     add f8fec9c7e add test output to help with future debugging
     add 12a27e265 revert (wrong file)
     add c9f612ecf add code to help with future debugging
     add 9690bd716 add TODO
     add fb3f21386 Merge pull request #1605 from apache/dependabot/maven/org.apache.commons-commons-compress-1.26.0
     add e5d57528d TIKA-4199: complete delegate class
     add 4c3625fb4 TIKA-4198 -- create separate geopkg parser to skip some blob columns (#1607)
     add a3a830359 TIKA-4199: complete delegate class
     add fd44840a1 TIKA-4201 -- add hard limit to IWorkPackageParser's detect (#1608)
     add a305ab772 TIKA-4199: replace deprecated
     add 3211c0e0b Bump lucene.version from 9.9.2 to 9.10.0
     add 069136300 Merge pull request #1609 from apache/dependabot/maven/lucene.version-9.10.0
     add bcfa89b93 Bump org.apache.maven.plugins:maven-shade-plugin from 3.5.1 to 3.5.2
     add 0ff5d9f7e Merge pull request #1611 from apache/dependabot/maven/org.apache.maven.plugins-maven-shade-plugin-3.5.2
     add d775b4ba3 Bump aws.version from 1.12.662 to 1.12.663
     add c3fb11de5 Merge pull request #1610 from apache/dependabot/maven/aws.version-1.12.663
     add a31b5d09e Bump org.codehaus.mojo:exec-maven-plugin from 3.1.1 to 3.2.0
     add fde6c41b4 Merge pull request #1612 from apache/dependabot/maven/org.codehaus.mojo-exec-maven-plugin-3.2.0
     add fdacb4ae7 Bump org.testcontainers:testcontainers-bom from 1.19.5 to 1.19.6
     add 3c1303a8d Merge pull request #1616 from apache/dependabot/maven/org.testcontainers-testcontainers-bom-1.19.6
     add 979c1fe9b Bump org.scala-lang:scala-reflect from 2.13.12 to 2.13.13
     add d9293f205 Merge pull request #1615 from apache/dependabot/maven/org.scala-lang-scala-reflect-2.13.13
     add 53abf1e59 Bump aws.version from 1.12.663 to 1.12.664
     add 675f9b0c0 Merge pull request #1614 from apache/dependabot/maven/aws.version-1.12.664
     add 4669b9be9 Bump log4j2.version from 2.22.1 to 2.23.0
     add d787ca9e3 Merge pull request #1613 from apache/dependabot/maven/log4j2.version-2.23.0
     add 697f5caab Bump aws.version from 1.12.664 to 1.12.665
     add da2f49902 Merge pull request #1617 from apache/dependabot/maven/aws.version-1.12.665
     add 371df1f1a Bump jackrabbit.version from 2.21.23 to 2.21.24
     add 2596d9a3a Merge pull request #1618 from apache/dependabot/maven/jackrabbit.version-2.21.24
     add af4a2449b Bump com.mchange:mchange-commons-java from 0.2.20 to 0.3.0
     add 64b1ba499 Merge pull request #1620 from apache/dependabot/maven/com.mchange-mchange-commons-java-0.3.0
     add 72927ec17 TIKA-4202 -- add ocr page count to metadata for PDFs (#1621)
     add b679545b9 TIKA-4166: update aws, azure, mime4j
     add 297fcd187 TIKA-4203: add @Deprecated annotation
     add bfce47310 TIKA-4203: add @Deprecated / @Override annotation
     add f37fcad37 TIKA-4203: add @Deprecated / @Override annotation
     add c4d445bc4 Bump aws.version from 1.12.666 to 1.12.667
     add 367cda4b3 Merge pull request #1622 from apache/dependabot/maven/aws.version-1.12.667
     add fb0799166 Bump org.apache.kafka:kafka-clients from 3.6.1 to 3.7.0
     add 8db7b35d7 Merge pull request #1625 from apache/dependabot/maven/org.apache.kafka-kafka-clients-3.7.0
     add 58621d7a4 Bump jackrabbit.version from 2.21.24 to 2.21.25
     add 15afed3ee Merge pull request #1624 from apache/dependabot/maven/jackrabbit.version-2.21.25
     add 574dd7670 Bump com.fasterxml.woodstox:woodstox-core from 6.6.0 to 6.6.1
     add cd6357586 Merge pull request #1623 from apache/dependabot/maven/com.fasterxml.woodstox-woodstox-core-6.6.1
     add fb387c627 Bump aws.version from 1.12.667 to 1.12.668
     add b421acbae Merge pull request #1627 from apache/dependabot/maven/aws.version-1.12.668
     add 1d9b9e5cc Bump com.qmino:miredot-plugin from 2.4.3-Java11 to 2.4.4-Java11
     add 45bcf2b5c Merge pull request #1628 from apache/dependabot/maven/com.qmino-miredot-plugin-2.4.4-Java11
     add eefe884c8 TIKA-4204 -- improve lookup of dataspace/storage items
     add 1c1018950 TIKA-4204 -- improve lookup of dataspace/storage items -- fix checkstyle
     add 36a0dca43 TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler (#1629)
     add 2bc0f9bdc TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked (#1630)
     add 0be76cf28 Bump logback.version from 1.5.0 to 1.5.1
     add 4a5a21ea1 Merge pull request #1632 from apache/dependabot/maven/logback.version-1.5.1
     add 8ab8673ce Bump aws.version from 1.12.668 to 1.12.669
     add 386a5934a Merge pull request #1631 from apache/dependabot/maven/aws.version-1.12.669
     add 215b75b67 TIKA-4166: update puppycrawl
     add b3e4252b2 Bump aws.version from 1.12.669 to 1.12.670
     add 1f9e773e8 Merge pull request #1634 from apache/dependabot/maven/aws.version-1.12.670
     add 6b726fbe5 Bump jakarta.activation:jakarta.activation-api from 2.1.2 to 2.1.3
     add 6a0a59d42 Merge pull request #1635 from apache/dependabot/maven/jakarta.activation-jakarta.activation-api-2.1.3
     add ffc7df20f TIKA-4166: update aws, azure, mockito
     add b5023198b Bump logback.version from 1.5.1 to 1.5.2
     add 86d1e897e Merge pull request #1637 from apache/dependabot/maven/logback.version-1.5.2
     add 1a5f23ff4 Bump aws.version from 1.12.671 to 1.12.672
     add e3bb8cfea Merge pull request #1638 from apache/dependabot/maven/aws.version-1.12.672
     add c8097b6ad Bump logback.version from 1.5.2 to 1.5.3
     add dc612a7b5 Merge pull request #1639 from apache/dependabot/maven/logback.version-1.5.3
     add 32ef34ff4 TIKA-4199: add comment, print to stderr
     add 64c083d12 Bump aws.version from 1.12.672 to 1.12.673
     add 2f6e4cd30 Merge pull request #1640 from apache/dependabot/maven/aws.version-1.12.673
     add 36664ef41 Bump com.google.cloud:google-cloud-storage from 2.34.0 to 2.35.0
     add 26c33d46c Merge pull request #1641 from apache/dependabot/maven/com.google.cloud-google-cloud-storage-2.35.0
     add 6cf215017 Bump org.testcontainers:testcontainers-bom from 1.19.6 to 1.19.7
     add 8b3230dff Merge pull request #1642 from apache/dependabot/maven/org.testcontainers-testcontainers-bom-1.19.7
     add 5221d8874 Bump aws.version from 1.12.673 to 1.12.674
     add 43a4e58cc Merge pull request #1643 from apache/dependabot/maven/aws.version-1.12.674
     add b7c5d48ce Bump aws.version from 1.12.674 to 1.12.675
     add 79b194a69 Merge pull request #1644 from apache/dependabot/maven/aws.version-1.12.675
     add a89e9779f Bump jakarta.xml.bind:jakarta.xml.bind-api from 4.0.1 to 4.0.2
     add 4af4be5be Merge pull request #1645 from apache/dependabot/maven/jakarta.xml.bind-jakarta.xml.bind-api-4.0.2
     add 8b398201a TIKA-4199: revert "complete delegate class", field "in" is a dummy; remove workaround for commons-compress 1.26
     add 5b259d60a TIKA-4199: adjust test results now that commons compress bug has been fixed
     add 4d6acfc10 TIKA-4199: update commons-compress
     add 1dd99bf45 TIKA-4166: update aws
     add 5f4e380ff TIKA-4166: update jaxb
     add d477bfd3b TIKA-4166: revert jaxb update
     add 0f077da2a TIKA-4166: update jaxb and prevent convergence problem
     add f0b76e503 Bump com.googlecode.plist:dd-plist from 1.27 to 1.28
     add da3f8c970 Merge pull request #1649 from apache/dependabot/maven/com.googlecode.plist-dd-plist-1.28
     add 67790a364 Bump org.apache.maven.plugins:maven-assembly-plugin from 3.6.0 to 3.7.0
     add 418258161 Merge pull request #1646 from apache/dependabot/maven/org.apache.maven.plugins-maven-assembly-plugin-3.7.0
     add bc2167a30 Bump log4j2.version from 2.23.0 to 2.23.1
     add 17caf585d Merge pull request #1648 from apache/dependabot/maven/log4j2.version-2.23.1
     add b980d9d86 Bump com.fasterxml.jackson:jackson-bom from 2.16.1 to 2.16.2
     add bdb6a4656 Merge pull request #1647 from apache/dependabot/maven/com.fasterxml.jackson-jackson-bom-2.16.2
     add 84f0a5b7f Bump aws.version from 1.12.676 to 1.12.677
     add 3a7bbc50d Merge pull request #1651 from apache/dependabot/maven/aws.version-1.12.677
     add 3ffadd5a3 Bump aws.version from 1.12.677 to 1.12.678
     add 49064dbe2 Merge pull request #1652 from apache/dependabot/maven/aws.version-1.12.678
     add e65d52cb5 Bump org.xerial:sqlite-jdbc from 3.45.1.0 to 3.45.2.0
     add 846f3a080 Merge pull request #1655 from apache/dependabot/maven/org.xerial-sqlite-jdbc-3.45.2.0
     add be7640d53 Bump com.fasterxml.jackson:jackson-bom from 2.16.2 to 2.17.0
     add 7cd6ee86b Merge pull request #1653 from apache/dependabot/maven/com.fasterxml.jackson-jackson-bom-2.17.0
     add 23d26d770 Bump reactor.netty.version from 1.1.15 to 1.1.17
     add 18d9fd769 Merge pull request #1654 from apache/dependabot/maven/reactor.netty.version-1.1.17
     add 1d666ea04 Bump io.projectreactor:reactor-core from 3.6.2 to 3.6.4
     add 207594f9f Merge pull request #1656 from apache/dependabot/maven/io.projectreactor-reactor-core-3.6.4
     add 533e056bb TIKA-4166: update puppycrawl, cxf
     add 8d5c3578a Bump aws.version from 1.12.678 to 1.12.679
     add ef75d45aa Merge pull request #1658 from apache/dependabot/maven/aws.version-1.12.679
     add df573d07c Bump com.google.guava:guava from 33.0.0-jre to 33.1.0-jre
     add 290742590 Merge pull request #1657 from apache/dependabot/maven/com.google.guava-guava-33.1.0-jre
     add 91820226e TIKA-4166: update mime4j
     add 3ccfcb485 Bump pdfbox.version from 3.0.1 to 3.0.2
     add e9aa16994 Merge pull request #1660 from apache/dependabot/maven/pdfbox.version-3.0.2
     add 3c131e76a Bump org.springframework:spring-context from 5.3.32 to 5.3.33
     add d90a564ad Merge pull request #1662 from apache/dependabot/maven/org.springframework-spring-context-5.3.33
     add 6d02aa2ed Bump aws.version from 1.12.679 to 1.12.680
     add cf2073dda Merge pull request #1661 from apache/dependabot/maven/aws.version-1.12.680
     add 2ec57fb14 Bump aws.version from 1.12.680 to 1.12.681
     add 0a224b32d Merge pull request #1664 from apache/dependabot/maven/aws.version-1.12.681
     add c963c51da Bump com.google.cloud:google-cloud-storage from 2.35.0 to 2.36.0
     add 2e614b438 Merge pull request #1663 from apache/dependabot/maven/com.google.cloud-google-cloud-storage-2.36.0
     add 67d593c27 TIKA-4166: update puppycrawl
     add 7735eeb16 Bump aws.version from 1.12.681 to 1.12.682
     add f1b7f07b7 Merge pull request #1665 from apache/dependabot/maven/aws.version-1.12.682
     add 0a9f17c2d TIKA-4166: update zookeeper
     add fcdff7cf7 Bump org.apache.maven.plugins:maven-assembly-plugin from 3.7.0 to 3.7.1
     add b3c8c3e7e Merge pull request #1666 from apache/dependabot/maven/org.apache.maven.plugins-maven-assembly-plugin-3.7.1
     add 2fa9ab30c Bump org.apache.maven.plugins:maven-compiler-plugin
     add 0e166b0d1 Merge pull request #1667 from apache/dependabot/maven/org.apache.maven.plugins-maven-compiler-plugin-3.13.0
     add 880b34556 Bump aws.version from 1.12.682 to 1.12.683
     add eac6f090b Merge pull request #1668 from apache/dependabot/maven/aws.version-1.12.683
     add 9ea184af5 Bump aws.version from 1.12.683 to 1.12.684
     add 96fd5fd6c Merge pull request #1671 from apache/dependabot/maven/aws.version-1.12.684
     add e63730e12 TIKA-4213 -- improve jdbc pipes reporter (#1669)
     add 7dc3d28a5 TIKA-4211 -- first attempt (#1670)
     add 85d713a9a TIKA-4215 -- avoid loading all the tika resources just to get the version (#1672)
     add 237e73f18 TIKA-4216 (#1673)
     add 08727d522 TIKA-4217 -- require new line or white space as part of bitmap magic (#1674)
     add fd23e6c27 Bump io.netty:netty-bom from 4.1.107.Final to 4.1.108.Final
     add d600259c5 Merge pull request #1677 from apache/dependabot/maven/io.netty-netty-bom-4.1.108.Final
     add 8e27e31a6 Bump com.google.cloud:google-cloud-storage from 2.36.0 to 2.36.1
     add a954511bd Merge pull request #1676 from apache/dependabot/maven/com.google.cloud-google-cloud-storage-2.36.1
     add a01e3edb4 Bump aws.version from 1.12.684 to 1.12.685
     add daad9b2b1 Merge pull request #1675 from apache/dependabot/maven/aws.version-1.12.685
     add 33ac40ccf TIKA-4166: update azure-storage-blob
     add f3f8404dd Bump commons-logging:commons-logging from 1.3.0 to 1.3.1
     add 449f8d192 Merge pull request #1683 from apache/dependabot/maven/commons-logging-commons-logging-1.3.1
     add fce53f9df Bump aws.version from 1.12.685 to 1.12.686
     add 27f1d87e5 Merge pull request #1682 from apache/dependabot/maven/aws.version-1.12.686
     add ba51ff3b6 Bump de.thetaphi:forbiddenapis from 3.6 to 3.7
     add 39b5c8a7b Merge pull request #1681 from apache/dependabot/maven/de.thetaphi-forbiddenapis-3.7
     add c51ab337d Bump org.ow2.asm:asm from 9.6 to 9.7
     add 40bf35574 Merge pull request #1680 from apache/dependabot/maven/org.ow2.asm-asm-9.7
     add b9ab4813e TIKA-4171 -- fix regression when field names are missing in the XFAExtractor (#1679)
     add a559906db TIKA-4219 -- improve epub handling of encrypted non-text-containing items (#1684)
     add 36e3ba8cd TIKA-4225 -- add detection for amf (#1688)
     add 3ffbc04f7 TIKA-4224 -- add detection for 3mf (#1689)
     add c5693624c TIKA-4222 -- add openscad glob (#1690)
     add b6bfe78d9 Bump aws.version from 1.12.686 to 1.12.687
     add 035c18461 Merge pull request #1692 from apache/dependabot/maven/aws.version-1.12.687
     add 9d45b69da TIKA-4223 -- add detection of stl (#1691)
     add e88be05ad TIKA-4219 -- clean up...do not include font names in main package
     add afc05ee4b Bump com.fasterxml.woodstox:woodstox-core from 6.6.1 to 6.6.2
     add e5511a043 Merge pull request #1693 from apache/dependabot/maven/com.fasterxml.woodstox-woodstox-core-6.6.2
     add 25badd98b Bump aws.version from 1.12.687 to 1.12.688
     add 07f1f4f24 Merge pull request #1694 from apache/dependabot/maven/aws.version-1.12.688
     add 1fb5b2622 Bump aws.version from 1.12.688 to 1.12.689
     add 4f5dff9a1 Merge pull request #1696 from apache/dependabot/maven/aws.version-1.12.689
     add f8c6750c9 Bump com.github.luben:zstd-jni from 1.5.5-11 to 1.5.6-1
     add b1f8e430f Merge pull request #1697 from apache/dependabot/maven/com.github.luben-zstd-jni-1.5.6-1
     add 4fe731233 TIKA-4207: Add handling of embedded bytes to tika-pipes (#1699)
     add c36efa316 Bump aws.version from 1.12.689 to 1.12.690
     add 387d72392 Merge pull request #1700 from apache/dependabot/maven/aws.version-1.12.690
     add 81d193f56 Bump commons-io:commons-io from 2.15.1 to 2.16.0
     add 941f8f26c Merge pull request #1701 from apache/dependabot/maven/commons-io-commons-io-2.16.0
     new 100ef9c30 Merge branch 'main' of github.com:apache/tika into TIKA-4181-grpc

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |   2 +
 tika-app/pom.xml                                   |   1 +
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |   2 +-
 .../java/org/apache/tika/cli/TikaCLIAsyncTest.java |  89 +++++++
 .../test/java/org/apache/tika/cli/TikaCLITest.java |  59 +---
 tika-batch/pom.xml                                 |   3 +
 tika-core/src/main/java/org/apache/tika/Tika.java  |   4 +
 .../org/apache/tika/detect/AutoDetectReader.java   |  38 +--
 .../tika/detect/CompositeEncodingDetector.java     |   7 +
 .../AbstractEmbeddedDocumentBytesHandler.java      |  69 +++++
 .../tika/extractor/BasicEmbeddedBytesSelector.java |  77 ++++++
 .../BasicEmbeddedDocumentBytesHandler.java         |  58 ++++
 ...ctorFactory.java => EmbeddedBytesSelector.java} |  16 +-
 ...EmbeddedDocumentByteStoreExtractorFactory.java} |  23 +-
 .../EmbeddedDocumentBytesHandler.java}             |  12 +-
 .../ParsingEmbeddedDocumentExtractor.java          |  10 +-
 .../apache/tika/extractor/RUnpackExtractor.java    | 183 +++++++++++++
 .../tika/extractor/RUnpackExtractorFactory.java    | 111 ++++++++
 .../org/apache/tika/io/BoundedInputStream.java     |  31 ++-
 .../main/java/org/apache/tika/metadata/IPTC.java   |   8 +
 .../main/java/org/apache/tika/metadata/PDF.java    |   6 +
 .../apache/tika/metadata/TikaCoreProperties.java   |  20 ++
 .../main/java/org/apache/tika/mime/MimeTypes.java  |   4 +-
 .../org/apache/tika/parser/AbstractParser.java     |   1 +
 .../org/apache/tika/parser/AutoDetectParser.java   |  11 +-
 .../apache/tika/parser/AutoDetectParserConfig.java |   4 +-
 .../org/apache/tika/parser/ParserDecorator.java    |   1 +
 .../apache/tika/parser/RecursiveParserWrapper.java |   2 +
 .../parser/multiple/AbstractMultipleParser.java    |   1 +
 .../java/org/apache/tika/pipes/FetchEmitTuple.java |  52 +++-
 .../java/org/apache/tika/pipes/PipesServer.java    | 296 +++++++++++++++------
 .../extractor/EmbeddedDocumentBytesConfig.java     | 167 ++++++++++++
 .../EmittingEmbeddedDocumentBytesHandler.java      |  73 +++++
 .../org/apache/tika/mime/tika-mimetypes.xml        |  89 ++++++-
 .../java/org/apache/tika/TikaDetectionTest.java    |   2 +-
 .../tika/parser/AutoDetectParserConfigTest.java    |  72 +++++
 .../org/apache/tika/parser/mock/MockParser.java    |  26 +-
 .../org/apache/tika/pipes/PipesServerTest.java     | 120 ++++++++-
 ...rocessorTest.java => AsyncChaosMonkeyTest.java} |   2 +-
 .../config/TIKA-4207-embedded-bytes-config.xml     |  13 +-
 .../{TIKA-3941.xml => TIKA-4207-limit-bytes.xml}   |   4 +
 .../tika/pipes/{TIKA-3941.xml => TIKA-4207.xml}    |   0
 tika-eval/tika-eval-app/pom.xml                    |   7 +-
 .../org/apache/tika/eval/app/AbstractProfiler.java |  17 +-
 .../org/apache/tika/eval/app/ExtractProfiler.java  |   4 +
 .../java/org/apache/tika/eval/app/db/Cols.java     |   3 +
 tika-eval/tika-eval-core/pom.xml                   |   1 +
 .../eval/core/metadata/TikaEvalMetadataFilter.java |   4 +
 .../core/metadata/TikaEvalMetadataFilterTest.java  |   1 +
 tika-fuzzing/pom.xml                               |   1 +
 tika-java7/pom.xml                                 |   1 +
 tika-parent/pom.xml                                | 102 +++----
 .../apache/tika/parser/geopkg/GeoPkgDBParser.java  |  54 ++++
 .../GeoPkgParser.java}                             |  78 +++---
 .../GeoPkgTableReader.java}                        |  59 ++--
 .../tika/parser/sqlite3/SQLite3DBParser.java       |   2 +-
 .../tika/parser/sqlite3/SQLite3TableReader.java    |   2 +-
 .../services/org.apache.tika.parser.Parser         |   1 +
 .../tika-parsers-ml/tika-age-recogniser/pom.xml    |   2 +-
 .../tika/parser/iwork/IWorkPackageParser.java      |  47 ++--
 .../apache/tika/parser/html/HtmlParserTest.java    |   2 +-
 .../detect/microsoft/ooxml/OPCPackageDetector.java |  47 ++--
 .../apache/tika/parser/microsoft/WMFParser.java    |   3 +-
 .../tika/parser/microsoft/chm/ChmCommons.java      |  11 +-
 .../tika/parser/microsoft/chm/ChmExtractor.java    |   3 +-
 .../tika/parser/microsoft/chm/ChmPmgiHeader.java   |   2 +-
 .../ooxml/XSLFPowerPointExtractorDecorator.java    |   3 +-
 .../tika/parser/microsoft/chm/TestChmLzxState.java |   3 +-
 .../apache/tika/detect/ole/MiscOLEDetector.java    |   4 +-
 .../apache/tika/parser/epub/EncryptionParser.java  |  88 ------
 .../org/apache/tika/parser/epub/EpubParser.java    | 193 +++++++++++---
 .../apache/tika/parser/iptc/IptcAnpaParser.java    |   1 +
 .../apache/tika/parser/ocr/TesseractOCRParser.java |  20 +-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |   6 +
 .../org/apache/tika/parser/pdf/OCRPageCounter.java |  18 +-
 .../java/org/apache/tika/parser/pdf/PDFParser.java |   6 +
 .../org/apache/tika/parser/pdf/XFAExtractor.java   |   3 +
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  12 +-
 .../detect/gzip/GZipSpecializationDetector.java    |   4 +
 .../org/apache/tika/parser/pkg/PackageParser.java  |   7 +-
 .../org/apache/tika/parser/txt/BOMDetector.java    |  93 +++++++
 .../apache/tika/parser/txt/BOMDetectorTest.java    |  91 +++++++
 .../org/apache/tika/parser/txt/TXTParserTest.java  |   2 +
 .../org/apache/tika/parser/warc/WARCParser.java    |  14 +-
 .../apache/tika/parser/warc/WARCParserTest.java    |  31 ++-
 .../test/resources/test-documents/example.arc.gz   | Bin 0 -> 1027 bytes
 .../src/test/resources/test-documents/testARC.arc  |  50 ++++
 .../apache/tika/parser/xml/MetadataHandler.java    |   4 +
 .../tika/detect/TestContainerAwareDetector.java    |   5 +
 .../java/org/apache/tika/mime/TestMimeTypes.java   |   6 +
 .../tika/parser/RecursiveParserWrapperTest.java    |   5 +-
 .../parser/microsoft/ooxml/TruncatedOOXMLTest.java |   9 +
 .../tika/parser/ocr/TesseractOCRParserTest.java    |   9 +
 .../apache/tika/parser/pkg/Seven7ParserTest.java   |  12 +-
 .../resources/configs/tika-config-no-names.xml     |   2 +-
 .../resources/configs/tika-config-with-names.xml   |   2 +-
 .../src/test/resources/test-documents/test3mf.3mf  | Bin 0 -> 28243 bytes
 .../resources/test-documents/testSTL-ascii.stl     |  16 ++
 .../resources/test-documents/testSTL-binary.stl    | Bin 0 -> 160 bytes
 tika-pipes/tika-async-cli/pom.xml                  |   7 +
 .../apache/tika/async/cli/AsyncProcessorTest.java  | 140 ++++++++++
 .../apache/tika/async/cli/TikaAsyncCLITest.java    |   2 +-
 .../test/resources/configs/TIKA-4207-emitter.xml   |  15 +-
 .../resources/{ => configs}/tika-config-broken.xml |   0
 .../resources/test-documents/basic_embedded.xml    |   0
 tika-pipes/tika-pipes-iterators/pom.xml            |   1 +
 .../pom.xml                                        |  10 +-
 .../pipesiterator/json/JsonPipesIterator.java      |  65 +++++
 .../pipesiterator/json/TestJsonPipesIterator.java  |  85 ++++++
 .../test-documents/test-with-embedded-bytes.json   | 100 +++++++
 .../src/test/resources/test-documents/test.json    | 100 +++++++
 .../pipes/reporters/jdbc/JDBCPipesReporter.java    |  52 ++--
 .../metadata/serialization/JsonFetchEmitTuple.java |  71 ++++-
 .../serialization/JsonFetchEmitTupleTest.java      |  20 ++
 tika-server/tika-server-core/pom.xml               |  10 +-
 .../apache/tika/server/core/TikaServerProcess.java |   2 +-
 .../tika/server/core/resource/AsyncResource.java   |  32 ++-
 .../tika/server/core/resource/TikaResource.java    |   2 +-
 .../apache/tika/server/core/TikaVersionTest.java   |   2 +-
 .../apache/tika/server/core/TikaWelcomeTest.java   |   4 +-
 .../apache/tika/server/standard/TikaPipesTest.java |  93 +++++++
 tika-translate/pom.xml                             |   1 +
 tika-xmp/pom.xml                                   |   1 +
 123 files changed, 3146 insertions(+), 611 deletions(-)
 create mode 100644 tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
 create mode 100644 tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java
 create mode 100644 tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java
 create mode 100644 tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java
 copy tika-core/src/main/java/org/apache/tika/extractor/{EmbeddedDocumentExtractorFactory.java => EmbeddedBytesSelector.java} (74%)
 copy tika-core/src/main/java/org/apache/tika/extractor/{EmbeddedStreamTranslator.java => EmbeddedDocumentByteStoreExtractorFactory.java} (60%)
 copy tika-core/src/main/java/org/apache/tika/{pipes/emitter/StreamEmitter.java => extractor/EmbeddedDocumentBytesHandler.java} (72%)
 create mode 100644 tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
 create mode 100644 tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
 create mode 100644 tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
 create mode 100644 tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java
 create mode 100644 tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
 rename tika-core/src/test/java/org/apache/tika/pipes/async/{AsyncProcessorTest.java => AsyncChaosMonkeyTest.java} (99%)
 copy tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml => tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml (75%)
 copy tika-core/src/test/resources/org/apache/tika/pipes/{TIKA-3941.xml => TIKA-4207-limit-bytes.xml} (81%)
 copy tika-core/src/test/resources/org/apache/tika/pipes/{TIKA-3941.xml => TIKA-4207.xml} (100%)
 create mode 100644 tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java
 copy tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/{sqlite3/SQLite3Parser.java => geopkg/GeoPkgParser.java} (60%)
 copy tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/{sqlite3/SQLite3TableReader.java => geopkg/GeoPkgTableReader.java} (51%)
 delete mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
 copy tika-core/src/main/java/org/apache/tika/exception/TikaException.java => tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java (73%)
 create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/BOMDetector.java
 create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/BOMDetectorTest.java
 create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc.gz
 create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc
 create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/test3mf.3mf
 create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testSTL-ascii.stl
 create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testSTL-binary.stl
 create mode 100644 tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
 copy tika-core/src/test/resources/org/apache/tika/pipes/TIKA-3941.xml => tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4207-emitter.xml (74%)
 rename tika-pipes/tika-async-cli/src/test/resources/{ => configs}/tika-config-broken.xml (100%)
 copy {tika-core => tika-pipes/tika-async-cli}/src/test/resources/test-documents/basic_embedded.xml (100%)
 copy tika-pipes/tika-pipes-iterators/{tika-pipes-iterator-csv => tika-pipes-iterator-json}/pom.xml (94%)
 create mode 100644 tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIterator.java
 create mode 100644 tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java
 create mode 100644 tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/test/resources/test-documents/test-with-embedded-bytes.json
 create mode 100644 tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/test/resources/test-documents/test.json


(tika) 01/01: Merge branch 'main' of github.com:apache/tika into TIKA-4181-grpc

Posted by nd...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

ndipiazza pushed a commit to branch TIKA-4181-grpc
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 100ef9c3063e49106a2c7fbff4942bbe9edc7042
Merge: 322452021 941f8f26c
Author: Nicholas DiPiazza <nd...@apache.org>
AuthorDate: Fri Mar 29 02:30:30 2024 -0500

    Merge branch 'main' of github.com:apache/tika into TIKA-4181-grpc

 CHANGES.txt                                        |   2 +
 tika-app/pom.xml                                   |   1 +
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |   2 +-
 .../java/org/apache/tika/cli/TikaCLIAsyncTest.java |  89 +++++++
 .../test/java/org/apache/tika/cli/TikaCLITest.java |  59 +---
 tika-batch/pom.xml                                 |   3 +
 tika-core/src/main/java/org/apache/tika/Tika.java  |   4 +
 .../org/apache/tika/detect/AutoDetectReader.java   |  38 +--
 .../tika/detect/CompositeEncodingDetector.java     |   7 +
 .../AbstractEmbeddedDocumentBytesHandler.java      |  69 +++++
 .../tika/extractor/BasicEmbeddedBytesSelector.java |  77 ++++++
 .../BasicEmbeddedDocumentBytesHandler.java         |  58 ++++
 .../tika/extractor/EmbeddedBytesSelector.java      |  31 +--
 .../EmbeddedDocumentByteStoreExtractorFactory.java |  36 +--
 .../extractor/EmbeddedDocumentBytesHandler.java    |  32 +--
 .../ParsingEmbeddedDocumentExtractor.java          |  10 +-
 .../apache/tika/extractor/RUnpackExtractor.java    | 183 +++++++++++++
 .../tika/extractor/RUnpackExtractorFactory.java    | 111 ++++++++
 .../org/apache/tika/io/BoundedInputStream.java     |  31 ++-
 .../main/java/org/apache/tika/metadata/IPTC.java   |   8 +
 .../main/java/org/apache/tika/metadata/PDF.java    |   6 +
 .../apache/tika/metadata/TikaCoreProperties.java   |  20 ++
 .../main/java/org/apache/tika/mime/MimeTypes.java  |   4 +-
 .../org/apache/tika/parser/AbstractParser.java     |   1 +
 .../org/apache/tika/parser/AutoDetectParser.java   |  11 +-
 .../apache/tika/parser/AutoDetectParserConfig.java |   4 +-
 .../org/apache/tika/parser/ParserDecorator.java    |   1 +
 .../apache/tika/parser/RecursiveParserWrapper.java |   2 +
 .../parser/multiple/AbstractMultipleParser.java    |   1 +
 .../java/org/apache/tika/pipes/FetchEmitTuple.java |  52 +++-
 .../java/org/apache/tika/pipes/PipesServer.java    | 296 +++++++++++++++------
 .../extractor/EmbeddedDocumentBytesConfig.java     | 167 ++++++++++++
 .../EmittingEmbeddedDocumentBytesHandler.java      |  73 +++++
 .../org/apache/tika/mime/tika-mimetypes.xml        |  89 ++++++-
 .../java/org/apache/tika/TikaDetectionTest.java    |   2 +-
 .../tika/parser/AutoDetectParserConfigTest.java    |  72 +++++
 .../org/apache/tika/parser/mock/MockParser.java    |  26 +-
 .../org/apache/tika/pipes/PipesServerTest.java     | 120 ++++++++-
 ...rocessorTest.java => AsyncChaosMonkeyTest.java} |   2 +-
 .../config/TIKA-4207-embedded-bytes-config.xml     |  13 +-
 .../apache/tika/pipes/TIKA-4207-limit-bytes.xml    |  19 +-
 .../resources/org/apache/tika/pipes/TIKA-4207.xml  |  19 +-
 tika-eval/tika-eval-app/pom.xml                    |   7 +-
 .../org/apache/tika/eval/app/AbstractProfiler.java |  17 +-
 .../org/apache/tika/eval/app/ExtractProfiler.java  |   4 +
 .../java/org/apache/tika/eval/app/db/Cols.java     |   3 +
 tika-eval/tika-eval-core/pom.xml                   |   1 +
 .../eval/core/metadata/TikaEvalMetadataFilter.java |   4 +
 .../core/metadata/TikaEvalMetadataFilterTest.java  |   1 +
 tika-fuzzing/pom.xml                               |   1 +
 tika-java7/pom.xml                                 |   1 +
 tika-parent/pom.xml                                | 102 +++----
 .../apache/tika/parser/geopkg/GeoPkgDBParser.java  |  54 ++++
 .../apache/tika/parser/geopkg/GeoPkgParser.java    | 127 +++++++++
 .../GeoPkgTableReader.java}                        |  59 ++--
 .../tika/parser/sqlite3/SQLite3DBParser.java       |   2 +-
 .../tika/parser/sqlite3/SQLite3TableReader.java    |   2 +-
 .../services/org.apache.tika.parser.Parser         |   1 +
 .../tika-parsers-ml/tika-age-recogniser/pom.xml    |   2 +-
 .../tika/parser/iwork/IWorkPackageParser.java      |  47 ++--
 .../apache/tika/parser/html/HtmlParserTest.java    |   2 +-
 .../detect/microsoft/ooxml/OPCPackageDetector.java |  47 ++--
 .../apache/tika/parser/microsoft/WMFParser.java    |   3 +-
 .../tika/parser/microsoft/chm/ChmCommons.java      |  11 +-
 .../tika/parser/microsoft/chm/ChmExtractor.java    |   3 +-
 .../tika/parser/microsoft/chm/ChmPmgiHeader.java   |   2 +-
 .../ooxml/XSLFPowerPointExtractorDecorator.java    |   3 +-
 .../tika/parser/microsoft/chm/TestChmLzxState.java |   3 +-
 .../apache/tika/detect/ole/MiscOLEDetector.java    |   4 +-
 .../apache/tika/parser/epub/EncryptionParser.java  |  88 ------
 .../org/apache/tika/parser/epub/EpubParser.java    | 193 +++++++++++---
 .../apache/tika/parser/iptc/IptcAnpaParser.java    |   1 +
 .../apache/tika/parser/ocr/TesseractOCRParser.java |  20 +-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |   6 +
 .../org/apache/tika/parser/pdf/OCRPageCounter.java |  31 +--
 .../java/org/apache/tika/parser/pdf/PDFParser.java |   6 +
 .../org/apache/tika/parser/pdf/XFAExtractor.java   |   3 +
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  12 +-
 .../detect/gzip/GZipSpecializationDetector.java    |   4 +
 .../org/apache/tika/parser/pkg/PackageParser.java  |   7 +-
 .../org/apache/tika/parser/txt/BOMDetector.java    |  93 +++++++
 .../apache/tika/parser/txt/BOMDetectorTest.java    |  91 +++++++
 .../org/apache/tika/parser/txt/TXTParserTest.java  |   2 +
 .../org/apache/tika/parser/warc/WARCParser.java    |  14 +-
 .../apache/tika/parser/warc/WARCParserTest.java    |  31 ++-
 .../test/resources/test-documents/example.arc.gz   | Bin 0 -> 1027 bytes
 .../src/test/resources/test-documents/testARC.arc  |  50 ++++
 .../apache/tika/parser/xml/MetadataHandler.java    |   4 +
 .../tika/detect/TestContainerAwareDetector.java    |   5 +
 .../java/org/apache/tika/mime/TestMimeTypes.java   |   6 +
 .../tika/parser/RecursiveParserWrapperTest.java    |   5 +-
 .../parser/microsoft/ooxml/TruncatedOOXMLTest.java |   9 +
 .../tika/parser/ocr/TesseractOCRParserTest.java    |   9 +
 .../apache/tika/parser/pkg/Seven7ParserTest.java   |  12 +-
 .../resources/configs/tika-config-no-names.xml     |   2 +-
 .../resources/configs/tika-config-with-names.xml   |   2 +-
 .../src/test/resources/test-documents/test3mf.3mf  | Bin 0 -> 28243 bytes
 .../resources/test-documents/testSTL-ascii.stl     |  16 ++
 .../resources/test-documents/testSTL-binary.stl    | Bin 0 -> 160 bytes
 tika-pipes/tika-async-cli/pom.xml                  |   7 +
 .../apache/tika/async/cli/AsyncProcessorTest.java  | 140 ++++++++++
 .../apache/tika/async/cli/TikaAsyncCLITest.java    |   2 +-
 .../test/resources/configs/TIKA-4207-emitter.xml   |  28 +-
 .../resources/{ => configs}/tika-config-broken.xml |   0
 .../basic_embedded.xml}                            |  29 +-
 tika-pipes/tika-pipes-iterators/pom.xml            |   1 +
 .../tika-pipes-iterator-json}/pom.xml              |  43 ++-
 .../pipesiterator/json/JsonPipesIterator.java      |  65 +++++
 .../pipesiterator/json/TestJsonPipesIterator.java  |  85 ++++++
 .../test-documents/test-with-embedded-bytes.json   | 100 +++++++
 .../src/test/resources/test-documents/test.json    | 100 +++++++
 .../pipes/reporters/jdbc/JDBCPipesReporter.java    |  52 ++--
 .../metadata/serialization/JsonFetchEmitTuple.java |  71 ++++-
 .../serialization/JsonFetchEmitTupleTest.java      |  20 ++
 tika-server/tika-server-core/pom.xml               |  10 +-
 .../apache/tika/server/core/TikaServerProcess.java |   2 +-
 .../tika/server/core/resource/AsyncResource.java   |  32 ++-
 .../tika/server/core/resource/TikaResource.java    |   2 +-
 .../apache/tika/server/core/TikaVersionTest.java   |   2 +-
 .../apache/tika/server/core/TikaWelcomeTest.java   |   4 +-
 .../apache/tika/server/standard/TikaPipesTest.java |  93 +++++++
 tika-translate/pom.xml                             |   1 +
 tika-xmp/pom.xml                                   |   1 +
 123 files changed, 3290 insertions(+), 686 deletions(-)