You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/06/28 19:00:30 UTC

[tika] branch branch_1x updated (90c6ea4 -> 57f5912)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 90c6ea4  TIKA-3444 -- upgrade to pdfbox 2.0.24
     new 4ba5fd7  TIKA-3456 -- LanguageDetector should chunk long strings and test for hasEnoughText.
     new 57f5912  TIKA-3457 -- general upgrades for 1.27

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 tika-bundle/pom.xml                                | 17 ++++++++++-
 .../tika/language/detect/LanguageDetector.java     | 21 ++++++++++++--
 tika-dl/pom.xml                                    | 24 ++++++++++++++++
 tika-example/pom.xml                               |  6 ++--
 tika-parent/pom.xml                                | 33 +++++++++++-----------
 tika-parsers/pom.xml                               | 26 ++++++++---------
 tika-server/pom.xml                                |  2 +-
 7 files changed, 92 insertions(+), 37 deletions(-)

[tika] 01/02: TIKA-3456 -- LanguageDetector should chunk long strings and test for hasEnoughText.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4ba5fd7eb8b1a6ccc45fd773b73e6f809a652370
Author: tallison <ta...@apache.org>
AuthorDate: Mon Jun 28 12:15:33 2021 -0400

    TIKA-3456 -- LanguageDetector should chunk long strings and test for hasEnoughText.
---
 .../tika/language/detect/LanguageDetector.java      | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java
index 7be4e4f..514b7f7 100644
--- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java
@@ -50,6 +50,10 @@ public abstract class LanguageDetector {
 
 	private static final ServiceLoader DEFAULT_SERVICE_LOADER = new ServiceLoader();
 
+	//if a user calls detect on a huge string, break it into this size
+	//and add sequentially until hasEnoughText() is true
+	private static final int BUFFER_LENGTH = 4096;
+
 	// True if text is expected to be a mix of languages, and thus higher-resolution
 	// detection must be done to avoid under-sampling the text.
 	protected boolean mixedLanguages = false;
@@ -57,7 +61,7 @@ public abstract class LanguageDetector {
 	// True if the text is expected to be 'short' (typically less than 100 chars), and
 	// thus a different algorithm and/or set of profiles should be used.
 	protected boolean shortText = false;
-	
+
 	public static LanguageDetector getDefaultLanguageDetector() {
 		List<LanguageDetector> detectors = getLanguageDetectors();
 		if (detectors.isEmpty()) {
@@ -183,8 +187,19 @@ public abstract class LanguageDetector {
 	 * @param text Characters to add to current statistics.
 	 */
 	public void addText(CharSequence text) {
-		char[] chars = text.toString().toCharArray();
-		addText(chars, 0, chars.length);
+		int len = text.length();
+		if (len < BUFFER_LENGTH) {
+			char[] chars = text.toString().toCharArray();
+			addText(chars, 0, chars.length);
+			return;
+		}
+		int start = 0;
+		while (! hasEnoughText() && start < len) {
+			int end = Math.min(start + BUFFER_LENGTH, len);
+			char[] chars = text.subSequence(start, end).toString().toCharArray();
+			addText(chars, 0, chars.length);
+			start += BUFFER_LENGTH;
+		}
 	}
 
 	

[tika] 02/02: TIKA-3457 -- general upgrades for 1.27

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 57f5912b5c46a53bd8e5a04bfd117142ac2034d4
Author: tallison <ta...@apache.org>
AuthorDate: Mon Jun 28 15:00:09 2021 -0400

    TIKA-3457 -- general upgrades for 1.27
---
 tika-bundle/pom.xml  | 17 ++++++++++++++++-
 tika-dl/pom.xml      | 24 ++++++++++++++++++++++++
 tika-example/pom.xml |  6 +++---
 tika-parent/pom.xml  | 33 +++++++++++++++++----------------
 tika-parsers/pom.xml | 26 +++++++++++++-------------
 tika-server/pom.xml  |  2 +-
 6 files changed, 74 insertions(+), 34 deletions(-)

diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index deaa2ee..c90cc85 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -296,7 +296,7 @@
               edu.stanford.nlp.*;resolution:=optional,
               edu.wisc.ssec.mcidas;resolution:=optional,
               edu.wisc.ssec.mcidas.adde;resolution:=optional,
-	          edu.usc.irds.agepredictor.spark.authorage;resolution:=optional,
+  	          edu.usc.irds.agepredictor.spark.authorage;resolution:=optional,
               javax.activation;resolution:=optional,
               javax.annotation;resolution:=optional,
               javax.mail;resolution:=optional,
@@ -388,6 +388,20 @@
               org.apache.xml.security.utils;resolution:=optional,
               org.apache.xmlbeans.impl.xpath.saxon;resolution:=optional,
               org.apache.xmlbeans.impl.xquery.saxon;resolution:=optional,
+              org.bouncycastle.asn1.bsi;resolution:=optional,
+              org.bouncycastle.asn1.cmp;resolution:=optional,
+              org.bouncycastle.asn1.cms;resolution:=optional,
+              org.bouncycastle.asn1.cms.ecc;resolution:=optional,
+              org.bouncycastle.asn1.crmf;resolution:=optional,
+              org.bouncycastle.asn1.cryptlib;resolution:=optional,
+              org.bouncycastle.asn1.cryptopro;resolution:=optional,
+              org.bouncycastle.asn1.dvcs;resolution:=optional,
+              org.bouncycastle.asn1.eac;resolution:=optional,
+              org.bouncycastle.asn1.ess;resolution:=optional,
+              org.bouncycastle.asn1.est;resolution:=optional,
+              org.bouncycastle.asn1.sec;resolution:=optional,
+              org.bouncycastle.asn1.smime;resolution:=optional,
+              org.bouncycastle.asn1.tsp;resolution:=optional,
               org.bouncycastle.cert;resolution:=optional,
               org.bouncycastle.cert.jcajce;resolution:=optional,
               org.bouncycastle.cert.ocsp;resolution:=optional,
@@ -433,6 +447,7 @@
               com.microsoft.schemas.office.word;resolution:=optional,
               sun.java2d.cmm.kcms;resolution:=optional,
               sun.misc;resolution:=optional,
+              sun.nio.ch;resolution:=optional,
               ucar.units;resolution:=optional,
               ucar.httpservices;resolution:=optional,
               ucar.nc2.util;resolution:=optional,
diff --git a/tika-dl/pom.xml b/tika-dl/pom.xml
index 6c741c1..d0e0b18 100644
--- a/tika-dl/pom.xml
+++ b/tika-dl/pom.xml
@@ -123,6 +123,10 @@
           <groupId>joda-time</groupId>
           <artifactId>joda-time</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>com.google.code.gson</groupId>
+          <artifactId>gson</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>
@@ -170,6 +174,14 @@
           <groupId>net.java.dev.jna</groupId>
           <artifactId>jna</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>com.google.code.gson</groupId>
+          <artifactId>gson</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>it.unimi.dsi</groupId>
+          <artifactId>fastutil</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>
@@ -277,6 +289,10 @@
           <groupId>commons-lang</groupId>
           <artifactId>commons-lang</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>it.unimi.dsi</groupId>
+          <artifactId>fastutil</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>
@@ -301,6 +317,14 @@
           <groupId>org.apache.commons</groupId>
           <artifactId>commons-math3</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.apache.commons</groupId>
+          <artifactId>commons-lang3</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.commons</groupId>
+          <artifactId>commons-collections4</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>
diff --git a/tika-example/pom.xml b/tika-example/pom.xml
index c34096f..1324ec5 100644
--- a/tika-example/pom.xml
+++ b/tika-example/pom.xml
@@ -100,7 +100,7 @@
     <dependency>
       <groupId>org.apache.jackrabbit</groupId>
       <artifactId>jackrabbit-jcr-server</artifactId>
-      <version>2.21.5</version>
+      <version>2.21.6</version>
       <exclusions>
         <exclusion>
           <groupId>org.apache.tika</groupId>
@@ -127,7 +127,7 @@
     <dependency>
       <groupId>org.apache.jackrabbit</groupId>
       <artifactId>jackrabbit-core</artifactId>
-      <version>2.21.5</version>
+      <version>2.21.6</version>
       <exclusions>
         <exclusion>
           <groupId>org.apache.tika</groupId>
@@ -156,7 +156,7 @@
     <dependency>
       <groupId>org.springframework</groupId>
       <artifactId>spring-context</artifactId>
-      <version>5.3.5</version>
+      <version>5.3.8</version>
       <exclusions>
         <exclusion>
           <groupId>commons-logging</groupId>
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index a767a26..1569910 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -299,9 +299,9 @@
             </dependency>
 
             <dependency>
-                <groupId>javax.annotation</groupId>
-                <artifactId>javax.annotation-api</artifactId>
-                <version>1.3.2</version>
+                <groupId>jakarta.annotation</groupId>
+                <artifactId>jakarta.annotation-api</artifactId>
+                <version>1.3.5</version>
             </dependency>
             <dependency>
                 <groupId>javax.xml.soap</groupId>
@@ -311,7 +311,7 @@
             <dependency>
                 <groupId>org.jvnet.staxex</groupId>
                 <artifactId>stax-ex</artifactId>
-                <version>2.0.0</version>
+                <version>2.0.1</version>
             </dependency>
         </dependencies>
     </dependencyManagement>
@@ -336,13 +336,13 @@
         <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
         <poi.version>4.1.2</poi.version>
         <commons.compress.version>1.20</commons.compress.version>
-        <commons.io.version>2.8.0</commons.io.version>
+        <commons.io.version>2.10.0</commons.io.version>
         <commons.lang3.version>3.12.0</commons.lang3.version>
-        <gson.version>2.8.6</gson.version>
+        <gson.version>2.8.7</gson.version>
         <guava.version>30.1.1-jre</guava.version>
         <osgi.core.version>6.0.0</osgi.core.version>
 
-    <cxf.version>3.4.3</cxf.version>
+    <cxf.version>3.4.4</cxf.version>
     <slf4j.version>1.7.30</slf4j.version>
     <log4j.version>1.2.17</log4j.version>
     <jackson.version>2.12.3</jackson.version>
@@ -350,10 +350,10 @@
          javax.activation dependency in tika-server.
          Until then, DO NOT go above 2.x unless you know what you're doing.
          See TIKA-3407 -->
-    <jaxb.version>2.3.3</jaxb.version>
+    <jaxb.version>2.3.4</jaxb.version>
     <cli.version>1.4</cli.version>
-    <lucene.version>8.8.1</lucene.version>
-    <mockito.version>3.8.0</mockito.version>
+    <lucene.version>8.9.0</lucene.version>
+    <mockito.version>3.11.2</mockito.version>
     <opennlp.version>1.9.3</opennlp.version>
   </properties>
 
@@ -585,16 +585,17 @@
                     </plugin>
                 </plugins>
             </build>
+            <!-- keep these at 2.x -->
             <dependencies>
                 <dependency>
-                    <groupId>javax.xml.bind</groupId>
-                    <artifactId>jaxb-api</artifactId>
-                    <version>2.3.1</version>
+                    <groupId>jakarta.xml.bind</groupId>
+                    <artifactId>jakarta.xml.bind-api</artifactId>
+                    <version>2.3.3</version>
                 </dependency>
                 <dependency>
-                    <groupId>javax.activation</groupId>
-                    <artifactId>activation</artifactId>
-                    <version>1.1.1</version>
+                    <groupId>jakarta.activation</groupId>
+                    <artifactId>jakarta.activation-api</artifactId>
+                    <version>1.2.2</version>
                 </dependency>
             </dependencies>
         </profile>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index e812bfc..b2b88ff 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -45,7 +45,7 @@
     <tukaani.version>1.9</tukaani.version>
     <!-- NOTE: sync brotli version with commons-compress in tika-parent-->
     <brotli.version>0.1.2</brotli.version>
-    <mime4j.version>0.8.3</mime4j.version>
+    <mime4j.version>0.8.4</mime4j.version>
     <vorbis.version>0.8</vorbis.version>
     <pdfbox.version>2.0.24</pdfbox.version>
     <jempbox.version>1.8.16</jempbox.version>
@@ -53,7 +53,7 @@
     <sis.version>1.0</sis.version>
     <parso.version>2.0.14</parso.version>
     <!-- used by POI, PDFBox and Jackcess ...try to sync -->
-    <bouncycastle.version>1.68</bouncycastle.version>
+    <bouncycastle.version>1.69</bouncycastle.version>
     <commonsexec.version>1.3</commonsexec.version>
     <httpcomponents.version>4.5.13</httpcomponents.version>
   </properties>
@@ -94,7 +94,7 @@
     <dependency>
       <groupId>com.fasterxml.woodstox</groupId>
       <artifactId>woodstox-core</artifactId>
-      <version>6.2.4</version>
+      <version>6.2.6</version>
     </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>
@@ -102,9 +102,9 @@
       <version>${commons.lang3.version}</version>
     </dependency>
     <dependency>
-      <groupId>javax.annotation</groupId>
-      <artifactId>javax.annotation-api</artifactId>
-      <version>1.3.2</version>
+      <groupId>jakarta.annotation</groupId>
+      <artifactId>jakarta.annotation-api</artifactId>
+      <version>1.3.5</version>
     </dependency>
 
     <!-- Externally Maintained Parsers -->
@@ -171,7 +171,7 @@
     <dependency>
       <groupId>com.github.luben</groupId>
       <artifactId>zstd-jni</artifactId>
-      <version>1.4.9-1</version>
+      <version>1.5.0-2</version>
       <scope>provided</scope>
     </dependency>
 
@@ -272,7 +272,7 @@
     <dependency>
       <groupId>com.healthmarketscience.jackcess</groupId>
       <artifactId>jackcess</artifactId>
-      <version>4.0.0</version>
+      <version>4.0.1</version>
       <exclusions>
         <exclusion>
           <groupId>org.apache.commons</groupId>
@@ -313,12 +313,12 @@
     <dependency>
       <groupId>org.ow2.asm</groupId>
       <artifactId>asm</artifactId>
-      <version>9.1</version>
+      <version>9.2</version>
     </dependency>
     <dependency>
       <groupId>org.tallison</groupId>
       <artifactId>isoparser</artifactId>
-      <version>1.9.41.6</version>
+      <version>1.9.41.7</version>
     </dependency>
     <!-- this is a fork of com.drewnoakes
       metadata extractor that shade/relocates com.adobe.internal
@@ -414,7 +414,7 @@
     <dependency>
       <groupId>org.xerial</groupId>
       <artifactId>sqlite-jdbc</artifactId>
-      <version>3.34.0</version>
+      <version>3.36.0</version>
       <scope>provided</scope>
     </dependency>
 
@@ -835,7 +835,7 @@
     <dependency>
       <groupId>org.apache.uima</groupId>
       <artifactId>uimafit-core</artifactId>
-      <version>3.1.0</version>
+      <version>3.2.0</version>
       <scope>provided</scope>
       <exclusions>
         <exclusion>
@@ -877,7 +877,7 @@
     <dependency>
       <groupId>org.apache.uima</groupId>
       <artifactId>uimaj-core</artifactId>
-      <version>3.1.1</version>
+      <version>3.2.0</version>
       <scope>provided</scope>
       <exclusions>
         <exclusion>
diff --git a/tika-server/pom.xml b/tika-server/pom.xml
index ad4bcb4..9f492d7 100644
--- a/tika-server/pom.xml
+++ b/tika-server/pom.xml
@@ -29,7 +29,7 @@
   <url>http://tika.apache.org/</url>
 
   <properties>
-    <cxf.micrometer.version>1.5.12</cxf.micrometer.version>
+    <cxf.micrometer.version>1.7.1</cxf.micrometer.version>
     <micrometer-extras.version>0.2.2</micrometer-extras.version>
   </properties>