You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/11 15:47:27 UTC

[tika] branch master updated (f588e2a -> 7b83f75)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from f588e2a  TIKA-3056 general dependency upgrades for 1.24
     new f82ee05  update changes for 1.24 release
     new 350df7e  Add licenses found by rat check.
     new ebda7cc  TIKA-3068 -- fix release configuration for tika-server's new bin/service mode
     new 7b83f75  TIKA-2714 -- add detection for rar4 and rar5 files; throw an UnsupportedFormatException for rar5 files

The 4 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                          |  2 +-
 pom.xml                                              |  2 ++
 .../org/apache/tika/mime/tika-mimetypes.xml          | 15 ++++++++++++++-
 tika-parent/pom.xml                                  | 20 ++++++++++++--------
 .../microsoft/onenote/OneNoteLegacyDumpStrings.java  | 16 ++++++++++++++++
 .../java/org/apache/tika/parser/pkg/RarParser.java   |  6 ++++++
 .../org/apache/tika/parser/pkg/RarParserTest.java    |  2 +-
 tika-server/assembly.xml                             | 16 ++++++++++++++++
 8 files changed, 68 insertions(+), 11 deletions(-)


[tika] 01/04: update changes for 1.24 release

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit f82ee05a914499c17f256472b7bf79f290ed4d2d
Author: tallison <ta...@apache.org>
AuthorDate: Wed Mar 11 09:08:26 2020 -0400

    update changes for 1.24 release
---
 CHANGES.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 522a29c..a56b42f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -5,7 +5,7 @@ Release 2.0.0 - ???
 
    Other changes
 
-Release 1.24 - ???
+Release 1.24 - 3/11/2020
 
    * Upgrade Drew Noakes' metadata-extractor (TIKA-2952).
 


[tika] 02/04: Add licenses found by rat check.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 350df7eacc266c0497a8f5aae068d90508abf3ff
Author: tallison <ta...@apache.org>
AuthorDate: Wed Mar 11 09:12:56 2020 -0400

    Add licenses found by rat check.
---
 .../microsoft/onenote/OneNoteLegacyDumpStrings.java      | 16 ++++++++++++++++
 tika-server/assembly.xml                                 | 16 ++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java
index 3f63576..27b011e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.tika.parser.microsoft.onenote;
 
 import org.apache.tika.exception.TikaException;
diff --git a/tika-server/assembly.xml b/tika-server/assembly.xml
index 412a9a2..216ee71 100644
--- a/tika-server/assembly.xml
+++ b/tika-server/assembly.xml
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
 <assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
   xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">


[tika] 04/04: TIKA-2714 -- add detection for rar4 and rar5 files; throw an UnsupportedFormatException for rar5 files

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 7b83f7538c788af4cd1c9e997e64dc550e733dee
Author: tallison <ta...@apache.org>
AuthorDate: Wed Mar 11 11:38:46 2020 -0400

    TIKA-2714 -- add detection for rar4 and rar5 files; throw an UnsupportedFormatException for rar5 files
---
 .../resources/org/apache/tika/mime/tika-mimetypes.xml     | 15 ++++++++++++++-
 .../main/java/org/apache/tika/parser/pkg/RarParser.java   |  6 ++++++
 .../java/org/apache/tika/parser/pkg/RarParserTest.java    |  2 +-
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index ea1f97b..551e55e 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -4177,7 +4177,20 @@
     </magic>
     <glob pattern="*.rar"/>
   </mime-type>
-
+  <mime-type type="application/x-rar-compressed;version=4">
+    <_comment>RAR archive</_comment>
+    <magic priority="60">
+      <match value="\x52\x61\x72\x21\x1a\x07\x00" type="string" offset="0"/>
+    </magic>
+    <sub-class-of type="application/x-rar-compressed"/>
+  </mime-type>
+  <mime-type type="application/x-rar-compressed;version=5">
+    <_comment>RAR archive</_comment>
+    <magic priority="60">
+      <match value="\x52\x61\x72\x21\x1a\x07\x01\x00" type="string" offset="0"/>
+    </magic>
+    <sub-class-of type="application/x-rar-compressed"/>
+  </mime-type>
   <mime-type type="application/x-roxio-toast">
     <glob pattern="*.toast"/>
     <sub-class-of type="application/x-iso9660-image"/>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
index 633b2cc..4cdcedd 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
@@ -27,11 +27,13 @@ import com.github.junrar.impl.FileVolumeManager;
 import com.github.junrar.rarfile.FileHeader;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.UnsupportedFormatException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
@@ -62,7 +64,11 @@ public class RarParser extends AbstractParser {
         xhtml.startDocument();
 
         EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        String mediaType = metadata.get(Metadata.CONTENT_TYPE);
 
+        if (mediaType != null && mediaType.contains("version=5")) {
+            throw new UnsupportedFormatException("Tika does not yet support rar version 5.");
+        }
         Archive rar = null;
         try (TemporaryResources tmp = new TemporaryResources()) {
             TikaInputStream tis = TikaInputStream.get(stream, tmp);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java
index 34dcaab..d6f5af1 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java
@@ -48,7 +48,7 @@ public class RarParserTest extends AbstractPkgTest {
             AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
         }
 
-        assertEquals("application/x-rar-compressed", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("application/x-rar-compressed; version=4", metadata.get(Metadata.CONTENT_TYPE));
         String content = handler.toString();
         assertContains("test-documents/testEXCEL.xls", content);
         assertContains("Sample Excel Worksheet", content);


[tika] 03/04: TIKA-3068 -- fix release configuration for tika-server's new bin/service mode

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ebda7cc8ba84da1a9fd4553c8db28301c10c2675
Author: tallison <ta...@apache.org>
AuthorDate: Wed Mar 11 11:19:42 2020 -0400

    TIKA-3068 -- fix release configuration for tika-server's new bin/service mode
---
 pom.xml             |  2 ++
 tika-parent/pom.xml | 20 ++++++++++++--------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/pom.xml b/pom.xml
index 92c7745..2e771f3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -102,6 +102,8 @@
                         <include name="target/*-src.zip*" />
                         <include name="tika-app/target/tika-app-${project.version}.jar*" />
                         <include name="tika-server/target/tika-server-${project.version}.jar*" />
+                        <include name="tika-server/target/tika-server-${project.version}.bin.tgz*" />
+                        <include name="tika-server/target/tika-server-${project.version}.bin.zip*" />
                         <include name="tika-eval/target/tika-eval-${project.version}.jar*" />
                       </fileset>
                     </copy>
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 71c549b..eb21f99 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -401,14 +401,7 @@
                     </excludeCoordinates>
                 </configuration>
       -->
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-assembly-plugin</artifactId>
-        <version>${maven.assembly.version}</version>
-        <configuration>
-          <tarLongFileMode>posix</tarLongFileMode>
-        </configuration>
-      </plugin>
+
       <plugin>
         <groupId>de.thetaphi</groupId>
         <artifactId>forbiddenapis</artifactId>
@@ -454,6 +447,17 @@
         <artifactId>maven-shade-plugin</artifactId>
         <version>${maven.shade.version}</version>
       </plugin>
+      <!-- assembly must happen after shade so that
+        tika-server's -bin.tgz/zip includes the uber jar
+        -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <version>${maven.assembly.version}</version>
+        <configuration>
+          <tarLongFileMode>posix</tarLongFileMode>
+        </configuration>
+      </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-release-plugin</artifactId>