You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/03/07 20:19:41 UTC
[tika] branch branch_1x updated (8163b59 -> a9b4b36)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.
from 8163b59 TIKA-2598 -- unbreak the build (sorry, again!), fix missing javacpp dependency.
new d9f63a0 turn off debug in powerpointparsertest
new 32c19de TIKA-2600 -- remove md5 checksum, and switch sha-1 to sha-512 for release artifacts
new b9e9e5b TIKA-2594 -- improve eml detection for those starting with Subject: and containing html
new 164c928 TIKA-2592 -- ignore charsets not supported by IANA in html meta-headers via Andreas Meier.
new b4047eb TIKA-2591 -- Add workaround to identify TIFFs that might confuse commons-compress's tar detection via Daniel Schmidt
new a9b4b36 TIKA-2590 -- revert listenForAllRecords = false thanks to Grigoriy Alekseev
The 6 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
CHANGES.txt | 8 ++
pom.xml | 15 +--
.../org/apache/tika/mime/tika-mimetypes.xml | 1 +
.../tika/parser/html/HtmlEncodingDetector.java | 41 ++++++
.../tika/parser/microsoft/ExcelExtractor.java | 1 -
.../tika/parser/pkg/ZipContainerDetector.java | 42 +++++-
.../html/StandardCharsets_unsupported_by_IANA.txt | 125 ++++++++++++++++++
.../java/org/apache/tika/mime/TestMimeTypes.java | 3 +
.../apache/tika/parser/html/HtmlParserTest.java | 10 ++
.../parser/microsoft/PowerPointParserTest.java | 1 -
.../tika/parser/pkg/ZipContainerDetectorTest.java | 55 ++++++++
.../testEML_embedded_xhtml_and_img.eml | 141 +++++++++++++++++++++
.../test-documents/testHTML_charset_utf16le.html | Bin 0 -> 380 bytes
.../test-documents/testHTML_charset_utf8.html | 8 ++
14 files changed, 438 insertions(+), 13 deletions(-)
create mode 100644 tika-parsers/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt
create mode 100644 tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
create mode 100644 tika-parsers/src/test/resources/test-documents/testEML_embedded_xhtml_and_img.eml
create mode 100644 tika-parsers/src/test/resources/test-documents/testHTML_charset_utf16le.html
create mode 100644 tika-parsers/src/test/resources/test-documents/testHTML_charset_utf8.html
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.
[tika] 02/06: TIKA-2600 -- remove md5 checksum,
and switch sha-1 to sha-512 for release artifacts
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 32c19dee5bd4952f9f041f5fba218130fa02bdb5
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Mar 7 09:43:00 2018 -0500
TIKA-2600 -- remove md5 checksum, and switch sha-1 to sha-512 for release artifacts
---
pom.xml | 15 ++++++---------
1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/pom.xml b/pom.xml
index 11f3490..58b3813 100644
--- a/pom.xml
+++ b/pom.xml
@@ -104,19 +104,16 @@
<include name="tika-eval/target/tika-eval-${project.version}.jar*" />
</fileset>
</copy>
- <checksum algorithm="MD5" fileext=".md5">
+ <checksum algorithm="SHA-512" fileext=".sha512">
<fileset dir="${basedir}/target/${project.version}">
<include name="*.zip" />
<include name="*.?ar" />
</fileset>
</checksum>
- <checksum algorithm="SHA1" fileext=".sha">
- <fileset dir="${basedir}/target/${project.version}">
- <include name="*.zip" />
- <include name="*.?ar" />
- </fileset>
- </checksum>
- <checksum file="${basedir}/target/${project.version}/tika-${project.version}-src.zip" algorithm="SHA1" property="checksum" />
+ <checksum
+ file="${basedir}/target/${project.version}/tika-${project.version}-src.zip"
+ algorithm="SHA-512"
+ property="checksum" />
<echo file="${basedir}/target/vote.txt">
From: ${username}@apache.org
To: dev@tika.apache.org
@@ -129,7 +126,7 @@ A candidate for the Tika ${project.version} release is available at:
The release candidate is a zip archive of the sources in:
https://github.com/apache/tika/tree/{project.version}-rcN/
-The SHA1 checksum of the archive is
+The SHA-512 checksum of the archive is
${checksum}.
In addition, a staged maven repository is available here:
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.
[tika] 03/06: TIKA-2594 -- improve eml detection for those starting
with Subject: and containing html
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit b9e9e5b150aca851465e99017da6328c202ba127
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Mar 7 13:14:20 2018 -0500
TIKA-2594 -- improve eml detection for those starting with Subject: and
containing html
---
.../org/apache/tika/mime/tika-mimetypes.xml | 1 +
.../java/org/apache/tika/mime/TestMimeTypes.java | 3 +
.../testEML_embedded_xhtml_and_img.eml | 141 +++++++++++++++++++++
3 files changed, 145 insertions(+)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index f6a8844..56dfd53 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5629,6 +5629,7 @@
<match value="From:" type="stringignorecase" offset="0"/>
<match value="Received:" type="stringignorecase" offset="0"/>
<match value="Message-ID:" type="stringignorecase" offset="0"/>
+ <match value="\nMessage-ID:" type="stringignorecase" offset="0:8192"/>
<match value="Date:" type="string" offset="0"/>
<match value="User-Agent:" type="string" offset="0"/>
<match value="MIME-Version:" type="stringignorecase" offset="0"/>
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index bbb25e5..0e43c25 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -908,6 +908,9 @@ public class TestMimeTypes {
//x- custom header
assertTypeDetection("testRFC822_x-.eml", "message/rfc822");
+ //embedded xhtml and img
+ assertTypeDetection("testEML_embedded_xhtml_and_img.eml", "message/rfc822");
+
}
@Test
diff --git a/tika-parsers/src/test/resources/test-documents/testEML_embedded_xhtml_and_img.eml b/tika-parsers/src/test/resources/test-documents/testEML_embedded_xhtml_and_img.eml
new file mode 100644
index 0000000..822cb8b
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testEML_embedded_xhtml_and_img.eml
@@ -0,0 +1,141 @@
+Subject: This is the subject
+From: "Sender" <se...@test.com>
+To: <Re...@test.com>
+Date: Wed, 21 Mar 2018 14:14:32 -0800
+List-Unsubscribe: <ma...@test.com>
+Content-Type: multipart/related; boundary="f4ff6b19348ff123_b9347c3a6d123456a"
+MIME-Version: 1.0
+Message-ID: <01...@test.com>
+
+--f4ff6b19348ff123_b9347c3a6d123456a
+Content-Type: multipart/alternative; boundary="f4ff6b19348ff123_b9347c3a6d123456b"
+
+--f4ff6b19348ff123_b9347c3a6d123456b
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: 8-bit
+Content-Disposition: inline
+
+This is a test
+
+Using inline content
+
+Unsubscribe:
+http://test.com/receiver/asgdnjlsakgHjghskusegnkKLghslkagjasduiLKGlkjsgslkjgeseklgj
+
+--f4ff6b19348ff123_b9347c3a6d123456b
+Content-Type: text/html; charset="utf-8"
+Content-Transfer-Encoding: 8bit
+Content-Disposition: inline
+
+
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <head>
+ <title>XHTML test document</title>
+ <meta name="Author" content="Tika Developers"/>
+ <meta http-equiv="refresh" content="5"/>
+ </head>
+ <body>
+ <p>
+ This document tests the ability of Apache Tika to extract content
+ from an <a href="http://www.w3.org/TR/xhtml1/">XHTML document</a>.
+ </p>
+ </body>
+</html>
+
+--f4ff6b19348ff123_b9347c3a6d123456b--
+
+
+
+
+
+--f4ff6b19348ff123_b9347c3a6d123456a
+Content-Type: image/jpeg; name="testimage.jpeg"
+Content-Description:testimage.jpeg
+Content-Disposition: inline; filename="testimage.jpeg";
+Content-Transfer-Encoding: base64
+
+/9j/4AAQSkZJRgABAQEASABIAAD//gATQ3JlYXRlZCB3aXRoIEdJTVD/2wBDAAMCAgMCAgMDAwME
+AwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/2wBD
+AQMEBAUEBQkFBQkUDQsNFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQU
+FBQUFBQUFBT/wgARCACAAIADAREAAhEBAxEB/8QAHAABAAICAwEAAAAAAAAAAAAAAAcIBQYDBAkB
+/8QAFAEBAAAAAAAAAAAAAAAAAAAAAP/aAAwDAQACEAMQAAABtSAAAAAa+U1JFLSgAAAGKNKJKBVo
+qgcp6ngAAAHn2ReXPLCmnlOyaixQBjyFSYDNAFECGi8pN4KymKLVHKCoBWomovUAcRhDYAa0eaAP
+QQlAEAFLy1RaUAAAFOzElzztgAAAAAAAAAio103U6ZqZtx1TZDiMAZQ2kzIANaIfMUZM+GXNkIqO
+IyZ0yViSjiK3m9ktAAAA6pohIoBBRSA7Z6kgAAAAAGqlFSRS54ABEpohZAGENgAAAAAPLY6hd8gU
+hovKTeAAVaKoG9nocdspgR0XqKSkXlzywoABRUhU5T01M0ADFGlElAAAwpDRupJQAAAAP//EACIQ
+AAICAgIDAAMBAAAAAAAAAAQFAwYCBwAQASAwFBUXQP/aAAgBAQABBQL5vXoldXM9uPzCqJtUg0/4
+s2oiYUXZFbMn63lLn4i5FFnPL8djPiHVo5pt8QyUctVZHtagrU9kHnoOr/0s/Z546sOXc6KOVQ6C
+fB97OqpCR9zVVVIryjq1bi8AmId0y5mRS4TxdbtZz5NOajZzh2/uWLCeIWvKwZ+rNLnBW+tZS5zU
+fralKnfwSxZwS6qohQRnwtWoDYDEOoG5hggsQIv+S1CPRoQWbJQvXXgA2QK/CMOVe4zOKllbwEq/
+xsddkPNag40y64L2CyDY66aRpdw1xyhuK8A9LNFnPW3ddPLritd5IaVcQiCzoPzBqSOtJ/ORBSxX
+Qcd0DSQqwyOXO3hLmdiiMUPKEv8A16rksuEETPdog5VP2OBbJfgWNgaKtowS9j3uQqUeo8EKlBK+
+tnr0FoTM6E+VlUTWJpp/rcNjgVOVZu0QgqKXCeKWXCCIWwqzp/mWVKcVzTZUpFR2daiHb7mqrURY
+VHrtW9lBGcqtxPqhghURwvL3rE0I9ZQnzQqsV6CrptjISEto5ptCQtUeu3Fk4dv5FFnPKkCzWpfR
+mqEcii63rYc/s3ShPg/4qk5WteqKvP8AL//EABQRAQAAAAAAAAAAAAAAAAAAAID/2gAIAQMBAT8B
+AH//xAAUEQEAAAAAAAAAAAAAAAAAAACA/9oACAECAQE/AQB//8QAQRAAAgECAwQECwQIBwAAAAAA
+AQIDBBEFEiEAMUFREBMUcQYgIiMwMkJhgZGhFTPB8DRDYnKx0eHxFiVAUpKT0v/aAAgBAQAGPwL0
+clbWyZIl0AHrOf8AaPfsXppo6CH2Yo41f5lgdfl3bQ4djGWRpiEiqkSxzk6BgPlcf19EamtqI6aE
+e1Id/Gw5nTdssSYpGrNxlRo1+bADpwaMO3VsZWKX0JGSx+p+fQkcaNJI5yqii5J5eirEkZhBSSNB
+FFfRbGxPxIv/AG6KqhqGaTsRTq3Y+w17L8Mv193Q9FUM0eueORfYfgbcd+7ZkSkjqVH62KZcp/5E
+H6bLiOLdXLVrYwwLqsR5nm30Hv4dMtXVyrBTxDM7tw2dVhrpVBsHWNbN79W2FVQVC1EFyuYaWPIg
+7vEqK0IzUNbIZEl32c6sp5a3t7vj0T1FYjRVNYVbqj7CD1bjgdT9Ol6bBoYarqzZqqUko3MKB8Nb
+/wA9ljxelhSncgddTZh1fvIN78P67JJG6yRuMyupuCOfTQ4fe1MkPX2HFiSNe7L9T0Q0yHzNWjJI
+p9ylge/T6nxHjkRZI3GVkYXBHLZZqbDaOnmX1ZIoFVh8bdOKyRu0ciUkrK6mxByHXpwxpHZ2s63Y
+30DsAPl0xYjQp1lXTIVeIetIm/T3jXTjfuGzxyI0ciHKyMLEHlt9sYjC1OyBkp4JAVe+4sRw0uNe
+fd6F5cEVaikc6U7SWeP4nePjfX47L9pqtBSKRn84rOw/Ztf68+O0NNCuSGFBGi3vYDQf6XEcRo8f
+7LTQwtKtJ2NH9VN2Y87fXbDMYxfwhaqpqmNWFBHQpnkZkuFBGul/ptNFNFVYbNFEZjHXQ9WxQbyN
+vNYdieVkZ4nNL5M2VS1kPE6bVBxGLEhJHSTyy4hFCI1YAn7tt2YD+G2BrL2ycV1PmhdlzyNZAfL/
+AGjcbuOzutLXtURsRJRin89HbeWHAbU2JRR1FXDU2EUdNFnkY66W+B2q628lKlJ+kR1CZXj7xtRo
+aWvharmSGHroMufN7Q19X+e0lHHTVuIzxferQwdZ1ffslXSPnhbmLEHkfFxWONGkkeklVUUXJOQ6
+beC08KVYlw+BRNDSv1U4vGoOW/EW3bR1P2f4S1zQRTeTjEiiPWMjLrvzaDam+zcPxXDMKCP2qGvP
+m728nJc89sR8HpsKro6uCjqbSmHzUpJawU8T5W3gCWpZctNSyCYmM+aPUKBm5a7eFE8kDpDN2bq5
+GSyv5vWx47eD0CQ4lBCHm7YlElqkDOclgdvC6l6quV6lKZoGxBs0klrtbNuJ3D3btvBmOTCKnDur
+xKDMalcvlck5j3922JySf4gamqp2njfBJdNeDLzHPaoHZ66nEtS0v+YuGla4Xyjbd0PJI6xxoMzO
+xsAOexSiw6SshH62STqr9wsdPzbbs2RqKutm6l2uH/dPHT86ehmp5L9XKhRrGxsdo656mtr54vuu
+2z9YI+7xERGyrPUpHILbxZm/io6IamFsk0LiRGtexGo9NNQTHJm1SXLcxsNx/PAnYwthlRPykpkM
+iMOdx+Ou0Nbi1O1LRREOIZl8qYg+qVO4aa3/ABv43ZsjVtdbN1KNYJ+8eGn512CVuHSUcJ/Wxydb
+bvFhp+bbJJG6yRuMyupuCOezySOscaDMzsbADnssNNiVHUTN6scU6sx+F/STVMzZ5pnMjta1ydT0
+OjtmWCpeOMW3Cyt/FjtUUQdloaKQxpFuu40Zjz1vb3fHonp6x2lqaMqvWn20Pq3PE6H6eN9j4dM1
+OyBXqJ4yVe+8KDw0sdOff0JJTyNJTX85SM3kPz7jpv2hqYWzwzIJEa1rg6jomrcJp2qqKUlzDCvl
+Qkn1Qo3jXS34X2EK4ZUQc5KlDGijnc/hrtDQQnPl1eXLYyMd5/PADaseRWMFXI08UttGubkfAm39
++iqrqhWj7aU6tGHsLezfHN9Pf401S48zVorxsPcoUjv0+o6EjjRpJHOVUUXJPLagpJCpkgp44mK7
+rhQPFNNW08dTCfZkG7hccjrv2WVMLjZl4Su0i/JiR45pa+nWoguGynSx5gjdt+lYh/2J/wCNu0U0
+ck1VrlnqGzMoPAbh+Op9H//EACUQAQEAAgICAwACAgMAAAAAAAERITEAQRBRIDBhgZFAcaGxwf/a
+AAgBAQABPyH6/ZwpdEe1P+1gLzJeJhSqKhhBSMacydjaegSNABIW1H00tvo1GDcFBVmDlJ5TAhcl
+Ndvk8gtKQEO0JP69+DzFVLMAG1evqT0H7OWwZsd6Kg8BoTXUDo6SlXAMA8NkA74hPQRCul0xIT3g
+eG40GsjXJlpIxQ30DglhRUfg6/ISH/qsAMqgccKMPIdIMf0H84zRSKg2AK0xNI6T4VyNX8Gysg77
+KRx8Dx8hdDKulcdGnkuqKJs0BS+kxgk4jTCaHNSneiIXGBw8xRSxQJsTvz2Hjnf4sYwxT9/GQPrM
+xMWQzW493wfMUUsRQ7E65SaeRkjBRhT+fL/iVDIBNI9+R8IyEZIdAAOgPOcNjmuJsqvC0zRHzFVD
+MUOkeuQ+SGV0SMAGacRP0EUSkX2lQGEegRigZEkZOoWBwFiZSJzITSkaFcuDv/F1RjbZHayTZj/T
+hhRatqPKDYX0vF8y0B0VaH98ctNu7QmQcGXgqmCUsOVCYMv1zCZVU1YNZ2DV1vi0MbJAasymV45x
+FBAWHSZr64sXaUuFmXfUt1vmJOsm0GYQypnDHA4ELGXJdAxzOvpoLCDpPi/4lSwADavXIhcypghh
+pwuU45hBTBogtNCbFwclFMueppf1dcjgugwB3PMAzHmjy2EUxziZ7ODhAuQI60Ms4eTZ55dgGV/1
+OaxZPnRJgIHYWuKfHEWHIbjVw17cHE+/e7HTRlknINEjJahMkcNaLc+HzFFDFUugO+UvGbrUtaiI
+sc5E5QbMGMLddAFEHLKJ+msK70CMTWHfBtajSkmMP+b8MQdQM4PzNx68YCaUrUjhyd/doH5hmQ3+
+mRQUvI3mttCAFllkQlC8Bg1D+YE6ORJ0fGAyaMZSb6UoAuCwDyF4zdaFrQVUrjCvDzFFLFAmxO+P
+mKKGKpdAd8pNPAyVgpwL/H2YCaUrUhgy9eMQdQF4P3Nz75HI1f0bawLroLXHwPDyF2MKawx2a/KH
+yQyuiRkKzBiB4zAtZmwU9IhLg2UchNKRqRyYe/AMGofzAnRwDeyt5jbQChLLZUDBnNg/cMyWf0Wo
+Cs4noPyctlzY70wB4DQumqHZ0lKGA5B8sAfWZmZkhuFx7vB5iqlmADavXG3OKrUJQxT40tvs1Gjc
+FBEuHlJ5TIhMtN9nzZorFQaQFbKOlNL4xQgUABAQANOZoWM+v//aAAwDAQACAAMAAAAQkkkkkkkk
+kkkgEkkkkkkgkkAkkAkkgkgEgkkgkgkkgkkkkkEkkkkkkkkkkkgAEEgEkgEkkAgkEEkkkgEkkEkk
+kkkkAEkkEAkkkkkkEkgkkkgkAAkkkkkEkgEkkgAkkkkk/8QAFBEBAAAAAAAAAAAAAAAAAAAAgP/a
+AAgBAwEBPxAAf//EABQRAQAAAAAAAAAAAAAAAAAAAID/2gAIAQIBAT8QAH//xAAgEAEBAAMAAgMB
+AQEAAAAAAAABEQAhMRAwIEFhcUBR/9oACAEBAAE/EPXo/oUTFbTeQoAIEcRyUu8UYBlEWydAkygs
+a3Xwj1TGSCSf72aQ1KOayxJlFzGpCsCqD4AqplWAwJQoCSrgP1ZrhZUAAKqB6jLvZkRAUURAvA4Y
+VhZbE2q1oKAeKJWPU80FiBYnJdi5J0KQ6VdClIv2cv7AgEyFC4fCugMi7AAAKgCQAqCn0WQgBHAo
+TEoaCQjKY+AIwShVD5Y42CbkAqGBiTwsSpw+i0qpmCKGXgAhiABbpaPuFGdSRSAKKsoKGK4g/Vmu
+VkQECiInnQzEX+TpJ/8AQJJli9OPOQVBCCgNnlH6s1wsiCKERRzv1AiEqohHYjj5ZetNdWEAQIiC
+eQMqXD8pTi4AQA8zfqkmh3BAISFiEfqzXCwgIoERHIqgAjXyqHx1I9KrHUFgZqcCdRjTqVHLHUMQ
+ogKs8VUiHpWAFSs2r/l/RaDpbXtRcAQZa1o/kqQIBoYjEQIf0RyQVB0FBBjhUnGoAZ12ELUymlK1
+0GFDIeuyz6sc09xooV1GBsEbTCyjVdqdEFLdL2ohANAFlsqaAb6YJVHCppomCh6yE2vqKE6XdBpt
+IyhcOoKn3HWIvbvHfXnER/Eoi/Al6011ZUAAKqBgHkIPnBSJUNZsoMW5TF0NIGkpklAcSoFKJoT+
+C2j1LGRsdkSGFkwABbxyEw7glUDZhDIBSVQ4KzGDvD59ncQg6hgU+JUvZNCxrkSEaGzhX12jRXnE
+gEJ9GXzE09KDtAANVEwBqsn9iPoIIYj9Wa5WABVIAK5omA2PeSJ2OEbTpaqgQIiDsICfoA8inT7o
+VSBp0zc5NSoWgzRw6bBPPEMkDW0XpIebFHOKqRDwrAGBGbE93cl5UQT/AEkAdk0TiLjvAEESImXs
+qF1ARd5CmFF8m9LVUCFEwdhAz0TAbXrJM5HCNAfqzXKyICBRETEfqzXKwAKpABXO/UCIWqCENCeH
+s6qpEPClEYALoDx1DJA1sB6SnmwAY42CbgAoGEqTwsSpw+y1qpmgau3xiqACMPCofHcjwoYGNQSN
+jeUEnNcVUiHhSgMAl2D4nZULqIg7ylMIDHZOIuOcAUTIqZ3JeVEE/wAJIHZTLvZsRAUQUQPQ4YVh
+bbF2ixoKofKxenHvIIhpDVGTAfqzXCyoAAVUDHDqWHtQpUUGShz41kgkv+tmFNSDm8sSZRcxqUjE
+iCfIIRlMfAEYJUqBzon5GnABbCdXrkf/2Q==
\ No newline at end of file
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.
[tika] 06/06: TIKA-2590 -- revert listenForAllRecords = false
thanks to Grigoriy Alekseev
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit a9b4b3676f9476ae78246aa2f962006502243a24
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Mar 7 15:19:31 2018 -0500
TIKA-2590 -- revert listenForAllRecords = false thanks to Grigoriy Alekseev
---
.../src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java | 1 -
1 file changed, 1 deletion(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 9146b8c..4ea8068 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -284,7 +284,6 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
// Set up listener and register the records we want to process
HSSFRequest hssfRequest = new HSSFRequest();
- listenForAllRecords = true;
if (listenForAllRecords) {
hssfRequest.addListenerForAllRecords(formatListener);
} else {
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.
[tika] 04/06: TIKA-2592 -- ignore charsets not supported by IANA in
html meta-headers via Andreas Meier.
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 164c9286fc0933051e86ce0a209250aa51bee3bf
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Mar 7 13:47:38 2018 -0500
TIKA-2592 -- ignore charsets not supported by IANA in html meta-headers
via Andreas Meier.
---
CHANGES.txt | 4 +
.../tika/parser/html/HtmlEncodingDetector.java | 41 +++++++
.../html/StandardCharsets_unsupported_by_IANA.txt | 125 +++++++++++++++++++++
.../apache/tika/parser/html/HtmlParserTest.java | 10 ++
.../test-documents/testHTML_charset_utf16le.html | Bin 0 -> 380 bytes
.../test-documents/testHTML_charset_utf8.html | 8 ++
6 files changed, 188 insertions(+)
diff --git a/CHANGES.txt b/CHANGES.txt
index d553961..73d3d68 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,9 @@
Release 1.18 - ???
+ * Ignore non-IANA supported charsets in HTML meta-headers
+ during charset detection in HTMLEncodingDetector
+ via Andreas Meier (TIKA-2592)
+
* Add detection and parsing of zstd (if user provides
com.github.luben:zstd-jni) via Andreas Meier (TIKA-2576)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
index 559ec4d..e383f80 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
@@ -16,10 +16,17 @@
*/
package org.apache.tika.parser.html;
+import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
+import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -39,6 +46,37 @@ import org.apache.tika.utils.CharsetUtils;
*/
public class HtmlEncodingDetector implements EncodingDetector {
+    /**
+     * Lower-cased names of charsets that Java recognizes but that are not
+     * supported by IANA, e.g. "unicode". Trusting them in html meta-headers
+     * can lead to incorrect detection/mojibake, so they are ignored.
+     * See: TIKA-2592
+     */
+    private static Set<String> CHARSETS_UNSUPPORTED_BY_IANA;
+    static {
+        String resource = "StandardCharsets_unsupported_by_IANA.txt";
+        Set<String> unsupported = new HashSet<>();
+        InputStream stream = HtmlEncodingDetector.class.getResourceAsStream(resource);
+        if (stream == null) {
+            throw new IllegalArgumentException("couldn't find " + resource + " on the class path");
+        }
+        try (BufferedReader reader =
+                     new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))) {
+            // Advance in the loop header: a skipped '#' line must not be re-tested forever.
+            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
+                if (line.startsWith("#")) {
+                    continue;
+                }
+                line = line.trim();
+                if (line.length() > 0) {
+                    unsupported.add(line.toLowerCase(Locale.US));
+                }
+            }
+        } catch (IOException e) {
+            throw new IllegalArgumentException("couldn't read " + resource + " from the class path", e);
+        }
+        CHARSETS_UNSUPPORTED_BY_IANA = Collections.unmodifiableSet(unsupported);
+    }
// TIKA-357 - use bigger buffer for meta tag sniffing (was 4K)
private static final int DEFAULT_MARK_LIMIT = 8192;
@@ -112,6 +150,9 @@ public class HtmlEncodingDetector implements EncodingDetector {
//that is valid
while (charsetMatcher.find()) {
String candCharset = charsetMatcher.group(1);
+ if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) {
+ continue;
+ }
if (CharsetUtils.isSupported(candCharset)) {
try {
return CharsetUtils.forName(candCharset);
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt b/tika-parsers/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt
new file mode 100644
index 0000000..05f76ce
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt
@@ -0,0 +1,125 @@
+646
+737
+775
+813
+819
+858
+874
+8859_1
+8859_13
+8859_15
+8859_2
+8859_4
+8859_5
+8859_7
+8859_9
+912
+914
+915
+920
+923
+ansi-1251
+ascii
+ascii7
+cesu8
+cp1250
+cp1251
+cp1252
+cp1253
+cp1254
+cp1257
+cp5346
+cp5347
+cp5348
+cp5349
+cp5350
+cp5353
+cp737
+cp813
+cp858
+cp874
+cp912
+cp914
+cp915
+cp920
+cp923
+csibm862
+csisolatin0
+csisolatin9
+cspcp855
+default
+ibm-437
+ibm-737
+ibm-775
+ibm-813
+ibm-819
+ibm-850
+ibm-852
+ibm-855
+ibm-857
+ibm-862
+ibm-866
+ibm-874
+ibm-912
+ibm-914
+ibm-915
+ibm-920
+ibm-923
+ibm737
+ibm813
+ibm874
+ibm912
+ibm914
+ibm915
+ibm920
+ibm923
+iso8859-1
+iso8859-13
+iso8859-15
+iso8859-2
+iso8859-4
+iso8859-5
+iso8859-7
+iso8859-9
+iso8859_1
+iso8859_13
+iso8859_15
+iso8859_15_fdis
+iso8859_2
+iso8859_4
+iso8859_5
+iso8859_7
+iso8859_9
+iso_8859-13
+iso_8859_1
+koi8
+koi8_r
+koi8_u
+l9
+latin0
+latin9
+sun_eu_greek
+unicode
+unicode-1-1-utf-8
+unicodebig
+unicodebigunmarked
+unicodelittle
+unicodelittleunmarked
+utf-32be-bom
+utf-32le-bom
+utf16
+utf32
+utf8
+utf_16
+utf_16be
+utf_16le
+utf_32
+utf_32be
+utf_32be_bom
+utf_32le
+utf_32le_bom
+windows-437
+x-utf-16be
+x-utf-16le
+x-utf-32be
+x-utf-32le
\ No newline at end of file
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 6f2eb1f..ab8e314 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -1385,4 +1385,14 @@ public class HtmlParserTest extends TikaTest {
}
}
}
+
+    @Test
+    public void testCharsetsNotSupportedByIANA() throws Exception {
+        //TIKA-2592 -- the body text must be extracted from both documents
+        String expected = "This is a sample text";
+        String[] docs = {"testHTML_charset_utf8.html", "testHTML_charset_utf16le.html"};
+        for (String doc : docs) {
+            assertContains(expected, getXML(doc).xml);
+        }
+    }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf16le.html b/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf16le.html
new file mode 100644
index 0000000..26cb535
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf16le.html differ
diff --git a/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf8.html b/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf8.html
new file mode 100644
index 0000000..1f61f02
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf8.html
@@ -0,0 +1,8 @@
+<html>
+ <head>
+ <title>Title</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=unicode">
+ <style></style>
+ </head>
+ <body>This is a sample text</body>
+</html>
\ No newline at end of file
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.
[tika] 05/06: TIKA-2591 -- Add workaround to identify TIFFs that
might confuse commons-compress's tar detection via Daniel Schmidt
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit b4047eb2d92ee4ae8d8e02d12079232419775a73
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Mar 7 14:55:06 2018 -0500
TIKA-2591 -- Add workaround to identify TIFFs that might confuse commons-compress's tar detection via Daniel Schmidt
---
CHANGES.txt | 4 ++
.../tika/parser/pkg/ZipContainerDetector.java | 42 ++++++++++++++++-
.../tika/parser/pkg/ZipContainerDetectorTest.java | 55 ++++++++++++++++++++++
3 files changed, 99 insertions(+), 2 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 73d3d68..9b05d80 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,9 @@
Release 1.18 - ???
+ * Add workaround to identify TIFFs that might confuse
+ commons-compress's tar detection via Daniel Schmidt
+ (TIKA-2591)
+
* Ignore non-IANA supported charsets in HTML meta-headers
during charset detection in HTMLEncodingDetector
via Andreas Meier (TIKA-2592)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 65e2e1d..c453617 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -21,6 +21,7 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Iterator;
@@ -56,6 +57,19 @@ import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
* formats to figure out exactly what the file is.
*/
public class ZipContainerDetector implements Detector {
+
+    //Regrettably, commons-compress's tar detection can incorrectly
+    //identify some tiff files as tar files, so TIFF has to be ruled
+    //out by matching the leading bytes below. If commons-compress
+    //ever chooses to take over TIFF detection, all of this can be
+    //removed. See TIKA-2591.
+    private final static MediaType TIFF = MediaType.image("tiff");
+    private final static byte[][] TIFF_SIGNATURES = {
+        {'M', 'M', 0x00, 0x2a},
+        {'I', 'I', 0x2a, 0x00},
+        {'M', 'M', 0x00, 0x2b}
+    };
+
private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
// TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
@@ -86,8 +100,11 @@ public class ZipContainerDetector implements Detector {
int length = tis.peek(prefix);
MediaType type = detectArchiveFormat(prefix, length);
- if (PackageParser.isZipArchive(type)
- && TikaInputStream.isTikaInputStream(input)) {
+
+ if (type == TIFF) {
+ return TIFF;
+ } else if (PackageParser.isZipArchive(type)
+ && TikaInputStream.isTikaInputStream(input)) {
return detectZipFormat(tis);
} else if (!type.equals(MediaType.OCTET_STREAM)) {
return type;
@@ -112,7 +129,28 @@ public class ZipContainerDetector implements Detector {
}
}
+    // Reports whether prefix begins with any of the TIFF_SIGNATURES entries.
+    private static boolean isTiff(byte[] prefix) {
+        boolean matched = false;
+        for (int i = 0; !matched && i < TIFF_SIGNATURES.length; i++) {
+            matched = arrayStartWith(TIFF_SIGNATURES[i], prefix);
+        }
+        return matched;
+    }
+
+    // True iff haystack is at least as long as needle and begins with its bytes.
+    private static boolean arrayStartWith(byte[] needle, byte[] haystack) {
+        boolean matches = haystack.length >= needle.length;
+        for (int i = 0; matches && i < needle.length; i++) {
+            matches = haystack[i] == needle[i];
+        }
+        return matches;
+    }
+
private static MediaType detectArchiveFormat(byte[] prefix, int length) {
+ if (isTiff(prefix)) {
+ return TIFF;
+ }
try {
String name = ArchiveStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length));
return PackageParser.getMediaType(name);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
new file mode 100644
index 0000000..2865442
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pkg;
+
+
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+public class ZipContainerDetectorTest extends TikaTest {
+
+    /**
+     * TIKA-2591: both TIFF test documents must be reported as image/tiff
+     * by the ZipContainerDetector.
+     */
+    @Test
+    public void testTiffWorkaround() throws Exception {
+        ZipContainerDetector zipContainerDetector = new ZipContainerDetector();
+        String[] tiffs = {"/test-documents/testTIFF.tif", "/test-documents/testTIFF_multipage.tif"};
+        for (String tiff : tiffs) {
+            Metadata metadata = new Metadata();
+            try (InputStream is = TikaInputStream.get(getResourceAsStream(tiff))) {
+                MediaType mt = zipContainerDetector.detect(is, metadata);
+                assertEquals(MediaType.image("tiff"), mt);
+            }
+        }
+    }
+}
\ No newline at end of file
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.
[tika] 01/06: turn off debug in powerpointparsertest
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit d9f63a0e9aab26b0fdb76428ee82aea01a922c14
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Mar 7 08:45:48 2018 -0500
turn off debug in powerpointparsertest
---
.../test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java | 1 -
1 file changed, 1 deletion(-)
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index f217ef0..8388c1f 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -346,7 +346,6 @@ public class PowerPointParserTest extends TikaTest {
@Test
public void testEmbeddedXLSInOLEObject() throws Exception {
List<Metadata> metadataList = getRecursiveMetadata("testPPT_oleWorkbook.ppt");
- debug(metadataList);
assertEquals(3, metadataList.size());
Metadata xlsx = metadataList.get(1);
assertContains("<h1>Sheet1</h1>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.