You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tallison@apache.org on 2018/03/07 18:14:31 UTC

[tika] branch master updated: TIKA-2594 -- improve eml detection for those starting with Subject: and containing html

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 0903104  TIKA-2594 -- improve eml detection for those starting with Subject: and containing html
0903104 is described below

commit 09031046e5bece75ed22d9ee9b184ec49a14f99a
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Mar 7 13:14:20 2018 -0500

    TIKA-2594 -- improve eml detection for those starting with Subject: and
    containing html
---
 .../org/apache/tika/mime/tika-mimetypes.xml        |   1 +
 .../java/org/apache/tika/mime/TestMimeTypes.java   |   3 +
 .../testEML_embedded_xhtml_and_img.eml             | 141 +++++++++++++++++++++
 3 files changed, 145 insertions(+)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index e8da795..8855922 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5693,6 +5693,7 @@
       <match value="From:" type="stringignorecase" offset="0"/>
       <match value="Received:" type="stringignorecase" offset="0"/>
       <match value="Message-ID:" type="stringignorecase" offset="0"/>
+      <match value="\nMessage-ID:" type="stringignorecase" offset="0:8192"/>
       <match value="Date:" type="string" offset="0"/>
       <match value="User-Agent:" type="string" offset="0"/>
       <match value="MIME-Version:" type="stringignorecase" offset="0"/>
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 835a525..65acbc3 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -909,6 +909,9 @@ public class TestMimeTypes {
         //x- custom header
         assertTypeDetection("testRFC822_x-.eml", "message/rfc822");
 
+        //embedded xhtml and img
+        assertTypeDetection("testEML_embedded_xhtml_and_img.eml", "message/rfc822");
+
     }
     
     @Test
diff --git a/tika-parsers/src/test/resources/test-documents/testEML_embedded_xhtml_and_img.eml b/tika-parsers/src/test/resources/test-documents/testEML_embedded_xhtml_and_img.eml
new file mode 100644
index 0000000..822cb8b
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testEML_embedded_xhtml_and_img.eml
@@ -0,0 +1,141 @@
+Subject: This is the subject
+From: "Sender" <se...@test.com>
+To: <Re...@test.com>
+Date: Wed, 21 Mar 2018 14:14:32 -0800
+List-Unsubscribe: <ma...@test.com>
+Content-Type: multipart/related; boundary="f4ff6b19348ff123_b9347c3a6d123456a"
+MIME-Version: 1.0
+Message-ID: <01...@test.com>
+
+--f4ff6b19348ff123_b9347c3a6d123456a
+Content-Type: multipart/alternative; boundary="f4ff6b19348ff123_b9347c3a6d123456b"
+
+--f4ff6b19348ff123_b9347c3a6d123456b
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: 8-bit
+Content-Disposition: inline
+
+This is a test
+
+Using inline content
+
+Unsubscribe:
+http://test.com/receiver/asgdnjlsakgHjghskusegnkKLghslkagjasduiLKGlkjsgslkjgeseklgj
+
+--f4ff6b19348ff123_b9347c3a6d123456b
+Content-Type: text/html; charset="utf-8"
+Content-Transfer-Encoding: 8bit
+Content-Disposition: inline
+
+
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <title>XHTML test document</title>
+    <meta name="Author" content="Tika Developers"/>
+    <meta http-equiv="refresh" content="5"/>
+  </head>
+  <body>
+    <p>
+      This document tests the ability of Apache Tika to extract content
+      from an <a href="http://www.w3.org/TR/xhtml1/">XHTML document</a>.
+    </p>
+  </body>
+</html>
+
+--f4ff6b19348ff123_b9347c3a6d123456b--
+
+
+
+
+
+--f4ff6b19348ff123_b9347c3a6d123456a
+Content-Type: image/jpeg; name="testimage.jpeg"
+Content-Description:testimage.jpeg
+Content-Disposition: inline; filename="testimage.jpeg";
+Content-Transfer-Encoding: base64
+
+/9j/4AAQSkZJRgABAQEASABIAAD//gATQ3JlYXRlZCB3aXRoIEdJTVD/2wBDAAMCAgMCAgMDAwME
+AwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/2wBD
+AQMEBAUEBQkFBQkUDQsNFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQU
+FBQUFBQUFBT/wgARCACAAIADAREAAhEBAxEB/8QAHAABAAICAwEAAAAAAAAAAAAAAAcIBQYDBAkB
+/8QAFAEBAAAAAAAAAAAAAAAAAAAAAP/aAAwDAQACEAMQAAABtSAAAAAa+U1JFLSgAAAGKNKJKBVo
+qgcp6ngAAAHn2ReXPLCmnlOyaixQBjyFSYDNAFECGi8pN4KymKLVHKCoBWomovUAcRhDYAa0eaAP
+QQlAEAFLy1RaUAAAFOzElzztgAAAAAAAAAio103U6ZqZtx1TZDiMAZQ2kzIANaIfMUZM+GXNkIqO
+IyZ0yViSjiK3m9ktAAAA6pohIoBBRSA7Z6kgAAAAAGqlFSRS54ABEpohZAGENgAAAAAPLY6hd8gU
+hovKTeAAVaKoG9nocdspgR0XqKSkXlzywoABRUhU5T01M0ADFGlElAAAwpDRupJQAAAAP//EACIQ
+AAICAgIDAAMBAAAAAAAAAAQFAwYCBwAQASAwFBUXQP/aAAgBAQABBQL5vXoldXM9uPzCqJtUg0/4
+s2oiYUXZFbMn63lLn4i5FFnPL8djPiHVo5pt8QyUctVZHtagrU9kHnoOr/0s/Z546sOXc6KOVQ6C
+fB97OqpCR9zVVVIryjq1bi8AmId0y5mRS4TxdbtZz5NOajZzh2/uWLCeIWvKwZ+rNLnBW+tZS5zU
+fralKnfwSxZwS6qohQRnwtWoDYDEOoG5hggsQIv+S1CPRoQWbJQvXXgA2QK/CMOVe4zOKllbwEq/
+xsddkPNag40y64L2CyDY66aRpdw1xyhuK8A9LNFnPW3ddPLritd5IaVcQiCzoPzBqSOtJ/ORBSxX
+Qcd0DSQqwyOXO3hLmdiiMUPKEv8A16rksuEETPdog5VP2OBbJfgWNgaKtowS9j3uQqUeo8EKlBK+
+tnr0FoTM6E+VlUTWJpp/rcNjgVOVZu0QgqKXCeKWXCCIWwqzp/mWVKcVzTZUpFR2daiHb7mqrURY
+VHrtW9lBGcqtxPqhghURwvL3rE0I9ZQnzQqsV6CrptjISEto5ptCQtUeu3Fk4dv5FFnPKkCzWpfR
+mqEcii63rYc/s3ShPg/4qk5WteqKvP8AL//EABQRAQAAAAAAAAAAAAAAAAAAAID/2gAIAQMBAT8B
+AH//xAAUEQEAAAAAAAAAAAAAAAAAAACA/9oACAECAQE/AQB//8QAQRAAAgECAwQECwQIBwAAAAAA
+AQIDBBEFEiEAMUFREBMUcQYgIiMwMkJhgZGhFTPB8DRDYnKx0eHxFiVAUpKT0v/aAAgBAQAGPwL0
+clbWyZIl0AHrOf8AaPfsXppo6CH2Yo41f5lgdfl3bQ4djGWRpiEiqkSxzk6BgPlcf19EamtqI6aE
+e1Id/Gw5nTdssSYpGrNxlRo1+bADpwaMO3VsZWKX0JGSx+p+fQkcaNJI5yqii5J5eirEkZhBSSNB
+FFfRbGxPxIv/AG6KqhqGaTsRTq3Y+w17L8Mv193Q9FUM0eueORfYfgbcd+7ZkSkjqVH62KZcp/5E
+H6bLiOLdXLVrYwwLqsR5nm30Hv4dMtXVyrBTxDM7tw2dVhrpVBsHWNbN79W2FVQVC1EFyuYaWPIg
+7vEqK0IzUNbIZEl32c6sp5a3t7vj0T1FYjRVNYVbqj7CD1bjgdT9Ol6bBoYarqzZqqUko3MKB8Nb
+/wA9ljxelhSncgddTZh1fvIN78P67JJG6yRuMyupuCOfTQ4fe1MkPX2HFiSNe7L9T0Q0yHzNWjJI
+p9ylge/T6nxHjkRZI3GVkYXBHLZZqbDaOnmX1ZIoFVh8bdOKyRu0ciUkrK6mxByHXpwxpHZ2s63Y
+30DsAPl0xYjQp1lXTIVeIetIm/T3jXTjfuGzxyI0ciHKyMLEHlt9sYjC1OyBkp4JAVe+4sRw0uNe
+fd6F5cEVaikc6U7SWeP4nePjfX47L9pqtBSKRn84rOw/Ztf68+O0NNCuSGFBGi3vYDQf6XEcRo8f
+7LTQwtKtJ2NH9VN2Y87fXbDMYxfwhaqpqmNWFBHQpnkZkuFBGul/ptNFNFVYbNFEZjHXQ9WxQbyN
+vNYdieVkZ4nNL5M2VS1kPE6bVBxGLEhJHSTyy4hFCI1YAn7tt2YD+G2BrL2ycV1PmhdlzyNZAfL/
+AGjcbuOzutLXtURsRJRin89HbeWHAbU2JRR1FXDU2EUdNFnkY66W+B2q628lKlJ+kR1CZXj7xtRo
+aWvharmSGHroMufN7Q19X+e0lHHTVuIzxferQwdZ1ffslXSPnhbmLEHkfFxWONGkkeklVUUXJOQ6
+beC08KVYlw+BRNDSv1U4vGoOW/EW3bR1P2f4S1zQRTeTjEiiPWMjLrvzaDam+zcPxXDMKCP2qGvP
+m728nJc89sR8HpsKro6uCjqbSmHzUpJawU8T5W3gCWpZctNSyCYmM+aPUKBm5a7eFE8kDpDN2bq5
+GSyv5vWx47eD0CQ4lBCHm7YlElqkDOclgdvC6l6quV6lKZoGxBs0klrtbNuJ3D3btvBmOTCKnDur
+xKDMalcvlck5j3922JySf4gamqp2njfBJdNeDLzHPaoHZ66nEtS0v+YuGla4Xyjbd0PJI6xxoMzO
+xsAOexSiw6SshH62STqr9wsdPzbbs2RqKutm6l2uH/dPHT86ehmp5L9XKhRrGxsdo656mtr54vuu
+2z9YI+7xERGyrPUpHILbxZm/io6IamFsk0LiRGtexGo9NNQTHJm1SXLcxsNx/PAnYwthlRPykpkM
+iMOdx+Ou0Nbi1O1LRREOIZl8qYg+qVO4aa3/ABv43ZsjVtdbN1KNYJ+8eGn512CVuHSUcJ/Wxydb
+bvFhp+bbJJG6yRuMyupuCOezySOscaDMzsbADnssNNiVHUTN6scU6sx+F/STVMzZ5pnMjta1ydT0
+OjtmWCpeOMW3Cyt/FjtUUQdloaKQxpFuu40Zjz1vb3fHonp6x2lqaMqvWn20Pq3PE6H6eN9j4dM1
+OyBXqJ4yVe+8KDw0sdOff0JJTyNJTX85SM3kPz7jpv2hqYWzwzIJEa1rg6jomrcJp2qqKUlzDCvl
+Qkn1Qo3jXS34X2EK4ZUQc5KlDGijnc/hrtDQQnPl1eXLYyMd5/PADaseRWMFXI08UttGubkfAm39
++iqrqhWj7aU6tGHsLezfHN9Pf401S48zVorxsPcoUjv0+o6EjjRpJHOVUUXJPLagpJCpkgp44mK7
+rhQPFNNW08dTCfZkG7hccjrv2WVMLjZl4Su0i/JiR45pa+nWoguGynSx5gjdt+lYh/2J/wCNu0U0
+ck1VrlnqGzMoPAbh+Op9H//EACUQAQEAAgICAwACAgMAAAAAAAERITEAQRBRIDBhgZFAcaGxwf/a
+AAgBAQABPyH6/ZwpdEe1P+1gLzJeJhSqKhhBSMacydjaegSNABIW1H00tvo1GDcFBVmDlJ5TAhcl
+Ndvk8gtKQEO0JP69+DzFVLMAG1evqT0H7OWwZsd6Kg8BoTXUDo6SlXAMA8NkA74hPQRCul0xIT3g
+eG40GsjXJlpIxQ30DglhRUfg6/ISH/qsAMqgccKMPIdIMf0H84zRSKg2AK0xNI6T4VyNX8Gysg77
+KRx8Dx8hdDKulcdGnkuqKJs0BS+kxgk4jTCaHNSneiIXGBw8xRSxQJsTvz2Hjnf4sYwxT9/GQPrM
+xMWQzW493wfMUUsRQ7E65SaeRkjBRhT+fL/iVDIBNI9+R8IyEZIdAAOgPOcNjmuJsqvC0zRHzFVD
+MUOkeuQ+SGV0SMAGacRP0EUSkX2lQGEegRigZEkZOoWBwFiZSJzITSkaFcuDv/F1RjbZHayTZj/T
+hhRatqPKDYX0vF8y0B0VaH98ctNu7QmQcGXgqmCUsOVCYMv1zCZVU1YNZ2DV1vi0MbJAasymV45x
+FBAWHSZr64sXaUuFmXfUt1vmJOsm0GYQypnDHA4ELGXJdAxzOvpoLCDpPi/4lSwADavXIhcypghh
+pwuU45hBTBogtNCbFwclFMueppf1dcjgugwB3PMAzHmjy2EUxziZ7ODhAuQI60Ms4eTZ55dgGV/1
+OaxZPnRJgIHYWuKfHEWHIbjVw17cHE+/e7HTRlknINEjJahMkcNaLc+HzFFDFUugO+UvGbrUtaiI
+sc5E5QbMGMLddAFEHLKJ+msK70CMTWHfBtajSkmMP+b8MQdQM4PzNx68YCaUrUjhyd/doH5hmQ3+
+mRQUvI3mttCAFllkQlC8Bg1D+YE6ORJ0fGAyaMZSb6UoAuCwDyF4zdaFrQVUrjCvDzFFLFAmxO+P
+mKKGKpdAd8pNPAyVgpwL/H2YCaUrUhgy9eMQdQF4P3Nz75HI1f0bawLroLXHwPDyF2MKawx2a/KH
+yQyuiRkKzBiB4zAtZmwU9IhLg2UchNKRqRyYe/AMGofzAnRwDeyt5jbQChLLZUDBnNg/cMyWf0Wo
+Cs4noPyctlzY70wB4DQumqHZ0lKGA5B8sAfWZmZkhuFx7vB5iqlmADavXG3OKrUJQxT40tvs1Gjc
+FBEuHlJ5TIhMtN9nzZorFQaQFbKOlNL4xQgUABAQANOZoWM+v//aAAwDAQACAAMAAAAQkkkkkkkk
+kkkgEkkkkkkgkkAkkAkkgkgEgkkgkgkkgkkkkkEkkkkkkkkkkkgAEEgEkgEkkAgkEEkkkgEkkEkk
+kkkkAEkkEAkkkkkkEkgkkkgkAAkkkkkEkgEkkgAkkkkk/8QAFBEBAAAAAAAAAAAAAAAAAAAAgP/a
+AAgBAwEBPxAAf//EABQRAQAAAAAAAAAAAAAAAAAAAID/2gAIAQIBAT8QAH//xAAgEAEBAAMAAgMB
+AQEAAAAAAAABEQAhMRAwIEFhcUBR/9oACAEBAAE/EPXo/oUTFbTeQoAIEcRyUu8UYBlEWydAkygs
+a3Xwj1TGSCSf72aQ1KOayxJlFzGpCsCqD4AqplWAwJQoCSrgP1ZrhZUAAKqB6jLvZkRAUURAvA4Y
+VhZbE2q1oKAeKJWPU80FiBYnJdi5J0KQ6VdClIv2cv7AgEyFC4fCugMi7AAAKgCQAqCn0WQgBHAo
+TEoaCQjKY+AIwShVD5Y42CbkAqGBiTwsSpw+i0qpmCKGXgAhiABbpaPuFGdSRSAKKsoKGK4g/Vmu
+VkQECiInnQzEX+TpJ/8AQJJli9OPOQVBCCgNnlH6s1wsiCKERRzv1AiEqohHYjj5ZetNdWEAQIiC
+eQMqXD8pTi4AQA8zfqkmh3BAISFiEfqzXCwgIoERHIqgAjXyqHx1I9KrHUFgZqcCdRjTqVHLHUMQ
+ogKs8VUiHpWAFSs2r/l/RaDpbXtRcAQZa1o/kqQIBoYjEQIf0RyQVB0FBBjhUnGoAZ12ELUymlK1
+0GFDIeuyz6sc09xooV1GBsEbTCyjVdqdEFLdL2ohANAFlsqaAb6YJVHCppomCh6yE2vqKE6XdBpt
+IyhcOoKn3HWIvbvHfXnER/Eoi/Al6011ZUAAKqBgHkIPnBSJUNZsoMW5TF0NIGkpklAcSoFKJoT+
+C2j1LGRsdkSGFkwABbxyEw7glUDZhDIBSVQ4KzGDvD59ncQg6hgU+JUvZNCxrkSEaGzhX12jRXnE
+gEJ9GXzE09KDtAANVEwBqsn9iPoIIYj9Wa5WABVIAK5omA2PeSJ2OEbTpaqgQIiDsICfoA8inT7o
+VSBp0zc5NSoWgzRw6bBPPEMkDW0XpIebFHOKqRDwrAGBGbE93cl5UQT/AEkAdk0TiLjvAEESImXs
+qF1ARd5CmFF8m9LVUCFEwdhAz0TAbXrJM5HCNAfqzXKyICBRETEfqzXKwAKpABXO/UCIWqCENCeH
+s6qpEPClEYALoDx1DJA1sB6SnmwAY42CbgAoGEqTwsSpw+y1qpmgau3xiqACMPCofHcjwoYGNQSN
+jeUEnNcVUiHhSgMAl2D4nZULqIg7ylMIDHZOIuOcAUTIqZ3JeVEE/wAJIHZTLvZsRAUQUQPQ4YVh
+bbF2ixoKofKxenHvIIhpDVGTAfqzXCyoAAVUDHDqWHtQpUUGShz41kgkv+tmFNSDm8sSZRcxqUjE
+iCfIIRlMfAEYJUqBzon5GnABbCdXrkf/2Q==
\ No newline at end of file

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.