You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/05/21 23:49:12 UTC

svn commit: r1680957 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/test/java/org/apache/tika/mime/ tika-parsers/src/test/java/org/apache/tika/parser/pdf/ tika-parsers/src/test/resources/test-documents/

Author: nick
Date: Thu May 21 21:49:11 2015
New Revision: 1680957

URL: http://svn.apache.org/r1680957
Log:
TIKA-1085 Treat a PDF with a leading Byte Order Mark the same as a plain PDF for detection, and add a low-priority match for the PDF magic occurring within bytes 1-512 of the start of the file (may give false positives if the priority is set too high), plus tests

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf   (with props)
Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1680957&r1=1680956&r2=1680957&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Thu May 21 21:49:11 2015
@@ -481,7 +481,15 @@
     <tika:link>http://www.adobe.com/devnet/pdf/pdf_reference_archive.html</tika:link>
     <tika:uti>com.adobe.pdf</tika:uti>
     <magic priority="50">
+      <!-- Normally just %PDF- -->
       <match value="%PDF-" type="string" offset="0"/>
+      <!-- Sometimes has a UTF-8 Byte Order Mark first -->
+      <match value="\xef\xbb\xbf%PDF-" type="string" offset="0"/>
+    </magic>
+    <magic priority="20">
+      <!-- Low priority match for %PDF near the start of the file -->
+      <!-- Can trigger false positives, so set the priority rather low here -->
+      <match value="%PDF-" type="string" offset="1:512"/>
     </magic>
     <glob pattern="*.pdf"/>
   </mime-type>

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1680957&r1=1680956&r2=1680957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Thu May 21 21:49:11 2015
@@ -501,10 +501,17 @@ public class TestMimeTypes {
 
     @Test
     public void testPdfDetection() throws Exception {
-        assertType("application/pdf", "testPDF.pdf");
-        assertTypeByData("application/pdf", "testPDF.pdf");
+        // PDF extension by name is enough
         assertTypeByName("application/pdf", "x.pdf");
         assertTypeByName("application/pdf", "x.PDF");
+
+        // For normal PDFs, can get by name or data or both
+        assertType("application/pdf", "testPDF.pdf");
+        assertTypeByData("application/pdf", "testPDF.pdf");
+
+        // PDF with a BoM works both ways too
+        assertType("application/pdf", "testPDF_bom.pdf");
+        assertTypeByData("application/pdf", "testPDF_bom.pdf");
     }
 
     @Test

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1680957&r1=1680956&r2=1680957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Thu May 21 21:49:11 2015
@@ -652,6 +652,8 @@ public class PDFParserTest extends TikaT
         knownMetadataDiffs.add("testAnnotations.pdf");
         // Added for TIKA-93.
         knownMetadataDiffs.add("testOCR.pdf");
+        // Added for TIKA-1085
+        knownMetadataDiffs.add("testPDF_bom.pdf");
 
         //empty for now
         Set<String> knownContentDiffs = new HashSet<String>();

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf?rev=1680957&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream