You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/05/21 23:49:12 UTC
svn commit: r1680957 - in /tika/trunk:
tika-core/src/main/resources/org/apache/tika/mime/
tika-parsers/src/test/java/org/apache/tika/mime/
tika-parsers/src/test/java/org/apache/tika/parser/pdf/
tika-parsers/src/test/resources/test-documents/
Author: nick
Date: Thu May 21 21:49:11 2015
New Revision: 1680957
URL: http://svn.apache.org/r1680957
Log:
TIKA-1085 Treat a PDF with a leading Byte Order Mark the same for detection, and add low-priority matches for the PDF magic occurring within 1-1024 bytes of the start (may give false positives if too high), plus tests
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf (with props)
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1680957&r1=1680956&r2=1680957&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Thu May 21 21:49:11 2015
@@ -481,7 +481,15 @@
<tika:link>http://www.adobe.com/devnet/pdf/pdf_reference_archive.html</tika:link>
<tika:uti>com.adobe.pdf</tika:uti>
<magic priority="50">
+ <!-- Normally just %PDF- -->
<match value="%PDF-" type="string" offset="0"/>
+ <!-- Sometimes has a UTF-8 Byte Order Mark first -->
+ <match value="\xef\xbb\xbf%PDF-" type="string" offset="0"/>
+ </magic>
+ <magic priority="20">
+ <!-- Low priority match for %PDF near the start of the file -->
+ <!-- Can trigger false positives, so set the priority rather low here -->
+ <match value="%PDF-" type="string" offset="1:512"/>
</magic>
<glob pattern="*.pdf"/>
</mime-type>
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1680957&r1=1680956&r2=1680957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Thu May 21 21:49:11 2015
@@ -501,10 +501,17 @@ public class TestMimeTypes {
@Test
public void testPdfDetection() throws Exception {
- assertType("application/pdf", "testPDF.pdf");
- assertTypeByData("application/pdf", "testPDF.pdf");
+ // PDF extension by name is enough
assertTypeByName("application/pdf", "x.pdf");
assertTypeByName("application/pdf", "x.PDF");
+
+ // For normal PDFs, can get by name or data or both
+ assertType("application/pdf", "testPDF.pdf");
+ assertTypeByData("application/pdf", "testPDF.pdf");
+
+ // PDF with a BoM works both ways too
+ assertType("application/pdf", "testPDF_bom.pdf");
+ assertTypeByData("application/pdf", "testPDF_bom.pdf");
}
@Test
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1680957&r1=1680956&r2=1680957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Thu May 21 21:49:11 2015
@@ -652,6 +652,8 @@ public class PDFParserTest extends TikaT
knownMetadataDiffs.add("testAnnotations.pdf");
// Added for TIKA-93.
knownMetadataDiffs.add("testOCR.pdf");
+ // Added for TIKA-1085
+ knownMetadataDiffs.add("testPDF_bom.pdf");
//empty for now
Set<String> knownContentDiffs = new HashSet<String>();
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf?rev=1680957&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream