You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/15 17:33:11 UTC

[01/10] tika git commit: TIKA-2006 -- add mime definitions for iCal and vCalendar

Repository: tika
Updated Branches:
  refs/heads/TIKA-1508 03d38248f -> 01320372f


TIKA-2006 -- add mime definitions for iCal and vCalendar


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/4d308fd7
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/4d308fd7
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/4d308fd7

Branch: refs/heads/TIKA-1508
Commit: 4d308fd7015391c9cdfd13ba6990dcd6e8496138
Parents: 06633cc
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 15 07:13:15 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 15 07:13:15 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                          |  2 ++
 .../org/apache/tika/mime/tika-mimetypes.xml          | 11 +++++++++++
 .../java/org/apache/tika/mime/TestMimeTypes.java     |  6 ++++++
 .../test/resources/test-documents/testICalendar.ics  | 15 +++++++++++++++
 .../test/resources/test-documents/testVCalendar.vcs  | 10 ++++++++++
 5 files changed, 44 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/4d308fd7/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 6008b51..59d2451 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.14 - ???
 
+  * Add mime definitions of iCal and vCalendar (TIKA-2006).
+
   * Upgrade to PDFBox 2.0.2 (TIKA-1996).
 
   * Add configurable maximum threshold for number of events extracted

http://git-wip-us.apache.org/repos/asf/tika/blob/4d308fd7/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 8a79844..ca84d94 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5494,8 +5494,14 @@
   </mime-type>
 
   <mime-type type="text/calendar">
+    <magic priority="50">
+      <match value="BEGIN:VCALENDAR" type="string" offset="0">
+        <match value="VERSION:2.0" type="string" offset="15:30"/>
+      </match>
+    </magic>
     <glob pattern="*.ics"/>
     <glob pattern="*.ifb"/>
+    <sub-class-of type="text/plain"/>
   </mime-type>
 
   <mime-type type="text/css">
@@ -6250,6 +6256,11 @@
   </mime-type>
 
   <mime-type type="text/x-vcalendar">
+    <magic priority="50">
+      <match value="BEGIN:VCALENDAR" type="string" offset="0">
+        <match value="VERSION:1.0" type="string" offset="15:30"/>
+      </match>
+    </magic>
     <glob pattern="*.vcs"/>
     <sub-class-of type="text/plain"/>
   </mime-type>

http://git-wip-us.apache.org/repos/asf/tika/blob/4d308fd7/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 46d3f5d..3f22842 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -1031,6 +1031,12 @@ public class TestMimeTypes {
         assertTypeByData("application/pkcs7-signature", "testPKCS17Sig-v3.xml.p7m");
         assertTypeByData("application/pkcs7-signature", "testPKCS17Sig-v4.xml.p7m");
     }
+
+    @Test
+    public void testVandICalendars() throws Exception {
+        assertType("text/calendar", "testICalendar.ics");
+        assertType("text/x-vcalendar", "testVCalendar.vcs");
+    }
     
     private void assertText(byte[] prefix) throws IOException {
         assertMagic("text/plain", prefix);

http://git-wip-us.apache.org/repos/asf/tika/blob/4d308fd7/tika-parsers/src/test/resources/test-documents/testICalendar.ics
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testICalendar.ics b/tika-parsers/src/test/resources/test-documents/testICalendar.ics
new file mode 100644
index 0000000..cbd9bee
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testICalendar.ics
@@ -0,0 +1,15 @@
+BEGIN:VCALENDAR
+VERSION:2.0
+METHOD:PUBLISH
+BEGIN:VEVENT
+DTSTART:20010701T213000Z
+DTEND:20010701T223000Z
+LOCATION:Banque Nationale Stage ()
+UID:20141020T130403Z@montrealjazzfest.com
+DTSTAMP:20141020T130403Z
+SUMMARY:Susi Hyldgaard - Festival International de Jazz de Montréal
+DESCRIPTION:SUSI HYLDGAARD\n\nAs the new millennium opens, the prevailing style is the mélange... but you still have to know how to measure out the ingredients. Susi Hyldgaard\u2019s facility with jazz and world music is stunning the critics. Want proof? She\u2019s been compared to Björk, Neneh Cherry and Cassandra Wilson but the singer/pianist/accordionist sets herself apart by virtue of a thoroughly personal style, spontaneous and blazingly passionate. Without a doubt, one of the most striking talents to emerge from the Danish music scene in the last five years. \n\nhttp://www.montrealjazzfest.com/program/concert.aspx?id=3579\n
+PRIORITY:5
+CLASS:PUBLIC
+END:VEVENT
+END:VCALENDAR
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/4d308fd7/tika-parsers/src/test/resources/test-documents/testVCalendar.vcs
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testVCalendar.vcs b/tika-parsers/src/test/resources/test-documents/testVCalendar.vcs
new file mode 100644
index 0000000..b6ea937
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testVCalendar.vcs
@@ -0,0 +1,10 @@
+BEGIN:VCALENDAR
+VERSION:1.0
+BEGIN:VEVENT
+DTSTART:20121201T210000
+DTEND:20121201T220000
+LOCATION;ENCODING=QUOTED-PRINTABLE:Mississippi Studios (3939 N Mississippi Ave., Portland, OR)
+DESCRIPTION;ENCODING=QUOTED-PRINTABLE:
+SUMMARY;ENCODING=QUOTED-PRINTABLE:Battleme, My Goodness, the Ax
+PRIORITY:3END:VEVENT
+END:VCALENDAR
\ No newline at end of file


[03/10] tika git commit: Add mime detection and parser for Microsoft Owner File (TIKA-2008).

Posted by ta...@apache.org.
Add mime detection and parser for Microsoft Owner File (TIKA-2008).


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/01a9b6db
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/01a9b6db
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/01a9b6db

Branch: refs/heads/TIKA-1508
Commit: 01a9b6db5ac20a63f2b9de9c15de1b12ee2bde06
Parents: d405172
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 15 09:20:29 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 15 09:20:29 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   2 +
 .../org/apache/tika/mime/tika-mimetypes.xml     |   7 ++
 .../parser/microsoft/MSOwnerFileParser.java     |  81 +++++++++++++++++++
 .../services/org.apache.tika.parser.Parser      |   1 +
 .../org/apache/tika/mime/TestMimeTypes.java     |   5 ++
 .../parser/microsoft/MSOwnerFileParserTest.java |  31 +++++++
 .../resources/test-documents/testMSOwnerFile    | Bin 0 -> 162 bytes
 7 files changed, 127 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index d244bd4..3847d72 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.14 - ???
 
+  * Add mime definition and parser for MS Owner File (TIKA-2008).
+
   * Add mime definition for Windows Media Metafile (TIKA-2004).
 
   * Add mime definitions of iCal and vCalendar (TIKA-2006).

http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 210ce0c..82df034 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3468,6 +3468,13 @@
   <mime-type type="application/x-ms-application">
     <glob pattern="*.application"/>
   </mime-type>
+  <mime-type type="application/x-ms-owner">
+    <_comment>Temporary files created by MSOffice applications</_comment>
+    <_comment>PRONOM fmt-473</_comment>
+    <magic priority="80">
+      <match value="(?s)^([\\x05-\\x0F]).{53}\\1\x00" type="regex" offset="0"/>
+    </magic>
+  </mime-type>
   <mime-type type="application/x-ms-wmd">
     <glob pattern="*.wmd"/>
   </mime-type>

http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
new file mode 100644
index 0000000..02c07a6
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * Parser for temporary MSOffice files.
+ * This currently only extracts the owner's name.
+ */
+public class MSOwnerFileParser extends AbstractParser {
+
+    private static final int ASCII_CHUNK_LENGTH = 54;
+    private static final MediaType MEDIA_TYPE = MediaType.application("x-ms-owner");
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -752276948656079347L;
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MEDIA_TYPE);
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts owner from MS temp file
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        byte[] asciiNameBytes = new byte[ASCII_CHUNK_LENGTH];
+        IOUtils.readFully(stream, asciiNameBytes);
+        int asciiNameLength = (int)asciiNameBytes[0];//don't need to convert to unsigned int because it can't be that long
+        String asciiName = new String(asciiNameBytes, 1, asciiNameLength, StandardCharsets.US_ASCII);
+        metadata.set(TikaCoreProperties.CREATOR, asciiName);
+
+        int unicodeCharLength = stream.read();
+        if (unicodeCharLength > 0) {
+            stream.read();//zero after the char length
+            byte[] unicodeBytes = new byte[unicodeCharLength * 2];
+            IOUtils.readFully(stream, unicodeBytes);
+            String unicodeName = new String(unicodeBytes, StandardCharsets.UTF_16LE);
+            metadata.set(TikaCoreProperties.CREATOR, unicodeName);
+        }
+        xhtml.endDocument();
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index acb0224..10a5a7e 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -36,6 +36,7 @@ org.apache.tika.parser.mail.RFC822Parser
 org.apache.tika.parser.mbox.MboxParser
 org.apache.tika.parser.mbox.OutlookPSTParser
 org.apache.tika.parser.microsoft.JackcessParser
+org.apache.tika.parser.microsoft.MSOwnerFileParser
 org.apache.tika.parser.microsoft.OfficeParser
 org.apache.tika.parser.microsoft.OldExcelParser
 org.apache.tika.parser.microsoft.TNEFParser

http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 5dda858..bc83678 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -1043,6 +1043,11 @@ public class TestMimeTypes {
         assertType("application/x-ms-asx", "testWindowsMediaMeta.asx");
     }
 
+    @Test
+    public void testMSOwner() throws Exception {
+        assertType("application/x-ms-owner", "testMSOwnerFile");
+    }
+
     private void assertText(byte[] prefix) throws IOException {
         assertMagic("text/plain", prefix);
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
new file mode 100644
index 0000000..3cef3df
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class MSOwnerFileParserTest extends TikaTest {
+    @Test
+    public void testBasic() throws Exception {
+        XMLResult r = getXML("testMSOwnerFile");
+        assertEquals("heidi", r.metadata.get(TikaCoreProperties.CREATOR));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-parsers/src/test/resources/test-documents/testMSOwnerFile
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testMSOwnerFile b/tika-parsers/src/test/resources/test-documents/testMSOwnerFile
new file mode 100644
index 0000000..72a5f57
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testMSOwnerFile differ


[06/10] tika git commit: TIKA-2009 -- add mime magic for djvu

Posted by ta...@apache.org.
TIKA-2009 -- add mime magic for djvu


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/acf031a0
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/acf031a0
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/acf031a0

Branch: refs/heads/TIKA-1508
Commit: acf031a06faac4061eced224dfabacba6e6fed8f
Parents: 592ae6a
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 15 10:07:06 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 15 10:07:06 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                           |   2 ++
 .../resources/org/apache/tika/mime/tika-mimetypes.xml |   3 +++
 .../test/java/org/apache/tika/mime/TestMimeTypes.java |   6 ++++++
 .../src/test/resources/test-documents/testDJVU.djvu   | Bin 0 -> 89 bytes
 4 files changed, 11 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/acf031a0/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 3847d72..82400d5 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.14 - ???
 
+  * Add mime magic for DJVU files (TIKA-2009).
+
   * Add mime definition and parser for MS Owner File (TIKA-2008).
 
   * Add mime definition for Windows Media Metafile (TIKA-2004).

http://git-wip-us.apache.org/repos/asf/tika/blob/acf031a0/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 82df034..e8d2b6f 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -4896,6 +4896,9 @@
   <mime-type type="image/vnd.djvu">
     <glob pattern="*.djvu"/>
     <glob pattern="*.djv"/>
+    <magic priority="50">
+      <match value="AT&amp;TFORM" type="string" offset="0"/>
+    </magic>
   </mime-type>
 
   <mime-type type="image/vnd.dwg">

http://git-wip-us.apache.org/repos/asf/tika/blob/acf031a0/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index bc83678..62c6c4b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -1048,6 +1048,12 @@ public class TestMimeTypes {
         assertType("application/x-ms-owner", "testMSOwnerFile");
     }
 
+    @Test
+    public void testDJVU() throws Exception {
+        assertType("image/vnd.djvu", "testDJVU.djvu");
+        assertTypeByData("image/vnd.djvu", "testDJVU.djvu");
+    }
+
     private void assertText(byte[] prefix) throws IOException {
         assertMagic("text/plain", prefix);
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/acf031a0/tika-parsers/src/test/resources/test-documents/testDJVU.djvu
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testDJVU.djvu b/tika-parsers/src/test/resources/test-documents/testDJVU.djvu
new file mode 100644
index 0000000..1361d18
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testDJVU.djvu differ


[09/10] tika git commit: Merge remote-tracking branch 'origin/master' into TIKA-1508

Posted by ta...@apache.org.
Merge remote-tracking branch 'origin/master' into TIKA-1508


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d9dcd59a
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d9dcd59a
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d9dcd59a

Branch: refs/heads/TIKA-1508
Commit: d9dcd59ae01c6ae2c8bded94bc9b04d795face9e
Parents: 03d3824 ade60ed
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 15 12:42:28 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 15 12:42:28 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   7 ++
 .../org/apache/tika/mime/tika-mimetypes.xml     |  41 +++++++++-
 .../java/org/apache/tika/TikaDetectionTest.java |   2 +-
 .../parser/microsoft/MSOwnerFileParser.java     |  81 +++++++++++++++++++
 .../services/org.apache.tika.parser.Parser      |   1 +
 .../org/apache/tika/mime/TestMimeTypes.java     |  33 +++++++-
 .../parser/microsoft/MSOwnerFileParserTest.java |  31 +++++++
 .../test/resources/test-documents/testDJVU.djvu | Bin 0 -> 89 bytes
 .../test-documents/testEndNoteImportFile.enw    |  10 +++
 .../resources/test-documents/testICalendar.ics  |  15 ++++
 .../resources/test-documents/testMSOwnerFile    | Bin 0 -> 162 bytes
 .../resources/test-documents/testVCalendar.vcs  |  10 +++
 .../test-documents/testWindowsMediaMeta.asx     |   6 ++
 13 files changed, 234 insertions(+), 3 deletions(-)
----------------------------------------------------------------------



[08/10] tika git commit: TIKA-2011 -- add mime detection for Endnote Import file

Posted by ta...@apache.org.
TIKA-2011 -- add mime detection for Endnote Import file


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ade60ed6
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ade60ed6
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ade60ed6

Branch: refs/heads/TIKA-1508
Commit: ade60ed6277f6b489995c70f521dd9c17f6f608b
Parents: 6291648
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 15 10:40:58 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 15 10:40:58 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                            | 13 ++++++-------
 .../resources/org/apache/tika/mime/tika-mimetypes.xml  | 12 ++++++++++++
 .../test/java/org/apache/tika/mime/TestMimeTypes.java  |  6 ++++++
 .../resources/test-documents/testEndNoteImportFile.enw | 10 ++++++++++
 4 files changed, 34 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/ade60ed6/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 82400d5..2da92b8 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,12 +1,11 @@
 Release 1.14 - ???
 
-  * Add mime magic for DJVU files (TIKA-2009).
-
-  * Add mime definition and parser for MS Owner File (TIKA-2008).
-
-  * Add mime definition for Windows Media Metafile (TIKA-2004).
-
-  * Add mime definitions of iCal and vCalendar (TIKA-2006).
+  * Add mime types, mime magic and/or globs for:
+     * Endnote Import File (TIKA-2011)
+     * DJVU files (TIKA-2009)
+     * MS Owner File (TIKA-2008)
+     * Windows Media Metafile (TIKA-2004)
+     * iCal and vCalendar (TIKA-2006)
 
   * Upgrade to PDFBox 2.0.2 (TIKA-1996).
 

http://git-wip-us.apache.org/repos/asf/tika/blob/ade60ed6/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index e8d2b6f..a94f188 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3113,6 +3113,18 @@
     <sub-class-of type="text/x-tika-text-based-message"/>
   </mime-type>
 
+  <mime-type type="application/x-endnote-refer">
+    <magic priority="80">
+      <match value="%A " type="string" offset="0:50">
+        <match value="\n%D " type="string" offset="0:1000">
+          <match value="\n%T " type="string" offset="0:1000"/>
+        </match>
+      </match>
+    </magic>
+    <glob pattern="*.enw"/>
+    <glob pattern="*.enr"/>
+  </mime-type>
+
   <mime-type type="application/x-killustrator">
     <_comment>KIllustrator File</_comment>
     <glob pattern="*.kil"/>

http://git-wip-us.apache.org/repos/asf/tika/blob/ade60ed6/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index c0a6cea..102b005 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -1057,6 +1057,12 @@ public class TestMimeTypes {
         assertTypeByData("image/vnd.djvu", "testDJVU.djvu");
     }
 
+    @Test
+    public void testEndNoteImport() throws Exception {
+        assertType("application/x-endnote-refer", "testEndNoteImportFile.enw");
+        assertTypeByData("application/x-endnote-refer", "testEndNoteImportFile.enw");
+    }
+
     private void assertText(byte[] prefix) throws IOException {
         assertMagic("text/plain", prefix);
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/ade60ed6/tika-parsers/src/test/resources/test-documents/testEndNoteImportFile.enw
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testEndNoteImportFile.enw b/tika-parsers/src/test/resources/test-documents/testEndNoteImportFile.enw
new file mode 100644
index 0000000..7a68fcf
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testEndNoteImportFile.enw
@@ -0,0 +1,10 @@
+%A Fasouliotis, S J
+%A Schenker, J G
+%D 1997
+%J Eur J Obstet Gynecol Reprod Biol
+%N 2
+%P 183-90
+%T Multifetal pregnancy reduction: a review of the world results for the period 1993-1996.
+%U 
+%V 75
+%@ 0301-2115
\ No newline at end of file


[10/10] tika git commit: TIKA-1986 -- add Initializable, strip out handling of params passed via ParseContext in PDFParser

Posted by ta...@apache.org.
TIKA-1986 -- add Initializable, strip out handling of params passed via ParseContext in PDFParser


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/01320372
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/01320372
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/01320372

Branch: refs/heads/TIKA-1508
Commit: 01320372fdbfc5e4ff0cfe0fe85fab91b5b369e7
Parents: d9dcd59
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 15 13:32:58 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 15 13:32:58 2016 -0400

----------------------------------------------------------------------
 .../org/apache/tika/config/Initializable.java   | 31 ++++++++++
 .../java/org/apache/tika/config/TikaConfig.java |  3 +
 .../tika/parser/DummyInitializableParser.java   | 64 ++++++++++++++++++++
 .../tika/parser/DummyParameterizedParser.java   |  1 +
 .../tika/parser/InitializableParserTest.java    | 45 ++++++++++++++
 .../tika/parser/ParameterizedParserTest.java    |  2 +-
 .../tika/config/TIKA-1986-initializable.xml     | 28 +++++++++
 .../tika/config/TIKA-1986-some-parameters.xml   |  2 +-
 .../org/apache/tika/parser/pdf/PDFParser.java   | 10 +--
 .../apache/tika/parser/pdf/PDFParserTest.java   |  2 +
 10 files changed, 178 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/main/java/org/apache/tika/config/Initializable.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/config/Initializable.java b/tika-core/src/main/java/org/apache/tika/config/Initializable.java
new file mode 100644
index 0000000..bc7769c
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/config/Initializable.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config;
+
+import org.apache.tika.exception.TikaConfigException;
+
+/**
+ * Components that must do special processing across multiple fields
+ * at initialization time should implement this interface.
+ * <p>
+ * TikaConfig will call initialize on Initializable classes after
+ * setting the parameters.
+ */
+public interface Initializable {
+
+    void initialize() throws TikaConfigException;
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 49c5e26..fbafe7e 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -565,6 +565,9 @@ public class TikaConfig {
                 Map<String, Param<?>> params = getParams(element);
                 //Assigning the params to bean fields/setters
                 AnnotationUtils.assignFieldParams(loaded, params);
+                if (loaded instanceof Initializable) {
+                    ((Initializable) loaded).initialize();
+                }
 
                 // Have any decoration performed, eg explicit mimetypes
                 loaded = decorate(loaded, element);

http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java
new file mode 100644
index 0000000..4bb8668
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Test parser used to verify that initialize() runs after TikaConfig has assigned
+ * the configured parameters: parse() reports the sum that initialize() computed.
+ */
+public class DummyInitializableParser extends AbstractParser implements Initializable {
+    /** Metadata key under which parse() records the sum computed by initialize(). */
+    public static final String SUM_FIELD = "SUM";
+    private static final Set<MediaType> MIMES = new HashSet<>();
+    static {
+        MIMES.add(MediaType.TEXT_PLAIN);
+    }
+    // Negative defaults: a reported sum of -5 means initialize() saw no configured values.
+    @Field private short shortA = -2;
+    @Field private short shortB = -3;
+    private int sum = 0; // stays 0 if initialize() is never called
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return MIMES;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+        metadata.set(SUM_FIELD, Integer.toString(sum));
+    }
+
+    @Override
+    public void initialize() throws TikaConfigException {
+        sum = shortA+shortB;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
index 801d65e..435dc52 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
@@ -63,6 +63,7 @@ public class DummyParameterizedParser extends AbstractParser {
 
     @Field private String missing = "default";
 
+
     private String inner = "inner";
     private File xfile;
 

http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java
new file mode 100644
index 0000000..b9d378d
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.Assert.assertEquals;
+
+/** Checks that TikaConfig calls initialize() on an Initializable parser after setting its params. */
+public class InitializableParserTest {
+    public static final String TIKA_CFG_FILE = "org/apache/tika/config/TIKA-1986-initializable.xml";
+
+    @Test
+    public void testInitializableParser() throws Exception {
+        URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE);
+        Assert.assertNotNull("missing " + TIKA_CFG_FILE, configFileUrl); // unlike `assert`, runs without -ea
+        TikaConfig config = new TikaConfig(configFileUrl);
+        Tika tika = new Tika(config);
+        Metadata md = new Metadata();
+        tika.parse(TikaInputStream.get("someString".getBytes(StandardCharsets.ISO_8859_1)), md);
+        assertEquals("5", md.get(DummyInitializableParser.SUM_FIELD)); // shortA=2 + shortB=3 in the config file
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
index 31c59c0..1471504 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
@@ -82,7 +82,7 @@ public class ParameterizedParserTest {
         Metadata md = getMetadata("TIKA-1986-some-parameters.xml");
         assertEquals("-6.0", md.get("xdouble"));
         assertEquals("testparamval", md.get("testparam"));
-        assertEquals("true", md.get("xbool"));
+        assertEquals("false", md.get("xbool"));
     }
 
     @Test

http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-initializable.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-initializable.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-initializable.xml
new file mode 100644
index 0000000..0b11bb4
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-initializable.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DummyInitializableParser">
+            <params>
+                <param name="shortA" type="short">2</param>
+                <param name="shortB" type="short">3</param>
+            </params>
+        </parser>
+
+    </parsers>
+</properties>

http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-some-parameters.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-some-parameters.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-some-parameters.xml
index 250d439..dea8269 100644
--- a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-some-parameters.xml
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-some-parameters.xml
@@ -20,7 +20,7 @@
         <parser class="org.apache.tika.parser.DummyParameterizedParser">
             <params>
                 <param name="testparam" type="string">testparamval</param>
-                <param name="testbool" type="bool">false</param>
+                <param name="xbool" type="bool">false</param>
             </params>
         </parser>
 

http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index a5673ee..7b12d58 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -109,14 +109,8 @@ public class PDFParser extends AbstractParser {
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
 
-        //step 1, check to see if there are params for the PDFParser class
-        Map<String, Param<?>> params = context.getParams(PDFParser.class);
-        PDFParserConfig localConfig = new PDFParserConfig();
-        if (params != null) {
-            AnnotationUtils.assignFieldParams(localConfig, params);
-        } else if (context.get(PDFParserConfig.class) != null) {
-            localConfig = context.get(PDFParserConfig.class, defaultConfig);
-        }
+        PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
+
         PDDocument pdfDocument = null;
         TemporaryResources tmp = new TemporaryResources();
 

http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 2ef29f3..e9f55fe 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -61,6 +61,7 @@ import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerDecorator;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
 
@@ -1231,6 +1232,7 @@ public class PDFParserTest extends TikaTest {
     }
 
     @Test
+    @Ignore("We've turned this off for now")
     public void testParameterizationViaContext() throws Exception {
         ParseContext context = new ParseContext();
 


[07/10] tika git commit: make sure to test magic for vcs/ics/asx

Posted by ta...@apache.org.
make sure to test magic for vcs/ics/asx


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/6291648d
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/6291648d
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/6291648d

Branch: refs/heads/TIKA-1508
Commit: 6291648dc5ddece929be0e4e7103019d615006b3
Parents: acf031a
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 15 10:15:56 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 15 10:15:56 2016 -0400

----------------------------------------------------------------------
 .../src/test/java/org/apache/tika/mime/TestMimeTypes.java         | 3 +++
 1 file changed, 3 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/6291648d/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 62c6c4b..c0a6cea 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -1036,11 +1036,14 @@ public class TestMimeTypes {
     public void testVandICalendars() throws Exception {
         assertType("text/calendar", "testICalendar.ics");
         assertType("text/x-vcalendar", "testVCalendar.vcs");
+        assertTypeByData("text/calendar", "testICalendar.ics");
+        assertTypeByData("text/x-vcalendar", "testVCalendar.vcs");
     }
 
     @Test
     public void testASX() throws Exception {
         assertType("application/x-ms-asx", "testWindowsMediaMeta.asx");
+        assertTypeByData("application/x-ms-asx", "testWindowsMediaMeta.asx");
     }
 
     @Test


[02/10] tika git commit: Add mime definition for Windows Media Metafile (TIKA-2004).

Posted by ta...@apache.org.
Add mime definition for Windows Media Metafile (TIKA-2004).


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d405172c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d405172c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d405172c

Branch: refs/heads/TIKA-1508
Commit: d405172c89f0cc94135d09b30c3c6ea135d6a5b2
Parents: 4d308fd
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 15 08:29:02 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 15 08:29:02 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                                  | 2 ++
 .../main/resources/org/apache/tika/mime/tika-mimetypes.xml   | 8 +++++++-
 .../src/test/java/org/apache/tika/TikaDetectionTest.java     | 2 +-
 .../src/test/java/org/apache/tika/mime/TestMimeTypes.java    | 7 ++++++-
 .../test/resources/test-documents/testWindowsMediaMeta.asx   | 6 ++++++
 5 files changed, 22 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/d405172c/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 59d2451..d244bd4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.14 - ???
 
+  * Add mime definition for Windows Media Metafile (TIKA-2004).
+
   * Add mime definitions of iCal and vCalendar (TIKA-2006).
 
   * Upgrade to PDFBox 2.0.2 (TIKA-1996).

http://git-wip-us.apache.org/repos/asf/tika/blob/d405172c/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index ca84d94..210ce0c 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -6584,11 +6584,17 @@
 
   <mime-type type="video/x-ms-asf">
     <glob pattern="*.asf"/>
-    <glob pattern="*.asx"/>
     <magic>
        <match value="0x3026b275" type="big32" offset="0" />
     </magic>
   </mime-type>
+  <mime-type type="application/x-ms-asx">
+    <_comment>Windows Media Metafile</_comment>
+    <glob pattern="*.asx"/>
+    <root-XML localName="asx"/>
+    <root-XML localName="ASX"/>
+    <sub-class-of type="application/xml"/>
+  </mime-type>
   <mime-type type="video/x-ms-wm">
     <glob pattern="*.wm"/>
   </mime-type>

http://git-wip-us.apache.org/repos/asf/tika/blob/d405172c/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java b/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
index 799f977..45256fb 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
@@ -833,7 +833,7 @@ public class TikaDetectionTest {
         assertEquals("video/x-flv", tika.detect("x.flv"));
         assertEquals("video/x-m4v", tika.detect("x.m4v"));
         assertEquals("video/x-ms-asf", tika.detect("x.asf"));
-        assertEquals("video/x-ms-asf", tika.detect("x.asx"));
+        assertEquals("application/x-ms-asx", tika.detect("x.asx"));
         assertEquals("video/x-ms-wm", tika.detect("x.wm"));
         assertEquals("video/x-ms-wmv", tika.detect("x.wmv"));
         assertEquals("video/x-ms-wmx", tika.detect("x.wmx"));

http://git-wip-us.apache.org/repos/asf/tika/blob/d405172c/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 3f22842..5dda858 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -1037,7 +1037,12 @@ public class TestMimeTypes {
         assertType("text/calendar", "testICalendar.ics");
         assertType("text/x-vcalendar", "testVCalendar.vcs");
     }
-    
+
+    @Test
+    public void testASX() throws Exception {
+        assertType("application/x-ms-asx", "testWindowsMediaMeta.asx");
+    }
+
     private void assertText(byte[] prefix) throws IOException {
         assertMagic("text/plain", prefix);
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/d405172c/tika-parsers/src/test/resources/test-documents/testWindowsMediaMeta.asx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWindowsMediaMeta.asx b/tika-parsers/src/test/resources/test-documents/testWindowsMediaMeta.asx
new file mode 100644
index 0000000..cca9ae5
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testWindowsMediaMeta.asx
@@ -0,0 +1,6 @@
+<asx version = "3.0">
+  <entry>
+	<title>Council Video for 4/7/2009</title>
+   	<ref href = "mms://media1.fresno.gov/CouncilVideoArchive/20090407.wmv"/>
+  </entry>
+</asx>
\ No newline at end of file


[04/10] tika git commit: TIKA-2008 -- change owner metadata key from TikaCoreProperties.CREATOR to TikaCoreProperties.MODIFIER

Posted by ta...@apache.org.
TIKA-2008 -- change owner metadata key from TikaCoreProperties.CREATOR to TikaCoreProperties.MODIFIER


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/f7fe685e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/f7fe685e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/f7fe685e

Branch: refs/heads/TIKA-1508
Commit: f7fe685e44d9cbc2ea9391723c79753d77a0525c
Parents: 01a9b6d
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 15 09:28:20 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 15 09:28:20 2016 -0400

----------------------------------------------------------------------
 .../org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/f7fe685e/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
index 3cef3df..54a5f42 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
@@ -26,6 +26,6 @@ public class MSOwnerFileParserTest extends TikaTest {
     @Test
     public void testBasic() throws Exception {
         XMLResult r = getXML("testMSOwnerFile");
-        assertEquals("heidi", r.metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("heidi", r.metadata.get(TikaCoreProperties.MODIFIER));
     }
 }


[05/10] tika git commit: TIKA-2008 -- change owner metadata key from TikaCoreProperties.CREATOR to TikaCoreProperties.MODIFIER

Posted by ta...@apache.org.
TIKA-2008 -- change owner metadata key from TikaCoreProperties.CREATOR to TikaCoreProperties.MODIFIER


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/592ae6a6
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/592ae6a6
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/592ae6a6

Branch: refs/heads/TIKA-1508
Commit: 592ae6a6cda88ed33c667e1b9211cd7c393710c4
Parents: f7fe685
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 15 10:03:44 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 15 10:03:44 2016 -0400

----------------------------------------------------------------------
 .../java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/592ae6a6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
index 02c07a6..c7019f2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
@@ -66,7 +66,7 @@ public class MSOwnerFileParser extends AbstractParser {
         IOUtils.readFully(stream, asciiNameBytes);
         int asciiNameLength = (int)asciiNameBytes[0];//don't need to convert to unsigned int because it can't be that long
         String asciiName = new String(asciiNameBytes, 1, asciiNameLength, StandardCharsets.US_ASCII);
-        metadata.set(TikaCoreProperties.CREATOR, asciiName);
+        metadata.set(TikaCoreProperties.MODIFIER, asciiName);
 
         int unicodeCharLength = stream.read();
         if (unicodeCharLength > 0) {
@@ -74,7 +74,7 @@ public class MSOwnerFileParser extends AbstractParser {
             byte[] unicodeBytes = new byte[unicodeCharLength * 2];
             IOUtils.readFully(stream, unicodeBytes);
             String unicodeName = new String(unicodeBytes, StandardCharsets.UTF_16LE);
-            metadata.set(TikaCoreProperties.CREATOR, unicodeName);
+            metadata.set(TikaCoreProperties.MODIFIER, unicodeName);
         }
         xhtml.endDocument();
     }