You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2013/02/04 17:36:42 UTC
svn commit: r1442168 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/mime/
tika-core/src/main/resources/org/apache/tika/mime/
tika-core/src/test/java/org/apache/tika/mime/
tika-parsers/src/test/java/org/apache/tika/mime/
Author: nick
Date: Mon Feb 4 16:36:42 2013
New Revision: 1442168
URL: http://svn.apache.org/viewvc?rev=1442168&view=rev
Log:
Support tika:link and tika:uti mimetype extensions, along with unit tests. Modified version of the patch from TIKA-1012
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java?rev=1442168&r1=1442167&r2=1442168&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java Mon Feb 4 16:36:42 2013
@@ -17,7 +17,9 @@
package org.apache.tika.mime;
import java.io.Serializable;
+import java.net.URI;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collections;
import java.util.List;
@@ -75,6 +77,21 @@ public final class MimeType implements C
private final MediaType type;
/**
+ * The MimeType acronym; an empty string until one is set
+ */
+ private String acronym = "";
+
+ /**
+ * The Uniform Type Identifier, see http://en.wikipedia.org/wiki/Uniform_Type_Identifier
+ */
+ private String uti = "";
+
+ /**
+ * Documentation links; starts out empty and is replaced by a new unmodifiable list on each addLink call
+ */
+ private List<URI> links = Collections.emptyList();
+
+ /**
* Description of this media type.
*/
private String description = "";
@@ -148,6 +165,75 @@ public final class MimeType implements C
}
this.description = description;
}
+
+
+ /**
+ * Returns the acronym for this mime type.
+ *
+ * @return mime type acronym, or an empty string if none has been set (never null)
+ */
+ public String getAcronym() {
+ return acronym;
+ }
+
+ /**
+ * Set an acronym for the mime type
+ *
+ * @param v the acronym to store; a null value is rejected with an IllegalArgumentException
+ */
+ void setAcronym(String v) {
+ if (v == null) {
+ throw new IllegalArgumentException("Acronym is missing");
+ }
+ acronym = v;
+ }
+
+ /**
+ * Get the UTI for this mime type.
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Uniform_Type_Identifier">Uniform Type Identifier</a>
+ *
+ * @return The Uniform Type Identifier, or an empty string if none has been set (never null)
+ */
+ public String getUniformTypeIdentifier() {
+ return uti;
+ }
+
+ /**
+ * Set The Uniform Type Identifier
+ *
+ * @param v the Uniform Type Identifier to store; a null value is rejected with an IllegalArgumentException
+ */
+ void setUniformTypeIdentifier(String v) {
+ if (v == null) {
+ throw new IllegalArgumentException("Uniform Type Identifier is missing");
+ }
+ uti = v;
+ }
+
+ /**
+ * Get a list of links to help document this mime type
+ *
+ * @return an unmodifiable list of links, empty if none were added (will never be null)
+ */
+ public List<URI> getLinks() {
+ return links; // this is already unmodifiable
+ }
+
+ /**
+ * Add a link to this mime type
+ * @param link the link to append; a null value is rejected with an IllegalArgumentException
+ */
+ void addLink(URI link) {
+ if(link==null) {
+ throw new IllegalArgumentException("Missing Link");
+ }
+ List<URI> copy = new ArrayList<URI>(links.size()+1); // build a fresh list; the current one is unmodifiable
+ copy.addAll(links);
+ copy.add(link);
+ links = Collections.unmodifiableList(copy);
+ }
+
/**
* Add some rootXML info to this mime-type
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=1442168&r1=1442167&r2=1442168&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java Mon Feb 4 16:36:42 2013
@@ -19,9 +19,10 @@ package org.apache.tika.mime;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
-import java.util.LinkedList;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;
@@ -89,6 +90,11 @@ import org.xml.sax.helpers.DefaultHandle
* type CDATA #REQUIRED>
* ]>
* </pre>
+ *
+ * In addition to the standard fields, this will also read two Tika specific fields:
+ * - link
+ * - uti
+ *
*
* @see http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec
*/
@@ -154,7 +160,10 @@ class MimeTypesReader extends DefaultHan
} else if (SUB_CLASS_OF_TAG.equals(qName)) {
String parent = attributes.getValue(SUB_CLASS_TYPE_ATTR);
types.setSuperType(type, MediaType.parse(parent));
- } else if (COMMENT_TAG.equals(qName)) {
+ } else if (ACRONYM_TAG.equals(qName)||
+ COMMENT_TAG.equals(qName)||
+ TIKA_LINK_TAG.equals(qName)||
+ TIKA_UTI_TAG.equals(qName)) {
characters = new StringBuilder();
} else if (GLOB_TAG.equals(qName)) {
String pattern = attributes.getValue(PATTERN_ATTR);
@@ -199,6 +208,20 @@ class MimeTypesReader extends DefaultHan
} else if (COMMENT_TAG.equals(qName)) {
type.setDescription(characters.toString().trim());
characters = null;
+ } else if (ACRONYM_TAG.equals(qName)) {
+ type.setAcronym(characters.toString().trim());
+ characters = null;
+ } else if (TIKA_UTI_TAG.equals(qName)) {
+ type.setUniformTypeIdentifier(characters.toString().trim());
+ characters = null;
+ } else if (TIKA_LINK_TAG.equals(qName)) {
+ try {
+ type.addLink(new URI(characters.toString().trim()));
+ }
+ catch (URISyntaxException e) {
+ throw new IllegalArgumentException("unable to parse link: "+characters, e);
+ }
+ characters = null;
} else if (MATCH_TAG.equals(qName)) {
current.stop();
} else if (MAGIC_TAG.equals(qName)) {
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java?rev=1442168&r1=1442167&r2=1442168&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java Mon Feb 4 16:36:42 2013
@@ -27,6 +27,8 @@ public interface MimeTypesReaderMetKeys
String MIME_TYPE_TYPE_ATTR = "type";
+ String ACRONYM_TAG = "acronym";
+
String COMMENT_TAG = "_comment";
String GLOB_TAG = "glob";
@@ -63,4 +65,7 @@ public interface MimeTypesReaderMetKeys
String LOCAL_NAME_ATTR = "localName";
+ String TIKA_LINK_TAG = "tika:link";
+
+ String TIKA_UTI_TAG = "tika:uti";
}
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1442168&r1=1442167&r2=1442168&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Mon Feb 4 16:36:42 2013
@@ -3752,6 +3752,8 @@
<alias type="image/bmp"/>
<acronym>BMP</acronym>
<_comment>Windows bitmap</_comment>
+ <tika:link>http://en.wikipedia.org/wiki/BMP_file_format</tika:link>
+ <tika:uti>com.microsoft.bmp</tika:uti>
<magic priority="50">
<match value="BM" type="string" offset="0">
<match value="0x0100" type="string" offset="26">
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java?rev=1442168&r1=1442167&r2=1442168&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java Mon Feb 4 16:36:42 2013
@@ -119,6 +119,17 @@ public class MimeTypesReaderTest extends
}
/**
+ * TIKA-1012 Ensures that the acronym, tika:uti and tika:link values of image/x-ms-bmp are read
+ */
+ public void testReadExtendedMetadata() throws Exception {
+ MimeType bmp = this.mimeTypes.forName("image/x-ms-bmp");
+ assertEquals("BMP", bmp.getAcronym());
+ assertEquals("com.microsoft.bmp", bmp.getUniformTypeIdentifier());
+ assertEquals("http://en.wikipedia.org/wiki/BMP_file_format",
+ bmp.getLinks().get(0).toString());
+ }
+
+ /**
* TIKA-746 Ensures that the custom mimetype maps were also
* loaded and used
*/
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java?rev=1442168&r1=1442167&r2=1442168&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java Mon Feb 4 16:36:42 2013
@@ -63,13 +63,34 @@ public class MimeTypeTest extends TestCa
}
/** Test MimeType setDescription() */
- public void testSetDescription() {
+ public void testSetEmptyValues() {
try {
text.setDescription(null);
fail("Expected IllegalArgumentException");
} catch (IllegalArgumentException e) {
// expected result
}
+
+ try {
+ text.setAcronym(null);
+ fail("Expected IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ // expected result
+ }
+
+ try {
+ text.addLink(null);
+ fail("Expected IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ // expected result
+ }
+
+ try {
+ text.setUniformTypeIdentifier(null);
+ fail("Expected IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ // expected result
+ }
}
}