You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/03/01 18:32:11 UTC
svn commit: r1663133 -
/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
Author: nick
Date: Sun Mar 1 17:32:10 2015
New Revision: 1663133
URL: http://svn.apache.org/r1663133
Log:
Add a Tika CLI option for comparing with the File(1) magic directory, to report types to consider adding, and types we may be able to get magic for TIKA-289
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1663133&r1=1663132&r2=1663133&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Sun Mar 1 17:32:10 2015
@@ -21,10 +21,14 @@ import javax.xml.transform.TransformerCo
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
+
+import java.io.BufferedReader;
import java.io.File;
+import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
@@ -46,6 +50,7 @@ import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
+import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -78,7 +83,9 @@ import org.apache.tika.metadata.serializ
import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.NetworkParser;
@@ -356,6 +363,9 @@ public class TikaCLI {
} else if(arg.equals("--list-supported-types")){
pipeMode = false;
displaySupportedTypes();
+ } else if (arg.startsWith("--compare-file-magic=")) {
+ pipeMode = false;
+ compareFileMagic(arg.substring(arg.indexOf('=')+1));
} else if (arg.equals("--container-aware")
|| arg.equals("--container-aware-detector")) {
// ignore, as container-aware detectors are now always used
@@ -530,6 +540,9 @@ public class TikaCLI {
out.println(" --list-supported-types");
out.println(" List all known media types and related information");
out.println();
+ out.println();
+ out.println(" --compare-file-magic=<dir>");
+ out.println(" Compares Tika's known media types to the File(1) tool's magic directory");
out.println("Description:");
out.println(" Apache Tika will parse the file(s) specified on the");
out.println(" command line and output the extracted text content");
@@ -708,6 +721,115 @@ public class TikaCLI {
}
}
}
+
+ /**
+ * Compares our mime types registry with the File(1) tool's
+ * directory of (uncompiled) Magic entries.
+ * (Well, those with mimetypes anyway)
+ * @param magicDir Path to the magic directory
+ */
+ private void compareFileMagic(String magicDir) throws Exception {
+ Set<String> tikaLacking = new TreeSet<String>();
+ Set<String> tikaNoMagic = new TreeSet<String>();
+
+ // Sanity check
+ File dir = new File(magicDir);
+ if ((new File(dir, "elf")).exists() &&
+ (new File(dir, "mime")).exists() &&
+ (new File(dir, "vorbis")).exists()) {
+ // Looks plausible
+ } else {
+ throw new IllegalArgumentException(
+ magicDir + " doesn't seem to hold uncompressed file magic entries");
+ }
+
+ // Find all the mimetypes in the directory
+ Set<String> fileMimes = new HashSet<String>();
+ for (File mf : dir.listFiles()) {
+ if (mf.isFile()) {
+ BufferedReader r = new BufferedReader(new InputStreamReader(
+ new FileInputStream(mf), IOUtils.UTF_8));
+ String line;
+ while ((line = r.readLine()) != null) {
+ if (line.startsWith("!:mime") ||
+ line.startsWith("#!:mime")) {
+ String mime = line.substring(7).trim();
+ fileMimes.add(mime);
+ }
+ }
+ r.close();
+ }
+ }
+
+ // See how those compare to the Tika ones
+ TikaConfig config = TikaConfig.getDefaultConfig();
+ MimeTypes mimeTypes = config.getMimeRepository();
+ MediaTypeRegistry registry = config.getMediaTypeRegistry();
+ for (String mime : fileMimes) {
+ try {
+ MimeType type = mimeTypes.getRegisteredMimeType(mime);
+
+ if (type == null) {
+ // Tika doesn't know about this one
+ tikaLacking.add(mime);
+ } else {
+ // Tika knows about this one!
+ // Check for magic on this, or parents
+ // TODO What about magic on children?
+ boolean hasMagic = false;
+ while (type != null && !hasMagic) {
+ if (type.hasMagic()) {
+ // Has magic, fine
+ hasMagic = true;
+ } else {
+ // Check the parent next
+ MediaType parent = registry.getSupertype(type.getType());
+ if (parent == MediaType.APPLICATION_XML ||
+ parent == MediaType.TEXT_PLAIN ||
+ parent == MediaType.OCTET_STREAM) {
+ // Stop checking parents if we hit a top level type
+ parent = null;
+ }
+ if (parent != null) {
+ type = mimeTypes.getRegisteredMimeType(parent.toString());
+ } else {
+ type = null;
+ }
+ }
+ }
+ if (!hasMagic) {
+ tikaNoMagic.add(mime);
+ }
+ }
+ } catch (MimeTypeException e) {
+ // Broken entry in the file magic directory
+ // Silently skip
+ }
+ }
+
+ // Check how many tika knows about
+ int tikaTypes = 0;
+ int tikaAliases = 0;
+ for (MediaType type : registry.getTypes()) {
+ tikaTypes++;
+ tikaAliases += registry.getAliases(type).size();
+ }
+
+ // Report
+ System.out.println("Tika knows about " + tikaTypes + " unique mime types");
+ System.out.println("Tika knows about " + (tikaTypes+tikaAliases) + " mime types including aliases");
+ System.out.println("The File Magic directory knows about " + fileMimes.size() + " unique mime types");
+ System.out.println();
+ System.out.println("The following mime types are known to File but not Tika:");
+ for (String mime : tikaLacking) {
+ System.out.println(" " + mime);
+ }
+ System.out.println();
+ System.out.println("The following mime types from File have no Tika magic (but their children might):");
+ for (String mime : tikaNoMagic) {
+ System.out.println(" " + mime);
+ }
+ }
/**
* Returns a output writer with the given encoding.