You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/05/02 17:31:53 UTC

svn commit: r1677327 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/mime/ test/java/org/apache/tika/mime/

Author: nick
Date: Sat May  2 15:31:53 2015
New Revision: 1677327

URL: http://svn.apache.org/r1677327
Log:
Update whitespace to match coding conventions

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java?rev=1677327&r1=1677326&r2=1677327&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java Sat May  2 15:31:53 2015
@@ -39,510 +39,503 @@ public class ProbabilisticMimeDetectionS
 
     private MimeTypes mimeTypes;
 
-	private final MediaType rootMediaType;
+    private final MediaType rootMediaType;
 
-	/** probability parameters default value */
-	private static final float DEFAULT_MAGIC_TRUST = 0.9f;
-	private static final float DEFAULT_META_TRUST = 0.8f;
-	private static final float DEFAULT_EXTENSION_TRUST = 0.8f;
-	private float priorMagicFileType, priorExtensionFileType,
-			priorMetaFileType;
-	private float magic_trust, extension_trust, meta_trust;
-	private float magic_neg, extension_neg, meta_neg;
-	/*
-	 * any posterior probability lower than the threshold, will be considered as
-	 * an oct-stream type, the default value is 0.5
-	 */
-	private float threshold;
-
-	/*
-	 * this change rate is used when there are multiple types predicted by
-	 * magic-bytes. the first predicted type has the highest probability, and
-	 * the probability for the next type predicted by magic-bytes will decay
-	 * with this change rate. The idea is to have the first one to take
-	 * precedence among the multiple possible types predicted by MAGIC-bytes.
-	 */
-	private float changeRate;
-
-	/** ***********************/
-
-	public ProbabilisticMimeDetectionSelector() {
-		this(MimeTypes.getDefaultMimeTypes(), null);
-	}
-
-	public ProbabilisticMimeDetectionSelector(final Builder builder) {
-		this(MimeTypes.getDefaultMimeTypes(), builder);
-	}
-	
-	public ProbabilisticMimeDetectionSelector(final MimeTypes mimeTypes) {
-		this(mimeTypes, null);
-	} 
-	
-	public ProbabilisticMimeDetectionSelector(final MimeTypes mimeTypes,
-			final Builder builder) {
-		this.mimeTypes = mimeTypes;
-		rootMediaType = MediaType.OCTET_STREAM;
-		this.initializeDefaultProbabilityParameters();
-		this.changeRate = 0.1f;
-		if (builder != null) {
-			priorMagicFileType = builder.priorMagicFileType == 0f ? priorMagicFileType
-					: builder.priorMagicFileType;
-			priorExtensionFileType = builder.priorExtensionFileType == 0f ? priorExtensionFileType
-					: builder.priorExtensionFileType;
-			priorMetaFileType = builder.priorMetaFileType == 0f ? priorMetaFileType
-					: builder.priorMetaFileType;
-
-			magic_trust = builder.magic_trust == 0f ? magic_trust
-					: builder.extension_neg;
-			extension_trust = builder.extension_trust == 0f ? extension_trust
-					: builder.extension_trust;
-			meta_trust = builder.meta_trust == 0f ? meta_trust
-					: builder.meta_trust;
-
-			magic_neg = builder.magic_neg == 0f ? magic_neg : builder.magic_neg;
-			extension_neg = builder.extension_neg == 0f ? extension_neg
-					: builder.extension_neg;
-			meta_neg = builder.meta_neg == 0f ? meta_neg : builder.meta_neg;
-			threshold = builder.threshold == 0f ? threshold : builder.threshold;
-		}
-	}
-
-	/**
-	 * Initilize probability parameters with default values;
-	 */
-	private void initializeDefaultProbabilityParameters() {
-		priorMagicFileType = 0.5f;
-		priorExtensionFileType = 0.5f;
-		priorMetaFileType = 0.5f;
-		magic_trust = DEFAULT_MAGIC_TRUST;
-		extension_trust = DEFAULT_EXTENSION_TRUST;
-		meta_trust = DEFAULT_META_TRUST;
-
-		// probability of the type detected by magic test given that the type is
-		// not the detected type. The default is taken by 1 - the magic trust
-		magic_neg = 1 - DEFAULT_MAGIC_TRUST;
-		// probability of the type detected by extension test given that the
-		// type is not the type detected by extension test
-		extension_neg = 1 - DEFAULT_EXTENSION_TRUST;
-		// same as above; but it could be customized to suffice different use.
-		meta_neg = 1 - DEFAULT_META_TRUST;
-		threshold = 0.5001f;
-	}
-
-	public MediaType detect(InputStream input, Metadata metadata)
-			throws IOException {
-		List<MimeType> possibleTypes = null;
-
-		// Get type based on magic prefix
-		if (input != null) {
-			input.mark(mimeTypes.getMinLength());
-			try {
-				byte[] prefix = mimeTypes.readMagicHeader(input);
-				possibleTypes = mimeTypes.getMimeType(prefix);
-			} finally {
-				input.reset();
-			}
-		}
-
-		MimeType extHint = null;
-		// Get type based on resourceName hint (if available)
-		String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
-		if (resourceName != null) {
-			String name = null;
-
-			// Deal with a URI or a path name in as the resource name
-			try {
-				URI uri = new URI(resourceName);
-				String path = uri.getPath();
-				if (path != null) {
-					int slash = path.lastIndexOf('/');
-					if (slash + 1 < path.length()) {
-						name = path.substring(slash + 1);
-					}
-				}
-			} catch (URISyntaxException e) {
-				name = resourceName;
-			}
-
-			if (name != null) {
-				// MimeType hint = getMimeType(name);
-				extHint = mimeTypes.getMimeType(name);
-				// If we have some types based on mime magic, try to specialise
-				// and/or select the type based on that
-				// Otherwise, use the type identified from the name
-				// possibleTypes = applyHint(possibleTypes, hint);
-			}
-		}
-
-		// Get type based on metadata hint (if available)
-		MimeType metaHint = null;
-		String typeName = metadata.get(Metadata.CONTENT_TYPE);
-		if (typeName != null) {
-			try {
-				// MimeType hint = forName(typeName);
-				metaHint = mimeTypes.forName(typeName);
-				// possibleTypes = applyHint(possibleTypes, hint);
-			} catch (MimeTypeException e) {
-				// Malformed type name, ignore
-			}
-		}
-
-		/*
-		 * the following calls the probability selection.
-		 */
-		return applyProbilities(possibleTypes, extHint, metaHint);
-	}
-
-	private MediaType applyProbilities(final List<MimeType> possibleTypes,
-			final MimeType extMimeType, final MimeType metadataMimeType) {
-
-		/* initialize some probability variables */
-		MediaType extensionMediaType_ = extMimeType == null ? null
-				: extMimeType.getType();
-		MediaType metaMediaType_ = metadataMimeType == null ? null
-				: metadataMimeType.getType();
-
-		int n = possibleTypes.size();
-		float mag_trust = magic_trust;
-		float mag_neg = magic_neg;
-		float ext_trust = extension_trust;
-		float ext_neg = extension_neg;
-		float met_trust = meta_trust;
-		float met_neg = meta_neg;
-		/* ************************** */
-
-		/* pre-process some probability variables */
-		if (extensionMediaType_ == null
-				|| extensionMediaType_.compareTo(rootMediaType) == 0) {
-			/*
-			 * this is a root type, that means the extension method fails to
-			 * identify any type.
-			 */
-			ext_trust = 1;
-			ext_neg = 1;
-		}
-		if (metaMediaType_ == null
-				|| metaMediaType_.compareTo(rootMediaType) == 0) {
-			met_trust = 1;
-			met_neg = 1;
-		}
-
-		float maxProb = -1f;
-		MediaType bestEstimate = rootMediaType;
-
-		if (possibleTypes != null && !possibleTypes.isEmpty()) {
-			int i;
-			for (i = 0; i < n; i++) {
-				MediaType magictype = possibleTypes.get(i).getType();
-				MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
-				if (magictype != null && magictype.equals(rootMediaType)) {
-					mag_trust = 1;
-					mag_neg = 1;
-				} else {
-					// check if each identified type belongs to the same class;
-					if (extensionMediaType_ != null) {
-						if (extensionMediaType_.equals(magictype)
-								|| registry.isSpecializationOf(
-										extensionMediaType_, magictype)) {
-							// Use just this type
-							possibleTypes.set(i, extMimeType);
-						} else if (registry.isSpecializationOf(magictype,
-								extensionMediaType_)) {
-							extensionMediaType_ = magictype;
-						}
-					}
-					if (metaMediaType_ != null) {
-						if (metaMediaType_.equals(magictype)
-								|| registry.isSpecializationOf(metaMediaType_,
-										magictype)) {
-							// Use just this type
-							possibleTypes.set(i, metadataMimeType);
-						} else if (registry.isSpecializationOf(magictype,
-								metaMediaType_)) {
-							metaMediaType_ = magictype;
-						}
-					}
-				}
-
-				/*
-				 * prepare the conditional probability for file type prediction.
-				 */
-
-				float[] results = new float[3];
-				float[] trust1 = new float[3];
-				float[] negtrust1 = new float[3];
-				magictype = possibleTypes.get(i).getType();
-
-				if (i > 0) {
-					/*
-					 * decay as our trust goes down with next type predicted by
-					 * magic
-					 */
-					mag_trust = mag_trust * (1 - changeRate);
-					/*
-					 * grow as our trust goes down
-					 */
-					mag_neg = mag_neg * (1 + changeRate);
-
-				}
-
-				if (magictype != null && mag_trust != 1) {
-					trust1[0] = mag_trust;
-					negtrust1[0] = mag_neg;
-					if (metaMediaType_ != null && met_trust != 1) {
-						if (magictype.equals(metaMediaType_)) {
-							trust1[1] = met_trust;
-							negtrust1[1] = met_neg;
-						} else {
-							trust1[1] = 1 - met_trust;
-							negtrust1[1] = 1 - met_neg;
-						}
-					} else {
-						trust1[1] = 1;
-						negtrust1[1] = 1;
-					}
-					if (extensionMediaType_ != null && ext_trust != 1) {
-						if (magictype.equals(extensionMediaType_)) {
-							trust1[2] = ext_trust;
-							negtrust1[2] = ext_neg;
-						} else {
-							trust1[2] = 1 - ext_trust;
-							negtrust1[2] = 1 - ext_neg;
-						}
-					} else {
-						trust1[2] = 1;
-						negtrust1[2] = 1;
-					}
-				} else {
-					results[0] = 0.1f;
-				}
-
-				float[] trust2 = new float[3];
-				float[] negtrust2 = new float[3];
-				if (metadataMimeType != null && met_trust != 1) {
-					trust2[1] = met_trust;
-					negtrust2[1] = met_neg;
-					if (magictype != null && mag_trust != 1) {
-						if (metaMediaType_.equals(magictype)) {
-							trust2[0] = mag_trust;
-							negtrust2[0] = mag_neg;
-						} else {
-							trust2[0] = 1 - mag_trust;
-							negtrust2[0] = 1 - mag_neg;
-						}
-
-					} else {
-						trust2[0] = 1f;
-						negtrust2[0] = 1f;
-					}
-					if (extensionMediaType_ != null && ext_trust != 1) {
-						if (metaMediaType_.equals(extensionMediaType_)) {
-							trust2[2] = ext_trust;
-							negtrust2[2] = ext_neg;
-						} else {
-							trust2[2] = 1 - ext_trust;
-							negtrust2[2] = 1 - ext_neg;
-						}
-					} else {
-						trust2[2] = 1f;
-						negtrust2[2] = 1f;
-					}
-				} else {
-					results[1] = 0.1f;
-				}
-
-				float[] trust3 = new float[3];
-				float[] negtrust3 = new float[3];
-				if (extensionMediaType_ != null && ext_trust != 1) {
-					trust3[2] = ext_trust;
-					negtrust3[2] = ext_neg;
-					if (magictype != null && mag_trust != 1) {
-						if (magictype.equals(extensionMediaType_)) {
-							trust3[0] = mag_trust;
-							negtrust3[0] = mag_neg;
-						} else {
-							trust3[0] = 1 - mag_trust;
-							negtrust3[0] = 1 - mag_neg;
-						}
-					} else {
-						trust3[0] = 1f;
-						negtrust3[0] = 1f;
-					}
-
-					if (metaMediaType_ != null && met_trust != 1) {
-						if (metaMediaType_.equals(extensionMediaType_)) {
-							trust3[1] = met_trust;
-							negtrust3[1] = met_neg;
-						} else {
-							trust3[1] = 1 - met_trust;
-							negtrust3[1] = 1 - met_neg;
-						}
-					} else {
-						trust3[1] = 1f;
-						negtrust3[1] = 1f;
-					}
-				} else {
-					results[2] = 0.1f;
-				}
-				/*
-				 * compute the posterior probability for each predicted file
-				 * type and store them into the "results" array.
-				 */
-				float pPrime = priorMagicFileType;
-				float deno = 1 - priorMagicFileType;
-				int j;
-
-				if (results[0] == 0) {
-					for (j = 0; j < trust1.length; j++) {
-						pPrime *= trust1[j];
-						if (trust1[j] != 1) {
-							deno *= negtrust1[j];
-						}
-					}
-					pPrime /= (pPrime + deno);
-					results[0] = pPrime;
-
-				}
-				if (maxProb < results[0]) {
-					maxProb = results[0];
-					bestEstimate = magictype;
-				}
-
-				pPrime = priorMetaFileType;
-				deno = 1 - priorMetaFileType;
-				if (results[1] == 0) {
-					for (j = 0; j < trust2.length; j++) {
-						pPrime *= trust2[j];
-						if (trust2[j] != 1) {
-							deno *= negtrust2[j];
-						}
-					}
-					pPrime /= (pPrime + deno);
-					results[1] = pPrime;
-
-				}
-				if (maxProb < results[1]) {
-					maxProb = results[1];
-					bestEstimate = metaMediaType_;
-				}
-
-				pPrime = priorExtensionFileType;
-				deno = 1 - priorExtensionFileType;
-				if (results[2] == 0) {
-					for (j = 0; j < trust3.length; j++) {
-						pPrime *= trust3[j];
-						if (trust3[j] != 1) {
-							deno *= negtrust3[j];
-						}
-					}
-					pPrime /= (pPrime + deno);
-					results[2] = pPrime;
-				}
-				if (maxProb < results[2]) {
-					maxProb = results[2];
-					bestEstimate = extensionMediaType_;
-				}
-/*
+    /** probability parameters default value */
+    private static final float DEFAULT_MAGIC_TRUST = 0.9f;
+    private static final float DEFAULT_META_TRUST = 0.8f;
+    private static final float DEFAULT_EXTENSION_TRUST = 0.8f;
+    private float priorMagicFileType, priorExtensionFileType,
+    priorMetaFileType;
+    private float magic_trust, extension_trust, meta_trust;
+    private float magic_neg, extension_neg, meta_neg;
+    /*
+     * any posterior probability lower than the threshold, will be considered as
+     * an oct-stream type, the default value is 0.5
+     */
+    private float threshold;
+
+    /*
+     * this change rate is used when there are multiple types predicted by
+     * magic-bytes. the first predicted type has the highest probability, and
+     * the probability for the next type predicted by magic-bytes will decay
+     * with this change rate. The idea is to have the first one to take
+     * precedence among the multiple possible types predicted by MAGIC-bytes.
+     */
+    private float changeRate;
+
+    /** ***********************/
+
+    public ProbabilisticMimeDetectionSelector() {
+        this(MimeTypes.getDefaultMimeTypes(), null);
+    }
+
+    public ProbabilisticMimeDetectionSelector(final Builder builder) {
+        this(MimeTypes.getDefaultMimeTypes(), builder);
+    }
+
+    public ProbabilisticMimeDetectionSelector(final MimeTypes mimeTypes) {
+        this(mimeTypes, null);
+    } 
+
+    public ProbabilisticMimeDetectionSelector(final MimeTypes mimeTypes,
+            final Builder builder) {
+        this.mimeTypes = mimeTypes;
+        rootMediaType = MediaType.OCTET_STREAM;
+        this.initializeDefaultProbabilityParameters();
+        this.changeRate = 0.1f;
+        if (builder != null) {
+            priorMagicFileType = builder.priorMagicFileType == 0f ? 
+                    priorMagicFileType : builder.priorMagicFileType;
+            priorExtensionFileType = builder.priorExtensionFileType == 0f ? 
+                    priorExtensionFileType : builder.priorExtensionFileType;
+            priorMetaFileType = builder.priorMetaFileType == 0f ? 
+                    priorMetaFileType : builder.priorMetaFileType;
+
+            magic_trust = builder.magic_trust == 0f ? magic_trust : builder.extension_neg;
+            extension_trust = builder.extension_trust == 0f ? extension_trust : builder.extension_trust;
+            meta_trust = builder.meta_trust == 0f ? meta_trust : builder.meta_trust;
+
+            magic_neg = builder.magic_neg == 0f ? magic_neg : builder.magic_neg;
+            extension_neg = builder.extension_neg == 0f ? 
+                    extension_neg : builder.extension_neg;
+            meta_neg = builder.meta_neg == 0f ? meta_neg : builder.meta_neg;
+            threshold = builder.threshold == 0f ? threshold : builder.threshold;
+        }
+    }
+
+    /**
+     * Initilize probability parameters with default values;
+     */
+    private void initializeDefaultProbabilityParameters() {
+        priorMagicFileType = 0.5f;
+        priorExtensionFileType = 0.5f;
+        priorMetaFileType = 0.5f;
+        magic_trust = DEFAULT_MAGIC_TRUST;
+        extension_trust = DEFAULT_EXTENSION_TRUST;
+        meta_trust = DEFAULT_META_TRUST;
+
+        // probability of the type detected by magic test given that the type is
+        // not the detected type. The default is taken by 1 - the magic trust
+        magic_neg = 1 - DEFAULT_MAGIC_TRUST;
+        // probability of the type detected by extension test given that the
+        // type is not the type detected by extension test
+        extension_neg = 1 - DEFAULT_EXTENSION_TRUST;
+        // same as above; but it could be customized to suffice different use.
+        meta_neg = 1 - DEFAULT_META_TRUST;
+        threshold = 0.5001f;
+    }
+
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        List<MimeType> possibleTypes = null;
+
+        // Get type based on magic prefix
+        if (input != null) {
+            input.mark(mimeTypes.getMinLength());
+            try {
+                byte[] prefix = mimeTypes.readMagicHeader(input);
+                possibleTypes = mimeTypes.getMimeType(prefix);
+            } finally {
+                input.reset();
+            }
+        }
+
+        MimeType extHint = null;
+        // Get type based on resourceName hint (if available)
+        String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
+        if (resourceName != null) {
+            String name = null;
+
+            // Deal with a URI or a path name in as the resource name
+            try {
+                URI uri = new URI(resourceName);
+                String path = uri.getPath();
+                if (path != null) {
+                    int slash = path.lastIndexOf('/');
+                    if (slash + 1 < path.length()) {
+                        name = path.substring(slash + 1);
+                    }
+                }
+            } catch (URISyntaxException e) {
+                name = resourceName;
+            }
+
+            if (name != null) {
+                // MimeType hint = getMimeType(name);
+                extHint = mimeTypes.getMimeType(name);
+                // If we have some types based on mime magic, try to specialise
+                // and/or select the type based on that
+                // Otherwise, use the type identified from the name
+                // possibleTypes = applyHint(possibleTypes, hint);
+            }
+        }
+
+        // Get type based on metadata hint (if available)
+        MimeType metaHint = null;
+        String typeName = metadata.get(Metadata.CONTENT_TYPE);
+        if (typeName != null) {
+            try {
+                // MimeType hint = forName(typeName);
+                metaHint = mimeTypes.forName(typeName);
+                // possibleTypes = applyHint(possibleTypes, hint);
+            } catch (MimeTypeException e) {
+                // Malformed type name, ignore
+            }
+        }
+
+        /*
+         * the following calls the probability selection.
+         */
+        return applyProbilities(possibleTypes, extHint, metaHint);
+    }
+
+    private MediaType applyProbilities(final List<MimeType> possibleTypes,
+            final MimeType extMimeType, final MimeType metadataMimeType) {
+
+        /* initialize some probability variables */
+        MediaType extensionMediaType_ = extMimeType == null ? null : extMimeType.getType();
+        MediaType metaMediaType_ = metadataMimeType == null ? null : metadataMimeType.getType();
+
+        int n = possibleTypes.size();
+        float mag_trust = magic_trust;
+        float mag_neg = magic_neg;
+        float ext_trust = extension_trust;
+        float ext_neg = extension_neg;
+        float met_trust = meta_trust;
+        float met_neg = meta_neg;
+        /* ************************** */
+
+        /* pre-process some probability variables */
+        if (extensionMediaType_ == null || extensionMediaType_.compareTo(rootMediaType) == 0) {
+            /*
+             * this is a root type, that means the extension method fails to
+             * identify any type.
+             */
+            ext_trust = 1;
+            ext_neg = 1;
+        }
+        if (metaMediaType_ == null || metaMediaType_.compareTo(rootMediaType) == 0) {
+            met_trust = 1;
+            met_neg = 1;
+        }
+
+        float maxProb = -1f;
+        MediaType bestEstimate = rootMediaType;
+
+        if (possibleTypes != null && !possibleTypes.isEmpty()) {
+            int i;
+            for (i = 0; i < n; i++) {
+                MediaType magictype = possibleTypes.get(i).getType();
+                MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
+                if (magictype != null && magictype.equals(rootMediaType)) {
+                    mag_trust = 1;
+                    mag_neg = 1;
+                } else {
+                    // check if each identified type belongs to the same class;
+                    if (extensionMediaType_ != null) {
+                        if (extensionMediaType_.equals(magictype)
+                                || registry.isSpecializationOf(
+                                        extensionMediaType_, magictype)) {
+                            // Use just this type
+                            possibleTypes.set(i, extMimeType);
+                        } else if (registry.isSpecializationOf(magictype,
+                                extensionMediaType_)) {
+                            extensionMediaType_ = magictype;
+                        }
+                    }
+                    if (metaMediaType_ != null) {
+                        if (metaMediaType_.equals(magictype)
+                                || registry.isSpecializationOf(metaMediaType_,
+                                        magictype)) {
+                            // Use just this type
+                            possibleTypes.set(i, metadataMimeType);
+                        } else if (registry.isSpecializationOf(magictype,
+                                metaMediaType_)) {
+                            metaMediaType_ = magictype;
+                        }
+                    }
+                }
+
+                /*
+                 * prepare the conditional probability for file type prediction.
+                 */
+
+                float[] results = new float[3];
+                float[] trust1 = new float[3];
+                float[] negtrust1 = new float[3];
+                magictype = possibleTypes.get(i).getType();
+
+                if (i > 0) {
+                    /*
+                     * decay as our trust goes down with next type predicted by
+                     * magic
+                     */
+                    mag_trust = mag_trust * (1 - changeRate);
+                    /*
+                     * grow as our trust goes down
+                     */
+                    mag_neg = mag_neg * (1 + changeRate);
+
+                }
+
+                if (magictype != null && mag_trust != 1) {
+                    trust1[0] = mag_trust;
+                    negtrust1[0] = mag_neg;
+                    if (metaMediaType_ != null && met_trust != 1) {
+                        if (magictype.equals(metaMediaType_)) {
+                            trust1[1] = met_trust;
+                            negtrust1[1] = met_neg;
+                        } else {
+                            trust1[1] = 1 - met_trust;
+                            negtrust1[1] = 1 - met_neg;
+                        }
+                    } else {
+                        trust1[1] = 1;
+                        negtrust1[1] = 1;
+                    }
+                    if (extensionMediaType_ != null && ext_trust != 1) {
+                        if (magictype.equals(extensionMediaType_)) {
+                            trust1[2] = ext_trust;
+                            negtrust1[2] = ext_neg;
+                        } else {
+                            trust1[2] = 1 - ext_trust;
+                            negtrust1[2] = 1 - ext_neg;
+                        }
+                    } else {
+                        trust1[2] = 1;
+                        negtrust1[2] = 1;
+                    }
+                } else {
+                    results[0] = 0.1f;
+                }
+
+                float[] trust2 = new float[3];
+                float[] negtrust2 = new float[3];
+                if (metadataMimeType != null && met_trust != 1) {
+                    trust2[1] = met_trust;
+                    negtrust2[1] = met_neg;
+                    if (magictype != null && mag_trust != 1) {
+                        if (metaMediaType_.equals(magictype)) {
+                            trust2[0] = mag_trust;
+                            negtrust2[0] = mag_neg;
+                        } else {
+                            trust2[0] = 1 - mag_trust;
+                            negtrust2[0] = 1 - mag_neg;
+                        }
+
+                    } else {
+                        trust2[0] = 1f;
+                        negtrust2[0] = 1f;
+                    }
+                    if (extensionMediaType_ != null && ext_trust != 1) {
+                        if (metaMediaType_.equals(extensionMediaType_)) {
+                            trust2[2] = ext_trust;
+                            negtrust2[2] = ext_neg;
+                        } else {
+                            trust2[2] = 1 - ext_trust;
+                            negtrust2[2] = 1 - ext_neg;
+                        }
+                    } else {
+                        trust2[2] = 1f;
+                        negtrust2[2] = 1f;
+                    }
+                } else {
+                    results[1] = 0.1f;
+                }
+
+                float[] trust3 = new float[3];
+                float[] negtrust3 = new float[3];
+                if (extensionMediaType_ != null && ext_trust != 1) {
+                    trust3[2] = ext_trust;
+                    negtrust3[2] = ext_neg;
+                    if (magictype != null && mag_trust != 1) {
+                        if (magictype.equals(extensionMediaType_)) {
+                            trust3[0] = mag_trust;
+                            negtrust3[0] = mag_neg;
+                        } else {
+                            trust3[0] = 1 - mag_trust;
+                            negtrust3[0] = 1 - mag_neg;
+                        }
+                    } else {
+                        trust3[0] = 1f;
+                        negtrust3[0] = 1f;
+                    }
+
+                    if (metaMediaType_ != null && met_trust != 1) {
+                        if (metaMediaType_.equals(extensionMediaType_)) {
+                            trust3[1] = met_trust;
+                            negtrust3[1] = met_neg;
+                        } else {
+                            trust3[1] = 1 - met_trust;
+                            negtrust3[1] = 1 - met_neg;
+                        }
+                    } else {
+                        trust3[1] = 1f;
+                        negtrust3[1] = 1f;
+                    }
+                } else {
+                    results[2] = 0.1f;
+                }
+                /*
+                 * compute the posterior probability for each predicted file
+                 * type and store them into the "results" array.
+                 */
+                float pPrime = priorMagicFileType;
+                float deno = 1 - priorMagicFileType;
+                int j;
+
+                if (results[0] == 0) {
+                    for (j = 0; j < trust1.length; j++) {
+                        pPrime *= trust1[j];
+                        if (trust1[j] != 1) {
+                            deno *= negtrust1[j];
+                        }
+                    }
+                    pPrime /= (pPrime + deno);
+                    results[0] = pPrime;
+
+                }
+                if (maxProb < results[0]) {
+                    maxProb = results[0];
+                    bestEstimate = magictype;
+                }
+
+                pPrime = priorMetaFileType;
+                deno = 1 - priorMetaFileType;
+                if (results[1] == 0) {
+                    for (j = 0; j < trust2.length; j++) {
+                        pPrime *= trust2[j];
+                        if (trust2[j] != 1) {
+                            deno *= negtrust2[j];
+                        }
+                    }
+                    pPrime /= (pPrime + deno);
+                    results[1] = pPrime;
+
+                }
+                if (maxProb < results[1]) {
+                    maxProb = results[1];
+                    bestEstimate = metaMediaType_;
+                }
+
+                pPrime = priorExtensionFileType;
+                deno = 1 - priorExtensionFileType;
+                if (results[2] == 0) {
+                    for (j = 0; j < trust3.length; j++) {
+                        pPrime *= trust3[j];
+                        if (trust3[j] != 1) {
+                            deno *= negtrust3[j];
+                        }
+                    }
+                    pPrime /= (pPrime + deno);
+                    results[2] = pPrime;
+                }
+                if (maxProb < results[2]) {
+                    maxProb = results[2];
+                    bestEstimate = extensionMediaType_;
+                }
+                /*
 				for (float r : results) {
 					System.out.print(r + "; ");
 				}
 				System.out.println();
-*/
-			}
+                 */
+            }
 
-		}
-		return maxProb < threshold ? this.rootMediaType : bestEstimate;
+        }
+        return maxProb < threshold ? this.rootMediaType : bestEstimate;
 
-	}
+    }
 
-	public MediaTypeRegistry getMediaTypeRegistry() {
-		return this.mimeTypes.getMediaTypeRegistry();
-	}
-
-	/**
-	 * build class for probability parameters setting
-	 * 
-	 * 
-	 */
-	public static class Builder {
-		/*
-		 * the following are the prior probabilities for the file type
-		 * identified by each method.
-		 */
-		private float priorMagicFileType, priorExtensionFileType,
-				priorMetaFileType;
-		/*
-		 * the following are the conditional probability for each method with
-		 * positive conditions
-		 */
-		private float magic_trust, extension_trust, meta_trust;
-
-		/*
-		 * the following *_neg are the conditional probabilities with negative
-		 * conditions
-		 */
-		private float magic_neg, extension_neg, meta_neg;
-
-		private float threshold;
-
-		public synchronized Builder priorMagicFileType(final float prior) {
-			this.priorMagicFileType = prior;
-			return this;
-		}
-
-		public synchronized Builder priorExtensionFileType(final float prior) {
-			this.priorExtensionFileType = prior;
-			return this;
-		}
-
-		public synchronized Builder priorMetaFileType(final float prior) {
-			this.priorMetaFileType = prior;
-			return this;
-		}
-
-		public synchronized Builder magic_trust(final float trust) {
-			this.magic_trust = trust;
-			return this;
-		}
-
-		public synchronized Builder extension_trust(final float trust) {
-			this.extension_trust = trust;
-			return this;
-		}
-
-		public synchronized Builder meta_trust(final float trust) {
-			this.meta_trust = trust;
-			return this;
-		}
-
-		public synchronized Builder magic_neg(final float trust) {
-			this.magic_neg = trust;
-			return this;
-		}
-
-		public synchronized Builder extension_neg(final float trust) {
-			this.extension_neg = trust;
-			return this;
-		}
-
-		public synchronized Builder meta_neg(final float trust) {
-			this.meta_neg = trust;
-			return this;
-		}
-
-		public synchronized Builder threshold(final float threshold) {
-			this.threshold = threshold;
-			return this;
-		}
-
-		/**
-		 * initialize the MimeTypes with this builder instance
-		 * 
-		 * @return
-		 */
-		public ProbabilisticMimeDetectionSelector build2() {
-			return new ProbabilisticMimeDetectionSelector(this);
-		}
-	}
+    public MediaTypeRegistry getMediaTypeRegistry() {
+        return this.mimeTypes.getMediaTypeRegistry();
+    }
+
+    /**
+     * build class for probability parameters setting
+     * 
+     * 
+     */
+    public static class Builder {
+        /*
+         * the following are the prior probabilities for the file type
+         * identified by each method.
+         */
+        private float priorMagicFileType, priorExtensionFileType,
+        priorMetaFileType;
+        /*
+         * the following are the conditional probability for each method with
+         * positive conditions
+         */
+        private float magic_trust, extension_trust, meta_trust;
+
+        /*
+         * the following *_neg are the conditional probabilities with negative
+         * conditions
+         */
+        private float magic_neg, extension_neg, meta_neg;
+
+        private float threshold;
+
+        public synchronized Builder priorMagicFileType(final float prior) {
+            this.priorMagicFileType = prior;
+            return this;
+        }
+
+        public synchronized Builder priorExtensionFileType(final float prior) {
+            this.priorExtensionFileType = prior;
+            return this;
+        }
+
+        public synchronized Builder priorMetaFileType(final float prior) {
+            this.priorMetaFileType = prior;
+            return this;
+        }
+
+        public synchronized Builder magic_trust(final float trust) {
+            this.magic_trust = trust;
+            return this;
+        }
+
+        public synchronized Builder extension_trust(final float trust) {
+            this.extension_trust = trust;
+            return this;
+        }
+
+        public synchronized Builder meta_trust(final float trust) {
+            this.meta_trust = trust;
+            return this;
+        }
+
+        public synchronized Builder magic_neg(final float trust) {
+            this.magic_neg = trust;
+            return this;
+        }
+
+        public synchronized Builder extension_neg(final float trust) {
+            this.extension_neg = trust;
+            return this;
+        }
+
+        public synchronized Builder meta_neg(final float trust) {
+            this.meta_neg = trust;
+            return this;
+        }
+
+        public synchronized Builder threshold(final float threshold) {
+            this.threshold = threshold;
+            return this;
+        }
+
+        /**
+         * initialize the MimeTypes with this builder instance
+         * 
+         * @return
+         */
+        public ProbabilisticMimeDetectionSelector build2() {
+            return new ProbabilisticMimeDetectionSelector(this);
+        }
+    }
 
 }

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java?rev=1677327&r1=1677326&r2=1677327&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java Sat May  2 15:31:53 2015
@@ -30,218 +30,218 @@ import org.junit.Before;
 import org.junit.Test;
 
 public class ProbabilisticMimeDetectionTest {
-	private static final Charset UTF8 = Charset.forName("UTF-8");
-	private ProbabilisticMimeDetectionSelector proDetector;
+    private static final Charset UTF8 = Charset.forName("UTF-8");
+    private ProbabilisticMimeDetectionSelector proDetector;
 
-	private MediaTypeRegistry registry;
+    private MediaTypeRegistry registry;
 
-	/** @inheritDoc */
-	@Before
-	public void setUp() {
-		proDetector = new ProbabilisticMimeDetectionSelector();
-		this.registry = proDetector.getMediaTypeRegistry();
-	}
-
-	@Test
-	public void testDetection() throws Exception {
-		testFile("image/svg+xml", "circles.svg");
-		testFile("image/svg+xml", "circles-with-prefix.svg");
-		testFile("image/png", "datamatrix.png");
-		testFile("text/html", "test.html");
-		testFile("application/xml", "test-iso-8859-1.xml");
-		testFile("application/xml", "test-utf8.xml");
-		testFile("application/xml", "test-utf8-bom.xml");
-		testFile("application/xml", "test-utf16le.xml");
-		testFile("application/xml", "test-utf16be.xml");
-		testFile("application/xml", "test-long-comment.xml");
-		testFile("application/xslt+xml", "stylesheet.xsl");
-		testUrl("application/rdf+xml",
-				"http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
-				"test-difficult-rdf1.xml");
-		testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#",
-				"test-difficult-rdf2.xml");
-		// add evil test from TIKA-327
-		testFile("text/html", "test-tika-327.html");
-		// add another evil html test from TIKA-357
-		testFile("text/html", "testlargerbuffer.html");
-		// test fragment of HTML with <div> (TIKA-1102)
-		testFile("text/html", "htmlfragment");
-		// test binary CGM detection (TIKA-1170)
-		testFile("image/cgm", "plotutils-bin-cgm-v3.cgm");
-		// test HTML detection of malformed file, previously identified as
-		// image/cgm (TIKA-1170)
-		testFile("text/html", "test-malformed-header.html.bin");
-	}
-
-	@Test
-	public void testByteOrderMark() throws Exception {
-		assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
-				new ByteArrayInputStream("\ufefftest".getBytes("UTF-16LE")),
-				new Metadata()));
-		assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
-				new ByteArrayInputStream("\ufefftest".getBytes("UTF-16BE")),
-				new Metadata()));
-
-		assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
-				new ByteArrayInputStream("\ufefftest".getBytes(UTF8)),
-				new Metadata()));
-	}
-
-	@Test
-	public void testSuperTypes() {
-		assertTrue(registry.isSpecializationOf(
-				MediaType.parse("text/something; charset=UTF-8"),
-				MediaType.parse("text/something")));
-
-		assertTrue(registry.isSpecializationOf(
-				MediaType.parse("text/something; charset=UTF-8"),
-				MediaType.TEXT_PLAIN));
-
-		assertTrue(registry.isSpecializationOf(
-				MediaType.parse("text/something; charset=UTF-8"),
-				MediaType.OCTET_STREAM));
-
-		assertTrue(registry.isSpecializationOf(
-				MediaType.parse("text/something"), MediaType.TEXT_PLAIN));
-
-		assertTrue(registry.isSpecializationOf(
-				MediaType.parse("application/something+xml"),
-				MediaType.APPLICATION_XML));
-
-		assertTrue(registry.isSpecializationOf(
-				MediaType.parse("application/something+zip"),
-				MediaType.APPLICATION_ZIP));
-
-		assertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML,
-				MediaType.TEXT_PLAIN));
-
-		assertTrue(registry.isSpecializationOf(
-				MediaType.parse("application/vnd.apple.iwork"),
-				MediaType.APPLICATION_ZIP));
-	}
-
-	@SuppressWarnings("unused")
-	private void testUrlOnly(String expected, String url) throws IOException {
-		InputStream in = new URL(url).openStream();
-		testStream(expected, url, in);
-	}
-
-	private void testUrl(String expected, String url, String file)
-			throws IOException {
-		InputStream in = getClass().getResourceAsStream(file);
-		testStream(expected, url, in);
-	}
-
-	private void testFile(String expected, String filename) throws IOException {
-		InputStream in = getClass().getResourceAsStream(filename);
-		testStream(expected, filename, in);
-	}
-
-	private void testStream(String expected, String urlOrFileName,
-			InputStream in) throws IOException {
-		assertNotNull("Test stream: [" + urlOrFileName + "] is null!", in);
-		if (!in.markSupported()) {
-			in = new java.io.BufferedInputStream(in);
-		}
-		try {
-			Metadata metadata = new Metadata();
-			String mime = this.proDetector.detect(in, metadata).toString();
-			assertEquals(
-					urlOrFileName + " is not properly detected: detected.",
-					expected, mime);
-
-			// Add resource name and test again
-			metadata.set(Metadata.RESOURCE_NAME_KEY, urlOrFileName);
-			mime = this.proDetector.detect(in, metadata).toString();
-			assertEquals(urlOrFileName
-					+ " is not properly detected after adding resource name.",
-					expected, mime);
-		} finally {
-			in.close();
-		}
-	}
-
-	private void assertNotNull(String string, InputStream in) {
-		// TODO Auto-generated method stub
-
-	}
-
-	/**
-	 * Test for type detection of empty documents.
-	 * 
-	 * @see <a
-	 *      href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
-	 */
-	@Test
-	public void testEmptyDocument() throws IOException {
-		assertEquals(MediaType.OCTET_STREAM, proDetector.detect(
-				new ByteArrayInputStream(new byte[0]), new Metadata()));
-
-		Metadata namehint = new Metadata();
-		namehint.set(Metadata.RESOURCE_NAME_KEY, "test.txt");
-		assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
-				new ByteArrayInputStream(new byte[0]), namehint));
-
-		Metadata typehint = new Metadata();
-		typehint.set(Metadata.CONTENT_TYPE, "text/plain");
-		assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
-				new ByteArrayInputStream(new byte[0]), typehint));
-
-	}
-
-	/**
-	 * Test for things like javascript files whose content is enclosed in XML
-	 * comment delimiters, but that aren't actually XML.
-	 * 
-	 * @see <a
-	 *      href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
-	 */
-	@Test
-	public void testNotXML() throws IOException {
-		assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
-				new ByteArrayInputStream("<!-- test -->".getBytes(UTF8)),
-				new Metadata()));
-	}
-
-	/**
-	 * Tests that when we repeatedly test the detection of a document that can
-	 * be detected with Mime Magic, that we consistently detect it correctly.
-	 * See TIKA-391 for more details.
-	 */
-	@Test
-	public void testMimeMagicStability() throws IOException {
-		for (int i = 0; i < 100; i++) {
-			testFile("application/vnd.ms-excel", "test.xls");
-		}
-	}
-
-	/**
-	 * Tests that when two magic matches both apply, and both have the same
-	 * priority, we use the name to pick the right one based on the glob, or the
-	 * first one we come across if not. See TIKA-1292 for more details.
-	 */
-	@Test
-	public void testMimeMagicClashSamePriority() throws IOException {
-		byte[] helloWorld = "Hello, World!".getBytes(UTF8);
-		MediaType helloType = MediaType.parse("hello/world-file");
-		MediaType helloXType = MediaType.parse("hello/x-world-hello");
-		Metadata metadata;
-
-		// With a filename, picks the right one
-		metadata = new Metadata();
-		metadata.set(Metadata.RESOURCE_NAME_KEY, "test.hello.world");
-		assertEquals(helloType, proDetector.detect(new ByteArrayInputStream(
-				helloWorld), metadata));
-
-		metadata = new Metadata();
-		metadata.set(Metadata.RESOURCE_NAME_KEY, "test.x-hello-world");
-		assertEquals(helloXType, proDetector.detect(new ByteArrayInputStream(
-				helloWorld), metadata));
-
-		// Without, goes for the one that sorts last
-		metadata = new Metadata();
-		metadata.set(Metadata.RESOURCE_NAME_KEY, "testingTESTINGtesting");
-		assertEquals(helloXType, proDetector.detect(new ByteArrayInputStream(
-				helloWorld), metadata));
-	}
+    /** @inheritDoc */
+    @Before
+    public void setUp() {
+        proDetector = new ProbabilisticMimeDetectionSelector();
+        this.registry = proDetector.getMediaTypeRegistry();
+    }
+
+    @Test
+    public void testDetection() throws Exception {
+        testFile("image/svg+xml", "circles.svg");
+        testFile("image/svg+xml", "circles-with-prefix.svg");
+        testFile("image/png", "datamatrix.png");
+        testFile("text/html", "test.html");
+        testFile("application/xml", "test-iso-8859-1.xml");
+        testFile("application/xml", "test-utf8.xml");
+        testFile("application/xml", "test-utf8-bom.xml");
+        testFile("application/xml", "test-utf16le.xml");
+        testFile("application/xml", "test-utf16be.xml");
+        testFile("application/xml", "test-long-comment.xml");
+        testFile("application/xslt+xml", "stylesheet.xsl");
+        testUrl("application/rdf+xml",
+                "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
+                "test-difficult-rdf1.xml");
+        testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#",
+                "test-difficult-rdf2.xml");
+        // add evil test from TIKA-327
+        testFile("text/html", "test-tika-327.html");
+        // add another evil html test from TIKA-357
+        testFile("text/html", "testlargerbuffer.html");
+        // test fragment of HTML with <div> (TIKA-1102)
+        testFile("text/html", "htmlfragment");
+        // test binary CGM detection (TIKA-1170)
+        testFile("image/cgm", "plotutils-bin-cgm-v3.cgm");
+        // test HTML detection of malformed file, previously identified as
+        // image/cgm (TIKA-1170)
+        testFile("text/html", "test-malformed-header.html.bin");
+    }
+
+    @Test
+    public void testByteOrderMark() throws Exception {
+        assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
+                new ByteArrayInputStream("\ufefftest".getBytes("UTF-16LE")),
+                new Metadata()));
+        assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
+                new ByteArrayInputStream("\ufefftest".getBytes("UTF-16BE")),
+                new Metadata()));
+
+        assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
+                new ByteArrayInputStream("\ufefftest".getBytes(UTF8)),
+                new Metadata()));
+    }
+
+    @Test
+    public void testSuperTypes() {
+        assertTrue(registry.isSpecializationOf(
+                MediaType.parse("text/something; charset=UTF-8"),
+                MediaType.parse("text/something")));
+
+        assertTrue(registry.isSpecializationOf(
+                MediaType.parse("text/something; charset=UTF-8"),
+                MediaType.TEXT_PLAIN));
+
+        assertTrue(registry.isSpecializationOf(
+                MediaType.parse("text/something; charset=UTF-8"),
+                MediaType.OCTET_STREAM));
+
+        assertTrue(registry.isSpecializationOf(
+                MediaType.parse("text/something"), MediaType.TEXT_PLAIN));
+
+        assertTrue(registry.isSpecializationOf(
+                MediaType.parse("application/something+xml"),
+                MediaType.APPLICATION_XML));
+
+        assertTrue(registry.isSpecializationOf(
+                MediaType.parse("application/something+zip"),
+                MediaType.APPLICATION_ZIP));
+
+        assertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML,
+                MediaType.TEXT_PLAIN));
+
+        assertTrue(registry.isSpecializationOf(
+                MediaType.parse("application/vnd.apple.iwork"),
+                MediaType.APPLICATION_ZIP));
+    }
+
+    @SuppressWarnings("unused")
+    private void testUrlOnly(String expected, String url) throws IOException {
+        InputStream in = new URL(url).openStream();
+        testStream(expected, url, in);
+    }
+
+    private void testUrl(String expected, String url, String file)
+            throws IOException {
+        InputStream in = getClass().getResourceAsStream(file);
+        testStream(expected, url, in);
+    }
+
+    private void testFile(String expected, String filename) throws IOException {
+        InputStream in = getClass().getResourceAsStream(filename);
+        testStream(expected, filename, in);
+    }
+
+    private void testStream(String expected, String urlOrFileName,
+            InputStream in) throws IOException {
+        assertNotNull("Test stream: [" + urlOrFileName + "] is null!", in);
+        if (!in.markSupported()) {
+            in = new java.io.BufferedInputStream(in);
+        }
+        try {
+            Metadata metadata = new Metadata();
+            String mime = this.proDetector.detect(in, metadata).toString();
+            assertEquals(
+                    urlOrFileName + " is not properly detected: detected.",
+                    expected, mime);
+
+            // Add resource name and test again
+            metadata.set(Metadata.RESOURCE_NAME_KEY, urlOrFileName);
+            mime = this.proDetector.detect(in, metadata).toString();
+            assertEquals(urlOrFileName
+                    + " is not properly detected after adding resource name.",
+                    expected, mime);
+        } finally {
+            in.close();
+        }
+    }
+
+    private void assertNotNull(String string, InputStream in) {
+        // TODO Auto-generated method stub
+
+    }
+
+    /**
+     * Test for type detection of empty documents.
+     * 
+     * @see <a
+     *      href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
+     */
+    @Test
+    public void testEmptyDocument() throws IOException {
+        assertEquals(MediaType.OCTET_STREAM, proDetector.detect(
+                new ByteArrayInputStream(new byte[0]), new Metadata()));
+
+        Metadata namehint = new Metadata();
+        namehint.set(Metadata.RESOURCE_NAME_KEY, "test.txt");
+        assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
+                new ByteArrayInputStream(new byte[0]), namehint));
+
+        Metadata typehint = new Metadata();
+        typehint.set(Metadata.CONTENT_TYPE, "text/plain");
+        assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
+                new ByteArrayInputStream(new byte[0]), typehint));
+
+    }
+
+    /**
+     * Test for things like javascript files whose content is enclosed in XML
+     * comment delimiters, but that aren't actually XML.
+     * 
+     * @see <a
+     *      href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
+     */
+    @Test
+    public void testNotXML() throws IOException {
+        assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
+                new ByteArrayInputStream("<!-- test -->".getBytes(UTF8)),
+                new Metadata()));
+    }
+
+    /**
+     * Tests that when we repeatedly test the detection of a document that can
+     * be detected with Mime Magic, that we consistently detect it correctly.
+     * See TIKA-391 for more details.
+     */
+    @Test
+    public void testMimeMagicStability() throws IOException {
+        for (int i = 0; i < 100; i++) {
+            testFile("application/vnd.ms-excel", "test.xls");
+        }
+    }
+
+    /**
+     * Tests that when two magic matches both apply, and both have the same
+     * priority, we use the name to pick the right one based on the glob, or the
+     * first one we come across if not. See TIKA-1292 for more details.
+     */
+    @Test
+    public void testMimeMagicClashSamePriority() throws IOException {
+        byte[] helloWorld = "Hello, World!".getBytes(UTF8);
+        MediaType helloType = MediaType.parse("hello/world-file");
+        MediaType helloXType = MediaType.parse("hello/x-world-hello");
+        Metadata metadata;
+
+        // With a filename, picks the right one
+        metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, "test.hello.world");
+        assertEquals(helloType, proDetector.detect(
+                new ByteArrayInputStream(helloWorld), metadata));
+
+        metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, "test.x-hello-world");
+        assertEquals(helloXType, proDetector.detect(
+                new ByteArrayInputStream(helloWorld), metadata));
+
+        // Without, goes for the one that sorts last
+        metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, "testingTESTINGtesting");
+        assertEquals(helloXType, proDetector.detect(
+                new ByteArrayInputStream(helloWorld), metadata));
+    }
 }

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java?rev=1677327&r1=1677326&r2=1677327&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java Sat May  2 15:31:53 2015
@@ -42,247 +42,247 @@ import org.junit.Before;
 import org.junit.Test;
 
 public class ProbabilisticMimeDetectionTestWithTika {
-	private static final Charset UTF8 = Charset.forName("UTF-8");
-	// private ProbabilisticMimeDetectionSelector proDetector;
-	private Tika tika;
-	private MediaTypeRegistry registry;
-
-	/** @inheritDoc */
-	@Before
-	public void setUp() {
-		try {
-			registry = MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry();
-			tika = new Tika(new TikaConfig() {
-				@Override
-				protected Detector getDefaultDetector(MimeTypes types,
-						ServiceLoader loader) {
-					/*
-					 * here is an example with the use of the builder to
-					 * instantiate the object.
-					 */
-					Builder builder = new ProbabilisticMimeDetectionSelector.Builder();
-					ProbabilisticMimeDetectionSelector proDetector = new ProbabilisticMimeDetectionSelector(
-							types, builder.priorMagicFileType(0.5f)
-									.priorExtensionFileType(0.5f)
-									.priorMetaFileType(0.5f));
-					return new DefaultProbDetector(proDetector, loader);
-				}
-			});
-		} catch (TikaException e) {
-			// TODO Auto-generated catch block
-			e.printStackTrace();
-		} catch (IOException e) {
-			// TODO Auto-generated catch block
-			e.printStackTrace();
-		} finally {
-
-		}
-
-	}
-
-	@Test
-	public void testDetection() throws Exception {
-		testFile("image/svg+xml", "circles.svg");
-		testFile("image/svg+xml", "circles-with-prefix.svg");
-		testFile("image/png", "datamatrix.png");
-		testFile("text/html", "test.html");
-		testFile("application/xml", "test-iso-8859-1.xml");
-		testFile("application/xml", "test-utf8.xml");
-		testFile("application/xml", "test-utf8-bom.xml");
-		testFile("application/xml", "test-utf16le.xml");
-		testFile("application/xml", "test-utf16be.xml");
-		testFile("application/xml", "test-long-comment.xml");
-		testFile("application/xslt+xml", "stylesheet.xsl");
-		testUrl("application/rdf+xml",
-				"http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
-				"test-difficult-rdf1.xml");
-		testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#",
-				"test-difficult-rdf2.xml");
-		// add evil test from TIKA-327
-		testFile("text/html", "test-tika-327.html");
-		// add another evil html test from TIKA-357
-		testFile("text/html", "testlargerbuffer.html");
-		// test fragment of HTML with <div> (TIKA-1102)
-		testFile("text/html", "htmlfragment");
-		// test binary CGM detection (TIKA-1170)
-		testFile("image/cgm", "plotutils-bin-cgm-v3.cgm");
-		// test HTML detection of malformed file, previously identified as
-		// image/cgm (TIKA-1170)
-		testFile("text/html", "test-malformed-header.html.bin");
-	}
-
-	@Test
-	public void testByteOrderMark() throws Exception {
-		assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(
-				new ByteArrayInputStream("\ufefftest".getBytes("UTF-16LE")),
-				new Metadata()));
-		assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(
-				new ByteArrayInputStream("\ufefftest".getBytes("UTF-16BE")),
-				new Metadata()));
-
-		assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(
-				new ByteArrayInputStream("\ufefftest".getBytes(UTF8)),
-				new Metadata()));
-	}
-
-	@Test
-	public void testSuperTypes() {
-
-		assertTrue(registry.isSpecializationOf(
-				MediaType.parse("text/something; charset=UTF-8"),
-				MediaType.parse("text/something")));
-
-		assertTrue(registry.isSpecializationOf(
-				MediaType.parse("text/something; charset=UTF-8"),
-				MediaType.TEXT_PLAIN));
-
-		assertTrue(registry.isSpecializationOf(
-				MediaType.parse("text/something; charset=UTF-8"),
-				MediaType.OCTET_STREAM));
-
-		assertTrue(registry.isSpecializationOf(
-				MediaType.parse("text/something"), MediaType.TEXT_PLAIN));
-
-		assertTrue(registry.isSpecializationOf(
-				MediaType.parse("application/something+xml"),
-				MediaType.APPLICATION_XML));
-
-		assertTrue(registry.isSpecializationOf(
-				MediaType.parse("application/something+zip"),
-				MediaType.APPLICATION_ZIP));
-
-		assertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML,
-				MediaType.TEXT_PLAIN));
-
-		assertTrue(registry.isSpecializationOf(
-				MediaType.parse("application/vnd.apple.iwork"),
-				MediaType.APPLICATION_ZIP));
-	}
-
-	@SuppressWarnings("unused")
-	private void testUrlOnly(String expected, String url) throws IOException {
-		InputStream in = new URL(url).openStream();
-		testStream(expected, url, in);
-	}
-
-	private void testUrl(String expected, String url, String file)
-			throws IOException {
-		InputStream in = getClass().getResourceAsStream(file);
-		testStream(expected, url, in);
-	}
-
-	private void testFile(String expected, String filename) throws IOException {
-		InputStream in = getClass().getResourceAsStream(filename);
-		testStream(expected, filename, in);
-	}
-
-	private void testStream(String expected, String urlOrFileName,
-			InputStream in) throws IOException {
-		assertNotNull("Test stream: [" + urlOrFileName + "] is null!", in);
-		if (!in.markSupported()) {
-			in = new java.io.BufferedInputStream(in);
-		}
-		try {
-			Metadata metadata = new Metadata();
-			// String mime = this.proDetector.detect(in, metadata).toString();
-			String mime = tika.detect(in, metadata).toString();
-			assertEquals(
-					urlOrFileName + " is not properly detected: detected.",
-					expected, mime);
-
-			// Add resource name and test again
-			metadata.set(Metadata.RESOURCE_NAME_KEY, urlOrFileName);
-			// mime = this.proDetector.detect(in, metadata).toString();
-			mime = tika.detect(in, metadata).toString();
-			assertEquals(urlOrFileName
-					+ " is not properly detected after adding resource name.",
-					expected, mime);
-		} finally {
-			in.close();
-		}
-	}
-
-	private void assertNotNull(String string, InputStream in) {
-		// TODO Auto-generated method stub
-
-	}
-
-	/**
-	 * Test for type detection of empty documents.
-	 * 
-	 * @see <a
-	 *      href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
-	 */
-	@Test
-	public void testEmptyDocument() throws IOException {
-		assertEquals(MediaType.OCTET_STREAM.toString(), tika.detect(
-				new ByteArrayInputStream(new byte[0]), new Metadata()));
-
-		Metadata namehint = new Metadata();
-		namehint.set(Metadata.RESOURCE_NAME_KEY, "test.txt");
-		assertEquals(MediaType.TEXT_PLAIN.toString(),
-				tika.detect(new ByteArrayInputStream(new byte[0]), namehint));
-
-		Metadata typehint = new Metadata();
-		typehint.set(Metadata.CONTENT_TYPE, "text/plain");
-		assertEquals(MediaType.TEXT_PLAIN.toString(),
-				tika.detect(new ByteArrayInputStream(new byte[0]), typehint));
-
-	}
-
-	/**
-	 * Test for things like javascript files whose content is enclosed in XML
-	 * comment delimiters, but that aren't actually XML.
-	 * 
-	 * @see <a
-	 *      href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
-	 */
-	@Test
-	public void testNotXML() throws IOException {
-		assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(
-				new ByteArrayInputStream("<!-- test -->".getBytes(UTF8)),
-				new Metadata()));
-	}
-
-	/**
-	 * Tests that when we repeatedly test the detection of a document that can
-	 * be detected with Mime Magic, that we consistently detect it correctly.
-	 * See TIKA-391 for more details.
-	 */
-	@Test
-	public void testMimeMagicStability() throws IOException {
-		for (int i = 0; i < 100; i++) {
-			testFile("application/vnd.ms-excel", "test.xls");
-		}
-	}
-
-	/**
-	 * Tests that when two magic matches both apply, and both have the same
-	 * priority, we use the name to pick the right one based on the glob, or the
-	 * first one we come across if not. See TIKA-1292 for more details.
-	 */
-	@Test
-	public void testMimeMagicClashSamePriority() throws IOException {
-		byte[] helloWorld = "Hello, World!".getBytes(UTF8);
-		MediaType helloType = MediaType.parse("hello/world-file");
-		MediaType helloXType = MediaType.parse("hello/x-world-hello");
-		Metadata metadata;
-
-		// With a filename, picks the right one
-		metadata = new Metadata();
-		metadata.set(Metadata.RESOURCE_NAME_KEY, "test.hello.world");
-		assertEquals(helloType.toString(),
-				tika.detect(new ByteArrayInputStream(helloWorld), metadata));
-
-		metadata = new Metadata();
-		metadata.set(Metadata.RESOURCE_NAME_KEY, "test.x-hello-world");
-		assertEquals(helloXType.toString(),
-				tika.detect(new ByteArrayInputStream(helloWorld), metadata));
-
-		// Without, goes for the one that sorts last
-		metadata = new Metadata();
-		metadata.set(Metadata.RESOURCE_NAME_KEY, "testingTESTINGtesting");
-		assertEquals(helloXType.toString(),
-				tika.detect(new ByteArrayInputStream(helloWorld), metadata));
-	}
+    private static final Charset UTF8 = Charset.forName("UTF-8");
+    // private ProbabilisticMimeDetectionSelector proDetector;
+    private Tika tika;
+    private MediaTypeRegistry registry;
+
+    /** @inheritDoc */
+    @Before
+    public void setUp() {
+        try {
+            registry = MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry();
+            tika = new Tika(new TikaConfig() {
+                @Override
+                protected Detector getDefaultDetector(MimeTypes types,
+                        ServiceLoader loader) {
+                    /*
+                     * here is an example with the use of the builder to
+                     * instantiate the object.
+                     */
+                    Builder builder = new ProbabilisticMimeDetectionSelector.Builder();
+                    ProbabilisticMimeDetectionSelector proDetector = new ProbabilisticMimeDetectionSelector(
+                            types, builder.priorMagicFileType(0.5f)
+                            .priorExtensionFileType(0.5f)
+                            .priorMetaFileType(0.5f));
+                    return new DefaultProbDetector(proDetector, loader);
+                }
+            });
+        } catch (TikaException e) {
+            // TODO Auto-generated catch block
+            e.printStackTrace();
+        } catch (IOException e) {
+            // TODO Auto-generated catch block
+            e.printStackTrace();
+        } finally {
+
+        }
+
+    }
+
+    @Test
+    public void testDetection() throws Exception {
+        testFile("image/svg+xml", "circles.svg");
+        testFile("image/svg+xml", "circles-with-prefix.svg");
+        testFile("image/png", "datamatrix.png");
+        testFile("text/html", "test.html");
+        testFile("application/xml", "test-iso-8859-1.xml");
+        testFile("application/xml", "test-utf8.xml");
+        testFile("application/xml", "test-utf8-bom.xml");
+        testFile("application/xml", "test-utf16le.xml");
+        testFile("application/xml", "test-utf16be.xml");
+        testFile("application/xml", "test-long-comment.xml");
+        testFile("application/xslt+xml", "stylesheet.xsl");
+        testUrl("application/rdf+xml",
+                "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
+                "test-difficult-rdf1.xml");
+        testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#",
+                "test-difficult-rdf2.xml");
+        // add evil test from TIKA-327
+        testFile("text/html", "test-tika-327.html");
+        // add another evil html test from TIKA-357
+        testFile("text/html", "testlargerbuffer.html");
+        // test fragment of HTML with <div> (TIKA-1102)
+        testFile("text/html", "htmlfragment");
+        // test binary CGM detection (TIKA-1170)
+        testFile("image/cgm", "plotutils-bin-cgm-v3.cgm");
+        // test HTML detection of malformed file, previously identified as
+        // image/cgm (TIKA-1170)
+        testFile("text/html", "test-malformed-header.html.bin");
+    }
+
+    @Test
+    public void testByteOrderMark() throws Exception {
+        assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(
+                new ByteArrayInputStream("\ufefftest".getBytes("UTF-16LE")),
+                new Metadata()));
+        assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(
+                new ByteArrayInputStream("\ufefftest".getBytes("UTF-16BE")),
+                new Metadata()));
+
+        assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(
+                new ByteArrayInputStream("\ufefftest".getBytes(UTF8)),
+                new Metadata()));
+    }
+
+    @Test
+    public void testSuperTypes() {
+
+        assertTrue(registry.isSpecializationOf(
+                MediaType.parse("text/something; charset=UTF-8"),
+                MediaType.parse("text/something")));
+
+        assertTrue(registry.isSpecializationOf(
+                MediaType.parse("text/something; charset=UTF-8"),
+                MediaType.TEXT_PLAIN));
+
+        assertTrue(registry.isSpecializationOf(
+                MediaType.parse("text/something; charset=UTF-8"),
+                MediaType.OCTET_STREAM));
+
+        assertTrue(registry.isSpecializationOf(
+                MediaType.parse("text/something"), MediaType.TEXT_PLAIN));
+
+        assertTrue(registry.isSpecializationOf(
+                MediaType.parse("application/something+xml"),
+                MediaType.APPLICATION_XML));
+
+        assertTrue(registry.isSpecializationOf(
+                MediaType.parse("application/something+zip"),
+                MediaType.APPLICATION_ZIP));
+
+        assertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML,
+                MediaType.TEXT_PLAIN));
+
+        assertTrue(registry.isSpecializationOf(
+                MediaType.parse("application/vnd.apple.iwork"),
+                MediaType.APPLICATION_ZIP));
+    }
+
+    /**
+     * Opens the given URL and checks the detected type of its content, using
+     * the URL itself as the name passed to {@code testStream}.
+     */
+    @SuppressWarnings("unused")
+    private void testUrlOnly(String expected, String url) throws IOException {
+        testStream(expected, url, new URL(url).openStream());
+    }
+
+    /**
+     * Checks the detected type of the class-relative resource {@code file},
+     * passing {@code url} to {@code testStream} as its name.
+     */
+    private void testUrl(String expected, String url, String file)
+            throws IOException {
+        testStream(expected, url, getClass().getResourceAsStream(file));
+    }
+
+    /**
+     * Checks the detected type of the class-relative resource
+     * {@code filename}, which is also passed to {@code testStream} as its name.
+     */
+    private void testFile(String expected, String filename) throws IOException {
+        testStream(expected, filename,
+                getClass().getResourceAsStream(filename));
+    }
+
+    /**
+     * Detects the type of the stream twice, first with empty metadata and then
+     * with {@code urlOrFileName} set as the resource name, and expects
+     * {@code expected} both times. The stream is closed afterwards.
+     */
+    private void testStream(String expected, String urlOrFileName,
+            InputStream in) throws IOException {
+        assertNotNull("Test stream: [" + urlOrFileName + "] is null!", in);
+        // The same stream is passed to detect() twice, so one without mark
+        // support is buffered first (presumably detection relies on
+        // mark/reset to leave the stream position untouched).
+        InputStream stream = in.markSupported() ? in
+                : new java.io.BufferedInputStream(in);
+        try {
+            Metadata metadata = new Metadata();
+
+            String contentOnly = tika.detect(stream, metadata).toString();
+            assertEquals(
+                    urlOrFileName + " is not properly detected: detected.",
+                    expected, contentOnly);
+
+            // Second pass: same stream, now with the resource name as a hint
+            metadata.set(Metadata.RESOURCE_NAME_KEY, urlOrFileName);
+            String withName = tika.detect(stream, metadata).toString();
+            assertEquals(urlOrFileName
+                    + " is not properly detected after adding resource name.",
+                    expected, withName);
+        } finally {
+            stream.close();
+        }
+    }
+
+    /**
+     * Fails the current test when the given stream is {@code null}, for
+     * example because a test resource could not be found.
+     *
+     * @param string
+     *            message reported with the failure
+     * @param in
+     *            stream that must not be {@code null}
+     * @throws AssertionError
+     *             carrying {@code string} if {@code in} is {@code null}
+     */
+    private void assertNotNull(String string, InputStream in) {
+        if (in == null) {
+            throw new AssertionError(string);
+        }
+    }
+
+    /**
+     * Test for type detection of empty documents.
+     * 
+     * @see <a
+     *      href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
+     */
+    @Test
+    public void testEmptyDocument() throws IOException {
+        assertEquals(MediaType.OCTET_STREAM.toString(), tika.detect(
+                new ByteArrayInputStream(new byte[0]), new Metadata()));
+
+        Metadata namehint = new Metadata();
+        namehint.set(Metadata.RESOURCE_NAME_KEY, "test.txt");
+        assertEquals(MediaType.TEXT_PLAIN.toString(),
+                tika.detect(new ByteArrayInputStream(new byte[0]), namehint));
+
+        Metadata typehint = new Metadata();
+        typehint.set(Metadata.CONTENT_TYPE, "text/plain");
+        assertEquals(MediaType.TEXT_PLAIN.toString(),
+                tika.detect(new ByteArrayInputStream(new byte[0]), typehint));
+
+    }
+
+    /**
+     * Test for things like javascript files whose content is enclosed in XML
+     * comment delimiters, but that aren't actually XML.
+     * 
+     * @see <a
+     *      href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
+     */
+    @Test
+    public void testNotXML() throws IOException {
+        assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(
+                new ByteArrayInputStream("<!-- test -->".getBytes(UTF8)),
+                new Metadata()));
+    }
+
+    /**
+     * Tests that when we repeatedly test the detection of a document that can
+     * be detected with Mime Magic, that we consistently detect it correctly.
+     * See TIKA-391 for more details.
+     */
+    @Test
+    public void testMimeMagicStability() throws IOException {
+        for (int i = 0; i < 100; i++) {
+            testFile("application/vnd.ms-excel", "test.xls");
+        }
+    }
+
+    /**
+     * Covers content for which two magic matches of the same priority apply.
+     * When the resource name matches a glob, the type for that glob is
+     * expected; when the name matches neither, hello/x-world-hello is
+     * expected. See TIKA-1292 for more details.
+     */
+    @Test
+    public void testMimeMagicClashSamePriority() throws IOException {
+        byte[] content = "Hello, World!".getBytes(UTF8);
+        String helloType = MediaType.parse("hello/world-file").toString();
+        String helloXType = MediaType.parse("hello/x-world-hello").toString();
+
+        // With a filename, picks the right one
+        Metadata worldName = new Metadata();
+        worldName.set(Metadata.RESOURCE_NAME_KEY, "test.hello.world");
+        assertEquals(helloType,
+                tika.detect(new ByteArrayInputStream(content), worldName));
+
+        Metadata xWorldName = new Metadata();
+        xWorldName.set(Metadata.RESOURCE_NAME_KEY, "test.x-hello-world");
+        assertEquals(helloXType,
+                tika.detect(new ByteArrayInputStream(content), xWorldName));
+
+        // Without, goes for the one that sorts last
+        Metadata unrelatedName = new Metadata();
+        unrelatedName.set(Metadata.RESOURCE_NAME_KEY, "testingTESTINGtesting");
+        assertEquals(helloXType,
+                tika.detect(new ByteArrayInputStream(content), unrelatedName));
+    }
 }