You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/05/02 17:31:53 UTC
svn commit: r1677327 - in /tika/trunk/tika-core/src:
main/java/org/apache/tika/mime/ test/java/org/apache/tika/mime/
Author: nick
Date: Sat May 2 15:31:53 2015
New Revision: 1677327
URL: http://svn.apache.org/r1677327
Log:
Update whitespace to match coding conventions
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java?rev=1677327&r1=1677326&r2=1677327&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java Sat May 2 15:31:53 2015
@@ -39,510 +39,503 @@ public class ProbabilisticMimeDetectionS
private MimeTypes mimeTypes;
- private final MediaType rootMediaType;
+ private final MediaType rootMediaType;
- /** probability parameters default value */
- private static final float DEFAULT_MAGIC_TRUST = 0.9f;
- private static final float DEFAULT_META_TRUST = 0.8f;
- private static final float DEFAULT_EXTENSION_TRUST = 0.8f;
- private float priorMagicFileType, priorExtensionFileType,
- priorMetaFileType;
- private float magic_trust, extension_trust, meta_trust;
- private float magic_neg, extension_neg, meta_neg;
- /*
- * any posterior probability lower than the threshold, will be considered as
- * an oct-stream type, the default value is 0.5
- */
- private float threshold;
-
- /*
- * this change rate is used when there are multiple types predicted by
- * magic-bytes. the first predicted type has the highest probability, and
- * the probability for the next type predicted by magic-bytes will decay
- * with this change rate. The idea is to have the first one to take
- * precedence among the multiple possible types predicted by MAGIC-bytes.
- */
- private float changeRate;
-
- /** ***********************/
-
- public ProbabilisticMimeDetectionSelector() {
- this(MimeTypes.getDefaultMimeTypes(), null);
- }
-
- public ProbabilisticMimeDetectionSelector(final Builder builder) {
- this(MimeTypes.getDefaultMimeTypes(), builder);
- }
-
- public ProbabilisticMimeDetectionSelector(final MimeTypes mimeTypes) {
- this(mimeTypes, null);
- }
-
- public ProbabilisticMimeDetectionSelector(final MimeTypes mimeTypes,
- final Builder builder) {
- this.mimeTypes = mimeTypes;
- rootMediaType = MediaType.OCTET_STREAM;
- this.initializeDefaultProbabilityParameters();
- this.changeRate = 0.1f;
- if (builder != null) {
- priorMagicFileType = builder.priorMagicFileType == 0f ? priorMagicFileType
- : builder.priorMagicFileType;
- priorExtensionFileType = builder.priorExtensionFileType == 0f ? priorExtensionFileType
- : builder.priorExtensionFileType;
- priorMetaFileType = builder.priorMetaFileType == 0f ? priorMetaFileType
- : builder.priorMetaFileType;
-
- magic_trust = builder.magic_trust == 0f ? magic_trust
- : builder.extension_neg;
- extension_trust = builder.extension_trust == 0f ? extension_trust
- : builder.extension_trust;
- meta_trust = builder.meta_trust == 0f ? meta_trust
- : builder.meta_trust;
-
- magic_neg = builder.magic_neg == 0f ? magic_neg : builder.magic_neg;
- extension_neg = builder.extension_neg == 0f ? extension_neg
- : builder.extension_neg;
- meta_neg = builder.meta_neg == 0f ? meta_neg : builder.meta_neg;
- threshold = builder.threshold == 0f ? threshold : builder.threshold;
- }
- }
-
- /**
- * Initilize probability parameters with default values;
- */
- private void initializeDefaultProbabilityParameters() {
- priorMagicFileType = 0.5f;
- priorExtensionFileType = 0.5f;
- priorMetaFileType = 0.5f;
- magic_trust = DEFAULT_MAGIC_TRUST;
- extension_trust = DEFAULT_EXTENSION_TRUST;
- meta_trust = DEFAULT_META_TRUST;
-
- // probability of the type detected by magic test given that the type is
- // not the detected type. The default is taken by 1 - the magic trust
- magic_neg = 1 - DEFAULT_MAGIC_TRUST;
- // probability of the type detected by extension test given that the
- // type is not the type detected by extension test
- extension_neg = 1 - DEFAULT_EXTENSION_TRUST;
- // same as above; but it could be customized to suffice different use.
- meta_neg = 1 - DEFAULT_META_TRUST;
- threshold = 0.5001f;
- }
-
- public MediaType detect(InputStream input, Metadata metadata)
- throws IOException {
- List<MimeType> possibleTypes = null;
-
- // Get type based on magic prefix
- if (input != null) {
- input.mark(mimeTypes.getMinLength());
- try {
- byte[] prefix = mimeTypes.readMagicHeader(input);
- possibleTypes = mimeTypes.getMimeType(prefix);
- } finally {
- input.reset();
- }
- }
-
- MimeType extHint = null;
- // Get type based on resourceName hint (if available)
- String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
- if (resourceName != null) {
- String name = null;
-
- // Deal with a URI or a path name in as the resource name
- try {
- URI uri = new URI(resourceName);
- String path = uri.getPath();
- if (path != null) {
- int slash = path.lastIndexOf('/');
- if (slash + 1 < path.length()) {
- name = path.substring(slash + 1);
- }
- }
- } catch (URISyntaxException e) {
- name = resourceName;
- }
-
- if (name != null) {
- // MimeType hint = getMimeType(name);
- extHint = mimeTypes.getMimeType(name);
- // If we have some types based on mime magic, try to specialise
- // and/or select the type based on that
- // Otherwise, use the type identified from the name
- // possibleTypes = applyHint(possibleTypes, hint);
- }
- }
-
- // Get type based on metadata hint (if available)
- MimeType metaHint = null;
- String typeName = metadata.get(Metadata.CONTENT_TYPE);
- if (typeName != null) {
- try {
- // MimeType hint = forName(typeName);
- metaHint = mimeTypes.forName(typeName);
- // possibleTypes = applyHint(possibleTypes, hint);
- } catch (MimeTypeException e) {
- // Malformed type name, ignore
- }
- }
-
- /*
- * the following calls the probability selection.
- */
- return applyProbilities(possibleTypes, extHint, metaHint);
- }
-
- private MediaType applyProbilities(final List<MimeType> possibleTypes,
- final MimeType extMimeType, final MimeType metadataMimeType) {
-
- /* initialize some probability variables */
- MediaType extensionMediaType_ = extMimeType == null ? null
- : extMimeType.getType();
- MediaType metaMediaType_ = metadataMimeType == null ? null
- : metadataMimeType.getType();
-
- int n = possibleTypes.size();
- float mag_trust = magic_trust;
- float mag_neg = magic_neg;
- float ext_trust = extension_trust;
- float ext_neg = extension_neg;
- float met_trust = meta_trust;
- float met_neg = meta_neg;
- /* ************************** */
-
- /* pre-process some probability variables */
- if (extensionMediaType_ == null
- || extensionMediaType_.compareTo(rootMediaType) == 0) {
- /*
- * this is a root type, that means the extension method fails to
- * identify any type.
- */
- ext_trust = 1;
- ext_neg = 1;
- }
- if (metaMediaType_ == null
- || metaMediaType_.compareTo(rootMediaType) == 0) {
- met_trust = 1;
- met_neg = 1;
- }
-
- float maxProb = -1f;
- MediaType bestEstimate = rootMediaType;
-
- if (possibleTypes != null && !possibleTypes.isEmpty()) {
- int i;
- for (i = 0; i < n; i++) {
- MediaType magictype = possibleTypes.get(i).getType();
- MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
- if (magictype != null && magictype.equals(rootMediaType)) {
- mag_trust = 1;
- mag_neg = 1;
- } else {
- // check if each identified type belongs to the same class;
- if (extensionMediaType_ != null) {
- if (extensionMediaType_.equals(magictype)
- || registry.isSpecializationOf(
- extensionMediaType_, magictype)) {
- // Use just this type
- possibleTypes.set(i, extMimeType);
- } else if (registry.isSpecializationOf(magictype,
- extensionMediaType_)) {
- extensionMediaType_ = magictype;
- }
- }
- if (metaMediaType_ != null) {
- if (metaMediaType_.equals(magictype)
- || registry.isSpecializationOf(metaMediaType_,
- magictype)) {
- // Use just this type
- possibleTypes.set(i, metadataMimeType);
- } else if (registry.isSpecializationOf(magictype,
- metaMediaType_)) {
- metaMediaType_ = magictype;
- }
- }
- }
-
- /*
- * prepare the conditional probability for file type prediction.
- */
-
- float[] results = new float[3];
- float[] trust1 = new float[3];
- float[] negtrust1 = new float[3];
- magictype = possibleTypes.get(i).getType();
-
- if (i > 0) {
- /*
- * decay as our trust goes down with next type predicted by
- * magic
- */
- mag_trust = mag_trust * (1 - changeRate);
- /*
- * grow as our trust goes down
- */
- mag_neg = mag_neg * (1 + changeRate);
-
- }
-
- if (magictype != null && mag_trust != 1) {
- trust1[0] = mag_trust;
- negtrust1[0] = mag_neg;
- if (metaMediaType_ != null && met_trust != 1) {
- if (magictype.equals(metaMediaType_)) {
- trust1[1] = met_trust;
- negtrust1[1] = met_neg;
- } else {
- trust1[1] = 1 - met_trust;
- negtrust1[1] = 1 - met_neg;
- }
- } else {
- trust1[1] = 1;
- negtrust1[1] = 1;
- }
- if (extensionMediaType_ != null && ext_trust != 1) {
- if (magictype.equals(extensionMediaType_)) {
- trust1[2] = ext_trust;
- negtrust1[2] = ext_neg;
- } else {
- trust1[2] = 1 - ext_trust;
- negtrust1[2] = 1 - ext_neg;
- }
- } else {
- trust1[2] = 1;
- negtrust1[2] = 1;
- }
- } else {
- results[0] = 0.1f;
- }
-
- float[] trust2 = new float[3];
- float[] negtrust2 = new float[3];
- if (metadataMimeType != null && met_trust != 1) {
- trust2[1] = met_trust;
- negtrust2[1] = met_neg;
- if (magictype != null && mag_trust != 1) {
- if (metaMediaType_.equals(magictype)) {
- trust2[0] = mag_trust;
- negtrust2[0] = mag_neg;
- } else {
- trust2[0] = 1 - mag_trust;
- negtrust2[0] = 1 - mag_neg;
- }
-
- } else {
- trust2[0] = 1f;
- negtrust2[0] = 1f;
- }
- if (extensionMediaType_ != null && ext_trust != 1) {
- if (metaMediaType_.equals(extensionMediaType_)) {
- trust2[2] = ext_trust;
- negtrust2[2] = ext_neg;
- } else {
- trust2[2] = 1 - ext_trust;
- negtrust2[2] = 1 - ext_neg;
- }
- } else {
- trust2[2] = 1f;
- negtrust2[2] = 1f;
- }
- } else {
- results[1] = 0.1f;
- }
-
- float[] trust3 = new float[3];
- float[] negtrust3 = new float[3];
- if (extensionMediaType_ != null && ext_trust != 1) {
- trust3[2] = ext_trust;
- negtrust3[2] = ext_neg;
- if (magictype != null && mag_trust != 1) {
- if (magictype.equals(extensionMediaType_)) {
- trust3[0] = mag_trust;
- negtrust3[0] = mag_neg;
- } else {
- trust3[0] = 1 - mag_trust;
- negtrust3[0] = 1 - mag_neg;
- }
- } else {
- trust3[0] = 1f;
- negtrust3[0] = 1f;
- }
-
- if (metaMediaType_ != null && met_trust != 1) {
- if (metaMediaType_.equals(extensionMediaType_)) {
- trust3[1] = met_trust;
- negtrust3[1] = met_neg;
- } else {
- trust3[1] = 1 - met_trust;
- negtrust3[1] = 1 - met_neg;
- }
- } else {
- trust3[1] = 1f;
- negtrust3[1] = 1f;
- }
- } else {
- results[2] = 0.1f;
- }
- /*
- * compute the posterior probability for each predicted file
- * type and store them into the "results" array.
- */
- float pPrime = priorMagicFileType;
- float deno = 1 - priorMagicFileType;
- int j;
-
- if (results[0] == 0) {
- for (j = 0; j < trust1.length; j++) {
- pPrime *= trust1[j];
- if (trust1[j] != 1) {
- deno *= negtrust1[j];
- }
- }
- pPrime /= (pPrime + deno);
- results[0] = pPrime;
-
- }
- if (maxProb < results[0]) {
- maxProb = results[0];
- bestEstimate = magictype;
- }
-
- pPrime = priorMetaFileType;
- deno = 1 - priorMetaFileType;
- if (results[1] == 0) {
- for (j = 0; j < trust2.length; j++) {
- pPrime *= trust2[j];
- if (trust2[j] != 1) {
- deno *= negtrust2[j];
- }
- }
- pPrime /= (pPrime + deno);
- results[1] = pPrime;
-
- }
- if (maxProb < results[1]) {
- maxProb = results[1];
- bestEstimate = metaMediaType_;
- }
-
- pPrime = priorExtensionFileType;
- deno = 1 - priorExtensionFileType;
- if (results[2] == 0) {
- for (j = 0; j < trust3.length; j++) {
- pPrime *= trust3[j];
- if (trust3[j] != 1) {
- deno *= negtrust3[j];
- }
- }
- pPrime /= (pPrime + deno);
- results[2] = pPrime;
- }
- if (maxProb < results[2]) {
- maxProb = results[2];
- bestEstimate = extensionMediaType_;
- }
-/*
+ /** probability parameters default value */
+ private static final float DEFAULT_MAGIC_TRUST = 0.9f;
+ private static final float DEFAULT_META_TRUST = 0.8f;
+ private static final float DEFAULT_EXTENSION_TRUST = 0.8f;
+ private float priorMagicFileType, priorExtensionFileType,
+ priorMetaFileType;
+ private float magic_trust, extension_trust, meta_trust;
+ private float magic_neg, extension_neg, meta_neg;
+ /*
+ * Any posterior probability lower than this threshold is treated as the
+ * octet-stream (root) type. The default value is 0.5001.
+ */
+ private float threshold;
+
+ /*
+ * this change rate is used when there are multiple types predicted by
+ * magic-bytes. the first predicted type has the highest probability, and
+ * the probability for the next type predicted by magic-bytes will decay
+ * with this change rate. The idea is to have the first one to take
+ * precedence among the multiple possible types predicted by MAGIC-bytes.
+ */
+ private float changeRate;
+
+ /** ***********************/
+
+ public ProbabilisticMimeDetectionSelector() {
+ this(MimeTypes.getDefaultMimeTypes(), null);
+ }
+
+ public ProbabilisticMimeDetectionSelector(final Builder builder) {
+ this(MimeTypes.getDefaultMimeTypes(), builder);
+ }
+
+ public ProbabilisticMimeDetectionSelector(final MimeTypes mimeTypes) {
+ this(mimeTypes, null);
+ }
+
+ /**
+  * Creates a selector over the given MIME type database, optionally
+  * overriding the default probability parameters.
+  * <p>
+  * Every parameter is first set to its default value. A value from the
+  * builder replaces the default only when it is non-zero, so {@code 0f}
+  * in the builder means "not set".
+  *
+  * @param mimeTypes the MIME type database used for the magic, resource
+  *                  name and content type lookups
+  * @param builder   probability parameter overrides, or {@code null} to
+  *                  keep all defaults
+  */
+ public ProbabilisticMimeDetectionSelector(final MimeTypes mimeTypes,
+         final Builder builder) {
+     this.mimeTypes = mimeTypes;
+     rootMediaType = MediaType.OCTET_STREAM;
+     this.initializeDefaultProbabilityParameters();
+     this.changeRate = 0.1f;
+     if (builder != null) {
+         priorMagicFileType = builder.priorMagicFileType == 0f ?
+                 priorMagicFileType : builder.priorMagicFileType;
+         priorExtensionFileType = builder.priorExtensionFileType == 0f ?
+                 priorExtensionFileType : builder.priorExtensionFileType;
+         priorMetaFileType = builder.priorMetaFileType == 0f ?
+                 priorMetaFileType : builder.priorMetaFileType;
+
+         // Fixed: this previously took builder.extension_neg, so a caller
+         // who configured magic_trust got an unrelated value instead.
+         magic_trust = builder.magic_trust == 0f ? magic_trust : builder.magic_trust;
+         extension_trust = builder.extension_trust == 0f ? extension_trust : builder.extension_trust;
+         meta_trust = builder.meta_trust == 0f ? meta_trust : builder.meta_trust;
+
+         magic_neg = builder.magic_neg == 0f ? magic_neg : builder.magic_neg;
+         extension_neg = builder.extension_neg == 0f ?
+                 extension_neg : builder.extension_neg;
+         meta_neg = builder.meta_neg == 0f ? meta_neg : builder.meta_neg;
+         threshold = builder.threshold == 0f ? threshold : builder.threshold;
+     }
+ }
+
+ /**
+ * Initializes the probability parameters with their default values.
+ */
+ private void initializeDefaultProbabilityParameters() {
+ priorMagicFileType = 0.5f;
+ priorExtensionFileType = 0.5f;
+ priorMetaFileType = 0.5f;
+ magic_trust = DEFAULT_MAGIC_TRUST;
+ extension_trust = DEFAULT_EXTENSION_TRUST;
+ meta_trust = DEFAULT_META_TRUST;
+
+ // probability of the type detected by magic test given that the type is
+ // not the detected type. The default is taken by 1 - the magic trust
+ magic_neg = 1 - DEFAULT_MAGIC_TRUST;
+ // probability of the type detected by extension test given that the
+ // type is not the type detected by extension test
+ extension_neg = 1 - DEFAULT_EXTENSION_TRUST;
+ // same as above; but it could be customized to suffice different use.
+ meta_neg = 1 - DEFAULT_META_TRUST;
+ threshold = 0.5001f;
+ }
+
+ /**
+  * Detects the media type of a document by collecting up to three
+  * candidates and handing them to the probabilistic selection: the types
+  * matched from the stream's magic header, the type looked up from the
+  * resource name, and the type named by the content type metadata entry.
+  * <p>
+  * The stream is marked before the header is read and reset afterwards.
+  * When {@code input} is {@code null} there are no magic candidates, and
+  * the selection then yields the root (octet-stream) type.
+  *
+  * @param input    the document stream, or {@code null} if unavailable
+  * @param metadata the document metadata; read for
+  *                 {@link Metadata#RESOURCE_NAME_KEY} and
+  *                 {@link Metadata#CONTENT_TYPE}
+  * @return the media type chosen by the probabilistic selection
+  * @throws IOException if the stream cannot be read or reset
+  */
+ public MediaType detect(InputStream input, Metadata metadata)
+         throws IOException {
+     // Start from an empty list rather than null: applyProbilities() calls
+     // possibleTypes.size() before its own null check, so a null stream
+     // used to end in a NullPointerException.
+     List<MimeType> possibleTypes = java.util.Collections.<MimeType>emptyList();
+
+     // Get the candidate types based on the magic prefix
+     if (input != null) {
+         input.mark(mimeTypes.getMinLength());
+         try {
+             byte[] prefix = mimeTypes.readMagicHeader(input);
+             possibleTypes = mimeTypes.getMimeType(prefix);
+         } finally {
+             // Always rewind so the caller sees the stream from its start
+             input.reset();
+         }
+     }
+
+     // Get the type based on the resource name hint (if available)
+     MimeType extHint = null;
+     String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
+     if (resourceName != null) {
+         String name = null;
+
+         // The resource name may be a URI or a plain path name
+         try {
+             URI uri = new URI(resourceName);
+             String path = uri.getPath();
+             if (path != null) {
+                 // Keep only the last path segment; a path that ends in
+                 // '/' has no file name and provides no hint
+                 int slash = path.lastIndexOf('/');
+                 if (slash + 1 < path.length()) {
+                     name = path.substring(slash + 1);
+                 }
+             }
+         } catch (URISyntaxException e) {
+             // Not a valid URI, use the resource name as given
+             name = resourceName;
+         }
+
+         if (name != null) {
+             extHint = mimeTypes.getMimeType(name);
+         }
+     }
+
+     // Get the type based on the metadata hint (if available)
+     MimeType metaHint = null;
+     String typeName = metadata.get(Metadata.CONTENT_TYPE);
+     if (typeName != null) {
+         try {
+             metaHint = mimeTypes.forName(typeName);
+         } catch (MimeTypeException e) {
+             // Malformed type name, ignore
+         }
+     }
+
+     // Combine the three candidates with the probability selection
+     return applyProbilities(possibleTypes, extHint, metaHint);
+ }
+
+ private MediaType applyProbilities(final List<MimeType> possibleTypes,
+ final MimeType extMimeType, final MimeType metadataMimeType) {
+
+ /* initialize some probability variables */
+ MediaType extensionMediaType_ = extMimeType == null ? null : extMimeType.getType();
+ MediaType metaMediaType_ = metadataMimeType == null ? null : metadataMimeType.getType();
+
+ int n = possibleTypes.size();
+ float mag_trust = magic_trust;
+ float mag_neg = magic_neg;
+ float ext_trust = extension_trust;
+ float ext_neg = extension_neg;
+ float met_trust = meta_trust;
+ float met_neg = meta_neg;
+ /* ************************** */
+
+ /* pre-process some probability variables */
+ if (extensionMediaType_ == null || extensionMediaType_.compareTo(rootMediaType) == 0) {
+ /*
+ * this is a root type, that means the extension method fails to
+ * identify any type.
+ */
+ ext_trust = 1;
+ ext_neg = 1;
+ }
+ if (metaMediaType_ == null || metaMediaType_.compareTo(rootMediaType) == 0) {
+ met_trust = 1;
+ met_neg = 1;
+ }
+
+ float maxProb = -1f;
+ MediaType bestEstimate = rootMediaType;
+
+ if (possibleTypes != null && !possibleTypes.isEmpty()) {
+ int i;
+ for (i = 0; i < n; i++) {
+ MediaType magictype = possibleTypes.get(i).getType();
+ MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
+ if (magictype != null && magictype.equals(rootMediaType)) {
+ mag_trust = 1;
+ mag_neg = 1;
+ } else {
+ // check if each identified type belongs to the same class;
+ if (extensionMediaType_ != null) {
+ if (extensionMediaType_.equals(magictype)
+ || registry.isSpecializationOf(
+ extensionMediaType_, magictype)) {
+ // Use just this type
+ possibleTypes.set(i, extMimeType);
+ } else if (registry.isSpecializationOf(magictype,
+ extensionMediaType_)) {
+ extensionMediaType_ = magictype;
+ }
+ }
+ if (metaMediaType_ != null) {
+ if (metaMediaType_.equals(magictype)
+ || registry.isSpecializationOf(metaMediaType_,
+ magictype)) {
+ // Use just this type
+ possibleTypes.set(i, metadataMimeType);
+ } else if (registry.isSpecializationOf(magictype,
+ metaMediaType_)) {
+ metaMediaType_ = magictype;
+ }
+ }
+ }
+
+ /*
+ * prepare the conditional probability for file type prediction.
+ */
+
+ float[] results = new float[3];
+ float[] trust1 = new float[3];
+ float[] negtrust1 = new float[3];
+ magictype = possibleTypes.get(i).getType();
+
+ if (i > 0) {
+ /*
+ * decay as our trust goes down with next type predicted by
+ * magic
+ */
+ mag_trust = mag_trust * (1 - changeRate);
+ /*
+ * grow as our trust goes down
+ */
+ mag_neg = mag_neg * (1 + changeRate);
+
+ }
+
+ if (magictype != null && mag_trust != 1) {
+ trust1[0] = mag_trust;
+ negtrust1[0] = mag_neg;
+ if (metaMediaType_ != null && met_trust != 1) {
+ if (magictype.equals(metaMediaType_)) {
+ trust1[1] = met_trust;
+ negtrust1[1] = met_neg;
+ } else {
+ trust1[1] = 1 - met_trust;
+ negtrust1[1] = 1 - met_neg;
+ }
+ } else {
+ trust1[1] = 1;
+ negtrust1[1] = 1;
+ }
+ if (extensionMediaType_ != null && ext_trust != 1) {
+ if (magictype.equals(extensionMediaType_)) {
+ trust1[2] = ext_trust;
+ negtrust1[2] = ext_neg;
+ } else {
+ trust1[2] = 1 - ext_trust;
+ negtrust1[2] = 1 - ext_neg;
+ }
+ } else {
+ trust1[2] = 1;
+ negtrust1[2] = 1;
+ }
+ } else {
+ results[0] = 0.1f;
+ }
+
+ float[] trust2 = new float[3];
+ float[] negtrust2 = new float[3];
+ if (metadataMimeType != null && met_trust != 1) {
+ trust2[1] = met_trust;
+ negtrust2[1] = met_neg;
+ if (magictype != null && mag_trust != 1) {
+ if (metaMediaType_.equals(magictype)) {
+ trust2[0] = mag_trust;
+ negtrust2[0] = mag_neg;
+ } else {
+ trust2[0] = 1 - mag_trust;
+ negtrust2[0] = 1 - mag_neg;
+ }
+
+ } else {
+ trust2[0] = 1f;
+ negtrust2[0] = 1f;
+ }
+ if (extensionMediaType_ != null && ext_trust != 1) {
+ if (metaMediaType_.equals(extensionMediaType_)) {
+ trust2[2] = ext_trust;
+ negtrust2[2] = ext_neg;
+ } else {
+ trust2[2] = 1 - ext_trust;
+ negtrust2[2] = 1 - ext_neg;
+ }
+ } else {
+ trust2[2] = 1f;
+ negtrust2[2] = 1f;
+ }
+ } else {
+ results[1] = 0.1f;
+ }
+
+ float[] trust3 = new float[3];
+ float[] negtrust3 = new float[3];
+ if (extensionMediaType_ != null && ext_trust != 1) {
+ trust3[2] = ext_trust;
+ negtrust3[2] = ext_neg;
+ if (magictype != null && mag_trust != 1) {
+ if (magictype.equals(extensionMediaType_)) {
+ trust3[0] = mag_trust;
+ negtrust3[0] = mag_neg;
+ } else {
+ trust3[0] = 1 - mag_trust;
+ negtrust3[0] = 1 - mag_neg;
+ }
+ } else {
+ trust3[0] = 1f;
+ negtrust3[0] = 1f;
+ }
+
+ if (metaMediaType_ != null && met_trust != 1) {
+ if (metaMediaType_.equals(extensionMediaType_)) {
+ trust3[1] = met_trust;
+ negtrust3[1] = met_neg;
+ } else {
+ trust3[1] = 1 - met_trust;
+ negtrust3[1] = 1 - met_neg;
+ }
+ } else {
+ trust3[1] = 1f;
+ negtrust3[1] = 1f;
+ }
+ } else {
+ results[2] = 0.1f;
+ }
+ /*
+ * compute the posterior probability for each predicted file
+ * type and store them into the "results" array.
+ */
+ float pPrime = priorMagicFileType;
+ float deno = 1 - priorMagicFileType;
+ int j;
+
+ if (results[0] == 0) {
+ for (j = 0; j < trust1.length; j++) {
+ pPrime *= trust1[j];
+ if (trust1[j] != 1) {
+ deno *= negtrust1[j];
+ }
+ }
+ pPrime /= (pPrime + deno);
+ results[0] = pPrime;
+
+ }
+ if (maxProb < results[0]) {
+ maxProb = results[0];
+ bestEstimate = magictype;
+ }
+
+ pPrime = priorMetaFileType;
+ deno = 1 - priorMetaFileType;
+ if (results[1] == 0) {
+ for (j = 0; j < trust2.length; j++) {
+ pPrime *= trust2[j];
+ if (trust2[j] != 1) {
+ deno *= negtrust2[j];
+ }
+ }
+ pPrime /= (pPrime + deno);
+ results[1] = pPrime;
+
+ }
+ if (maxProb < results[1]) {
+ maxProb = results[1];
+ bestEstimate = metaMediaType_;
+ }
+
+ pPrime = priorExtensionFileType;
+ deno = 1 - priorExtensionFileType;
+ if (results[2] == 0) {
+ for (j = 0; j < trust3.length; j++) {
+ pPrime *= trust3[j];
+ if (trust3[j] != 1) {
+ deno *= negtrust3[j];
+ }
+ }
+ pPrime /= (pPrime + deno);
+ results[2] = pPrime;
+ }
+ if (maxProb < results[2]) {
+ maxProb = results[2];
+ bestEstimate = extensionMediaType_;
+ }
+ /*
for (float r : results) {
System.out.print(r + "; ");
}
System.out.println();
-*/
- }
+ */
+ }
- }
- return maxProb < threshold ? this.rootMediaType : bestEstimate;
+ }
+ return maxProb < threshold ? this.rootMediaType : bestEstimate;
- }
+ }
- public MediaTypeRegistry getMediaTypeRegistry() {
- return this.mimeTypes.getMediaTypeRegistry();
- }
-
- /**
- * build class for probability parameters setting
- *
- *
- */
- public static class Builder {
- /*
- * the following are the prior probabilities for the file type
- * identified by each method.
- */
- private float priorMagicFileType, priorExtensionFileType,
- priorMetaFileType;
- /*
- * the following are the conditional probability for each method with
- * positive conditions
- */
- private float magic_trust, extension_trust, meta_trust;
-
- /*
- * the following *_neg are the conditional probabilities with negative
- * conditions
- */
- private float magic_neg, extension_neg, meta_neg;
-
- private float threshold;
-
- public synchronized Builder priorMagicFileType(final float prior) {
- this.priorMagicFileType = prior;
- return this;
- }
-
- public synchronized Builder priorExtensionFileType(final float prior) {
- this.priorExtensionFileType = prior;
- return this;
- }
-
- public synchronized Builder priorMetaFileType(final float prior) {
- this.priorMetaFileType = prior;
- return this;
- }
-
- public synchronized Builder magic_trust(final float trust) {
- this.magic_trust = trust;
- return this;
- }
-
- public synchronized Builder extension_trust(final float trust) {
- this.extension_trust = trust;
- return this;
- }
-
- public synchronized Builder meta_trust(final float trust) {
- this.meta_trust = trust;
- return this;
- }
-
- public synchronized Builder magic_neg(final float trust) {
- this.magic_neg = trust;
- return this;
- }
-
- public synchronized Builder extension_neg(final float trust) {
- this.extension_neg = trust;
- return this;
- }
-
- public synchronized Builder meta_neg(final float trust) {
- this.meta_neg = trust;
- return this;
- }
-
- public synchronized Builder threshold(final float threshold) {
- this.threshold = threshold;
- return this;
- }
-
- /**
- * initialize the MimeTypes with this builder instance
- *
- * @return
- */
- public ProbabilisticMimeDetectionSelector build2() {
- return new ProbabilisticMimeDetectionSelector(this);
- }
- }
+ public MediaTypeRegistry getMediaTypeRegistry() {
+ return this.mimeTypes.getMediaTypeRegistry();
+ }
+
+ /**
+ * build class for probability parameters setting
+ *
+ *
+ */
+ public static class Builder {
+ /*
+ * the following are the prior probabilities for the file type
+ * identified by each method.
+ */
+ private float priorMagicFileType, priorExtensionFileType,
+ priorMetaFileType;
+ /*
+ * the following are the conditional probability for each method with
+ * positive conditions
+ */
+ private float magic_trust, extension_trust, meta_trust;
+
+ /*
+ * the following *_neg are the conditional probabilities with negative
+ * conditions
+ */
+ private float magic_neg, extension_neg, meta_neg;
+
+ private float threshold;
+
+ public synchronized Builder priorMagicFileType(final float prior) {
+ this.priorMagicFileType = prior;
+ return this;
+ }
+
+ public synchronized Builder priorExtensionFileType(final float prior) {
+ this.priorExtensionFileType = prior;
+ return this;
+ }
+
+ public synchronized Builder priorMetaFileType(final float prior) {
+ this.priorMetaFileType = prior;
+ return this;
+ }
+
+ public synchronized Builder magic_trust(final float trust) {
+ this.magic_trust = trust;
+ return this;
+ }
+
+ public synchronized Builder extension_trust(final float trust) {
+ this.extension_trust = trust;
+ return this;
+ }
+
+ public synchronized Builder meta_trust(final float trust) {
+ this.meta_trust = trust;
+ return this;
+ }
+
+ public synchronized Builder magic_neg(final float trust) {
+ this.magic_neg = trust;
+ return this;
+ }
+
+ public synchronized Builder extension_neg(final float trust) {
+ this.extension_neg = trust;
+ return this;
+ }
+
+ public synchronized Builder meta_neg(final float trust) {
+ this.meta_neg = trust;
+ return this;
+ }
+
+ public synchronized Builder threshold(final float threshold) {
+ this.threshold = threshold;
+ return this;
+ }
+
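+ /**
+ * Creates a ProbabilisticMimeDetectionSelector configured with this
+ * builder instance.
+ *
+ * @return a new selector that takes its probability parameters from this builder
+ */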
+ public ProbabilisticMimeDetectionSelector build2() {
+ return new ProbabilisticMimeDetectionSelector(this);
+ }
+ }
}
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java?rev=1677327&r1=1677326&r2=1677327&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java Sat May 2 15:31:53 2015
@@ -30,218 +30,218 @@ import org.junit.Before;
import org.junit.Test;
public class ProbabilisticMimeDetectionTest {
- private static final Charset UTF8 = Charset.forName("UTF-8");
- private ProbabilisticMimeDetectionSelector proDetector;
+ private static final Charset UTF8 = Charset.forName("UTF-8");
+ private ProbabilisticMimeDetectionSelector proDetector;
- private MediaTypeRegistry registry;
+ private MediaTypeRegistry registry;
- /** @inheritDoc */
- @Before
- public void setUp() {
- proDetector = new ProbabilisticMimeDetectionSelector();
- this.registry = proDetector.getMediaTypeRegistry();
- }
-
- @Test
- public void testDetection() throws Exception {
- testFile("image/svg+xml", "circles.svg");
- testFile("image/svg+xml", "circles-with-prefix.svg");
- testFile("image/png", "datamatrix.png");
- testFile("text/html", "test.html");
- testFile("application/xml", "test-iso-8859-1.xml");
- testFile("application/xml", "test-utf8.xml");
- testFile("application/xml", "test-utf8-bom.xml");
- testFile("application/xml", "test-utf16le.xml");
- testFile("application/xml", "test-utf16be.xml");
- testFile("application/xml", "test-long-comment.xml");
- testFile("application/xslt+xml", "stylesheet.xsl");
- testUrl("application/rdf+xml",
- "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
- "test-difficult-rdf1.xml");
- testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#",
- "test-difficult-rdf2.xml");
- // add evil test from TIKA-327
- testFile("text/html", "test-tika-327.html");
- // add another evil html test from TIKA-357
- testFile("text/html", "testlargerbuffer.html");
- // test fragment of HTML with <div> (TIKA-1102)
- testFile("text/html", "htmlfragment");
- // test binary CGM detection (TIKA-1170)
- testFile("image/cgm", "plotutils-bin-cgm-v3.cgm");
- // test HTML detection of malformed file, previously identified as
- // image/cgm (TIKA-1170)
- testFile("text/html", "test-malformed-header.html.bin");
- }
-
- @Test
- public void testByteOrderMark() throws Exception {
- assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
- new ByteArrayInputStream("\ufefftest".getBytes("UTF-16LE")),
- new Metadata()));
- assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
- new ByteArrayInputStream("\ufefftest".getBytes("UTF-16BE")),
- new Metadata()));
-
- assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
- new ByteArrayInputStream("\ufefftest".getBytes(UTF8)),
- new Metadata()));
- }
-
- @Test
- public void testSuperTypes() {
- assertTrue(registry.isSpecializationOf(
- MediaType.parse("text/something; charset=UTF-8"),
- MediaType.parse("text/something")));
-
- assertTrue(registry.isSpecializationOf(
- MediaType.parse("text/something; charset=UTF-8"),
- MediaType.TEXT_PLAIN));
-
- assertTrue(registry.isSpecializationOf(
- MediaType.parse("text/something; charset=UTF-8"),
- MediaType.OCTET_STREAM));
-
- assertTrue(registry.isSpecializationOf(
- MediaType.parse("text/something"), MediaType.TEXT_PLAIN));
-
- assertTrue(registry.isSpecializationOf(
- MediaType.parse("application/something+xml"),
- MediaType.APPLICATION_XML));
-
- assertTrue(registry.isSpecializationOf(
- MediaType.parse("application/something+zip"),
- MediaType.APPLICATION_ZIP));
-
- assertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML,
- MediaType.TEXT_PLAIN));
-
- assertTrue(registry.isSpecializationOf(
- MediaType.parse("application/vnd.apple.iwork"),
- MediaType.APPLICATION_ZIP));
- }
-
- @SuppressWarnings("unused")
- private void testUrlOnly(String expected, String url) throws IOException {
- InputStream in = new URL(url).openStream();
- testStream(expected, url, in);
- }
-
- private void testUrl(String expected, String url, String file)
- throws IOException {
- InputStream in = getClass().getResourceAsStream(file);
- testStream(expected, url, in);
- }
-
- private void testFile(String expected, String filename) throws IOException {
- InputStream in = getClass().getResourceAsStream(filename);
- testStream(expected, filename, in);
- }
-
- private void testStream(String expected, String urlOrFileName,
- InputStream in) throws IOException {
- assertNotNull("Test stream: [" + urlOrFileName + "] is null!", in);
- if (!in.markSupported()) {
- in = new java.io.BufferedInputStream(in);
- }
- try {
- Metadata metadata = new Metadata();
- String mime = this.proDetector.detect(in, metadata).toString();
- assertEquals(
- urlOrFileName + " is not properly detected: detected.",
- expected, mime);
-
- // Add resource name and test again
- metadata.set(Metadata.RESOURCE_NAME_KEY, urlOrFileName);
- mime = this.proDetector.detect(in, metadata).toString();
- assertEquals(urlOrFileName
- + " is not properly detected after adding resource name.",
- expected, mime);
- } finally {
- in.close();
- }
- }
-
- private void assertNotNull(String string, InputStream in) {
- // TODO Auto-generated method stub
-
- }
-
- /**
- * Test for type detection of empty documents.
- *
- * @see <a
- * href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
- */
- @Test
- public void testEmptyDocument() throws IOException {
- assertEquals(MediaType.OCTET_STREAM, proDetector.detect(
- new ByteArrayInputStream(new byte[0]), new Metadata()));
-
- Metadata namehint = new Metadata();
- namehint.set(Metadata.RESOURCE_NAME_KEY, "test.txt");
- assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
- new ByteArrayInputStream(new byte[0]), namehint));
-
- Metadata typehint = new Metadata();
- typehint.set(Metadata.CONTENT_TYPE, "text/plain");
- assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
- new ByteArrayInputStream(new byte[0]), typehint));
-
- }
-
- /**
- * Test for things like javascript files whose content is enclosed in XML
- * comment delimiters, but that aren't actually XML.
- *
- * @see <a
- * href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
- */
- @Test
- public void testNotXML() throws IOException {
- assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
- new ByteArrayInputStream("<!-- test -->".getBytes(UTF8)),
- new Metadata()));
- }
-
- /**
- * Tests that when we repeatedly test the detection of a document that can
- * be detected with Mime Magic, that we consistently detect it correctly.
- * See TIKA-391 for more details.
- */
- @Test
- public void testMimeMagicStability() throws IOException {
- for (int i = 0; i < 100; i++) {
- testFile("application/vnd.ms-excel", "test.xls");
- }
- }
-
- /**
- * Tests that when two magic matches both apply, and both have the same
- * priority, we use the name to pick the right one based on the glob, or the
- * first one we come across if not. See TIKA-1292 for more details.
- */
- @Test
- public void testMimeMagicClashSamePriority() throws IOException {
- byte[] helloWorld = "Hello, World!".getBytes(UTF8);
- MediaType helloType = MediaType.parse("hello/world-file");
- MediaType helloXType = MediaType.parse("hello/x-world-hello");
- Metadata metadata;
-
- // With a filename, picks the right one
- metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, "test.hello.world");
- assertEquals(helloType, proDetector.detect(new ByteArrayInputStream(
- helloWorld), metadata));
-
- metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, "test.x-hello-world");
- assertEquals(helloXType, proDetector.detect(new ByteArrayInputStream(
- helloWorld), metadata));
-
- // Without, goes for the one that sorts last
- metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, "testingTESTINGtesting");
- assertEquals(helloXType, proDetector.detect(new ByteArrayInputStream(
- helloWorld), metadata));
- }
+ /**
+ * Creates a selector with the no-argument constructor and keeps a handle
+ * on the media type registry it exposes.
+ */
+ @Before
+ public void setUp() {
+ ProbabilisticMimeDetectionSelector selector = new ProbabilisticMimeDetectionSelector();
+ this.proDetector = selector;
+ this.registry = selector.getMediaTypeRegistry();
+ }
+
+ /**
+ * Runs detection over the bundled sample resources, in a fixed order, and
+ * checks each one against its expected media type.
+ */
+ @Test
+ public void testDetection() throws Exception {
+ // {expected type, resource name} pairs checked before the RDF samples
+ final String[][] leadingSamples = {
+ {"image/svg+xml", "circles.svg"},
+ {"image/svg+xml", "circles-with-prefix.svg"},
+ {"image/png", "datamatrix.png"},
+ {"text/html", "test.html"},
+ {"application/xml", "test-iso-8859-1.xml"},
+ {"application/xml", "test-utf8.xml"},
+ {"application/xml", "test-utf8-bom.xml"},
+ {"application/xml", "test-utf16le.xml"},
+ {"application/xml", "test-utf16be.xml"},
+ {"application/xml", "test-long-comment.xml"},
+ {"application/xslt+xml", "stylesheet.xsl"},
+ };
+ for (String[] sample : leadingSamples) {
+ testFile(sample[0], sample[1]);
+ }
+
+ // RDF samples are read from local resources but named by their URL
+ testUrl("application/rdf+xml",
+ "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
+ "test-difficult-rdf1.xml");
+ testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#",
+ "test-difficult-rdf2.xml");
+
+ final String[][] trailingSamples = {
+ // evil test from TIKA-327
+ {"text/html", "test-tika-327.html"},
+ // another evil html test from TIKA-357
+ {"text/html", "testlargerbuffer.html"},
+ // fragment of HTML with <div> (TIKA-1102)
+ {"text/html", "htmlfragment"},
+ // binary CGM detection (TIKA-1170)
+ {"image/cgm", "plotutils-bin-cgm-v3.cgm"},
+ // malformed HTML file, previously identified as image/cgm (TIKA-1170)
+ {"text/html", "test-malformed-header.html.bin"},
+ };
+ for (String[] sample : trailingSamples) {
+ testFile(sample[0], sample[1]);
+ }
+ }
+
+ /**
+ * Checks that the text "test" preceded by U+FEFF is detected as
+ * {@code MediaType.TEXT_PLAIN} when encoded as UTF-16LE, UTF-16BE and UTF-8.
+ */
+ @Test
+ public void testByteOrderMark() throws Exception {
+ final String bomText = "\ufefftest";
+
+ InputStream utf16le = new ByteArrayInputStream(bomText.getBytes("UTF-16LE"));
+ assertEquals(MediaType.TEXT_PLAIN,
+ proDetector.detect(utf16le, new Metadata()));
+
+ InputStream utf16be = new ByteArrayInputStream(bomText.getBytes("UTF-16BE"));
+ assertEquals(MediaType.TEXT_PLAIN,
+ proDetector.detect(utf16be, new Metadata()));
+
+ InputStream utf8 = new ByteArrayInputStream(bomText.getBytes(UTF8));
+ assertEquals(MediaType.TEXT_PLAIN,
+ proDetector.detect(utf8, new Metadata()));
+ }
+
+ /**
+ * Checks a series of specialization relationships reported by the
+ * detector's media type registry.
+ */
+ @Test
+ public void testSuperTypes() {
+ final MediaType textSomething = MediaType.parse("text/something");
+ final MediaType textSomethingUtf8 =
+ MediaType.parse("text/something; charset=UTF-8");
+
+ // a parameterised type against its bare form, text/plain and octet-stream
+ assertTrue(registry.isSpecializationOf(textSomethingUtf8, textSomething));
+ assertTrue(registry.isSpecializationOf(textSomethingUtf8,
+ MediaType.TEXT_PLAIN));
+ assertTrue(registry.isSpecializationOf(textSomethingUtf8,
+ MediaType.OCTET_STREAM));
+
+ assertTrue(registry.isSpecializationOf(textSomething,
+ MediaType.TEXT_PLAIN));
+
+ // +xml and +zip suffixes
+ final MediaType somethingXml = MediaType.parse("application/something+xml");
+ assertTrue(registry.isSpecializationOf(somethingXml,
+ MediaType.APPLICATION_XML));
+
+ final MediaType somethingZip = MediaType.parse("application/something+zip");
+ assertTrue(registry.isSpecializationOf(somethingZip,
+ MediaType.APPLICATION_ZIP));
+
+ assertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML,
+ MediaType.TEXT_PLAIN));
+
+ final MediaType iwork = MediaType.parse("application/vnd.apple.iwork");
+ assertTrue(registry.isSpecializationOf(iwork, MediaType.APPLICATION_ZIP));
+ }
+
+ /**
+ * Opens a stream on the given URL and hands it to
+ * {@code testStream}, using the URL itself as the name.
+ */
+ @SuppressWarnings("unused")
+ private void testUrlOnly(String expected, String url) throws IOException {
+ testStream(expected, url, new URL(url).openStream());
+ }
+
+ /**
+ * Loads {@code file} as a class resource and hands it to
+ * {@code testStream}, using {@code url} (not the file name) as the name.
+ */
+ private void testUrl(String expected, String url, String file)
+ throws IOException {
+ testStream(expected, url, getClass().getResourceAsStream(file));
+ }
+
+ /**
+ * Loads {@code filename} as a class resource and hands it to
+ * {@code testStream} under that same name.
+ */
+ private void testFile(String expected, String filename) throws IOException {
+ testStream(expected, filename, getClass().getResourceAsStream(filename));
+ }
+
+ /**
+ * Detects the type of the stream twice, first with empty metadata and then
+ * with the resource name set, and compares both results with
+ * {@code expected}. The stream is closed afterwards.
+ */
+ private void testStream(String expected, String urlOrFileName,
+ InputStream in) throws IOException {
+ assertNotNull("Test stream: [" + urlOrFileName + "] is null!", in);
+ // wrap streams that cannot mark/reset in a buffered stream
+ final InputStream stream =
+ in.markSupported() ? in : new java.io.BufferedInputStream(in);
+ try {
+ final Metadata hints = new Metadata();
+
+ String detected = this.proDetector.detect(stream, hints).toString();
+ assertEquals(
+ urlOrFileName + " is not properly detected: detected.",
+ expected, detected);
+
+ // Same stream again, this time with the resource name as a hint
+ hints.set(Metadata.RESOURCE_NAME_KEY, urlOrFileName);
+ detected = this.proDetector.detect(stream, hints).toString();
+ assertEquals(urlOrFileName
+ + " is not properly detected after adding resource name.",
+ expected, detected);
+ } finally {
+ stream.close();
+ }
+ }
+
+ /**
+ * Fails with the given message when the stream is {@code null}.
+ * Previously this was an empty stub, so a missing test resource was not
+ * reported here and only showed up later as a NullPointerException.
+ *
+ * @param string failure message to report
+ * @param in stream that must not be {@code null}
+ * @throws AssertionError if {@code in} is {@code null}
+ */
+ private void assertNotNull(String string, InputStream in) {
+ if (in == null) {
+ throw new AssertionError(string);
+ }
+ }
+
+ /**
+ * Checks detection of zero-length input with no hints, with a resource
+ * name hint and with a content type hint.
+ *
+ * @see <a
+ * href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
+ */
+ @Test
+ public void testEmptyDocument() throws IOException {
+ final byte[] noBytes = new byte[0];
+
+ // no hints at all
+ assertEquals(MediaType.OCTET_STREAM, proDetector.detect(
+ new ByteArrayInputStream(noBytes), new Metadata()));
+
+ // resource name hint only
+ final Metadata namehint = new Metadata();
+ namehint.set(Metadata.RESOURCE_NAME_KEY, "test.txt");
+ assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
+ new ByteArrayInputStream(noBytes), namehint));
+
+ // content type hint only
+ final Metadata typehint = new Metadata();
+ typehint.set(Metadata.CONTENT_TYPE, "text/plain");
+ assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(
+ new ByteArrayInputStream(noBytes), typehint));
+ }
+
+ /**
+ * Checks that content consisting only of an XML comment is detected as
+ * plain text; this covers things like javascript files wrapped in XML
+ * comment delimiters that aren't actually XML.
+ *
+ * @see <a
+ * href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
+ */
+ @Test
+ public void testNotXML() throws IOException {
+ final byte[] commentOnly = "<!-- test -->".getBytes(UTF8);
+ final InputStream stream = new ByteArrayInputStream(commentOnly);
+ assertEquals(MediaType.TEXT_PLAIN,
+ proDetector.detect(stream, new Metadata()));
+ }
+
+ /**
+ * Detects the same magic-detectable document 100 times in a row and
+ * expects the same answer every time. See TIKA-391 for more details.
+ */
+ @Test
+ public void testMimeMagicStability() throws IOException {
+ int remaining = 100;
+ while (remaining > 0) {
+ testFile("application/vnd.ms-excel", "test.xls");
+ remaining--;
+ }
+ }
+
+ /**
+ * Submits the same "Hello, World!" bytes under three different resource
+ * names and checks which of the two fixture types, hello/world-file or
+ * hello/x-world-hello, is chosen each time. See TIKA-1292 for more details.
+ */
+ @Test
+ public void testMimeMagicClashSamePriority() throws IOException {
+ final byte[] payload = "Hello, World!".getBytes(UTF8);
+ final MediaType worldFile = MediaType.parse("hello/world-file");
+ final MediaType xWorldHello = MediaType.parse("hello/x-world-hello");
+
+ // With a filename, picks the right one
+ final Metadata dotHelloWorld = new Metadata();
+ dotHelloWorld.set(Metadata.RESOURCE_NAME_KEY, "test.hello.world");
+ assertEquals(worldFile, proDetector.detect(
+ new ByteArrayInputStream(payload), dotHelloWorld));
+
+ final Metadata dotXHelloWorld = new Metadata();
+ dotXHelloWorld.set(Metadata.RESOURCE_NAME_KEY, "test.x-hello-world");
+ assertEquals(xWorldHello, proDetector.detect(
+ new ByteArrayInputStream(payload), dotXHelloWorld));
+
+ // Without, goes for the one that sorts last
+ final Metadata unrelatedName = new Metadata();
+ unrelatedName.set(Metadata.RESOURCE_NAME_KEY, "testingTESTINGtesting");
+ assertEquals(xWorldHello, proDetector.detect(
+ new ByteArrayInputStream(payload), unrelatedName));
+ }
}
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java?rev=1677327&r1=1677326&r2=1677327&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java Sat May 2 15:31:53 2015
@@ -42,247 +42,247 @@ import org.junit.Before;
import org.junit.Test;
public class ProbabilisticMimeDetectionTestWithTika {
- private static final Charset UTF8 = Charset.forName("UTF-8");
- // private ProbabilisticMimeDetectionSelector proDetector;
- private Tika tika;
- private MediaTypeRegistry registry;
-
- /** @inheritDoc */
- @Before
- public void setUp() {
- try {
- registry = MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry();
- tika = new Tika(new TikaConfig() {
- @Override
- protected Detector getDefaultDetector(MimeTypes types,
- ServiceLoader loader) {
- /*
- * here is an example with the use of the builder to
- * instantiate the object.
- */
- Builder builder = new ProbabilisticMimeDetectionSelector.Builder();
- ProbabilisticMimeDetectionSelector proDetector = new ProbabilisticMimeDetectionSelector(
- types, builder.priorMagicFileType(0.5f)
- .priorExtensionFileType(0.5f)
- .priorMetaFileType(0.5f));
- return new DefaultProbDetector(proDetector, loader);
- }
- });
- } catch (TikaException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } finally {
-
- }
-
- }
-
- @Test
- public void testDetection() throws Exception {
- testFile("image/svg+xml", "circles.svg");
- testFile("image/svg+xml", "circles-with-prefix.svg");
- testFile("image/png", "datamatrix.png");
- testFile("text/html", "test.html");
- testFile("application/xml", "test-iso-8859-1.xml");
- testFile("application/xml", "test-utf8.xml");
- testFile("application/xml", "test-utf8-bom.xml");
- testFile("application/xml", "test-utf16le.xml");
- testFile("application/xml", "test-utf16be.xml");
- testFile("application/xml", "test-long-comment.xml");
- testFile("application/xslt+xml", "stylesheet.xsl");
- testUrl("application/rdf+xml",
- "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
- "test-difficult-rdf1.xml");
- testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#",
- "test-difficult-rdf2.xml");
- // add evil test from TIKA-327
- testFile("text/html", "test-tika-327.html");
- // add another evil html test from TIKA-357
- testFile("text/html", "testlargerbuffer.html");
- // test fragment of HTML with <div> (TIKA-1102)
- testFile("text/html", "htmlfragment");
- // test binary CGM detection (TIKA-1170)
- testFile("image/cgm", "plotutils-bin-cgm-v3.cgm");
- // test HTML detection of malformed file, previously identified as
- // image/cgm (TIKA-1170)
- testFile("text/html", "test-malformed-header.html.bin");
- }
-
- @Test
- public void testByteOrderMark() throws Exception {
- assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(
- new ByteArrayInputStream("\ufefftest".getBytes("UTF-16LE")),
- new Metadata()));
- assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(
- new ByteArrayInputStream("\ufefftest".getBytes("UTF-16BE")),
- new Metadata()));
-
- assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(
- new ByteArrayInputStream("\ufefftest".getBytes(UTF8)),
- new Metadata()));
- }
-
- @Test
- public void testSuperTypes() {
-
- assertTrue(registry.isSpecializationOf(
- MediaType.parse("text/something; charset=UTF-8"),
- MediaType.parse("text/something")));
-
- assertTrue(registry.isSpecializationOf(
- MediaType.parse("text/something; charset=UTF-8"),
- MediaType.TEXT_PLAIN));
-
- assertTrue(registry.isSpecializationOf(
- MediaType.parse("text/something; charset=UTF-8"),
- MediaType.OCTET_STREAM));
-
- assertTrue(registry.isSpecializationOf(
- MediaType.parse("text/something"), MediaType.TEXT_PLAIN));
-
- assertTrue(registry.isSpecializationOf(
- MediaType.parse("application/something+xml"),
- MediaType.APPLICATION_XML));
-
- assertTrue(registry.isSpecializationOf(
- MediaType.parse("application/something+zip"),
- MediaType.APPLICATION_ZIP));
-
- assertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML,
- MediaType.TEXT_PLAIN));
-
- assertTrue(registry.isSpecializationOf(
- MediaType.parse("application/vnd.apple.iwork"),
- MediaType.APPLICATION_ZIP));
- }
-
- @SuppressWarnings("unused")
- private void testUrlOnly(String expected, String url) throws IOException {
- InputStream in = new URL(url).openStream();
- testStream(expected, url, in);
- }
-
- private void testUrl(String expected, String url, String file)
- throws IOException {
- InputStream in = getClass().getResourceAsStream(file);
- testStream(expected, url, in);
- }
-
- private void testFile(String expected, String filename) throws IOException {
- InputStream in = getClass().getResourceAsStream(filename);
- testStream(expected, filename, in);
- }
-
- private void testStream(String expected, String urlOrFileName,
- InputStream in) throws IOException {
- assertNotNull("Test stream: [" + urlOrFileName + "] is null!", in);
- if (!in.markSupported()) {
- in = new java.io.BufferedInputStream(in);
- }
- try {
- Metadata metadata = new Metadata();
- // String mime = this.proDetector.detect(in, metadata).toString();
- String mime = tika.detect(in, metadata).toString();
- assertEquals(
- urlOrFileName + " is not properly detected: detected.",
- expected, mime);
-
- // Add resource name and test again
- metadata.set(Metadata.RESOURCE_NAME_KEY, urlOrFileName);
- // mime = this.proDetector.detect(in, metadata).toString();
- mime = tika.detect(in, metadata).toString();
- assertEquals(urlOrFileName
- + " is not properly detected after adding resource name.",
- expected, mime);
- } finally {
- in.close();
- }
- }
-
- private void assertNotNull(String string, InputStream in) {
- // TODO Auto-generated method stub
-
- }
-
- /**
- * Test for type detection of empty documents.
- *
- * @see <a
- * href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
- */
- @Test
- public void testEmptyDocument() throws IOException {
- assertEquals(MediaType.OCTET_STREAM.toString(), tika.detect(
- new ByteArrayInputStream(new byte[0]), new Metadata()));
-
- Metadata namehint = new Metadata();
- namehint.set(Metadata.RESOURCE_NAME_KEY, "test.txt");
- assertEquals(MediaType.TEXT_PLAIN.toString(),
- tika.detect(new ByteArrayInputStream(new byte[0]), namehint));
-
- Metadata typehint = new Metadata();
- typehint.set(Metadata.CONTENT_TYPE, "text/plain");
- assertEquals(MediaType.TEXT_PLAIN.toString(),
- tika.detect(new ByteArrayInputStream(new byte[0]), typehint));
-
- }
-
- /**
- * Test for things like javascript files whose content is enclosed in XML
- * comment delimiters, but that aren't actually XML.
- *
- * @see <a
- * href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
- */
- @Test
- public void testNotXML() throws IOException {
- assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect(
- new ByteArrayInputStream("<!-- test -->".getBytes(UTF8)),
- new Metadata()));
- }
-
- /**
- * Tests that when we repeatedly test the detection of a document that can
- * be detected with Mime Magic, that we consistently detect it correctly.
- * See TIKA-391 for more details.
- */
- @Test
- public void testMimeMagicStability() throws IOException {
- for (int i = 0; i < 100; i++) {
- testFile("application/vnd.ms-excel", "test.xls");
- }
- }
-
- /**
- * Tests that when two magic matches both apply, and both have the same
- * priority, we use the name to pick the right one based on the glob, or the
- * first one we come across if not. See TIKA-1292 for more details.
- */
- @Test
- public void testMimeMagicClashSamePriority() throws IOException {
- byte[] helloWorld = "Hello, World!".getBytes(UTF8);
- MediaType helloType = MediaType.parse("hello/world-file");
- MediaType helloXType = MediaType.parse("hello/x-world-hello");
- Metadata metadata;
-
- // With a filename, picks the right one
- metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, "test.hello.world");
- assertEquals(helloType.toString(),
- tika.detect(new ByteArrayInputStream(helloWorld), metadata));
-
- metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, "test.x-hello-world");
- assertEquals(helloXType.toString(),
- tika.detect(new ByteArrayInputStream(helloWorld), metadata));
-
- // Without, goes for the one that sorts last
- metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, "testingTESTINGtesting");
- assertEquals(helloXType.toString(),
- tika.detect(new ByteArrayInputStream(helloWorld), metadata));
- }
+ private static final Charset UTF8 = Charset.forName("UTF-8");
+ // private ProbabilisticMimeDetectionSelector proDetector;
+ private Tika tika;
+ private MediaTypeRegistry registry;
+
+ /**
+ * Looks up the default media type registry and builds the {@link Tika}
+ * instance under test from a {@link TikaConfig} whose
+ * {@code getDefaultDetector} is overridden to return a
+ * {@code DefaultProbDetector} wrapping a
+ * {@link ProbabilisticMimeDetectionSelector} configured through its builder.
+ *
+ * @throws IllegalStateException if the set-up fails with a
+ * {@code TikaException} or {@code IOException}; the original exception
+ * is kept as the cause
+ */
+ @Before
+ public void setUp() {
+ try {
+ registry = MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry();
+ tika = new Tika(new TikaConfig() {
+ @Override
+ protected Detector getDefaultDetector(MimeTypes types,
+ ServiceLoader loader) {
+ /*
+ * here is an example with the use of the builder to
+ * instantiate the object.
+ */
+ Builder builder = new ProbabilisticMimeDetectionSelector.Builder();
+ ProbabilisticMimeDetectionSelector proDetector = new ProbabilisticMimeDetectionSelector(
+ types, builder.priorMagicFileType(0.5f)
+ .priorExtensionFileType(0.5f)
+ .priorMetaFileType(0.5f));
+ return new DefaultProbDetector(proDetector, loader);
+ }
+ });
+ } catch (TikaException e) {
+ // Fail here instead of printing the stack trace and leaving the
+ // fields null, which would surface later as an unrelated NPE.
+ throw new IllegalStateException(
+ "Unable to set up Tika with the probabilistic detector", e);
+ } catch (IOException e) {
+ throw new IllegalStateException(
+ "Unable to set up Tika with the probabilistic detector", e);
+ }
+ }
+
+ /**
+ * Runs detection through the Tika facade over the bundled sample
+ * resources, in a fixed order, and checks each expected media type.
+ */
+ @Test
+ public void testDetection() throws Exception {
+ // {expected type, resource name} pairs checked before the RDF samples
+ final String[][] leadingSamples = {
+ {"image/svg+xml", "circles.svg"},
+ {"image/svg+xml", "circles-with-prefix.svg"},
+ {"image/png", "datamatrix.png"},
+ {"text/html", "test.html"},
+ {"application/xml", "test-iso-8859-1.xml"},
+ {"application/xml", "test-utf8.xml"},
+ {"application/xml", "test-utf8-bom.xml"},
+ {"application/xml", "test-utf16le.xml"},
+ {"application/xml", "test-utf16be.xml"},
+ {"application/xml", "test-long-comment.xml"},
+ {"application/xslt+xml", "stylesheet.xsl"},
+ };
+ for (String[] sample : leadingSamples) {
+ testFile(sample[0], sample[1]);
+ }
+
+ // RDF samples are read from local resources but named by their URL
+ testUrl("application/rdf+xml",
+ "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
+ "test-difficult-rdf1.xml");
+ testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#",
+ "test-difficult-rdf2.xml");
+
+ final String[][] trailingSamples = {
+ // evil test from TIKA-327
+ {"text/html", "test-tika-327.html"},
+ // another evil html test from TIKA-357
+ {"text/html", "testlargerbuffer.html"},
+ // fragment of HTML with <div> (TIKA-1102)
+ {"text/html", "htmlfragment"},
+ // binary CGM detection (TIKA-1170)
+ {"image/cgm", "plotutils-bin-cgm-v3.cgm"},
+ // malformed HTML file, previously identified as image/cgm (TIKA-1170)
+ {"text/html", "test-malformed-header.html.bin"},
+ };
+ for (String[] sample : trailingSamples) {
+ testFile(sample[0], sample[1]);
+ }
+ }
+
+ /**
+ * Checks that the text "test" preceded by U+FEFF is reported as the string
+ * form of {@code MediaType.TEXT_PLAIN} when encoded as UTF-16LE, UTF-16BE
+ * and UTF-8.
+ */
+ @Test
+ public void testByteOrderMark() throws Exception {
+ final String bomText = "\ufefftest";
+
+ InputStream utf16le = new ByteArrayInputStream(bomText.getBytes("UTF-16LE"));
+ assertEquals(MediaType.TEXT_PLAIN.toString(),
+ tika.detect(utf16le, new Metadata()));
+
+ InputStream utf16be = new ByteArrayInputStream(bomText.getBytes("UTF-16BE"));
+ assertEquals(MediaType.TEXT_PLAIN.toString(),
+ tika.detect(utf16be, new Metadata()));
+
+ InputStream utf8 = new ByteArrayInputStream(bomText.getBytes(UTF8));
+ assertEquals(MediaType.TEXT_PLAIN.toString(),
+ tika.detect(utf8, new Metadata()));
+ }
+
+ /**
+ * Checks a series of specialization relationships reported by the default
+ * media type registry.
+ */
+ @Test
+ public void testSuperTypes() {
+ final MediaType textSomething = MediaType.parse("text/something");
+ final MediaType textSomethingUtf8 =
+ MediaType.parse("text/something; charset=UTF-8");
+
+ // a parameterised type against its bare form, text/plain and octet-stream
+ assertTrue(registry.isSpecializationOf(textSomethingUtf8, textSomething));
+ assertTrue(registry.isSpecializationOf(textSomethingUtf8,
+ MediaType.TEXT_PLAIN));
+ assertTrue(registry.isSpecializationOf(textSomethingUtf8,
+ MediaType.OCTET_STREAM));
+
+ assertTrue(registry.isSpecializationOf(textSomething,
+ MediaType.TEXT_PLAIN));
+
+ // +xml and +zip suffixes
+ final MediaType somethingXml = MediaType.parse("application/something+xml");
+ assertTrue(registry.isSpecializationOf(somethingXml,
+ MediaType.APPLICATION_XML));
+
+ final MediaType somethingZip = MediaType.parse("application/something+zip");
+ assertTrue(registry.isSpecializationOf(somethingZip,
+ MediaType.APPLICATION_ZIP));
+
+ assertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML,
+ MediaType.TEXT_PLAIN));
+
+ final MediaType iwork = MediaType.parse("application/vnd.apple.iwork");
+ assertTrue(registry.isSpecializationOf(iwork, MediaType.APPLICATION_ZIP));
+ }
+
+ /**
+ * Opens a stream on the given URL and hands it to
+ * {@code testStream}, using the URL itself as the name.
+ */
+ @SuppressWarnings("unused")
+ private void testUrlOnly(String expected, String url) throws IOException {
+ testStream(expected, url, new URL(url).openStream());
+ }
+
+ /**
+ * Loads {@code file} as a class resource and hands it to
+ * {@code testStream}, using {@code url} (not the file name) as the name.
+ */
+ private void testUrl(String expected, String url, String file)
+ throws IOException {
+ testStream(expected, url, getClass().getResourceAsStream(file));
+ }
+
+ /**
+ * Loads {@code filename} as a class resource and hands it to
+ * {@code testStream} under that same name.
+ */
+ private void testFile(String expected, String filename) throws IOException {
+ testStream(expected, filename, getClass().getResourceAsStream(filename));
+ }
+
+ /**
+ * Detects the type of the stream twice through the Tika facade, first
+ * with empty metadata and then with the resource name set, and compares
+ * both results with {@code expected}. The stream is closed afterwards.
+ */
+ private void testStream(String expected, String urlOrFileName,
+ InputStream in) throws IOException {
+ assertNotNull("Test stream: [" + urlOrFileName + "] is null!", in);
+ // wrap streams that cannot mark/reset in a buffered stream
+ final InputStream stream =
+ in.markSupported() ? in : new java.io.BufferedInputStream(in);
+ try {
+ final Metadata hints = new Metadata();
+
+ String detected = tika.detect(stream, hints).toString();
+ assertEquals(
+ urlOrFileName + " is not properly detected: detected.",
+ expected, detected);
+
+ // Same stream again, this time with the resource name as a hint
+ hints.set(Metadata.RESOURCE_NAME_KEY, urlOrFileName);
+ detected = tika.detect(stream, hints).toString();
+ assertEquals(urlOrFileName
+ + " is not properly detected after adding resource name.",
+ expected, detected);
+ } finally {
+ stream.close();
+ }
+ }
+
+ /**
+ * Fails with the given message when the stream is {@code null}.
+ * Previously this was an empty stub, so a missing test resource was not
+ * reported here and only showed up later as a NullPointerException.
+ *
+ * @param string failure message to report
+ * @param in stream that must not be {@code null}
+ * @throws AssertionError if {@code in} is {@code null}
+ */
+ private void assertNotNull(String string, InputStream in) {
+ if (in == null) {
+ throw new AssertionError(string);
+ }
+ }
+
+ /**
+ * Checks detection of zero-length input through the Tika facade with no
+ * hints, with a resource name hint and with a content type hint.
+ *
+ * @see <a
+ * href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
+ */
+ @Test
+ public void testEmptyDocument() throws IOException {
+ final byte[] noBytes = new byte[0];
+
+ // no hints at all
+ assertEquals(MediaType.OCTET_STREAM.toString(), tika.detect(
+ new ByteArrayInputStream(noBytes), new Metadata()));
+
+ // resource name hint only
+ final Metadata namehint = new Metadata();
+ namehint.set(Metadata.RESOURCE_NAME_KEY, "test.txt");
+ assertEquals(MediaType.TEXT_PLAIN.toString(),
+ tika.detect(new ByteArrayInputStream(noBytes), namehint));
+
+ // content type hint only
+ final Metadata typehint = new Metadata();
+ typehint.set(Metadata.CONTENT_TYPE, "text/plain");
+ assertEquals(MediaType.TEXT_PLAIN.toString(),
+ tika.detect(new ByteArrayInputStream(noBytes), typehint));
+ }
+
+ /**
+ * Checks that content consisting only of an XML comment is reported as
+ * plain text; this covers things like javascript files wrapped in XML
+ * comment delimiters that aren't actually XML.
+ *
+ * @see <a
+ * href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
+ */
+ @Test
+ public void testNotXML() throws IOException {
+ final byte[] commentOnly = "<!-- test -->".getBytes(UTF8);
+ final InputStream stream = new ByteArrayInputStream(commentOnly);
+ assertEquals(MediaType.TEXT_PLAIN.toString(),
+ tika.detect(stream, new Metadata()));
+ }
+
+ /**
+ * Detects the same magic-detectable document 100 times in a row and
+ * expects the same answer every time. See TIKA-391 for more details.
+ */
+ @Test
+ public void testMimeMagicStability() throws IOException {
+ int remaining = 100;
+ while (remaining > 0) {
+ testFile("application/vnd.ms-excel", "test.xls");
+ remaining--;
+ }
+ }
+
+ /**
+ * Submits the same "Hello, World!" bytes under three different resource
+ * names through the Tika facade and checks which of the two fixture types,
+ * hello/world-file or hello/x-world-hello, is reported each time. See
+ * TIKA-1292 for more details.
+ */
+ @Test
+ public void testMimeMagicClashSamePriority() throws IOException {
+ final byte[] payload = "Hello, World!".getBytes(UTF8);
+ final MediaType worldFile = MediaType.parse("hello/world-file");
+ final MediaType xWorldHello = MediaType.parse("hello/x-world-hello");
+
+ // With a filename, picks the right one
+ final Metadata dotHelloWorld = new Metadata();
+ dotHelloWorld.set(Metadata.RESOURCE_NAME_KEY, "test.hello.world");
+ assertEquals(worldFile.toString(),
+ tika.detect(new ByteArrayInputStream(payload), dotHelloWorld));
+
+ final Metadata dotXHelloWorld = new Metadata();
+ dotXHelloWorld.set(Metadata.RESOURCE_NAME_KEY, "test.x-hello-world");
+ assertEquals(xWorldHello.toString(),
+ tika.detect(new ByteArrayInputStream(payload), dotXHelloWorld));
+
+ // Without, goes for the one that sorts last
+ final Metadata unrelatedName = new Metadata();
+ unrelatedName.set(Metadata.RESOURCE_NAME_KEY, "testingTESTINGtesting");
+ assertEquals(xWorldHello.toString(),
+ tika.detect(new ByteArrayInputStream(payload), unrelatedName));
+ }
}