You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/07/05 22:36:33 UTC
svn commit: r1608130 - in /nutch: branches/2.x/
branches/2.x/src/java/org/apache/nutch/util/
branches/2.x/src/test/org/apache/nutch/util/
branches/2.x/src/testresources/test-mime-util/ trunk/
trunk/src/java/org/apache/nutch/util/ trunk/src/test/org/apache/nutch/util/
trunk/src/testresources/test-mime-util/
Author: snagel
Date: Sat Jul 5 20:36:33 2014
New Revision: 1608130
URL: http://svn.apache.org/r1608130
Log:
NUTCH-1605 MIME type detector recognizes xlsx as zip file
Added:
nutch/branches/2.x/src/test/org/apache/nutch/util/TestMimeUtil.java (with props)
nutch/branches/2.x/src/testresources/test-mime-util/
nutch/branches/2.x/src/testresources/test-mime-util/test.xlsx (with props)
nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java (with props)
nutch/trunk/src/testresources/test-mime-util/
nutch/trunk/src/testresources/test-mime-util/test.xlsx (with props)
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1608130&r1=1608129&r2=1608130&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Jul 5 20:36:33 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1605 MIME type detector recognizes xlsx as zip file (snagel)
+
* NUTCH-385 Improve description of thread related configuration for Fetcher (jnioche,lufeng)
* NUTCH-1798 Crawl script not calling index command correctly (Aaron Bedward via jnioche)
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java?rev=1608130&r1=1608129&r2=1608130&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java Sat Jul 5 20:36:33 2014
@@ -19,13 +19,16 @@ package org.apache.nutch.util;
// JDK imports
import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
// Tika imports
import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
@@ -128,10 +131,10 @@ public final class MimeUtil {
* strategies available within Tika. First, the mime type provided in
* <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}.
* Then the cleaned mime type is looked up in the underlying Tika
- * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} is
- * found, then that mime type is used, otherwise URL resolution is
- * used to try and determine the mime type. If that means is unsuccessful, and
- * if <code>mime.type.magic</code> is enabled in {@link NutchConfiguration},
+ * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType}
+ * is found, then that mime type is used, otherwise URL resolution is
+ * used to try and determine the mime type. However, if
+ * <code>mime.type.magic</code> is enabled in {@link NutchConfiguration},
* then mime type magic resolution is used to try and obtain a
* better-than-the-default approximation of the {@link MimeType}.
*
@@ -145,24 +148,19 @@ public final class MimeUtil {
*/
public String autoResolveContentType(String typeName, String url, byte[] data) {
String retType = null;
- String magicType = null;
MimeType type = null;
String cleanedMimeType = null;
- try {
- cleanedMimeType = MimeUtil.cleanMimeType(typeName) != null ? this.mimeTypes
- .forName(MimeUtil.cleanMimeType(typeName)).getName()
- : null;
- } catch (MimeTypeException mte) {
- // Seems to be a malformed mime type name...
- }
-
+ cleanedMimeType = MimeUtil.cleanMimeType(typeName);
// first try to get the type from the cleaned type name
- try {
- type = cleanedMimeType != null ? this.mimeTypes.forName(cleanedMimeType)
- : null;
- } catch (MimeTypeException e) {
- type = null;
+ if (cleanedMimeType != null) {
+ try {
+ type = mimeTypes.forName(cleanedMimeType);
+ cleanedMimeType = type.getName();
+ } catch (MimeTypeException mte) {
+ // Seems to be a malformed mime type name...
+ cleanedMimeType = null;
+ }
}
// if returned null, or if it's the default type then try url resolution
@@ -172,8 +170,6 @@ public final class MimeUtil {
// mime-type, then guess a mime-type from the url pattern
try {
- TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
- Tika tika = new Tika(tikaConfig);
retType = tika.detect(url) != null ? tika.detect(url) : null;
} catch (Exception e) {
String message = "Problem loading default Tika configuration";
@@ -189,10 +185,21 @@ public final class MimeUtil {
// if it is, and it's not the default mime type, then go with the mime type
// returned by the magic
if (this.mimeMagic) {
- magicType = tika.detect(data);
+ String magicType = null;
+ // pass URL (file name) and (cleansed) content type from protocol to Tika
+ Metadata tikaMeta = new Metadata();
+ tikaMeta.add(Metadata.RESOURCE_NAME_KEY, url);
+ tikaMeta.add(Metadata.CONTENT_TYPE,
+ (cleanedMimeType != null ? cleanedMimeType : typeName));
+ try {
+ InputStream stream = TikaInputStream.get(data);
+ try {
+ magicType = tika.detect(stream, tikaMeta);
+ } finally {
+ stream.close();
+ }
+ } catch (IOException ignore) {}
- // Deprecated in Tika 1.0 See https://issues.apache.org/jira/browse/NUTCH-1230
- //MimeType magicType = this.mimeTypes.getMimeType(data);
if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM)
&& !magicType.equals(MimeTypes.PLAIN_TEXT)
&& retType != null && !retType.equals(magicType)) {
Added: nutch/branches/2.x/src/test/org/apache/nutch/util/TestMimeUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestMimeUtil.java?rev=1608130&view=auto
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestMimeUtil.java (added)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestMimeUtil.java Sat Jul 5 20:36:33 2014
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
+
+import org.apache.hadoop.conf.Configuration;
+
+import com.google.common.io.Files;
+
+import junit.framework.TestCase;
+
+public class TestMimeUtil extends TestCase {
+
+ public static String urlPrefix = "http://localhost/";
+
+ private static Charset defaultCharset = Charset.forName("UTF-8");
+
+ private File sampleDir = new File(System.getProperty("test.build.data", "."),
+ "test-mime-util");
+
+ /** test data, every element on "test page":
+ * <ol>
+ * <li>MIME type</li>
+ * <li>file name (last URL path element)</li>
+ * <li>Content-Type (HTTP header)</li>
+ * <li>content: if empty, do not test MIME magic</li>
+ * </ol>
+ */
+ public static String[][] textBasedFormats = {
+ {
+ "text/html",
+ "test.html",
+ "text/html; charset=utf-8",
+ "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+ + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+ + "<html>\n<head>\n"
+ + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />\n"
+ + "</head>\n<body>Hello, World!</body></html>" },
+ {
+ "text/html",
+ "test.html",
+ "", // no Content-Type in HTTP header => test URL pattern
+ "<!DOCTYPE html>\n<html>\n<head>\n"
+ + "</head>\n<body>Hello, World!</body></html>" },
+ {
+ "application/xhtml+xml",
+ "test.html",
+ "application/xhtml+xml; charset=utf-8",
+ "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
+ + "<html>\n<head>\n"
+ + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
+ + "</head>\n<body>Hello, World!</body></html>" }
+ };
+
+ public static String[][] binaryFiles = {
+ {
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ "test.xlsx",
+ "" }
+ };
+
+ private String getMimeType(String url, File file, String contentType,
+ boolean useMagic) throws IOException {
+ return getMimeType(url, Files.toByteArray(file), contentType, useMagic);
+ }
+
+ private String getMimeType(String url, byte[] bytes, String contentType,
+ boolean useMagic) {
+ Configuration conf = NutchConfiguration.create();
+ conf.setBoolean("mime.type.magic", useMagic);
+ MimeUtil mimeUtil = new MimeUtil(conf);
+ return mimeUtil.autoResolveContentType(contentType, url, bytes);
+ }
+
+ /** use HTTP Content-Type, URL pattern, and MIME magic */
+ public void testWithMimeMagic() {
+ for (String[] testPage : textBasedFormats) {
+ String mimeType = getMimeType(urlPrefix,
+ testPage[3].getBytes(defaultCharset), testPage[2], true);
+ assertEquals("", testPage[0], mimeType);
+ }
+ }
+
+ /** use only HTTP Content-Type (if given) and URL pattern */
+ public void testWithoutMimeMagic() {
+ for (String[] testPage : textBasedFormats) {
+ String mimeType = getMimeType(urlPrefix + testPage[1],
+ testPage[3].getBytes(defaultCharset), testPage[2], false);
+ assertEquals("", testPage[0], mimeType);
+ }
+ }
+
+ /** use only MIME magic (detection from content bytes) */
+ public void testOnlyMimeMagic() {
+ for (String[] testPage : textBasedFormats) {
+ String mimeType = getMimeType(urlPrefix,
+ testPage[3].getBytes(defaultCharset), "", true);
+ assertEquals("", testPage[0], mimeType);
+ }
+ }
+
+ /** test binary file formats (real files) */
+ public void testBinaryFiles() throws IOException {
+ for (String[] testPage : binaryFiles) {
+ File dataFile = new File(sampleDir, testPage[1]);
+ String mimeType = getMimeType(urlPrefix + testPage[1],
+ dataFile, testPage[2], false);
+ assertEquals("", testPage[0], mimeType);
+ }
+ }
+
+}
Propchange: nutch/branches/2.x/src/test/org/apache/nutch/util/TestMimeUtil.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: nutch/branches/2.x/src/testresources/test-mime-util/test.xlsx
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/testresources/test-mime-util/test.xlsx?rev=1608130&view=auto
==============================================================================
Binary file - no diff available.
Propchange: nutch/branches/2.x/src/testresources/test-mime-util/test.xlsx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1608130&r1=1608129&r2=1608130&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Jul 5 20:36:33 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1605 MIME type detector recognizes xlsx as zip file (snagel)
+
* NUTCH-1802 Move TestbedProxy to test environment (jnioche)
* NUTCH-1803 Put test dependencies in a separate lib dir (jnioche)
Modified: nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=1608130&r1=1608129&r2=1608130&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Sat Jul 5 20:36:33 2014
@@ -19,13 +19,16 @@ package org.apache.nutch.util;
// JDK imports
import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
// Tika imports
import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
@@ -128,10 +131,10 @@ public final class MimeUtil {
* strategies available within Tika. First, the mime type provided in
* <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}.
* Then the cleaned mime type is looked up in the underlying Tika
- * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} is
- * found, then that mime type is used, otherwise URL resolution is
- * used to try and determine the mime type. If that means is unsuccessful, and
- * if <code>mime.type.magic</code> is enabled in {@link NutchConfiguration},
+ * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType}
+ * is found, then that mime type is used, otherwise URL resolution is
+ * used to try and determine the mime type. However, if
+ * <code>mime.type.magic</code> is enabled in {@link NutchConfiguration},
* then mime type magic resolution is used to try and obtain a
* better-than-the-default approximation of the {@link MimeType}.
*
@@ -145,24 +148,19 @@ public final class MimeUtil {
*/
public String autoResolveContentType(String typeName, String url, byte[] data) {
String retType = null;
- String magicType = null;
MimeType type = null;
String cleanedMimeType = null;
- try {
- cleanedMimeType = MimeUtil.cleanMimeType(typeName) != null ? this.mimeTypes
- .forName(MimeUtil.cleanMimeType(typeName)).getName()
- : null;
- } catch (MimeTypeException mte) {
- // Seems to be a malformed mime type name...
- }
-
+ cleanedMimeType = MimeUtil.cleanMimeType(typeName);
// first try to get the type from the cleaned type name
- try {
- type = cleanedMimeType != null ? this.mimeTypes.forName(cleanedMimeType)
- : null;
- } catch (MimeTypeException e) {
- type = null;
+ if (cleanedMimeType != null) {
+ try {
+ type = mimeTypes.forName(cleanedMimeType);
+ cleanedMimeType = type.getName();
+ } catch (MimeTypeException mte) {
+ // Seems to be a malformed mime type name...
+ cleanedMimeType = null;
+ }
}
// if returned null, or if it's the default type then try url resolution
@@ -171,8 +169,6 @@ public final class MimeUtil {
// If no mime-type header, or cannot find a corresponding registered
// mime-type, then guess a mime-type from the url pattern
try {
- TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
- Tika tika = new Tika(tikaConfig);
retType = tika.detect(url) != null ? tika.detect(url) : null;
} catch (Exception e) {
String message = "Problem loading default Tika configuration";
@@ -188,10 +184,21 @@ public final class MimeUtil {
// if it is, and it's not the default mime type, then go with the mime type
// returned by the magic
if (this.mimeMagic) {
- magicType = tika.detect(data);
+ String magicType = null;
+ // pass URL (file name) and (cleansed) content type from protocol to Tika
+ Metadata tikaMeta = new Metadata();
+ tikaMeta.add(Metadata.RESOURCE_NAME_KEY, url);
+ tikaMeta.add(Metadata.CONTENT_TYPE,
+ (cleanedMimeType != null ? cleanedMimeType : typeName));
+ try {
+ InputStream stream = TikaInputStream.get(data);
+ try {
+ magicType = tika.detect(stream, tikaMeta);
+ } finally {
+ stream.close();
+ }
+ } catch (IOException ignore) {}
- // Deprecated in Tika 1.0 See https://issues.apache.org/jira/browse/NUTCH-1230
- //MimeType magicType = this.mimeTypes.getMimeType(data);
if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM)
&& !magicType.equals(MimeTypes.PLAIN_TEXT)
&& retType != null && !retType.equals(magicType)) {
Added: nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java?rev=1608130&view=auto
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java (added)
+++ nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java Sat Jul 5 20:36:33 2014
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
+
+import org.apache.hadoop.conf.Configuration;
+
+import com.google.common.io.Files;
+
+import junit.framework.TestCase;
+
+public class TestMimeUtil extends TestCase {
+
+ public static String urlPrefix = "http://localhost/";
+
+ private static Charset defaultCharset = Charset.forName("UTF-8");
+
+ private File sampleDir = new File(System.getProperty("test.build.data", "."),
+ "test-mime-util");
+
+ /** test data, every element on "test page":
+ * <ol>
+ * <li>MIME type</li>
+ * <li>file name (last URL path element)</li>
+ * <li>Content-Type (HTTP header)</li>
+ * <li>content: if empty, do not test MIME magic</li>
+ * </ol>
+ */
+ public static String[][] textBasedFormats = {
+ {
+ "text/html",
+ "test.html",
+ "text/html; charset=utf-8",
+ "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+ + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+ + "<html>\n<head>\n"
+ + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />\n"
+ + "</head>\n<body>Hello, World!</body></html>" },
+ {
+ "text/html",
+ "test.html",
+ "", // no Content-Type in HTTP header => test URL pattern
+ "<!DOCTYPE html>\n<html>\n<head>\n"
+ + "</head>\n<body>Hello, World!</body></html>" },
+ {
+ "application/xhtml+xml",
+ "test.html",
+ "application/xhtml+xml; charset=utf-8",
+ "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
+ + "<html>\n<head>\n"
+ + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
+ + "</head>\n<body>Hello, World!</body></html>" }
+ };
+
+ public static String[][] binaryFiles = {
+ {
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ "test.xlsx",
+ "" }
+ };
+
+ private String getMimeType(String url, File file, String contentType,
+ boolean useMagic) throws IOException {
+ return getMimeType(url, Files.toByteArray(file), contentType, useMagic);
+ }
+
+ private String getMimeType(String url, byte[] bytes, String contentType,
+ boolean useMagic) {
+ Configuration conf = NutchConfiguration.create();
+ conf.setBoolean("mime.type.magic", useMagic);
+ MimeUtil mimeUtil = new MimeUtil(conf);
+ return mimeUtil.autoResolveContentType(contentType, url, bytes);
+ }
+
+ /** use HTTP Content-Type, URL pattern, and MIME magic */
+ public void testWithMimeMagic() {
+ for (String[] testPage : textBasedFormats) {
+ String mimeType = getMimeType(urlPrefix,
+ testPage[3].getBytes(defaultCharset), testPage[2], true);
+ assertEquals("", testPage[0], mimeType);
+ }
+ }
+
+ /** use only HTTP Content-Type (if given) and URL pattern */
+ public void testWithoutMimeMagic() {
+ for (String[] testPage : textBasedFormats) {
+ String mimeType = getMimeType(urlPrefix + testPage[1],
+ testPage[3].getBytes(defaultCharset), testPage[2], false);
+ assertEquals("", testPage[0], mimeType);
+ }
+ }
+
+ /** use only MIME magic (detection from content bytes) */
+ public void testOnlyMimeMagic() {
+ for (String[] testPage : textBasedFormats) {
+ String mimeType = getMimeType(urlPrefix,
+ testPage[3].getBytes(defaultCharset), "", true);
+ assertEquals("", testPage[0], mimeType);
+ }
+ }
+
+ /** test binary file formats (real files) */
+ public void testBinaryFiles() throws IOException {
+ for (String[] testPage : binaryFiles) {
+ File dataFile = new File(sampleDir, testPage[1]);
+ String mimeType = getMimeType(urlPrefix + testPage[1],
+ dataFile, testPage[2], false);
+ assertEquals("", testPage[0], mimeType);
+ }
+ }
+
+}
Propchange: nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: nutch/trunk/src/testresources/test-mime-util/test.xlsx
URL: http://svn.apache.org/viewvc/nutch/trunk/src/testresources/test-mime-util/test.xlsx?rev=1608130&view=auto
==============================================================================
Binary file - no diff available.
Propchange: nutch/trunk/src/testresources/test-mime-util/test.xlsx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream