You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/01/15 16:20:30 UTC
svn commit: r1558420 - in /nutch/trunk: ./ src/plugin/parse-tika/
src/plugin/parse-tika/sample/
src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/
src/plugin/parse-tika/src/test/org/apache/nutch/tika/
Author: lewismc
Date: Wed Jan 15 15:20:29 2014
New Revision: 1558420
URL: http://svn.apache.org/r1558420
Log:
NUTCH-1699 Tika Parser - Image Parse Bug
Added:
nutch/trunk/src/plugin/parse-tika/sample/nutch_logo_tm.gif (with props)
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/parse-tika/build.xml
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1558420&r1=1558419&r2=1558420&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan 15 15:20:29 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1699 Tika Parser - Image Parse Bug (Mehmet Zahid Yüzügüldü, snagel via lewismc)
+
* NUTCH-1695 Add NutchDocument.toString() to ease debugging (markus)
* NUTCH-1675 NutchField to support long (markus)
Modified: nutch/trunk/src/plugin/parse-tika/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/build.xml?rev=1558420&r1=1558419&r2=1558420&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/build.xml (original)
+++ nutch/trunk/src/plugin/parse-tika/build.xml Wed Jan 15 15:20:29 2014
@@ -47,6 +47,7 @@
<include name="*.pdf"/>
<include name="ootest.*"/>
<include name="*.doc"/>
+ <include name="*.gif"/>
</fileset>
</copy>
Added: nutch/trunk/src/plugin/parse-tika/sample/nutch_logo_tm.gif
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/sample/nutch_logo_tm.gif?rev=1558420&view=auto
==============================================================================
Binary file - no diff available.
Propchange: nutch/trunk/src/plugin/parse-tika/sample/nutch_logo_tm.gif
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1558420&r1=1558419&r2=1558420&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Wed Jan 15 15:20:29 2014
@@ -93,6 +93,7 @@ public class TikaParser implements org.a
DocumentFragment root = doc.createDocumentFragment();
DOMBuilder domhandler = new DOMBuilder(doc, root);
ParseContext context = new ParseContext();
+ tikamd.set(Metadata.CONTENT_TYPE, mimeType);
try {
parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd,context);
} catch (Exception e) {
Added: nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java?rev=1558420&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java (added)
+++ nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java Wed Jan 15 15:20:29 2014
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests that the parse-tika plugin extracts image metadata ("width" and "height") from the sample image files.
+ */
+public class TestImageMetadata extends TestCase {
+
+ private String fileSeparator = System.getProperty("file.separator");
+ // The "test.data" system property is defined in ./src/plugin/build-plugin.xml; falls back to "." when unset
+ private String sampleDir = System.getProperty("test.data",".");
+ // Make sure sample files are copied to "test.data" as specified in ./src/plugin/parse-tika/build.xml
+ private String[] sampleFiles = {
+ "nutch_logo_tm.gif",
+ };
+
+ public TestImageMetadata(String name) {
+ super(name);
+ }
+
+ public void testIt() throws ProtocolException, ParseException { // parses every sample file and checks its width/height metadata
+ String urlString;
+ Protocol protocol;
+ Content content;
+ Parse parse;
+
+ for (int i = 0; i < sampleFiles.length; i++) {
+ urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; // file: URL of the sample inside the test data directory
+
+ Configuration conf = NutchConfiguration.create();
+ protocol = new ProtocolFactory(conf).getProtocol(urlString); // protocol implementation chosen by the factory for this URL
+ content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); // read the sample using a fresh CrawlDatum
+ parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl()); // parse explicitly with the parse-tika extension
+
+ assertEquals("121", parse.getData().getMeta("width")); // presumably the pixel width of nutch_logo_tm.gif -- confirm against the sample
+ assertEquals("48", parse.getData().getMeta("height")); // presumably the pixel height of nutch_logo_tm.gif -- confirm against the sample
+ }
+ }
+
+}