You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/01/15 16:20:30 UTC

svn commit: r1558420 - in /nutch/trunk: ./ src/plugin/parse-tika/ src/plugin/parse-tika/sample/ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/ src/plugin/parse-tika/src/test/org/apache/nutch/tika/

Author: lewismc
Date: Wed Jan 15 15:20:29 2014
New Revision: 1558420

URL: http://svn.apache.org/r1558420
Log:
NUTCH-1699 Tika Parser - Image Parse Bug

Added:
    nutch/trunk/src/plugin/parse-tika/sample/nutch_logo_tm.gif   (with props)
    nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/parse-tika/build.xml
    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1558420&r1=1558419&r2=1558420&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan 15 15:20:29 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-1699 Tika Parser - Image Parse Bug (Mehmet Zahid Yüzügüldü, snagel via lewismc)
+
 * NUTCH-1695 Add NutchDocument.toString() to ease debugging (markus)
 
 * NUTCH-1675 NutchField to support long (markus)

Modified: nutch/trunk/src/plugin/parse-tika/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/build.xml?rev=1558420&r1=1558419&r2=1558420&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/build.xml (original)
+++ nutch/trunk/src/plugin/parse-tika/build.xml Wed Jan 15 15:20:29 2014
@@ -47,6 +47,7 @@
       <include name="*.pdf"/>
       <include name="ootest.*"/>
       <include name="*.doc"/>
+      <include name="*.gif"/>
     </fileset>
   </copy>
   

Added: nutch/trunk/src/plugin/parse-tika/sample/nutch_logo_tm.gif
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/sample/nutch_logo_tm.gif?rev=1558420&view=auto
==============================================================================
Binary file - no diff available.

Propchange: nutch/trunk/src/plugin/parse-tika/sample/nutch_logo_tm.gif
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1558420&r1=1558419&r2=1558420&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Wed Jan 15 15:20:29 2014
@@ -93,6 +93,7 @@ public class TikaParser implements org.a
 		DocumentFragment root = doc.createDocumentFragment();
 		DOMBuilder domhandler = new DOMBuilder(doc, root);
 		ParseContext context = new ParseContext();
+		tikamd.set(Metadata.CONTENT_TYPE, mimeType);
 		try {
 		  parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd,context);
 		} catch (Exception e) {

Added: nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java?rev=1558420&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java (added)
+++ nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java Wed Jan 15 15:20:29 2014
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+
+import junit.framework.TestCase;
+
+/** 
+ * Tests extraction of image metadata: parses each sample image with the "parse-tika" plugin and checks its "width" and "height" metadata.
+ */
+public class TestImageMetadata extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml (falls back to "." when unset)
+  private String sampleDir = System.getProperty("test.data",".");
+  // Make sure sample files are copied to "test.data" as specified in ./src/plugin/parse-tika/build.xml
+  private String[] sampleFiles = {
+      "nutch_logo_tm.gif",
+  };
+
+  public TestImageMetadata(String name) { 
+    super(name); 
+  }
+
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      Configuration conf = NutchConfiguration.create();
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
+
+      assertEquals("121", parse.getData().getMeta("width")); // expected value for the sample file nutch_logo_tm.gif
+      assertEquals("48", parse.getData().getMeta("height")); // metadata values are compared as strings
+    }
+  }
+
+}