You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/01/15 16:14:33 UTC

svn commit: r1558418 - in /nutch/branches/2.x: ./ src/java/org/apache/nutch/parse/ src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/ src/plugin/parse-tika/ src/plugin/parse-tika/sample/ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/ src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/

Author: lewismc
Date: Wed Jan 15 15:14:32 2014
New Revision: 1558418

URL: http://svn.apache.org/r1558418
Log:
NUTCH-1699 Tika Parser - Image Parse Bug

Added:
    nutch/branches/2.x/src/plugin/parse-tika/sample/nutch_logo_tm.gif   (with props)
    nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
    nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
    nutch/branches/2.x/src/plugin/parse-tika/build.xml
    nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1558418&r1=1558417&r2=1558418&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Jan 15 15:14:32 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1699 Tika Parser - Image Parse Bug (Mehmet Zahid Yüzügüldü, snagel via lewismc)
+
 * NUTCH-1568 port pluggable indexing architecture to 2.x (Talat UYARER via lewismc)
 
 * NUTCH-1672 Inlinks are added twice in DbUpdateReducer (Tien Nguyen Manh via lewismc)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1558418&r1=1558417&r2=1558418&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Wed Jan 15 15:14:32 2014
@@ -32,7 +32,6 @@ import org.apache.hadoop.util.StringUtil
 import org.apache.nutch.crawl.CrawlStatus;
 import org.apache.nutch.crawl.Signature;
 import org.apache.nutch.crawl.SignatureFactory;
-import org.apache.nutch.crawl.URLWebPage;
 import org.apache.nutch.fetcher.FetcherJob;
 import org.apache.nutch.net.URLFilterException;
 import org.apache.nutch.net.URLFilters;

Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java?rev=1558418&r1=1558417&r2=1558418&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java (original)
+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java Wed Jan 15 15:14:32 2014
@@ -32,6 +32,7 @@ import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.junit.Test;
+
 import static org.junit.Assert.*;
 
 /**
@@ -65,6 +66,7 @@ public class TestRelTagParser {
   public void testRelTagParser() throws ParseException, ProtocolException, IOException {
     conf = NutchConfiguration.create();
     conf.set("file.content.limit", "-1");
+    @SuppressWarnings("unused")
     Parse parse;
     String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
 

Modified: nutch/branches/2.x/src/plugin/parse-tika/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/build.xml?rev=1558418&r1=1558417&r2=1558418&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/build.xml (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/build.xml Wed Jan 15 15:14:32 2014
@@ -34,6 +34,7 @@
       <include name="*.pdf"/>
       <include name="ootest.*"/>
       <include name="*.doc"/>
+      <include name="*.gif"/>
     </fileset>
   </copy>
   

Added: nutch/branches/2.x/src/plugin/parse-tika/sample/nutch_logo_tm.gif
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/sample/nutch_logo_tm.gif?rev=1558418&view=auto
==============================================================================
Binary file - no diff available.

Propchange: nutch/branches/2.x/src/plugin/parse-tika/sample/nutch_logo_tm.gif
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1558418&r1=1558417&r2=1558418&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Wed Jan 15 15:14:32 2014
@@ -112,6 +112,7 @@ public class TikaParser implements org.a
     ParseContext context = new ParseContext();
     // to add once available in Tika
     // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
+    tikamd.set(Metadata.CONTENT_TYPE, mimeType);
     try {
       parser.parse(new ByteArrayInputStream(raw.array(), raw.arrayOffset() + raw.position(),
           raw.remaining()), domhandler, tikamd, context);
@@ -234,6 +235,7 @@ public class TikaParser implements org.a
     String url = "file:" + name;
     File file = new File(name);
     byte[] bytes = new byte[(int) file.length()];
+    @SuppressWarnings("resource")
     DataInputStream in = new DataInputStream(new FileInputStream(file));
     in.readFully(bytes);
     Configuration conf = NutchConfiguration.create();

Added: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java?rev=1558418&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java (added)
+++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java Wed Jan 15 15:14:32 2014
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import static org.junit.Assert.*;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Test;
+
+/**
+ * Checks that parsing an image through {@link ParseUtil} leaves the image
+ * dimensions in the page metadata (NUTCH-1699).
+ */
+public class TestImageMetadata {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data",".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-tika/build.xml
+  private String[] sampleFiles = {"nutch_logo_tm.gif"};
+
+  /**
+   * Parses every sample image and asserts the "width" and "height" entries
+   * found in the page metadata afterwards. The returned {@link Parse} is
+   * not inspected.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException, IOException {
+    Configuration conf = NutchConfiguration.create();
+    MimeUtil mimeutil = new MimeUtil(conf);
+
+    for (String sampleFile : sampleFiles) {
+      String path = sampleDir + fileSeparator + sampleFile;
+      String urlString = "file:" + path;
+      File file = new File(path);
+
+      WebPage page = new WebPage();
+      page.setBaseUrl(new Utf8(urlString));
+      page.setContent(ByteBuffer.wrap(readFully(file)));
+      page.setContentType(new Utf8(mimeutil.getMimeType(file)));
+
+      new ParseUtil(conf).parse(urlString, page);
+
+      assertEquals("121", metadataValue(page, "width"));
+      assertEquals("48", metadataValue(page, "height"));
+    }
+  }
+
+  /** Reads the whole file into memory, closing the stream even on failure. */
+  private static byte[] readFully(File file) throws IOException {
+    byte[] bytes = new byte[(int) file.length()];
+    DataInputStream in = new DataInputStream(new FileInputStream(file));
+    try {
+      in.readFully(bytes);
+    } finally {
+      in.close();
+    }
+    return bytes;
+  }
+
+  /**
+   * Returns the page metadata entry stored under {@code key} as a string,
+   * failing the test with a readable message when the entry is missing.
+   */
+  private static String metadataValue(WebPage page, String key)
+      throws IOException {
+    ByteBuffer buffer = page.getFromMetadata(new Utf8(key));
+    assertNotNull("no '" + key + "' metadata on parsed page", buffer);
+    byte[] bytes = new byte[buffer.remaining()];
+    buffer.get(bytes);
+    // Explicit charset so decoding does not depend on the platform default.
+    // NOTE(review): assumes metadata values are UTF-8 encoded; confirm
+    // against the code that writes them.
+    return new String(bytes, "UTF-8");
+  }
+}