You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/01/15 16:14:33 UTC
svn commit: r1558418 - in /nutch/branches/2.x: ./
src/java/org/apache/nutch/parse/
src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/
src/plugin/parse-tika/ src/plugin/parse-tika/sample/
src/plugin/parse-tika/src/java/org/apa...
Author: lewismc
Date: Wed Jan 15 15:14:32 2014
New Revision: 1558418
URL: http://svn.apache.org/r1558418
Log:
NUTCH-1699 Tika Parser - Image Parse Bug
Added:
nutch/branches/2.x/src/plugin/parse-tika/sample/nutch_logo_tm.gif (with props)
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
nutch/branches/2.x/src/plugin/parse-tika/build.xml
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1558418&r1=1558417&r2=1558418&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Jan 15 15:14:32 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1699 Tika Parser - Image Parse Bug (Mehmet Zahid Yüzügüldü, snagel via lewismc)
+
* NUTCH-1568 port pluggable indexing architecture to 2.x (Talat UYARER via lewismc)
* NUTCH-1672 Inlinks are added twice in DbUpdateReducer (Tien Nguyen Manh via lewismc)
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1558418&r1=1558417&r2=1558418&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Wed Jan 15 15:14:32 2014
@@ -32,7 +32,6 @@ import org.apache.hadoop.util.StringUtil
import org.apache.nutch.crawl.CrawlStatus;
import org.apache.nutch.crawl.Signature;
import org.apache.nutch.crawl.SignatureFactory;
-import org.apache.nutch.crawl.URLWebPage;
import org.apache.nutch.fetcher.FetcherJob;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java?rev=1558418&r1=1558417&r2=1558418&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java (original)
+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java Wed Jan 15 15:14:32 2014
@@ -32,6 +32,7 @@ import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Test;
+
import static org.junit.Assert.*;
/**
@@ -65,6 +66,7 @@ public class TestRelTagParser {
public void testRelTagParser() throws ParseException, ProtocolException, IOException {
conf = NutchConfiguration.create();
conf.set("file.content.limit", "-1");
+ @SuppressWarnings("unused")
Parse parse;
String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
Modified: nutch/branches/2.x/src/plugin/parse-tika/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/build.xml?rev=1558418&r1=1558417&r2=1558418&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/build.xml (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/build.xml Wed Jan 15 15:14:32 2014
@@ -34,6 +34,7 @@
<include name="*.pdf"/>
<include name="ootest.*"/>
<include name="*.doc"/>
+ <include name="*.gif"/>
</fileset>
</copy>
Added: nutch/branches/2.x/src/plugin/parse-tika/sample/nutch_logo_tm.gif
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/sample/nutch_logo_tm.gif?rev=1558418&view=auto
==============================================================================
Binary file - no diff available.
Propchange: nutch/branches/2.x/src/plugin/parse-tika/sample/nutch_logo_tm.gif
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1558418&r1=1558417&r2=1558418&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Wed Jan 15 15:14:32 2014
@@ -112,6 +112,7 @@ public class TikaParser implements org.a
ParseContext context = new ParseContext();
// to add once available in Tika
// context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
+ tikamd.set(Metadata.CONTENT_TYPE, mimeType);
try {
parser.parse(new ByteArrayInputStream(raw.array(), raw.arrayOffset() + raw.position(),
raw.remaining()), domhandler, tikamd, context);
@@ -234,6 +235,7 @@ public class TikaParser implements org.a
String url = "file:" + name;
File file = new File(name);
byte[] bytes = new byte[(int) file.length()];
+ @SuppressWarnings("resource")
DataInputStream in = new DataInputStream(new FileInputStream(file));
in.readFully(bytes);
Configuration conf = NutchConfiguration.create();
Added: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java?rev=1558418&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java (added)
+++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java Wed Jan 15 15:14:32 2014
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import static org.junit.Assert.*;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Test;
+
+/** Verifies that parsing a sample image fills the page's width/height metadata. */
+public class TestImageMetadata {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data",".");
+  // Make sure sample files are copied to "test.data" as specified in src/plugin/parse-tika/build.xml
+  private String[] sampleFiles = {"nutch_logo_tm.gif"};
+
+  /** Parses every sample file and asserts the image dimensions recorded on the page. */
+  @Test
+  public void testIt() throws ProtocolException, ParseException, IOException {
+    Configuration conf = NutchConfiguration.create();
+    MimeUtil mimeutil = new MimeUtil(conf);
+
+    for (String sampleFile : sampleFiles) {
+      String path = sampleDir + fileSeparator + sampleFile;
+      String urlString = "file:" + path;
+      File file = new File(path);
+      byte[] bytes = new byte[(int) file.length()];
+      DataInputStream in = new DataInputStream(new FileInputStream(file));
+      try {
+        in.readFully(bytes);
+      } finally {
+        in.close(); // close the stream even when the read fails
+      }
+
+      WebPage page = new WebPage();
+      page.setBaseUrl(new Utf8(urlString));
+      page.setContent(ByteBuffer.wrap(bytes));
+      page.setContentType(new Utf8(mimeutil.getMimeType(file)));
+
+      // The returned Parse is not needed; the assertions read the page's metadata.
+      new ParseUtil(conf).parse(urlString, page);
+
+      assertEquals("121", metadataValue(page, "width"));
+      assertEquals("48", metadataValue(page, "height"));
+    }
+  }
+
+  /** Returns the page metadata stored under {@code key} as a string; fails the test if absent. */
+  private static String metadataValue(WebPage page, String key) {
+    ByteBuffer value = page.getFromMetadata(new Utf8(key));
+    assertNotNull("no \"" + key + "\" metadata on page", value);
+    byte[] raw = new byte[value.remaining()];
+    value.duplicate().get(raw); // read through a duplicate so the stored buffer keeps its position
+    return new String(raw);
+  }
+}