You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2005/09/04 22:54:01 UTC
svn commit: r278626 - in /lucene/nutch/trunk/src/plugin: ./ parse-zip/
parse-zip/sample/ parse-zip/src/ parse-zip/src/java/
parse-zip/src/java/org/ parse-zip/src/java/org/apache/
parse-zip/src/java/org/apache/nutch/ parse-zip/src/java/org/apache/nutch/...
Author: jerome
Date: Sun Sep 4 13:53:49 2005
New Revision: 278626
URL: http://svn.apache.org/viewcvs?rev=278626&view=rev
Log:
NUTCH-53, Parser plugin for Zip files (Rohit Kulkarni)
Added:
lucene/nutch/trunk/src/plugin/parse-zip/
lucene/nutch/trunk/src/plugin/parse-zip/build.xml (with props)
lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml (with props)
lucene/nutch/trunk/src/plugin/parse-zip/sample/
lucene/nutch/trunk/src/plugin/parse-zip/sample/test.zip (with props)
lucene/nutch/trunk/src/plugin/parse-zip/src/
lucene/nutch/trunk/src/plugin/parse-zip/src/java/
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (with props)
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (with props)
lucene/nutch/trunk/src/plugin/parse-zip/src/test/
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (with props)
Modified:
lucene/nutch/trunk/src/plugin/build.xml
Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=278626&r1=278625&r2=278626&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Sun Sep 4 13:53:49 2005
@@ -21,6 +21,7 @@
<!-- <ant dir="parse-mp3" target="deploy"/> -->
<!-- <ant dir="parse-rtf" target="deploy"/> -->
<ant dir="parse-ext" target="deploy"/>
+ <ant dir="parse-zip" target="deploy"/>
<ant dir="index-basic" target="deploy"/>
<ant dir="index-more" target="deploy"/>
<ant dir="query-basic" target="deploy"/>
@@ -48,6 +49,7 @@
<!-- <ant dir="parse-mp3" target="test"/> -->
<!-- <ant dir="parse-rtf" target="test"/> -->
<ant dir="parse-ext" target="test"/>
+ <ant dir="parse-zip" target="test"/>
<ant dir="creativecommons" target="test"/>
<ant dir="languageidentifier" target="test"/>
<ant dir="ontology" target="test"/>
@@ -72,6 +74,7 @@
<ant dir="parse-mp3" target="clean"/>
<ant dir="parse-rtf" target="clean"/>
<ant dir="parse-ext" target="clean"/>
+ <ant dir="parse-zip" target="clean"/>
<ant dir="index-basic" target="clean"/>
<ant dir="index-more" target="clean"/>
<ant dir="query-basic" target="clean"/>
Added: lucene/nutch/trunk/src/plugin/parse-zip/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/build.xml?rev=278626&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-zip/build.xml Sun Sep 4 13:53:49 2005
@@ -0,0 +1,15 @@
+<?xml version="1.0"?>
+
+<project name="parse-zip" default="jar">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data" />
+ <copy todir="${build.test}/data">
+ <fileset dir="sample">
+ <include name="*.zip" />
+ </fileset>
+ </copy>
+
+</project>
Propchange: lucene/nutch/trunk/src/plugin/parse-zip/build.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml?rev=278626&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml Sun Sep 4 13:53:49 2005
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+ id="parse-zip"
+ name="Zip Parse Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="parse-zip.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <extension id="org.apache.nutch.parse.zip"
+ name="ZipParser"
+ point="org.apache.nutch.parse.Parser">
+
+ <implementation id="org.apache.nutch.parse.zip.ZipParser"
+ class="org.apache.nutch.parse.zip.ZipParser"
+ contentType="application/zip"
+ pathSuffix="zip"/>
+ </extension>
+
+</plugin>
Propchange: lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-zip/sample/test.zip
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/sample/test.zip?rev=278626&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-zip/sample/test.zip
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=278626&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Sun Sep 4 13:53:49 2005
@@ -0,0 +1,101 @@
+/*
+ * ZipParser.java
+ *
+ * Nutch parse plugin for zip files - Content Type : application/zip
+ */
+
+package org.apache.nutch.parse.zip;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.util.Properties;
+import java.util.logging.Logger;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.LogFormatter;
+
+/**
+ * Nutch {@link Parser} implementation for zip archives
+ * (Content-Type <code>application/zip</code>).
+ *
+ * <p>This class validates the incoming {@link Content}, delegates the walk
+ * over the archive entries to {@link ZipTextExtractor} and wraps the
+ * extracted text and collected outlinks into a {@link Parse}.</p>
+ *
+ * <p>ZipParser class based on MSPowerPointParser class by Stephan
+ * Strittmatter.</p>
+ *
+ * @author Rohit Kulkarni & Ashish Vaidya
+ */
+public class ZipParser implements Parser {
+
+  private static final Logger LOG = LogFormatter.getLogger(ZipParser.class.getName());
+
+  /** Creates a new instance of ZipParser */
+  public ZipParser() {
+  }
+
+  /**
+   * Parses zip content and returns the text and outlinks of its entries.
+   *
+   * <p>A failed, empty parse is returned when:</p>
+   * <ul>
+   * <li>the content type is set and does not start with
+   *     <code>application/zip</code>;</li>
+   * <li>a <code>Content-Length</code> value is present and differs from the
+   *     number of bytes actually held by <code>content</code>;</li>
+   * <li>any exception is raised while reading the archive (this includes a
+   *     <code>Content-Length</code> value that is not a valid integer).</li>
+   * </ul>
+   *
+   * <p>On success the title is always the empty string and the parse metadata
+   * is a copy of the content metadata.</p>
+   *
+   * @param content the fetched content to parse
+   * @return the resulting parse, never <code>null</code>
+   */
+  public Parse getParse(final Content content) {
+
+    // check that contentType is one we can handle
+    final String contentType = content.getContentType();
+    if (contentType != null && !contentType.startsWith("application/zip")) {
+      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
+          "Content-Type not application/zip: " + contentType).getEmptyParse();
+    }
+
+    String resultText = null;
+    final List outLinksList = new ArrayList();
+
+    try {
+      final byte[] contentInBytes = content.getContent();
+
+      // Content-Length is optional: only parse and compare it when present.
+      // (Parsing it unconditionally made every content without that header
+      // fail with a NumberFormatException.)
+      final String contentLen = content.get("Content-Length");
+      if (contentLen != null) {
+        final int len = Integer.parseInt(contentLen);
+        LOG.fine("ziplen: " + len);
+        if (contentInBytes.length != len) {
+          return new ParseStatus(ParseStatus.FAILED,
+              ParseStatus.FAILED_TRUNCATED,
+              "Content truncated at " + contentInBytes.length +
+              " bytes. Parser can't handle incomplete zip file.").getEmptyParse();
+        }
+      }
+
+      // extract text
+      final ZipTextExtractor extractor = new ZipTextExtractor();
+      resultText = extractor.extractText(new ByteArrayInputStream(contentInBytes),
+          content.getUrl(), outLinksList);
+
+    } catch (Exception e) {
+      return new ParseStatus(ParseStatus.FAILED,
+          "Can't be handled as Zip document. " + e).getEmptyParse();
+    }
+
+    // collect meta data
+    final Properties metadata = new Properties();
+    metadata.putAll(content.getMetadata()); // copy through
+
+    if (resultText == null) {
+      resultText = "";
+    }
+
+    // No title is extracted from a zip archive.
+    final String resultTitle = "";
+
+    final Outlink[] outlinks = (Outlink[]) outLinksList.toArray(new Outlink[0]);
+    final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
+        resultTitle,
+        outlinks,
+        metadata);
+
+    LOG.finest("Zip file parsed sucessfully !!");
+    return new ParseImpl(resultText, parseData);
+  }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=278626&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Sun Sep 4 13:53:49 2005
@@ -0,0 +1,119 @@
+/*
+ * ZipTextExtractor.java
+ *
+ *
+ */
+
+package org.apache.nutch.parse.zip;
+
+import java.util.logging.Logger;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+import java.net.URL;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.LogFormatter;
+
+/**
+ * Extracts text and outlinks from the entries of a zip archive by handing
+ * each entry to the parser that {@link ParserFactory} returns for the content
+ * type guessed from the entry's file name extension.
+ *
+ * @author Rohit Kulkarni & Ashish Vaidya
+ */
+public class ZipTextExtractor {
+  public static final Logger LOG = LogFormatter.getLogger(ZipTextExtractor.class.getName());
+
+  /** Size of the buffer used to read entry data. */
+  private static final int BUFFER_SIZE = 8192;
+
+  /** Creates a new instance of ZipTextExtractor */
+  public ZipTextExtractor() {
+  }
+
+  /**
+   * Reads the zip archive from <code>input</code> and parses every
+   * non-directory entry whose name contains a <code>'.'</code>.
+   *
+   * <p>For each parsed entry, the outlinks are appended to
+   * <code>outLinksList</code> and the entry name followed by the parsed text
+   * is appended to the result, each terminated by a single space. Entries
+   * for which a {@link ParseException} is raised are logged and skipped.
+   * The given stream is not closed by this method.</p>
+   *
+   * @param input stream positioned at the start of the zip data
+   * @param url url of the archive; <code>url + "/" + entryName</code> is used
+   *        as the url of each entry
+   * @param outLinksList list receiving the {@link Outlink}s of all entries
+   * @return the concatenated entry names and texts, possibly empty
+   * @throws IOException if the archive cannot be read or an entry url is
+   *         malformed
+   */
+  public String extractText(InputStream input, String url, List outLinksList) throws IOException {
+    final StringBuffer resultText = new StringBuffer();
+    final ZipInputStream zin = new ZipInputStream(input);
+
+    ZipEntry entry;
+    while ((entry = zin.getNextEntry()) != null) {
+
+      if (entry.isDirectory()) {
+        continue;
+      }
+
+      final String fname = entry.getName();
+      final String newurl = url + "/" + fname;
+      // Built for every file entry so a malformed url still surfaces as an
+      // IOException, whether or not the entry ends up being parsed.
+      final String base = new URL(newurl).toString();
+
+      final int dot = fname.lastIndexOf('.');
+      if (dot == -1) {
+        // no extension: the content type cannot be guessed, skip the entry
+        continue;
+      }
+      final String contentType = guessContentType(fname.substring(dot + 1));
+
+      // ZipEntry.getSize() is -1 when the size is unknown, so the data is
+      // read up to the end of the entry instead of trusting the declared size.
+      final byte[] data = readEntry(zin);
+
+      LOG.fine("trying to parse " + fname);
+      try {
+        final Properties metadata = new Properties();
+        // report the number of bytes actually read
+        metadata.setProperty("Content-Length", Integer.toString(data.length));
+        metadata.setProperty("Content-Type", contentType);
+        final Content content = new Content(newurl, base, data, contentType, metadata);
+        final Parser parser = ParserFactory.getParser(contentType, newurl);
+        final Parse parse = parser.getParse(content);
+        final ParseData theParseData = parse.getData();
+        final Outlink[] theOutlinks = theParseData.getOutlinks();
+
+        for (int count = 0; count < theOutlinks.length; count++) {
+          outLinksList.add(new Outlink(theOutlinks[count].getToUrl(),
+              theOutlinks[count].getAnchor()));
+        }
+
+        resultText.append(fname).append(" ").append(parse.getText()).append(" ");
+      } catch (ParseException e) {
+        LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
+      }
+    }
+
+    return resultText.toString();
+  }
+
+  /** Reads the remaining bytes of the current zip entry. */
+  private static byte[] readEntry(ZipInputStream zin) throws IOException {
+    final java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
+    final byte[] buffer = new byte[BUFFER_SIZE];
+    int read;
+    while ((read = zin.read(buffer, 0, buffer.length)) != -1) {
+      out.write(buffer, 0, read);
+    }
+    return out.toByteArray();
+  }
+
+  /** Maps a file name extension to a content type, or "" when it is not known. */
+  private static String guessContentType(String ext) {
+    if (ext.equalsIgnoreCase("txt") || ext.equalsIgnoreCase("c")
+        || ext.equalsIgnoreCase("cc") || ext.equalsIgnoreCase("pl")
+        || ext.equalsIgnoreCase("sh") || ext.equalsIgnoreCase("java")
+        || ext.equalsIgnoreCase("cpp")) {
+      return "text/plain";
+    } else if (ext.equalsIgnoreCase("html") || ext.equalsIgnoreCase("htm")) {
+      return "text/html";
+    } else if (ext.equalsIgnoreCase("xls") || ext.equalsIgnoreCase("xla")
+        || ext.equalsIgnoreCase("xlt") || ext.equalsIgnoreCase("xlw")) {
+      return "application/vnd.ms-excel";
+    } else if (ext.equalsIgnoreCase("ppt") || ext.equalsIgnoreCase("pps")) {
+      return "application/vnd.ms-powerpoint";
+    } else if (ext.equalsIgnoreCase("doc")) {
+      return "application/msword";
+    } else if (ext.equalsIgnoreCase("mp3")) {
+      return "audio/mpeg";
+    } else if (ext.equalsIgnoreCase("pdf")) {
+      return "application/pdf";
+    } else if (ext.equalsIgnoreCase("rtf")) {
+      return "application/rtf";
+    } else if (ext.equalsIgnoreCase("zip")) {
+      return "application/zip";
+    }
+    return "";
+  }
+
+}
+
Propchange: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?rev=278626&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java Sun Sep 4 13:53:49 2005
@@ -0,0 +1,63 @@
+/*
+ * TestZipParser.java
+ */
+
+package org.apache.nutch.parse.zip;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit test for the zip parser plugin: fetches each sample archive through
+ * the protocol layer, parses it and compares the extracted text.
+ * Based on Unit tests for MSWordParser by John Xing
+ *
+ * @author Rohit Kulkarni & Ashish Vaidya
+ */
+public class TestZipParser extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  // Make sure sample files are copied to "test.data"
+
+  private String[] sampleFiles = {"test.zip"};
+
+  // Text expected from every sample file: entry name, then the entry text,
+  // each followed by a single space.
+  private String expectedText = "textfile.txt This is text file number 1 ";
+
+  public TestZipParser(String name) {
+    super(name);
+  }
+
+  protected void setUp() {}
+
+  protected void tearDown() {}
+
+  /**
+   * Parses every sample file and checks the extracted text.
+   *
+   * @throws ProtocolException if a sample file cannot be fetched
+   * @throws ParseException if no parser can be obtained for the content
+   */
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parser parser;
+    Parse parse;
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      protocol = ProtocolFactory.getProtocol(urlString);
+      content = protocol.getProtocolOutput(urlString).getContent();
+
+      parser = ParserFactory.getParser(content.getContentType(), urlString);
+      parse = parser.getParse(content);
+      // assertEquals reports both values on failure and tolerates null text
+      assertEquals(expectedText, parse.getText());
+    }
+  }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
------------------------------------------------------------------------------
svn:eol-style = native