You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2005/09/04 23:24:31 UTC
svn commit: r278629 -
/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
Author: jerome
Date: Sun Sep 4 14:24:26 2005
New Revision: 278629
URL: http://svn.apache.org/viewcvs?rev=278629&view=rev
Log:
Use the MimeTypes resolver to determine each zip entry's content type instead of hard-coding the file-extension-to-content-type mapping
Modified:
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=278629&r1=278628&r2=278629&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Sun Sep 4 14:24:26 2005
@@ -6,16 +6,17 @@
package org.apache.nutch.parse.zip;
-import java.util.logging.Logger;
-
+// JDK imports
import java.io.IOException;
import java.io.InputStream;
-import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
+import java.util.logging.Logger;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.net.URL;
+
+// Nutch imports
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParseData;
@@ -24,96 +25,80 @@
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.mime.MimeTypes;
+
/**
*
* @author Rohit Kulkarni & Ashish Vaidya
*/
public class ZipTextExtractor {
- public static final Logger LOG = LogFormatter.getLogger(ZipTextExtractor.class.getName());
+
+ /** Get the MimeTypes resolver instance. */
+ private final static MimeTypes MIME =
+ MimeTypes.get(NutchConf.get().get("mime.types.file"));
+
+ public static final Logger LOG = LogFormatter.getLogger(ZipTextExtractor.class.getName());
+
+
+ /** Creates a new instance of ZipTextExtractor */
+ public ZipTextExtractor() {
+ }
+
+ public String extractText(InputStream input, String url, List outLinksList) throws IOException {
+ String resultText = "";
+ byte temp;
- /** Creates a new instance of ZipTextExtractor */
- public ZipTextExtractor() {
- }
+ ZipInputStream zin = new ZipInputStream(input);
+
+ ZipEntry entry;
- public String extractText(InputStream input, String url, List outLinksList) throws IOException {
- String resultText = "";
- byte temp;
-
- ZipInputStream zin = new ZipInputStream(input);
-
- ZipEntry entry;
-
- while ((entry = zin.getNextEntry()) != null) {
+ while ((entry = zin.getNextEntry()) != null) {
+
+ if (!entry.isDirectory()) {
+ int size = (int) entry.getSize();
+ byte[] b = new byte[size];
+ for(int x = 0; x < size; x++) {
+ int err = zin.read();
+ if(err != -1) {
+ b[x] = (byte)err;
+ }
+ }
+ String newurl = url + "/";
+ String fname = entry.getName();
+ newurl += fname;
+ URL aURL = new URL(newurl);
+ String base = aURL.toString();
+ int i = fname.lastIndexOf('.');
+ if (i != -1) {
+ // Trying to resolve the Mime-Type
+ String contentType = MIME.getMimeType(fname).getName();
+ try {
+ Properties metadata = new Properties();
+ metadata.setProperty("Content-Length", Long.toString(entry.getSize()));
+ metadata.setProperty("Content-Type", contentType);
+ Content content = new Content(newurl, base, b, contentType, metadata);
+ Parser parser = ParserFactory.getParser(contentType, newurl);
+ Parse parse = parser.getParse(content);
+ ParseData theParseData = parse.getData();
+ Outlink[] theOutlinks = theParseData.getOutlinks();
- if (!entry.isDirectory()) {
- int size = (int) entry.getSize();
- byte[] b = new byte[size];
- for(int x = 0; x < size; x++) {
- int err = zin.read();
- if(err != -1) {
- b[x] = (byte)err;
- }
- }
- String newurl = url + "/";
- String fname = entry.getName();
- newurl += fname;
- URL aURL = new URL(newurl);
- String base = aURL.toString();
- int i = fname.lastIndexOf('.');
- if (i != -1) {
- // file name has extension
- String contentType = "";
- String ext = fname.substring(i + 1, fname.length());
- if (ext.equalsIgnoreCase("txt") || ext.equalsIgnoreCase("c")
- || ext.equalsIgnoreCase("cc") || ext.equalsIgnoreCase("pl")
- || ext.equalsIgnoreCase("sh") || ext.equalsIgnoreCase("java")
- || ext.equalsIgnoreCase("cpp")) {
- contentType = "text/plain";
- } else if (ext.equalsIgnoreCase("html") || ext.equalsIgnoreCase("htm")) {
- contentType = "text/html";
- } else if (ext.equalsIgnoreCase("xls") || ext.equalsIgnoreCase("xla")
- || ext.equalsIgnoreCase("xlt") || ext.equalsIgnoreCase("xlw")) {
- contentType = "application/vnd.ms-excel";
- } else if (ext.equalsIgnoreCase("ppt") || ext.equalsIgnoreCase("pps")) {
- contentType = "application/vnd.ms-powerpoint";
- } else if (ext.equalsIgnoreCase("doc")) {
- contentType = "application/msword";
- } else if (ext.equalsIgnoreCase("mp3")) {
- contentType = "audio/mpeg";
- } else if (ext.equalsIgnoreCase("pdf")) {
- contentType = "application/pdf";
- } else if (ext.equalsIgnoreCase("rtf")) {
- contentType = "application/rtf";
- } else if (ext.equalsIgnoreCase("zip")) {
- contentType = "application/zip";
- }
- System.out.println("trying to parse " + fname);
- try {
- Properties metadata = new Properties();
- metadata.setProperty("Content-Length", Long.toString(entry.getSize()));
- metadata.setProperty("Content-Type", contentType);
- Content content = new Content(newurl, base, b, contentType, metadata);
- Parser parser = ParserFactory.getParser(contentType, newurl);
- Parse parse = parser.getParse(content);
- ParseData theParseData = parse.getData();
- Outlink[] theOutlinks = theParseData.getOutlinks();
-
- for(int count = 0; count < theOutlinks.length; count++) {
- outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor()));
- }
-
- resultText += entry.getName() + " " + parse.getText() + " ";
- } catch (ParseException e) {
-
- LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
- }
- }
+ for(int count = 0; count < theOutlinks.length; count++) {
+ outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor()));
}
+
+ resultText += entry.getName() + " " + parse.getText() + " ";
+ } catch (ParseException e) {
+
+ LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
+ }
}
-
- return resultText;
+ }
}
+ return resultText;
+ }
+
}