You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/01/31 17:13:17 UTC
svn commit: r373853 [5/6] - in /lucene/nutch/trunk/src:
java/org/apache/nutch/analysis/ java/org/apache/nutch/clustering/
java/org/apache/nutch/crawl/ java/org/apache/nutch/fetcher/
java/org/apache/nutch/fs/ java/org/apache/nutch/indexer/ java/org/apac...
Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Tue Jan 31 08:08:58 2006
@@ -28,6 +28,7 @@
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.CommandRunner;
+import org.apache.nutch.util.NutchConf;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.PluginRepository;
@@ -55,39 +56,13 @@
static final int TIMEOUT_DEFAULT = 30; // in seconds
// handy map from String contentType to String[] {command, timeoutString}
- static Hashtable TYPE_PARAMS_MAP = new Hashtable();
+ Hashtable TYPE_PARAMS_MAP = new Hashtable();
- // set TYPE_PARAMS_MAP using plugin.xml of this plugin
- static {
- Extension[] extensions = PluginRepository.getInstance()
- .getExtensionPoint("org.apache.nutch.parse.Parser").getExtensions();
+ private NutchConf nutchConf;
- String contentType, command, timeoutString;
-
- for (int i = 0; i < extensions.length; i++) {
- Extension extension = extensions[i];
-
- // only look for extensions defined by plugin parse-ext
- if (!extension.getDescriptor().getPluginId().equals("parse-ext"))
- continue;
-
- contentType = extension.getAttribute("contentType");
- if (contentType == null || contentType.equals(""))
- continue;
+ private boolean loaded = false;
- command = extension.getAttribute("command");
- if (command == null || command.equals(""))
- continue;
-
- timeoutString = extension.getAttribute("timeout");
- if (timeoutString == null || timeoutString.equals(""))
- timeoutString = "" + TIMEOUT_DEFAULT;
-
- TYPE_PARAMS_MAP.put(contentType, new String[]{command, timeoutString});
- }
- }
-
- public ExtParser () {}
+ public ExtParser () { }
public Parse getParse(Content content) {
@@ -96,7 +71,7 @@
String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
if (params == null)
return new ParseStatus(ParseStatus.FAILED,
- "No external command defined for contentType: " + contentType).getEmptyParse();
+ "No external command defined for contentType: " + contentType).getEmptyParse(getConf());
String command = params[0];
int timeout = Integer.parseInt(params[1]);
@@ -118,7 +93,7 @@
return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
"Content truncated at " + raw.length
+" bytes. Parser can't handle incomplete "
- + contentType + " file.").getEmptyParse();
+ + contentType + " file.").getEmptyParse(getConf());
}
ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
@@ -138,12 +113,12 @@
if (cr.getExitValue() != 0)
return new ParseStatus(ParseStatus.FAILED,
"External command " + command
- + " failed with error: " + es.toString()).getEmptyParse();
+ + " failed with error: " + es.toString()).getEmptyParse(getConf());
text = os.toString();
} catch (Exception e) { // run time exception
- return new ParseStatus(e).getEmptyParse();
+ return new ParseStatus(e).getEmptyParse(getConf());
}
if (text == null)
@@ -153,14 +128,48 @@
title = "";
// collect outlink
- Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
+ Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
// collect meta data
ContentProperties metaData = new ContentProperties();
metaData.putAll(content.getMetadata()); // copy through
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
+ parseData.setConf(this.nutchConf);
return new ParseImpl(text, parseData);
}
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ Extension[] extensions = conf.getPluginRepository().getExtensionPoint(
+ "org.apache.nutch.parse.Parser").getExtensions();
+
+ String contentType, command, timeoutString;
+
+ for (int i = 0; i < extensions.length; i++) {
+ Extension extension = extensions[i];
+
+ // only look for extensions defined by plugin parse-ext
+ if (!extension.getDescriptor().getPluginId().equals("parse-ext"))
+ continue;
+
+ contentType = extension.getAttribute("contentType");
+ if (contentType == null || contentType.equals(""))
+ continue;
+
+ command = extension.getAttribute("command");
+ if (command == null || command.equals(""))
+ continue;
+
+ timeoutString = extension.getAttribute("timeout");
+ if (timeoutString == null || timeoutString.equals(""))
+ timeoutString = "" + TIMEOUT_DEFAULT;
+
+ TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString });
+ }
+ }
+ public NutchConf getConf() {
+ return this.nutchConf;
+ }
}
Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java Tue Jan 31 08:08:58 2006
@@ -24,6 +24,7 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.util.NutchConf;
import org.apache.nutch.io.UTF8;
import org.apache.nutch.crawl.CrawlDatum;
@@ -79,7 +80,7 @@
fos.close();
// get nutch content
- Protocol protocol = ProtocolFactory.getProtocol(urlString);
+ Protocol protocol = new ProtocolFactory(new NutchConf()).getProtocol(urlString);
content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
protocol = null;
}
@@ -103,18 +104,19 @@
return;
}
+ NutchConf nutchConf = new NutchConf();
// loop alternately, total 10*2 times of invoking external command
for (int i=0; i<10; i++) {
// check external parser that does 'cat'
contentType = "application/vnd.nutch.example.cat";
content.setContentType(contentType);
- parse = ParseUtil.parseByParserId("parse-ext", content);
+ parse = new ParseUtil(nutchConf).parseByParserId("parse-ext", content);
assertEquals(expectedText,parse.getText());
// check external parser that does 'md5sum'
contentType = "application/vnd.nutch.example.md5sum";
content.setContentType(contentType);
- parse = ParseUtil.parseByParserId("parse-ext", content);
+ parse = new ParseUtil(nutchConf).parseByParserId("parse-ext", content);
assertTrue(parse.getText().startsWith(expectedMD5sum));
}
}
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Tue Jan 31 08:08:58 2006
@@ -22,6 +22,7 @@
import java.util.HashMap;
import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NutchConf;
import org.w3c.dom.*;
@@ -286,7 +287,7 @@
* nekohtml).
*/
public static final void getOutlinks(URL base, ArrayList outlinks,
- Node node) {
+ Node node, NutchConf nutchConf) {
NodeList children = node.getChildNodes();
int childLen= 0;
@@ -322,7 +323,7 @@
try {
URL url = new URL(base, target);
outlinks.add(new Outlink(url.toString(),
- linkText.toString().trim()));
+ linkText.toString().trim(), nutchConf));
} catch (MalformedURLException e) {
// don't care
}
@@ -332,7 +333,7 @@
}
}
for ( int i = 0; i < childLen; i++ ) {
- getOutlinks(base, outlinks, children.item(i));
+ getOutlinks(base, outlinks, children.item(i), nutchConf);
}
}
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Tue Jan 31 08:08:58 2006
@@ -51,7 +51,7 @@
Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
Pattern.CASE_INSENSITIVE);
- private static String parserImpl = NutchConf.get().get("parser.html.impl", "neko");
+ private String parserImpl;
/**
* Given a <code>byte[]</code> representing an html file of an
@@ -91,8 +91,11 @@
}
- private static String defaultCharEncoding =
- NutchConf.get().get("parser.character.encoding.default", "windows-1252");
+ private String defaultCharEncoding;
+
+ private NutchConf nutchConf;
+
+ private HtmlParseFilters htmlParseFilters;
public Parse getParse(Content content) {
HTMLMetaTags metaTags = new HTMLMetaTags();
@@ -101,7 +104,7 @@
try {
base = new URL(content.getBaseUrl());
} catch (MalformedURLException e) {
- return new ParseStatus(e).getEmptyParse();
+ return new ParseStatus(e).getEmptyParse(getConf());
}
String text = "";
@@ -151,14 +154,14 @@
LOG.fine("Parsing...");
root = parse(input);
} catch (IOException e) {
- return new ParseStatus(e).getEmptyParse();
+ return new ParseStatus(e).getEmptyParse(getConf());
} catch (DOMException e) {
- return new ParseStatus(e).getEmptyParse();
+ return new ParseStatus(e).getEmptyParse(getConf());
} catch (SAXException e) {
- return new ParseStatus(e).getEmptyParse();
+ return new ParseStatus(e).getEmptyParse(getConf());
} catch (Exception e) {
e.printStackTrace();
- return new ParseStatus(e).getEmptyParse();
+ return new ParseStatus(e).getEmptyParse(getConf());
}
// get meta directives
@@ -180,7 +183,7 @@
ArrayList l = new ArrayList(); // extract outlinks
URL baseTag = DOMContentUtils.getBase(root);
LOG.fine("Getting links...");
- DOMContentUtils.getOutlinks(baseTag!=null?baseTag:base, l, root);
+ DOMContentUtils.getOutlinks(baseTag!=null?baseTag:base, l, root, getConf());
outlinks = (Outlink[])l.toArray(new Outlink[l.size()]);
LOG.fine("found "+outlinks.length+" outlinks in "+content.getUrl());
}
@@ -197,10 +200,11 @@
status.setMessage(metaTags.getRefreshHref().toString());
}
ParseData parseData = new ParseData(status, title, outlinks, metadata);
+ parseData.setConf(this.nutchConf);
Parse parse = new ParseImpl(text, parseData);
// run filters on parse
- return HtmlParseFilters.filter(content, parse, metaTags, root);
+ return this.htmlParseFilters.filter(content, parse, metaTags, root);
}
private DocumentFragment parse(InputSource input) throws Exception {
@@ -267,10 +271,22 @@
in.readFully(bytes);
Parse parse = new HtmlParser().getParse(new Content(url,url,
bytes,"text/html",
- new ContentProperties()));
+ new ContentProperties(), new NutchConf()));
System.out.println("data: "+parse.getData());
System.out.println("text: "+parse.getText());
+ }
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ this.htmlParseFilters = new HtmlParseFilters(getConf());
+ this.parserImpl = getConf().get("parser.html.impl", "neko");
+ this.defaultCharEncoding = getConf().get(
+ "parser.character.encoding.default", "windows-1252");
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
}
}
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Tue Jan 31 08:08:58 2006
@@ -19,6 +19,7 @@
import junit.framework.TestCase;
import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NutchConf;
import java.io.ByteArrayInputStream;
import java.net.MalformedURLException;
@@ -174,6 +175,7 @@
}
private static void setup() {
+ NutchConf nutchConf = new NutchConf();
DOMFragmentParser parser= new DOMFragmentParser();
for (int i= 0; i < testPages.length; i++) {
DocumentFragment node=
@@ -192,36 +194,36 @@
try {
answerOutlinks = new Outlink[][]{
{
- new Outlink("http://www.nutch.org", "anchor"),
+ new Outlink("http://www.nutch.org", "anchor", nutchConf),
},
{
- new Outlink("http://www.nutch.org/", "home"),
- new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
+ new Outlink("http://www.nutch.org/", "home", nutchConf),
+ new Outlink("http://www.nutch.org/docs/bot.html", "bots", nutchConf),
},
{
- new Outlink("http://www.nutch.org/", "separate this"),
- new Outlink("http://www.nutch.org/docs/ok", "from this"),
+ new Outlink("http://www.nutch.org/", "separate this", nutchConf),
+ new Outlink("http://www.nutch.org/docs/ok", "from this", nutchConf),
},
{
- new Outlink("http://www.nutch.org/", "home"),
- new Outlink("http://www.nutch.org/docs/1", "1"),
- new Outlink("http://www.nutch.org/docs/2", "2"),
+ new Outlink("http://www.nutch.org/", "home", nutchConf),
+ new Outlink("http://www.nutch.org/docs/1", "1", nutchConf),
+ new Outlink("http://www.nutch.org/docs/2", "2", nutchConf),
},
{
- new Outlink("http://www.nutch.org/frames/top.html", ""),
- new Outlink("http://www.nutch.org/frames/left.html", ""),
- new Outlink("http://www.nutch.org/frames/invalid.html", ""),
- new Outlink("http://www.nutch.org/frames/right.html", ""),
+ new Outlink("http://www.nutch.org/frames/top.html", "", nutchConf),
+ new Outlink("http://www.nutch.org/frames/left.html", "", nutchConf),
+ new Outlink("http://www.nutch.org/frames/invalid.html", "", nutchConf),
+ new Outlink("http://www.nutch.org/frames/right.html", "", nutchConf),
},
{
- new Outlink("http://www.nutch.org/maps/logo.gif", ""),
- new Outlink("http://www.nutch.org/index.html", ""),
- new Outlink("http://www.nutch.org/maps/#bottom", ""),
- new Outlink("http://www.nutch.org/bot.html", ""),
- new Outlink("http://www.nutch.org/docs/index.html", ""),
+ new Outlink("http://www.nutch.org/maps/logo.gif", "", nutchConf),
+ new Outlink("http://www.nutch.org/index.html", "", nutchConf),
+ new Outlink("http://www.nutch.org/maps/#bottom", "", nutchConf),
+ new Outlink("http://www.nutch.org/bot.html", "", nutchConf),
+ new Outlink("http://www.nutch.org/docs/index.html", "", nutchConf),
},
{
- new Outlink("http://www.nutch.org/index.html", "whitespace test"),
+ new Outlink("http://www.nutch.org/index.html", "whitespace test", nutchConf),
},
{
}
@@ -282,7 +284,7 @@
setup();
for (int i= 0; i < testPages.length; i++) {
ArrayList outlinks= new ArrayList();
- DOMContentUtils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
+ DOMContentUtils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i], new NutchConf());
Outlink[] outlinkArr= new Outlink[outlinks.size()];
outlinkArr= (Outlink[]) outlinks.toArray(outlinkArr);
compareOutlinks(answerOutlinks[i], outlinkArr);
Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Tue Jan 31 08:08:58 2006
@@ -24,6 +24,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
import org.apache.oro.text.regex.MatchResult;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.PatternCompiler;
@@ -50,6 +51,8 @@
LogFormatter.getLogger("org.apache.nutch.parse.js.JSParseFilter");
private static final int MAX_TITLE_LEN = 80;
+
+ private NutchConf nutchConf;
public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
String url = content.getBaseUrl();
@@ -64,7 +67,9 @@
ParseStatus status = parse.getData().getStatus();
String text = parse.getText();
Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
- parse = new ParseImpl(text, new ParseData(status, title, newlinks, metadata));
+ ParseData parseData = new ParseData(status, title, newlinks, metadata);
+ parseData.setConf(this.nutchConf);
+ parse = new ParseImpl(text, parseData);
}
return parse;
}
@@ -123,7 +128,7 @@
String type = c.getContentType();
if (type != null && !type.trim().equals("") && !type.toLowerCase().startsWith("application/x-javascript"))
return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
- "Content not JavaScript: '" + type + "'").getEmptyParse();
+ "Content not JavaScript: '" + type + "'").getEmptyParse(getConf());
String script = new String(c.getContent());
Outlink[] outlinks = getJSLinks(script, c.getUrl(), c.getUrl());
if (outlinks == null) outlinks = new Outlink[0];
@@ -141,6 +146,7 @@
metadata.putAll(c.getMetadata());
ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title,
outlinks, metadata);
+ pd.setConf(this.nutchConf);
Parse parse = new ParseImpl(script, pd);
return parse;
}
@@ -154,7 +160,7 @@
/**
* This method extracts URLs from literals embedded in JavaScript.
*/
- private static Outlink[] getJSLinks(String plainText, String anchor, String base) {
+ private Outlink[] getJSLinks(String plainText, String anchor, String base) {
final List outlinks = new ArrayList();
URL baseURL = null;
@@ -195,7 +201,7 @@
} else url = new URL(baseURL, url).toString();
url = url.replaceAll("&amp;", "&");
LOG.fine(" - outlink from JS: '" + url + "'");
- outlinks.add(new Outlink(url, anchor));
+ outlinks.add(new Outlink(url, anchor, getConf()));
}
} catch (Exception ex) {
// if it is a malformed URL we just throw it away and continue with
@@ -225,9 +231,19 @@
StringBuffer sb = new StringBuffer();
String line = null;
while ((line = br.readLine()) != null) sb.append(line + "\n");
- Outlink[] links = getJSLinks(sb.toString(), args[1], args[1]);
+ JSParseFilter parseFilter = new JSParseFilter();
+ parseFilter.setConf(new NutchConf());
+ Outlink[] links = parseFilter.getJSLinks(sb.toString(), args[1], args[1]);
System.out.println("Outlinks extracted: " + links.length);
for (int i = 0; i < links.length; i++)
System.out.println(" - " + links[i]);
+ }
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
}
}
Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java Tue Jan 31 08:08:58 2006
@@ -19,6 +19,7 @@
import org.apache.nutch.parse.*;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConf;
import org.farng.mp3.MP3File;
import org.farng.mp3.TagException;
import org.farng.mp3.id3.AbstractID3v2;
@@ -39,7 +40,8 @@
public class MP3Parser implements Parser {
- private MetadataCollector metadataCollector = new MetadataCollector();
+ private MetadataCollector metadataCollector;
+ private NutchConf nutchConf;
public Parse getParse(Content content) throws ParseException {
Parse parse = null;
@@ -84,7 +86,7 @@
metadataCollector.notifyProperty("TYER-Text", tag.getYear());
ParseData parseData = new ParseData(metadataCollector.getTitle(),
metadataCollector.getOutlinks(),
- metadataCollector.getData());
+ metadataCollector.getData(), getConf());
return new ParseImpl(metadataCollector.getText(), parseData);
}
@@ -113,4 +115,12 @@
}
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ this.metadataCollector = new MetadataCollector(conf);
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
+ }
}
Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java Tue Jan 31 08:08:58 2006
@@ -17,6 +17,7 @@
package org.apache.nutch.parse.mp3;
import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NutchConf;
import java.net.MalformedURLException;
import java.util.ArrayList;
@@ -34,7 +35,12 @@
private String album = null;
private ArrayList links = new ArrayList();
private String text = "";
+ private NutchConf nutchConf;
+ public MetadataCollector(NutchConf nutchConf) {
+ this.nutchConf = nutchConf;
+ }
+
public void notifyProperty(String name, String value) throws MalformedURLException {
if (name.equals("TIT2-Text"))
setTitle(value);
@@ -44,7 +50,7 @@
setArtist(value);
if (name.indexOf("URL Link") > -1) {
- links.add(new Outlink(value, ""));
+ links.add(new Outlink(value, "", this.nutchConf));
} else if (name.indexOf("Text") > -1) {
text += value + "\n";
}
Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java Tue Jan 31 08:08:58 2006
@@ -32,6 +32,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
/**
* Nutch-Parser for parsing MS PowerPoint slides ( mime type:
@@ -51,6 +52,8 @@
private static final Logger LOG = LogFormatter
.getLogger(MSPowerPointParser.class.getName());
+ private NutchConf nutchConf;
+
/**
*
*/
@@ -77,7 +80,7 @@
ContentProperties prop = new ContentProperties();
prop.setProperty("Content-Length", "" + raw.length);
- Content content = new Content(file, file, raw, MIME_TYPE, prop);
+ Content content = new Content(file, file, raw, MIME_TYPE, prop, new NutchConf());
System.out.println(ppe.getParse(content).getText());
}
@@ -106,7 +109,7 @@
+ raw.length
+ " bytes. Please increase <protocol>.content.limit at nutch-default.xml. "
+ "Parser can't handle incomplete PowerPoint files.")
- .getEmptyParse();
+ .getEmptyParse(getConf());
}
final PPTExtractor extractor = new PPTExtractor(new ByteArrayInputStream(
@@ -114,11 +117,11 @@
plainText = extractor.getText();
properties = extractor.getProperties();
- outlinks = OutlinkExtractor.getOutlinks(plainText, content.getUrl());
+ outlinks = OutlinkExtractor.getOutlinks(plainText, content.getUrl(), getConf());
} catch (Exception e) {
LOG.throwing(this.getClass().getName(), "getParse", e);
- return new ParseStatus(e).getEmptyParse();
+ return new ParseStatus(e).getEmptyParse(getConf());
}
// collect meta data
@@ -141,6 +144,7 @@
final ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
final ParseData parseData = new ParseData(status, title, outlinks, metadata);
+ parseData.setConf(this.nutchConf);
LOG.finest("PowerPoint file parsed sucessful.");
return new ParseImpl(plainText, parseData);
@@ -160,5 +164,13 @@
return null;
}
+ }
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
}
}
Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java Tue Jan 31 08:08:58 2006
@@ -34,6 +34,7 @@
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
import org.apache.nutch.io.UTF8;
import org.apache.nutch.crawl.CrawlDatum;
@@ -105,7 +106,7 @@
this.urlString = createUrl(this.testFile.getName());
System.out.println("Testing file: " + this.urlString + "...");
- this.protocol = ProtocolFactory.getProtocol(this.urlString);
+ this.protocol =new ProtocolFactory(new NutchConf()).getProtocol(this.urlString);
this.content = this.protocol.getProtocolOutput(new UTF8(this.urlString), new CrawlDatum()).getContent();
}
@@ -125,7 +126,7 @@
*/
public void testContent() throws Exception {
- Parse parse = ParseUtil.parseByParserId("parse-mspowerpoint",this.content);
+ Parse parse = new ParseUtil(new NutchConf()).parseByParserId("parse-mspowerpoint",this.content);
ParseData data = parse.getData();
String text = parse.getText();
@@ -162,7 +163,7 @@
*/
public void testMeta() throws Exception {
- Parse parse = ParseUtil.parseByParserId("parse-mspowerpoint",content);
+ Parse parse = new ParseUtil(new NutchConf()).parseByParserId("parse-mspowerpoint",content);
ParseData data = parse.getData();
Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java Tue Jan 31 08:08:58 2006
@@ -19,6 +19,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
@@ -50,6 +51,8 @@
*/
public class MSWordParser implements Parser {
+ private NutchConf nutchConf;
+
// public static final Logger LOG =
// LogFormatter.getLogger("org.apache.nutch.parse.msword");
@@ -70,7 +73,7 @@
&& raw.length != Integer.parseInt(contentLength)) {
return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
"Content truncated at " + raw.length
- +" bytes. Parser can't handle incomplete msword file.").getEmptyParse();
+ +" bytes. Parser can't handle incomplete msword file.").getEmptyParse(this.nutchConf);
}
WordExtractor extractor = new WordExtractor();
@@ -84,14 +87,14 @@
extractor = null;
} catch (ParseException e) {
- return new ParseStatus(e).getEmptyParse();
+ return new ParseStatus(e).getEmptyParse(this.nutchConf);
} catch (FastSavedException e) {
- return new ParseStatus(e).getEmptyParse();
+ return new ParseStatus(e).getEmptyParse(this.nutchConf);
} catch (PasswordProtectedException e) {
- return new ParseStatus(e).getEmptyParse();
+ return new ParseStatus(e).getEmptyParse(this.nutchConf);
} catch (Exception e) { // run time exception
return new ParseStatus(ParseStatus.FAILED,
- "Can't be handled as msword document. " + e).getEmptyParse();
+ "Can't be handled as msword document. " + e).getEmptyParse(this.nutchConf);
} finally {
// nothing so far
}
@@ -113,12 +116,21 @@
title = "";
// collect outlink
- Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
+ Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, this.nutchConf);
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);
+ parseData.setConf(this.nutchConf);
return new ParseImpl(text, parseData);
// any filter?
//return HtmlParseFilters.filter(content, parse, root);
+ }
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
}
}
Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Tue Jan 31 08:08:58 2006
@@ -24,6 +24,7 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.util.NutchConf;
import org.apache.nutch.io.UTF8;
import org.apache.nutch.crawl.CrawlDatum;
@@ -61,12 +62,13 @@
Content content;
Parse parse;
+ NutchConf nutchConf = new NutchConf();
for (int i=0; i<sampleFiles.length; i++) {
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
- protocol = ProtocolFactory.getProtocol(urlString);
+ protocol = new ProtocolFactory(nutchConf).getProtocol(urlString);
content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
- parse = ParseUtil.parseByParserId("parse-msword",content);
+ parse = new ParseUtil(nutchConf).parseByParserId("parse-msword",content);
assertTrue(parse.getText().startsWith(expectedText));
}
Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Tue Jan 31 08:08:58 2006
@@ -28,6 +28,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
@@ -62,6 +63,7 @@
public class PdfParser implements Parser {
public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.parse.pdf");
+ private NutchConf nutchConf;
public PdfParser () {
// redirect org.apache.log4j.Logger to java's native logger, in order
@@ -99,7 +101,7 @@
&& raw.length != Integer.parseInt(contentLength)) {
return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
"Content truncated at "+raw.length
- +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
+ +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
}
PDFParser parser = new PDFParser(
@@ -134,13 +136,13 @@
} catch (CryptographyException e) {
return new ParseStatus(ParseStatus.FAILED,
- "Error decrypting document. " + e).getEmptyParse();
+ "Error decrypting document. " + e).getEmptyParse(getConf());
} catch (InvalidPasswordException e) {
return new ParseStatus(ParseStatus.FAILED,
- "Can't decrypt document - invalid password. " + e).getEmptyParse();
+ "Can't decrypt document - invalid password. " + e).getEmptyParse(getConf());
} catch (Exception e) { // run time exception
return new ParseStatus(ParseStatus.FAILED,
- "Can't be handled as pdf document. " + e).getEmptyParse();
+ "Can't be handled as pdf document. " + e).getEmptyParse(getConf());
} finally {
try {
if (pdf != null)
@@ -157,13 +159,14 @@
title = "";
// collect outlink
- Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
+ Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
// collect meta data
ContentProperties metadata = new ContentProperties();
metadata.putAll(content.getMetadata()); // copy through
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);
+ parseData.setConf(this.nutchConf);
return new ParseImpl(text, parseData);
// any filter?
//return HtmlParseFilters.filter(content, parse, root);
@@ -178,6 +181,14 @@
retval = formatter.format(date.getTime());
}
return retval;
+ }
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
}
}
Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java Tue Jan 31 08:08:58 2006
@@ -24,6 +24,7 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.util.NutchConf;
import org.apache.nutch.io.UTF8;
import org.apache.nutch.crawl.CrawlDatum;
@@ -64,9 +65,10 @@
for (int i=0; i<sampleFiles.length; i++) {
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
- protocol = ProtocolFactory.getProtocol(urlString);
+ NutchConf nutchConf = new NutchConf();
+ protocol = new ProtocolFactory(nutchConf).getProtocol(urlString);
content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
- parse = ParseUtil.parseByParserId("parse-pdf",content);
+ parse = new ParseUtil(nutchConf).parseByParserId("parse-pdf",content);
int index = parse.getText().indexOf(expectedText);
assertTrue(index > 0);
Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java Tue Jan 31 08:08:58 2006
@@ -18,6 +18,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseStatus;
@@ -63,6 +64,7 @@
public class RSSParser implements Parser {
public static final Logger LOG = LogFormatter
.getLogger("org.apache.nutch.parse.rss");
+ private NutchConf nutchConf;
/**
* <p>
@@ -122,7 +124,7 @@
e.printStackTrace();
LOG.fine("nutch:parse-rss:RSSParser Exception: " + e.getMessage());
return new ParseStatus(ParseStatus.FAILED,
- "Can't be handled as rss document. " + e).getEmptyParse();
+ "Can't be handled as rss document. " + e).getEmptyParse(getConf());
}
StringBuffer contentTitle = new StringBuffer(), indexText = new StringBuffer();
@@ -149,9 +151,9 @@
try {
// get the outlink
if (r.getDescription()!= null ) {
- theOutlinks.add(new Outlink(r.getLink(), r.getDescription()));
+ theOutlinks.add(new Outlink(r.getLink(), r.getDescription(), getConf()));
} else {
- theOutlinks.add(new Outlink(r.getLink(), ""));
+ theOutlinks.add(new Outlink(r.getLink(), "", getConf()));
}
} catch (MalformedURLException e) {
LOG.info("nutch:parse-rss:RSSParser Exception: MalformedURL: "
@@ -179,9 +181,9 @@
if (whichLink != null) {
try {
if (theRSSItem.getDescription()!=null) {
- theOutlinks.add(new Outlink(whichLink, theRSSItem.getDescription()));
+ theOutlinks.add(new Outlink(whichLink, theRSSItem.getDescription(), getConf()));
} else {
- theOutlinks.add(new Outlink(whichLink, ""));
+ theOutlinks.add(new Outlink(whichLink, "", getConf()));
}
} catch (MalformedURLException e) {
LOG.info("nutch:parse-rss:RSSParser Exception: MalformedURL: "
@@ -211,7 +213,16 @@
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
contentTitle.toString(), outlinks, content.getMetadata());
+ parseData.setConf(this.nutchConf);
return new ParseImpl(indexText.toString(), parseData);
}
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
+ }
}
Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java Tue Jan 31 08:08:58 2006
@@ -26,6 +26,7 @@
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NutchConf;
import org.apache.nutch.io.UTF8;
import org.apache.nutch.crawl.CrawlDatum;
@@ -79,12 +80,13 @@
Content content;
Parse parse;
+ NutchConf nutchConf = new NutchConf();
for (int i = 0; i < sampleFiles.length; i++) {
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
- protocol = ProtocolFactory.getProtocol(urlString);
+ protocol = new ProtocolFactory(nutchConf).getProtocol(urlString);
content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
- parse = ParseUtil.parseByParserId("parse-rss",content);
+ parse = new ParseUtil(nutchConf).parseByParserId("parse-rss",content);
//check that there are 3 outlinks:
//http://test.channel.com
Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java Tue Jan 31 08:08:58 2006
@@ -17,9 +17,8 @@
package org.apache.nutch.parse.rtf;
import org.apache.nutch.parse.*;
-import org.apache.nutch.parse.ParseException;
import org.apache.nutch.protocol.Content;
-
+import org.apache.nutch.util.NutchConf;
import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
@@ -29,10 +28,13 @@
/**
* A parser for RTF documents
+ *
* @author Andy Hedges
*/
public class RTFParseFactory implements Parser {
+ private NutchConf nutchConf;
+
public Parse getParse(Content content) throws ParseException {
byte[] raw = content.getContent();
Reader reader = new InputStreamReader(new ByteArrayInputStream(raw));
@@ -53,7 +55,7 @@
metadata.putAll(delegate.getMetaData());
String title = metadata.getProperty("title");
- if(title != null){
+ if (title != null) {
metadata.remove(title);
} else {
title = "";
@@ -61,11 +63,15 @@
String text = delegate.getText();
- return new ParseImpl(text,
- new ParseData(title,
- OutlinkExtractor.getOutlinks(text),
- metadata));
+ return new ParseImpl(text, new ParseData(title, OutlinkExtractor
+ .getOutlinks(text, this.nutchConf), metadata));
}
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ }
+ public NutchConf getConf() {
+ return this.nutchConf;
+ }
}
Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Tue Jan 31 08:08:58 2006
@@ -25,6 +25,7 @@
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConf;
import java.util.Properties;
@@ -60,11 +61,12 @@
Content content;
Parse parse;
+ NutchConf nutchConf = new NutchConf();
urlString = "file:" + sampleDir + fileSeparator + rtfFile;
- protocol = ProtocolFactory.getProtocol(urlString);
+ protocol = new ProtocolFactory(nutchConf).getProtocol(urlString);
content = protocol.getContent(urlString);
- parse = ParseUtil.parseByParserId("parse-rtf",content);
+ parse = new ParseUtil(nutchConf).parseByParserId("parse-rtf", content);
String text = parse.getText();
assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Tue Jan 31 08:08:58 2006
@@ -24,31 +24,42 @@
import org.apache.nutch.util.*;
public class TextParser implements Parser {
+ private NutchConf nutchConf;
+
public Parse getParse(Content content) {
// copy content meta data through
ContentProperties metadata = new ContentProperties();
metadata.putAll(content.getMetadata());
- //ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], metadata);
+ // ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new
+ // Outlink[0], metadata);
- String encoding =
- StringUtil.parseCharacterEncoding(content.getContentType());
+ String encoding = StringUtil.parseCharacterEncoding(content
+ .getContentType());
String text;
- if (encoding != null) { // found an encoding header
- try { // try to use named encoding
+ if (encoding != null) { // found an encoding header
+ try { // try to use named encoding
text = new String(content.getContent(), encoding);
} catch (java.io.UnsupportedEncodingException e) {
- return new ParseStatus(e).getEmptyParse();
+ return new ParseStatus(e).getEmptyParse(getConf());
}
} else {
- // FIXME: implement charset detector. This code causes problem when
- // character set isn't specified in HTTP header.
- text = new String(content.getContent()); // use default encoding
+ // FIXME: implement charset detector. This code causes problem when
+ // character set isn't specified in HTTP header.
+ text = new String(content.getContent()); // use default encoding
}
+ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "",
+ OutlinkExtractor.getOutlinks(text, getConf()), metadata);
+ parseData.setConf(this.nutchConf);
+ return new ParseImpl(text, parseData);
+
+ }
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ }
- return new ParseImpl(text,
- new ParseData(ParseStatus.STATUS_SUCCESS, "",
- OutlinkExtractor.getOutlinks(text),
- metadata));
+ public NutchConf getConf() {
+ return this.nutchConf;
}
}
Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Tue Jan 31 08:08:58 2006
@@ -33,73 +33,87 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
/**
* ZipParser class based on MSPowerPointParser class by Stephan Strittmatter.
* Nutch parse plugin for zip files - Content Type : application/zip
+ *
* @author Rohit Kulkarni & Ashish Vaidya
*/
-public class ZipParser implements Parser{
-
- private static final Logger LOG = LogFormatter.getLogger(ZipParser.class.getName());
- /** Creates a new instance of ZipParser */
- public ZipParser() {
+public class ZipParser implements Parser {
+
+ private static final Logger LOG = LogFormatter.getLogger(ZipParser.class
+ .getName());
+ private NutchConf nutchConf;
+
+ /** Creates a new instance of ZipParser */
+ public ZipParser() {
+ }
+
+ public Parse getParse(final Content content) {
+
+ String resultText = null;
+ String resultTitle = null;
+ Outlink[] outlinks = null;
+ List outLinksList = new ArrayList();
+ Properties properties = null;
+
+ try {
+ final String contentLen = content.get("Content-Length");
+ final int len = Integer.parseInt(contentLen);
+ System.out.println("ziplen: " + len);
+ final byte[] contentInBytes = content.getContent();
+ final ByteArrayInputStream bainput = new ByteArrayInputStream(
+ contentInBytes);
+ final InputStream input = bainput;
+
+ if (contentLen != null && contentInBytes.length != len) {
+ return new ParseStatus(ParseStatus.FAILED,
+ ParseStatus.FAILED_TRUNCATED, "Content truncated at "
+ + contentInBytes.length
+ + " bytes. Parser can't handle incomplete pdf file.")
+ .getEmptyParse(getConf());
+ }
+
+ ZipTextExtractor extractor = new ZipTextExtractor(getConf());
+
+ // extract text
+ resultText = extractor.extractText(new ByteArrayInputStream(
+ contentInBytes), content.getUrl(), outLinksList);
+
+ } catch (Exception e) {
+ return new ParseStatus(ParseStatus.FAILED,
+ "Can't be handled as Zip document. " + e).getEmptyParse(getConf());
}
-
- public Parse getParse(final Content content) {
-
- String resultText = null;
- String resultTitle = null;
- Outlink[] outlinks = null;
- List outLinksList = new ArrayList();
- Properties properties = null;
-
- try {
- final String contentLen = content.get("Content-Length");
- final int len = Integer.parseInt(contentLen);
- System.out.println("ziplen: " + len);
- final byte[] contentInBytes = content.getContent();
- final ByteArrayInputStream bainput = new ByteArrayInputStream(contentInBytes);
- final InputStream input = bainput;
-
- if (contentLen != null && contentInBytes.length != len) {
- return new ParseStatus(ParseStatus.FAILED,
- ParseStatus.FAILED_TRUNCATED,
- "Content truncated at " + contentInBytes.length +
- " bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
- }
-
- ZipTextExtractor extractor = new ZipTextExtractor();
-
- // extract text
- resultText = extractor.extractText(new ByteArrayInputStream(contentInBytes),
- content.getUrl(), outLinksList);
-
- } catch (Exception e) {
- return new ParseStatus(ParseStatus.FAILED,
- "Can't be handled as Zip document. " + e).getEmptyParse();
- }
-
- // collect meta data
- final ContentProperties metadata = new ContentProperties();
- metadata.putAll(content.getMetadata()); // copy through
-
- if (resultText == null) {
- resultText = "";
- }
-
- if (resultTitle == null) {
- resultTitle = "";
- }
-
- outlinks = (Outlink[])outLinksList.toArray(new Outlink[0]);
- final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
- resultTitle,
- outlinks,
- metadata);
-
- LOG.finest("Zip file parsed sucessfully !!");
- return new ParseImpl(resultText, parseData);
+
+ // collect meta data
+ final ContentProperties metadata = new ContentProperties();
+ metadata.putAll(content.getMetadata()); // copy through
+
+ if (resultText == null) {
+ resultText = "";
}
-
+
+ if (resultTitle == null) {
+ resultTitle = "";
+ }
+
+ outlinks = (Outlink[]) outLinksList.toArray(new Outlink[0]);
+ final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
+ resultTitle, outlinks, metadata);
+ parseData.setConf(this.nutchConf);
+
+ LOG.finest("Zip file parsed sucessfully !!");
+ return new ParseImpl(resultText, parseData);
+ }
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
+ }
+
}
Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Tue Jan 31 08:08:58 2006
@@ -46,14 +46,17 @@
public class ZipTextExtractor {
/** Get the MimeTypes resolver instance. */
- private final static MimeTypes MIME =
- MimeTypes.get(NutchConf.get().get("mime.types.file"));
+ private MimeTypes MIME;
public static final Logger LOG = LogFormatter.getLogger(ZipTextExtractor.class.getName());
+
+private NutchConf nutchConf;
/** Creates a new instance of ZipTextExtractor */
- public ZipTextExtractor() {
+ public ZipTextExtractor(NutchConf nutchConf) {
+ this.nutchConf = nutchConf;
+ this.MIME = MimeTypes.get(nutchConf.get("mime.types.file"));
}
public String extractText(InputStream input, String url, List outLinksList) throws IOException {
@@ -88,13 +91,13 @@
ContentProperties metadata = new ContentProperties();
metadata.setProperty("Content-Length", Long.toString(entry.getSize()));
metadata.setProperty("Content-Type", contentType);
- Content content = new Content(newurl, base, b, contentType, metadata);
- Parse parse = ParseUtil.parse(content);
+ Content content = new Content(newurl, base, b, contentType, metadata, this.nutchConf);
+ Parse parse = new ParseUtil(this.nutchConf).parse(content);
ParseData theParseData = parse.getData();
Outlink[] theOutlinks = theParseData.getOutlinks();
for(int count = 0; count < theOutlinks.length; count++) {
- outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor()));
+ outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor(), this.nutchConf));
}
resultText += entry.getName() + " " + parse.getText() + " ";
Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java Tue Jan 31 08:08:58 2006
@@ -24,6 +24,7 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.util.NutchConf;
import org.apache.nutch.io.UTF8;
import org.apache.nutch.crawl.CrawlDatum;
@@ -61,12 +62,13 @@
Content content;
Parse parse;
+ NutchConf conf = new NutchConf();
for (int i = 0; i < sampleFiles.length; i++) {
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
- protocol = ProtocolFactory.getProtocol(urlString);
+ protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
- parse = ParseUtil.parseByParserId("parse-zip",content);
+ parse = new ParseUtil(conf).parseByParserId("parse-zip",content);
assertTrue(parse.getText().equals(expectedText));
}
}
Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Tue Jan 31 08:08:58 2006
@@ -50,7 +50,7 @@
static final int MAX_REDIRECTS = 5;
- static int maxContentLength = NutchConf.get().getInt("file.content.limit", 64 * 1024);
+ int maxContentLength;
// 20040412, xing
// the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile
@@ -59,6 +59,8 @@
// http date format
HttpDateFormat httpDateFormat = null;
+ private NutchConf nutchConf;
+
// constructor
public File() {
this.httpDateFormat = new HttpDateFormat();
@@ -76,7 +78,7 @@
while (true) {
FileResponse response;
- response = new FileResponse(u, datum, this); // make a request
+ response = new FileResponse(u, datum, this, getConf()); // make a request
int code = response.getCode();
@@ -152,4 +154,12 @@
file = null;
}
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
+ }
}
Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Tue Jan 31 08:08:58 2006
@@ -26,6 +26,7 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
+import org.apache.nutch.util.NutchConf;
/************************************
@@ -63,6 +64,7 @@
private ContentProperties headers = new ContentProperties();
private final File file;
+ private NutchConf nutchConf;
/** Returns the response code. */
public int getCode() { return code; }
@@ -77,15 +79,16 @@
public Content toContent() {
return new Content(orig, base, content,
getHeader("Content-Type"),
- headers);
+ headers, this.nutchConf);
}
- public FileResponse(URL url, CrawlDatum datum, File file)
+ public FileResponse(URL url, CrawlDatum datum, File file, NutchConf nutchConf)
throws FileException, IOException {
this.orig = url.toString();
this.base = url.toString();
this.file = file;
+ this.nutchConf = nutchConf;
if (!"file".equals(url.getProtocol()))
throw new FileException("Not a file url:" + url);
Modified: lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java Tue Jan 31 08:08:58 2006
@@ -56,23 +56,23 @@
static final int MAX_REDIRECTS = 5;
- static int timeout = NutchConf.get().getInt("ftp.timeout", 10000);
+ int timeout;
- static int maxContentLength = NutchConf.get().getInt("ftp.content.limit",64*1024);
+ int maxContentLength;
- String userName = NutchConf.get().get("ftp.username", "anonymous");
- String passWord = NutchConf.get().get("ftp.password", "anonymous@example.com");
+ String userName;
+ String passWord;
// typical/default server timeout is 120*1000 millisec.
// better be conservative here
- int serverTimeout = NutchConf.get().getInt("ftp.server.timeout", 60*1000);
+ int serverTimeout;
// when to have client start anew
long renewalTime = -1;
- boolean keepConnection = NutchConf.get().getBoolean("ftp.keep.connection", false);
+ boolean keepConnection;
- boolean followTalk = NutchConf.get().getBoolean("ftp.follow.talk", false);
+ boolean followTalk;
// ftp client
Client client = null;
@@ -86,6 +86,8 @@
// http date format
HttpDateFormat httpDateFormat = null;
+ private NutchConf nutchConf;
+
// constructor
public Ftp() {
@@ -121,7 +123,7 @@
while (true) {
FtpResponse response;
- response = new FtpResponse(u, datum, this); // make a request
+ response = new FtpResponse(u, datum, this, getConf()); // make a request
int code = response.getCode();
@@ -218,6 +220,22 @@
}
ftp = null;
+ }
+
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ this.maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);
+ this.timeout = conf.getInt("ftp.timeout", 10000);
+ this.userName = conf.get("ftp.username", "anonymous");
+ this.passWord = conf.get("ftp.password", "anonymous@example.com");
+ this.serverTimeout = conf.getInt("ftp.server.timeout", 60 * 1000);
+ this.keepConnection = conf.getBoolean("ftp.keep.connection", false);
+ this.followTalk = conf.getBoolean("ftp.follow.talk", false);
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
}
}
Modified: lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java Tue Jan 31 08:08:58 2006
@@ -27,6 +27,7 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
+import org.apache.nutch.util.NutchConf;
import java.net.InetAddress;
import java.net.URL;
@@ -64,6 +65,7 @@
private ContentProperties headers = new ContentProperties();
private final Ftp ftp;
+ private NutchConf nutchConf;
/** Returns the response code. */
public int getCode() { return code; }
@@ -78,15 +80,16 @@
public Content toContent() {
return new Content(orig, base, content,
getHeader("Content-Type"),
- headers);
+ headers, this.nutchConf);
}
- public FtpResponse(URL url, CrawlDatum datum, Ftp ftp)
+ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, NutchConf nutchConf)
throws FtpException, IOException {
this.orig = url.toString();
this.base = url.toString();
this.ftp = ftp;
+ this.nutchConf = nutchConf;
if (!"ftp".equals(url.getProtocol()))
throw new FtpException("Not a ftp url:" + url);
Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java Tue Jan 31 08:08:58 2006
@@ -33,25 +33,29 @@
public class Http extends HttpBase {
public static final Logger LOG =
- LogFormatter.getLogger("org.apache.nutch.net.Http");
-
- static {
- if (NutchConf.get().getBoolean("http.verbose", false))
- LOG.setLevel(Level.FINE);
- }
+ LogFormatter.getLogger(Http.class.getName());
public Http() {
super(LOG);
}
+ public void setConf(NutchConf conf) {
+ super.setConf(conf);
+ Level logLevel = Level.WARNING;
+ if (conf.getBoolean("http.verbose", false)) {
+ logLevel = Level.FINE;
+ }
+ LOG.setLevel(logLevel);
+ }
+
public static void main(String[] args) throws Exception {
main(new Http(), args);
}
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
throws ProtocolException, IOException {
- return new HttpResponse(url, datum);
+ return new HttpResponse(this, url, datum);
}
}
Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Tue Jan 31 08:08:58 2006
@@ -35,13 +35,15 @@
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
import org.apache.nutch.protocol.http.api.HttpException;
import org.apache.nutch.util.GZIPUtils;
/** An HTTP response. */
public class HttpResponse implements Response {
-
+
+ private HttpBase http;
private URL url;
private String orig;
private String base;
@@ -50,9 +52,10 @@
private ContentProperties headers = new ContentProperties();
- public HttpResponse(URL url, CrawlDatum datum)
+ public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
throws ProtocolException, IOException {
+ this.http = http;
this.url = url;
this.orig = url.toString();
this.base = url.toString();
@@ -83,20 +86,20 @@
try {
socket = new Socket(); // create the socket
- socket.setSoTimeout(Http.TIMEOUT);
+ socket.setSoTimeout(http.getTimeout());
// connect
- String sockHost = Http.PROXY ? Http.PROXY_HOST : host;
- int sockPort = Http.PROXY ? Http.PROXY_PORT : port;
+ String sockHost = http.useProxy() ? http.getProxyHost() : host;
+ int sockPort = http.useProxy() ? http.getProxyPort() : port;
InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort);
- socket.connect(sockAddr, Http.TIMEOUT);
+ socket.connect(sockAddr, http.getTimeout());
// make request
OutputStream req = socket.getOutputStream();
StringBuffer reqStr = new StringBuffer("GET ");
- if(Http.PROXY){
+ if (http.useProxy()) {
reqStr.append(url.getProtocol()+"://"+host+portString+path);
} else {
reqStr.append(path);
@@ -111,11 +114,12 @@
reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
- if ((Http.AGENT_STRING == null) || (Http.AGENT_STRING.length() == 0)) {
+ String userAgent = http.getUserAgent();
+ if ((userAgent == null) || (userAgent.length() == 0)) {
Http.LOG.severe("User-agent is not set!");
} else {
reqStr.append("User-Agent: ");
- reqStr.append(Http.AGENT_STRING);
+ reqStr.append(userAgent);
reqStr.append("\r\n");
}
@@ -148,7 +152,7 @@
Http.LOG.fine("uncompressing....");
byte[] compressed = content;
- content = GZIPUtils.unzipBestEffort(compressed, Http.MAX_CONTENT);
+ content = GZIPUtils.unzipBestEffort(compressed, http.getMaxContent());
if (content == null)
throw new HttpException("unzipBestEffort returned null");
@@ -212,9 +216,9 @@
throw new HttpException("bad content length: "+contentLengthString);
}
}
- if (Http.MAX_CONTENT >= 0
- && contentLength > Http.MAX_CONTENT) // limit download size
- contentLength = Http.MAX_CONTENT;
+ if (http.getMaxContent() >= 0
+ && contentLength > http.getMaxContent()) // limit download size
+ contentLength = http.getMaxContent();
ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
byte[] bytes = new byte[Http.BUFFER_SIZE];
@@ -265,8 +269,8 @@
break;
}
- if ( (contentBytesRead + chunkLen) > Http.MAX_CONTENT )
- chunkLen= Http.MAX_CONTENT - contentBytesRead;
+ if ( (contentBytesRead + chunkLen) > http.getMaxContent() )
+ chunkLen= http.getMaxContent() - contentBytesRead;
// read one chunk
int chunkBytesRead= 0;
@@ -295,7 +299,7 @@
}
if (!doneChunks) {
- if (contentBytesRead != Http.MAX_CONTENT)
+ if (contentBytesRead != http.getMaxContent())
throw new HttpException("chunk eof: !doneChunk && didn't max out");
return;
}
Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Tue Jan 31 08:08:58 2006
@@ -46,70 +46,70 @@
public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.net.Http");
- static {
- if (NutchConf.get().getBoolean("http.verbose", false)) {
- LOG.setLevel(Level.FINE);
- } else { // shush about redirects
- Logger.getLogger("org.apache.commons.httpclient.HttpMethodDirector")
- .setLevel(Level.WARNING);
- }
- }
-
private static MultiThreadedHttpConnectionManager connectionManager =
new MultiThreadedHttpConnectionManager();
-
- private static HttpClient client;
+
+ // Since the NutchConf has not been set yet,
+ // an unconfigured client is returned.
+ private static HttpClient client = new HttpClient(connectionManager);
static synchronized HttpClient getClient() {
- if (client != null) return client;
- configureClient();
return client;
}
- static int MAX_THREADS_TOTAL = NutchConf.get().getInt("fetcher.threads.fetch", 10);
- static String NTLM_USERNAME = NutchConf.get().get("http.auth.ntlm.username", "");
- static String NTLM_PASSWORD = NutchConf.get().get("http.auth.ntlm.password", "");
- static String NTLM_DOMAIN = NutchConf.get().get("http.auth.ntlm.domain", "");
- static String NTLM_HOST = NutchConf.get().get("http.auth.ntlm.host", "");
-
- static {
- LOG.info("http.auth.ntlm.username = " + NTLM_USERNAME);
- }
-
+ boolean verbose = false;
+ int maxThreadsTotal = 10;
+ String ntlmUsername = "";
+ String ntlmPassword = "";
+ String ntlmDomain = "";
+ String ntlmHost = "";
public Http() {
super(LOG);
}
+ public void setConf(NutchConf conf) {
+ super.setConf(conf);
+ this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
+ this.ntlmUsername = conf.get("http.auth.ntlm.username", "");
+ this.ntlmPassword = conf.get("http.auth.ntlm.password", "");
+ this.ntlmDomain = conf.get("http.auth.ntlm.domain", "");
+ this.ntlmHost = conf.get("http.auth.ntlm.host", "");
+ Level logLevel = Level.WARNING;
+ if (conf.getBoolean("http.verbose", false)) {
+ logLevel = Level.FINE;
+ }
+ LOG.setLevel(logLevel);
+ Logger.getLogger("org.apache.commons.httpclient.HttpMethodDirector")
+ .setLevel(logLevel);
+ configureClient();
+ }
+
public static void main(String[] args) throws Exception {
main(new Http(), args);
}
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
throws ProtocolException, IOException {
- return new HttpResponse(url, datum, redirect);
+ return new HttpResponse(this, url, datum, redirect);
}
- private static void configureClient() {
-
- // get a client isntance -- we just need one.
-
- client = new HttpClient(connectionManager);
+ private void configureClient() {
// Set up an HTTPS socket factory that accepts self-signed certs.
Protocol dummyhttps = new Protocol("https", new DummySSLProtocolSocketFactory(), 443);
Protocol.registerProtocol("https", dummyhttps);
HttpConnectionManagerParams params = connectionManager.getParams();
- params.setConnectionTimeout(TIMEOUT);
- params.setSoTimeout(TIMEOUT);
+ params.setConnectionTimeout(timeout);
+ params.setSoTimeout(timeout);
params.setSendBufferSize(BUFFER_SIZE);
params.setReceiveBufferSize(BUFFER_SIZE);
- params.setMaxTotalConnections(MAX_THREADS_TOTAL);
- if (MAX_THREADS_TOTAL > MAX_THREADS_PER_HOST) {
- params.setDefaultMaxConnectionsPerHost(MAX_THREADS_PER_HOST);
+ params.setMaxTotalConnections(maxThreadsTotal);
+ if (maxThreadsTotal > maxThreadsPerHost) {
+ params.setDefaultMaxConnectionsPerHost(maxThreadsPerHost);
} else {
- params.setDefaultMaxConnectionsPerHost(MAX_THREADS_TOTAL);
+ params.setDefaultMaxConnectionsPerHost(maxThreadsTotal);
}
HostConfiguration hostConf = client.getHostConfiguration();
@@ -122,15 +122,15 @@
headers.add(new Header("Accept",
"text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
hostConf.getParams().setParameter("http.default-headers", headers);
- if (PROXY) {
- hostConf.setProxy(PROXY_HOST, PROXY_PORT);
+ if (useProxy) {
+ hostConf.setProxy(proxyHost, proxyPort);
}
- if (NTLM_USERNAME.length() > 0) {
- Credentials ntCreds = new NTCredentials(NTLM_USERNAME, NTLM_PASSWORD, NTLM_HOST, NTLM_DOMAIN);
- client.getState().setCredentials(new AuthScope(NTLM_HOST, AuthScope.ANY_PORT), ntCreds);
+ if (ntlmUsername.length() > 0) {
+ Credentials ntCreds = new NTCredentials(ntlmUsername, ntlmPassword, ntlmHost, ntlmDomain);
+ client.getState().setCredentials(new AuthScope(ntlmHost, AuthScope.ANY_PORT), ntCreds);
- LOG.info("Added NTLM credentials for " + NTLM_USERNAME);
+ LOG.info("Added NTLM credentials for " + ntlmUsername);
}
LOG.info("Configured Client");
}
-}
\ No newline at end of file
+}
Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java Tue Jan 31 08:08:58 2006
@@ -15,6 +15,8 @@
import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.NutchConfigurable;
+
/**
* Provides the Http protocol implementation
@@ -28,25 +30,50 @@
*
* @author Matt Tencati
*/
-public class HttpAuthenticationFactory {
- /** The HTTP Authentication (WWW-Authenticate) header which is returned
+public class HttpAuthenticationFactory implements NutchConfigurable {
+
+ /**
+ * The HTTP Authentication (WWW-Authenticate) header which is returned
* by a webserver requiring authentication.
*/
public static final String AUTH_HEADER = "WWW-Authenticate";
- public static final Logger LOG =
- LogFormatter.getLogger("net.nutch.protocol.http.HttpAuthenticationFactory");
+ public static final Logger LOG =
+ LogFormatter.getLogger(HttpAuthenticationFactory.class.getName());
- static {
- if (NutchConf.get().getBoolean("http.auth.verbose", false))
- LOG.setLevel(Level.FINE);
- }
-
private static Map auths = new TreeMap();
+
+ private NutchConf conf = null;
- private HttpAuthenticationFactory() { }
- public static HttpAuthentication findAuthentication(ContentProperties header) {
+ public HttpAuthenticationFactory(NutchConf conf) {
+ setConf(conf);
+ }
+
+
+ /* ---------------------------------- *
+ * <implementation:NutchConfigurable> *
+ * ---------------------------------- */
+
+ public void setConf(NutchConf conf) {
+ this.conf = conf;
+ if (conf.getBoolean("http.auth.verbose", false)) {
+ LOG.setLevel(Level.FINE);
+ } else {
+ LOG.setLevel(Level.WARNING);
+ }
+ }
+
+ public NutchConf getConf() {
+ return conf;
+ }
+
+ /* ----------------------------------- *
+ * </implementation:NutchConfigurable> *
+ * ----------------------------------- */
+
+
+ public HttpAuthentication findAuthentication(ContentProperties header) {
if (header == null) return null;
try {
@@ -80,7 +107,7 @@
}
LOG.fine("Checking challengeString=" + challengeString);
- auth = HttpBasicAuthentication.getAuthentication(challengeString);
+ auth = HttpBasicAuthentication.getAuthentication(challengeString, conf);
if (auth != null) return auth;
//TODO Add additional Authentication lookups here
Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java Tue Jan 31 08:08:58 2006
@@ -16,6 +16,7 @@
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.NutchConfigurable;
/**
* Implementation of RFC 2617 Basic Authentication. Usernames and passwords are stored
@@ -25,23 +26,21 @@
*
* @author Matt Tencati
*/
-public class HttpBasicAuthentication implements HttpAuthentication {
- public static final Logger LOG =
- LogFormatter.getLogger("net.nutch.net.HttpBasicAuthentication");
-
- static {
- if (NutchConf.get().getBoolean("http.auth.verbose", false))
- LOG.setLevel(Level.FINE);
- }
+public class HttpBasicAuthentication implements HttpAuthentication, NutchConfigurable {
- private static Pattern basic = Pattern.compile("[bB][aA][sS][iI][cC] [rR][eE][aA][lL][mM]=\"(\\w*)\"");
+ public static final Logger LOG =
+ LogFormatter.getLogger(HttpBasicAuthentication.class.getName());
+
+ private static Pattern basic = Pattern.compile("[bB][aA][sS][iI][cC] [rR][eE][aA][lL][mM]=\"(\\w*)\"");
private static Map authMap = new TreeMap();
-
+
+ private NutchConf conf = null;
private String challenge = null;
private ArrayList credentials = null;
private String realm = null;
+
/**
* Construct an HttpBasicAuthentication for the given challenge
* parameters. The challenge parameters are returned by the web
@@ -50,14 +49,16 @@
*
* @param challenge WWW-Authenticate header from web server
*/
- protected HttpBasicAuthentication(String challenge) throws HttpAuthenticationException {
+ protected HttpBasicAuthentication(String challenge, NutchConf nutchConf) throws HttpAuthenticationException {
+
+ setConf(nutchConf);
this.challenge = challenge;
LOG.fine("BasicAuthentication challenge is " + challenge);
credentials = new ArrayList();
- String username = NutchConf.get().get("http.auth.basic." + challenge + ".user");
+ String username = this.conf.get("http.auth.basic." + challenge + ".user");
LOG.fine("BasicAuthentication username=" + username);
- String password = NutchConf.get().get("http.auth.basic." + challenge + ".password");
+ String password = this.conf.get("http.auth.basic." + challenge + ".password");
LOG.fine("BasicAuthentication password=" + password);
if (username == null) {
@@ -73,6 +74,29 @@
LOG.fine("Basic credentials: " + credentials);
}
+
+ /* ---------------------------------- *
+ * <implementation:NutchConfigurable> *
+ * ---------------------------------- */
+
+ public void setConf(NutchConf conf) {
+ this.conf = conf;
+ if (conf.getBoolean("http.auth.verbose", false)) {
+ LOG.setLevel(Level.FINE);
+ } else {
+ LOG.setLevel(Level.WARNING);
+ }
+ }
+
+ public NutchConf getConf() {
+ return this.conf;
+ }
+
+ /* ----------------------------------- *
+ * </implementation:NutchConfigurable> *
+ * ----------------------------------- */
+
+
/**
* Gets the Basic credentials generated by this
* HttpBasicAuthentication object
@@ -105,7 +129,7 @@
* @return An HttpBasicAuthentication object or null
* if unable to generate appropriate credentials.
*/
- public static HttpBasicAuthentication getAuthentication(String challenge) {
+ public static HttpBasicAuthentication getAuthentication(String challenge, NutchConf conf) {
if (challenge == null) return null;
Matcher basicMatcher = basic.matcher(challenge);
if (basicMatcher.matches()) {
@@ -114,7 +138,7 @@
if (auth == null) {
HttpBasicAuthentication newAuth = null;
try {
- newAuth = new HttpBasicAuthentication(realm);
+ newAuth = new HttpBasicAuthentication(realm, conf);
} catch (HttpAuthenticationException hae) {
LOG.fine("HttpBasicAuthentication failed for " + challenge);
}