You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/01/31 17:13:17 UTC

svn commit: r373853 [5/6] - in /lucene/nutch/trunk/src: java/org/apache/nutch/analysis/ java/org/apache/nutch/clustering/ java/org/apache/nutch/crawl/ java/org/apache/nutch/fetcher/ java/org/apache/nutch/fs/ java/org/apache/nutch/indexer/ java/org/apac...

Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Tue Jan 31 08:08:58 2006
@@ -28,6 +28,7 @@
 
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.CommandRunner;
+import org.apache.nutch.util.NutchConf;
 
 import org.apache.nutch.plugin.Extension;
 import org.apache.nutch.plugin.PluginRepository;
@@ -55,39 +56,13 @@
   static final int TIMEOUT_DEFAULT = 30; // in seconds
 
   // handy map from String contentType to String[] {command, timeoutString}
-  static Hashtable TYPE_PARAMS_MAP = new Hashtable();
+  Hashtable TYPE_PARAMS_MAP = new Hashtable();
 
-  // set TYPE_PARAMS_MAP using plugin.xml of this plugin
-  static {
-    Extension[] extensions = PluginRepository.getInstance()
-      .getExtensionPoint("org.apache.nutch.parse.Parser").getExtensions();
+  private NutchConf nutchConf;  
 
-    String contentType, command, timeoutString;
-
-    for (int i = 0; i < extensions.length; i++) {
-      Extension extension = extensions[i];
-
-      // only look for extensions defined by plugin parse-ext
-      if (!extension.getDescriptor().getPluginId().equals("parse-ext"))
-        continue;
-
-      contentType = extension.getAttribute("contentType");
-      if (contentType == null || contentType.equals(""))
-        continue;
+  private boolean loaded = false;
 
-      command = extension.getAttribute("command");
-      if (command == null || command.equals(""))
-        continue;
-
-      timeoutString = extension.getAttribute("timeout");
-      if (timeoutString == null || timeoutString.equals(""))
-        timeoutString = "" + TIMEOUT_DEFAULT;
-
-      TYPE_PARAMS_MAP.put(contentType, new String[]{command, timeoutString});
-    }
-  }
-
-  public ExtParser () {}
+  public ExtParser () { }
 
   public Parse getParse(Content content) {
 
@@ -96,7 +71,7 @@
     String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
     if (params == null)
       return new ParseStatus(ParseStatus.FAILED,
-                      "No external command defined for contentType: " + contentType).getEmptyParse();
+                      "No external command defined for contentType: " + contentType).getEmptyParse(getConf());
 
     String command = params[0];
     int timeout = Integer.parseInt(params[1]);
@@ -118,7 +93,7 @@
           return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                 "Content truncated at " + raw.length
             +" bytes. Parser can't handle incomplete "
-            + contentType + " file.").getEmptyParse();
+            + contentType + " file.").getEmptyParse(getConf());
       }
 
       ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
@@ -138,12 +113,12 @@
       if (cr.getExitValue() != 0)
         return new ParseStatus(ParseStatus.FAILED,
                         "External command " + command
-                        + " failed with error: " + es.toString()).getEmptyParse();
+                        + " failed with error: " + es.toString()).getEmptyParse(getConf());
 
       text = os.toString();
 
     } catch (Exception e) { // run time exception
-      return new ParseStatus(e).getEmptyParse();
+      return new ParseStatus(e).getEmptyParse(getConf());
     }
 
     if (text == null)
@@ -153,14 +128,48 @@
       title = "";
 
     // collect outlink
-    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
 
     // collect meta data
     ContentProperties metaData = new ContentProperties();
     metaData.putAll(content.getMetadata()); // copy through
 
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
+    parseData.setConf(this.nutchConf);
     return new ParseImpl(text, parseData);
   }
+  
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+    Extension[] extensions = conf.getPluginRepository().getExtensionPoint(
+        "org.apache.nutch.parse.Parser").getExtensions();
+
+    String contentType, command, timeoutString;
+
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+
+      // only look for extensions defined by plugin parse-ext
+      if (!extension.getDescriptor().getPluginId().equals("parse-ext"))
+        continue;
+
+      contentType = extension.getAttribute("contentType");
+      if (contentType == null || contentType.equals(""))
+        continue;
+
+      command = extension.getAttribute("command");
+      if (command == null || command.equals(""))
+        continue;
+
+      timeoutString = extension.getAttribute("timeout");
+      if (timeoutString == null || timeoutString.equals(""))
+        timeoutString = "" + TIMEOUT_DEFAULT;
+
+      TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString });
+    }
+  }
 
+  public NutchConf getConf() {
+    return this.nutchConf;
+  }
 }

Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java Tue Jan 31 08:08:58 2006
@@ -24,6 +24,7 @@
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.util.NutchConf;
 
 import org.apache.nutch.io.UTF8;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -79,7 +80,7 @@
     fos.close();
 
     // get nutch content
-    Protocol protocol = ProtocolFactory.getProtocol(urlString);
+    Protocol protocol = new ProtocolFactory(new NutchConf()).getProtocol(urlString);
     content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
     protocol = null;
   }
@@ -103,18 +104,19 @@
       return;
     }
 
+    NutchConf nutchConf = new NutchConf();
     // loop alternately, total 10*2 times of invoking external command
     for (int i=0; i<10; i++) {
       // check external parser that does 'cat'
       contentType = "application/vnd.nutch.example.cat";
       content.setContentType(contentType);
-      parse = ParseUtil.parseByParserId("parse-ext", content);
+      parse = new ParseUtil(nutchConf).parseByParserId("parse-ext", content);
       assertEquals(expectedText,parse.getText());
 
       // check external parser that does 'md5sum'
       contentType = "application/vnd.nutch.example.md5sum";
       content.setContentType(contentType);
-      parse = ParseUtil.parseByParserId("parse-ext", content);
+      parse = new ParseUtil(nutchConf).parseByParserId("parse-ext", content);
       assertTrue(parse.getText().startsWith(expectedMD5sum));
     }
   }

Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Tue Jan 31 08:08:58 2006
@@ -22,6 +22,7 @@
 import java.util.HashMap;
 
 import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NutchConf;
 
 import org.w3c.dom.*;
 
@@ -286,7 +287,7 @@
    * nekohtml).
    */
   public static final void getOutlinks(URL base, ArrayList outlinks, 
-                                       Node node) {
+                                       Node node, NutchConf nutchConf) {
 
     NodeList children = node.getChildNodes();
     int childLen= 0;
@@ -322,7 +323,7 @@
             try {
               URL url = new URL(base, target);
               outlinks.add(new Outlink(url.toString(),
-                                       linkText.toString().trim()));
+                                       linkText.toString().trim(), nutchConf));
             } catch (MalformedURLException e) {
               // don't care
             }
@@ -332,7 +333,7 @@
       }
     }
     for ( int i = 0; i < childLen; i++ ) {
-      getOutlinks(base, outlinks, children.item(i));
+      getOutlinks(base, outlinks, children.item(i), nutchConf);
     }
   }
 

Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Tue Jan 31 08:08:58 2006
@@ -51,7 +51,7 @@
     Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
                     Pattern.CASE_INSENSITIVE);
   
-  private static String parserImpl = NutchConf.get().get("parser.html.impl", "neko");
+  private String parserImpl;
 
   /**
    * Given a <code>byte[]</code> representing an html file of an 
@@ -91,8 +91,11 @@
   }
 
 
-  private static String defaultCharEncoding =
-    NutchConf.get().get("parser.character.encoding.default", "windows-1252");
+  private String defaultCharEncoding;
+
+  private NutchConf nutchConf;
+
+  private HtmlParseFilters htmlParseFilters;
 
   public Parse getParse(Content content) {
     HTMLMetaTags metaTags = new HTMLMetaTags();
@@ -101,7 +104,7 @@
     try {
       base = new URL(content.getBaseUrl());
     } catch (MalformedURLException e) {
-      return new ParseStatus(e).getEmptyParse();
+      return new ParseStatus(e).getEmptyParse(getConf());
     }
 
     String text = "";
@@ -151,14 +154,14 @@
       LOG.fine("Parsing...");
       root = parse(input);
     } catch (IOException e) {
-      return new ParseStatus(e).getEmptyParse();
+      return new ParseStatus(e).getEmptyParse(getConf());
     } catch (DOMException e) {
-      return new ParseStatus(e).getEmptyParse();
+      return new ParseStatus(e).getEmptyParse(getConf());
     } catch (SAXException e) {
-      return new ParseStatus(e).getEmptyParse();
+      return new ParseStatus(e).getEmptyParse(getConf());
     } catch (Exception e) {
       e.printStackTrace();
-      return new ParseStatus(e).getEmptyParse();
+      return new ParseStatus(e).getEmptyParse(getConf());
     }
       
     // get meta directives
@@ -180,7 +183,7 @@
       ArrayList l = new ArrayList();              // extract outlinks
       URL baseTag = DOMContentUtils.getBase(root);
       LOG.fine("Getting links...");
-      DOMContentUtils.getOutlinks(baseTag!=null?baseTag:base, l, root);
+      DOMContentUtils.getOutlinks(baseTag!=null?baseTag:base, l, root, getConf());
       outlinks = (Outlink[])l.toArray(new Outlink[l.size()]);
       LOG.fine("found "+outlinks.length+" outlinks in "+content.getUrl());
     }
@@ -197,10 +200,11 @@
       status.setMessage(metaTags.getRefreshHref().toString());
     }
     ParseData parseData = new ParseData(status, title, outlinks, metadata);
+    parseData.setConf(this.nutchConf);
     Parse parse = new ParseImpl(text, parseData);
 
     // run filters on parse
-    return HtmlParseFilters.filter(content, parse, metaTags, root);
+    return this.htmlParseFilters.filter(content, parse, metaTags, root);
   }
 
   private DocumentFragment parse(InputSource input) throws Exception {
@@ -267,10 +271,22 @@
     in.readFully(bytes);
     Parse parse = new HtmlParser().getParse(new Content(url,url,
                                                         bytes,"text/html",
-                                                        new ContentProperties()));
+                                                        new ContentProperties(), new NutchConf()));
     System.out.println("data: "+parse.getData());
 
     System.out.println("text: "+parse.getText());
     
+  }
+
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+    this.htmlParseFilters = new HtmlParseFilters(getConf());
+    this.parserImpl = getConf().get("parser.html.impl", "neko");
+    this.defaultCharEncoding = getConf().get(
+        "parser.character.encoding.default", "windows-1252");
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
   }
 }

Modified: lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Tue Jan 31 08:08:58 2006
@@ -19,6 +19,7 @@
 import junit.framework.TestCase;
 
 import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NutchConf;
 
 import java.io.ByteArrayInputStream;
 import java.net.MalformedURLException;
@@ -174,6 +175,7 @@
   }
 
   private static void setup() {
+    NutchConf nutchConf = new NutchConf();
     DOMFragmentParser parser= new DOMFragmentParser();
     for (int i= 0; i < testPages.length; i++) {
         DocumentFragment node= 
@@ -192,36 +194,36 @@
     try {
      answerOutlinks = new Outlink[][]{ 
          {
-           new Outlink("http://www.nutch.org", "anchor"),
+           new Outlink("http://www.nutch.org", "anchor", nutchConf),
          },
          {
-           new Outlink("http://www.nutch.org/", "home"),
-           new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
+           new Outlink("http://www.nutch.org/", "home", nutchConf),
+           new Outlink("http://www.nutch.org/docs/bot.html", "bots", nutchConf),
          },
          {
-           new Outlink("http://www.nutch.org/", "separate this"),
-           new Outlink("http://www.nutch.org/docs/ok", "from this"),
+           new Outlink("http://www.nutch.org/", "separate this", nutchConf),
+           new Outlink("http://www.nutch.org/docs/ok", "from this", nutchConf),
          },
          {
-           new Outlink("http://www.nutch.org/", "home"),
-           new Outlink("http://www.nutch.org/docs/1", "1"),
-           new Outlink("http://www.nutch.org/docs/2", "2"),
+           new Outlink("http://www.nutch.org/", "home", nutchConf),
+           new Outlink("http://www.nutch.org/docs/1", "1", nutchConf),
+           new Outlink("http://www.nutch.org/docs/2", "2", nutchConf),
          },
          {
-           new Outlink("http://www.nutch.org/frames/top.html", ""),
-           new Outlink("http://www.nutch.org/frames/left.html", ""),
-           new Outlink("http://www.nutch.org/frames/invalid.html", ""),
-           new Outlink("http://www.nutch.org/frames/right.html", ""),
+           new Outlink("http://www.nutch.org/frames/top.html", "", nutchConf),
+           new Outlink("http://www.nutch.org/frames/left.html", "", nutchConf),
+           new Outlink("http://www.nutch.org/frames/invalid.html", "", nutchConf),
+           new Outlink("http://www.nutch.org/frames/right.html", "", nutchConf),
          },
          {
-           new Outlink("http://www.nutch.org/maps/logo.gif", ""),
-           new Outlink("http://www.nutch.org/index.html", ""),
-           new Outlink("http://www.nutch.org/maps/#bottom", ""),
-           new Outlink("http://www.nutch.org/bot.html", ""),
-           new Outlink("http://www.nutch.org/docs/index.html", ""),
+           new Outlink("http://www.nutch.org/maps/logo.gif", "", nutchConf),
+           new Outlink("http://www.nutch.org/index.html", "", nutchConf),
+           new Outlink("http://www.nutch.org/maps/#bottom", "", nutchConf),
+           new Outlink("http://www.nutch.org/bot.html", "", nutchConf),
+           new Outlink("http://www.nutch.org/docs/index.html", "", nutchConf),
          },
          {
-             new Outlink("http://www.nutch.org/index.html", "whitespace test"),
+             new Outlink("http://www.nutch.org/index.html", "whitespace test", nutchConf),
          },
          {
          }
@@ -282,7 +284,7 @@
       setup();
     for (int i= 0; i < testPages.length; i++) {
       ArrayList outlinks= new ArrayList();
-      DOMContentUtils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
+      DOMContentUtils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i], new NutchConf());
       Outlink[] outlinkArr= new Outlink[outlinks.size()];
       outlinkArr= (Outlink[]) outlinks.toArray(outlinkArr);
       compareOutlinks(answerOutlinks[i], outlinkArr);

Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Tue Jan 31 08:08:58 2006
@@ -24,6 +24,7 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
 import org.apache.oro.text.regex.MatchResult;
 import org.apache.oro.text.regex.Pattern;
 import org.apache.oro.text.regex.PatternCompiler;
@@ -50,6 +51,8 @@
     LogFormatter.getLogger("org.apache.nutch.parse.js.JSParseFilter");
 
   private static final int MAX_TITLE_LEN = 80;
+
+  private NutchConf nutchConf;
   
   public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
     String url = content.getBaseUrl();
@@ -64,7 +67,9 @@
       ParseStatus status = parse.getData().getStatus();
       String text = parse.getText();
       Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
-      parse = new ParseImpl(text, new ParseData(status, title, newlinks, metadata));
+      ParseData parseData = new ParseData(status, title, newlinks, metadata);
+      parseData.setConf(this.nutchConf);
+      parse = new ParseImpl(text, parseData);
     }
     return parse;
   }
@@ -123,7 +128,7 @@
     String type = c.getContentType();
     if (type != null && !type.trim().equals("") && !type.toLowerCase().startsWith("application/x-javascript"))
       return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
-              "Content not JavaScript: '" + type + "'").getEmptyParse();
+              "Content not JavaScript: '" + type + "'").getEmptyParse(getConf());
     String script = new String(c.getContent());
     Outlink[] outlinks = getJSLinks(script, c.getUrl(), c.getUrl());
     if (outlinks == null) outlinks = new Outlink[0];
@@ -141,6 +146,7 @@
     metadata.putAll(c.getMetadata());
     ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title,
             outlinks, metadata);
+    pd.setConf(this.nutchConf);
     Parse parse = new ParseImpl(script, pd);
     return parse;
   }
@@ -154,7 +160,7 @@
   /**
    *  This method extracts URLs from literals embedded in JavaScript.
    */
-  private static Outlink[] getJSLinks(String plainText, String anchor, String base) {
+  private Outlink[] getJSLinks(String plainText, String anchor, String base) {
 
     final List outlinks = new ArrayList();
     URL baseURL = null;
@@ -195,7 +201,7 @@
         } else url = new URL(baseURL, url).toString();
         url = url.replaceAll("&amp;", "&");
         LOG.fine(" - outlink from JS: '" + url + "'");
-        outlinks.add(new Outlink(url, anchor));
+        outlinks.add(new Outlink(url, anchor, getConf()));
       }
     } catch (Exception ex) {
       // if it is a malformed URL we just throw it away and continue with
@@ -225,9 +231,19 @@
     StringBuffer sb = new StringBuffer();
     String line = null;
     while ((line = br.readLine()) != null) sb.append(line + "\n");
-    Outlink[] links = getJSLinks(sb.toString(), args[1], args[1]);
+    JSParseFilter parseFilter = new JSParseFilter();
+    parseFilter.setConf(new NutchConf());
+    Outlink[] links = parseFilter.getJSLinks(sb.toString(), args[1], args[1]);
     System.out.println("Outlinks extracted: " + links.length);
     for (int i = 0; i < links.length; i++)
       System.out.println(" - " + links[i]);
+  }
+
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
   }
 }

Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java Tue Jan 31 08:08:58 2006
@@ -19,6 +19,7 @@
 
 import org.apache.nutch.parse.*;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConf;
 import org.farng.mp3.MP3File;
 import org.farng.mp3.TagException;
 import org.farng.mp3.id3.AbstractID3v2;
@@ -39,7 +40,8 @@
 
 public class MP3Parser implements Parser {
 
-  private MetadataCollector metadataCollector = new MetadataCollector();
+  private MetadataCollector metadataCollector;
+  private NutchConf nutchConf;
 
   public Parse getParse(Content content) throws ParseException {
     Parse parse = null;
@@ -84,7 +86,7 @@
     metadataCollector.notifyProperty("TYER-Text", tag.getYear());
     ParseData parseData = new ParseData(metadataCollector.getTitle(),
         metadataCollector.getOutlinks(),
-        metadataCollector.getData());
+        metadataCollector.getData(), getConf());
     return new ParseImpl(metadataCollector.getText(), parseData);
   }
 
@@ -113,4 +115,12 @@
   }
 
 
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+    this.metadataCollector = new MetadataCollector(conf);
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
+  }
 }

Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java Tue Jan 31 08:08:58 2006
@@ -17,6 +17,7 @@
 package org.apache.nutch.parse.mp3;
 
 import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NutchConf;
 
 import java.net.MalformedURLException;
 import java.util.ArrayList;
@@ -34,7 +35,12 @@
   private String album = null;
   private ArrayList links = new ArrayList();
   private String text = "";
+  private NutchConf nutchConf;
 
+  public MetadataCollector(NutchConf nutchConf) {
+      this.nutchConf = nutchConf;
+  }
+  
   public void notifyProperty(String name, String value) throws MalformedURLException {
     if (name.equals("TIT2-Text"))
       setTitle(value);
@@ -44,7 +50,7 @@
       setArtist(value);
 
     if (name.indexOf("URL Link") > -1) {
-      links.add(new Outlink(value, ""));
+      links.add(new Outlink(value, "", this.nutchConf));
     } else if (name.indexOf("Text") > -1) {
       text += value + "\n";
     }

Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java Tue Jan 31 08:08:58 2006
@@ -32,6 +32,7 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
 
 /**
  * Nutch-Parser for parsing MS PowerPoint slides ( mime type:
@@ -51,6 +52,8 @@
   private static final Logger LOG = LogFormatter
       .getLogger(MSPowerPointParser.class.getName());
 
+  private NutchConf nutchConf;
+
   /**
    * 
    */
@@ -77,7 +80,7 @@
     ContentProperties prop = new ContentProperties();
     prop.setProperty("Content-Length", "" + raw.length);
 
-    Content content = new Content(file, file, raw, MIME_TYPE, prop);
+    Content content = new Content(file, file, raw, MIME_TYPE, prop, new NutchConf());
 
     System.out.println(ppe.getParse(content).getText());
   }
@@ -106,7 +109,7 @@
                 + raw.length
                 + " bytes. Please increase <protocol>.content.limit at nutch-default.xml. "
                 + "Parser can't handle incomplete PowerPoint files.")
-            .getEmptyParse();
+            .getEmptyParse(getConf());
       }
 
       final PPTExtractor extractor = new PPTExtractor(new ByteArrayInputStream(
@@ -114,11 +117,11 @@
 
       plainText = extractor.getText();
       properties = extractor.getProperties();
-      outlinks = OutlinkExtractor.getOutlinks(plainText, content.getUrl());
+      outlinks = OutlinkExtractor.getOutlinks(plainText, content.getUrl(), getConf());
 
     } catch (Exception e) {
       LOG.throwing(this.getClass().getName(), "getParse", e);
-      return new ParseStatus(e).getEmptyParse();
+      return new ParseStatus(e).getEmptyParse(getConf());
     }
 
     // collect meta data
@@ -141,6 +144,7 @@
 
     final ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
     final ParseData parseData = new ParseData(status, title, outlinks, metadata);
+    parseData.setConf(this.nutchConf);
 
     LOG.finest("PowerPoint file parsed sucessful.");
     return new ParseImpl(plainText, parseData);
@@ -160,5 +164,13 @@
       return null;
     }
 
+  }
+  
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
   }
 }

Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java Tue Jan 31 08:08:58 2006
@@ -34,6 +34,7 @@
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
 
 import org.apache.nutch.io.UTF8;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -105,7 +106,7 @@
     this.urlString = createUrl(this.testFile.getName());
 
     System.out.println("Testing file: " + this.urlString + "...");
-    this.protocol = ProtocolFactory.getProtocol(this.urlString);
+    this.protocol =new ProtocolFactory(new NutchConf()).getProtocol(this.urlString);
     this.content = this.protocol.getProtocolOutput(new UTF8(this.urlString), new CrawlDatum()).getContent();
   }
 
@@ -125,7 +126,7 @@
    */
   public void testContent() throws Exception {
 
-    Parse parse = ParseUtil.parseByParserId("parse-mspowerpoint",this.content);
+    Parse parse = new ParseUtil(new NutchConf()).parseByParserId("parse-mspowerpoint",this.content);
 
     ParseData data = parse.getData();
     String text = parse.getText();
@@ -162,7 +163,7 @@
    */
   public void testMeta() throws Exception {
 
-    Parse parse = ParseUtil.parseByParserId("parse-mspowerpoint",content);
+    Parse parse = new ParseUtil(new NutchConf()).parseByParserId("parse-mspowerpoint",content);
     
     ParseData data = parse.getData();
 

Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java Tue Jan 31 08:08:58 2006
@@ -19,6 +19,7 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
@@ -50,6 +51,8 @@
  */
 
 public class MSWordParser implements Parser {
+  private NutchConf nutchConf;
+
 //  public static final Logger LOG =
 //    LogFormatter.getLogger("org.apache.nutch.parse.msword");
 
@@ -70,7 +73,7 @@
             && raw.length != Integer.parseInt(contentLength)) {
           return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                   "Content truncated at " + raw.length
-            +" bytes. Parser can't handle incomplete msword file.").getEmptyParse();
+            +" bytes. Parser can't handle incomplete msword file.").getEmptyParse(this.nutchConf);
       }
 
       WordExtractor extractor = new WordExtractor();
@@ -84,14 +87,14 @@
       extractor = null;
 
     } catch (ParseException e) {
-      return new ParseStatus(e).getEmptyParse();
+      return new ParseStatus(e).getEmptyParse(this.nutchConf);
     } catch (FastSavedException e) {
-      return new ParseStatus(e).getEmptyParse();
+      return new ParseStatus(e).getEmptyParse(this.nutchConf);
     } catch (PasswordProtectedException e) {
-      return new ParseStatus(e).getEmptyParse();
+      return new ParseStatus(e).getEmptyParse(this.nutchConf);
     } catch (Exception e) { // run time exception
       return new ParseStatus(ParseStatus.FAILED,
-              "Can't be handled as msword document. " + e).getEmptyParse();
+              "Can't be handled as msword document. " + e).getEmptyParse(this.nutchConf);
     } finally {
       // nothing so far
     }
@@ -113,12 +116,21 @@
       title = "";
 
     // collect outlink
-    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, this.nutchConf);
 
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);
+    parseData.setConf(this.nutchConf);
     return new ParseImpl(text, parseData);
     // any filter?
     //return HtmlParseFilters.filter(content, parse, root);
+  }
+
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
   }
 
 }

Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Tue Jan 31 08:08:58 2006
@@ -24,6 +24,7 @@
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.util.NutchConf;
 
 import org.apache.nutch.io.UTF8;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -61,12 +62,13 @@
     Content content;
     Parse parse;
 
+    NutchConf nutchConf = new NutchConf();
     for (int i=0; i<sampleFiles.length; i++) {
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
-      protocol = ProtocolFactory.getProtocol(urlString);
+      protocol = new ProtocolFactory(nutchConf).getProtocol(urlString);
       content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
-      parse = ParseUtil.parseByParserId("parse-msword",content);
+      parse = new ParseUtil(nutchConf).parseByParserId("parse-msword",content);
 
       assertTrue(parse.getText().startsWith(expectedText));
     }

Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Tue Jan 31 08:08:58 2006
@@ -28,6 +28,7 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
@@ -62,6 +63,7 @@
 public class PdfParser implements Parser {
   public static final Logger LOG =
     LogFormatter.getLogger("org.apache.nutch.parse.pdf");
+  private NutchConf nutchConf;
 
   public PdfParser () {
     // redirect org.apache.log4j.Logger to java's native logger, in order
@@ -99,7 +101,7 @@
             && raw.length != Integer.parseInt(contentLength)) {
           return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                   "Content truncated at "+raw.length
-            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
+            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
       }
 
       PDFParser parser = new PDFParser(
@@ -134,13 +136,13 @@
 
     } catch (CryptographyException e) {
       return new ParseStatus(ParseStatus.FAILED,
-              "Error decrypting document. " + e).getEmptyParse();
+              "Error decrypting document. " + e).getEmptyParse(getConf());
     } catch (InvalidPasswordException e) {
       return new ParseStatus(ParseStatus.FAILED,
-              "Can't decrypt document - invalid password. " + e).getEmptyParse();
+              "Can't decrypt document - invalid password. " + e).getEmptyParse(getConf());
     } catch (Exception e) { // run time exception
       return new ParseStatus(ParseStatus.FAILED,
-              "Can't be handled as pdf document. " + e).getEmptyParse();
+              "Can't be handled as pdf document. " + e).getEmptyParse(getConf());
     } finally {
       try {
         if (pdf != null)
@@ -157,13 +159,14 @@
       title = "";
 
     // collect outlink
-    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
 
     // collect meta data
     ContentProperties metadata = new ContentProperties();
     metadata.putAll(content.getMetadata()); // copy through
 
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);
+    parseData.setConf(this.nutchConf);
     return new ParseImpl(text, parseData);
     // any filter?
     //return HtmlParseFilters.filter(content, parse, root);
@@ -178,6 +181,14 @@
       retval = formatter.format(date.getTime());
     }
     return retval;
+  }
+
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
   }
 
 }

Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java Tue Jan 31 08:08:58 2006
@@ -24,6 +24,7 @@
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.util.NutchConf;
 
 import org.apache.nutch.io.UTF8;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -64,9 +65,10 @@
     for (int i=0; i<sampleFiles.length; i++) {
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
-      protocol = ProtocolFactory.getProtocol(urlString);
+      NutchConf nutchConf = new NutchConf();
+      protocol = new ProtocolFactory(nutchConf).getProtocol(urlString);
       content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
-      parse = ParseUtil.parseByParserId("parse-pdf",content);
+      parse = new ParseUtil(nutchConf).parseByParserId("parse-pdf",content);
 
       int index = parse.getText().indexOf(expectedText);
       assertTrue(index > 0);

Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java Tue Jan 31 08:08:58 2006
@@ -18,6 +18,7 @@
 
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseStatus;
@@ -63,6 +64,7 @@
 public class RSSParser implements Parser {
     public static final Logger LOG = LogFormatter
             .getLogger("org.apache.nutch.parse.rss");
+    private NutchConf nutchConf;
 
     /**
      * <p>
@@ -122,7 +124,7 @@
             e.printStackTrace();
             LOG.fine("nutch:parse-rss:RSSParser Exception: " + e.getMessage());
             return new ParseStatus(ParseStatus.FAILED,
-                    "Can't be handled as rss document. " + e).getEmptyParse();
+                    "Can't be handled as rss document. " + e).getEmptyParse(getConf());
         }
 
         StringBuffer contentTitle = new StringBuffer(), indexText = new StringBuffer();
@@ -149,9 +151,9 @@
                     try {
                         // get the outlink
 			if (r.getDescription()!= null ) {
-			    theOutlinks.add(new Outlink(r.getLink(), r.getDescription()));
+			    theOutlinks.add(new Outlink(r.getLink(), r.getDescription(), getConf()));
 			} else {
-			    theOutlinks.add(new Outlink(r.getLink(), ""));
+			    theOutlinks.add(new Outlink(r.getLink(), "", getConf()));
 			}
                     } catch (MalformedURLException e) {
                         LOG.info("nutch:parse-rss:RSSParser Exception: MalformedURL: "
@@ -179,9 +181,9 @@
                     if (whichLink != null) {
                         try {
 			    if (theRSSItem.getDescription()!=null) {
-				theOutlinks.add(new Outlink(whichLink, theRSSItem.getDescription()));
+				theOutlinks.add(new Outlink(whichLink, theRSSItem.getDescription(), getConf()));
 			    } else {
-				theOutlinks.add(new Outlink(whichLink, ""));
+				theOutlinks.add(new Outlink(whichLink, "", getConf()));
 			    }
                         } catch (MalformedURLException e) {
                             LOG.info("nutch:parse-rss:RSSParser Exception: MalformedURL: "
@@ -211,7 +213,16 @@
 
         ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
                 contentTitle.toString(), outlinks, content.getMetadata());
+        parseData.setConf(this.nutchConf);
         return new ParseImpl(indexText.toString(), parseData);
     }
+
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
+  }
 
 }

Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java Tue Jan 31 08:08:58 2006
@@ -26,6 +26,7 @@
 import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NutchConf;
 
 import org.apache.nutch.io.UTF8;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -79,12 +80,13 @@
         Content content;
         Parse parse;
 
+        NutchConf nutchConf = new NutchConf();
         for (int i = 0; i < sampleFiles.length; i++) {
             urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
-            protocol = ProtocolFactory.getProtocol(urlString);
+            protocol = new ProtocolFactory(nutchConf).getProtocol(urlString);
             content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
-            parse = ParseUtil.parseByParserId("parse-rss",content);
+            parse = new ParseUtil(nutchConf).parseByParserId("parse-rss",content);
 
             //check that there are 3 outlinks:
             //http://test.channel.com

Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java Tue Jan 31 08:08:58 2006
@@ -17,9 +17,8 @@
 package org.apache.nutch.parse.rtf;
 
 import org.apache.nutch.parse.*;
-import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.protocol.Content;
-
+import org.apache.nutch.util.NutchConf;
 import java.io.ByteArrayInputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
@@ -29,10 +28,13 @@
 
 /**
  * A parser for RTF documents
+ * 
  * @author Andy Hedges
  */
 public class RTFParseFactory implements Parser {
 
+  private NutchConf nutchConf;
+
   public Parse getParse(Content content) throws ParseException {
     byte[] raw = content.getContent();
     Reader reader = new InputStreamReader(new ByteArrayInputStream(raw));
@@ -53,7 +55,7 @@
     metadata.putAll(delegate.getMetaData());
     String title = metadata.getProperty("title");
 
-    if(title != null){
+    if (title != null) {
       metadata.remove(title);
     } else {
       title = "";
@@ -61,11 +63,15 @@
 
     String text = delegate.getText();
 
-    return new ParseImpl(text, 
-                         new ParseData(title,
-                                       OutlinkExtractor.getOutlinks(text),
-                                       metadata));
+    return new ParseImpl(text, new ParseData(title, OutlinkExtractor
+        .getOutlinks(text, this.nutchConf), metadata));
   }
 
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+  }
 
+  public NutchConf getConf() {
+    return this.nutchConf;
+  }
 }

Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Tue Jan 31 08:08:58 2006
@@ -25,6 +25,7 @@
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConf;
 
 import java.util.Properties;
 
@@ -60,11 +61,12 @@
     Content content;
     Parse parse;
 
+    NutchConf nutchConf = new NutchConf();
     urlString = "file:" + sampleDir + fileSeparator + rtfFile;
-    protocol = ProtocolFactory.getProtocol(urlString);
+    protocol = new ProtocolFactory(nutchConf).getProtocol(urlString);
     content = protocol.getContent(urlString);
 
-    parse = ParseUtil.parseByParserId("parse-rtf",content);
+    parse = new ParseUtil(nutchConf).parseByParserId("parse-rtf", content);
     String text = parse.getText();
     assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
 

Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Tue Jan 31 08:08:58 2006
@@ -24,31 +24,42 @@
 import org.apache.nutch.util.*;
 
 public class TextParser implements Parser {
+  private NutchConf nutchConf;
+
   public Parse getParse(Content content) {
     // copy content meta data through
     ContentProperties metadata = new ContentProperties();
     metadata.putAll(content.getMetadata());
 
-    //ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], metadata);
+    // ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new
+    // Outlink[0], metadata);
 
-    String encoding =
-      StringUtil.parseCharacterEncoding(content.getContentType());
+    String encoding = StringUtil.parseCharacterEncoding(content
+        .getContentType());
     String text;
-    if (encoding != null) {                       // found an encoding header
-      try {                                       // try to use named encoding
+    if (encoding != null) { // found an encoding header
+      try { // try to use named encoding
         text = new String(content.getContent(), encoding);
       } catch (java.io.UnsupportedEncodingException e) {
-        return new ParseStatus(e).getEmptyParse();
+        return new ParseStatus(e).getEmptyParse(getConf());
       }
     } else {
-      // FIXME: implement charset detector. This code causes problem when 
-      //        character set isn't specified in HTTP header.
-      text = new String(content.getContent());    // use default encoding
+      // FIXME: implement charset detector. This code causes problem when
+      // character set isn't specified in HTTP header.
+      text = new String(content.getContent()); // use default encoding
     }
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "",
+        OutlinkExtractor.getOutlinks(text, getConf()), metadata);
+    parseData.setConf(this.nutchConf);
+    return new ParseImpl(text, parseData);
+    
+  }
+
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+  }
 
-    return new ParseImpl(text,
-                         new ParseData(ParseStatus.STATUS_SUCCESS, "",
-                                       OutlinkExtractor.getOutlinks(text),
-                                       metadata));
+  public NutchConf getConf() {
+    return this.nutchConf;
   }
 }

Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Tue Jan 31 08:08:58 2006
@@ -33,73 +33,87 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
 
 /**
  * ZipParser class based on MSPowerPointParser class by Stephan Strittmatter.
  * Nutch parse plugin for zip files - Content Type : application/zip
+ * 
  * @author Rohit Kulkarni & Ashish Vaidya
  */
-public class ZipParser implements Parser{
-    
-    private static final Logger LOG = LogFormatter.getLogger(ZipParser.class.getName());
-    /** Creates a new instance of ZipParser */
-    public ZipParser() {
+public class ZipParser implements Parser {
+
+  private static final Logger LOG = LogFormatter.getLogger(ZipParser.class
+      .getName());
+  private NutchConf nutchConf;
+
+  /** Creates a new instance of ZipParser */
+  public ZipParser() {
+  }
+
+  public Parse getParse(final Content content) {
+
+    String resultText = null;
+    String resultTitle = null;
+    Outlink[] outlinks = null;
+    List outLinksList = new ArrayList();
+    Properties properties = null;
+
+    try {
+      final String contentLen = content.get("Content-Length");
+      final int len = Integer.parseInt(contentLen);
+      System.out.println("ziplen: " + len);
+      final byte[] contentInBytes = content.getContent();
+      final ByteArrayInputStream bainput = new ByteArrayInputStream(
+          contentInBytes);
+      final InputStream input = bainput;
+
+      if (contentLen != null && contentInBytes.length != len) {
+        return new ParseStatus(ParseStatus.FAILED,
+            ParseStatus.FAILED_TRUNCATED, "Content truncated at "
+                + contentInBytes.length
+                + " bytes. Parser can't handle incomplete pdf file.")
+            .getEmptyParse(getConf());
+      }
+
+      ZipTextExtractor extractor = new ZipTextExtractor(getConf());
+
+      // extract text
+      resultText = extractor.extractText(new ByteArrayInputStream(
+          contentInBytes), content.getUrl(), outLinksList);
+
+    } catch (Exception e) {
+      return new ParseStatus(ParseStatus.FAILED,
+          "Can't be handled as Zip document. " + e).getEmptyParse(getConf());
     }
-    
-    public Parse getParse(final Content content) {
-        
-        String resultText = null;
-        String resultTitle = null;
-        Outlink[] outlinks = null;
-        List outLinksList = new ArrayList();
-	Properties properties = null;
-        
-        try {
-            final String contentLen = content.get("Content-Length");
-            final int len = Integer.parseInt(contentLen);
-            System.out.println("ziplen: " + len);
-            final byte[] contentInBytes = content.getContent();
-            final ByteArrayInputStream bainput = new ByteArrayInputStream(contentInBytes);
-            final InputStream input = bainput;
-            
-            if (contentLen != null && contentInBytes.length != len) {
-                return new ParseStatus(ParseStatus.FAILED,
-                                       ParseStatus.FAILED_TRUNCATED,
-                                       "Content truncated at " + contentInBytes.length +
-                                       " bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
-            }
-            
-            ZipTextExtractor extractor = new ZipTextExtractor();
-            
-            // extract text
-            resultText = extractor.extractText(new ByteArrayInputStream(contentInBytes),
-	    				content.getUrl(), outLinksList);
-            
-        } catch (Exception e) {
-            return new ParseStatus(ParseStatus.FAILED,
-                                   "Can't be handled as Zip document. " + e).getEmptyParse();
-        }
-        
-        // collect meta data
-        final ContentProperties metadata = new ContentProperties();
-        metadata.putAll(content.getMetadata()); // copy through
-        
-        if (resultText == null) {
-            resultText = "";
-        }
-        
-        if (resultTitle == null) {
-            resultTitle = "";
-        }
-	
-        outlinks = (Outlink[])outLinksList.toArray(new Outlink[0]);
-        final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
-                                                  resultTitle, 
-                                                  outlinks, 
-                                                  metadata);
-        
-        LOG.finest("Zip file parsed sucessfully !!");
-        return new ParseImpl(resultText, parseData);
+
+    // collect meta data
+    final ContentProperties metadata = new ContentProperties();
+    metadata.putAll(content.getMetadata()); // copy through
+
+    if (resultText == null) {
+      resultText = "";
     }
-    
+
+    if (resultTitle == null) {
+      resultTitle = "";
+    }
+
+    outlinks = (Outlink[]) outLinksList.toArray(new Outlink[0]);
+    final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
+        resultTitle, outlinks, metadata);
+    parseData.setConf(this.nutchConf);
+
+    LOG.finest("Zip file parsed sucessfully !!");
+    return new ParseImpl(resultText, parseData);
+  }
+
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
+  }
+
 }

Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Tue Jan 31 08:08:58 2006
@@ -46,14 +46,17 @@
 public class ZipTextExtractor {
   
   /** Get the MimeTypes resolver instance. */
-  private final static MimeTypes MIME =
-          MimeTypes.get(NutchConf.get().get("mime.types.file"));
+  private MimeTypes MIME;
   
   public static final Logger LOG = LogFormatter.getLogger(ZipTextExtractor.class.getName());
+
+private NutchConf nutchConf;
   
   
   /** Creates a new instance of ZipTextExtractor */
-  public ZipTextExtractor() {
+  public ZipTextExtractor(NutchConf nutchConf) {
+      this.nutchConf = nutchConf;
+      this.MIME = MimeTypes.get(nutchConf.get("mime.types.file"));
   }
   
   public String extractText(InputStream input, String url, List outLinksList) throws IOException {
@@ -88,13 +91,13 @@
             ContentProperties metadata = new ContentProperties();
             metadata.setProperty("Content-Length", Long.toString(entry.getSize()));
             metadata.setProperty("Content-Type", contentType);
-            Content content = new Content(newurl, base, b, contentType, metadata);
-            Parse parse = ParseUtil.parse(content);
+            Content content = new Content(newurl, base, b, contentType, metadata, this.nutchConf);
+            Parse parse = new ParseUtil(this.nutchConf).parse(content);
             ParseData theParseData = parse.getData();
             Outlink[] theOutlinks = theParseData.getOutlinks();
             
             for(int count = 0; count < theOutlinks.length; count++) {
-              outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor()));
+              outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor(), this.nutchConf));
             }
             
             resultText += entry.getName() + " " + parse.getText() + " ";

Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java Tue Jan 31 08:08:58 2006
@@ -24,6 +24,7 @@
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.util.NutchConf;
 
 import org.apache.nutch.io.UTF8;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -61,12 +62,13 @@
     Content content;
     Parse parse;
 
+    NutchConf conf = new NutchConf();
     for (int i = 0; i < sampleFiles.length; i++) {
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
-      protocol = ProtocolFactory.getProtocol(urlString);
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
       content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
-      parse = ParseUtil.parseByParserId("parse-zip",content);
+      parse = new ParseUtil(conf).parseByParserId("parse-zip",content);
       assertTrue(parse.getText().equals(expectedText));
     }
   }

Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Tue Jan 31 08:08:58 2006
@@ -50,7 +50,7 @@
 
   static final int MAX_REDIRECTS = 5;
 
-  static int maxContentLength = NutchConf.get().getInt("file.content.limit", 64 * 1024);
+  int maxContentLength;
 
   // 20040412, xing
   // the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile
@@ -59,6 +59,8 @@
   // http date format
   HttpDateFormat httpDateFormat = null;
 
+  private NutchConf nutchConf;
+
   // constructor
   public File() {
     this.httpDateFormat = new HttpDateFormat();
@@ -76,7 +78,7 @@
   
       while (true) {
         FileResponse response;
-        response = new FileResponse(u, datum, this);   // make a request
+        response = new FileResponse(u, datum, this, getConf());   // make a request
   
         int code = response.getCode();
   
@@ -152,4 +154,12 @@
     file = null;
   }
 
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+    this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
+  }
 }

Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Tue Jan 31 08:08:58 2006
@@ -26,6 +26,7 @@
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ContentProperties;
+import org.apache.nutch.util.NutchConf;
 
 
 /************************************
@@ -63,6 +64,7 @@
   private ContentProperties headers = new ContentProperties();
 
   private final File file;
+  private NutchConf nutchConf;
 
   /** Returns the response code. */
   public int getCode() { return code; }
@@ -77,15 +79,16 @@
   public Content toContent() {
     return new Content(orig, base, content,
                        getHeader("Content-Type"),
-                       headers);
+                       headers, this.nutchConf);
   }
 
-  public FileResponse(URL url, CrawlDatum datum, File file)
+  public FileResponse(URL url, CrawlDatum datum, File file, NutchConf nutchConf)
     throws FileException, IOException {
 
     this.orig = url.toString();
     this.base = url.toString();
     this.file = file;
+    this.nutchConf = nutchConf;
 
     if (!"file".equals(url.getProtocol()))
       throw new FileException("Not a file url:" + url);

Modified: lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java Tue Jan 31 08:08:58 2006
@@ -56,23 +56,23 @@
 
   static final int MAX_REDIRECTS = 5;
 
-  static int timeout = NutchConf.get().getInt("ftp.timeout", 10000);
+  int timeout;
 
-  static int maxContentLength = NutchConf.get().getInt("ftp.content.limit",64*1024);
+  int maxContentLength;
 
-  String userName = NutchConf.get().get("ftp.username", "anonymous");
-  String passWord = NutchConf.get().get("ftp.password", "anonymous@example.com");
+  String userName;
+  String passWord; 
 
   // typical/default server timeout is 120*1000 millisec.
   // better be conservative here
-  int serverTimeout = NutchConf.get().getInt("ftp.server.timeout", 60*1000);
+  int serverTimeout;
 
   // when to have client start anew
   long renewalTime = -1;
 
-  boolean keepConnection = NutchConf.get().getBoolean("ftp.keep.connection", false);
+  boolean keepConnection;
 
-  boolean followTalk = NutchConf.get().getBoolean("ftp.follow.talk", false);
+  boolean followTalk;
 
   // ftp client
   Client client = null;
@@ -86,6 +86,8 @@
   // http date format
   HttpDateFormat httpDateFormat = null;
 
+  private NutchConf nutchConf;
+
 
   // constructor
   public Ftp() {
@@ -121,7 +123,7 @@
   
       while (true) {
         FtpResponse response;
-        response = new FtpResponse(u, datum, this);   // make a request
+        response = new FtpResponse(u, datum, this, getConf());   // make a request
   
         int code = response.getCode();
   
@@ -218,6 +220,22 @@
     }
 
     ftp = null;
+  }
+
+  
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+    this.maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);
+    this.timeout = conf.getInt("ftp.timeout", 10000);
+    this.userName = conf.get("ftp.username", "anonymous");
+    this.passWord = conf.get("ftp.password", "anonymous@example.com");
+    this.serverTimeout = conf.getInt("ftp.server.timeout", 60 * 1000);
+    this.keepConnection = conf.getBoolean("ftp.keep.connection", false);
+    this.followTalk = conf.getBoolean("ftp.follow.talk", false);
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
   }
 
 }

Modified: lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java Tue Jan 31 08:08:58 2006
@@ -27,6 +27,7 @@
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ContentProperties;
+import org.apache.nutch.util.NutchConf;
 
 import java.net.InetAddress;
 import java.net.URL;
@@ -64,6 +65,7 @@
   private ContentProperties headers = new ContentProperties();
 
   private final Ftp ftp;
+  private NutchConf nutchConf;
 
   /** Returns the response code. */
   public int getCode() { return code; }
@@ -78,15 +80,16 @@
   public Content toContent() {
     return new Content(orig, base, content,
                        getHeader("Content-Type"),
-                       headers);
+                       headers, this.nutchConf);
   }
 
-  public FtpResponse(URL url, CrawlDatum datum, Ftp ftp)
+  public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, NutchConf nutchConf)
     throws FtpException, IOException {
 
     this.orig = url.toString();
     this.base = url.toString();
     this.ftp = ftp;
+    this.nutchConf = nutchConf;
 
     if (!"ftp".equals(url.getProtocol()))
       throw new FtpException("Not a ftp url:" + url);

Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java Tue Jan 31 08:08:58 2006
@@ -33,25 +33,29 @@
 public class Http extends HttpBase {
 
   public static final Logger LOG =
-    LogFormatter.getLogger("org.apache.nutch.net.Http");
-
-  static {
-    if (NutchConf.get().getBoolean("http.verbose", false))
-      LOG.setLevel(Level.FINE);
-  }
+    LogFormatter.getLogger(Http.class.getName());
 
 
   public Http() {
     super(LOG);
   }
 
+  public void setConf(NutchConf conf) {
+    super.setConf(conf);
+    Level logLevel = Level.WARNING;
+    if (conf.getBoolean("http.verbose", false)) {
+      logLevel = Level.FINE;
+    }
+    LOG.setLevel(logLevel);
+  }
+
   public static void main(String[] args) throws Exception {
     main(new Http(), args);
   }
 
   protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
     throws ProtocolException, IOException {
-    return new HttpResponse(url, datum);
+    return new HttpResponse(this, url, datum);
   }
 
 }

Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Tue Jan 31 08:08:58 2006
@@ -35,13 +35,15 @@
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
 import org.apache.nutch.protocol.http.api.HttpException;
 import org.apache.nutch.util.GZIPUtils;
 
 
 /** An HTTP response. */
 public class HttpResponse implements Response {
-  
+ 
+  private HttpBase http; 
   private URL url;
   private String orig;
   private String base;
@@ -50,9 +52,10 @@
   private ContentProperties headers = new ContentProperties();
 
 
-  public HttpResponse(URL url, CrawlDatum datum)
+  public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
     throws ProtocolException, IOException {
 
+    this.http = http;
     this.url = url;
     this.orig = url.toString();
     this.base = url.toString();
@@ -83,20 +86,20 @@
 
     try {
       socket = new Socket();                    // create the socket
-      socket.setSoTimeout(Http.TIMEOUT);
+      socket.setSoTimeout(http.getTimeout());
 
 
       // connect
-      String sockHost = Http.PROXY ? Http.PROXY_HOST : host;
-      int sockPort = Http.PROXY ? Http.PROXY_PORT : port;
+      String sockHost = http.useProxy() ? http.getProxyHost() : host;
+      int sockPort = http.useProxy() ? http.getProxyPort() : port;
       InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort);
-      socket.connect(sockAddr, Http.TIMEOUT);
+      socket.connect(sockAddr, http.getTimeout());
 
       // make request
       OutputStream req = socket.getOutputStream();
 
       StringBuffer reqStr = new StringBuffer("GET ");
-      if(Http.PROXY){
+      if (http.useProxy()) {
       	reqStr.append(url.getProtocol()+"://"+host+portString+path);
       } else {
       	reqStr.append(path);
@@ -111,11 +114,12 @@
 
       reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
 
-      if ((Http.AGENT_STRING == null) || (Http.AGENT_STRING.length() == 0)) {
+      String userAgent = http.getUserAgent();
+      if ((userAgent == null) || (userAgent.length() == 0)) {
         Http.LOG.severe("User-agent is not set!");
       } else {
         reqStr.append("User-Agent: ");
-        reqStr.append(Http.AGENT_STRING);
+        reqStr.append(userAgent);
         reqStr.append("\r\n");
       }
 
@@ -148,7 +152,7 @@
         Http.LOG.fine("uncompressing....");
         byte[] compressed = content;
 
-        content = GZIPUtils.unzipBestEffort(compressed, Http.MAX_CONTENT);
+        content = GZIPUtils.unzipBestEffort(compressed, http.getMaxContent());
 
         if (content == null)
           throw new HttpException("unzipBestEffort returned null");
@@ -212,9 +216,9 @@
         throw new HttpException("bad content length: "+contentLengthString);
       }
     }
-    if (Http.MAX_CONTENT >= 0
-      && contentLength > Http.MAX_CONTENT)   // limit download size
-      contentLength  = Http.MAX_CONTENT;
+    if (http.getMaxContent() >= 0
+      && contentLength > http.getMaxContent())   // limit download size
+      contentLength  = http.getMaxContent();
 
     ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
     byte[] bytes = new byte[Http.BUFFER_SIZE];
@@ -265,8 +269,8 @@
         break;
       }
 
-      if ( (contentBytesRead + chunkLen) > Http.MAX_CONTENT )
-        chunkLen= Http.MAX_CONTENT - contentBytesRead;
+      if ( (contentBytesRead + chunkLen) > http.getMaxContent() )
+        chunkLen= http.getMaxContent() - contentBytesRead;
 
       // read one chunk
       int chunkBytesRead= 0;
@@ -295,7 +299,7 @@
     }
 
     if (!doneChunks) {
-      if (contentBytesRead != Http.MAX_CONTENT) 
+      if (contentBytesRead != http.getMaxContent()) 
         throw new HttpException("chunk eof: !doneChunk && didn't max out");
       return;
     }

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Tue Jan 31 08:08:58 2006
@@ -46,70 +46,70 @@
 
   public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.net.Http");
 
-  static {
-    if (NutchConf.get().getBoolean("http.verbose", false)) {
-      LOG.setLevel(Level.FINE);
-    } else {                                      // shush about redirects
-      Logger.getLogger("org.apache.commons.httpclient.HttpMethodDirector")
-        .setLevel(Level.WARNING);
-    }
-  }
-
   private static MultiThreadedHttpConnectionManager connectionManager =
           new MultiThreadedHttpConnectionManager();
-  
-  private static HttpClient client;
+
+  // No NutchConf is available when this class is loaded, so the client
+  // starts out unconfigured; setConf() configures it via configureClient().
+  private static HttpClient client = new HttpClient(connectionManager);
 
   static synchronized HttpClient getClient() {
-    if (client != null) return client;
-    configureClient();
     return client;
   }
 
-  static int MAX_THREADS_TOTAL = NutchConf.get().getInt("fetcher.threads.fetch", 10);
-  static String NTLM_USERNAME = NutchConf.get().get("http.auth.ntlm.username", "");
-  static String NTLM_PASSWORD = NutchConf.get().get("http.auth.ntlm.password", "");
-  static String NTLM_DOMAIN = NutchConf.get().get("http.auth.ntlm.domain", "");
-  static String NTLM_HOST = NutchConf.get().get("http.auth.ntlm.host", "");
-
-  static {
-    LOG.info("http.auth.ntlm.username = " + NTLM_USERNAME);
-  }
-
+  boolean verbose = false;
+  int maxThreadsTotal = 10;
+  String ntlmUsername = "";
+  String ntlmPassword = "";
+  String ntlmDomain = "";
+  String ntlmHost = "";
 
   public Http() {
     super(LOG);
   }
 
+  public void setConf(NutchConf conf) {
+    super.setConf(conf);
+    this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
+    this.ntlmUsername = conf.get("http.auth.ntlm.username", "");
+    this.ntlmPassword = conf.get("http.auth.ntlm.password", "");
+    this.ntlmDomain = conf.get("http.auth.ntlm.domain", "");
+    this.ntlmHost = conf.get("http.auth.ntlm.host", "");
+    Level logLevel = Level.WARNING;
+    if (conf.getBoolean("http.verbose", false)) {
+      logLevel = Level.FINE;
+    }
+    LOG.setLevel(logLevel);
+    Logger.getLogger("org.apache.commons.httpclient.HttpMethodDirector")
+          .setLevel(logLevel);
+    configureClient();
+  }
+
   public static void main(String[] args) throws Exception {
     main(new Http(), args);
   }
 
   protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
     throws ProtocolException, IOException {
-    return new HttpResponse(url, datum, redirect);
+    return new HttpResponse(this, url, datum, redirect);
   }
   
-  private static void configureClient() {
-
-    // get a client isntance -- we just need one.
-
-    client = new HttpClient(connectionManager);
+  private void configureClient() {
 
     // Set up an HTTPS socket factory that accepts self-signed certs.
     Protocol dummyhttps = new Protocol("https", new DummySSLProtocolSocketFactory(), 443);
     Protocol.registerProtocol("https", dummyhttps);
     
     HttpConnectionManagerParams params = connectionManager.getParams();
-    params.setConnectionTimeout(TIMEOUT);
-    params.setSoTimeout(TIMEOUT);
+    params.setConnectionTimeout(timeout);
+    params.setSoTimeout(timeout);
     params.setSendBufferSize(BUFFER_SIZE);
     params.setReceiveBufferSize(BUFFER_SIZE);
-    params.setMaxTotalConnections(MAX_THREADS_TOTAL);
-    if (MAX_THREADS_TOTAL > MAX_THREADS_PER_HOST) {
-      params.setDefaultMaxConnectionsPerHost(MAX_THREADS_PER_HOST);
+    params.setMaxTotalConnections(maxThreadsTotal);
+    if (maxThreadsTotal > maxThreadsPerHost) {
+      params.setDefaultMaxConnectionsPerHost(maxThreadsPerHost);
     } else {
-      params.setDefaultMaxConnectionsPerHost(MAX_THREADS_TOTAL);
+      params.setDefaultMaxConnectionsPerHost(maxThreadsTotal);
     }
 
     HostConfiguration hostConf = client.getHostConfiguration();
@@ -122,15 +122,15 @@
     headers.add(new Header("Accept",
             "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
     hostConf.getParams().setParameter("http.default-headers", headers);
-    if (PROXY) {
-      hostConf.setProxy(PROXY_HOST, PROXY_PORT);
+    if (useProxy) {
+      hostConf.setProxy(proxyHost, proxyPort);
     }
-    if (NTLM_USERNAME.length() > 0) {
-      Credentials ntCreds = new NTCredentials(NTLM_USERNAME, NTLM_PASSWORD, NTLM_HOST, NTLM_DOMAIN);
-      client.getState().setCredentials(new AuthScope(NTLM_HOST, AuthScope.ANY_PORT), ntCreds);
+    if (ntlmUsername.length() > 0) {
+      Credentials ntCreds = new NTCredentials(ntlmUsername, ntlmPassword, ntlmHost, ntlmDomain);
+      client.getState().setCredentials(new AuthScope(ntlmHost, AuthScope.ANY_PORT), ntCreds);
 
-      LOG.info("Added NTLM credentials for " + NTLM_USERNAME);
+      LOG.info("Added NTLM credentials for " + ntlmUsername);
     }
     LOG.info("Configured Client");
   }
-}
\ No newline at end of file
+}

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java Tue Jan 31 08:08:58 2006
@@ -15,6 +15,8 @@
 import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.NutchConfigurable;
+
 
 /**
  * Provides the Http protocol implementation
@@ -28,25 +30,50 @@
  * 
  * @author Matt Tencati
  */
-public class HttpAuthenticationFactory {
-    /** The HTTP Authentication (WWW-Authenticate) header which is returned 
+public class HttpAuthenticationFactory implements NutchConfigurable {
+
+    /** 
+     * The HTTP Authentication (WWW-Authenticate) header which is returned 
      * by a webserver requiring authentication.
      */
     public static final String AUTH_HEADER = "WWW-Authenticate";
 	
-	public static final Logger LOG =
-		LogFormatter.getLogger("net.nutch.protocol.http.HttpAuthenticationFactory");
+    public static final Logger LOG =
+		LogFormatter.getLogger(HttpAuthenticationFactory.class.getName());
 
-	static {
-		if (NutchConf.get().getBoolean("http.auth.verbose", false))
-			LOG.setLevel(Level.FINE);
-	}
-	  
     private static Map auths = new TreeMap(); 
+
+    private NutchConf conf = null;
     
-    private HttpAuthenticationFactory() { }
     
-    public static HttpAuthentication findAuthentication(ContentProperties header) {
+    public HttpAuthenticationFactory(NutchConf conf) {
+      setConf(conf);
+    }
+
+   
+    /* ---------------------------------- *
+     * <implementation:NutchConfigurable> *
+     * ---------------------------------- */
+
+    public void setConf(NutchConf conf) {
+      this.conf = conf;
+      if (conf.getBoolean("http.auth.verbose", false)) {
+        LOG.setLevel(Level.FINE);
+      } else {
+        LOG.setLevel(Level.WARNING);
+      }
+    }
+
+    public NutchConf getConf() {
+      return conf;
+    }
+ 
+    /* ----------------------------------- *
+     * </implementation:NutchConfigurable> *
+     * ----------------------------------- */
+
+
+    public HttpAuthentication findAuthentication(ContentProperties header) {
         if (header == null) return null;
         
     	try {
@@ -80,7 +107,7 @@
 		                  }
 		                
 		                LOG.fine("Checking challengeString=" + challengeString);
-				auth = HttpBasicAuthentication.getAuthentication(challengeString);
+				auth = HttpBasicAuthentication.getAuthentication(challengeString, conf);
 				if (auth != null) return auth;
 				
 				//TODO Add additional Authentication lookups here

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java Tue Jan 31 08:08:58 2006
@@ -16,6 +16,7 @@
 
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.NutchConfigurable;
 
 /**
  * Implementation of RFC 2617 Basic Authentication.  Usernames and passwords are stored 
@@ -25,23 +26,21 @@
  *
  * @author    Matt Tencati
  */
-public class HttpBasicAuthentication implements HttpAuthentication {
-	public static final Logger LOG =
-		LogFormatter.getLogger("net.nutch.net.HttpBasicAuthentication");
-
-	static {
-		if (NutchConf.get().getBoolean("http.auth.verbose", false))
-			LOG.setLevel(Level.FINE);
-	}
+public class HttpBasicAuthentication implements HttpAuthentication, NutchConfigurable {
 
-	private static Pattern basic = Pattern.compile("[bB][aA][sS][iI][cC] [rR][eE][aA][lL][mM]=\"(\\w*)\"");
+    public static final Logger LOG =
+		LogFormatter.getLogger(HttpBasicAuthentication.class.getName());
+
+    private static Pattern basic = Pattern.compile("[bB][aA][sS][iI][cC] [rR][eE][aA][lL][mM]=\"(\\w*)\"");
 	
     private static Map authMap = new TreeMap();
-    
+   
+    private NutchConf conf = null; 
     private String challenge = null;
     private ArrayList credentials = null;
     private String realm = null;
 
+
     /**
      *  Construct an HttpBasicAuthentication for the given challenge
      *  parameters. The challenge parameters are returned by the web
@@ -50,14 +49,16 @@
      *
      * @param  challenge  WWW-Authenticate header from web server
      */
-    protected HttpBasicAuthentication(String challenge) throws HttpAuthenticationException {
+    protected HttpBasicAuthentication(String challenge, NutchConf nutchConf) throws HttpAuthenticationException {
+        
+        setConf(nutchConf);
         this.challenge = challenge;
         LOG.fine("BasicAuthentication challenge is " + challenge);
         credentials = new ArrayList();
         
-        String username = NutchConf.get().get("http.auth.basic." + challenge + ".user");
+        String username = this.conf.get("http.auth.basic." + challenge + ".user");
         LOG.fine("BasicAuthentication username=" + username);
-        String password = NutchConf.get().get("http.auth.basic." + challenge + ".password");
+        String password = this.conf.get("http.auth.basic." + challenge + ".password");
         LOG.fine("BasicAuthentication password=" + password);
         
         if (username == null) {
@@ -73,6 +74,29 @@
         LOG.fine("Basic credentials: " + credentials);
     }
 
+
+    /* ---------------------------------- *
+     * <implementation:NutchConfigurable> *
+     * ---------------------------------- */
+
+    public void setConf(NutchConf conf) {
+      this.conf = conf;
+      if (conf.getBoolean("http.auth.verbose", false)) {
+        LOG.setLevel(Level.FINE);
+      } else {
+        LOG.setLevel(Level.WARNING);
+      }
+    }
+
+    public NutchConf getConf() {
+      return this.conf;
+    }
+
+    /* ----------------------------------- *
+     * </implementation:NutchConfigurable> *
+     * ----------------------------------- */
+
+
     /**
      *  Gets the Basic credentials generated by this
      *  HttpBasicAuthentication object
@@ -105,7 +129,7 @@
      * @return An HttpBasicAuthentication object or null 
      * if unable to generate appropriate credentials.
      */
-    public static HttpBasicAuthentication getAuthentication(String challenge) {
+    public static HttpBasicAuthentication getAuthentication(String challenge, NutchConf conf) {
         if (challenge == null) return null;
         Matcher basicMatcher = basic.matcher(challenge);
         if (basicMatcher.matches()) {
@@ -114,7 +138,7 @@
 	        if (auth == null) {
 	            HttpBasicAuthentication newAuth = null;
 	            try {
-	            	newAuth = new HttpBasicAuthentication(realm);
+	            	newAuth = new HttpBasicAuthentication(realm, conf);
 	            } catch (HttpAuthenticationException hae) { 
 	            	LOG.fine("HttpBasicAuthentication failed for " + challenge);
 	            }